Commit 71b30bfa authored by Maximilian Blunck
parents 65340bcf 5c99e1e1
@@ -3,56 +3,130 @@ from nltk.tokenize import word_tokenize
from corpus import read_corpus
"""
turning the entire corpus into a bag of words (lemmas).
returns: list
TODO
- something useful to do with the feature vectors
"""
def to_bag_of_words(corpus):
for entry in corpus:
for word in word_tokenize(str(entry['REVIEW'])):
if word not in bag_of_words:
bag_of_words.append(word)
return bag_of_words
"""
pos-tagging the entire corpus token-wise.
Returns the raw corpus as a list
e.g. [[('No', 'DT')], [('Just', 'RB'), ('no', 'DT')]]
"""
def corpus_pos_tagger(corpus):
    tagged_corpus = []
    for entry in corpus:
        tagged_corpus.append(nltk.pos_tag(word_tokenize(str(entry['REVIEW']))))
    return tagged_corpus
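# A minimal usage sketch (assumption: read_corpus yields dicts with a
# 'REVIEW' key, which is what every function here indexes):
#   toy = [{'REVIEW': 'No'}, {'REVIEW': 'Just no'}]
#   corpus_pos_tagger(toy)
#   -> [[('No', 'DT')], [('Just', 'RB'), ('no', 'DT')]]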
"""
for each review in the corpus, the number of occurences of each token is written
into a feature vector of the same length as the bag of words list.
returns: list of lists
Same format as above, reduces the tuples to pos-tags
e.g. [['DT', ',', 'NN'], ['DT', ',', 'NN']]
"""
def tagged_corpus_to_pos_unigrams(tagged_corpus):
    pos_unigrams = []
    for entry in tagged_corpus:
        pos_unigrams.append([token[1] for token in entry])
    return pos_unigrams
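# Sketch: the tagged example from above reduces to plain tag sequences:
#   tagged_corpus_to_pos_unigrams([[('No', 'DT')], [('Just', 'RB'), ('no', 'DT')]])
#   -> [['DT'], ['RB', 'DT']]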
"""
Returns the bigrams for each review
e.g. [[('DT', ','), (',', 'NN')], [('DT', ','), (',', 'NN')]]
"""
def pos_unigrams_to_bigrams(input_list):
    bigram_list = []
    for review in input_list:
        bigram_list.append([(review[i], review[i+1]) for i in range(len(review)-1)])
    return bigram_list
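# Sketch of the mapping, using the docstring's example:
#   pos_unigrams_to_bigrams([['DT', ',', 'NN']])
#   -> [[('DT', ','), (',', 'NN')]]
# Design note: the per-review comprehension is equivalent to
# list(nltk.bigrams(review)); the explicit index form is kept here to stay
# close to the original code.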
"""
Takes all the bigrams and turns them into a bag of bigrams
e.g. [('DT', ','), (',', 'NN')]
"""
def to_bag_of_bigrams(bigram_list):
    bag_of_bigrams = []
    for review in bigram_list:
        for bigram in review:
            if bigram not in bag_of_bigrams:
                bag_of_bigrams.append(bigram)
    return bag_of_bigrams
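# Performance note (an observation, not part of the original design): the
# `not in` test on a list is linear, so building the bag is quadratic in the
# total number of bigrams. A hypothetical set-based variant would scale
# better for corpus.csv, at the cost of losing first-seen order:
#   def to_bag_of_bigrams_fast(bigram_list):
#       return list({bigram for review in bigram_list for bigram in review})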
"""
TODO: explanation that's not stupid
"""
def to_bigram_vector(bag_of_bigrams, bigram_list):  # operates on the bigram_list, not the raw corpus
    review_vector_list = []
    for entry in bigram_list:
        review_vector = []
        for bigram in bag_of_bigrams:
            review_vector.append(entry.count(bigram))
        review_vector_list.append(review_vector)
    return review_vector_list
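# Worked example: with a bag of two bigrams and a single review containing
# ('DT', ',') twice, entry.count() yields the counts position by position:
#   to_bigram_vector([('DT', ','), (',', 'NN')],
#                    [[('DT', ','), (',', 'NN'), ('DT', ',')]])
#   -> [[2, 1]]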
"""
The functions below are intended to be used on token-level (bag of words)
"""
def to_token_vector(bag_of_words, corpus):
    review_vector_list = []
    for entry in corpus:
        review_vector = []
        review = word_tokenize(str(entry['REVIEW']))
        for word in bag_of_words:
            review_vector.append(review.count(word))
        review_vector_list.append(review_vector)
    return review_vector_list
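# Worked example (toy input in the same 'REVIEW' dict format as above):
#   to_token_vector(['no', 'Just'], [{'REVIEW': 'Just no no'}])
#   -> [[2, 1]]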
"""
Turns the entire corpus into a bag of words (unique surface tokens).
Returns: list
"""
def to_bag_of_words(corpus):
    bag_of_words = []
    for entry in corpus:
        for word in word_tokenize(str(entry['REVIEW'])):
            if word not in bag_of_words:
                bag_of_words.append(word)
    return bag_of_words
#fun fact: len(bag_of_words) is 25325 for corpus.csv
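# Sketch: duplicates are dropped and first-seen order is kept:
#   to_bag_of_words([{'REVIEW': 'no no way'}])
#   -> ['no', 'way']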
if __name__ == '__main__':
    corpus = read_corpus("minicorpus.csv")

    bag_of_words = to_bag_of_words(corpus)

    # this should be taken out when working with the complete corpus
    for vector in to_token_vector(bag_of_words, corpus):
        print(str(vector) + "\n")

    tagged_corpus = corpus_pos_tagger(corpus)
    pos_unigrams = tagged_corpus_to_pos_unigrams(tagged_corpus)
    pos_bigrams = pos_unigrams_to_bigrams(pos_unigrams)
    bag_of_bigrams = to_bag_of_bigrams(pos_bigrams)

    if len(bag_of_words) != len(to_token_vector(bag_of_words, corpus)[0]):
        print("Something went wrong (feature vector and bag of words are not the same length)")

    corpus_vector = to_bigram_vector(bag_of_bigrams, pos_bigrams)
    for vector in corpus_vector:
        print(vector)