# TODO:
# - something useful to do with the feature vectors


def corpus_pos_tagger(corpus):
    """POS-tag every review in the corpus, token-wise.

    Each corpus entry is a mapping with a 'REVIEW' field; the review text
    is tokenized with NLTK and tagged with NLTK's default POS tagger.

    Returns a list with one list of (token, tag) tuples per review,
    e.g. [[('No', 'DT')], [('Just', 'RB'), ('no', 'DT')]].
    """
    return [nltk.pos_tag(word_tokenize(str(entry['REVIEW'])))
            for entry in corpus]


def tagged_corpus_to_pos_unigrams(tagged_corpus):
    """Reduce each review's (token, tag) tuples to the bare POS tags.

    Same outer structure as corpus_pos_tagger's output,
    e.g. [['DT', ',', 'NN'], ['DT', ',', 'NN']].
    """
    return [[tag for _token, tag in review] for review in tagged_corpus]


def pos_unigrams_to_bigrams(input_list):
    """Build the POS-bigram sequence for each review.

    e.g. [[('DT', ','), (',', 'NN')], [('DT', ','), (',', 'NN')]].
    A review with fewer than two tags yields an empty list.
    """
    # zip(seq, seq[1:]) pairs each tag with its successor.
    return [list(zip(review, review[1:])) for review in input_list]


def to_bag_of_bigrams(bigram_list):
    """Collect all distinct POS bigrams across the corpus.

    First-seen order is preserved, e.g. [('DT', ','), (',', 'NN')].
    """
    # dict.fromkeys dedupes in O(n) while keeping insertion order,
    # instead of a quadratic `if bigram not in list` scan.
    return list(dict.fromkeys(
        bigram for review in bigram_list for bigram in review))


def to_bigram_vector(bag_of_bigrams, corpus):
    """Turn each review into a bigram-count feature vector.

    `corpus` here is the per-review bigram list (the output of
    pos_unigrams_to_bigrams); each resulting vector holds one occurrence
    count per bag-of-bigrams entry, in bag order, so every vector has
    length len(bag_of_bigrams).
    """
    return [[entry.count(bigram) for bigram in bag_of_bigrams]
            for entry in corpus]


# The functions below are intended to be used on token level (bag of words).

def to_token_vector(bag_of_words, corpus):
    """Turn each raw-corpus review into a token-count feature vector.

    Each vector holds one occurrence count per bag-of-words entry, in bag
    order, so every vector has length len(bag_of_words).
    """
    review_vector_list = []
    for entry in corpus:
        review = word_tokenize(str(entry['REVIEW']))
        review_vector_list.append(
            [review.count(word) for word in bag_of_words])
    return review_vector_list


def to_bag_of_words(corpus):
    """Collect all distinct tokens across the corpus, in first-seen order."""
    # Order-preserving O(n) dedup, same result as the original
    # `if word not in bag` scan but without the quadratic membership test.
    all_tokens = (word
                  for entry in corpus
                  for word in word_tokenize(str(entry['REVIEW'])))
    return list(dict.fromkeys(all_tokens))
# fun fact: len(bag_of_words) is 25325 for corpus.csv


if __name__ == '__main__':
    corpus = read_corpus("minicorpus.csv")

    tagged_corpus = corpus_pos_tagger(corpus)
    pos_unigrams = tagged_corpus_to_pos_unigrams(tagged_corpus)
    pos_bigrams = pos_unigrams_to_bigrams(pos_unigrams)
    bag_of_bigrams = to_bag_of_bigrams(pos_bigrams)

    corpus_vector = to_bigram_vector(bag_of_bigrams, pos_bigrams)

    for vector in corpus_vector:
        print(vector)