Skip to content
Snippets Groups Projects
Commit f7ec792e authored by blunck's avatar blunck
Browse files
parents 798918af d2a625e6
No related branches found
No related tags found
No related merge requests found
import nltk
from nltk.tokenize import word_tokenize
from corpus import read_corpus
import csv
"""
TODO
- something useful to do with the feature vectors
These functions are meant to be accessed from training_testing.py
"""
"""
not yet functional
"""
def extract(corpus_instance, bigram_pos_vocab):
    """Return the POS-bigram count vector for a single corpus instance.

    Args:
        corpus_instance: One corpus row as a dict (for example one of the
            OrderedDict rows passed around in training_testing.py) whose
            'REVIEW' entry holds the review text.
        bigram_pos_vocab: The POS-bigram vocabulary, as returned by
            get_pos_vocabulary.

    Returns:
        A list with one count per entry of bigram_pos_vocab. For an
        instance that is not a dict the list contains only zeros.
    """
    # corpus_pos_tagger ignores every entry that is not a dict, so a
    # non-dict instance (e.g. a raw CSV line) cannot contribute any bigram.
    # The previous version fed such input through csv.DictReader, discarded
    # the result and always ended up with an all-zero vector.
    if not isinstance(corpus_instance, dict):
        return [0 for _ in bigram_pos_vocab]

    # Treat the instance as a one-row corpus and push it through the same
    # helpers that build the vocabulary, so the bigram representation of
    # the instance matches the representation stored in the vocabulary.
    tagged_instance = corpus_pos_tagger([corpus_instance])
    instance_unigrams = tagged_corpus_to_pos_unigrams(tagged_instance)
    instance_bigrams = pos_unigrams_to_bigrams(instance_unigrams)

    # NOTE(review): assumes to_bigram_vector returns one count vector per
    # review of the bigram list it receives -- confirm against its body.
    instance_vectors = to_bigram_vector(bigram_pos_vocab, instance_bigrams)
    if not instance_vectors:
        return [0 for _ in bigram_pos_vocab]
    return instance_vectors[0]
"""
already functional
"""
def get_pos_vocabulary(corpus):
    """Build the POS-bigram vocabulary for a corpus.

    The corpus is POS-tagged, the tagged corpus is converted to POS
    unigrams, the unigrams are paired into bigrams, and the bigrams are
    collapsed into a bag of bigrams, which is returned.
    """
    return to_bag_of_bigrams(
        pos_unigrams_to_bigrams(
            tagged_corpus_to_pos_unigrams(
                corpus_pos_tagger(corpus)
            )
        )
    )
"""
These functions are intended for internal use.
"""
"""
......@@ -16,9 +52,13 @@ def corpus_pos_tagger(corpus):
temp_entry = []
for entry in corpus:
if not isinstance(entry, dict):
continue
temp_entry = nltk.pos_tag(word_tokenize(str(entry['REVIEW'])))
tagged_corpus.append(temp_entry)
temp_entry = []
return (tagged_corpus)
......@@ -89,55 +129,45 @@ def to_bigram_vector(bag_of_bigrams, corpus): #corpus is the bigram_list
return review_vector_list
if __name__ == '__main__':
    # Manual smoke test: build the POS-bigram vocabulary from the mini
    # corpus, then extract a feature vector for one hard-coded instance.
    corpus = read_corpus("minicorpus.csv")
    bigram_pos_vocab = get_pos_vocabulary(corpus)
    # NOTE(review): this instance is a raw CSV line (a str), while
    # corpus_pos_tagger skips every entry that is not a dict -- confirm
    # that extract is meant to accept this form.
    corpus_instance = "1,36_12_R3W1O661T65OBH.txt,3.0,It's Just A Broom - No Handle!,'October 6, 2004',D. Richardson,WOLF-Garten Outdoor Broom B40M (Lawn & Patio),'Unless you want to sweep on your hands and knees, make sure you purchase the handle separately!'"
    f4 = extract(corpus_instance, bigram_pos_vocab)
    #print(f4)
    #corpus_vector = to_bigram_vector(bag_of_bigrams, pos_bigrams)
    #for vector in corpus_vector:
        #print(vector)
"""
The functions below are intended to be used on token level (bag of words) and are possibly obsolete.
"""
def to_token_vector(bag_of_words, corpus):
    """Return one token-count vector per corpus entry.

    For every entry the 'REVIEW' value is converted to a string and
    tokenized; the resulting vector holds, for each word of bag_of_words
    in order, how often that word occurs among the entry's tokens.
    """
    vectors = []
    for row in corpus:
        tokens = word_tokenize(str(row['REVIEW']))
        vectors.append([tokens.count(term) for term in bag_of_words])
    return vectors
def to_bag_of_words(corpus):
    """Collect the distinct tokens of all reviews, in first-seen order.

    Args:
        corpus: Iterable of entries; each entry's 'REVIEW' value is
            converted to a string and tokenized with word_tokenize.

    Returns:
        A list containing every token exactly once, ordered by first
        occurrence in the corpus.
    """
    bag_of_words = []
    # Membership is checked against a set instead of the growing list:
    # the list lookup made the loop quadratic in the vocabulary size.
    seen = set()
    for entry in corpus:
        for word in word_tokenize(str(entry['REVIEW'])):
            if word not in seen:
                seen.add(word)
                bag_of_words.append(word)
    return bag_of_words
#fun fact: len(bag_of_words) is 25325 for corpus.csv
def extract(corpus_instance, pos_vocab):
    "Takes a single dict and returns a feature vector of size len(bag of bigrams)."
    # Placeholder: no implementation yet, so calling it returns None.
    # NOTE(review): a function of the same name appears earlier in this
    # view; if both live in one module, this later stub is the one that
    # stays bound -- confirm which version is intended.
    pass
def get_pos_vocabulary(corpus):
    "Iterates over the whole corpus and returns the bag of bigrams."
    # Placeholder: no implementation yet, so calling it returns None.
    # NOTE(review): a function of the same name appears earlier in this
    # view; if both live in one module, this later stub is the one that
    # stays bound -- confirm which version is intended.
    pass
if __name__ == '__main__':
    # Manual run: tag the mini corpus, derive its POS bigrams, build the
    # bag of bigrams and print one count vector per line.
    corpus = read_corpus("minicorpus.csv")
    pos_bigrams = pos_unigrams_to_bigrams(
        tagged_corpus_to_pos_unigrams(corpus_pos_tagger(corpus))
    )
    bag_of_bigrams = to_bag_of_bigrams(pos_bigrams)
    for vector in to_bigram_vector(bag_of_bigrams, pos_bigrams):
        print(vector)
......@@ -13,9 +13,10 @@ def create_vector(corpus_instance, vocabulary=None, pos_vocabulary=None):
Calls all feature extraction programms and combines
resulting arrays to a single input vector (for a
single corpus instance)
Example for corpus instance: OrderedDict([('LABEL', '0'), ('FILENAME', '36_19_RPRRQDRSHDV6J.txt'), ('STARS', '5.0'), ('TITLE', etc.
"""
f1 = ngram_feature.extract(corpus_instance, vocabulary)
# f2 = postagger.to_bigram_vector(corpus_instance, pos_vocabulary)
#f2 = postagger.extract(corpus_instance, bigram_pos_vocab)
f4 = sent_rating_feature.extract(corpus_instance)
return np.concatenate((f1,f4))
......@@ -35,9 +36,9 @@ if __name__ == '__main__':
bigram_vocab = ngram_feature.get_vocabulary(train_set, 2)
# pos_bags
# bigram_pos_vocab = postagger.get_pos_vocabulary(train_set) (entspricht corpus in postagger.py)
bigram_pos_vocab = postagger.get_pos_vocabulary(train_set)
#print(bigram_pos_vocab) #already lookin' good
# inputs:
train_inputs = [create_vector(el, unigram_vocab)
for el in train_set] # 1000 vectors
......@@ -52,6 +53,7 @@ if __name__ == '__main__':
print("Number of features per train sample: {}".format(len(train_inputs[0])))
print("Unigram vocab size: {}".format(len(unigram_vocab)))
print("Bigram vocab size: {}".format(len(bigram_vocab)))
print("POS-Bigram vocab size: {}".format(len(bigram_pos_vocab)))
# training
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment