diff --git a/postagger.py b/postagger.py index 8483594a52b6c8ff3c8d1ae812fe7422b7544ab1..5bfc6af2af40a60767ab357c4128053cfc158d6e 100644 --- a/postagger.py +++ b/postagger.py @@ -119,6 +119,16 @@ def to_bag_of_words(corpus): #fun fact: len(bag_of_words) is 25325 for corpus.csv +def extract(corpus_instance, pos_vocab): + "takes a single dict and returns a feature vector of size len(bag-bigram)" + pass + + +def get_pos_vocabulary(corpus): + "iterates over the whole corpus and returns the bigram bag" + pass + + if __name__ == '__main__': corpus = read_corpus("minicorpus.csv") tagged_corpus = corpus_pos_tagger(corpus) @@ -130,3 +140,4 @@ if __name__ == '__main__': for vector in corpus_vector: print(vector) + diff --git a/training_testing.py b/training_testing.py index 01af8fb15083597c56bf8d944eb43e663146281c..8d5625e2c5be86e78837d462fa190711d51573a4 100644 --- a/training_testing.py +++ b/training_testing.py @@ -5,6 +5,7 @@ import ngram_feature import numpy as np from sklearn import svm from sklearn.tree import DecisionTreeClassifier +import postagger def create_vector(corpus_instance, vocabulary=None): @@ -14,6 +15,7 @@ def create_vector(corpus_instance, vocabulary=None): single corpus instance) """ f1 = ngram_feature.extract(corpus_instance, vocabulary) + # f2 = postagger.to_bigram_vector(corpus_instance, pos_vocab) f4 = sent_rating_feature.extract(corpus_instance) return np.concatenate((f1,f4)) @@ -31,6 +33,10 @@ if __name__ == '__main__': # vocabularies unigram_vocab = ngram_feature.get_vocabulary(train_set, 1) bigram_vocab = ngram_feature.get_vocabulary(train_set, 2) + + # pos_bags + # bigram_pos_vocab = postagger.get_pos_vocabulary(train_set) (corresponds to corpus in postagger.py) + # inputs: train_inputs = [create_vector(el, unigram_vocab)