From cba928c10e92998f34da5b506e2e93ab9543209f Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <blunck@cl.uni-heidelberg.de>
Date: Tue, 16 Jan 2018 01:02:37 +0100
Subject: [PATCH] Training script and its outputs now a little friendlier for
 tuning :)

Added option for reusing extracted features.
---
 sent_rating_feature.py |   4 +-
 training_testing.py    | 110 ++++++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/sent_rating_feature.py b/sent_rating_feature.py
index df297d0..16c68cf 100644
--- a/sent_rating_feature.py
+++ b/sent_rating_feature.py
@@ -9,12 +9,12 @@ def extract(corpus_instance):
     Returns numpy array of size 1.
     """
     review = corpus_instance["REVIEW"]
-    stars = corpus_instance["STARS"]
+    stars = float(corpus_instance["STARS"])
 
     #sent = get_sent_vader(review)
     sent = get_sent_textblob(review)
 
-    if (sent <= 0.0 and stars == "5.0") or (sent > 0.0 and stars == "1.0"):
+    if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
         return np.array([1])
     else:
         return np.array([0])
diff --git a/training_testing.py b/training_testing.py
index 35adee9..c3c0065 100644
--- a/training_testing.py
+++ b/training_testing.py
@@ -10,6 +10,36 @@ from sklearn import tree
 from sklearn import naive_bayes
 from sklearn import linear_model
 from sklearn.model_selection import cross_val_score
+import time
+import pickle
+
+def extract_features(train_set, test_set):
+
+    # vocabularies
+    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
+    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
+    trigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 3)
+    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
+    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+
+    # inputs:
+    print("------Feature Extraction------\n")
+    train_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                    for el in train_set] # 1000 vectors
+    test_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                   for el in test_set] # 254 vectors
+
+    # stats
+    print("Number of train samples: {}".format(len(train_inputs)))
+    print("Unigram vocab size: {}".format(len(unigram_vocab)))
+    print("Bigram vocab size: {}".format(len(bigram_vocab)))
+    print("Trigram vocab size: {}".format(len(trigram_vocab)))
+    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
+    print("SP-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+    print("Total features per train sample: {}".format(len(train_inputs[0])))
+    print("---> Duration Feature Extraction: {} sec.\n".format(int(time.time()-start_time))) # start_time is set in __main__
+
+    return train_inputs, test_inputs
 
 
 def create_vector(corpus_instance, vocabulary=None, pos_vocabulary=None, surface_vocabulary=None):
@@ -33,16 +63,20 @@ def train_multiple(classifiers, train_input, train_labels):
         classifier.fit(train_input, train_labels)
 
 
-def score_multiple(classifiers, train_input, train_labels):
-    scores = []
+def validate_multiple(classifiers, train_inputs, train_labels):
+    print("\n------Cross Validation------")
+
     for classifier in classifiers:
+        print("\n{}".format(classifier))
+
         accuracy = cross_val_score(classifier, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
         f1 = cross_val_score(classifier, train_inputs, train_labels, cv=5, scoring='f1').mean()
-        scores.append(accuracy, f1)
-    return scores
+
+        print("\nAccuracy: {}, F1-Score: {}\n".format(accuracy, f1))
 
 
 if __name__ == '__main__':
+    start_time = time.time()
     corpus = corpus.read_corpus("corpus_shuffled.csv")
 
     extended_corpus = surface_patterns.extract_surface_patterns(corpus, 1000)
 
@@ -51,64 +85,48 @@ if __name__ == '__main__':
     train_set = extended_corpus[:1000]
     test_set = extended_corpus[1000:]
 
-    # vocabularies
-    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
-    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
-    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
-    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+    train_inputs, train_labels = [], []
+    test_inputs, test_labels = [], []
 
-    # inputs:
-    train_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-                    for el in train_set] # 1000 vectors
-    #test_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-    #               for el in test_set] # 254 vectors
+    re_extract = True # change to False if features are unchanged since the previous run
 
-    # labels
-    train_labels = np.array([int(el['LABEL']) for el in train_set]) # 1000 labels
-    test_labels = np.array([int(el['LABEL']) for el in test_set]) # 254 labels
+    if re_extract:
 
-    print("Number of train samples: {}".format(len(train_inputs)))
-    print("Number of features per train sample: {}".format(len(train_inputs[0])))
-    print("Unigram vocab size: {}".format(len(unigram_vocab)))
-    print("Bigram vocab size: {}".format(len(bigram_vocab)))
-    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
-    print("Surface Patterns-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+        # inputs (x)
+        train_inputs, test_inputs = extract_features(train_set, test_set)
 
-    # TODO Pickle/outsource
+        # labels (y)
+        train_labels = np.array([int(el['LABEL']) for el in train_set]) # 1000 labels
+        test_labels = np.array([int(el['LABEL']) for el in test_set]) # 254 labels
 
-    # ML
+        # save to pickle
+        pickle.dump([train_inputs, train_labels, test_inputs, test_labels], open("vectors.pickle", "wb"))
+
+    else:
+        # load from pickle (requires a previous run with re_extract = True)
+        v = pickle.load(open("vectors.pickle", "rb"))
+        train_inputs, train_labels = v[0], v[1]
+        test_inputs, test_labels = v[2], v[3]
+
+
+    # Machine Learning
 
     # init
-    svm_clf = svm.SVC(C=200.0, kernel='linear') # large C: smaller-margin hyperplane
+    svm_clf = svm.SVC(C=500.0, kernel='linear') # large C: smaller-margin hyperplane
     tree_clf = tree.DecisionTreeClassifier()
     nb_clf = naive_bayes.MultinomialNB()
     lr_clf = linear_model.LogisticRegression()
 
     # training
-    train_multiple([svm_clf, tree_clf, nb_clf, lr_clf], train_inputs, train_labels)
+    train_multiple([svm_clf, tree_clf], train_inputs, train_labels) #, nb_clf, lr_clf
+    print("---> Duration Training: {} sec.\n".format(int(time.time()-start_time)))
 
     # validation
-    svm_acc = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    tree_acc = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    nb_acc = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    lr_acc = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-
-    svm_f1 = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    tree_f1 = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    nb_f1 = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    lr_f1 = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-
-    print("\n--Cross Validation Scores-- ")
-    print("\nSVM: Accuracy: {}, F1-Score: {}".format(svm_acc, svm_f1))
-    print("\nTree: Accuracy: {}, F1-Score: {}".format(tree_acc, tree_f1))
-    print("\nN. Bayes: Accuracy: {}, F1-Score: {}".format(nb_acc, nb_f1))
-    print("\nLog. Regression: Accuracy: {}, F1-Score: {}".format(lr_acc, lr_f1))
+    validate_multiple([svm_clf, tree_clf], train_inputs, train_labels) #, nb_clf, lr_clf
+    print("---> Duration CV: {} sec.".format(int(time.time()-start_time)))
 
+    # testing
     # print("\nSVM: Score on test Data:")
     # print(svm_clf.score(test_inputs, test_labels))
-
-    # print("\nDTree: Score on test Data:")
-    # print(tree_clf.score(test_inputs, test_labels))
-
     # predictions = svm_classifier.predict(train_inputs)
 
-- 
GitLab
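
Not part of the patch, but for reviewers who want to exercise the new caching flow: a minimal sketch of how the re_extract if/else in training_testing.py could be folded into one helper. It assumes the patched module's extract_features(), the corpus 'LABEL' field, and the "vectors.pickle" cache file as above; the name load_or_extract and the cache_path/re_extract parameters are illustrative, not in the patch.

import os
import pickle

import numpy as np

def load_or_extract(train_set, test_set, cache_path="vectors.pickle", re_extract=True):
    """Return (train_inputs, train_labels, test_inputs, test_labels);
    re-extract features only when forced or when no cached vectors exist."""
    if not re_extract and os.path.exists(cache_path):
        # reuse the vectors pickled by a previous run
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    # full extraction path; extract_features() is the helper this patch
    # introduces in training_testing.py (assumed to be in scope)
    train_inputs, test_inputs = extract_features(train_set, test_set)
    train_labels = np.array([int(el['LABEL']) for el in train_set])
    test_labels = np.array([int(el['LABEL']) for el in test_set])

    with open(cache_path, "wb") as f:
        # with-block closes the file promptly instead of relying on GC
        pickle.dump((train_inputs, train_labels, test_inputs, test_labels), f)
    return train_inputs, train_labels, test_inputs, test_labels

The os.path.exists() guard is the one behavioral difference from the patch: with the hardcoded re_extract flag, a first run with re_extract = False would crash on a missing vectors.pickle, whereas the helper falls back to extraction and writes the cache.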