From cba928c10e92998f34da5b506e2e93ab9543209f Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <blunck@cl.uni-heidelberg.de>
Date: Tue, 16 Jan 2018 01:02:37 +0100
Subject: [PATCH] Training script and its outputs now a little friendlier for
 tuning :)

Added option for reusing extracted features.
---
 sent_rating_feature.py |   4 +-
 training_testing.py    | 110 ++++++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/sent_rating_feature.py b/sent_rating_feature.py
index df297d0..16c68cf 100644
--- a/sent_rating_feature.py
+++ b/sent_rating_feature.py
@@ -9,12 +9,12 @@ def extract(corpus_instance):
     Returns numpy array of size 1.
     """
     review = corpus_instance["REVIEW"]
-    stars = corpus_instance["STARS"]
+    stars = float(corpus_instance["STARS"])
 
     #sent = get_sent_vader(review)
     sent = get_sent_textblob(review)
 
-    if (sent <= 0.0 and stars == "5.0") or (sent > 0.0 and stars == "1.0"):
+    if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
         return np.array([1])
     else:
         return np.array([0])
diff --git a/training_testing.py b/training_testing.py
index 35adee9..c3c0065 100644
--- a/training_testing.py
+++ b/training_testing.py
@@ -10,6 +10,36 @@ from sklearn import tree
 from sklearn import naive_bayes
 from sklearn import linear_model
 from sklearn.model_selection import cross_val_score
+import time
+import pickle
+
+def extract_features(train_set, test_set):
+
+    # vocabularies
+    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
+    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
+    trigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 3)
+    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
+    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+
+    # inputs:
+    print("------Feature Extraction------\n")
+    train_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                    for el in train_set] # 1000 vectors
+    test_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                   for el in test_set] # 254 vectors
+
+    # stats
+    print("Number of train samples: {}".format(len(train_inputs)))
+    print("Unigram vocab size: {}".format(len(unigram_vocab)))
+    print("Bigram vocab size: {}".format(len(bigram_vocab)))
+    print("Trigram vocab size: {}".format(len(trigram_vocab)))
+    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
+    print("SP-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+    print("Total features per train sample: {}".format(len(train_inputs[0])))
+    print("---> Duration Feature Extraction: {} sec.\n".format(int(time.time()-start_time))) # start_time is set in __main__
+
+    return train_inputs, test_inputs
 
 
 def create_vector(corpus_instance, vocabulary=None, pos_vocabulary=None, surface_vocabulary=None):
@@ -33,16 +63,20 @@ def train_multiple(classifiers, train_input, train_labels):
         classifier.fit(train_input, train_labels)
 
 
-def score_multiple(classifiers, train_input, train_labels):
-    scores = []
+def validate_multiple(classifiers, train_inputs, train_labels):
+    print("\n------Cross Validation------")
+
     for classifier in classifiers:
+        print("\n{}".format(classifier))
+
         accuracy = cross_val_score(classifier, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
         f1 = cross_val_score(classifier, train_inputs, train_labels, cv=5, scoring='f1').mean()
-        scores.append(accuracy, f1)
-    return scores
+
+        print("\nAccuracy: {}, F1-Score: {}\n".format(accuracy, f1))
 
 
 if __name__ == '__main__':
+    start_time = time.time()
     corpus = corpus.read_corpus("corpus_shuffled.csv")
 
     extended_corpus = surface_patterns.extract_surface_patterns(corpus, 1000)
 
@@ -51,64 +85,48 @@ if __name__ == '__main__':
     train_set = extended_corpus[:1000]
     test_set = extended_corpus[1000:]
 
-    # vocabularies
-    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
-    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
-    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
-    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+    train_inputs, train_labels = [], []
+    test_inputs, test_labels = [], []
 
-    # inputs:
-    train_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-                    for el in train_set] # 1000 vectors
-    #test_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-    #               for el in test_set] # 254 vectors
+    re_extract = True # change to False if features are unchanged since the previous run
 
-    # labels
-    train_labels = np.array([int(el['LABEL']) for el in train_set]) # 1000 labels
-    test_labels = np.array([int(el['LABEL']) for el in test_set]) # 254 labels
+    if re_extract:
 
-    print("Number of train samples: {}".format(len(train_inputs)))
-    print("Number of features per train sample: {}".format(len(train_inputs[0])))
-    print("Unigram vocab size: {}".format(len(unigram_vocab)))
-    print("Bigram vocab size: {}".format(len(bigram_vocab)))
-    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
-    print("Surface Patterns-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+        # inputs (x)
+        train_inputs, test_inputs = extract_features(train_set, test_set)
 
-    # TODO Pickle/outsource
+        # labels (y)
+        train_labels = np.array([int(el['LABEL']) for el in train_set]) # 1000 labels
+        test_labels = np.array([int(el['LABEL']) for el in test_set]) # 254 labels
 
-    # ML
+        # save to pickle
+        pickle.dump([train_inputs, train_labels, test_inputs, test_labels], open("vectors.pickle", "wb"))
+
+    else:
+        # load from pickle (requires a previous run with re_extract = True)
+        v = pickle.load(open("vectors.pickle", "rb"))
+        train_inputs, train_labels = v[0], v[1]
+        test_inputs, test_labels = v[2], v[3]
+
+
+    # Machine Learning
 
     # init
-    svm_clf = svm.SVC(C=200.0, kernel='linear') # large C: smaller-margin hyperplane
+    svm_clf = svm.SVC(C=500.0, kernel='linear') # large C: smaller-margin hyperplane
     tree_clf = tree.DecisionTreeClassifier()
     nb_clf = naive_bayes.MultinomialNB()
     lr_clf = linear_model.LogisticRegression()
 
     # training
-    train_multiple([svm_clf, tree_clf, nb_clf, lr_clf], train_inputs, train_labels)
+    train_multiple([svm_clf, tree_clf], train_inputs, train_labels) #, nb_clf, lr_clf
+    print("---> Duration Training: {} sec.\n".format(int(time.time()-start_time)))
 
     # validation
-    svm_acc = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    tree_acc = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    nb_acc = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    lr_acc = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-
-    svm_f1 = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    tree_f1 = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    nb_f1 = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    lr_f1 = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-
-    print("\n--Cross Validation Scores-- ")
-    print("\nSVM: Accuracy: {}, F1-Score: {}".format(svm_acc, svm_f1))
-    print("\nTree: Accuracy: {}, F1-Score: {}".format(tree_acc, tree_f1))
-    print("\nN. Bayes: Accuracy: {}, F1-Score: {}".format(nb_acc, nb_f1))
-    print("\nLog. Regression: Accuracy: {}, F1-Score: {}".format(lr_acc, lr_f1))
+    validate_multiple([svm_clf, tree_clf], train_inputs, train_labels) #, nb_clf, lr_clf
+    print("---> Duration CV: {} sec.".format(int(time.time()-start_time)))
 
+    # testing
     # print("\nSVM: Score on test Data:")
     # print(svm_clf.score(test_inputs, test_labels))
-
-    # print("\nDTree: Score on test Data:")
-    # print(tree_clf.score(test_inputs, test_labels))
-
     # predictions = svm_classifier.predict(train_inputs)
 
-- 
GitLab
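
Not part of the patch, but for reviewers who want to exercise the new caching flow: a minimal sketch of how the re_extract if/else in training_testing.py could be folded into one helper. It assumes the patched module's extract_features(), the corpus 'LABEL' field, and the "vectors.pickle" cache file as above; the name load_or_extract and the cache_path/re_extract parameters are illustrative, not in the patch.

import os
import pickle

import numpy as np

def load_or_extract(train_set, test_set, cache_path="vectors.pickle", re_extract=True):
    """Return (train_inputs, train_labels, test_inputs, test_labels);
    re-extract features only when forced or when no cached vectors exist."""
    if not re_extract and os.path.exists(cache_path):
        # reuse the vectors pickled by a previous run
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    # full extraction path; extract_features() is the helper this patch
    # introduces in training_testing.py (assumed to be in scope)
    train_inputs, test_inputs = extract_features(train_set, test_set)
    train_labels = np.array([int(el['LABEL']) for el in train_set])
    test_labels = np.array([int(el['LABEL']) for el in test_set])

    with open(cache_path, "wb") as f:
        # with-block closes the file promptly instead of relying on GC
        pickle.dump((train_inputs, train_labels, test_inputs, test_labels), f)
    return train_inputs, train_labels, test_inputs, test_labels

The os.path.exists() guard is the one behavioral difference from the patch: with the hardcoded re_extract flag, a first run with re_extract = False would crash on a missing vectors.pickle, whereas the helper falls back to extraction and writes the cache.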