Commit cba928c1 authored by blunck

Training script and its outputs now a little friendlier for tuning :) Added option for reusing extracted features.
parent 760ca3e3
@@ -9,12 +9,12 @@ def extract(corpus_instance):
     Returns numpy array of size 1.
     """
     review = corpus_instance["REVIEW"]
-    stars = corpus_instance["STARS"]
+    stars = float(corpus_instance["STARS"])
     #sent = get_sent_vader(review)
     sent = get_sent_textblob(review)
-    if (sent <= 0.0 and stars == "5.0") or (sent > 0.0 and stars == "1.0"):
+    if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
         return np.array([1])
     else:
         return np.array([0])
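The new rule treats any mismatch between surface sentiment and a non-neutral star rating as a potential irony signal, instead of only matching the literal strings "5.0" and "1.0". A minimal standalone sketch of the new behaviour, using TextBlob directly in place of the project's get_sent_textblob helper (which is not shown in this diff):

    import numpy as np
    from textblob import TextBlob

    def sentiment_stars_mismatch(review, stars_raw):
        """Return [1] when surface sentiment contradicts the star rating."""
        stars = float(stars_raw)                    # "1.0" -> 1.0
        sent = TextBlob(review).sentiment.polarity  # polarity in [-1.0, 1.0]
        if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
            return np.array([1])
        return np.array([0])

    # Gushing text next to a 1-star rating -> the mismatch feature fires.
    print(sentiment_stars_mismatch("Best purchase ever, truly wonderful.", "1.0"))  # [1]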
@@ -10,6 +10,36 @@ from sklearn import tree
 from sklearn import naive_bayes
 from sklearn import linear_model
 from sklearn.model_selection import cross_val_score
+import time
+import pickle
+
+
+def extract_features(train_set, test_set):
+    # vocabularies
+    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
+    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
+    trigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 3)
+    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
+    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+
+    # inputs:
+    print("------Feature Extraction------\n")
+    train_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                    for el in train_set]  # 1000 vectors
+    test_inputs = [create_vector(el, unigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
+                   for el in test_set]  # 254 vectors
+
+    # stats
+    print("Number of train samples: {}".format(len(train_inputs)))
+    print("Unigram vocab size: {}".format(len(unigram_vocab)))
+    print("Bigram vocab size: {}".format(len(bigram_vocab)))
+    print("Trigram vocab size: {}".format(len(trigram_vocab)))
+    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
+    print("SP-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+    print("Total features per train sample: {}".format(len(train_inputs[0])))
+    print("---> Duration Feature Extraction: {} sec.\n".format(int(time.time()-start_time)))
+
+    return train_inputs, test_inputs
+
+
 def create_vector(corpus_instance, vocabulary=None, pos_vocabulary=None, surface_vocabulary=None):
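ngram_feature.get_vocabulary and pos_feature.get_pos_vocabulary live in project modules this diff does not show. Judging only by the call sites above (corpus slice, field name, n-gram order), a hypothetical minimal get_vocabulary could look like this sketch, which is an assumption rather than the project's actual code:

    def get_vocabulary(corpus, field, n):
        # Hypothetical sketch: collect every distinct token n-gram
        # found in instance[field] across the corpus.
        vocab = set()
        for instance in corpus:
            tokens = instance[field].split()
            for i in range(len(tokens) - n + 1):
                vocab.add(tuple(tokens[i:i + n]))
        return sorted(vocab)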
@@ -33,16 +63,20 @@ def train_multiple(classifiers, train_input, train_labels):
         classifier.fit(train_input, train_labels)


-def score_multiple(classifiers, train_input, train_labels):
-    scores = []
+def validate_multiple(classifiers, train_input, train_labels):
+    print("\n------Cross Validation------")
     for classifier in classifiers:
+        print("\n{}".format(classifier))
         accuracy = cross_val_score(classifier, train_input, train_labels, cv=5, scoring='accuracy').mean()
         f1 = cross_val_score(classifier, train_input, train_labels, cv=5, scoring='f1').mean()
-        scores.append(accuracy, f1)
-    return scores
+        print("\nAccuracy: {}, F1-Score: {}\n".format(accuracy, f1))

 if __name__ == '__main__':
+    start_time = time.time()
+
     corpus = corpus.read_corpus("corpus_shuffled.csv")
     extended_corpus = surface_patterns.extract_surface_patterns(corpus, 1000)
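validate_multiple replaces the old score_multiple: instead of collecting (accuracy, f1) pairs, it prints the mean 5-fold scores per classifier as it goes, which is handier when tuning from the console. The same pattern, self-contained on synthetic data:

    from sklearn import svm, tree
    from sklearn.datasets import make_classification
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=200, n_features=20, random_state=0)

    for clf in [svm.SVC(kernel='linear'), tree.DecisionTreeClassifier()]:
        acc = cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
        f1 = cross_val_score(clf, X, y, cv=5, scoring='f1').mean()
        print("{}: accuracy {:.3f}, F1 {:.3f}".format(type(clf).__name__, acc, f1))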
@@ -51,64 +85,48 @@ if __name__ == '__main__':
     train_set = extended_corpus[:1000]
     test_set = extended_corpus[1000:]

-    # vocabularies
-    unigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 1)
-    bigram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', 2)
-    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
-    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', 2)
+    train_inputs, train_labels = [], []
+    test_inputs, test_labels = [], []

-    # inputs:
-    train_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-                    for el in train_set]  # 1000 vectors
-    #test_inputs = [create_vector(el, bigram_vocab, pos_bigram_vocab, surface_bigram_vocab)
-    #               for el in test_set]  # 254 vectors
+    re_extract = True  # change to False if features are unchanged since previous run

-    # labels
-    train_labels = np.array([int(el['LABEL']) for el in train_set])  # 1000 labels
-    test_labels = np.array([int(el['LABEL']) for el in test_set])  # 254 labels
+    if re_extract == True:

-    print("Number of train samples: {}".format(len(train_inputs)))
-    print("Number of features per train sample: {}".format(len(train_inputs[0])))
-    print("Unigram vocab size: {}".format(len(unigram_vocab)))
-    print("Bigram vocab size: {}".format(len(bigram_vocab)))
-    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
-    print("Surface Patterns-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
+        # inputs (x)
+        train_inputs, test_inputs = extract_features(train_set, test_set)

-    # TODO Pickle/outsource
+        # labels (y)
+        train_labels = np.array([int(el['LABEL']) for el in train_set])  # 1000 labels
+        test_labels = np.array([int(el['LABEL']) for el in test_set])  # 254 labels

-    # ML
+        # save to pickle
+        pickle.dump([train_inputs, train_labels, test_inputs, test_labels], open("vectors.pickle", "wb"))
+    else:
+        # load from pickle
+        v = pickle.load(open("vectors.pickle", "rb"))
+        train_inputs, train_labels = v[0], v[1]
+        test_inputs, test_labels = v[2], v[3]
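The dump/load above hands pickle open file objects that are never explicitly closed; CPython's garbage collector usually cleans up, but a context-manager variant is more robust. A sketch of an equivalent (the helper names are made up, not part of this commit):

    import pickle

    def save_vectors(path, train_inputs, train_labels, test_inputs, test_labels):
        # Hypothetical helper: same payload as above, file closed reliably.
        with open(path, "wb") as f:
            pickle.dump([train_inputs, train_labels, test_inputs, test_labels], f)

    def load_vectors(path):
        with open(path, "rb") as f:
            train_inputs, train_labels, test_inputs, test_labels = pickle.load(f)
        return train_inputs, train_labels, test_inputs, test_labels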
+    # Machine Learning
     # init
-    svm_clf = svm.SVC(C=200.0, kernel='linear')  # large C: smaller-margin hyperplane
+    svm_clf = svm.SVC(C=500.0, kernel='linear')  # large C: smaller-margin hyperplane
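C was raised from 200 to 500 by editing the constant; since this commit is about making the script friendlier for tuning, a grid search would automate that step. A sketch with scikit-learn's GridSearchCV, not part of this commit, with illustrative grid values:

    from sklearn import svm
    from sklearn.model_selection import GridSearchCV

    param_grid = {'C': [1.0, 10.0, 100.0, 200.0, 500.0]}  # illustrative values
    search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=5, scoring='f1')
    search.fit(train_inputs, train_labels)  # the arrays built above
    print("Best C: {}, F1: {:.3f}".format(search.best_params_['C'], search.best_score_))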
     tree_clf = tree.DecisionTreeClassifier()
     nb_clf = naive_bayes.MultinomialNB()
     lr_clf = linear_model.LogisticRegression()

     # training
-    train_multiple([svm_clf, tree_clf, nb_clf, lr_clf], train_inputs, train_labels)
+    train_multiple([svm_clf, tree_clf], train_inputs, train_labels)  # , nb_clf, lr_clf
+    print("---> Duration Training: {} sec.\n".format(int(time.time()-start_time)))
     # validation
-    svm_acc = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    tree_acc = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    nb_acc = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    lr_acc = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='accuracy').mean()
-    svm_f1 = cross_val_score(svm_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    tree_f1 = cross_val_score(tree_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    nb_f1 = cross_val_score(nb_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-    lr_f1 = cross_val_score(lr_clf, train_inputs, train_labels, cv=5, scoring='f1').mean()
-
-    print("\n--Cross Validation Scores-- ")
-    print("\nSVM: Accuracy: {}, F1-Score: {}".format(svm_acc, svm_f1))
-    print("\nTree: Accuracy: {}, F1-Score: {}".format(tree_acc, tree_f1))
-    print("\nN. Bayes: Accuracy: {}, F1-Score: {}".format(nb_acc, nb_f1))
-    print("\nLog. Regression: Accuracy: {}, F1-Score: {}".format(lr_acc, lr_f1))
+    validate_multiple([svm_clf, tree_clf], train_inputs, train_labels)  # , nb_clf, lr_clf
+    print("---> Duration CV: {} sec.".format(int(time.time()-start_time)))
     # testing
     # print("\nSVM: Score on test Data:")
     # print(svm_clf.score(test_inputs, test_labels))
     # print("\nDTree: Score on test Data:")
     # print(tree_clf.score(test_inputs, test_labels))
     # predictions = svm_clf.predict(train_inputs)
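The held-out evaluation is still commented out. Once tuning is frozen it might be run along these lines (a sketch reusing the names above; classification_report is an extra, not something the script imports):

    from sklearn.metrics import classification_report

    for name, clf in [("SVM", svm_clf), ("DTree", tree_clf)]:
        print("\n{}: Score on test data: {}".format(name, clf.score(test_inputs, test_labels)))
        print(classification_report(test_labels, clf.predict(test_inputs)))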