From 65340bcf18dd6f082b3709f2fe0870b4ca7fd5bc Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <max@Maximilians-MacBook-Air.local>
Date: Thu, 4 Jan 2018 19:56:14 +0100
Subject: [PATCH] Added feature programms F1 and F4 and training/testing script

---
 ngram_feature.py       | 45 +++++++++++++++++++++++
 sent_rating_feature.py | 59 +++++++++++++++++++++++++++++++
 training_testing.py    | 81 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 ngram_feature.py
 create mode 100644 sent_rating_feature.py
 create mode 100644 training_testing.py

diff --git a/ngram_feature.py b/ngram_feature.py
new file mode 100644
index 0000000..1de7b4a
--- /dev/null
+++ b/ngram_feature.py
@@ -0,0 +1,45 @@
+from sklearn.feature_extraction.text import CountVectorizer
+
+
+def _ngram_range(vocabulary):
+    """
+    Returns the (min_n, max_n) n-gram orders present in vocabulary.
+    CountVectorizer joins the tokens of an n-gram with single spaces,
+    so the order of a term is its number of spaces plus one.
+    """
+    if not vocabulary:
+        # Let CountVectorizer report the missing/empty vocabulary itself.
+        return (1, 1)
+
+    orders = [term.count(' ') + 1 for term in vocabulary]
+    return (min(orders), max(orders))
+
+
+def extract(corpus_instance, vocabulary):
+    """
+    Extracts n-gram features from a single corpus instance.
+    n depends on vocabulary, which needs to be extracted using get_vocabulary.
+    Returns numpy array of size of vocabulary
+    """
+    # A fixed vocabulary does not change how the text is tokenised: with the
+    # default ngram_range=(1, 1) only unigrams are generated, so every
+    # bigram (or longer) feature would always be counted as zero.
+    vectorizer = CountVectorizer(vocabulary=vocabulary,
+                                 ngram_range=_ngram_range(vocabulary))
+    vector = vectorizer.transform([corpus_instance['REVIEW']])  # takes a list
+
+    return vector.toarray()[0]
+
+
+def get_vocabulary(corpus, n):
+    """
+    Creates vocabulary based on given corpus.
+    Returns dict mapping each n-gram found in the 'REVIEW' texts to its
+    feature index.
+    """
+    all_reviews = [line['REVIEW'] for line in corpus]
+
+    vectorizer = CountVectorizer(ngram_range=(n, n))
+    vectorizer.fit(all_reviews)
+
+    return vectorizer.vocabulary_
diff --git a/sent_rating_feature.py b/sent_rating_feature.py
new file mode 100644
index 0000000..df297d0
--- /dev/null
+++ b/sent_rating_feature.py
@@ -0,0 +1,59 @@
+import numpy as np
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+
+# Created on first use by get_sent_vader and then reused.
+_vader_analyser = None
+
+
+def extract(corpus_instance):
+    """
+    Extracts single "contrast" feature from a single corpus instance.
+    The feature is 1 if the sentiment of the review text contradicts the
+    star rating (non-positive text with "5.0" stars or positive text with
+    "1.0" stars), otherwise 0.
+    Returns numpy array of size 1.
+    """
+    review = corpus_instance["REVIEW"]
+    stars = corpus_instance["STARS"]
+
+    # get_sent_vader(review) can be used here as an alternative scorer
+    sent = get_sent_textblob(review)
+
+    contrast = ((sent <= 0.0 and stars == "5.0")
+                or (sent > 0.0 and stars == "1.0"))
+
+    return np.array([int(contrast)])
+
+
+def get_sent_vader(string):
+    """
+    Returns the VADER 'compound' sentiment score of string.
+    """
+    global _vader_analyser
+    if _vader_analyser is None:
+        # The analyser does not depend on the input, so build it once
+        # instead of once per review.
+        _vader_analyser = SentimentIntensityAnalyzer()
+
+    return _vader_analyser.polarity_scores(string)['compound']
+
+
+def get_sent_textblob(string):
+    """
+    Returns the TextBlob sentiment polarity of string.
+    """
+    return TextBlob(string).sentiment.polarity
+
+
+def confusion_matrix(true_labels, predicted_labels):
+    """
+    Counts (true label, predicted label) pairs for binary labels 0/1.
+    Returns 2x2 numpy array; rows are true labels, columns predictions.
+    """
+    matrix = np.zeros(shape=(2, 2))
+
+    for true, pred in zip(true_labels, predicted_labels):
+        matrix[true][pred] += 1
+
+    return matrix
diff --git a/training_testing.py b/training_testing.py
new file mode 100644
index 0000000..01af8fb
--- /dev/null
+++ b/training_testing.py
@@ -0,0 +1,81 @@
+import corpus
+from random import shuffle
+import sent_rating_feature
+import ngram_feature
+import numpy as np
+from sklearn import svm
+from sklearn.tree import DecisionTreeClassifier
+
+# Share of the shuffled corpus that is used for training.
+TRAIN_FRACTION = 0.8
+
+
+def create_vector(corpus_instance, vocabulary=None):
+    """
+    Calls all feature extraction programs and combines
+    resulting arrays to a single input vector (for a
+    single corpus instance)
+    """
+    f1 = ngram_feature.extract(corpus_instance, vocabulary)
+    f4 = sent_rating_feature.extract(corpus_instance)
+
+    return np.concatenate((f1, f4))
+
+
+def _train_and_test(classifier, train_inputs, train_labels,
+                    test_inputs, test_labels):
+    """
+    Fits classifier on the train data and prints its score on the
+    test data, its predictions and the target labels.
+    """
+    classifier.fit(train_inputs, train_labels)
+    predictions = classifier.predict(test_inputs)
+    print(classifier.score(test_inputs, test_labels))
+    print("Predictions: \n {}".format(predictions))
+    print("Targets: \n {}".format(test_labels))
+
+
+def main():
+    """
+    Reads corpus.csv, builds the feature vectors and trains/tests
+    an SVM and a decision tree classifier on them.
+    """
+    # Not named "corpus": that would hide the imported corpus module.
+    data = corpus.read_corpus("corpus.csv")
+
+    # shuffle & split data set 80:20
+    shuffle(data)
+    split = int(len(data) * TRAIN_FRACTION)
+    train_set = data[:split]
+    test_set = data[split:]
+
+    # vocabularies (built from the train set only)
+    unigram_vocab = ngram_feature.get_vocabulary(train_set, 1)
+    bigram_vocab = ngram_feature.get_vocabulary(train_set, 2)
+
+    # inputs:
+    train_inputs = [create_vector(el, unigram_vocab) for el in train_set]
+    test_inputs = [create_vector(el, unigram_vocab) for el in test_set]
+
+    # labels
+    train_labels = np.array([int(el['LABEL']) for el in train_set])
+    test_labels = np.array([int(el['LABEL']) for el in test_set])
+
+    print("Number of train samples: {}".format(len(train_inputs)))
+    print("Number of features per train sample: {}".format(len(train_inputs[0])))
+    print("Unigram vocab size: {}".format(len(unigram_vocab)))
+    print("Bigram vocab size: {}".format(len(bigram_vocab)))
+
+    # training
+
+    # SVM
+    _train_and_test(svm.SVC(), train_inputs, train_labels,
+                    test_inputs, test_labels)
+
+    # Trees
+    _train_and_test(DecisionTreeClassifier(), train_inputs, train_labels,
+                    test_inputs, test_labels)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab