From 65340bcf18dd6f082b3709f2fe0870b4ca7fd5bc Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <max@Maximilians-MacBook-Air.local>
Date: Thu, 4 Jan 2018 19:56:14 +0100
Subject: [PATCH] Added feature programs F1 and F4 and a training/testing script

---
 ngram_feature.py       | 26 ++++++++++++++++++++++++++
 sent_rating_feature.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 training_testing.py    | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 ngram_feature.py
 create mode 100644 sent_rating_feature.py
 create mode 100644 training_testing.py

diff --git a/ngram_feature.py b/ngram_feature.py
new file mode 100644
index 0000000..1de7b4a
--- /dev/null
+++ b/ngram_feature.py
@@ -0,0 +1,26 @@
+from sklearn.feature_extraction.text import CountVectorizer
+
+
+def extract(corpus_instance, vocabulary):
+    """
+    Extracts n-gram features from a single corpus instance.
+    n depends on the vocabulary, which needs to be built with get_vocabulary.
+    Returns a numpy array whose length equals the vocabulary size.
+    """
+    vectorizer = CountVectorizer(vocabulary=vocabulary)
+    # transform expects an iterable of documents, hence the one-element list
+    vector = vectorizer.transform([corpus_instance['REVIEW']])
+
+    return vector.toarray()[0]
+
+
+def get_vocabulary(corpus, n):
+    """
+    Creates an n-gram vocabulary based on the given corpus.
+    """
+    all_reviews = [line['REVIEW'] for line in corpus]
+
+    vectorizer = CountVectorizer(ngram_range=(n, n))
+    vectorizer.fit(all_reviews)
+
+    return vectorizer.vocabulary_
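
For orientation, a minimal sketch of how the two functions above fit together
(the toy corpus is hypothetical; instances are dicts with a 'REVIEW' key,
matching the layout training_testing.py below assumes):

    import ngram_feature

    # toy corpus standing in for corpus.read_corpus() output
    toy_corpus = [{'REVIEW': "great phone, loved it"},
                  {'REVIEW': "battery died after two days"}]

    vocab = ngram_feature.get_vocabulary(toy_corpus, 1)    # unigram vocabulary
    vector = ngram_feature.extract(toy_corpus[0], vocab)   # len(vector) == len(vocab)
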
diff --git a/sent_rating_feature.py b/sent_rating_feature.py
new file mode 100644
index 0000000..df297d0
--- /dev/null
+++ b/sent_rating_feature.py
@@ -0,0 +1,43 @@
+import numpy as np
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+
+
+def extract(corpus_instance):
+    """
+    Extracts a single "contrast" feature from a single corpus instance.
+    The feature fires when the review's sentiment polarity contradicts
+    its star rating. Returns a numpy array of size 1.
+    """
+    review = corpus_instance["REVIEW"]
+    stars = corpus_instance["STARS"]
+
+    # sent = get_sent_vader(review)
+    sent = get_sent_textblob(review)
+
+    # non-positive sentiment paired with 5 stars (or positive sentiment
+    # paired with 1 star) signals a sentiment/rating contrast
+    if (sent <= 0.0 and stars == "5.0") or (sent > 0.0 and stars == "1.0"):
+        return np.array([1])
+    else:
+        return np.array([0])
+
+
+def get_sent_vader(string):
+    analyser = SentimentIntensityAnalyzer()
+    sent = analyser.polarity_scores(string)
+    return sent['compound']
+
+
+def get_sent_textblob(string):
+    blob = TextBlob(string)
+    return blob.sentiment.polarity
+
+
+def confusion_matrix(true_labels, predicted_labels):
+    matrix = np.zeros(shape=(2, 2))
+
+    for true, pred in zip(true_labels, predicted_labels):
+        matrix[true, pred] += 1
+
+    return matrix
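
A short sketch of when the contrast feature fires (the instance is made up;
STARS is compared as a string, exactly as in extract above):

    import sent_rating_feature

    # a negatively worded review paired with a 5-star rating
    instance = {'REVIEW': "Absolutely terrible. Broke after one day.",
                'STARS': "5.0"}

    # yields array([1]) whenever TextBlob scores the text non-positive
    print(sent_rating_feature.extract(instance))
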
diff --git a/training_testing.py b/training_testing.py
new file mode 100644
index 0000000..01af8fb
--- /dev/null
+++ b/training_testing.py
@@ -0,0 +1,66 @@
+import corpus
+from random import shuffle
+import sent_rating_feature
+import ngram_feature
+import numpy as np
+from sklearn import svm
+from sklearn.tree import DecisionTreeClassifier
+
+
+def create_vector(corpus_instance, vocabulary=None):
+    """
+    Calls all feature extraction programs and combines the
+    resulting arrays into a single input vector (for a
+    single corpus instance).
+    """
+    f1 = ngram_feature.extract(corpus_instance, vocabulary)
+    f4 = sent_rating_feature.extract(corpus_instance)
+
+    return np.concatenate((f1, f4))
+
+
+if __name__ == '__main__':
+
+    dataset = corpus.read_corpus("corpus.csv")
+
+    # shuffle & split data set 80:20
+    shuffle(dataset)
+    train_set = dataset[:1000]
+    test_set = dataset[1000:]
+
+    # vocabularies (the bigram vocabulary is only reported below, not yet used)
+    unigram_vocab = ngram_feature.get_vocabulary(train_set, 1)
+    bigram_vocab = ngram_feature.get_vocabulary(train_set, 2)
+
+    # inputs
+    train_inputs = [create_vector(el, unigram_vocab)
+                    for el in train_set]  # 1000 vectors
+    test_inputs = [create_vector(el, unigram_vocab)
+                   for el in test_set]  # 254 vectors
+
+    # labels
+    train_labels = np.array([int(el['LABEL']) for el in train_set])  # 1000 labels
+    test_labels = np.array([int(el['LABEL']) for el in test_set])  # 254 labels
+
+    print("Number of train samples:             {}".format(len(train_inputs)))
+    print("Number of features per train sample: {}".format(len(train_inputs[0])))
+    print("Unigram vocab size:                  {}".format(len(unigram_vocab)))
+    print("Bigram vocab size:                   {}".format(len(bigram_vocab)))
+
+    # training & testing
+
+    # SVM
+    svm_classifier = svm.SVC()
+    svm_classifier.fit(train_inputs, train_labels)
+    predictions = svm_classifier.predict(test_inputs)
+    print(svm_classifier.score(test_inputs, test_labels))
+    print("Predictions: \n {}".format(predictions))
+    print("Targets:     \n {}".format(test_labels))
+
+    # Decision tree
+    tree_clf = DecisionTreeClassifier()
+    tree_clf.fit(train_inputs, train_labels)
+    predictions = tree_clf.predict(test_inputs)
+    print(tree_clf.score(test_inputs, test_labels))
+    print("Predictions: \n {}".format(predictions))
+    print("Targets:     \n {}".format(test_labels))
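
The script reports accuracy only; the confusion_matrix helper in
sent_rating_feature.py is defined but never called. A sketch of how it could
be wired in after prediction (assumes the binary 0/1 labels used above):

    from sent_rating_feature import confusion_matrix

    # rows index the true label, columns the predicted label
    cm = confusion_matrix(test_labels, predictions)
    print(cm)
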
-- 
GitLab