Commit 65340bcf authored by Maximilian Blunck

Added feature programs F1 and F4 and training/testing script

parent b770e69f
# ngram_feature.py -- feature F1: n-gram count features
from sklearn.feature_extraction.text import CountVectorizer


def extract(corpus_instance, vocabulary):
    """
    Extracts n-gram features from a single corpus instance.
    n depends on the vocabulary, which needs to be built with get_vocabulary.
    Returns a numpy array the size of the vocabulary.
    """
    vectorizer = CountVectorizer(vocabulary=vocabulary)
    vector = vectorizer.transform([corpus_instance['REVIEW']])  # transform expects a list of documents
    return vector.toarray()[0]


def get_vocabulary(corpus, n):
    """
    Creates an n-gram vocabulary from the given corpus.
    """
    all_reviews = []
    for line in corpus:
        all_reviews.append(line['REVIEW'])

    vectorizer = CountVectorizer(ngram_range=(n, n))
    vectorizer.fit(all_reviews)
    return vectorizer.vocabulary_
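

# Minimal usage sketch, assuming a corpus of dicts with a 'REVIEW' key (as in
# the functions above); the toy reviews are illustrative only, not taken from
# the actual corpus.
if __name__ == '__main__':
    toy_corpus = [{'REVIEW': "great book, loved it"},
                  {'REVIEW': "terrible book, fell asleep"}]
    vocab = get_vocabulary(toy_corpus, 1)      # unigram vocabulary: token -> index
    print(vocab)
    print(extract(toy_corpus[0], vocab))       # count vector of length len(vocab)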
# sent_rating_feature.py -- feature F4: sentiment/star-rating contrast
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob


def extract(corpus_instance):
    """
    Extracts a single "contrast" feature from a single corpus instance:
    the feature fires when the review's sentiment polarity contradicts its
    star rating (non-positive polarity with 5.0 stars, or positive polarity
    with 1.0 stars). Returns a numpy array of size 1.
    """
    review = corpus_instance["REVIEW"]
    stars = corpus_instance["STARS"]
    # sent = get_sent_vader(review)
    sent = get_sent_textblob(review)

    if (sent <= 0.0 and stars == "5.0") or (sent > 0.0 and stars == "1.0"):
        return np.array([1])
    else:
        return np.array([0])


def get_sent_vader(string):
    analyser = SentimentIntensityAnalyzer()
    sent = analyser.polarity_scores(string)
    return sent['compound']


def get_sent_textblob(string):
    blob = TextBlob(string)
    return blob.sentiment.polarity


def confusion_matrix(true_labels, predicted_labels):
    # rows: true label, columns: predicted label
    matrix = np.zeros(shape=(2, 2))
    for true, pred in zip(true_labels, predicted_labels):
        matrix[true][pred] += 1
    return matrix
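

# Minimal usage sketch for the contrast feature and the confusion-matrix
# helper above; the instance and label lists are made-up examples, and the
# actual polarity value depends on TextBlob's analyzer.
if __name__ == '__main__':
    instance = {'REVIEW': "What a wonderful waste of time.", 'STARS': "5.0"}
    print(extract(instance))        # [1] if the polarity contradicts the stars, else [0]

    true = [0, 0, 1, 1]
    pred = [0, 1, 1, 1]
    print(confusion_matrix(true, pred))
    # [[1. 1.]
    #  [0. 2.]]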
# Training/testing script: builds F1 + F4 feature vectors and trains two classifiers
import corpus
from random import shuffle
import sent_rating_feature
import ngram_feature
import numpy as np
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier


def create_vector(corpus_instance, vocabulary=None):
    """
    Calls all feature extraction programs and combines the
    resulting arrays into a single input vector (for a
    single corpus instance).
    """
    f1 = ngram_feature.extract(corpus_instance, vocabulary)
    f4 = sent_rating_feature.extract(corpus_instance)
    return np.concatenate((f1, f4))


if __name__ == '__main__':
    corpus = corpus.read_corpus("corpus.csv")

    # shuffle & split data set 80:20
    shuffle(corpus)
    train_set = corpus[:1000]
    test_set = corpus[1000:]

    # vocabularies
    unigram_vocab = ngram_feature.get_vocabulary(train_set, 1)
    bigram_vocab = ngram_feature.get_vocabulary(train_set, 2)

    # inputs
    train_inputs = [create_vector(el, unigram_vocab)
                    for el in train_set]    # 1000 vectors
    test_inputs = [create_vector(el, unigram_vocab)
                   for el in test_set]      # 254 vectors

    # labels
    train_labels = np.array([int(el['LABEL']) for el in train_set])  # 1000 labels
    test_labels = np.array([int(el['LABEL']) for el in test_set])    # 254 labels

    print("Number of train samples: {}".format(len(train_inputs)))
    print("Number of features per train sample: {}".format(len(train_inputs[0])))
    print("Unigram vocab size: {}".format(len(unigram_vocab)))
    print("Bigram vocab size: {}".format(len(bigram_vocab)))

    # training

    # SVM
    svm_classifier = svm.SVC()
    svm_classifier.fit(train_inputs, train_labels)
    predictions = svm_classifier.predict(test_inputs)
    print(svm_classifier.score(test_inputs, test_labels))
    print("Predictions: \n {}".format(predictions))
    print("Targets: \n {}".format(test_labels))

    # Decision tree
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(train_inputs, train_labels)
    predictions = tree_clf.predict(test_inputs)
    print(tree_clf.score(test_inputs, test_labels))
    print("Predictions: \n {}".format(predictions))
    print("Targets: \n {}".format(test_labels))