Commit bf182284 authored by blunck

Added an option for shuffling the corpus and scoring via cross-validation

parent dde7746a
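The diff below touches two scripts: the corpus converter gains a shuffle_corpus flag that shuffles the collected review files before they are written to CSV, and the classifier script swaps its single held-out score for mean 5-fold cross-validation via scikit-learn's cross_val_score. The following is a minimal, self-contained sketch of those two ideas only; collect_files and the random placeholder data are illustrative stand-ins, not the project's own functions or corpus.

# Sketch of the two pieces this commit adds: optional shuffling of a file
# list before the corpus is written, and mean 5-fold cross-validation scores
# for the SVM and decision-tree models. Placeholder data, not the real corpus.
from random import shuffle

import numpy as np
from sklearn import svm, tree
from sklearn.model_selection import cross_val_score


def collect_files(file_names, shuffle_corpus=False):
    """Hypothetical helper mirroring convert_corpus: optionally shuffle file order."""
    files = list(file_names)
    if shuffle_corpus:
        shuffle(files)  # in-place shuffle, as in the commit
    return files


if __name__ == "__main__":
    print(collect_files(["a.txt", "b.txt", "c.txt"], shuffle_corpus=True))

    # Random feature matrix and labels standing in for the n-gram vectors.
    rng = np.random.default_rng(0)
    X = rng.random((100, 20))
    y = rng.integers(0, 2, size=100)

    for name, clf in [("SVM", svm.SVC(C=200.0)),
                      ("Tree", tree.DecisionTreeClassifier())]:
        # cross_val_score returns one score per fold; report the mean.
        print("{}: {:.3f}".format(name, cross_val_score(clf, X, y, cv=5).mean()))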
@@ -2,6 +2,7 @@ import os, os.path
import re
import csv
from nltk.tokenize import word_tokenize
from random import shuffle
def read_corpus(csv_corpus_path):
"""
@@ -25,7 +26,7 @@ def read_corpus(csv_corpus_path):
return corpus
def convert_corpus(corpus_path, out):
def convert_corpus(corpus_path, out, shuffle_corpus=False):
"""
Takes root path of raw Filatrova corpus and converts it into a single csv file.
@@ -42,6 +43,9 @@ def convert_corpus(corpus_path, out):
if parent == "Regular" or parent == "Ironic":
corpus_files.append(os.path.join(root, name))
if shuffle_corpus == True:
shuffle(corpus_files)
with open(out, 'w') as csvfile:
fieldnames = ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW']
@@ -92,6 +96,7 @@ if __name__ == '__main__':
"""
corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
convert_corpus(corpus_path, "corpus.csv")
convert_corpus(corpus_path, "corpus_shuffled.csv", shuffle_corpus=True)
corpus = read_corpus("corpus.csv")
print("Corpus size: "+str(len(corpus)))
@@ -100,4 +105,3 @@
@@ -4,7 +4,8 @@ import sent_rating_feature
import ngram_feature
import numpy as np
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score
import postagger
@@ -22,12 +23,16 @@ def create_vector(corpus_instance, vocabulary=None, pos_vocabulary=None):
return np.concatenate((f1,f4))
def train_multiple(classifiers, train_input, train_labels):
for classifier in classifiers:
classifier.fit(train_input, train_labels)
if __name__ == '__main__':
corpus = corpus.read_corpus("corpus.csv")
corpus = corpus.read_corpus("corpus_shuffled.csv")
# shuffle & split data set 80:20
shuffle(corpus)
# split data set 80:20
train_set = corpus[:1000]
test_set = corpus[1000:]
@@ -36,8 +41,8 @@ if __name__ == '__main__':
bigram_vocab = ngram_feature.get_vocabulary(train_set, 2)
# pos_bags
bigram_pos_vocab = postagger.get_pos_vocabulary(train_set)
#print(bigram_pos_vocab) #already lookin' good
pos_bigram_vocab = postagger.get_pos_vocabulary(train_set)
#print(pos_bigram_vocab) #already lookin' good
# inputs:
train_inputs = [create_vector(el, unigram_vocab)
@@ -53,37 +58,34 @@
print("Number of features per train sample: {}".format(len(train_inputs[0])))
print("Unigram vocab size: {}".format(len(unigram_vocab)))
print("Bigram vocab size: {}".format(len(bigram_vocab)))
print("POS-Bigram vocab size: {}".format(len(bigram_pos_vocab)))
print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
# TODO Pickle/outsource
# ML
# init
svm_clf = svm.SVC(C=200.0) # large C: smaller-margin hyperplane
tree_clf = tree.DecisionTreeClassifier()
# training
train_multiple([svm_clf, tree_clf], train_inputs, train_labels)
# validation
svm_score = cross_val_score(svm_clf, train_inputs, train_labels, cv=5).mean()#, scoring='f1')
tree_score = cross_val_score(tree_clf, train_inputs, train_labels, cv=5).mean()#, scoring='f1')
print("\n--Cross Validation Scores-- ")
print("\nSVM: {}".format(svm_score))
print("\nTree: {}".format(tree_score))
# testing
# print("\nSVM: Score on test Data:")
# print(svm_clf.score(test_inputs, test_labels))
# SVM
svm_classifier = svm.SVC(C=200.0) # large C: smaller-margin hyperplane
svm_classifier.fit(train_inputs, train_labels)
# print("\nDTree: Score on test Data:")
# print(tree_clf.score(test_inputs, test_labels))
print("\nSVM: Score on train Data:")
print(svm_classifier.score(train_inputs, train_labels))
# predictions = svm_classifier.predict(train_inputs)
# print("Predictions: \n {}".format(predictions))
# print("Targets: \n {}".format(train_labels))
print("\nSVM: Score on test Data:")
print(svm_classifier.score(test_inputs, test_labels))
# predictions = svm_classifier.predict(test_inputs)
# print("Predictions: \n {}".format(predictions))
# print("Targets: \n {}".format(test_labels))
# Trees
tree_clf = DecisionTreeClassifier()
tree_clf.fit(train_inputs, train_labels)
print("\nDTree: Score on train Data:")
print(tree_clf.score(train_inputs, train_labels))
# predictions = tree_clf.predict(test_inputs)
# print("Predictions: \n {}".format(predictions))
# print("Targets: \n {}".format(test_labels))
print("\nDTree: Score on test Data:")
print(tree_clf.score(test_inputs, test_labels))