Commit 6dd4f34e authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Add machine learning scripts.

parent f95764a0
Loading
Loading
Loading
Loading
+56 −0
Original line number Diff line number Diff line
from sklearn import tree
from sklearn.externals import joblib
import graphviz # doctest: +SKIP
import json
import numpy as np

#load data
train_file = open('../train0-9.json', 'r')
dev_file = open('../dev.json')

train = json.load(train_file)
dev = json.load(dev_file)

X,Y = [],[]
for verse in train:
    for reading in verse[2]:
        X.append(reading[0])
        Y.append(reading[1])


#build model
clf = tree.DecisionTreeClassifier(max_depth=3, criterion='entropy', splitter='best')

#fit
clf.fit(X, Y)

correct = 0
total = 0

for verse in dev:
    vectors = [reading[0] for reading in verse[2]]

    probs = clf.predict_proba(vectors)
    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])

    gold = sort_probs[0][1]

    if gold == 1:
        correct += 1

    total += 1

print("Recall: {}/{} ({})".format(correct, total, correct/total))

#precision = tp/(tp+fp)
#recall = tp/(tp+fn)
#accuracy = (tp+tn)/(tp+tn+fp+fn)
#f1 = 2*((precision*recall)/(precision+recall))
#print('Precision: {}\tRecall:{}'.format(precision,recall))
#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1)) 

joblib.dump(clf, 'tree_classifier.joblib')

dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP
graph = graphviz.Source(dot_data) # doctest: +SKIP
graph.render("latin_tree") # doctest: +SKIP
+56 −0
Original line number Diff line number Diff line
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import graphviz # doctest: +SKIP
import json
import numpy as np

#load data
train_file = open('../train0-9.json', 'r')
dev_file = open('../dev.json')

train = json.load(train_file)
dev = json.load(dev_file)

X,Y = [],[]
for verse in train:
    for reading in verse[2]:
        X.append(reading[0])
        Y.append(reading[1])


#build model
clf = RandomForestClassifier()

#fit
clf.fit(X, Y)

correct = 0
total = 0

for verse in dev:
    vectors = [reading[0] for reading in verse[2]]

    probs = clf.predict_proba(vectors)
    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])

    gold = sort_probs[0][1]

    if gold == 1:
        correct += 1

    total += 1

print("Recall: {}/{} ({})".format(correct, total, correct/total))

#precision = tp/(tp+fp)
#recall = tp/(tp+fn)
#accuracy = (tp+tn)/(tp+tn+fp+fn)
#f1 = 2*((precision*recall)/(precision+recall))
#print('Precision: {}\tRecall:{}'.format(precision,recall))
#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1)) 

joblib.dump(clf, 'forest_classifier.joblib')

#dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP
#graph = graphviz.Source(dot_data) # doctest: +SKIP
#graph.render("latin_tree") # doctest: +SKIP

allzweckmesser/svm.py

0 → 100644
+53 −0
Original line number Diff line number Diff line
from sklearn import svm
from sklearn.externals import joblib
from sklearn.ensemble import BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
import graphviz # doctest: +SKIP
import json
import numpy as np

#load data
train_file = open('../train0-9.json', 'r')
dev_file = open('../dev.json')

train = json.load(train_file)
dev = json.load(dev_file)

X,Y = [],[]
for verse in train:
    for reading in verse[2]:
        X.append(reading[0])
        Y.append(reading[1])


#build model
n_estimators = 10
svm = BaggingClassifier(svm.LinearSVC(), max_samples=1.0/n_estimators, n_estimators=n_estimators)
clf = CalibratedClassifierCV(svm)

#fit
clf.fit(X, Y)

correct = 0
total = 0

for verse in dev:
    vectors = [reading[0] for reading in verse[2]]

    probs = clf.predict_proba(vectors)
    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])

    gold = sort_probs[0][1]

    if gold == 1:
        correct += 1

    total += 1

print("Recall: {}/{} ({})".format(correct, total, correct/total))

joblib.dump(clf, 'svm_classifier.joblib')

#dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP
#graph = graphviz.Source(dot_data) # doctest: +SKIP
#graph.render("male_female") # doctest: +SKIP