From 6dd4f34e7c12de5ab2c999559282fb6d2a2d4abe Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Sun, 30 Sep 2018 09:50:39 +0200
Subject: [PATCH] Add machine learning scripts.

---
 allzweckmesser/decision_tree.py | 56 +++++++++++++++++++++++++++++++++
 allzweckmesser/random_forest.py | 56 +++++++++++++++++++++++++++++++++
 allzweckmesser/svm.py           | 53 ++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+)
 create mode 100644 allzweckmesser/decision_tree.py
 create mode 100644 allzweckmesser/random_forest.py
 create mode 100644 allzweckmesser/svm.py

diff --git a/allzweckmesser/decision_tree.py b/allzweckmesser/decision_tree.py
new file mode 100644
index 0000000..a24a96a
--- /dev/null
+++ b/allzweckmesser/decision_tree.py
@@ -0,0 +1,56 @@
+from sklearn import tree
+from sklearn.externals import joblib
+import graphviz
+import json
+import numpy as np
+
+# load data
+train_file = open('../train0-9.json', 'r')
+dev_file = open('../dev.json', 'r')
+
+train = json.load(train_file)
+dev = json.load(dev_file)
+
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+
+# build model
+clf = tree.DecisionTreeClassifier(max_depth=3, criterion='entropy', splitter='best')
+
+# fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:  # count the verses whose top-ranked reading carries gold label 1
+    vectors = [reading[0] for reading in verse[2]]
+
+    probs = clf.predict_proba(vectors)  # one [P(label 0), P(label 1)] row per reading
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])
+
+    gold = sort_probs[0][1]  # gold label of the top-ranked reading
+
+    if gold == 1:
+        correct += 1
+
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+#precision = tp/(tp+fp)
+#recall = tp/(tp+fn)
+#accuracy = (tp+tn)/(tp+tn+fp+fn)
+#f1 = 2*((precision*recall)/(precision+recall))
+#print('Precision: {}\tRecall:{}'.format(precision,recall))
+#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1))
+
+joblib.dump(clf, 'tree_classifier.joblib')
+
+dot_data = tree.export_graphviz(clf, out_file=None)
+graph = graphviz.Source(dot_data)
+graph.render("latin_tree")
diff --git a/allzweckmesser/random_forest.py b/allzweckmesser/random_forest.py
new file mode 100644
index 0000000..3dda27e
--- /dev/null
+++ b/allzweckmesser/random_forest.py
@@ -0,0 +1,56 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.externals import joblib
+import graphviz
+import json
+import numpy as np
+
+# load data
+train_file = open('../train0-9.json', 'r')
+dev_file = open('../dev.json', 'r')
+
+train = json.load(train_file)
+dev = json.load(dev_file)
+
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+
+# build model
+clf = RandomForestClassifier()  # default hyper-parameters
+
+# fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:  # count the verses whose top-ranked reading carries gold label 1
+    vectors = [reading[0] for reading in verse[2]]
+
+    probs = clf.predict_proba(vectors)  # one [P(label 0), P(label 1)] row per reading
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])
+
+    gold = sort_probs[0][1]  # gold label of the top-ranked reading
+
+    if gold == 1:
+        correct += 1
+
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+#precision = tp/(tp+fp)
+#recall = tp/(tp+fn)
+#accuracy = (tp+tn)/(tp+tn+fp+fn)
+#f1 = 2*((precision*recall)/(precision+recall))
+#print('Precision: {}\tRecall:{}'.format(precision,recall))
+#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1))
+
+joblib.dump(clf, 'forest_classifier.joblib')
+
+#dot_data = tree.export_graphviz(clf, out_file=None)
+#graph = graphviz.Source(dot_data)
+#graph.render("latin_tree")
diff --git a/allzweckmesser/svm.py b/allzweckmesser/svm.py
new file mode 100644
index 0000000..ced2a24
--- /dev/null
+++ b/allzweckmesser/svm.py
@@ -0,0 +1,53 @@
+from sklearn import svm
+from sklearn.externals import joblib
+from sklearn.ensemble import BaggingClassifier
+from sklearn.calibration import CalibratedClassifierCV
+import graphviz
+import json
+import numpy as np
+
+# load data
+train_file = open('../train0-9.json', 'r')
+dev_file = open('../dev.json', 'r')
+
+train = json.load(train_file)
+dev = json.load(dev_file)
+
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+
+# build model
+n_estimators = 10
+bagged_svm = BaggingClassifier(svm.LinearSVC(), max_samples=1.0/n_estimators, n_estimators=n_estimators)
+clf = CalibratedClassifierCV(bagged_svm)  # calibration adds predict_proba, which LinearSVC lacks
+
+# fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:  # count the verses whose top-ranked reading carries gold label 1
+    vectors = [reading[0] for reading in verse[2]]
+
+    probs = clf.predict_proba(vectors)  # one [P(label 0), P(label 1)] row per reading
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))], key=lambda x: x[0][0])
+
+    gold = sort_probs[0][1]  # gold label of the top-ranked reading
+
+    if gold == 1:
+        correct += 1
+
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+joblib.dump(clf, 'svm_classifier.joblib')
+
+#dot_data = tree.export_graphviz(clf, out_file=None)
+#graph = graphviz.Source(dot_data)
+#graph.render("male_female")
--
GitLab
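
All three scripts evaluate the same way: for every verse in dev.json they score every candidate reading and count the verse as correct when the top-ranked reading carries gold label 1. The sketch below shows how a dumped model could be reused for that ranking step outside the training scripts; it is a minimal sketch assuming the data layout the scripts imply (verse[2] is a list of [feature_vector, gold_label] pairs), the helper name rank_readings is illustrative, and only the file names are taken from the scripts above.

import json

from sklearn.externals import joblib  # on newer scikit-learn: simply `import joblib`


def rank_readings(clf, readings):
    """Sort the readings of one verse from most to least likely valid.

    `readings` is assumed to have the layout the scripts use for verse[2]:
    a list of [feature_vector, gold_label] pairs.
    """
    vectors = [reading[0] for reading in readings]
    probs = clf.predict_proba(vectors)  # column 0 is P(label 0), column 1 is P(label 1)
    order = sorted(range(len(readings)), key=lambda i: probs[i][0])
    return [readings[i] for i in order]


if __name__ == '__main__':
    clf = joblib.load('tree_classifier.joblib')  # or forest_classifier.joblib / svm_classifier.joblib
    with open('../dev.json', 'r') as dev_file:
        dev = json.load(dev_file)

    correct = sum(rank_readings(clf, verse[2])[0][1] == 1 for verse in dev)
    print("Top reading correct: {}/{} ({})".format(correct, len(dev), correct / len(dev)))

Using one shared helper like this would also let the three scripts report their numbers identically instead of repeating the evaluation loop.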