From 6dd4f34e7c12de5ab2c999559282fb6d2a2d4abe Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Sun, 30 Sep 2018 09:50:39 +0200
Subject: [PATCH] Add machine learning scripts.

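Each script trains a classifier (decision tree, random forest, bagged
linear SVM with probability calibration) on per-reading feature
vectors and evaluates it on the dev set by checking, for every verse,
whether the reading ranked most probable is a gold reading.

The scripts read ../train0-9.json and ../dev.json and only rely on each
verse entry's third field, verse[2], holding the candidate readings as
(feature_vector, label) pairs with label 1 for a correct reading,
roughly (feature values below are illustrative only):

    [..., ..., [[[1, 0, 2, 1], 0], [[1, 1, 2, 1], 1]]]

All three print this per-verse hit rate and save the fitted model with
joblib; decision_tree.py additionally renders the learned tree with
graphviz.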
---
 allzweckmesser/decision_tree.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 allzweckmesser/random_forest.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 allzweckmesser/svm.py           | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 allzweckmesser/decision_tree.py
 create mode 100644 allzweckmesser/random_forest.py
 create mode 100644 allzweckmesser/svm.py

diff --git a/allzweckmesser/decision_tree.py b/allzweckmesser/decision_tree.py
new file mode 100644
index 0000000..a24a96a
--- /dev/null
+++ b/allzweckmesser/decision_tree.py
@@ -0,0 +1,58 @@
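+"""Train a decision tree on per-reading feature vectors and report how
+often the most probable reading of each dev verse is a gold one."""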
+from sklearn import tree
+from sklearn.externals import joblib
+import graphviz
+import json
+import numpy as np
+
+# load data
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+
+with open('../dev.json', 'r') as dev_file:
+    dev = json.load(dev_file)
+
+# Readings in verse[2] are (feature_vector, label) pairs; label 1 = correct reading.
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+# build model
+clf = tree.DecisionTreeClassifier(max_depth=3, criterion='entropy', splitter='best')
+
+#fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    vectors = [reading[0] for reading in verse[2]]
+    probs = clf.predict_proba(vectors)
+    # probs[i] is [P(label=0), P(label=1)]; sorting ascending by P(label=0)
+    # puts the reading the classifier considers most likely correct first.
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))],
+                        key=lambda x: x[0][0])
+
+    # Count a hit when that top-ranked reading is a gold (label 1) reading.
+    if sort_probs[0][1] == 1:
+        correct += 1
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+#precision = tp/(tp+fp)
+#recall = tp/(tp+fn)
+#accuracy = (tp+tn)/(tp+tn+fp+fn)
+#f1 = 2*((precision*recall)/(precision+recall))
+#print('Precision: {}\tRecall:{}'.format(precision,recall))
+#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1)) 
+
+joblib.dump(clf, 'tree_classifier.joblib')
+
+dot_data = tree.export_graphviz(clf, out_file=None)
+graph = graphviz.Source(dot_data)
+graph.render("latin_tree")
diff --git a/allzweckmesser/random_forest.py b/allzweckmesser/random_forest.py
new file mode 100644
index 0000000..3dda27e
--- /dev/null
+++ b/allzweckmesser/random_forest.py
@@ -0,0 +1,58 @@
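+"""Train a random forest on per-reading feature vectors and report how
+often the most probable reading of each dev verse is a gold one."""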
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.externals import joblib
+import graphviz
+import json
+import numpy as np
+
+# load data
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+
+with open('../dev.json', 'r') as dev_file:
+    dev = json.load(dev_file)
+
+# Readings in verse[2] are (feature_vector, label) pairs; label 1 = correct reading.
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+# build model
+clf = RandomForestClassifier()
+
+#fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    vectors = [reading[0] for reading in verse[2]]
+    probs = clf.predict_proba(vectors)
+    # probs[i] is [P(label=0), P(label=1)]; sorting ascending by P(label=0)
+    # puts the reading the classifier considers most likely correct first.
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))],
+                        key=lambda x: x[0][0])
+
+    # Count a hit when that top-ranked reading is a gold (label 1) reading.
+    if sort_probs[0][1] == 1:
+        correct += 1
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+#precision = tp/(tp+fp)
+#recall = tp/(tp+fn)
+#accuracy = (tp+tn)/(tp+tn+fp+fn)
+#f1 = 2*((precision*recall)/(precision+recall))
+#print('Precision: {}\tRecall:{}'.format(precision,recall))
+#print('Accuracy: {}\tF1-Measure:{}\n'.format(accuracy, f1)) 
+
+joblib.dump(clf, 'forest_classifier.joblib')
+
+#dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP
+#graph = graphviz.Source(dot_data) # doctest: +SKIP
+#graph.render("latin_tree") # doctest: +SKIP
diff --git a/allzweckmesser/svm.py b/allzweckmesser/svm.py
new file mode 100644
index 0000000..ced2a24
--- /dev/null
+++ b/allzweckmesser/svm.py
@@ -0,0 +1,55 @@
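+"""Train a bagged, calibrated linear SVM on per-reading feature vectors and
+report how often the most probable reading of each dev verse is a gold one."""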
+from sklearn import svm
+from sklearn.externals import joblib
+from sklearn.ensemble import BaggingClassifier
+from sklearn.calibration import CalibratedClassifierCV
+import graphviz
+import json
+import numpy as np
+
+# load data
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+
+with open('../dev.json', 'r') as dev_file:
+    dev = json.load(dev_file)
+
+# Readings in verse[2] are (feature_vector, label) pairs; label 1 = correct reading.
+X, Y = [], []
+for verse in train:
+    for reading in verse[2]:
+        X.append(reading[0])
+        Y.append(reading[1])
+
+# build model: bagged linear SVMs, calibrated so that predict_proba is available
+n_estimators = 10
+bagged_svm = BaggingClassifier(svm.LinearSVC(), max_samples=1.0/n_estimators, n_estimators=n_estimators)
+clf = CalibratedClassifierCV(bagged_svm)
+
+#fit
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    vectors = [reading[0] for reading in verse[2]]
+    probs = clf.predict_proba(vectors)
+    # probs[i] is [P(label=0), P(label=1)]; sorting ascending by P(label=0)
+    # puts the reading the classifier considers most likely correct first.
+    sort_probs = sorted([(probs[i], verse[2][i][1]) for i in range(len(probs))],
+                        key=lambda x: x[0][0])
+
+    # Count a hit when that top-ranked reading is a gold (label 1) reading.
+    if sort_probs[0][1] == 1:
+        correct += 1
+    total += 1
+
+print("Recall: {}/{} ({})".format(correct, total, correct/total))
+
+joblib.dump(clf, 'svm_classifier.joblib')
+
+#dot_data = tree.export_graphviz(clf, out_file=None) # doctest: +SKIP
+#graph = graphviz.Source(dot_data) # doctest: +SKIP
+#graph.render("male_female") # doctest: +SKIP
-- 
GitLab