diff --git a/allzweckmesser/decision_tree.py b/allzweckmesser/decision_tree.py
new file mode 100644
--- /dev/null
+++ b/allzweckmesser/decision_tree.py
@@ -0,0 +1,62 @@
+"""Train and evaluate a decision tree over verse readings.
+
+Each entry of the JSON data is indexed as ``verse[2]``, a list of readings
+whose element 0 is the feature vector and element 1 the gold label.
+"""
+import json
+
+import graphviz
+import numpy as np
+from sklearn import tree
+
+try:
+    import joblib
+except ImportError:  # older scikit-learn releases vendor their own copy
+    from sklearn.externals import joblib
+
+# Load data; the context managers close the files once they are parsed.
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+with open('../dev.json') as dev_file:
+    dev = json.load(dev_file)
+
+# Flatten all readings of all training verses into samples and labels.
+X = [reading[0] for verse in train for reading in verse[2]]
+Y = [reading[1] for verse in train for reading in verse[2]]
+
+# Build and fit the model.
+clf = tree.DecisionTreeClassifier(max_depth=3, criterion='entropy',
+                                  splitter='best')
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    total += 1
+    readings = verse[2]
+    if not readings:
+        # Nothing to rank: predict_proba rejects an empty batch, so the
+        # verse simply counts as a miss.
+        continue
+
+    probs = clf.predict_proba([reading[0] for reading in readings])
+    # Pick the reading with the lowest probability in column 0; argmin
+    # returns the first minimum, like taking element 0 of a stable sort.
+    # NOTE(review): assumes column 0 is the negative class -- confirm
+    # against clf.classes_.
+    best = int(np.argmin(probs[:, 0]))
+
+    if readings[best][1] == 1:
+        correct += 1
+
+# Share of dev verses whose top-ranked reading carries gold label 1.
+ratio = correct / total if total else 0.0
+print("Recall: {}/{} ({})".format(correct, total, ratio))
+
+joblib.dump(clf, 'tree_classifier.joblib')
+
+# Render the fitted tree with graphviz.
+dot_data = tree.export_graphviz(clf, out_file=None)
+graph = graphviz.Source(dot_data)
+graph.render("latin_tree")
diff --git a/allzweckmesser/random_forest.py b/allzweckmesser/random_forest.py
new file mode 100644
--- /dev/null
+++ b/allzweckmesser/random_forest.py
@@ -0,0 +1,56 @@
+"""Train and evaluate a random forest over verse readings.
+
+Each entry of the JSON data is indexed as ``verse[2]``, a list of readings
+whose element 0 is the feature vector and element 1 the gold label.
+"""
+import json
+
+import graphviz
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+try:
+    import joblib
+except ImportError:  # older scikit-learn releases vendor their own copy
+    from sklearn.externals import joblib
+
+# Load data; the context managers close the files once they are parsed.
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+with open('../dev.json') as dev_file:
+    dev = json.load(dev_file)
+
+# Flatten all readings of all training verses into samples and labels.
+X = [reading[0] for verse in train for reading in verse[2]]
+Y = [reading[1] for verse in train for reading in verse[2]]
+
+# Build and fit the model.
+clf = RandomForestClassifier()
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    total += 1
+    readings = verse[2]
+    if not readings:
+        # Nothing to rank: predict_proba rejects an empty batch, so the
+        # verse simply counts as a miss.
+        continue
+
+    probs = clf.predict_proba([reading[0] for reading in readings])
+    # Pick the reading with the lowest probability in column 0; argmin
+    # returns the first minimum, like taking element 0 of a stable sort.
+    # NOTE(review): assumes column 0 is the negative class -- confirm
+    # against clf.classes_.
+    best = int(np.argmin(probs[:, 0]))
+
+    if readings[best][1] == 1:
+        correct += 1
+
+# Share of dev verses whose top-ranked reading carries gold label 1.
+ratio = correct / total if total else 0.0
+print("Recall: {}/{} ({})".format(correct, total, ratio))
+
+joblib.dump(clf, 'forest_classifier.joblib')
diff --git a/allzweckmesser/svm.py b/allzweckmesser/svm.py
new file mode 100644
--- /dev/null
+++ b/allzweckmesser/svm.py
@@ -0,0 +1,65 @@
+"""Train and evaluate a bagged, calibrated linear SVM over verse readings.
+
+Each entry of the JSON data is indexed as ``verse[2]``, a list of readings
+whose element 0 is the feature vector and element 1 the gold label.
+"""
+import json
+
+import graphviz
+import numpy as np
+from sklearn import svm
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.ensemble import BaggingClassifier
+
+try:
+    import joblib
+except ImportError:  # older scikit-learn releases vendor their own copy
+    from sklearn.externals import joblib
+
+# Load data; the context managers close the files once they are parsed.
+with open('../train0-9.json', 'r') as train_file:
+    train = json.load(train_file)
+with open('../dev.json') as dev_file:
+    dev = json.load(dev_file)
+
+# Flatten all readings of all training verses into samples and labels.
+X = [reading[0] for verse in train for reading in verse[2]]
+Y = [reading[1] for verse in train for reading in verse[2]]
+
+# Build the model: a bag of linear SVMs, each given max_samples equal to
+# 1/n_estimators, wrapped in a calibrator so that predict_proba is
+# available for ranking.
+n_estimators = 10
+# Named bagged_svm so the imported ``svm`` module is not shadowed.
+bagged_svm = BaggingClassifier(svm.LinearSVC(),
+                               max_samples=1.0 / n_estimators,
+                               n_estimators=n_estimators)
+clf = CalibratedClassifierCV(bagged_svm)
+clf.fit(X, Y)
+
+correct = 0
+total = 0
+
+for verse in dev:
+    total += 1
+    readings = verse[2]
+    if not readings:
+        # Nothing to rank: predict_proba rejects an empty batch, so the
+        # verse simply counts as a miss.
+        continue
+
+    probs = clf.predict_proba([reading[0] for reading in readings])
+    # Pick the reading with the lowest probability in column 0; argmin
+    # returns the first minimum, like taking element 0 of a stable sort.
+    # NOTE(review): assumes column 0 is the negative class -- confirm
+    # against clf.classes_.
+    best = int(np.argmin(probs[:, 0]))
+
+    if readings[best][1] == 1:
+        correct += 1
+
+# Share of dev verses whose top-ranked reading carries gold label 1.
+ratio = correct / total if total else 0.0
+print("Recall: {}/{} ({})".format(correct, total, ratio))
+
+joblib.dump(clf, 'svm_classifier.joblib')