"""Train and evaluate scansion-reading classifiers.

Reconstructed from a diff-viewer paste that contained three sibling
scripts (allzweckmesser/decision_tree.py, random_forest.py, svm.py),
each pasted twice with viewer chrome mixed in.  The three scripts
shared their load / flatten / evaluate boilerplate verbatim, so that
logic is factored into helpers and each model is trained in turn.

Data layout (inferred from the indexing only -- TODO confirm against
the JSON files): each verse is a sequence whose element [2] holds its
candidate readings; each reading is a pair (feature_vector, label),
with label 1 marking the gold reading.
"""

import json

import graphviz  # doctest: +SKIP
from sklearn import svm, tree
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

try:
    # sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
    # removed in 0.23; fall back to the standalone package (which
    # scikit-learn itself depends on).
    from sklearn.externals import joblib
except ImportError:
    import joblib


def _load_json(path):
    """Load a JSON file, closing the handle (the originals leaked both
    file handles by calling open() without ever closing)."""
    with open(path, 'r') as fh:
        return json.load(fh)


def _build_training_matrix(train):
    """Flatten all readings of all verses into parallel lists.

    Returns (X, Y): X is the list of per-reading feature vectors,
    Y the list of their 0/1 gold labels, in corpus order.
    """
    X, Y = [], []
    for verse in train:
        for reading in verse[2]:
            X.append(reading[0])
            Y.append(reading[1])
    return X, Y


def _evaluate(clf, dev):
    """Per-verse top-1 evaluation.

    For each verse, score every candidate reading with
    clf.predict_proba and pick the reading the model considers most
    likely to be correct; count the verse as a hit when that reading's
    gold label is 1.  Despite the printed word "Recall" (string kept
    verbatim from the originals), this is top-1 selection accuracy.
    """
    correct = 0
    total = 0
    for verse in dev:
        vectors = [reading[0] for reading in verse[2]]
        probs = clf.predict_proba(vectors)
        # Sorting ascending by P(class 0) puts the reading with the
        # highest P(class 1) first.  NOTE(review): this assumes
        # clf.classes_ == [0, 1] -- true when Y contains only 0/1, but
        # worth confirming.
        sort_probs = sorted(
            [(probs[i], verse[2][i][1]) for i in range(len(probs))],
            key=lambda x: x[0][0])
        gold = sort_probs[0][1]
        if gold == 1:
            correct += 1
        total += 1
    # Guard the empty dev set; the originals divided unconditionally.
    ratio = correct / total if total else 0.0
    print("Recall: {}/{} ({})".format(correct, total, ratio))


def main():
    """Train, evaluate, and persist all three classifiers."""
    train = _load_json('../train0-9.json')
    dev = _load_json('../dev.json')
    X, Y = _build_training_matrix(train)

    # --- decision tree (was decision_tree.py) ---
    tree_clf = tree.DecisionTreeClassifier(
        max_depth=3, criterion='entropy', splitter='best')
    tree_clf.fit(X, Y)
    _evaluate(tree_clf, dev)
    joblib.dump(tree_clf, 'tree_classifier.joblib')
    # Render the fitted tree for inspection.
    dot_data = tree.export_graphviz(tree_clf, out_file=None)  # doctest: +SKIP
    graphviz.Source(dot_data).render("latin_tree")  # doctest: +SKIP

    # --- random forest (was random_forest.py) ---
    forest_clf = RandomForestClassifier()
    forest_clf.fit(X, Y)
    _evaluate(forest_clf, dev)
    joblib.dump(forest_clf, 'forest_classifier.joblib')

    # --- calibrated bagged linear SVM (was svm.py) ---
    n_estimators = 10
    # LinearSVC has no predict_proba, hence the CalibratedClassifierCV
    # wrapper.  The original bound this estimator to the name `svm`,
    # shadowing the sklearn.svm module it was built from; renamed.
    bagged = BaggingClassifier(
        svm.LinearSVC(),
        max_samples=1.0 / n_estimators,
        n_estimators=n_estimators)
    svm_clf = CalibratedClassifierCV(bagged)
    svm_clf.fit(X, Y)
    _evaluate(svm_clf, dev)
    joblib.dump(svm_clf, 'svm_classifier.joblib')


if __name__ == "__main__":
    main()