From 306e1df93ecb317221d38b193eed6e54d2b6d5b3 Mon Sep 17 00:00:00 2001 From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de> Date: Wed, 7 Mar 2018 15:44:01 +0100 Subject: [PATCH] =?UTF-8?q?Add=20Kleinschei=C3=9F.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- code/absinth_nx.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/code/absinth_nx.py b/code/absinth_nx.py index 3b97902..0215edb 100644 --- a/code/absinth_nx.py +++ b/code/absinth_nx.py @@ -112,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei return G -def root_hubs(graph, edge_freq, min_neighbors=5, theshold=0.8): +def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8): G = deepcopy(graph) V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...) @@ -170,6 +170,7 @@ def disambiguate(mst, hubs, contexts): T = mst H = hubs + C = [c.lower().strip() for c in contexts] backup_cluster = len(H) result = [] @@ -196,13 +197,12 @@ def disambiguate(mst, hubs, contexts): T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores]) - for c in contexts: + for c in C: - c = c.lower() toks = [t.text for t in nlp(c)] vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0) - idx = contexts.index(c) + 1 + idx = C.index(c) + 1 if len(vector) == 0: #if no senses are found -> all in one result.append((0, idx)) @@ -225,7 +225,7 @@ if __name__ == '__main__': data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/' #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test' corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/' - results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/' + results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/' stop = set(stopwords.words('english') + ['utc', 'new', 'other']) @@ -260,7 +260,8 @@ if __name__ == '__main__': target = value.strip() print("[A] Processing '"+target+"'.\n") - + if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection + target = target[4:] f = open(results_path+target+'.absinth', 'w') f.write('subTopicID\tresultID\n') -- GitLab