diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 3b97902f9d434c23cedb061fcb36e548ba03ba17..0215edbd10daae94ffebcf786328eae9b400721a 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -112,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
 
     return G
 
-def root_hubs(graph, edge_freq, min_neighbors=5, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
 
     G = deepcopy(graph)
     V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
@@ -170,6 +170,7 @@ def disambiguate(mst, hubs, contexts):
 
     T = mst
     H = hubs
+    C = [c.lower().strip() for c in contexts]
     backup_cluster = len(H)
 
     result = []
@@ -196,13 +197,12 @@ def disambiguate(mst, hubs, contexts):
 
             T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
 
-    for c in contexts:
+    for c in C:
 
-        c = c.lower()
         toks = [t.text for t in nlp(c)]
 
         vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
 
-        idx = contexts.index(c) + 1
+        idx = C.index(c) + 1
         if len(vector) == 0: #if no senses are found -> all in one
             result.append((0, idx))
@@ -225,7 +225,7 @@ if __name__ == '__main__':
 
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
     corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
-    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
+    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
 
     stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
@@ -260,7 +260,8 @@ if __name__ == '__main__':
 
         target = value.strip()
 
         print("[A] Processing '"+target+"'.\n")
-        
+        if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection
+            target = target[4:]
         f = open(results_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')