Skip to content
Snippets Groups Projects
Commit 7ff0e703 authored by Victor Zimmermann
Browse files

Replace try-except blocks with if statements

parent f80f2d90
No related branches found
No related tags found
No related merge requests found
...@@ -5,11 +5,12 @@ import spacy # for nlp ...@@ -5,11 +5,12 @@ import spacy # for nlp
import networkx as nx # for visualisation import networkx as nx # for visualisation
import matplotlib.pyplot as plt # for visualisation import matplotlib.pyplot as plt # for visualisation
from copy import deepcopy from copy import deepcopy
from nltk.corpus import stopwords
import numpy as np # for calculations import numpy as np # for calculations
nlp = spacy.load('en') # standard english nlp nlp = spacy.load('en') # standard english nlp
def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=100000, max_edges=10000000): def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
node_freq = dict() node_freq = dict()
edge_freq = dict() edge_freq = dict()
...@@ -111,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei ...@@ -111,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
return G return G
def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8): def root_hubs(graph, edge_freq, min_neighbors=5, theshold=0.8):
G = deepcopy(graph) G = deepcopy(graph)
V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...) V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
...@@ -178,16 +179,19 @@ def disambiguate(mst, hubs, contexts): ...@@ -178,16 +179,19 @@ def disambiguate(mst, hubs, contexts):
for h in H: for h in H:
try: if nx.has_path(T,v,h):
path = nx.shortest_path(T,v,h,'weight') path = nx.shortest_path(T,v,h,'weight')
total_weight = 0 total_weight = 0
for i in range(1, len(path)): for i in range(1, len(path)):
total_weight += T[path[i-1]][path[i]]['weight'] total_weight += T[path[i-1]][path[i]]['weight']
scores.append(1/(1+total_weight)) scores.append(1/(1+total_weight))
except: else:
scores.append(0) scores.append(0)
T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores]) T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
...@@ -199,14 +203,12 @@ def disambiguate(mst, hubs, contexts): ...@@ -199,14 +203,12 @@ def disambiguate(mst, hubs, contexts):
idx = contexts.index(c) + 1 idx = contexts.index(c) + 1
try: if len(vector) == 0: #if no senses are found -> all in one
if max(vector) == 0:
pass
else:
cluster = np.argmax(vector)
result.append((cluster, idx))
except:
result.append((0, idx)) result.append((0, idx))
elif max(vector) == 0: #if no sense matches -> singletons
pass
else:
result.append((np.argmax(vector), idx))
return result return result
...@@ -224,6 +226,8 @@ if __name__ == '__main__': ...@@ -224,6 +226,8 @@ if __name__ == '__main__':
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/' corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/' results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
results = dict() results = dict()
with open(data_path+'results.txt', 'r') as results_file: with open(data_path+'results.txt', 'r') as results_file:
...@@ -260,14 +264,16 @@ if __name__ == '__main__': ...@@ -260,14 +264,16 @@ if __name__ == '__main__':
f.write('subTopicID\tresultID\n') f.write('subTopicID\tresultID\n')
print('[A] Counting Tokens...') print('[A] Counting Tokens...')
node_freq, edge_freq = frequencies(corpus_path, target) node_freq, edge_freq = frequencies(corpus_path, target, stop)
print('\n[A] Building Graph.\n') print('\n[A] Building Graph.\n')
G = build_graph(node_freq, edge_freq) G = build_graph(node_freq, edge_freq)
print('[A] Collecting Root Hubs...') print('[A] Collecting Root Hubs...')
H = root_hubs(G, edge_freq) H = root_hubs(G, edge_freq)
print('Root Hubs:', H, '\n') for h in H:
mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
print('{}: {}\n'.format(h, mfn))
print('[A] Building Minimum Spanning Tree.\n') print('[A] Building Minimum Spanning Tree.\n')
T = components(G, H, target) T = components(G, H, target)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment