diff --git a/src/absinth.py b/src/absinth.py index 02fd7ae2a74b79983b76fedb24c2e70cdc22abd5..c9cef83300d3d10011b645351333baaecc539b42 100644 --- a/src/absinth.py +++ b/src/absinth.py @@ -1,16 +1,27 @@ -import os # for reading files import sys + print('[A] Loading ' + sys.argv[0] + '.\n') -import spacy # for nlp + +import os # for reading files import networkx as nx # for visualisation -import matplotlib.pyplot as plt # for visualisation from copy import deepcopy from nltk.corpus import stopwords import numpy as np # for calculations + +import config + +import spacy # for nlp nlp = spacy.load('en') # standard english nlp -def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000): + +def frequencies(corpus_path, target): + + stop_words = set(stopwords.words('english') + config.stop_words) + allowed_tags = config.allowed_tags + min_context_size = config.min_context_size + max_nodes = config.max_nodes + max_edges = config.max_edges node_freq = dict() edge_freq = dict() @@ -89,7 +100,11 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ return node_freq, edge_freq -def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_weight= 0.9): +def build_graph(node_freq, edge_freq): + + min_node_freq = config.min_node_freq + min_edge_freq = config.min_edge_freq + max_weight = config.max_weight G = nx.Graph() @@ -115,6 +130,9 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8): + min_neighbors = config.min_neighbors + threshold = config.threshold + G = deepcopy(graph) V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...) 
H = list() @@ -240,20 +258,12 @@ def disambiguate(mst, hubs, contexts): return result -def backup(contexts): - - pass - - if __name__ == '__main__': - data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/' - #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test' - corpus_path = '/proj/absinth/wikipedia_reduced/' - results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/' - - stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free']) + corpus_path = config.corpus + data_path = config.dataset + output_path = config.output results = dict() @@ -274,35 +284,38 @@ if __name__ == '__main__': with open(data_path+'topics.txt', 'r') as topics_file: - already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)] - - for line in topics_file.readlines()[1:5]: + for line in topics_file.readlines()[1:]: l = line.split('\t') - if l[1] not in already_processed: - topics[l[0]] = l[1] + topics[l[0]] = l[1] for key, value in topics.items(): o_target = value.strip() #original target print("[A] Processing '"+o_target+"'.\n") + if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection + target = o_target[4:] + else: + target = o_target - f = open(results_path+target+'.absinth', 'w') + f = open(output_path+target+'.absinth', 'w') f.write('subTopicID\tresultID\n') print('[A] Counting Tokens...') - node_freq, edge_freq = frequencies(corpus_path, target, stop) + node_freq, edge_freq = frequencies(corpus_path, target) print('\n[A] Building Graph.\n') G = build_graph(node_freq, edge_freq) print('[A] Collecting Root Hubs...') H = root_hubs(G, edge_freq) + for h in H: + mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6] print(' {}: {}'.format(h, mfn)) @@ -314,6 +327,7 @@ if __name__ == '__main__': print(' 
Mapping:', D, '\n') print('[A] Writing to file '+o_target+'.absinth.\n\n') + for d in D: f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n') diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000000000000000000000000000000000000..ecda9887ce494107b830108ad75695cc7c15c889 --- /dev/null +++ b/src/config.py @@ -0,0 +1,46 @@ +''' +Configuration file +''' + +''' +Choose paths for corpus, dataset and output. +- The output directory should be empty when starting absinth. +''' +corpus = "/proj/absinth/wikipedia_reduced/" +dataset = "../WSI-Evaluator/datasets/MORESQUE/" +output = "../output/" + +''' +Choose stop words and allowed pos-tags. +- Stop words will not be considered for nodes. +- Only tokens with allowed pos-tags will be considered. +''' +stop_words = ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'] +allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'] + +''' +Choose the maximum number of nodes and edges that should be considered before building the graph. +''' +max_nodes = 100000 +max_edges = 10000000 + +''' +Choose the minimum context size. +''' +min_context_size = 4 + +''' +Choose filters for building the graph. +- Only considers occurrences/cooccurrences for nodes/edges, that occur more often than these values. +- Only considers edges with a weight beneath the maximum weight +''' +min_node_freq = 10 +min_edge_freq = 5 +max_weight = 0.9 + +''' +Choose minimum number of neighbors and maximum median weight of the most frequent neighbors of a node for root hubs. +- the threshold is calculated using the median of the same number of neighbors declared in min_neighbors. +''' +min_neighbors = 6 +threshold = 0.8