Commit a507c2dc authored by Victor Zimmermann

Add config.py file.

parent 7edc05ac
import sys
print('[A] Loading ' + sys.argv[0] + '.\n')

import spacy # for nlp
import os # for reading files
import networkx as nx # for graph data structures
import matplotlib.pyplot as plt # for visualisation
from copy import deepcopy
from nltk.corpus import stopwords
import numpy as np # for calculations
import config

nlp = spacy.load('en') # standard english nlp
def frequencies(corpus_path, target):

    stop_words = set(stopwords.words('english') + config.stop_words)
    allowed_tags = config.allowed_tags
    min_context_size = config.min_context_size
    max_nodes = config.max_nodes
    max_edges = config.max_edges

    node_freq = dict()
    edge_freq = dict()
@@ -89,7 +100,11 @@ def frequencies(corpus_path, target):

    return node_freq, edge_freq
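# Illustrative sketch, not part of the commit: the body elided above presumably
# counts, for each context that contains the target, token occurrences
# (node_freq) and pairwise cooccurrences (edge_freq). A minimal HyperLex-style
# variant, assuming one plain-text document per corpus file and sentence
# contexts (real corpora would need batching rather than nlp(f.read())):
def frequencies_sketch(corpus_path, target):

    stop_words = set(stopwords.words('english') + config.stop_words)
    node_freq = dict()
    edge_freq = dict()

    for file_name in os.listdir(corpus_path):
        with open(os.path.join(corpus_path, file_name)) as f:
            for sentence in nlp(f.read()).sents:
                tokens = [t.text for t in sentence
                          if t.tag_ in config.allowed_tags
                          and t.text not in stop_words]
                if target not in tokens or len(tokens) < config.min_context_size:
                    continue # only contexts mentioning the target are counted
                for i, a in enumerate(tokens):
                    node_freq[a] = node_freq.get(a, 0) + 1
                    for b in tokens[i+1:]:
                        pair = (a, b) if a < b else (b, a) # matches the edge_freq[h,key] lookups below
                        edge_freq[pair] = edge_freq.get(pair, 0) + 1
                if len(node_freq) > config.max_nodes or len(edge_freq) > config.max_edges:
                    return node_freq, edge_freq # cap graph size
    return node_freq, edge_freq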
def build_graph(node_freq, edge_freq):

    min_node_freq = config.min_node_freq
    min_edge_freq = config.min_edge_freq
    max_weight = config.max_weight

    G = nx.Graph()
@@ -115,6 +130,9 @@ def build_graph(node_freq, edge_freq):
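# Illustrative sketch, not part of the commit: the max_weight cut suggests a
# HyperLex-style edge weight (Veronis 2004), w(a,b) = 1 - max(p(a|b), p(b|a)),
# so strongly associated word pairs get light edges. A minimal filtering step
# under that assumption:
def build_graph_sketch(node_freq, edge_freq):

    G = nx.Graph()
    G.add_nodes_from(n for n, f in node_freq.items() if f >= config.min_node_freq)

    for (a, b), cooc in edge_freq.items():
        if cooc < config.min_edge_freq or a not in G or b not in G:
            continue
        weight = 1 - max(cooc / node_freq[a], cooc / node_freq[b]) # conditional cooccurrence probabilities
        if weight < config.max_weight: # drop near-independent pairs
            G.add_edge(a, b, weight=weight)

    return G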
def root_hubs(graph, edge_freq):

    min_neighbors = config.min_neighbors
    threshold = config.threshold

    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sort nodes by degree, descending

    H = list()
@@ -240,20 +258,12 @@ def disambiguate(mst, hubs, contexts):

    return result


def backup(contexts):
    pass
if __name__ == '__main__':

    corpus_path = config.corpus
    data_path = config.dataset
    output_path = config.output

    results = dict()
@@ -274,35 +284,38 @@ if __name__ == '__main__':
    with open(data_path+'topics.txt', 'r') as topics_file:

        already_processed = [f.replace('.absinth', '') for f in os.listdir(output_path)]

        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            if l[1] not in already_processed:
                topics[l[0]] = l[1]
    for key, value in topics.items():

        o_target = value.strip() # original target

        print("[A] Processing '"+o_target+"'.\n")

        if o_target[:4] == 'the_' and o_target.count('_') >= 2: # hard-coded 'the'-protection: strip a leading 'the_' from multi-word targets
            target = o_target[4:]
        else:
            target = o_target

        f = open(output_path+target+'.absinth', 'w')
        f.write('subTopicID\tresultID\n')
        print('[A] Counting Tokens...')
        node_freq, edge_freq = frequencies(corpus_path, target)

        print('\n[A] Building Graph.\n')
        G = build_graph(node_freq, edge_freq)

        print('[A] Collecting Root Hubs...')
        H = root_hubs(G, edge_freq)

        for h in H:
            # six most frequent neighbours of the hub, by cooccurrence count
            mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key,h], reverse=True)[:6]
            print(' {}: {}'.format(h, mfn))
@@ -314,6 +327,7 @@ if __name__ == '__main__':
        print(' Mapping:', D, '\n')

        print('[A] Writing to file '+o_target+'.absinth.\n\n')

        for d in D:
            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
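            # The two-column output matches the header written above: each line
            # pairs a subtopic (sense) ID with a result ID, e.g. "10.2\t10.17"
            # (illustrative values) would assign result 17 of topic 10 to that
            # topic's second sense.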
config.py
'''
Configuration file
'''

'''
Choose paths for corpus, dataset and output.
- The output directory should be empty when starting absinth.
- Relative paths are resolved against the directory absinth is started from.
'''
corpus = "/proj/absinth/wikipedia_reduced/"
dataset = "../WSI-Evaluator/datasets/MORESQUE/"
output = "../output/"
'''
Choose stop words and allowed pos-tags.
- Stop words will not be considered as nodes.
- Only tokens with allowed pos-tags will be considered.
'''
stop_words = ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free']
allowed_tags = ['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'] # Penn Treebank: common nouns, adjectives, proper nouns, some verb forms
'''
Choose the maximum number of nodes and edges that should be considered before building the graph.
'''
max_nodes = 100000
max_edges = 10000000
'''
Choose the minimum context size.
'''
min_context_size = 4
'''
Choose filters for building the graph.
- Only nodes/edges whose occurrence/cooccurrence counts exceed these values are considered.
- Only edges with a weight below the maximum weight are considered.
'''
min_node_freq = 10
min_edge_freq = 5
max_weight = 0.9
'''
Choose the minimum number of neighbors and the maximum median weight of the most frequent neighbors of a node for root hubs.
- The threshold is compared against the median weight of the node's min_neighbors most frequent neighbors.
'''
min_neighbors = 6
threshold = 0.8
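# Illustrative sketch, not part of the commit: how min_neighbors and threshold
# are presumably combined in root_hubs. A candidate hub needs at least
# min_neighbors neighbours, and the median edge weight of its min_neighbors
# most frequent neighbours must stay below threshold (kept as comments here so
# the config module stays free of graph code):
#
#   by_freq = sorted(G.adj[node],
#                    key=lambda n: edge_freq[(node, n) if node < n else (n, node)],
#                    reverse=True)
#   is_hub = (len(by_freq) >= min_neighbors and
#             np.median([G[node][n]['weight'] for n in by_freq[:min_neighbors]]) < threshold)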