From a507c2dcbe6d99f045fa14ca52ee32ff65a2c33c Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Thu, 8 Mar 2018 19:10:03 +0100
Subject: [PATCH] Add config.py file.

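Move the hard-coded corpus, dataset and output paths, the stop word
list and all algorithm hyperparameters out of absinth.py into a new
config.py module. Instead of keyword-argument defaults, the functions
now read their settings from config; build_graph(), for example,
begins with:

    def build_graph(node_freq, edge_freq):

        min_node_freq = config.min_node_freq
        min_edge_freq = config.min_edge_freq
        max_weight = config.max_weight

Two effective defaults change: min_context_size goes from 2 to 4 and
min_neighbors from 4 to 6. The unused backup() stub and the skipping
of already processed topics are removed, and all topics from
topics.txt are processed instead of only the first four.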
---
 src/absinth.py | 60 +++++++++++++++++++++++++++++++-------------------
 src/config.py  | 46 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+), 23 deletions(-)
 create mode 100644 src/config.py

diff --git a/src/absinth.py b/src/absinth.py
index 02fd7ae..c9cef83 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,16 +1,27 @@
-import os # for reading files
 import sys
+
 print('[A] Loading ' + sys.argv[0] + '.\n')
-import spacy # for nlp
+
+import os # for reading files
 import networkx as nx # for building the cooccurrence graph
-import matplotlib.pyplot as plt # for visualisation
 from copy import deepcopy
 from nltk.corpus import stopwords
 import numpy as np # for calculations
+
+import config
+
+import spacy # for nlp
 nlp = spacy.load('en') # standard english nlp
 
 
-def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000):
+
+def frequencies(corpus_path, target):
+    
+    stop_words = set(stopwords.words('english') + config.stop_words)
+    allowed_tags = config.allowed_tags
+    min_context_size = config.min_context_size
+    max_nodes = config.max_nodes
+    max_edges = config.max_edges
     
     node_freq = dict()
     edge_freq = dict()
@@ -89,7 +100,11 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
     return node_freq, edge_freq
 
 
-def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_weight= 0.9):
+def build_graph(node_freq, edge_freq):
+    
+    min_node_freq = config.min_node_freq
+    min_edge_freq = config.min_edge_freq
+    max_weight = config.max_weight
     
     G = nx.Graph()
     
@@ -115,6 +130,9 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
 
 def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
+    min_neighbors = config.min_neighbors
+    threshold = config.threshold
+    
     G = deepcopy(graph)
     V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sort nodes by degree, descending
     H = list()
@@ -240,20 +258,12 @@ def disambiguate(mst, hubs, contexts):
 
     return result
 
-def backup(contexts):
-    
-    pass
-        
-        
 
 if __name__ == '__main__':
     
-    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
-    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia_reduced/'
-    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
-    
-    stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
+    corpus_path = config.corpus
+    data_path = config.dataset
+    output_path = config.output
     
     results = dict()
     
@@ -274,35 +284,38 @@ if __name__ == '__main__':
     
     with open(data_path+'topics.txt', 'r') as topics_file:
         
-        already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
-        
-        for line in topics_file.readlines()[1:5]:
+        for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            if l[1] not in already_processed:
-                topics[l[0]] = l[1]
+            topics[l[0]] = l[1]
         
     for key, value in topics.items():
             
         o_target = value.strip() #original target
         print("[A] Processing '"+o_target+"'.\n")
+        
         if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
+            
             target = o_target[4:]
+            
         else:
+            
             target = o_target
         
-        f = open(results_path+target+'.absinth', 'w')
+        f = open(output_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')
         
         print('[A] Counting Tokens...')
-        node_freq, edge_freq = frequencies(corpus_path, target, stop)
+        node_freq, edge_freq = frequencies(corpus_path, target)
         
         print('\n[A] Building Graph.\n')
         G = build_graph(node_freq, edge_freq)
         
         print('[A] Collecting Root Hubs...')
         H = root_hubs(G, edge_freq)
+        
         for h in H:
+            
             mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
             print(' {}: {}'.format(h, mfn))
         
@@ -314,6 +327,7 @@ if __name__ == '__main__':
         print(' Mapping:', D, '\n')
         
         print('[A] Writing to file '+o_target+'.absinth.\n\n')
+        
         for d in D:
             
             f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
diff --git a/src/config.py b/src/config.py
new file mode 100644
index 0000000..ecda988
--- /dev/null
+++ b/src/config.py
@@ -0,0 +1,46 @@
+'''
+Configuration file for absinth.py.
+'''
+
+'''
+Choose paths for the corpus, the dataset and the output.
+- The output directory should be empty when absinth is started.
+'''
+corpus = "/proj/absinth/wikipedia_reduced/"
+dataset = "../WSI-Evaluator/datasets/MORESQUE/"
+output = "../output/"
+
+'''
+Choose stop words and allowed POS tags.
+- Stop words will not be used as nodes.
+- Only tokens whose POS tag is in allowed_tags are considered.
+'''
+stop_words = ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free']
+allowed_tags = ['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG']
+
+'''
+Choose the maximum number of nodes and edges that should be considered before building the graph.
+'''
+max_nodes = 100000
+max_edges = 10000000
+
+'''
+Choose the minimum context size.
+'''
+min_context_size = 4
+
+'''
+Choose filters for building the graph.
+- Only nodes/edges whose occurrence/cooccurrence counts exceed these values are considered.
+- Only edges with a weight below max_weight are considered.
+'''
+min_node_freq = 10
+min_edge_freq = 5
+max_weight = 0.9
+
+'''
+Choose the minimum number of neighbors and the maximum median weight (threshold) of a node's most frequent neighbors for root hub selection.
+- The median is computed over the min_neighbors most frequent neighbors.
+'''
+min_neighbors = 6
+threshold = 0.8
-- 
GitLab