Commit a507c2dc authored by Victor Zimmermann

Add config.py file.

parent 7edc05ac
import sys
print('[A] Loading ' + sys.argv[0] + '.\n')

import spacy # for nlp
import os # for reading files
import networkx as nx # for graph data structures
import matplotlib.pyplot as plt # for visualisation
from copy import deepcopy
from nltk.corpus import stopwords
import numpy as np # for calculations
import config

nlp = spacy.load('en') # standard english nlp
def frequencies(corpus_path, target):

    stop_words = set(stopwords.words('english') + config.stop_words)
    allowed_tags = config.allowed_tags
    min_context_size = config.min_context_size
    max_nodes = config.max_nodes
    max_edges = config.max_edges

    node_freq = dict()
    edge_freq = dict()
@@ -89,7 +100,11 @@ def frequencies(corpus_path, target):

    return node_freq, edge_freq
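# Illustrative sketch, not part of the commit: the body elided above presumably
# counts, for each context that contains the target, token occurrences
# (node_freq) and pairwise cooccurrences (edge_freq). A minimal HyperLex-style
# variant, assuming one plain-text document per corpus file and sentence
# contexts (real corpora would need batching rather than nlp(f.read())):
def frequencies_sketch(corpus_path, target):

    stop_words = set(stopwords.words('english') + config.stop_words)
    node_freq = dict()
    edge_freq = dict()

    for file_name in os.listdir(corpus_path):
        with open(os.path.join(corpus_path, file_name)) as f:
            for sentence in nlp(f.read()).sents:
                tokens = [t.text for t in sentence
                          if t.tag_ in config.allowed_tags
                          and t.text not in stop_words]
                if target not in tokens or len(tokens) < config.min_context_size:
                    continue # only contexts mentioning the target are counted
                for i, a in enumerate(tokens):
                    node_freq[a] = node_freq.get(a, 0) + 1
                    for b in tokens[i+1:]:
                        pair = (a, b) if a < b else (b, a) # matches the edge_freq[h,key] lookups below
                        edge_freq[pair] = edge_freq.get(pair, 0) + 1
                if len(node_freq) > config.max_nodes or len(edge_freq) > config.max_edges:
                    return node_freq, edge_freq # cap graph size
    return node_freq, edge_freq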
def build_graph(node_freq, edge_freq):

    min_node_freq = config.min_node_freq
    min_edge_freq = config.min_edge_freq
    max_weight = config.max_weight

    G = nx.Graph()
@@ -115,6 +130,9 @@ def build_graph(node_freq, edge_freq):
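# Illustrative sketch, not part of the commit: the max_weight cut suggests a
# HyperLex-style edge weight (Veronis 2004), w(a,b) = 1 - max(p(a|b), p(b|a)),
# so strongly associated word pairs get light edges. A minimal filtering step
# under that assumption:
def build_graph_sketch(node_freq, edge_freq):

    G = nx.Graph()
    G.add_nodes_from(n for n, f in node_freq.items() if f >= config.min_node_freq)

    for (a, b), cooc in edge_freq.items():
        if cooc < config.min_edge_freq or a not in G or b not in G:
            continue
        weight = 1 - max(cooc / node_freq[a], cooc / node_freq[b]) # conditional cooccurrence probabilities
        if weight < config.max_weight: # drop near-independent pairs
            G.add_edge(a, b, weight=weight)

    return G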
def root_hubs(graph, edge_freq):

    min_neighbors = config.min_neighbors
    threshold = config.threshold

    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sort nodes by degree, descending

    H = list()
@@ -240,20 +258,12 @@ def disambiguate(mst, hubs, contexts):

    return result


def backup(contexts):
    pass
if __name__ == '__main__':

    corpus_path = config.corpus
    data_path = config.dataset
    output_path = config.output

    results = dict()
@@ -274,35 +284,38 @@ if __name__ == '__main__':
    with open(data_path+'topics.txt', 'r') as topics_file:

        already_processed = [f.replace('.absinth', '') for f in os.listdir(output_path)]

        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            if l[1] not in already_processed:
                topics[l[0]] = l[1]
    for key, value in topics.items():

        o_target = value.strip() # original target

        print("[A] Processing '"+o_target+"'.\n")

        if o_target[:4] == 'the_' and o_target.count('_') >= 2: # hard-coded 'the'-protection: strip a leading 'the_' from multi-word targets
            target = o_target[4:]
        else:
            target = o_target

        f = open(output_path+target+'.absinth', 'w')
        f.write('subTopicID\tresultID\n')
        print('[A] Counting Tokens...')
        node_freq, edge_freq = frequencies(corpus_path, target)

        print('\n[A] Building Graph.\n')
        G = build_graph(node_freq, edge_freq)

        print('[A] Collecting Root Hubs...')
        H = root_hubs(G, edge_freq)

        for h in H:
            # six most frequent neighbours of the hub, by cooccurrence count
            mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key,h], reverse=True)[:6]
            print(' {}: {}'.format(h, mfn))
@@ -314,6 +327,7 @@ if __name__ == '__main__':
        print(' Mapping:', D, '\n')

        print('[A] Writing to file '+o_target+'.absinth.\n\n')

        for d in D:
            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
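            # The two-column output matches the header written above: each line
            # pairs a subtopic (sense) ID with a result ID, e.g. "10.2\t10.17"
            # (illustrative values) would assign result 17 of topic 10 to that
            # topic's second sense.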
config.py
'''
Configuration file
'''

'''
Choose paths for corpus, dataset and output.
- The output directory should be empty when starting absinth.
- Relative paths are resolved against the directory absinth is started from.
'''
corpus = "/proj/absinth/wikipedia_reduced/"
dataset = "../WSI-Evaluator/datasets/MORESQUE/"
output = "../output/"
'''
Choose stop words and allowed pos-tags.
- Stop words will not be considered as nodes.
- Only tokens with allowed pos-tags will be considered.
'''
stop_words = ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free']
allowed_tags = ['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'] # Penn Treebank: common nouns, adjectives, proper nouns, some verb forms
'''
Choose the maximum number of nodes and edges that should be considered before building the graph.
'''
max_nodes = 100000
max_edges = 10000000
'''
Choose the minimum context size.
'''
min_context_size = 4
'''
Choose filters for building the graph.
- Only nodes/edges whose occurrence/cooccurrence counts exceed these values are considered.
- Only edges with a weight below the maximum weight are considered.
'''
min_node_freq = 10
min_edge_freq = 5
max_weight = 0.9
'''
Choose the minimum number of neighbors and the maximum median weight of the most frequent neighbors of a node for root hubs.
- The threshold is compared against the median weight of the node's min_neighbors most frequent neighbors.
'''
min_neighbors = 6
threshold = 0.8
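# Illustrative sketch, not part of the commit: how min_neighbors and threshold
# are presumably combined in root_hubs. A candidate hub needs at least
# min_neighbors neighbours, and the median edge weight of its min_neighbors
# most frequent neighbours must stay below threshold (kept as comments here so
# the config module stays free of graph code):
#
#   by_freq = sorted(G.adj[node],
#                    key=lambda n: edge_freq[(node, n) if node < n else (n, node)],
#                    reverse=True)
#   is_hub = (len(by_freq) >= min_neighbors and
#             np.median([G[node][n]['weight'] for n in by_freq[:min_neighbors]]) < threshold)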