Skip to content
Snippets Groups Projects
config.py 2.01 KiB
'''
Configuration file
'''

# ---------------------------------------------------------------------------
# Paths for corpus, dataset and output.
#
# NOTE: the output directory should be empty when starting absinth.
# ---------------------------------------------------------------------------
graph = "./.graphs/"  # presumably where graph files are kept -- confirm against usage
corpus = "/proj/absinth/wikipedia_shuffled2/"
dataset = "../WSI-Evaluator/datasets/dataset/"
test = "../WSI-Evaluator/datasets/trial/"  # trial data, next to the main dataset
output = "../output/"

# ---------------------------------------------------------------------------
# Disambiguation pipeline.
#
# Multiple disambiguation methods are implemented. The values below specify
# the order in which they are merged and whether conflicts between them
# should be resolved.
#
# - If conflicts are not resolved, the first method with a positive result
#   is used.
# - Methods labeled with 0 are ignored.
# - At least one method must be given a value != 0.
# ---------------------------------------------------------------------------
resolve_conflicts = False  # not yet implemented
colour_rank = 1
mst_rank = 2

# ---------------------------------------------------------------------------
# Stop words and allowed pos-tags.
#
# - Stop words will not be considered for nodes.
# - Only tokens with allowed pos-tags will be considered.
# ---------------------------------------------------------------------------
stop_words = [
    "utc", "'s", "new", "p.", "first", "other", "talk",
    "wikipedia", "article", "topic", "page", "editors", "encyclopedia",
    "free", "pp", "twitter", "facebook", "youtube", "copyright",
    "®", "|",
]
allowed_tags = [
    "NN", "NNS",
    "JJ", "JJS", "JJR",
    "NNP",
]

# ---------------------------------------------------------------------------
# Maximum number of nodes and edges that should be considered before
# building the graph.
# ---------------------------------------------------------------------------
max_nodes = 20000
max_edges = 2000000

# ---------------------------------------------------------------------------
# Context size bounds: the minimum and the maximum context size.
# ---------------------------------------------------------------------------
min_context_size = 4
max_context_size = 20

# ---------------------------------------------------------------------------
# Filters for building the graph.
#
# - Occurrences (nodes) and cooccurrences (edges) are only considered if
#   they occur more often than the frequency values below.
# - Only edges with a weight beneath the maximum weight are considered.
# ---------------------------------------------------------------------------
min_node_freq = 10
min_edge_freq = 5
max_weight = 0.9

# ---------------------------------------------------------------------------
# Root hub criteria: the minimum number of neighbors, and the maximum median
# weight of the most frequent neighbors of a node.
#
# - The threshold is calculated using the median of the same number of
#   neighbors as declared in min_neighbors.
# ---------------------------------------------------------------------------
min_neighbors = 4
threshold = 0.8

# ---------------------------------------------------------------------------
# Whether or not the tokens should be lemmatised.
# ---------------------------------------------------------------------------
lemma = False

# ---------------------------------------------------------------------------
# Propagation options.
# ---------------------------------------------------------------------------
max_propagation_iteration_count = 50