Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Absinth - A Small World of Semantic Similarity
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Victor Zimmermann
Absinth - A Small World of Semantic Similarity
Commits
a507c2dc
Commit
a507c2dc
authored
7 years ago
by
Victor Zimmermann
Browse files
Options
Downloads
Patches
Plain Diff
Add config.py file.
parent
7edc05ac
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/absinth.py
+37
-23
37 additions, 23 deletions
src/absinth.py
src/config.py
+46
-0
46 additions, 0 deletions
src/config.py
with
83 additions
and
23 deletions
src/absinth.py
+
37
−
23
View file @
a507c2dc
import
os
# for reading files
import
sys
print
(
'
[A] Loading
'
+
sys
.
argv
[
0
]
+
'
.
\n
'
)
import
spacy
# for nlp
import
os
# for reading files
import
networkx
as
nx
# for visualisation
import
matplotlib.pyplot
as
plt
# for visualisation
from
copy
import
deepcopy
from
nltk.corpus
import
stopwords
import
numpy
as
np
# for calculations
import
config
import
spacy
# for nlp
nlp
=
spacy
.
load
(
'
en
'
)
# standard english nlp
def
frequencies
(
corpus_path
,
target
,
stop_words
=
[],
allowed_tags
=
[
'
NN
'
,
'
NNS
'
,
'
JJ
'
,
'
JJS
'
,
'
JJR
'
,
'
NNP
'
,
'
VBZ
'
,
'
VBG
'
],
min_context_size
=
2
,
max_nodes
=
100000
,
max_edges
=
10000000
):
def
frequencies
(
corpus_path
,
target
):
stop_words
=
set
(
stopwords
.
words
(
'
english
'
)
+
config
.
stop_words
)
allowed_tags
=
config
.
allowed_tags
min_context_size
=
config
.
min_context_size
max_nodes
=
config
.
max_nodes
max_edges
=
config
.
max_edges
node_freq
=
dict
()
edge_freq
=
dict
()
...
...
@@ -89,7 +100,11 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
return
node_freq
,
edge_freq
def
build_graph
(
node_freq
,
edge_freq
,
min_node_freq
=
10
,
min_edge_freq
=
5
,
max_weight
=
0.9
):
def
build_graph
(
node_freq
,
edge_freq
):
min_node_freq
=
config
.
min_node_freq
min_edge_freq
=
config
.
min_edge_freq
max_weight
=
config
.
max_weight
G
=
nx
.
Graph
()
...
...
@@ -115,6 +130,9 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
def
root_hubs
(
graph
,
edge_freq
,
min_neighbors
=
4
,
theshold
=
0.8
):
min_neighbors
=
config
.
min_neighbors
threshold
=
config
.
threshold
G
=
deepcopy
(
graph
)
V
=
sorted
(
G
.
nodes
,
key
=
lambda
key
:
G
.
degree
[
key
],
reverse
=
True
)
# -1 to sort descending (...3 -> 2 -> 1...)
H
=
list
()
...
...
@@ -240,20 +258,12 @@ def disambiguate(mst, hubs, contexts):
return
result
def
backup
(
contexts
):
pass
if
__name__
==
'
__main__
'
:
data_path
=
'
/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/
'
#corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
corpus_path
=
'
/proj/absinth/wikipedia_reduced/
'
results_path
=
'
/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/
'
stop
=
set
(
stopwords
.
words
(
'
english
'
)
+
[
'
utc
'
,
"'
s
"
,
'
new
'
,
'
other
'
,
'
talk
'
,
'
wikipedia
'
,
'
article
'
,
'
topic
'
,
'
page
'
,
'
editors
'
,
'
encyclopedia
'
,
'
free
'
])
corpus_path
=
config
.
corpus
data_path
=
config
.
dataset
output_path
=
config
.
output
results
=
dict
()
...
...
@@ -274,35 +284,38 @@ if __name__ == '__main__':
with
open
(
data_path
+
'
topics.txt
'
,
'
r
'
)
as
topics_file
:
already_processed
=
[
f
.
replace
(
'
.absinth
'
,
''
)
for
f
in
os
.
listdir
(
results_path
)]
for
line
in
topics_file
.
readlines
()[
1
:
5
]:
for
line
in
topics_file
.
readlines
()[
1
:]:
l
=
line
.
split
(
'
\t
'
)
if
l
[
1
]
not
in
already_processed
:
topics
[
l
[
0
]]
=
l
[
1
]
topics
[
l
[
0
]]
=
l
[
1
]
for
key
,
value
in
topics
.
items
():
o_target
=
value
.
strip
()
#original target
print
(
"
[A] Processing
'"
+
o_target
+
"'
.
\n
"
)
if
o_target
[:
4
]
==
'
the_
'
and
o_target
.
count
(
'
_
'
)
>=
2
:
#hard coded 'the'-protection
target
=
o_target
[
4
:]
else
:
target
=
o_target
f
=
open
(
results
_path
+
target
+
'
.absinth
'
,
'
w
'
)
f
=
open
(
output
_path
+
target
+
'
.absinth
'
,
'
w
'
)
f
.
write
(
'
subTopicID
\t
resultID
\n
'
)
print
(
'
[A] Counting Tokens...
'
)
node_freq
,
edge_freq
=
frequencies
(
corpus_path
,
target
,
stop
)
node_freq
,
edge_freq
=
frequencies
(
corpus_path
,
target
)
print
(
'
\n
[A] Building Graph.
\n
'
)
G
=
build_graph
(
node_freq
,
edge_freq
)
print
(
'
[A] Collecting Root Hubs...
'
)
H
=
root_hubs
(
G
,
edge_freq
)
for
h
in
H
:
mfn
=
sorted
(
G
.
adj
[
h
],
key
=
lambda
key
:
edge_freq
[
h
,
key
]
if
h
<
key
else
edge_freq
[
key
,
h
],
reverse
=
True
)[:
6
]
print
(
'
{}: {}
'
.
format
(
h
,
mfn
))
...
...
@@ -314,6 +327,7 @@ if __name__ == '__main__':
print
(
'
Mapping:
'
,
D
,
'
\n
'
)
print
(
'
[A] Writing to file
'
+
o_target
+
'
.absinth.
\n\n
'
)
for
d
in
D
:
f
.
write
(
key
+
'
.
'
+
str
(
d
[
0
])
+
'
\t
'
+
key
+
'
.
'
+
str
(
d
[
1
])
+
'
\n
'
)
...
...
This diff is collapsed.
Click to expand it.
src/config.py
0 → 100644
+
46
−
0
View file @
a507c2dc
"""Configuration file for absinth.

All values are plain module-level names that absinth.py reads as
``config.<name>``.
"""

# --- Paths -----------------------------------------------------------------
# Choose paths for corpus, dataset and output.
# - The output directory should be empty when starting absinth.
# - absinth.py builds file names by string concatenation
#   (e.g. ``dataset + 'topics.txt'``), so every path must end with a slash.
corpus = "/proj/absinth/wikipedia_reduced/"
dataset = "../WSI-Evaluator/datasets/MORESQUE/"
output = "../output/"

# --- Token filters ---------------------------------------------------------
# Choose stop words and allowed pos-tags.
# - Stop words will not be considered for nodes; absinth.py adds them to
#   NLTK's English stop word list.
# - Only tokens with allowed pos-tags will be considered.
stop_words = [
    'utc',
    "'s",
    'new',
    'other',
    'talk',
    'wikipedia',
    'article',
    'topic',
    'page',
    'editors',
    'encyclopedia',
    'free',
]
allowed_tags = ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP', 'VBZ', 'VBG']

# --- Counting limits -------------------------------------------------------
# Choose the maximum number of nodes and edges that should be considered
# before building the graph.
max_nodes = 100000
max_edges = 10000000

# Choose the minimum context size.
min_context_size = 4

# --- Graph filters ---------------------------------------------------------
# Choose filters for building the graph.
# - Only considers occurrences/cooccurrences for nodes/edges that occur more
#   often than these values.
# - Only considers edges with a weight beneath the maximum weight.
min_node_freq = 10
min_edge_freq = 5
max_weight = 0.9

# --- Root hub selection ----------------------------------------------------
# Choose minimum number of neighbors and maximum median weight of the most
# frequent neighbors of a node for root hubs.
# - The threshold is calculated using the median of the same number of
#   neighbors declared in min_neighbors.
min_neighbors = 6
# absinth.py reads ``config.threshold``; the setting was previously spelled
# ``theshold``, which made that lookup fail with AttributeError.
threshold = 0.8
# Backward-compatible alias for the old misspelled name.
theshold = threshold
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment