Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Absinth - A Small World of Semantic Similarity
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Victor Zimmermann
Absinth - A Small World of Semantic Similarity
Commits
e96d083d
Commit
e96d083d
authored
7 years ago
by
Victor Zimmermann
Browse files
Options
Downloads
Patches
Plain Diff
Added Support for Evolutionary Graph Clustering.
parent
c71dbe78
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/absinth.py
+171
-13
171 additions, 13 deletions
src/absinth.py
src/config.py
+8
-2
8 additions, 2 deletions
src/config.py
with
179 additions
and
15 deletions
src/absinth.py
+
171
−
13
View file @
e96d083d
...
...
@@ -33,8 +33,8 @@ import pprint
import
random
import
re
import
spacy
# for nlp
from
multiprocessing
import
Pool
from
nltk.corpus
import
stopwords
from
copy
import
deepcopy
...
...
@@ -201,7 +201,7 @@ def process_file(context_list: list, target_string: str,
spaced_target_string
=
target_string
.
replace
(
'
_
'
,
'
'
)
stopword_list
=
set
(
stopwords
.
words
(
'
english
'
)
+
config
.
stop_words
)
stopword_list
=
config
.
stop_words
allowed_tag_list
=
config
.
allowed_tags
min_context_size
=
config
.
min_context_size
...
...
@@ -227,7 +227,7 @@ def process_file(context_list: list, target_string: str,
pass
# Do not add stop words to nodes.
elif
token
.
text
in
stopword_list
:
elif
token
.
is_stop
or
token
.
text
in
stopword_list
:
pass
# Add only tokens with allowed tags to nodes.
...
...
@@ -548,8 +548,154 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
return
graph
,
root_hub_list
,
stat_dict
def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
    """
    Colours graph according to root hubs.

    Evolving network that colours neighbouring nodes iteratively: every root
    hub is assigned its own colour (its index in root_hub_list), all other
    nodes start uncoloured, and each iteration recolours every node with the
    colour that carries the highest accumulated edge similarity among its
    neighbours. Iteration stops when the colouring is stable or the
    configured iteration budget is exhausted.

    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of senses (root hub nodes).

    Returns:
        Coloured graph (the input graph, mutated in place, with a 'sense'
        attribute on every node).
    """

    # Seed colours: root hubs get their list index, everything else None.
    for node in graph.nodes:
        if node in root_hub_list:
            graph.node[node]['sense'] = root_hub_list.index(node)
        else:
            graph.node[node]['sense'] = None

    max_iteration_count = config.max_colour_iteration_count
    iteration_count = 0

    stable = False
    while not stable and iteration_count <= max_iteration_count:

        # Snapshot so every node in this sweep sees the previous iteration's
        # colours (synchronous update), not partially updated ones.
        graph_copy = deepcopy(graph)
        iteration_count += 1

        stable = True
        for node in graph.nodes:

            neighbor_weight_list = [0] * len(root_hub_list)
            for neighbor in graph_copy[node]:
                neighbor_sense = graph_copy.node[neighbor]['sense']
                if neighbor_sense is None:
                    # Uncoloured neighbours contribute no vote.
                    pass
                else:
                    # Edge weights are distances, so 1 - weight is a
                    # similarity score; accumulate it per colour.
                    neighbor_weight_list[neighbor_sense] \
                        += 1 - graph_copy[node][neighbor]['weight']

            if any(neighbor_weight_list):

                old_colour = graph_copy.node[node]['sense']
                new_colour = np.argmax(neighbor_weight_list)

                if old_colour != new_colour:
                    # A colour changed, so another sweep is needed.
                    stable = False
                    graph.node[node]['sense'] = new_colour
                else:
                    pass

            else:
                # No coloured neighbours yet; leave this node untouched.
                pass

    return graph
def disambiguate_colour(graph: nx.Graph, root_hub_list, context_list: list) -> dict:
    """
    Clusters contexts to root hubs using a coloured graph.

    This algorithm colours the graph with colour_graph() (evolutionary graph
    clustering) and then scores every root hub for each context: each context
    token found in the coloured graph votes for its colour's root hub with a
    score of 1 / (1 + total path weight) along the shortest weighted path
    from the token to that root hub.

    Args:
        graph: Undirected weighted graph.
        root_hub_list: List of root hubs (senses).
        context_list: List of search result strings to be clustered.

    Returns:
        A dictionary with root hub IDs (1-based) as keys and lists of
        context indices (1-based) as values.
    """

    coloured_graph = colour_graph(graph, root_hub_list)

    mapping_dict = {i: list() for i in range(1, len(root_hub_list) + 1)}

    # If no sense was found for the target word, assume a single sense and
    # map every context to it.
    if len(root_hub_list) == 0:
        mapping_dict = {0: [i for i in range(1, len(context_list) + 1)]}
        return mapping_dict

    # Context IDs are 1-based, matching the task's output format.
    for context_id, context in enumerate(context_list, start=1):

        score = [0] * len(root_hub_list)
        parsed_context = nlp(context)

        for token in parsed_context:

            # Use lemmas only when the corpus was indexed with lemmas.
            if config.lemma:
                text = token.lemma_
            else:
                text = token.text

            if text in coloured_graph.nodes:

                text_colour = coloured_graph.node[text]['sense']
                if text_colour is None:
                    # Token never got coloured; it cannot vote.
                    pass
                else:
                    text_root = root_hub_list[text_colour]
                    if nx.has_path(coloured_graph, text, text_root):
                        shortest_path = nx.shortest_path(coloured_graph,
                                                         text,
                                                         root_hub_list[text_colour],
                                                         'weight')
                        total_weight = 0

                        # Add weights of every sub-path.
                        for i in range(1, len(shortest_path)):
                            sub_from, sub_to = shortest_path[i - 1], shortest_path[i]
                            total_weight += \
                                coloured_graph[sub_from][sub_to]['weight']

                        # Closer root hubs (lower path weight) score higher.
                        score[text_colour] += 1 / (1 + total_weight)
                    else:
                        pass

            else:
                pass

        if any(score):
            # +1 because mapping_dict keys are 1-based.
            mapping_dict[np.argmax(score) + 1].append(context_id)
        else:
            # Context matched no sense; leave it unassigned.
            pass

    return mapping_dict
def
disambiguate_mst
(
graph
:
nx
.
Graph
,
root_hub_list
:
list
,
context_list
:
list
,
topic_name
:
str
)
->
dict
:
"""
Matches contexts to senses.
...
...
@@ -568,7 +714,6 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
"""
#performs minimum_spanning_tree algorithm on graph
print
(
'
[a]
'
,
'
Building minimum spanning tree.
\t
(
'
+
topic_name
+
'
)
'
)
minimum_spanning_tree
=
components
(
graph
,
root_hub_list
,
topic_name
)
spaced_topic_name
=
topic_name
.
replace
(
'
_
'
,
'
'
)
...
...
@@ -581,7 +726,9 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
#if no sense is found for a target word, we should assume that there only is one sense
if
len
(
root_hub_list
)
==
0
:
return
{
0
:[
i
for
i
in
range
(
1
,
len
(
context_list
)
+
1
)]}
mapping_dict
=
{
0
:[
i
for
i
in
range
(
1
,
len
(
context_list
)
+
1
)]}
return
mapping_dict
idx
=
0
...
...
@@ -639,8 +786,8 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""
Calls induction and disambiguation functions, performs main task.
The task is to both induce senses and match search results to them. This
function calls in much the same way induce() and disambiguate() to
perform
these sub tasks. The result is then written to the output directory
function calls in much the same way induce() and disambiguate
_mst
() to
perform
these sub tasks. The result is then written to the output directory
specified in config.py.
Args:
...
...
@@ -657,8 +804,15 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
#matches senses to clusters
print
(
'
[a]
'
,
'
Disambiguating result_list.
\t
(
'
+
topic_name
+
'
)
'
)
mapping_dict
=
disambiguate
(
graph
,
root_hub_list
,
result_dict
[
topic_id
],
topic_name
)
if
config
.
use_colouring
==
True
:
print
(
'
[a]
'
,
'
Colouring graph.
\t
(
'
+
topic_name
+
'
)
'
)
mapping_dict
=
disambiguate_colour
(
graph
,
root_hub_list
,
result_dict
[
topic_id
])
else
:
print
(
'
[a]
'
,
'
Building minimum spanning tree.
\t
(
'
+
topic_name
+
'
)
'
)
mapping_dict
=
disambiguate_mst
(
graph
,
root_hub_list
,
result_dict
[
topic_id
],
topic_name
)
#collect statistics from result.
cluster_count
=
0
...
...
@@ -696,6 +850,13 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
if
__name__
==
'
__main__
'
:
"""
Check for modifiers and call main().
Only called when absinth.py is started manually. Checks for various
modifiers, i.e. test environment and number of processes to run
simultaneously.
"""
# If absinth.py is run in test environment.
if
'
-t
'
in
sys
.
argv
:
data_path
=
config
.
test
...
...
@@ -714,6 +875,3 @@ if __name__ == '__main__':
parameter_list
=
[(
topic_id
,
topic_name
,
result_dict
)
for
topic_id
,
topic_name
in
topic_dict
.
items
()]
pool
.
starmap
(
main
,
parameter_list
)
#for topic_id,topic_name in topics.items():
#word_sense_induction(topic_id,topic_name, results)
This diff is collapsed.
Click to expand it.
src/config.py
+
8
−
2
View file @
e96d083d
...
...
@@ -43,10 +43,16 @@ max_weight = 0.9
Choose minimum number of neighbors and maximum median weight of the most frequent neighbors of a node for root hubs.
- the threshold is calculated using the media of the same number of neighbors declared in min_neighbors.
'''
min_neighbors
=
5
min_neighbors
=
4
threshold
=
0.8
'''
Choose whether or not the tokens should be lemmatised.
'''
lemma
=
True
lemma
=
False
'''
colouring options
'''
use_colouring
=
True
max_colour_iteration_count
=
50
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment