Absinth - A Small World of Semantic Similarity

Commit 565b5233, authored 7 years ago by Victor Zimmermann
Reimplement components() + more comment reform.
Parent: 657eb8e5
1 changed file: src/absinth.py (194 additions, 112 deletions)

@@ -44,7 +44,7 @@ def frequencies(target_string, search_result_list):
     bracketed_target_string = '(' + target_string + ')'
 
-    # Remove unnecessary tokens from snippets
+    # Remove unnecessary tokens from snippets.
     _search_result_list = list()
     for r in search_result_list:
         r = r.replace('<b>', '')
@@ -53,13 +53,12 @@ def frequencies(target_string, search_result_list):
         r = r.strip()
         _search_result_list.append(r)
 
-    #initialises frequencies with counts from results
+    # Initialise frequencies with counts from results.
     node_freq_dict, edge_freq_dict = process_file(_search_result_list, target_string, dict(), dict())
 
-    #names of corpus files
     corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
     corpus_size = len(corpus_file_path_list)
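(For orientation: process_file returns plain count dictionaries keyed by single tokens and by alphabetically ordered token pairs. A minimal sketch of their shape, with made-up values for illustration only:

node_freq_dict = {'bottle': 5, 'drink': 12, 'green': 7}          # token -> occurrence count
edge_freq_dict = {('bottle', 'drink'): 3, ('drink', 'green'): 4}  # ordered pair -> cooccurrence count

The pair keys are stored with the lexicographically smaller token first, which is why later code looks up edge_freq_dict[a, b] if a < b else edge_freq_dict[b, a].)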
@@ -69,7 +68,7 @@ def frequencies(target_string, search_result_list):
         node_count = len(node_freq_dict)
         edge_count = len(edge_freq_dict)
 
-        #prints update after every 11th of the corpus is parsed
+        # Print update after every 11th of the corpus is parsed.
         if processed_file_count % int(corpus_size/11) == 0:
 
             file_ratio = processed_file_count / corpus_size
@@ -78,7 +77,7 @@ def frequencies(target_string, search_result_list):
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
 
-            #uses the ratio closest to 100%.
+            # Use ratio closest to 100%.
             highest_ratio = int((max(ratios))*100)
 
             print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
@@ -86,7 +85,6 @@ def frequencies(target_string, search_result_list):
                                                                    edge_count,
                                                                    bracketed_target_string))
 
-        #checks maximum node values
         if node_count > max_node_count:
             print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                                edge_count,
@@ -148,11 +146,11 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
             for context in context_list:
 
                 context = context.lower()
 
-                if spaced_target_string in context: # greedy preselection, not perfect
+                if spaced_target_string in context: # Greedily pre-select lines.
 
-                    token_set = set() #set of node candidates
+                    token_set = set()
 
-                    # This replacement allows target to be treated as single entity.
+                    # Allow target to be treated as single entity.
                     context = context.replace(spaced_target_string, target_string)
                     processed_context = nlp(context)
@@ -160,15 +158,15 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                     for token in processed_context:
 
-                        # doesn't add target word to nodes
+                        # Do not add target word to nodes.
                         if token.text == target_string:
                             pass
 
-                        # doesn't add stop words to nodes
+                        # Do not add stop words to nodes.
                         elif token.text in stopword_list:
                             pass
 
-                        # only adds tokens with allowed tags to nodes
+                        # Add only tokens with allowed tags to nodes.
                         elif token.tag_ in allowed_tag_list:
                             token_set.add(token.text)
@@ -190,14 +188,14 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                         else:
                             edge_freq_dict[edge] = 1
 
-        # if a file is corrupted (can't always be catched with if-else)
+        # If file is corrupted (can't always be caught with if-else), ignore file.
         except UnicodeDecodeError:
             pass
 
     return node_freq_dict, edge_freq_dict
 
 
 #build graph from frequency dictionaries
 def build_graph(node_freq_dict, edge_freq_dict):
     """Builds undirected weighted graph from dictionaries.
@@ -221,13 +219,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
     cooccurence_graph = nx.Graph()
 
-    #node : node frequency
     for node, frequency in node_freq_dict.items():
         if frequency >= min_node_freq:
             cooccurence_graph.add_node(node)
 
-    #edge : edge frequency
     for node_tuple, frequency in edge_freq_dict.items():
         if frequency < min_edge_freq:
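(The two loops above only admit sufficiently frequent nodes and edges into the cooccurrence graph. A minimal standalone sketch of that filtering, with made-up counts and assumed thresholds; the module reads min_node_freq and min_edge_freq from its config and computes the real edge weights from the counts:

import networkx as nx

node_freq_dict = {'bottle': 1, 'drink': 12, 'green': 7}           # illustrative counts only
edge_freq_dict = {('bottle', 'drink'): 1, ('drink', 'green'): 4}
min_node_freq, min_edge_freq = 2, 2                                # assumed thresholds

cooccurence_graph = nx.Graph()
for node, frequency in node_freq_dict.items():
    if frequency >= min_node_freq:
        cooccurence_graph.add_node(node)           # keep frequent tokens
for node_tuple, frequency in edge_freq_dict.items():
    if frequency < min_edge_freq:
        continue                                   # drop rare cooccurrences
    if all(node in cooccurence_graph for node in node_tuple):
        # Placeholder weight; the module derives weights from the frequencies.
        cooccurence_graph.add_edge(*node_tuple, weight=0.5)

print(list(cooccurence_graph.edges(data=True)))    # [('drink', 'green', {'weight': 0.5})]
)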
@@ -265,166 +261,246 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurence_graph
 
 
-#Identifies senses by choosing nodes with high degrees
-def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq_dict):
+    """Identifies senses (root hubs) by choosing nodes with high degrees.
+
+    Selects root hubs according to the algorithm in Véronis (2004). Nodes with
+    high degree and neighbors with low weights (high cooccurrence) are chosen
+    until there are no more viable candidates. A root hub candidate is every
+    node that is not already a hub and is not a neighbor of one.
+
+    Args:
+        graph: Weighted undirected graph.
+        edge_freq_dict: Dictionary of weights for every tuple in our graph.
+
+    Returns:
+        hub_list: List of root hubs, i.e. strings that are selected using the
+            algorithm explained above.
+    """
+
+    min_neighbors = config.min_neighbors
+    threshold = config.threshold
 
-    G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
-    H = list() #output list
+    # Allow operations on graph without altering original one.
+    graph_copy = deepcopy(graph)
+
+    # Sort according to degree (number of neighbors).
+    candidate_list = sorted(graph_copy.nodes,
+                            key=lambda node: graph_copy.degree[node],
+                            reverse=True)
+
+    hub_list = list()
 
-    while V:
+    # While there are still candidates, search for root hubs.
+    while candidate_list:
 
-        v = V[0] #best hub candidate
+        candidate = candidate_list[0] #best hub candidate
 
-        if G.degree[v] >= min_neighbors:
+        if graph_copy.degree[candidate] >= min_neighbors:
 
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key,v], reverse=True)[:min_neighbors] #most frequent neighbors
+            by_frequency = lambda node: edge_freq_dict[candidate,node] \
+                                        if candidate < node \
+                                        else edge_freq_dict[node,candidate]
+
+            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
+                                                 key=by_frequency,
+                                                 reverse=True)[:min_neighbors]
 
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
+            # If the most frequent neighbors cooccur frequently enough with the
+            # candidate (mean weight under threshold), the candidate is approved.
+            if np.mean([graph_copy.edges[candidate,node]['weight']
+                        for node in most_frequent_neighbor_list]) < threshold:
 
-                H.append(v)
+                # Add candidate as root hub.
+                hub_list.append(candidate)
 
-                #removes neighbors of new hub as hub candidates
-                for nbr in deepcopy(G).adj[v]:
-                    G.remove_node(nbr)
+                # Remove neighbors of new hub as hub candidates.
+                for neighbor in deepcopy(graph_copy).adj[candidate]:
+                    graph_copy.remove_node(neighbor)
 
-            #removes hub candidate
-            G.remove_node(v)
+            # Remove hub candidate.
+            graph_copy.remove_node(candidate)
 
-            #reorderd potential hubs after deletions
-            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
+            # Reorder potential hubs after deletions.
+            candidate_list = sorted(graph_copy.nodes,
+                                    key=lambda node: graph_copy.degree[node],
+                                    reverse=True)
 
         else:
 
-            return H
+            return hub_list
 
-    return H
+    return hub_list
 
 
-#Components algorithm from Véronis (2004), converts graph for target into a MST
-def components(graph, hubs, target_string):
+def components(graph, root_hub_list, target_string):
+    """Builds minimum spanning tree from graph and removes singletons.
 
-    G = deepcopy(graph)
-    H = hubs #root hubs
-    t = target_string
+    Applies components algorithm from Véronis (2004) and removes singletons.
 
-    #G.add_node(t)
-    #for h in H:
-        #G.add_edge(t,h,weight=0)
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of strings of root hubs of graph.
+        target_string: Root of minimum spanning tree.
 
-    T = nx.minimum_spanning_tree(G)
+    Returns:
+        minimum_spanning_tree: Minimum spanning tree with target as root and
+            root hubs as direct children. Singletons removed.
+    """
+
+    graph_copy = deepcopy(graph)
+
+    graph_copy.add_node(target_string)
+    for root_hub in root_hub_list:
+        graph_copy.add_edge(target_string, root_hub, weight=0)
+
+    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
 
-    #removes singletons
-    for node in deepcopy(T).nodes:
-        if len(T.adj[node]) == 0:
-            T.remove_node(node)
+    # Remove singletons, deepcopy for iteration while being altered.
+    for node in deepcopy(minimum_spanning_tree).nodes:
+        if len(minimum_spanning_tree.adj[node]) == 0:
+            minimum_spanning_tree.remove_node(node)
 
-    return T
+    return minimum_spanning_tree
 
 
-#Calculates score for a given path in a minimum spanning tree
-def score(graph, from_node, to_node):
+def score(graph, component, root_hub_list):
+    """Calculate score for a given component in a minimum spanning tree.
 
-    #if correct tree
-    if nx.has_path(graph, from_node, to_node):
+    First the correct root for the component is chosen. If no root hub is
+    suitable, an empty array is returned. A score is calculated from the
+    distance between the component and its root and returned as part of an
+    array otherwise filled with zeroes.
 
-        # calculates shortest path (approximation for path with lowest total weight)
-        path = nx.shortest_path(graph, from_node, to_node, 'weight')
-        total_weight = 0
+    Args:
+        graph: Minimum spanning tree.
+        component: Node (string) from which the distances are to be calculated.
+        root_hub_list: List of strings of root hubs (senses) of original graph.
 
-        #adds weights of every sub-path
-        for i in range(1, len(path)):
-            sub_from, sub_to = path[i-1], path[i]
-            total_weight += graph[sub_from][sub_to]['weight']
+    Returns:
+        score_array: Array with one score for the correct root hub, otherwise
+            filled with zeroes.
+    """
 
-        #the further the path, the lower the score
-        return 1/(1+total_weight)
+    root_hub_count = len(root_hub_list)
 
-    else:
-        return 0
+    # Initialise score array.
+    score_array = np.zeros(root_hub_count)
+
+    # Find root of component.
+    distance_list = list()
+    for root_hub in root_hub_list:
+        if nx.has_path(graph, component, root_hub):
+            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
+        else:
+            distance_list.append(0)
+
+    if sum(distance_list) == 0:
+        return score_array
+
+    root_idx = np.argmax(distance_list)
+    root = root_hub_list[root_idx]
+
+    shortest_path = nx.shortest_path(graph, component, root, 'weight')
+    total_weight = 0
+
+    # Add weights of every sub-path.
+    for i in range(1, len(shortest_path)):
+        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
+        total_weight += graph[sub_from][sub_to]['weight']
+
+    score_array[root_idx] = 1/(1+total_weight)
+
+    return score_array
 
 
-# Basically Word Sense Disambiguation, matches context to sense
-def disambiguate(mst, hubs, contexts, target_string):
+def disambiguate(minimum_spanning_tree, root_hub_list, context_list, target_string):
+    """Matches contexts to senses.
+
+    Adds up scores for each token in a context string and matches the context
+    to the root hub with the highest score.
+
+    Args:
+        minimum_spanning_tree: Minimum spanning tree with target as root.
+        root_hub_list: List of strings of root hubs (senses).
+        context_list: List of sentence strings that are to be clustered.
+        target_string: String of target word, also root of MST.
+
+    Returns:
+        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
+            as values.
+    """
 
     target_string = target_string.replace('_', ' ')
-    T = mst #minimum spanning tree
-    H = hubs #root hubs
-    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
+
+    context_list = [context.lower().strip().replace(target_string, '')
+                    for context in context_list]
 
     score_dict = dict() #memoisation for scores
-    mapping_dict = {topic:[] for topic in range(1, len(H)+1)} #output of function
+    mapping_dict = {topic:[] for topic in range(1, len(root_hub_list)+1)}
 
-    #if no sense is found for a target word, we should assume that there only is one sense
-    if len(H) == 0:
-        return {0:[i for i in range(1, len(C)+1)]}
+    # If no sense is found for a target word, assume there is only one sense.
+    if len(root_hub_list) == 0:
+        return {0:[i for i in range(1, len(context_list)+1)]}
 
     idx = 0
-    for c in C:
+    for context in context_list:
 
         idx += 1 #index based on position in list
 
-        doc = nlp(c) #parsed context
-        texts = [tok.text for tok in doc] #tokens
+        processed_context = nlp(context)
+        text_list = [token.text for token in processed_context] #tokens
 
-        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense
 
-        for text in texts:
+        for text in text_list:
 
-            if text in T.nodes: #if word wasn't filtered out
-                new_scores = list() #scores to be added to total scores
+            if text in minimum_spanning_tree.nodes: #if word wasn't filtered out
 
-                for h in H: #for each hub
-                    if (text, h) in score_dict: #memoisation
-                        new_scores.append(score_dict[(text,h)])
-                    else:
-                        new_score = score(T, text, h)
-                        new_scores.append(new_score)
-                        score_dict[(text,h)] = new_score #memoisation
+                if text in score_dict: #memoisation
+                    new_score = score_dict[text]
+                else:
+                    new_score = score(minimum_spanning_tree, text, root_hub_list)
+                    score_dict[text] = new_score #memoisation
 
-                scores = scores + np.array(new_scores)
+                score_array += new_score
 
             else:
                 pass
 
-        # if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-        if np.max(scores) == 0:
+        # If disambiguator does not detect a sense, return singleton.
+        if np.max(score_array) == 0:
             pass
 
         else:
-            #applies sense with the highest score to context
-            max_score = np.max(scores)
-            argmax_score = np.argmax(scores)
+            # Apply sense with the highest score to context.
+            max_score = np.max(score_array)
+            argmax_score = np.argmax(score_array)
 
-            #clusters begin at 1
+            # Clusters begin at 1.
             mapping_dict[argmax_score+1].append(idx)
 
     return mapping_dict
 
 
-# our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, results):
+def word_sense_induction(topic_id, topic_name, result_list):
 
     #buffer for useful information
     out_buffer = '\n'
 
-    #path for output(directory)
-    output_path = './test/' # config.output
+    output_path = config.output
 
     #removes trailing new_lines
     old_target_string = topic_name.strip() #original target
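(Since this commit activates the previously commented-out pseudo-root edges in components(), a small self-contained sketch may help: it mirrors the hub criterion, the MST step and the path-based score on a toy graph with made-up weights and assumed thresholds; the real values come from the module's config and corpus counts.

import networkx as nx
import numpy as np

# Toy cooccurrence graph; low weight = strong cooccurrence (illustrative values).
graph = nx.Graph()
graph.add_weighted_edges_from([
    ('glass', 'bottle', 0.2), ('glass', 'wine', 0.3), ('bottle', 'wine', 0.4),
    ('lens', 'camera', 0.2), ('lens', 'focus', 0.3), ('camera', 'focus', 0.5)])

min_neighbors, threshold = 2, 0.8   # assumed values, normally read from config

# root_hubs() criterion: a high-degree candidate is approved if the mean weight
# to its most frequent neighbors stays under the threshold (here the first two
# neighbors stand in for the frequency-sorted ones).
candidate = 'glass'
neighbors = list(graph.adj[candidate])[:min_neighbors]
print(np.mean([graph.edges[candidate, n]['weight'] for n in neighbors]) < threshold)

# components(): attach the target to every root hub with weight 0, then take the MST.
root_hub_list = ['glass', 'lens']
graph.add_node('target')
for root_hub in root_hub_list:
    graph.add_edge('target', root_hub, weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph)

# score()-style distance: the longer the weighted path to the root, the lower the score.
path = nx.shortest_path(minimum_spanning_tree, 'wine', 'glass', 'weight')
total_weight = sum(minimum_spanning_tree[path[i-1]][path[i]]['weight']
                   for i in range(1, len(path)))
print(1/(1+total_weight))   # ~0.77 for the toy weights above
)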
@@ -449,7 +525,7 @@ def word_sense_induction(topic_id, topic_name, results):
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
     print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
 
     #builds graph from these dictionaries, also applies multiple filters
     print('[a]', 'Building graph.\t('+old_target_string+')')
@@ -474,20 +550,20 @@ def word_sense_induction(topic_id, topic_name, results):
     T = components(G, H, target_string)
 
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
-    D = disambiguate(T, H, results[topic_id], target_string)
+    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
+    D = disambiguate(T, H, result_list[topic_id], target_string)
 
     out_buffer += ('[A] Mapping: \n')
-    for cluster,results in D.items():
-        out_buffer += ('{}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
+    for cluster,result_list in D.items():
+        out_buffer += ('{}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
 
     #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
     print(out_buffer)
 
     #writes clustering to file
-    for cluster,results in D.items():
-        for result in results:
+    for cluster,result_list in D.items():
+        for result in result_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
 
     f.close()
@@ -526,7 +602,7 @@ def read_dataset(data_path):
 def main():
 
-    # If absinth.py is run in test environment
+    # If absinth.py is run in test environment.
     if '-t' in sys.argv:
         data_path = config.test
     else:
@@ -534,7 +610,13 @@ def main():
     results, topics = read_dataset(data_path)
 
-    with Pool(2) as pool:
+    # Enables manual setting of process count.
+    if '-p' in sys.argv:
+        process_count = int(sys.argv[sys.argv.index('-p') + 1])
+    else:
+        process_count = 1
+
+    with Pool(process_count) as pool:
 
         parameter_list = [(topic_id, topic_name, results) for topic_id, topic_name in topics.items()]
         pool.starmap(word_sense_induction, parameter_list)
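(The new -p handling can be checked in isolation. A minimal sketch with an assumed argument vector; in the real script the values come from sys.argv, e.g. an invocation like python absinth.py -t -p 4:

# Simulated command line for illustration.
argv = ['absinth.py', '-t', '-p', '4']

if '-p' in argv:
    process_count = int(argv[argv.index('-p') + 1])
else:
    process_count = 1

print(process_count)   # 4
)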