Victor Zimmermann / Absinth - A Small World of Semantic Similarity / Commits

Commit 6ee3d178, authored 7 years ago by Victor Zimmermann

Better Statistics, bugfixes.

parent 72463d07
Showing 2 changed files with 156 additions and 117 deletions:

    src/absinth.py  +155 −116
    src/config.py   +1 −1
src/absinth.py  +155 −116  (view file @ 6ee3d178)
@@ -32,9 +32,11 @@ import os # for reading files
 import pprint
 import re
 import spacy # for nlp
 import time
-from multiprocessing import Pool
+from copy import deepcopy
+from multiprocessing import Pool
+from scipy import stats
 
 nlp = spacy.load('en') # standard english nlp
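The new scipy import backs the harmonic-mean cluster statistic introduced further down in this commit. A standalone sketch (not repository code, values made up) of why both means get reported:

    import numpy as np
    from scipy import stats

    cluster_lengths = [12, 3, 45, 7]  # hypothetical cluster sizes

    # the arithmetic mean is pulled up by a few large clusters,
    # while the harmonic mean is dominated by the small ones
    print(np.mean(cluster_lengths))      # 16.75
    print(stats.hmean(cluster_lengths))  # ~6.88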
@@ -499,53 +501,46 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
     stat_dict = dict()
 
-    if topic_name in [output_file_name.replace('.absinth', '')
-                      for output_file_name in os.listdir(config.output)]:
-        return None
-    else:
-        stat_dict['target'] = topic_name
+    stat_dict['target'] = topic_name
 
-        #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-        if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
-            target_string = topic_name[4:]
-        else:
-            target_string = topic_name
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
+        target_string = topic_name[4:]
+    else:
+        target_string = topic_name
 
-        print('[a]', 'Counting nodes and edges.\t(' + topic_name + ')')
-        node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
+    print('[a]', 'Counting nodes and edges.\t(' + topic_name + ')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
 
-        #builds graph from these dictionaries, also applies multiple filters
-        print('[a]', 'Building graph.\t(' + topic_name + ')')
-        graph = build_graph(node_freq_dict, edge_freq_dict)
+    #builds graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t(' + topic_name + ')')
+    graph = build_graph(node_freq_dict, edge_freq_dict)
 
-        stat_dict['node count'] = len(graph.nodes)
-        stat_dict['edge count'] = len(graph.edges)
+    stat_dict['node count'] = len(graph.nodes)
+    stat_dict['edge count'] = len(graph.edges)
 
-        #finds root hubs (senses) within the graph + more filters for these
-        print('[a]', 'Collecting root hubs.\t(' + topic_name + ')')
-        root_hub_list = root_hubs(graph, edge_freq_dict)
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t(' + topic_name + ')')
+    root_hub_list = root_hubs(graph, edge_freq_dict)
 
-        #adds sense inventory to buffer with some common neighbors for context
-        stat_dict['hubs'] = dict()
-        for root_hub in root_hub_list:
+    #adds sense inventory to buffer with some common neighbors for context
+    stat_dict['hubs'] = dict()
+    for root_hub in root_hub_list:
 
-            by_frequency = lambda node: edge_freq_dict[root_hub, node] \
-                                        if root_hub < node \
-                                        else edge_freq_dict[node, root_hub]
-            most_frequent_neighbor_list = sorted(graph.adj[root_hub],
-                                                 key=by_frequency, reverse=True)
+        by_frequency = lambda node: edge_freq_dict[root_hub, node] \
+                                    if root_hub < node \
+                                    else edge_freq_dict[node, root_hub]
+        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
+                                             key=by_frequency, reverse=True)
 
-            stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
+        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
 
-        return graph, root_hub_list, stat_dict
+    return graph, root_hub_list, stat_dict
 
 
 def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
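The by_frequency helper above relies on edge_freq_dict storing each undirected edge exactly once, keyed with the smaller node name first; the root_hub < node check picks the right key orientation. A standalone sketch with hypothetical words and counts:

    edge_freq_dict = {
        ('apple', 'pie'): 9,   # each undirected edge stored once, smaller name first
        ('apple', 'tree'): 7,
        ('pie', 'tree'): 2,
    }

    def neighbour_ranker(root_hub):
        # same idiom as the lambda in induce(): flip the key when the hub
        # sorts after the neighbour
        return lambda node: edge_freq_dict[root_hub, node] \
                            if root_hub < node \
                            else edge_freq_dict[node, root_hub]

    print(sorted(['apple', 'tree'], key=neighbour_ranker('pie'), reverse=True))
    # ['apple', 'tree'] -- ('apple', 'pie') counts 9, ('pie', 'tree') counts 2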
@@ -692,9 +687,10 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
                                             coloured_graph[sub_from][sub_to]['weight']
-                        score[root_hub_idx] += (1 / (1 + total_weight)) \
-                            * colour_graph.node[text]['dist'][root_hub_idx]
+                        score[root_hub_idx] += (1 / (1 + total_weight)) \
+                            * coloured_graph.node[text]['dist'][root_hub_idx]
+                    else:
+                        pass
                 else:
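The one-word change above is the bugfix: colour_graph is the name of a function defined earlier in the file (see the end of the previous hunk), so colour_graph.node[...] referenced the function object rather than the locally coloured graph and would fail at runtime. For intuition, a standalone sketch of the weighting formula with hypothetical numbers:

    total_weight = 3.0       # hypothetical summed edge weight along the path
    dist = [0.8, 0.1, 0.1]   # hypothetical confidence over the root hubs
    root_hub_idx = 0

    # heavier (longer) paths and lower confidence both shrink the contribution
    contribution = (1 / (1 + total_weight)) * dist[root_hub_idx]
    print(contribution)  # 0.2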
@@ -799,6 +795,34 @@ def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
     return mapping_dict
 
 
+def print_stats(stat_dict: dict) -> None:
+    """
+    Prints various statistics and logs them to file.
+
+    Args:
+        stat_dict: Dictionary with various statistics.
+    """
+
+    stat_string = []
+    ts = time.gmtime()
+
+    stat_string.append('[A] Topic:\t{}.'.format(stat_dict['target']))
+    stat_string.append('[A] Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),
+                                                        time.strftime("%H:%M:%S", ts)))
+    stat_string.append('[A] Nodes: {}\tEdges: {}.'.format(stat_dict['node count'],
+                                                          stat_dict['edge count']))
+    stat_string.append('[A] Mean cluster length (harmonic):\t{}.'.format(stat_dict['hmean_cluster_length']))
+    stat_string.append('[A] Mean cluster length (arithmetic):\t{}.'.format(stat_dict['mean_cluster_length']))
+    stat_string.append('[A] Number of clusters: {}.'.format(stat_dict['cluster_count']))
+    stat_string.append('[A] Tuples gained through merging: {}.'.format(stat_dict['merge_gain']))
+    stat_string.append('[A] Sense inventory:')
+    for hub in stat_dict['hubs'].keys():
+        stat_string.append('[A] {}:\t{}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
+
+    with open('stats.txt', 'a') as stat_file:
+        stat_file.write('\n'.join(stat_string) + '\n\n')
+
+    print('\n' + '\n'.join(stat_string) + '\n')
+
+
 def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """
     Calls induction and disambiguation functions, performs main task.
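For reference, a minimal stat_dict that satisfies the new print_stats; the key names are taken from the diff above, the values are hypothetical:

    stat_dict = {
        'target': 'the_block',
        'node count': 2215,
        'edge count': 4537,
        'hmean_cluster_length': 6.9,
        'mean_cluster_length': 16.8,
        'cluster_count': 4,
        'merge_gain': 12,
        'hubs': {'city': ['street', 'house', 'park']},
    }
    print_stats(stat_dict)  # appends the report to stats.txt and echoes it to stdout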
@@ -815,92 +839,101 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """
 
+    if topic_name in [output_file_name.replace('.absinth', '')
+                      for output_file_name in os.listdir(config.output)]:
+        return None
+
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
     graph, root_hub_list, stat_dict = induce(topic_name, result_dict[topic_id])
 
     colour_rank = config.colour_rank
     mst_rank = config.mst_rank
 
     #Merges Mappings according to pipeline
     mapping_dict = dict()
 
     #matches senses to clusters
     print('[a]', 'Disambiguating results.\t(' + topic_name + ')')
     if colour_rank != 0:
         print('[a]', 'Colouring graph.\t(' + topic_name + ')')
         mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
                                                         result_dict[topic_id])
     if mst_rank != 0:
         print('[a]', 'Building minimum spanning tree.\t(' + topic_name + ')')
         mapping_dict[mst_rank] = disambiguate_mst(graph, root_hub_list,
                                                   result_dict[topic_id], topic_name)
 
     mapping_list = [item[1] for item in sorted(mapping_dict.items())]
     mapping_count = len(mapping_list)
 
     merged_mapping_dict = mapping_list[0]
     merged_entry_count = 0
 
     for i in range(1, mapping_count):
 
         result_list = [result for result_list in merged_mapping_dict.values()
                        for result in result_list]
         #individual mappings
         relation_list = [(topic, result) for topic in mapping_list[i].keys()
                          for result in mapping_list[i][topic]]
 
         for topic, result in relation_list:
-            if topic in merged_mapping_dict:
-                merged_mapping_dict[topic].append(result)
-            else:
-                merged_mapping_dict[topic] = [result]
-            if result not in result_list:
-                merged_entry_count += 1
+            if result not in result_list:
+                merged_entry_count += 1
+                if topic in merged_mapping_dict:
+                    merged_mapping_dict[topic].append(result)
+                else:
+                    merged_mapping_dict[topic] = [result]
 
     stat_dict['merge_gain'] = merged_entry_count
 
     #collect statistics from result.
     cluster_count = 0
     cluster_length_list = list()
 
     for cluster, result_list in merged_mapping_dict.items():
 
         cluster_length = len(result_list)
         if cluster_length != 0:
             cluster_count += 1
             cluster_length_list.append(cluster_length)
 
     stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
+    stat_dict['hmean_cluster_length'] = stats.hmean(cluster_length_list)
     stat_dict['cluster_count'] = cluster_count
 
     print('[a]', 'Writing to file.\t(' + topic_name + ')')
 
     output_path = config.output
     output_file_name = output_path + topic_name + '.absinth'
 
     with open(output_file_name, 'w') as output_file:
 
         output_file.write('subTopicID\tresultID\n')
 
         for cluster_id, result_list in merged_mapping_dict.items():
             for result_id in result_list:
                 output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
                                                       topic_id, result_id)
                 output_file.write(output_line)
 
-    pprint.pprint(stat_dict)
+    print_stats(stat_dict)
 
 
 if __name__ == '__main__':
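The merge loop above combines the per-method mappings in rank order and, after this commit, only inserts a result that no higher-ranked method has mapped yet, counting each insertion as merge gain. A standalone sketch with two hypothetical mappings:

    ranked = {1: {'sense_a': [1, 2]},                   # rank 1 (hypothetical)
              2: {'sense_a': [2, 3], 'sense_b': [4]}}   # rank 2 (hypothetical)

    mapping_list = [item[1] for item in sorted(ranked.items())]
    merged, gained = mapping_list[0], 0

    for other in mapping_list[1:]:
        seen = [r for rs in merged.values() for r in rs]
        for topic, results in other.items():
            for result in results:
                if result not in seen:
                    gained += 1
                    merged.setdefault(topic, []).append(result)

    print(merged)  # {'sense_a': [1, 2, 3], 'sense_b': [4]}
    print(gained)  # 2 -- result 2 was already mapped and is not counted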
@@ -921,11 +954,17 @@ if __name__ == '__main__':
 
     # Enables manual setting of process count.
     if '-p' in sys.argv:
         process_count = int(sys.argv[sys.argv.index('-p') + 1])
+    else:
+        process_count = 1
+
+    for topic_id, topic_name in sorted(topic_dict.items()):
+        main(topic_id, topic_name, result_dict)
 
     with Pool(process_count) as pool:
         parameter_list = [(topic_id, topic_name, result_dict)
                           for topic_id, topic_name in topic_dict.items()]
         pool.starmap(main, sorted(parameter_list)) #determineate function
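As reconstructed above, both the sequential loop and the Pool pass call main(); the early-exit guard added to main() in this commit makes any later pass skip topics that already have a <topic_name>.absinth file in config.output, so an interrupted run can be restarted without recomputing. A hypothetical standalone version of that check:

    import os

    def already_processed(topic_name: str, output_dir: str) -> bool:
        # mirrors the guard in main(): a finished topic has written
        # '<topic_name>.absinth' into the output directory
        finished = [f.replace('.absinth', '') for f in os.listdir(output_dir)]
        return topic_name in finished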
src/config.py  +1 −1  (view file @ 6ee3d178)
@@ -20,7 +20,7 @@ Methods labeled with 0 are ignored.
 At least one method must be given a value != 0.
 '''
 resolve_conflicts = False #not yet implemented
-mst_rank = 0
+mst_rank = 2
 colour_rank = 1
 '''
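With mst_rank = 2 and colour_rank = 1 both disambiguation methods now run: main() sorts mapping_dict by rank, seeds the merged mapping with the rank-1 colour results, and lets the rank-2 MST results fill in only what is still unmapped. A sketch of how the ranks order the pipeline (merge semantics as in the absinth.py diff above):

    colour_rank, mst_rank = 1, 2   # values from config.py after this commit

    mapping_dict = {}
    if colour_rank != 0:
        mapping_dict[colour_rank] = 'colour mapping'  # placeholder for a real dict
    if mst_rank != 0:
        mapping_dict[mst_rank] = 'mst mapping'

    # lower rank = higher precedence: the colour mapping seeds the merge
    print([m for _, m in sorted(mapping_dict.items())])
    # ['colour mapping', 'mst mapping']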