Absinth - A Small World of Semantic Similarity / Commits

Commit b5581eb6
Authored 7 years ago by Victor Zimmermann
Redone output, shuffle corpus, disambiguation output is now a dict
Parent: 642789bc
Showing 1 changed file: src/absinth.py (64 additions, 57 deletions)
@@ -8,13 +8,15 @@ import numpy as np # for calculations
 import config
 import spacy # for nlp
 from multiprocessing import Pool
+import random

 nlp = spacy.load('en') # standard english nlp

 #counts occurences of nodes and cooccurrences
 def frequencies(corpus_path, target):

+    random.seed(1)
+
     stop_words = set(stopwords.words('english') + config.stop_words)
     allowed_tags = config.allowed_tags
     min_context_size = config.min_context_size
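Note on the hunk above: importing random and seeding it with a constant makes the corpus shuffle introduced further down reproducible across runs. A minimal standalone sketch of that behaviour (the file names here are hypothetical, not taken from the corpus):

    import random

    files = ['doc_{}.txt'.format(i) for i in range(5)]   # stand-in for os.listdir(corpus_path)

    random.seed(1)          # fixed seed: the module-level RNG yields the same sequence every run
    random.shuffle(files)   # in-place shuffle of the processing order
    print(files)            # identical ordering on every execution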
@@ -24,13 +26,15 @@ def frequencies(corpus_path, target):
     node_freq = dict() #counts (potential) nodes
     edge_freq = dict() #counts (potential) edges

-    files = [corpus_path+f for f in os.listdir(corpus_path)] #file names of corpus files
     s_target = target.replace('_', ' ') #target word with spaces

+    files = [corpus_path+f for f in os.listdir(corpus_path)] #file names of corpus files
+    random.shuffle(files)
+
     i = 0 #for update print statements
     for f in files:

-        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
+        if i % int(len(files)/11) == 0: #prints update after every 10th of the corpus is parsed

             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
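For reference, a rough sketch of how the modulo guard above spaces the progress messages; it uses a toy list of 100 entries rather than real corpus files:

    files = ['file_{}'.format(i) for i in range(100)]   # hypothetical corpus listing
    step = int(len(files) / 11)                         # was len(files) / 10 before this commit

    for i, f in enumerate(files):
        if i % step == 0:                               # fires roughly every tenth of the corpus
            print('[a] processed {} of {} files'.format(i, len(files)))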
@@ -41,7 +45,7 @@ def frequencies(corpus_path, target):
             #uses the ratio closest to 100%.
             percentage = int((max(ratios))*100)

-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')

         #checks maximum node values
         if len(node_freq) > max_nodes:
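A small illustration of how the changed print statement alters the progress line; the values are toy numbers, not output from a real run:

    percentage, node_count, edge_count, target = 40, 1200, 5400, 'dog'

    # before: the target was passed to print() as a separate argument
    print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, node_count, edge_count), target)

    # after: the target is appended to the message itself, wrapped in parentheses
    print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, node_count, edge_count) + '\t(' + target + ')')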
@@ -239,63 +243,61 @@ def disambiguate(mst, hubs, contexts, target=""):
     C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts

     score_dict = dict() #memoisation for scores
-    result = list() #output of function
+    mapping_dict = {topic:[] for topic in range(1, len(H)+1)} #output of function
+
+    #if no sense is found for a target word, we should assume that there only is one sense
+    if len(H) == 0:
+        return {0:[i for i in range(1, len(C)+1)]}

     for c in C:

         idx = C.index(c) + 1 #index based on position in list

         doc = nlp(c) #parsed context
         texts = [tok.text for tok in doc] #tokens

-        #if no sense is found for a target word, we should assume that there only is one sense
         scores = np.zeros(len(H)) #initialise with zeros for every sense
-        if len(H) == 0:
-            result.append((1, idx, 0))
-        else:

-            for text in texts:
+        for text in texts:

-                if text in T.nodes: #if word wasn't filtered out
+            if text in T.nodes: #if word wasn't filtered out

-                    new_scores = list() #scores to be added to total scores
+                new_scores = list() #scores to be added to total scores

-                    for h in H: #for each hub
+                for h in H: #for each hub

-                        if (text, h) in score_dict: #memoisation
+                    if (text, h) in score_dict: #memoisation

-                            new_scores.append(score_dict[(text,h)])
+                        new_scores.append(score_dict[(text,h)])

-                        else:
+                    else:

-                            new_score = score(T, text, h)
-                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_score #memoisation
+                        new_score = score(T, text, h)
+                        new_scores.append(new_score)
+                        score_dict[(text,h)] = new_score #memoisation

-                    scores = scores + np.array(new_scores)
+                scores = scores + np.array(new_scores)
+
+            else:
+                pass

-            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-            if np.max(scores) == 0:
+        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+        if np.max(scores) == 0:

-                pass
+            pass

-            #applies sense with the highest score to context
-            else:
+        #applies sense with the highest score to context
+        else:

-                max_score = np.max(scores)
-                argmax_score = np.argmax(scores)
+            max_score = np.max(scores)
+            argmax_score = np.argmax(scores)

-                #clusters begin at 1
-                result.append((argmax_score + 1, idx))
+            #clusters begin at 1
+            mapping_dict[argmax_score + 1].append(idx)

-    return result
+    return mapping_dict

 # our main function, here the main stepps for word sense induction are called
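As the commit message says, disambiguate() now returns a dict instead of a list of tuples: each key is a sense (cluster) number starting at 1, each value is the list of context indices assigned to that sense. A short sketch of the new shape, with made-up indices:

    # made-up example of the returned mapping: sense id -> context indices
    mapping = {1: [2, 5, 9], 2: [1, 3], 3: []}

    # with no root hubs, the early return labels every context with pseudo-sense 0
    number_of_contexts = 10
    fallback = {0: [i for i in range(1, number_of_contexts + 1)]}

    for cluster, results in mapping.items():
        print('{}. : {}'.format(cluster, ', '.join(str(r) for r in results)))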
@@ -326,16 +328,16 @@ def WSI(topic_id, topic_name, results):
     f.write('subTopicID\tresultID\n')

     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.', old_target)
+    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))

     #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.', old_target)
+    print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)

     #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.', old_target)
+    print('[a]', 'Collecting root hubs.\t('+old_target+')')
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
@@ -344,26 +346,29 @@ def WSI(topic_id, topic_name, results):
     for h in H:

         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x,h], reverse=True)[:6]
-        out_buffer += ('{}. {}: {}\n'.format(i, h, mfn))
+        out_buffer += ('{}. {}: {}\n'.format(i, h, ', '.join(mfn)))
         i += 1

     #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.', old_target)
+    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
     T = components(G, H, target)

     #matches senses to clusters
-    print('[a]', 'Disambiguating results.', old_target)
+    print('[a]', 'Disambiguating results.\t('+old_target+')')
     D = disambiguate(T, H, results[topic_id], target)
-    out_buffer += ('[A] Mapping: '+str(D)+'\n')
+    out_buffer += ('[A] Mapping: \n')
+    for cluster, results in D.items():
+        out_buffer += ('{}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))

     #prints buffer
-    print('[a]', 'Writing to file.', old_target)
+    print('[a]', 'Writing to file.\t('+old_target+')')
     print(out_buffer)

     #writes clustering to file
-    for d in D:
-
-        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    for cluster, results in D.items():
+        for result in results:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')

     f.close()
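The rewritten output loop walks the new dict and emits one subTopicID/resultID row per assigned context. A self-contained sketch with hypothetical ids and file name:

    topic_id = '1'
    D = {1: [2, 5], 2: [1]}                      # hypothetical disambiguation result

    with open('example.key', 'w') as f:          # hypothetical output file
        f.write('subTopicID\tresultID\n')
        for cluster, results in D.items():
            for result in results:
                f.write(topic_id + '.' + str(cluster) + '\t' + topic_id + '.' + str(result) + '\n')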
@@ -394,13 +399,15 @@ if __name__ == '__main__':
     # topics.txt is a list of target words
     topics = dict()
+    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]

     with open(data_path+'topics.txt', 'r') as topics_file:

         for line in topics_file.readlines()[1:]:

             l = line.split('\t')
-            topics[l[0]] = l[1]
+
+            if l[1].strip() not in processed_topics:
+                topics[l[0]] = l[1]

     # multiprocessing
     with Pool(4) as pool:
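The new processed_topics check lets the script resume: topics whose .absinth output file already exists are skipped. A standalone sketch of that filter, using stand-in data instead of os.listdir(config.output) and topics.txt:

    existing_outputs = ['dog.absinth', 'java.absinth']        # stand-in for os.listdir(config.output)
    processed_topics = [f.replace('.absinth', '') for f in existing_outputs]

    lines = ['1\tdog\n', '2\tcat\n', '3\tjava\n']             # stand-in for topics.txt rows
    topics = dict()
    for line in lines:
        l = line.split('\t')
        if l[1].strip() not in processed_topics:
            topics[l[0]] = l[1]

    print(topics)   # only topics still to be processed, here {'2': 'cat\n'}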