Absinth - A Small World of Semantic Similarity

Commit 154ec5e8
authored 7 years ago by Victor Zimmermann

Clean up.

parent 9c8316f5
Changes: 3 changed files, with 0 additions and 226 deletions

  legacy/.gitkeep     0 additions, 0 deletions
  legacy/absinth.py   0 additions, 197 deletions
  legacy/graphs.py    0 additions, 29 deletions
legacy/.gitkeep  deleted  100644 → 0  (+0 −0)
legacy/absinth.py  deleted  100644 → 0  (+0 −197)
import os                        # for reading files
import sys
#from tqdm import tqdm           # for counting seconds
import spacy                     # for nlp
import networkx as nx            # for visualisation
import matplotlib.pyplot as plt  # for visualisation
import copy                      # for deepcopy
import numpy as np               # for calculations

nlp = spacy.load('en')  # standard english nlp


# wrapper class for nodes + functions on nodes
class Graph:

    # can be initialised with nodes
    def __init__(self, nodes={}):
        self.nodes = nodes

    # 'key in Graph' returns True if node with key exists in Graph
    def __contains__(self, key):
        return key in self.nodes.keys()

    # returns all nodes (not keys)
    def get_nodes(self):
        return self.nodes.values()

    # adds node or ups frequency of node if already in graph
    def add_node(self, key):
        if key in self:
            self.nodes[key].freq += 1
        else:
            self.nodes[key] = Node(key)

    # removes node (doesn't work)
    #def remove_node(self, key):
    #    del self.nodes[key]
    #    for node in self.nodes.values():
    #        node.remove_neighbor(key)

    # adds neighbor to node
    def add_edge(self, from_key, to_key):
        self.nodes[from_key].add_neighbor(self.nodes[to_key])

    # builds graph from corpus for target word with applied filters
    # filters: min_occurrences, min_cooccurrence, stop_words, allowed_tags, context_size, max_distance
    def build(self, corpus_path, word, filters):

        files = [corpus_path + '/' + f for f in os.listdir(corpus_path)]  # list of file paths (note that no other files should be in this directory)
        spaced_word = word.replace('_', ' ')  # input words are generally separated with underscores

        for f in files[:]:  # iterates over corpus

            with open(f, 'r') as source:

                try:  # some files fail to decode and would otherwise break the iteration

                    for line in source:

                        line = line.lower()

                        if spaced_word in line:  # greedy filter (no processing on most lines)

                            new_line = line.replace(spaced_word, word)
                            spacy_line = nlp(new_line)

                            if word in [token.text for token in spacy_line]:  # detailed filter on tokenised line

                                tokens = list()  # collects tokens

                                for token in spacy_line:

                                    text = token.text
                                    tag = token.tag_

                                    # if token is not a stop word and has right pos tag
                                    if text != word and text not in filters['stop_words'] and tag in filters['allowed_tags']:
                                        tokens.append(token.text)

                                # if paragraph is the right size after filters
                                if len(tokens) >= filters['context_size']:

                                    for key in set(tokens):
                                        self.add_node(key)

                                    for from_key, to_key in {(x, y) for x in tokens for y in tokens if x != y}:
                                        self.add_edge(from_key, to_key)

                except UnicodeDecodeError:
                    print('Failed to decode:', f)

        # removes tokens with too few occurrences
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filters['min_occurrences']}

        # removes unnecessary edges and pairs with too few cooccurrences
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filters['min_cooccurrence']
                              and key in self.nodes.keys()
                              and node.weight(self.nodes[key]) <= filters['max_distance']}

        # removes singletons
        self.nodes = {key: value for key, value in self.nodes.items()
                      if len(value.neighbors) > 0}

    # finds a path from one node to another
    # Variation on function from https://www.python-course.eu/graphs_python.php
    def find_path(self, start, end, path=None):

        if path == None:
            path = []

        path = path + [start]

        if start == end:
            return path

        if start not in self.nodes.keys():
            return None

        for neighbor in self.nodes[start].neighbors.keys():

            if neighbor not in path:

                print(path)
                extended_path = self.find_path(neighbor, end, path)

                if extended_path:
                    return extended_path

        return None

    # variation on algorithm from Véronis (2004)
    def root_hubs(self, min_neighbors=6, theshold=0.8):

        G = copy.deepcopy(self)
        V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq)  # -1 to sort descending (...3 -> 2 -> 1...)
        H = []

        while V:

            v = V[0]

            if len(v.neighbors) >= min_neighbors:

                mfn = sorted(v.neighbors.keys(), key=lambda key: v.neighbors[key])[:min_neighbors]  # mfn: most frequent neighbors

                if np.mean([v.weight(G.nodes[n]) for n in mfn]) < theshold:
                    H.append(v)

                G.nodes = {key: value for key, value in G.nodes.items()
                           if key != v.key and key not in v.neighbors.keys()}

                for node in G.nodes.values():
                    node.neighbors = {key: value for key, value in node.neighbors.items()
                                      if key in G.nodes.keys()}

                V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq)

            else:
                return H

        return H

    # presents nodes in format key --> (weight, neighbors)
    def view(self):
        for node in self.nodes.values():
            print(node.key, '-->', [(node.weight(self.nodes[key]), key) for key in node.neighbors.keys()])

    # draws graph using networkx
    def draw(self):

        G = nx.Graph()

        for node in self.nodes.values():
            G.add_node(node.key)
            G.add_edges_from([(node.key, y) for y in node.neighbors.keys()])

        nx.draw(G, with_labels=True)
        plt.show()


# class for single words with frequency and neighbors
class Node:

    # initialises node with key and frequency of 1
    def __init__(self, key):
        self.key = key
        self.freq = 1
        self.neighbors = dict()

    # adds neighbor to neighbors dict or ups its cooccurrence frequency
    def add_neighbor(self, other):
        if other.key in self.neighbors.keys():
            self.neighbors[other.key] += 1
        else:
            self.neighbors[other.key] = 1

    # removes neighbor from dictionary
    def remove_neighbor(self, key):
        del self.neighbors[key]

    # calculates weight between self and other node
    # if node is not neighbor, return 1 (no cooccurrence), (0 would be complete cooccurrence)
    def weight(self, other):
        if other.key in self.neighbors.keys():
            return 1 - max([self.neighbors[other.key] / other.freq, self.neighbors[other.key] / self.freq])
        else:
            return 1


# see Kruskal's algorithm
def minimum_spanning_tree(graph, target):
    pass


# Components algorithm from Véronis (2004), converts graph for target into a MST
def components(graph, target):
    pass


# Uses MST to disambiguate context, should ideally write to evaluator format
def disambiguation(mst, context):
    pass


if __name__ == '__main__':

    filters = {'min_occurrences': 10,
               'min_cooccurrence': 5,
               'stop_words': ['utc'],
               'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
               'context_size': 4,
               'max_distance': 0.9}

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    G = Graph()  # initialises graph
    G.build(corpus_path, sys.argv[1], filters)  # builds graph from corpus with target and filters

    for hub in G.root_hubs():
        print(hub.key, '-->', list(hub.neighbors.keys()), '\n')  # prints senses

    #G.view()
    #print(G.find_path('english', 'kennel'))
    G.draw()  # draws graph
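Note on the stubs: the three module-level functions (minimum_spanning_tree, components, disambiguation) were left as pass, and the first comment only points at Kruskal's algorithm. As an illustration of what that stub could grow into, the sketch below is a generic Kruskal over (weight, node, node) tuples with a union-find. The edges_from_graph helper, which derives those tuples from the Graph/Node classes above using Node.weight as the edge cost, is an assumption for illustration and is not part of the deleted file.

# Hypothetical sketch only -- not the implementation removed in this commit.
def edges_from_graph(graph):
    # assumed helper: turn the Graph/Node structures above into weighted edges,
    # using Node.weight (1 = no cooccurrence, 0 = complete cooccurrence) as the cost
    edges = set()
    for node in graph.get_nodes():
        for key in node.neighbors.keys():
            if key in graph:
                a, b = sorted([node.key, key])
                edges.add((graph.nodes[a].weight(graph.nodes[b]), a, b))
    return sorted(edges)

def kruskal_mst(edges):
    # textbook Kruskal: grow a forest by taking the cheapest edge that
    # connects two previously unconnected components (tracked via union-find)
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    mst = []
    for weight, a, b in sorted(edges):
        root_a, root_b = find(a), find(b)
        if root_a != root_b:  # edge joins two components, so it cannot close a cycle
            parent[root_a] = root_b
            mst.append((weight, a, b))
    return mst

In Véronis (2004) the target is typically attached to the root hubs before the MST is computed, so that every remaining node ends up in the component of exactly one hub; that step is what the empty components stub leaves open.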
legacy/graphs.py  deleted  100644 → 0  (+0 −29)
import os
import nltk

data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'

topics = open(data_path + '/topics.txt', 'r').readlines()[1:]
topics = [line.strip('\n').split('\t') for line in topics]

results = open(data_path + '/results.txt', 'r').readlines()[1:]
results = [line.strip('\n').split('\t') for line in results]


def get_paragraphs(word):

    files = [wiki_path + '/' + f for f in os.listdir(wiki_path)]
    paragraphs = list()
    space_word = word.replace('_', ' ')

    for f in files:

        with open(f, 'r') as source:

            for line in source:

                line = line.lower()

                if space_word in line:

                    new_line = line.replace(space_word, word)
                    tokens = nltk.word_tokenize(new_line)

                    if word in tokens:
                        paragraphs.append(tokens)
                        print(tokens)

    return paragraphs


#for topic in topics:
print(get_paragraphs('the_block'))
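One difference from Graph.build in legacy/absinth.py above: get_paragraphs reads every file in wiki_path without catching UnicodeDecodeError, so a single undecodable dump file aborts the whole scan. A minimal sketch of the same guard applied here, reusing the os and nltk imports above; the name get_paragraphs_safe is made up for illustration and was not in the deleted file.

# Hypothetical variant, not part of the deleted file.
def get_paragraphs_safe(word, wiki_path):
    # same logic as get_paragraphs above, but skips files that fail to decode
    files = [wiki_path + '/' + f for f in os.listdir(wiki_path)]
    paragraphs = list()
    space_word = word.replace('_', ' ')
    for f in files:
        try:
            with open(f, 'r') as source:
                for line in source:
                    line = line.lower()
                    if space_word in line:
                        new_line = line.replace(space_word, word)
                        tokens = nltk.word_tokenize(new_line)
                        if word in tokens:
                            paragraphs.append(tokens)
        except UnicodeDecodeError:
            print('Failed to decode:', f)  # mirror the handling in legacy/absinth.py
    return paragraphs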