Commit 909c4e12 authored by Victor Zimmermann

Tried some object-oriented stuff. Also added a recursive path finder function.

parent deb3f84a
import os
import spacy
import networkx as nx
import matplotlib.pyplot as plt
nlp = spacy.load('en')


class Graph:

    def __init__(self, nodes=None):
        # Avoid a mutable default argument, which would be shared across
        # all Graph instances.
        self.nodes = nodes if nodes is not None else {}

    def __contains__(self, key):
        return key in self.nodes

    def get_nodes(self):
        return self.nodes.values()

    def add_node(self, token):
        if token in self:
            self.nodes[token].freq += 1
        else:
            self.nodes[token] = Node(token)

    def remove_node(self, token):
        del self.nodes[token]
        # Drop the removed token from every remaining node's neighbor dict.
        for node in self.nodes.values():
            node.remove_neighbor(token)

    def add_edge(self, from_token, to_token):
        self.nodes[from_token].add_neighbor(self.nodes[to_token])
    def build(self, corpus_path, word, filter_dict):
        files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path)]
        spaced_word = word.replace('_', ' ')  # input words are separated with underscores

        for f in files:
            with open(f, 'r') as source:
                try:
                    for line in source:
                        line = line.lower()
                        if spaced_word in line:
                            # Re-join multi-word targets so spaCy treats them
                            # as a single token.
                            new_line = line.replace(spaced_word, word)
                            spacy_line = nlp(new_line)

                            if word in [token.text for token in spacy_line]:
                                tokens = list()
                                for token in spacy_line:
                                    text = token.text
                                    tag = token.tag_
                                    if (text != word
                                            and text not in filter_dict['stop_words']
                                            and tag in filter_dict['allowed_tags']):
                                        tokens.append(text)

                                if len(tokens) >= filter_dict['context_size']:
                                    for node in tokens:
                                        self.add_node(node)
                                    # Connect every pair of context words,
                                    # skipping self-loops.
                                    for from_token, to_token in [(x, y) for x in tokens for y in tokens if x != y]:
                                        self.add_edge(from_token, to_token)

                except UnicodeDecodeError:
                    print('Failed to decode:', f)

        # Prune rare words and weak co-occurrence edges.
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filter_dict['min_occurrences']}
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filter_dict['min_cooccurrence']}
    def find_path(self, start, end, path=None):
        """Recursive depth-first search; returns the first path found."""
        if path is None:
            path = []
        path = path + [start]

        if start == end:
            return path
        if start not in self.nodes:
            return None

        for neighbor in self.nodes[start].neighbors:
            if neighbor not in path:
                extended_path = self.find_path(neighbor, end, path)
                if extended_path:
                    return extended_path

        return None
    def view(self):
        for node in self.nodes.values():
            print(node.token, '-->', list(node.neighbors.keys()))

    def draw(self):
        G = nx.Graph()
        for node in self.nodes.values():
            G.add_node(node.token)
            G.add_edges_from([(node.token, y) for y in node.neighbors.keys()])
        nx.draw(G, with_labels=True)
        plt.show()


class Node:

    def __init__(self, token):
        self.token = token
        self.freq = 1
        self.neighbors = dict()

    def add_neighbor(self, other):
        token = other.token
        if token in self.neighbors:
            self.neighbors[token] += 1
        else:
            self.neighbors[token] = 1

    def remove_neighbor(self, token):
        # pop() so that removing a token that is not a neighbor is a no-op
        # instead of a KeyError.
        self.neighbors.pop(token, None)

    def weight(self, other):
        # Co-occurrence distance in [0, 1]: 1 minus the larger of the two
        # relative co-occurrence frequencies. Node has no 'count' attribute;
        # 'freq' is the occurrence counter.
        return 1 - max(self.neighbors[other.token] / other.freq,
                       other.neighbors[self.token] / self.freq)
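

# A minimal usage sketch for the recursive path finder on a small
# hand-built graph (illustrative only; the real graph is built from the
# corpus via Graph.build above):
#
#     g = Graph()
#     for token in ['english', 'dog', 'kennel']:
#         g.add_node(token)
#     g.add_edge('english', 'dog')
#     g.add_edge('dog', 'kennel')
#     g.find_path('english', 'kennel')  # -> ['english', 'dog', 'kennel']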

if __name__ == '__main__':

    """
    Filter:
     min_occurrences: minimum occurrences of the given word
     min_cooccurrence: minimum co-occurrences for word pairs
     stop_words: list of stop words
     allowed_tags: POS tags of words that are kept
     context_size: minimum number of context words per paragraph after filtering
     max_distance: maximum edge weight (defined here, not used yet)
    """
    filter_dict = {'min_occurrences': 10,
                   'min_cooccurrence': 5,
                   'stop_words': [],
                   'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
                   'context_size': 4,
                   'max_distance': 0.9}

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    #corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    G = Graph()
    G.build(corpus_path, 'dog', filter_dict)
    G.view()
    print(G.find_path('english', 'kennel'))
    G.draw()
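
    # 'max_distance' is defined in filter_dict but not used by build() yet.
    # A hypothetical sketch of how Node.weight could enforce it (build()
    # adds every edge in both directions, so both neighbor lookups in
    # weight() succeed):
    #
    #     for a in G.get_nodes():
    #         for b in G.get_nodes():
    #             if a is not b and b.token in a.neighbors:
    #                 if a.weight(b) > filter_dict['max_distance']:
    #                     a.remove_neighbor(b.token)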