Commit 154ec5e8 authored by Victor Zimmermann

Clean up.

parent 9c8316f5
import os # for reading files
import sys
#from tqdm import tqdm # for counting seconds
import spacy # for nlp
import networkx as nx # for visualisation
import matplotlib.pyplot as plt # for visualisation
import copy # for deepcopy
import numpy as np # for calculations
nlp = spacy.load('en') # standard english nlp
# wrapper class for nodes + functions on nodes
class Graph:

    # can be initialised with nodes
    def __init__(self, nodes=None):
        # avoid a shared mutable default argument; each instance gets a fresh dict
        self.nodes = nodes if nodes is not None else {}

    # 'key in Graph' returns True if a node with this key exists in the Graph
    def __contains__(self, key):
        return key in self.nodes

    # returns all nodes (not keys)
    def get_nodes(self):
        return self.nodes.values()

    # adds node or ups frequency of node if already in graph
    def add_node(self, key):
        if key in self:
            self.nodes[key].freq += 1
        else:
            self.nodes[key] = Node(key)

    # removes node (doesn't work)
    #def remove_node(self, key):
    #    del self.nodes[key]
    #    for node in self.nodes.values():
    #        node.remove_neighbor(key)

    # adds neighbor to node
    def add_edge(self, from_key, to_key):
        self.nodes[from_key].add_neighbor(self.nodes[to_key])
    # builds graph from corpus for target word with applied filters
    # filters: min_occurrences, min_cooccurrence, stop_words, allowed_tags, context_size, max_distance
    def build(self, corpus_path, word, filters):
        files = [corpus_path+'/'+f for f in os.listdir(corpus_path)] # list of file paths (no other files should be in this directory)
        spaced_word = word.replace('_', ' ') # input words are generally separated with underscores
        for f in files[:]: # iterates over corpus
            with open(f, 'r') as source:
                try: # some files raise decoding errors that would otherwise abort the iteration
                    for line in source:
                        line = line.lower()
                        if spaced_word in line: # greedy filter (no processing on most lines)
                            new_line = line.replace(spaced_word, word)
                            spacy_line = nlp(new_line)
                            if word in [token.text for token in spacy_line]: # detailed filter on the tokenised line
                                tokens = list() # collects tokens
                                for token in spacy_line:
                                    text = token.text
                                    tag = token.tag_
                                    # if token is not the target, not a stop word and has the right pos tag
                                    if text != word and text not in filters['stop_words'] and tag in filters['allowed_tags']:
                                        tokens.append(token.text)
                                # if the paragraph is the right size after filtering
                                if len(tokens) >= filters['context_size']:
                                    for key in set(tokens):
                                        self.add_node(key)
                                    for from_key, to_key in {(x, y) for x in tokens for y in tokens if x != y}:
                                        self.add_edge(from_key, to_key)
                except UnicodeDecodeError:
                    print('Failed to decode:', f)

        # removes tokens with too few occurrences
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filters['min_occurrences']}

        # removes unnecessary edges and pairs with too few cooccurrences
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filters['min_cooccurrence']
                              and key in self.nodes
                              and node.weight(self.nodes[key]) <= filters['max_distance']}

        # removes singletons
        self.nodes = {key: value for key, value in self.nodes.items()
                      if len(value.neighbors) > 0}
    # finds a path from one node to another
    # variation on a function from https://www.python-course.eu/graphs_python.php
    def find_path(self, start, end, path=None):
        if path is None:
            path = []
        path = path + [start]
        if start == end:
            return path
        if start not in self.nodes:
            return None
        for neighbor in self.nodes[start].neighbors.keys():
            if neighbor not in path:
                extended_path = self.find_path(neighbor, end, path)
                if extended_path:
                    return extended_path
        return None
    # variation on the root hub algorithm from Véronis (2004)
    def root_hubs(self, min_neighbors=6, threshold=0.8):
        G = copy.deepcopy(self)
        V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq) # -1 to sort by descending frequency
        H = [] # collects root hubs

        while V:
            v = V[0]
            if len(v.neighbors) >= min_neighbors:
                # mfn: most frequent neighbors of the candidate hub
                mfn = sorted(v.neighbors.keys(), key=lambda key: v.neighbors[key], reverse=True)[:min_neighbors]
                if np.mean([v.weight(G.nodes[n]) for n in mfn]) < threshold:
                    H.append(v)
                # removes the candidate and its neighborhood from the working copy
                G.nodes = {key: value for key, value in G.nodes.items()
                           if key != v.key and key not in v.neighbors}
                for node in G.nodes.values():
                    node.neighbors = {key: value for key, value in node.neighbors.items() if key in G.nodes}
                V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq)
            else:
                return H

        return H
    # presents nodes in format key --> (weight, neighbors)
    def view(self):
        for node in self.nodes.values():
            print(node.key, '-->', [(node.weight(self.nodes[key]), key) for key in node.neighbors.keys()])

    # draws graph using networkx
    def draw(self):
        G = nx.Graph()
        for node in self.nodes.values():
            G.add_node(node.key)
            G.add_edges_from([(node.key, y) for y in node.neighbors.keys()])
        nx.draw(G, with_labels=True)
        plt.show()
# class for single words with frequency and neighbors
class Node:

    # initialises node with key and a frequency of 1
    def __init__(self, key):
        self.key = key
        self.freq = 1
        self.neighbors = dict()

    # adds neighbor to the neighbors dict or ups its cooccurrence frequency
    def add_neighbor(self, other):
        if other.key in self.neighbors:
            self.neighbors[other.key] += 1
        else:
            self.neighbors[other.key] = 1

    # removes neighbor from dictionary
    def remove_neighbor(self, key):
        del self.neighbors[key]

    # calculates weight between self and other node
    # if the node is not a neighbor, returns 1 (no cooccurrence); 0 would be complete cooccurrence
    def weight(self, other):
        if other.key in self.neighbors:
            return 1 - max([self.neighbors[other.key]/other.freq, self.neighbors[other.key]/self.freq])
        else:
            return 1
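# Illustrative example of Node.weight() with made-up counts: if 'dog' occurs 10
# times, 'bark' occurs 4 times and they cooccur 3 times, the weight is
# 1 - max(3/4, 3/10) = 0.25, i.e. frequent cooccurrence yields a small distance.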
# see Kruskal's algorithm
def minimum_spanning_tree(graph, target):
    pass

# components algorithm from Véronis (2004), converts the graph for the target into an MST
def components(graph, target):
    pass

# uses the MST to disambiguate a context, should ideally write to the evaluator format
def disambiguation(mst, context):
    pass
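# A minimal sketch of how the MST step could look (not the final implementation);
# it assumes the Graph above is converted to a networkx graph with Node.weight()
# as edge weight, and that the target is linked to every root hub with weight 0
# as in Véronis (2004). The helper name mst_sketch is illustrative only.
def mst_sketch(graph, target):
    G = nx.Graph()
    for node in graph.get_nodes():
        for neighbor_key in node.neighbors:
            G.add_edge(node.key, neighbor_key, weight=node.weight(graph.nodes[neighbor_key]))
    # attach the target word as the root so each hub heads one component of the MST
    G.add_node(target)
    for hub in graph.root_hubs():
        G.add_edge(target, hub.key, weight=0.0)
    return nx.minimum_spanning_tree(G, weight='weight')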
if __name__ == '__main__':

    filters = {'min_occurrences': 10,
               'min_cooccurrence': 5,
               'stop_words': ['utc'],
               'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
               'context_size': 4,
               'max_distance': 0.9}

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    G = Graph() # initialises graph
    G.build(corpus_path, sys.argv[1], filters) # builds graph from corpus for the target word with filters

    for hub in G.root_hubs():
        print(hub.key, '-->', list(hub.neighbors.keys()), '\n') # prints senses

    #G.view()
    #print(G.find_path('english', 'kennel'))
    G.draw() # draws graph
import os
import nltk

data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'

topics = open(data_path+'/topics.txt', 'r').readlines()[1:]
topics = [line.strip('\n').split('\t') for line in topics]
results = open(data_path+'/results.txt', 'r').readlines()[1:]
results = [line.strip('\n').split('\t') for line in results]

def get_paragraphs(word):
    files = [wiki_path+'/'+f for f in os.listdir(wiki_path)]
    paragraphs = list()
    space_word = word.replace('_', ' ')
    for f in files:
        with open(f, 'r') as source:
            for line in source:
                line = line.lower()
                if space_word in line:
                    new_line = line.replace(space_word, word)
                    tokens = nltk.word_tokenize(new_line)
                    if word in tokens:
                        paragraphs.append(tokens)
                        print(tokens)
    return paragraphs

#for topic in topics:
print(get_paragraphs('the_block'))