import os

import spacy
import networkx as nx
import matplotlib.pyplot as plt

# NOTE(review): spaCy >= 3 requires a full model name such as
# 'en_core_web_sm'; 'en' is the legacy v1/v2 shortcut — confirm the
# installed spaCy version before changing this.
nlp = spacy.load('en')


class Graph:
    """Co-occurrence graph over corpus tokens.

    Nodes are word tokens (``Node`` instances keyed by their text); an edge
    from A to B counts how often A and B appeared together in a qualifying
    context window around the target word.
    """

    def __init__(self, nodes=None):
        # Avoid the mutable-default-argument trap: a shared `{}` default
        # would be reused by every Graph() created without arguments.
        self.nodes = {} if nodes is None else nodes

    def __contains__(self, key):
        return key in self.nodes

    def get_nodes(self):
        return self.nodes.values()

    def add_node(self, token):
        """Insert *token*, or bump its frequency if it is already present."""
        if token in self:
            self.nodes[token].freq += 1
        else:
            self.nodes[token] = Node(token)

    def remove_node(self, token):
        """Delete *token*'s node and drop it from every remaining neighbor list."""
        del self.nodes[token]
        for node in self.nodes.values():
            node.remove_neighbor(token)

    def add_edge(self, from_token, to_token):
        """Record one co-occurrence of *to_token* next to *from_token*."""
        self.nodes[from_token].add_neighbor(self.nodes[to_token])

    def build(self, corpus_path, word, filter_dict):
        """Populate the graph from every file in *corpus_path*.

        Lines containing *word* (underscores in *word* stand for spaces)
        are tokenized with spaCy; tokens that survive the stop-word and
        POS-tag filters become nodes, and every ordered pair of distinct
        surviving tokens becomes an edge. Afterwards, rare nodes and weak
        edges are pruned according to *filter_dict*.

        filter_dict keys used here: 'stop_words', 'allowed_tags',
        'context_size', 'min_occurrences', 'min_cooccurrence'.
        """
        files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path)]
        spaced_word = word.replace('_', ' ')  # input words are separated with underscores

        for f in files:
            with open(f, 'r') as source:
                try:
                    for line in source:
                        line = line.lower()
                        if spaced_word not in line:
                            continue
                        # Re-join the multiword expression so spaCy sees it
                        # as a single token.
                        spacy_line = nlp(line.replace(spaced_word, word))
                        if word not in [token.text for token in spacy_line]:
                            continue
                        tokens = [token.text for token in spacy_line
                                  if token.text != word
                                  and token.text not in filter_dict['stop_words']
                                  and token.tag_ in filter_dict['allowed_tags']]
                        if len(tokens) >= filter_dict['context_size']:
                            for token in tokens:
                                self.add_node(token)
                            # x != y: a token must not co-occur with itself
                            # (the original created self-loops here, inflating
                            # each node's own co-occurrence count).
                            for x in tokens:
                                for y in tokens:
                                    if x != y:
                                        self.add_edge(x, y)
                except UnicodeDecodeError:
                    print('Failed to decode:', f)

        # Prune rare words, then weak co-occurrence links.
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filter_dict['min_occurrences']}
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filter_dict['min_cooccurrence']}

    def find_path(self, start, end, path=None):
        """Depth-first search for a path of tokens from *start* to *end*.

        Returns the path as a list of tokens, or None if no path exists.
        """
        if path is None:
            path = []
        path = path + [start]  # new list: do not mutate the caller's path
        if start == end:
            return path
        if start not in self.nodes:
            return None
        for neighbor in self.nodes[start].neighbors:
            if neighbor not in path:  # avoid cycles
                extended_path = self.find_path(neighbor, end, path)
                if extended_path:
                    return extended_path
        return None

    def view(self):
        """Print each node with its neighbor tokens."""
        for node in self.nodes.values():
            print(node.token, '-->', node.neighbors.keys())

    def draw(self):
        """Render the graph with networkx/matplotlib (blocks until closed)."""
        G = nx.Graph()
        for node in self.nodes.values():
            G.add_node(node.token)
            G.add_edges_from([(node.token, y) for y in node.neighbors])
        nx.draw(G, with_labels=True)
        plt.show()


class Node:
    """A word in the co-occurrence graph.

    Tracks the word's corpus frequency and, per neighboring word, how often
    the two co-occurred.
    """

    def __init__(self, token):
        self.token = token
        self.freq = 1            # occurrences of this token in the corpus
        self.neighbors = dict()  # neighbor token -> co-occurrence count

    def add_neighbor(self, other):
        """Count one co-occurrence with *other* (a Node)."""
        token = other.token
        if token in self.neighbors:
            self.neighbors[token] += 1
        else:
            self.neighbors[token] = 1

    def remove_neighbor(self, token):
        # pop() with a default: not every node has every removed token as a
        # neighbor, and an unconditional `del` raised KeyError from
        # Graph.remove_node for such nodes.
        self.neighbors.pop(token, None)

    def weight(self, other):
        """Distance in [0, 1]: 1 minus the stronger relative co-occurrence.

        Bug fix: Node has no `count` attribute — the frequency field is
        `freq`; the original raised AttributeError on every call.
        """
        return 1 - max(self.neighbors[other.token] / other.freq,
                       other.neighbors[self.token] / self.freq)


if __name__ == '__main__':
    """
    Filter:
        min_occurrences: minimum occurrences of given word
        min_cooccurrence: minimum co-occurrences for word pairs
        stop_words: list of stop words
        allowed_tags: pos-tags of words
        context_size: minimum size for paragraphs after word filtering
    """
    filter_dict = {'min_occurrences': 10, 'min_cooccurrence': 5,
                   'stop_words': [],
                   'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
                   'context_size': 4, 'max_distance': 0.9}
    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    #corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    G = Graph()
    G.build(corpus_path, 'dog', filter_dict)
    G.view()
    print(G.find_path('english', 'kennel'))
    G.draw()