Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Absinth - A Small World of Semantic Similarity
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container Registry
Model registry
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Victor Zimmermann
Absinth - A Small World of Semantic Similarity
Commits
909c4e12
Commit
909c4e12
authored
7 years ago
by
Victor Zimmermann
Browse files
Options
Downloads
Patches
Plain Diff
Tried some object oriented stuff. Also recursive path finder function.
parent
deb3f84a
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
code/absinth.py
+139
-0
139 additions, 0 deletions
code/absinth.py
with
139 additions
and
0 deletions
code/absinth.py
0 → 100644
+
139
−
0
View file @
909c4e12
import
os
import
spacy
import
networkx
as
nx
import
matplotlib.pyplot
as
plt
# Load the spaCy English pipeline once at module import; Graph.build relies on
# it for tokenization and POS tagging of every corpus line.
# NOTE(review): 'en' is the pre-spaCy-3.x shortcut model name — confirm the
# installed spaCy version still resolves it.
nlp = spacy.load('en')
class Graph:
    """Co-occurrence graph for a target word.

    Maps token strings to Node objects; per-pair co-occurrence counts live on
    the nodes themselves (Node.neighbors).
    """

    def __init__(self, nodes=None):
        # Bug fix: the original default `nodes={}` is a mutable default
        # argument, silently shared by every Graph() created without an
        # explicit dict.
        self.nodes = {} if nodes is None else nodes

    def __contains__(self, key):
        """Return True if a node for `key` exists in the graph."""
        return key in self.nodes

    def get_nodes(self):
        """Return a view of all Node objects in the graph."""
        return self.nodes.values()

    def add_node(self, token):
        """Add a node for `token`, or bump its frequency if already present."""
        if token in self:
            self.nodes[token].freq += 1
        else:
            self.nodes[token] = Node(token)

    def remove_node(self, token):
        """Delete `token`'s node and drop it from every neighbor table."""
        del self.nodes[token]
        for node in self.nodes.values():
            # Bug fix: the original called remove_neighbor unconditionally,
            # raising KeyError for any node that never co-occurred with
            # `token`, and printed a leftover debug 'tada'.
            if token in node.neighbors:
                node.remove_neighbor(token)

    def add_edge(self, from_token, to_token):
        """Record one co-occurrence of `to_token` next to `from_token`."""
        self.nodes[from_token].add_neighbor(self.nodes[to_token])

    def build(self, corpus_path, word, filter_dict):
        """Populate the graph from a directory of plain-text corpus files.

        Keeps lines mentioning `word`, filters their tokens by stop words and
        POS tags, then adds the surviving context words as nodes with pairwise
        edges; finally prunes rare nodes and weak edges.

        Parameters:
            corpus_path: directory containing the corpus files.
            word: target word; multi-word targets use '_' as separator.
            filter_dict: dict with keys 'stop_words', 'allowed_tags',
                'context_size', 'min_occurrences', 'min_cooccurrence'.
        """
        files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path)]
        # Input words are separated with underscores; the corpus uses spaces.
        spaced_word = word.replace('_', ' ')

        for f in files:
            with open(f, 'r') as source:
                try:
                    for line in source:
                        line = line.lower()
                        if spaced_word not in line:
                            continue
                        # Re-join multi-word targets so spaCy sees one token.
                        spacy_line = nlp(line.replace(spaced_word, word))
                        if word not in [token.text for token in spacy_line]:
                            continue
                        tokens = [token.text for token in spacy_line
                                  if token.text != word
                                  and token.text not in filter_dict['stop_words']
                                  and token.tag_ in filter_dict['allowed_tags']]
                        # Only contexts with enough surviving words count.
                        if len(tokens) >= filter_dict['context_size']:
                            for node in tokens:
                                self.add_node(node)
                            # Connect every ordered pair of context words.
                            # Bug fix: the original also added (x, x)
                            # self-loops, letting a word count as its own
                            # co-occurrence neighbor.
                            for x in tokens:
                                for y in tokens:
                                    if x != y:
                                        self.add_edge(x, y)
                except UnicodeDecodeError:
                    # Best-effort: skip undecodable files but say so.
                    print('Failed to decode:', f)

        # Prune nodes below the occurrence threshold ...
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filter_dict['min_occurrences']}
        # ... and edges below the co-occurrence threshold.
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filter_dict['min_cooccurrence']}

    def find_path(self, start, end, path=None):
        """Depth-first search for any path from `start` to `end`.

        Returns the path as a list of tokens, or None if no path exists.
        `path` defaults to None (not []) to avoid the mutable-default pitfall.
        """
        if path is None:  # bug fix: original compared with `== None`
            path = []
        path = path + [start]
        if start == end:
            return path
        if start not in self.nodes:
            return None
        for neighbor in self.nodes[start].neighbors:
            if neighbor not in path:  # avoid cycles
                # (removed leftover debug `print(path)`)
                extended_path = self.find_path(neighbor, end, path)
                if extended_path:
                    return extended_path
        return None

    def view(self):
        """Print each node's token with the tokens of its neighbors."""
        for node in self.nodes.values():
            print(node.token, '-->', node.neighbors.keys())

    def draw(self):
        """Render the graph with networkx/matplotlib (blocks until closed)."""
        G = nx.Graph()
        for node in self.nodes.values():
            G.add_node(node.token)
            G.add_edges_from([(node.token, y) for y in node.neighbors.keys()])
        nx.draw(G, with_labels=True)
        plt.show()
class Node:
    """A word in the co-occurrence graph.

    Tracks the word's own frequency and, per neighboring word, how often the
    two co-occurred.
    """

    def __init__(self, token):
        self.token = token       # the word this node represents
        self.freq = 1            # occurrences seen so far (starts at first sighting)
        self.neighbors = dict()  # neighbor token -> co-occurrence count

    def add_neighbor(self, other):
        """Record one co-occurrence with `other` (a Node)."""
        token = other.token
        # dict.get collapses the original if/else count-or-insert.
        self.neighbors[token] = self.neighbors.get(token, 0) + 1

    def remove_neighbor(self, token):
        """Forget the neighbor `token`; raises KeyError if absent."""
        del self.neighbors[token]

    def weight(self, other):
        """Distance-like weight in (-inf, 1]: 1 minus the larger of the two
        conditional co-occurrence ratios. Raises KeyError if the words were
        never recorded as neighbors of each other.
        """
        # Bug fix: the original read `self.count` / `other.count`, an
        # attribute Node never defines (AttributeError at runtime); the
        # frequency attribute is `freq`.
        return 1 - max(self.neighbors[other.token] / other.freq,
                       other.neighbors[self.token] / self.freq)
if __name__ == '__main__':

    """
    Filter:
        min_occurrences: minimum occurrences of given word
        min_cooccurrence: minimum co-occurrences for word pairs
        stop_words: list of stop words
        allowed_tags: pos-tags of words
        context_size: minimum size for paragraphs after word filtering
    """

    # Thresholds and token filters used by Graph.build.
    # NOTE(review): 'max_distance' is defined here but never read by the
    # visible code — presumably reserved for a later clustering step; confirm.
    filter_dict = {'min_occurrences': 10,
                   'min_cooccurrence': 5,
                   'stop_words': [],
                   'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
                   'context_size': 4,
                   'max_distance': 0.9}

    # Hard-coded cluster paths; data_path is unused in this snippet.
    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    #corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    # Demo run: build the co-occurrence graph for 'dog', print it, try to
    # find a path between two context words, then draw it.
    G = Graph()
    G.build(corpus_path, 'dog', filter_dict)
    G.view()
    print(G.find_path('english', 'kennel'))
    G.draw()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment