Commit 154ec5e8 authored by Victor Zimmermann

Clean up.

parent 9c8316f5
import os # for reading files
import sys
#from tqdm import tqdm # for counting seconds
import spacy # for nlp
import networkx as nx # for visualisation
import matplotlib.pyplot as plt # for visualisation
import copy # for deepcopy
import numpy as np # for calculations
nlp = spacy.load('en') # standard english nlp
# wrapper class for nodes + functions on nodes
class Graph:

    # can be initialised with nodes
    def __init__(self, nodes=None):
        # avoid a shared mutable default argument; each instance gets a fresh dict
        self.nodes = nodes if nodes is not None else {}

    # 'key in Graph' returns True if a node with this key exists in the Graph
    def __contains__(self, key):
        return key in self.nodes

    # returns all nodes (not keys)
    def get_nodes(self):
        return self.nodes.values()

    # adds node or ups frequency of node if already in graph
    def add_node(self, key):
        if key in self:
            self.nodes[key].freq += 1
        else:
            self.nodes[key] = Node(key)

    # removes node (doesn't work)
    #def remove_node(self, key):
    #    del self.nodes[key]
    #    for node in self.nodes.values():
    #        node.remove_neighbor(key)

    # adds neighbor to node
    def add_edge(self, from_key, to_key):
        self.nodes[from_key].add_neighbor(self.nodes[to_key])
    # builds graph from corpus for target word with applied filters
    # filters: min_occurrences, min_cooccurrence, stop_words, allowed_tags, context_size, max_distance
    def build(self, corpus_path, word, filters):
        files = [corpus_path+'/'+f for f in os.listdir(corpus_path)] # list of file paths (no other files should be in this directory)
        spaced_word = word.replace('_', ' ') # input words are generally separated with underscores
        for f in files[:]: # iterates over corpus
            with open(f, 'r') as source:
                try: # some files raise decoding errors that would otherwise abort the iteration
                    for line in source:
                        line = line.lower()
                        if spaced_word in line: # greedy filter (no processing on most lines)
                            new_line = line.replace(spaced_word, word)
                            spacy_line = nlp(new_line)
                            if word in [token.text for token in spacy_line]: # detailed filter on the tokenised line
                                tokens = list() # collects tokens
                                for token in spacy_line:
                                    text = token.text
                                    tag = token.tag_
                                    # if token is not the target, not a stop word and has the right pos tag
                                    if text != word and text not in filters['stop_words'] and tag in filters['allowed_tags']:
                                        tokens.append(token.text)
                                # if the paragraph is the right size after filtering
                                if len(tokens) >= filters['context_size']:
                                    for key in set(tokens):
                                        self.add_node(key)
                                    for from_key, to_key in {(x, y) for x in tokens for y in tokens if x != y}:
                                        self.add_edge(from_key, to_key)
                except UnicodeDecodeError:
                    print('Failed to decode:', f)

        # removes tokens with too few occurrences
        self.nodes = {key: value for key, value in self.nodes.items()
                      if value.freq >= filters['min_occurrences']}

        # removes unnecessary edges and pairs with too few cooccurrences
        for node in self.nodes.values():
            node.neighbors = {key: value for key, value in node.neighbors.items()
                              if value >= filters['min_cooccurrence']
                              and key in self.nodes
                              and node.weight(self.nodes[key]) <= filters['max_distance']}

        # removes singletons
        self.nodes = {key: value for key, value in self.nodes.items()
                      if len(value.neighbors) > 0}
    # finds a path from one node to another
    # variation on a function from https://www.python-course.eu/graphs_python.php
    def find_path(self, start, end, path=None):
        if path is None:
            path = []
        path = path + [start]
        if start == end:
            return path
        if start not in self.nodes:
            return None
        for neighbor in self.nodes[start].neighbors.keys():
            if neighbor not in path:
                extended_path = self.find_path(neighbor, end, path)
                if extended_path:
                    return extended_path
        return None
    # variation on the root hub algorithm from Véronis (2004)
    def root_hubs(self, min_neighbors=6, threshold=0.8):
        G = copy.deepcopy(self)
        V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq) # -1 to sort by descending frequency
        H = [] # collects root hubs

        while V:
            v = V[0]
            if len(v.neighbors) >= min_neighbors:
                # mfn: most frequent neighbors of the candidate hub
                mfn = sorted(v.neighbors.keys(), key=lambda key: v.neighbors[key], reverse=True)[:min_neighbors]
                if np.mean([v.weight(G.nodes[n]) for n in mfn]) < threshold:
                    H.append(v)
                # removes the candidate and its neighborhood from the working copy
                G.nodes = {key: value for key, value in G.nodes.items()
                           if key != v.key and key not in v.neighbors}
                for node in G.nodes.values():
                    node.neighbors = {key: value for key, value in node.neighbors.items() if key in G.nodes}
                V = sorted(G.nodes.values(), key=lambda value: -1 * value.freq)
            else:
                return H

        return H
    # presents nodes in format key --> (weight, neighbors)
    def view(self):
        for node in self.nodes.values():
            print(node.key, '-->', [(node.weight(self.nodes[key]), key) for key in node.neighbors.keys()])

    # draws graph using networkx
    def draw(self):
        G = nx.Graph()
        for node in self.nodes.values():
            G.add_node(node.key)
            G.add_edges_from([(node.key, y) for y in node.neighbors.keys()])
        nx.draw(G, with_labels=True)
        plt.show()
# class for single words with frequency and neighbors
class Node:

    # initialises node with key and a frequency of 1
    def __init__(self, key):
        self.key = key
        self.freq = 1
        self.neighbors = dict()

    # adds neighbor to the neighbors dict or ups its cooccurrence frequency
    def add_neighbor(self, other):
        if other.key in self.neighbors:
            self.neighbors[other.key] += 1
        else:
            self.neighbors[other.key] = 1

    # removes neighbor from dictionary
    def remove_neighbor(self, key):
        del self.neighbors[key]

    # calculates weight between self and other node
    # if the node is not a neighbor, returns 1 (no cooccurrence); 0 would be complete cooccurrence
    def weight(self, other):
        if other.key in self.neighbors:
            return 1 - max([self.neighbors[other.key]/other.freq, self.neighbors[other.key]/self.freq])
        else:
            return 1
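# Illustrative example of Node.weight() with made-up counts: if 'dog' occurs 10
# times, 'bark' occurs 4 times and they cooccur 3 times, the weight is
# 1 - max(3/4, 3/10) = 0.25, i.e. frequent cooccurrence yields a small distance.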
# see Kruskal's algorithm
def minimum_spanning_tree(graph, target):
    pass

# components algorithm from Véronis (2004), converts the graph for the target into an MST
def components(graph, target):
    pass

# uses the MST to disambiguate a context, should ideally write to the evaluator format
def disambiguation(mst, context):
    pass
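# A minimal sketch of how the MST step could look (not the final implementation);
# it assumes the Graph above is converted to a networkx graph with Node.weight()
# as edge weight, and that the target is linked to every root hub with weight 0
# as in Véronis (2004). The helper name mst_sketch is illustrative only.
def mst_sketch(graph, target):
    G = nx.Graph()
    for node in graph.get_nodes():
        for neighbor_key in node.neighbors:
            G.add_edge(node.key, neighbor_key, weight=node.weight(graph.nodes[neighbor_key]))
    # attach the target word as the root so each hub heads one component of the MST
    G.add_node(target)
    for hub in graph.root_hubs():
        G.add_edge(target, hub.key, weight=0.0)
    return nx.minimum_spanning_tree(G, weight='weight')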
if __name__ == '__main__':

    filters = {'min_occurrences': 10,
               'min_cooccurrence': 5,
               'stop_words': ['utc'],
               'allowed_tags': ['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
               'context_size': 4,
               'max_distance': 0.9}

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    G = Graph() # initialises graph
    G.build(corpus_path, sys.argv[1], filters) # builds graph from corpus for the target word with filters

    for hub in G.root_hubs():
        print(hub.key, '-->', list(hub.neighbors.keys()), '\n') # prints senses

    #G.view()
    #print(G.find_path('english', 'kennel'))
    G.draw() # draws graph
import os
import nltk

data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'

topics = open(data_path+'/topics.txt', 'r').readlines()[1:]
topics = [line.strip('\n').split('\t') for line in topics]
results = open(data_path+'/results.txt', 'r').readlines()[1:]
results = [line.strip('\n').split('\t') for line in results]

def get_paragraphs(word):
    files = [wiki_path+'/'+f for f in os.listdir(wiki_path)]
    paragraphs = list()
    space_word = word.replace('_', ' ')
    for f in files:
        with open(f, 'r') as source:
            for line in source:
                line = line.lower()
                if space_word in line:
                    new_line = line.replace(space_word, word)
                    tokens = nltk.word_tokenize(new_line)
                    if word in tokens:
                        paragraphs.append(tokens)
                        print(tokens)
    return paragraphs

#for topic in topics:
print(get_paragraphs('the_block'))