From 5b86b63e0cfe7d7a6bdaafbe1f962bfc5d0dc009 Mon Sep 17 00:00:00 2001
From: chernenko <chernenko@cl.uni-heidelberg.de>
Date: Wed, 28 Mar 2018 14:52:44 +0200
Subject: [PATCH] Upload a new file

---
 experiments/variant_34.py | 305 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 305 insertions(+)
 create mode 100644 experiments/variant_34.py

diff --git a/experiments/variant_34.py b/experiments/variant_34.py
new file mode 100644
index 0000000..691fa1a
--- /dev/null
+++ b/experiments/variant_34.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3.5
+#variant_34.py
+#
+#usage:
+#$ ./variant_34.py
+#
+#author: <Tatjana Chernenko, Utaemon Toyota>
+
+
+"""
+<VARIANT 34>
+
+------------------- DESCRIPTION -------------------
+
+This pipeline implements the 34th variant of the system for the WSI (word sense induction) task (Task 11 at SemEval-2013).
+
+The system creates semantically related clusters from the given snippets (the text fragments returned by the search engine) for each pre-defined ambiguous topic.
+
+------------------- METHODS -------------------
+
+For WSI it uses the following methods:
+
+- Pre-processing: tokenization + punctuation removal
+- Language model: sent2vec, plaintexts_bigramm model trained on Wikipedia 2017
+- Compositional semantics: vector mixture model (BOW (bag-of-words) representation, vectors summed per snippet)
+- Clustering: cosine similarity, assigning each snippet to the subtopic with maximum similarity
+
+------------------- EVALUATION -------------------
+
+=========== Final average value of F1: =====================
+average F1 = 0.6013936257538159
+
+=========== Final average value of Rand Index: =============
+average Rand Index = 0.38823232323232326
+
+=========== Final average value of Adjusted Rand Index: ====
+average Adj Rand Index = -0.0038108019555718464
+
+=========== Final average value of Jaccard Index: ==========
+average Jaccard Index = 0.21276973487337159
+
+================ Statistics: ====================================
+============ average number of created clusters: 6.25
+============ average cluster size: 12.740079365079364
+
+"""
+import sys
+sys.path.append("/home/students/toyota")
+import sent2vec
+from collections import defaultdict, deque
+import re
+import math
+import numpy as np
+from sklearn.cluster import MeanShift
+import nltk
+from scipy import spatial
+
+# Read data; get the number of topics and the number of subtopics for each topic:
+def read_data(topics_file, subtopics_file, results_file):
+    with open(topics_file, "r") as f:
+        topics_data = f.readlines()
+    with open(subtopics_file, "r") as f:
+        subtopics_data = f.readlines()
+    with open(results_file, "r") as f:
+        results_data = f.readlines()
+    number_topics = 0
+    for line in topics_data:
+        if not line.startswith("ID"):
+            number_topics += 1
+    subtopics = {}
+    for line in subtopics_data:
+        if not line.startswith("ID"):
+            topic = line.split(" ")[0].split(".")[0]
+            if topic not in subtopics.keys():
+                subtopics[topic] = 1
+            else:
+                subtopics[topic] += 1
+    return topics_data, subtopics_data, results_data, number_topics, subtopics
+
+# Create a vocabulary with topics as keys and lists of snippets (with IDs) as values:
+def devide_text_into_topics(results_data, number_topics):
+    text = defaultdict(list)
+    for line in results_data:
+        if not line.startswith("ID"):
+            if line.split()[0].split(".")[0] not in text.keys():
+                text[line.split()[0].split(".")[0]] = []
+                help_structure = []
+                help_structure.append(line.split(" ")[0])
+                help_structure.append(line.split(" ")[2:])
+                text[line.split()[0].split(".")[0]].append(help_structure)
+            else:
+                help_structure = []
+                help_structure.append(line.split(" ")[0])
+                help_structure.append(line.split(" ")[2:])
+                text[line.split()[0].split(".")[0]].append(help_structure)
+    # Clean sentences from "\n":
+    for values in text.values():
+        for paragr in values:
+            for sent in paragr:
+                if sent == "\n":
+                    paragr.remove(sent)
+    #print(text["45"]) # example of the output for the topic "45"
+    return text
+
+# Preprocess data (tokenize every sentence in every topic):
+def preprocess_data(text):
+    # Tokenize:
+    for value in text.values():
+        for paragr in value:
+            for i in range(1, len(paragr)):
+                tokens = re.findall(r"\w+", str(paragr[i])) # remove punctuation
+                words = []
+                for word in tokens:
+                    if word == "n":
+                        words.append(" ") # "n" left over from an escaped "\n"; replace with a placeholder
+                    else:
+                        words.append(word.strip())
+                paragr[i] = words
+
+    prepr_data = text
+    #print(prepr_data["45"]) # example of the output for the topic "45"
+    return prepr_data
+
+# For every word in a snippet make a vector representation with sent2vec; compose a vector for every snippet as the sum of its BOW vectors:
+def compos_sent2vec(prepr_data, len_vector):
+    model = sent2vec.Sent2vecModel()
+    model.load_model("/proj/toyota/plaintexts_bigramm.bin")
+    for value in prepr_data.values():
+        for paragr in value: # one snippet
+            par_list = [] # list of sentence vectors for this snippet
+            vector_paragr = np.zeros(len_vector) # sum vector for the whole snippet
+            for sent in paragr[1:]: # one sentence of the snippet
+                #print("sent: ", sent)
+                vector_sent = [] # BOW list of word vectors for this sentence
+                for word in sent:
+                    try:
+                        query_vector = model.embed_sentence(word)
+                        vector_sent.append(query_vector) # add the word vector to the BOW list for this sentence
+                    except Exception:
+                        continue
+                summe = np.zeros(len_vector) # sum vector for this sentence
+                for vector in vector_sent:
+                    summe += vector # sum all word vectors of the sentence
+                par_list.append(summe) # add the sentence vector to the snippet list
+            for sentence in par_list: # for all sentence vectors
+                vector_paragr += sentence # sum them into the snippet vector
+            paragr.append(vector_paragr) # append the composed vector to the snippet
+    compos_vectors = prepr_data
+    #print(compos_vectors["45"]) # example of the output for the topic "45"
+    return compos_vectors
+
+# Create a vocabulary for subtopics with topics as keys and lists of subtopics (with IDs) as values:
+def devide_subtopics_into_topics(subtopics_data, number_topics):
+    subtopics_vectors = defaultdict(list)
+    for line in subtopics_data:
+        if not line.startswith("ID"):
+            if line.split()[0].split(".")[0] not in subtopics_vectors.keys():
+                subtopics_vectors[line.split()[0].split(".")[0]] = []
+                help_structure = []
+                help_structure.append(line.split(" ")[0])
+                help_structure.append(line.split(" ")[1])
+                subtopics_vectors[line.split()[0].split(".")[0]].append(help_structure)
+            else:
+                help_structure = []
+                help_structure.append(line.split(" ")[0])
+                help_structure.append(line.split(" ")[1])
+                subtopics_vectors[line.split()[0].split(".")[0]].append(help_structure)
+    # Clean sentences from "\n":
+    for values in subtopics_vectors.values():
+        for paragr in values:
+            for sent in paragr:
+                if sent == "\n":
+                    paragr.remove(sent)
+    #print("Subtopics Vectors: ", subtopics_vectors)
+    return subtopics_vectors
+
+# Make vector representations of the subtopics with sent2vec (sum of BOW word vectors):
+def compose_sent2vec_subtopics(subtopics_vectors, len_vector):
+    model = sent2vec.Sent2vecModel()
+    model.load_model("/proj/toyota/plaintexts_bigramm.bin")
+    for value in subtopics_vectors.values():
+        for paragr in value: # one subtopic entry
+            par_list = [] # list of partial sums for this entry
+            vector_paragr = np.zeros(len_vector) # sum vector for the whole entry
+            for sent in paragr[1:]:
+                vector_sent = [] # BOW list of vectors for this part of the entry
+                for word in sent:
+                    try:
+                        query_vector = model.embed_sentence(word)
+                        vector_sent.append(query_vector) # add the vector to the BOW list
+                    except Exception:
+                        continue
+                summe = np.zeros(len_vector) # sum vector for this part
+                for vector in vector_sent:
+                    summe += vector
+                par_list.append(summe) # add the partial sum to the list
+            for sentence in par_list:
+                vector_paragr += sentence # sum the partial vectors
+            paragr.append(vector_paragr) # append the composed subtopic vector
+    compos_subtopics_vectors = subtopics_vectors
+    #print("Composed Subtopics Vectors: ", compos_subtopics_vectors)
+    return compos_subtopics_vectors
+
+
+# Create a vocabulary with cosine similarities to the subtopics for every snippet:
+def cos_sim_vocab(compos_subtopics_vectors, compos_vectors):
+    all_sim = {}
+    for value in compos_vectors.values(): # value = all snippets (with meta information) for one topic
+        #print("\n\nTOPIC: ", value[0][0].split(".")[0], "\n")
+        for snippet in value: # one snippet, e.g. ['47.100', ['The|DET', ...], array([-2.11360547e+00, ...])]
+            similarities = {}
+            for all_subt in compos_subtopics_vectors[value[0][0].split(".")[0]]:
+                sim = 1 - spatial.distance.cosine(snippet[-1], all_subt[2])
+                a = []
+                a.append(all_subt[0])
+                a.append(sim)
+                if snippet[0] not in similarities.keys():
+                    similarities[snippet[0]] = []
+                    similarities[snippet[0]].append(a)
+                else:
+                    similarities[snippet[0]].append(a)
+            #print("\n\nSimilarities: ", similarities)
+            if value[0][0].split(".")[0] not in all_sim.keys():
+                all_sim[value[0][0].split(".")[0]] = []
+                all_sim[value[0][0].split(".")[0]].append(similarities)
+            else:
+                all_sim[value[0][0].split(".")[0]].append(similarities)
+    #print("\nAll similarities: ", all_sim)
+    return all_sim
+
+# Like WSD: cluster snippets by cosine similarity to the subtopics without a similarity threshold (use the maximum cosine similarity); create an output file:
+def cos_sim_clustering(cos_sim_vocab, output):
+    sim_factor = 0
+    f = open(output, "a")
+    f.write("subTopicID"+" "+"resultID\n")
+    lines = []
+    result = {}
+    for topic in cos_sim_vocab.keys(): # e.g. topic "46"
+        #print("TOPIC: ", topic)
+        if topic not in result.keys():
+            result[topic] = []
+        for snippet in cos_sim_vocab[topic]: # one vocabulary with snippet IDs as keys
+            #print("\nSNIPPET: ", snippet)
+            voc = {}
+            max_sim = 0.0
+            max_id = ""
+            for simil in snippet.values():
+                #print("\nsimil: ", simil)
+                for el in simil:
+                    #print(type(el[1]), type(max_sim))
+                    if el[1] > max_sim and el[1] > float(sim_factor):
+                        max_sim = el[1]
+                        max_id = el[0]
+            max_value = []
+            structure = []
+            if max_sim != 0.0:
+                structure.append(max_id)
+                structure.append(max_sim)
+                max_value.append(structure)
+            #print("\nMax_value: ", max_value)
+            if max_value != []:
+                for el in snippet.keys():
+                    #print("el ", el)
+                    if el not in voc.keys():
+                        voc[el] = []
+                        voc[el].append(max_value)
+                    else:
+                        voc[el].append(max_value)
#print("\nvoc: ", voc, "for snippet: ", snippet) + if voc!={}: + result[topic].append(voc) + for value in result.values(): + #print("\nvalue:\n", value) + for el in value: + for e in el.keys(): + #print("subtid: ", el[e][0][0][0]) + one_line = str(el[e][0][0][0]) + " " + (str(e)+"\n") + lines.append(one_line) + + sort = sorted(lines) + for el in sort: + f.write(el) + f.close() + return result + + + +if __name__ == "__main__": + path = "/proj/toyota/Pool" + trial_path = "/semeval-2013_task11_trial" + trial_topics = "/topics.txt" + trial_subtopics = "/subTopics.txt" + trial_results = "/results.txt" + + topics_data, subtopics_data, results_data, number_topics, subtopics = read_data(path+trial_path+trial_topics,path+trial_path+trial_subtopics,path+trial_path+trial_results) + text = devide_text_into_topics(results_data, number_topics) + prepr_data = preprocess_data(text) + compos_vectors = compos_sent2vec(prepr_data, 700) + subtopics_vectors = devide_subtopics_into_topics(subtopics_data, number_topics) + compos_subtopics_vectors = compose_sent2vec_subtopics(subtopics_vectors, 700) + cos_sim_vocab = cos_sim_vocab(compos_subtopics_vectors, compos_vectors) + clusters = cos_sim_clustering(cos_sim_vocab, path+"/output_34.txt") + print("Done.") + + -- GitLab