Commit d1bbe558 authored by chernenko's avatar chernenko

Upload a new file

parent 5940ac60
#!/usr/bin/env python3.5
#variant_36.py
#
#usage:
#$ ./variant_36.py
#
#author: <Tatjana Chernenko, Utaemon Toyota>
"""
<VARIANT 36>
------------------- DESCRIPTION -------------------
The pipeline system implements variant 36 of the pipeline for the WSI (word sense induction) task (Task 11 at SemEval-2013).
The system builds semantically related clusters from the given snippets (the text fragments returned by the search engine) for each pre-defined ambiguous topic.
------------------- METHODS -------------------
For the WSI task it uses the following methods:
- Pre-processing: tokenization + punctuation removal
- Language model: sent2vec with the pretrained wiki_unigrams model
- Compositional semantics: vector mixture model (bag-of-words (BOW) representation, summing the word vectors of each snippet; a minimal sketch follows the imports below)
- Clustering: assign each snippet to the subtopic with the maximum cosine similarity
------------------- EVALUATION -------------------
=========== Final average value of F1: =====================
average F1 = 0.653265727172881
=========== Final average value of Rand Index: =============
average Rand Index = 0.43060606060606055
=========== Final average value of Adjusted Rand Index: ====
average Adj Rand Index = 0.03896746404850587
=========== Final average value of Jaccard Index: ==========
average Jaccard Index = 0.28157161636265343
================ Statistics: ====================================
============ average number of created clusters: 7.0
============ average cluster size: 18.010606060606058
"""
import sys
sys.path.append("/home/students/toyota")
import sent2vec
from collections import defaultdict, deque
import re
import math
import numpy as np
from sklearn.cluster import MeanShift
import nltk
from scipy import spatial
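
# --------------------------------------------------------------------------
# Illustrative sketch (not called by the pipeline): how a single snippet
# vector is composed, as described in the METHODS section of the docstring.
# The function name and signature are hypothetical; the pipeline itself does
# this inline in compos_sent2vec() and compose_sent2vec_subtopics(). It
# assumes, as the code below does, that model.embed_sentence() returns a
# 1-D numpy vector of length len_vector.
def _compose_snippet_vector(model, tokens, len_vector=600):
    snippet_vector = np.zeros(len_vector)    # bag-of-words sum of the word vectors
    for token in tokens:
        try:
            snippet_vector += model.embed_sentence(token)
        except Exception:
            continue                         # skip tokens the model cannot embed
    return snippet_vector
# --------------------------------------------------------------------------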
# Read Data; get the number of topics and the number of subtopics for each topic:
def read_data(topics_file, subtopics_file, results_file):
    with open(topics_file, "r") as f:
        topics_data = f.readlines()
    with open(subtopics_file, "r") as f:
        subtopics_data = f.readlines()
    with open(results_file, "r") as f:
        results_data = f.readlines()
    number_topics = 0
    for line in topics_data:
        if not line.startswith("ID"):
            number_topics += 1
    subtopics = {}
    for line in subtopics_data:
        if not line.startswith("ID"):
            topic = line.split(" ")[0].split(".")[0]
            if topic not in subtopics.keys():
                subtopics[topic] = 1
            else:
                subtopics[topic] += 1
    return topics_data, subtopics_data, results_data, number_topics, subtopics
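
# Input format, as assumed by the parsing above and below: each of the three
# files starts with a header line beginning with "ID", which is skipped.
# Subtopic IDs have the form "<topic>.<subtopic>" and snippet IDs the form
# "<topic>.<snippet>"; in results.txt the snippet text starts at the third
# whitespace-separated field of each line.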
# Create a dictionary with topics as keys and lists of [snippet ID, snippet words] entries as values:
def devide_text_into_topics(results_data, number_topics):
    text = defaultdict(list)
    for line in results_data:
        if not line.startswith("ID"):
            if line.split()[0].split(".")[0] not in text.keys():
                text[line.split()[0].split(".")[0]] = []
                help_structure = []
                help_structure.append(line.split(" ")[0])
                help_structure.append(line.split(" ")[2:])
                text[line.split()[0].split(".")[0]].append(help_structure)
            else:
                help_structure = []
                help_structure.append(line.split(" ")[0])
                help_structure.append(line.split(" ")[2:])
                text[line.split()[0].split(".")[0]].append(help_structure)
    # Clean entries from "\n":
    for values in text.values():
        for paragr in values:
            for sent in paragr:
                if sent == "\n":
                    paragr.remove(sent)
    #print(text["45"]) # example of the output for the topic "45"
    return text
# Preprocess Data (tokenize every sentence in every topic):
def preprocess_data(text):
    # Tokenize:
    for value in text.values():
        for paragr in value:
            for i in range(1, len(paragr)):
                # str() turns the word list into its repr, so newlines appear as the
                # two characters "\\n"; \w+ then also strips all punctuation.
                tokens = re.findall(r"\w+", str(paragr[i]))
                words = []
                for word in tokens:
                    if word == "n":
                        # stray token produced by the escaped newline above
                        words.append(" ")
                    else:
                        words.append(word.strip())
                paragr[i] = words
    prepr_data = text
    #print(prepr_data["45"]) # example of the output for the topic "45"
    return prepr_data
# For every word in a snippet make a vector representation with sent2vec; build a
# compositional vector for every snippet as the sum of its BOW word vectors:
def compos_sent2vec(prepr_data, len_vector):
    model = sent2vec.Sent2vecModel()
    model.load_model("/proj/toyota/wiki_unigrams.bin")
    for value in prepr_data.values():
        for paragr in value:                      # one snippet entry
            par_list = []                         # collected sentence vectors of this snippet
            vector_paragr = np.zeros(len_vector)  # final vector for the snippet
            for sent in paragr[1:]:               # token list(s) of the snippet
                #print("sent: ", sent)
                vector_sent = []                  # word vectors of this sentence
                for word in sent:
                    try:
                        query_vector = model.embed_sentence(word)
                        vector_sent.append(query_vector)  # BOW: one vector per word
                    except Exception:
                        continue
                summe = np.zeros(len_vector)      # sum of the word vectors
                for vector in vector_sent:
                    summe += vector
                par_list.append(summe)            # sentence vector = sum of its word vectors
            for sentence in par_list:
                vector_paragr += sentence         # snippet vector = sum of its sentence vectors
            paragr.append(vector_paragr)          # append the snippet vector to the snippet entry
    compos_vectors = prepr_data
    #print(compos_vectors["45"]) # example of the output for the topic "45"
    return compos_vectors
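
# After compos_sent2vec(), every snippet entry in compos_vectors[topic] has the
# form [snippet_id, token_list, snippet_vector], where snippet_vector is the
# 600-dimensional numpy sum appended above and is read back via snippet[-1] in
# cos_sim_vocab() below.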
# Create a dictionary for subtopics with topics as keys and lists of [subtopic ID, subtopic label] entries as values:
def devide_subtopics_into_topics(subtopics_data, number_topics):
    subtopics_vectors = defaultdict(list)
    for line in subtopics_data:
        if not line.startswith("ID"):
            if line.split()[0].split(".")[0] not in subtopics_vectors.keys():
                subtopics_vectors[line.split()[0].split(".")[0]] = []
                help_structure = []
                help_structure.append(line.split(" ")[0])
                help_structure.append(line.split(" ")[1])
                subtopics_vectors[line.split()[0].split(".")[0]].append(help_structure)
            else:
                help_structure = []
                help_structure.append(line.split(" ")[0])
                help_structure.append(line.split(" ")[1])
                subtopics_vectors[line.split()[0].split(".")[0]].append(help_structure)
    # Clean entries from "\n":
    for values in subtopics_vectors.values():
        for paragr in values:
            for sent in paragr:
                if sent == "\n":
                    paragr.remove(sent)
    #print("Subtopics Vectors: ", subtopics_vectors)
    return subtopics_vectors
# Make vector representations of the subtopics with sent2vec (sum of BOW vectors):
def compose_sent2vec_subtopics(subtopics_vectors, len_vector):
    model = sent2vec.Sent2vecModel()
    model.load_model("/proj/toyota/wiki_unigrams.bin")
    for value in subtopics_vectors.values():
        for paragr in value:                      # one subtopic entry: [ID, label field]
            par_list = []                         # collected vectors of this entry
            vector_paragr = np.zeros(len_vector)  # final vector for the subtopic
            for sent in paragr[1:]:               # the label field
                vector_sent = []
                for word in sent:
                    try:
                        query_vector = model.embed_sentence(word)
                        vector_sent.append(query_vector)
                    except Exception:
                        continue
                summe = np.zeros(len_vector)      # sum of the embedded units
                for vector in vector_sent:
                    summe += vector
                par_list.append(summe)
            for sentence in par_list:
                vector_paragr += sentence
            paragr.append(vector_paragr)          # append the subtopic vector to the entry
    compos_subtopics_vectors = subtopics_vectors
    #print("Composed Subtopics Vectors: ", compos_subtopics_vectors)
    return compos_subtopics_vectors
# Create a dictionary with the cosine similarities to the subtopics for every snippet:
def cos_sim_vocab(compos_subtopics_vectors, compos_vectors):
    all_sim = {}
    for value in compos_vectors.values():  # value = all snippet entries of one topic
        #print("\n\nTOPIC: ", value[0][0].split(".")[0], "\n")
        for snippet in value:              # e.g. ['47.100', ['The|DET', ...], array([-2.11360547e+00, ...])]
            similarities = {}
            #print("\nsnippet id: ", snippet[0])
            for all_subt in compos_subtopics_vectors[value[0][0].split(".")[0]]:
                sim = 1 - spatial.distance.cosine(snippet[-1], all_subt[2])
                #print("sim: ", sim)
                a = []
                a.append(all_subt[0])
                a.append(sim)
                if snippet[0] not in similarities.keys():
                    similarities[snippet[0]] = []
                    similarities[snippet[0]].append(a)
                else:
                    similarities[snippet[0]].append(a)
            #print("\n\nSimilarities: ", similarities)
            if value[0][0].split(".")[0] not in all_sim.keys():
                all_sim[value[0][0].split(".")[0]] = []
                all_sim[value[0][0].split(".")[0]].append(similarities)
            else:
                all_sim[value[0][0].split(".")[0]].append(similarities)
    #print("\nAll similarities: ", all_sim)
    return all_sim
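
# Note on the similarity used above: scipy.spatial.distance.cosine(u, v) returns
# the cosine *distance* 1 - (u . v) / (||u|| * ||v||), so 1 - distance.cosine(u, v)
# recovers the plain cosine similarity between the snippet vector and the
# subtopic vector.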
# Like WSD: cluster the snippets by their cosine similarity to the subtopics, without a
# similarity threshold (use the maximum cosine similarity); create an output file:
def cos_sim_clustering(cos_sim_vocab, output):
    sim_factor = 0
    f = open(output, "a")
    f.write("subTopicID" + " " + "resultID\n")
    lines = []
    result = {}
    for topic in cos_sim_vocab.keys():
        #print("TOPIC: ", topic)
        if topic not in result.keys():
            result[topic] = []
        for snippet in cos_sim_vocab[topic]:  # one dict per snippet, keyed by snippet ID
            voc = {}
            max_sim = 0.0
            max_id = ""
            for simil in snippet.values():
                for el in simil:
                    if el[1] > max_sim and el[1] > float(sim_factor):
                        max_sim = el[1]
                        max_id = el[0]
            max_value = []
            structure = []
            if max_sim != 0.0:
                structure.append(max_id)
                structure.append(max_sim)
                max_value.append(structure)
            if max_value != []:
                for el in snippet.keys():
                    if el not in voc.keys():
                        voc[el] = []
                        voc[el].append(max_value)
                    else:
                        voc[el].append(max_value)
            if voc != {}:
                result[topic].append(voc)
    for value in result.values():
        for el in value:
            for e in el.keys():
                one_line = str(el[e][0][0][0]) + " " + (str(e) + "\n")
                lines.append(one_line)
    sort = sorted(lines)
    for el in sort:
        f.write(el)
    f.close()
    return result
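
# The output file written above contains one line per clustered snippet:
# "<ID of the most similar subtopic> <snippet ID>", preceded by the header
# "subTopicID resultID" and sorted lexicographically. The file is opened in
# append mode, so lines from repeated runs accumulate; with sim_factor = 0,
# a snippet is left unassigned only if none of its subtopic similarities is
# positive.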
if __name__ == "__main__":
    path = "/proj/toyota/Pool"
    trial_path = "/semeval-2013_task11_trial"
    trial_topics = "/topics.txt"
    trial_subtopics = "/subTopics.txt"
    trial_results = "/results.txt"
    topics_data, subtopics_data, results_data, number_topics, subtopics = read_data(
        path + trial_path + trial_topics,
        path + trial_path + trial_subtopics,
        path + trial_path + trial_results)
    text = devide_text_into_topics(results_data, number_topics)
    prepr_data = preprocess_data(text)
    compos_vectors = compos_sent2vec(prepr_data, 600)
    subtopics_vectors = devide_subtopics_into_topics(subtopics_data, number_topics)
    compos_subtopics_vectors = compose_sent2vec_subtopics(subtopics_vectors, 600)
    all_similarities = cos_sim_vocab(compos_subtopics_vectors, compos_vectors)
    clusters = cos_sim_clustering(all_similarities, path + "/output_36.txt")
    print("Done.")