From 4701ee43d7812f72c8292dbc2a9bfdf24663295d Mon Sep 17 00:00:00 2001 From: Nadia <nwarslan@cl.uni-heidelberg.de> Date: Wed, 27 Feb 2019 23:15:13 +0100 Subject: [PATCH] added wsd_method 1 --- ...hod1_nadia_confusion.py => wsd_method1.py} | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) rename scripts/wsd/{wsd_method1_nadia_confusion.py => wsd_method1.py} (87%) diff --git a/scripts/wsd/wsd_method1_nadia_confusion.py b/scripts/wsd/wsd_method1.py similarity index 87% rename from scripts/wsd/wsd_method1_nadia_confusion.py rename to scripts/wsd/wsd_method1.py index 5bfbcce..483f657 100644 --- a/scripts/wsd/wsd_method1_nadia_confusion.py +++ b/scripts/wsd/wsd_method1.py @@ -145,7 +145,8 @@ def get_distance(node_combi, dist_dict): def map_distances(sentence, embed_dict): """ - + this function computes every distance between a word and all senses of other words + returns a dictionary {(id1,id2):distance,...} """ dist_dict ={} for i in range(len(sentence[:-1])): @@ -158,7 +159,7 @@ def map_distances(sentence, embed_dict): def find_optimum(sentence, dist_dict): """ - + this function finds the optimal sense combination of words in a sentence """ optimum = [0,0] for n_combi in node_combi(sentence): @@ -171,13 +172,11 @@ def find_optimum(sentence, dist_dict): return optimum -def map_sense(): +def node_combi(sentence): """ - + IN:[int] + this function computes every possible sense combination of a given sentence """ - -def node_combi(sentence): - for combi in itertools.product(*sentence): yield combi @@ -193,10 +192,11 @@ def write_answer_to_file(data, filename): def iterate_over(senseval_data, label_embeddings, lemmata_mapping, id_mapping, sense_key_mapping): """ - + this function iterates over sentences in senseval data + computes the optimal combination of senses given a sentence """ ambig_sents = open_mapping(senseval_data) - + # truncate sentence to its first 6 words if it contains 6 or more words for i, sent in enumerate(ambig_sents): if len(sent) >=6: ambig_sents[i] 
= sent[:6] @@ -204,12 +204,16 @@ def iterate_over(senseval_data, label_embeddings, lemmata_mapping, id_mapping, s sentences = [[[word['lemma'],word['pos'],word['key']] for word in sent] for sent in ambig_sents] solutions = [] + for sentence in sentences: mapped_sent = map_words(sentence, lemmata_mapping)[0] lemma_list = map_words(sentence, lemmata_mapping)[1] + + # check if compound words were split in two split = 0 if len(sentence) < len(mapped_sent): split = len(mapped_sent) - len(sentence) + # saves the index of an unknown word and removes it from mapped_sent no_key = [] for word in mapped_sent: if word[0]=='U': @@ -217,36 +221,33 @@ def iterate_over(senseval_data, label_embeddings, lemmata_mapping, id_mapping, s mapped_sent = [word for word in mapped_sent if word[0]!='U'] + # computes label embeddings for every node and saves them in a dict embed_dict = embed(mapped_sent, label_embeddings) - + # computes distances between nodes and saves them into a dict dist_dict = map_distances(mapped_sent, embed_dict) - + # computes the optimal sense combination optimum = find_optimum(mapped_sent, dist_dict) - + # maps node ids to wn 3 synset ids optimum = [id_mapping[str(o)] for o in optimum[1]] - # map syn id to sense_key + # map synset id to sense_key for i, o in enumerate(optimum): key = tuple(o + [lemma_list[i]]) if key in sense_key_mapping.keys(): optimum[i] = sense_key_mapping[key] else: optimum[i] = '' - + # adds the removed unknown words if len(no_key) != 0: for el in no_key: optimum.insert(el, '') - + # resplits compound words resplit = 0 - for i, word in enumerate(sentence): if split != 0: if '-' in word[0]: optimum[i] = optimum[i] + ' ' + optimum[i+1] optimum.remove(optimum[i+1]) split -= 1 - #print(sentence) - #print(mapped_sent) - #print(optimum) solution_sent = [(word[2],optimum[k]) for k, word in enumerate(sentence)] solutions += solution_sent @@ -262,7 +263,7 @@ if __name__ == '__main__': sense_key_mapping = open_sense_keys(SENSE_KEY_MAPPING) - 
solutions = iterate_over(SENSEVAL_2, label_embeddings, lemmata_mapping, sense_key_mapping) + solutions = iterate_over(SENSEVAL_2, label_embeddings, lemmata_mapping, id_mapping, sense_key_mapping) write_answer_to_file(solutions, OUTPUT2) -- GitLab