Commit d001f1ee authored by weber

Merge remote-tracking branch 'origin/master'

parents 6c41179c d9912a33
3 files deleted (no preview for this file type)
@@ -3,25 +3,26 @@ import pickle as pkl
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
file_path = "eng-all-words_seneval2.test.xml"
#file_path = "english-all-words.xml"
file_path2 = "eng-all-words_seneval2.test.xml" #senseval2
file_path3 = "english-all-words.xml" #senseval3
tree_paths3 = {"d000": "cl23.mrg", "d001": "wsj_1695.mrg", "d002":"wsj_1778.mrg"}
tree_paths2 = {"d00": "wsj_0089.mrg", "d01": "wsj_0465.mrg", "d02": "wsj_1286.mrg"}
tree_paths2 = {"d00": "wsj_0089.mrg", "d01": "wsj_0465.mrg", "d02": "wsj_1286.mrg"} #senseval2
tree_paths3 = {"d000": "cl23.mrg", "d001": "wsj_1695.mrg", "d002":"wsj_1778.mrg"} #senseval3
def get_stopword_list(path):
with open (path, "r") as f:
def get_stopword_list(stop_path):
with open (stop_path, "r") as f:
stopli = []
for line in f:
stopli.append(line[:-1])
return stopli
def get_infos(path = tree_paths2["d00"]):
with open (path, "r") as f:
def get_infos(tree_path):
with open (tree_path, "r") as f:
file = f.read()
li = re.findall("\([\.,'`A-Z:]+\$? [&%:\\\/`'a-zA-Z0-9\.,\?!-]+\)", file)
return li
def get_xml():
def get_xml(file_path):
with open(file_path, "r") as f:
list_lines = f.readlines()[4:]
text_dict = {}
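The regex in get_infos pulls "(TAG token)" leaf pairs out of the Penn Treebank .mrg parse trees. A minimal sketch of the same idea on a toy fragment, using a simplified raw-string pattern rather than the exact character classes used above:
import re
sample = "(S (NP (DT The) (NN cat)) (VP (VBZ sits)))"
# simplified stand-in for the pattern used in get_infos
leaves = re.findall(r"\(([A-Z.$,:'`]+) ([^()\s]+)\)", sample)
print(leaves)   # [('DT', 'The'), ('NN', 'cat'), ('VBZ', 'sits')]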
@@ -101,9 +102,9 @@ def validation_line(line):
line_info.append(line)
return line_info
def get_sentence(key = "d000"):
def get_sentence(key, file_path):
info_li = []
for token in get_xml()[key]:
for token in get_xml(file_path)[key]:
if len(token)==1:
info_li.append(None)
elif len(token) == 0:
@@ -112,7 +113,7 @@ def get_sentence(key = "d000"):
info_li.append([token[1], token[2]])
return info_li
def get_tag(treebank_tag):
def get_wn_tag(treebank_tag):
if treebank_tag.startswith('J') or treebank_tag == "ADJ":
return "a"
elif treebank_tag.startswith('V'):
@@ -124,48 +125,65 @@ def get_tag(treebank_tag):
else:
return ''
def get_tag_infos(path = tree_paths2["d00"]):
infos = get_infos(path)
def get_final_tag(treebank_tag):
if treebank_tag.startswith("NNP"):
return "PROPN"
elif treebank_tag.startswith('J') or treebank_tag == "ADJ":
return "ADJ"
elif treebank_tag.startswith('V') or treebank_tag == "MD":
return "VERB"
elif treebank_tag.startswith('N') or treebank_tag.startswith("PRO"):
return "NOUN"
elif treebank_tag.startswith('RB') or treebank_tag == "ADV":
return "ADV"
elif treebank_tag.startswith("CD"):
return "NUM"
else:
return ''
def get_tag_infos(tree_path):
infos = get_infos(tree_path)
new_list = []
for elm in infos:
a = elm[1:-1].split()
tag = get_tag(a[0])
tag = get_wn_tag(a[0])
gloss_tag = get_final_tag(a[0])
token = a[1]
if token == "'s":
token = "is"
if tag == "":
new_list.append([str(wnl.lemmatize(token)).lower(), a[0], tag])
new_list.append([str(wnl.lemmatize(token)).lower(), a[0], tag, gloss_tag])
else:
new_list.append([wnl.lemmatize(token, pos=tag), a[0], tag])
new_list.append([wnl.lemmatize(token, pos=tag).lower(), a[0], tag, gloss_tag])
return new_list
def get_tag_and_info_list(key = "d00"):
token_list = get_tag_infos(tree_paths2[key])
info_list = get_sentence(key)
def get_tag_and_info_list(key, file_path, tree_path):
token_list = get_tag_infos(tree_path[key])
info_list = get_sentence(key, file_path)
return token_list, info_list
def delete_stopwords(key, path):
stopwords = get_stopword_list(path)
tokens, infos = get_tag_and_info_list(key)
"""
print (len(tokens))
print (len(get_infos(path)))
print (len(infos))
for idx in range(len(infos)):
print (infos[idx])
try:
print (tokens[idx])
print (get_infos(path)[idx])
except IndexError:
print ("foo")
print ("\n")
"""
def get_gloss_lemmas(path):
with open (path, "r") as f:
gloss_lemma_set = set()
for line in f.readlines():
gloss_lemma_set.add(line.split()[1])
return gloss_lemma_set
def delete_stopwords(stop_path, key, gloss_path, file_path, tree_path):
stopwords = get_stopword_list(stop_path)
tokens, infos = get_tag_and_info_list(key, file_path, tree_path)
new_tokens = []
new_infos = []
gloss = get_gloss_lemmas(gloss_path)
punct = [",", ".", "!", "?", ":"]
for idx in range(len(infos)):
if infos[idx] == None:
if tokens[idx][0] in stopwords:
continue
elif tokens[idx][0] not in gloss:
continue
elif tokens[idx][0] in punct:
continue
else:
new_tokens.append(tokens[idx])
new_infos.append(infos[idx])
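For orientation, a hedged sketch of what the two tag mappers in this hunk return for a few Treebank tags; only the branches visible in the diff are asserted in the comments, the rest depends on elided code:
for treebank_tag in ["JJ", "VBD", "MD", "NNP", "DT"]:
    print(treebank_tag, repr(get_wn_tag(treebank_tag)), repr(get_final_tag(treebank_tag)))
# JJ  -> get_wn_tag: 'a'   get_final_tag: 'ADJ'
# VBD -> get_wn_tag: 'v'   get_final_tag: 'VERB'
# MD  ->                   get_final_tag: 'VERB'
# NNP ->                   get_final_tag: 'PROPN'
# DT  ->                   get_final_tag: ''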
@@ -174,7 +192,6 @@ def delete_stopwords(key, path):
new_infos.append(infos[idx])
return new_tokens, new_infos
def get_sats(tokens, info):
sat_dict = {}
for elm in info:
@@ -209,23 +226,23 @@ def get_sats(tokens, info):
else:
new_tokens.append(tokens[idx])
new_info.append(info[idx])
print (len(info), len(new_info))
print (len(tokens), len(new_tokens))
return [new_tokens, new_info]
def write_pkl(path="stopwords.txt"):
for key in get_xml().keys():
without_stop_tokens, without_stop_info = delete_stopwords(key,path)
def write_pkl(version = 2, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"):
file_path = ""
tree_path = ""
if version == 2:
file_path = file_path2
tree_path = tree_paths2
elif version == 3:
file_path = file_path3
tree_path = tree_paths3
for key in get_xml(file_path).keys():
without_stop_tokens, without_stop_info = delete_stopwords(stop_path, key, gloss_path, file_path, tree_path)
output = get_sats(without_stop_tokens, without_stop_info)
with open(key+".pkl", "wb") as f:
pkl.dump(output, f)
print (key, "Done")
#write_pkl()
def read_pkl(path = "d000.pkl"):
with open (path, "rb") as f:
li = pkl.load(f)
return li
if __name__ == "__main__":
write_pkl(version=3)
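A hedged usage sketch of the entry points above, assuming the Senseval XML/.mrg files plus stopwords.txt and gloss_mapping.txt are present next to the script:
write_pkl(version=2)                    # one <key>.pkl per document, e.g. d00.pkl for Senseval-2
tokens, infos = read_pkl("d00.pkl")     # get_sats output: [filtered_tokens, filtered_infos]
print(tokens[0])                        # e.g. [lemma, treebank_tag, wn_pos, coarse_pos]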
This diff is collapsed.
@@ -42,7 +42,8 @@ def open_mapping(filename):
def open_sense_keys(filename):
"""
open wn30-17 sense key mapping
returns a dictionary {(syn_id_30, pos):sense_key_17}
"""
with open(filename, 'r') as input:
sense_keys = {(line.split()[0], line.split()[3]):line.split()[2] for line in input.readlines()}
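A hedged illustration of what the comprehension above yields; the column order of the actual wn30-17 mapping file is assumed from the docstring, and the sample line is invented:
# hypothetical mapping line: "02084442 dog%1:18:00:: dog%1:05:00:: n"
# assumed columns: syn_id_30, sense_key_30, sense_key_17, pos
# -> sense_keys[("02084442", "n")] == "dog%1:05:00::"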
@@ -52,11 +53,10 @@ def map_words(sentence):
"""
takes a list of ambig words
maps them to potential synsets
returns a list of synset_id lists
returns a list of synset_id lists [ [[w1s1],[w1s2],[w1s3]] , [[w2s1],...] ,...]
"""
pos = ['n','v','a','r']
ambig_list = []
#split = False
def get_lem_id(token):
if token in lemmata_mapping.keys():
ambig_list.append(lemmata_mapping[token][1])
@@ -76,20 +76,17 @@ def map_words(sentence):
for word in sentence:
add = False
#print(word)
add = get_lem_id(word[0]+'/'+word[1])
#print(add)
if add: continue
elif '-' in word[0]:
words = word[0].split('-')
for w in words:
add = get_node_id(w)
#if add: split = True
else:
add = get_node_id(word[0])
if not add: ambig_list.append(['U'])
#print(ambig_list)
return ambig_list
def embed(node_id):
@@ -97,23 +94,12 @@
takes a node id (int)
returns its embedding (array)
"""
#l1 = pos_embeddings
#l2 = lemmata_embeddings
embedding = np.concatenate((id_embeddings[node_id],pos_embeddings[node_id],lex_file_embeddings[node_id],lemmata_embeddings[node_id]), axis=0)
labels = (id_embeddings[node_id],pos_embeddings[node_id],lex_file_embeddings[node_id],lemmata_embeddings[node_id])
embedding = np.concatenate(labels, axis=0)
return embedding
#def concatenate(l1 , l2):
"""
intended: concatenate the embeddings of selected labels per node
--> should become embed()
current: concatenates the embeddings of selected labels across all nodes
"""
#embeddings = [np.concatenate((l1[i], l2[i]), axis=0) for i in range(len(l1))]
#return embeddings
def get_distance(node_combi):
"""
takes a list of node embedding lists
...
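A hedged toy version of the per-node concatenation that embed() performs, with invented 1-D arrays standing in for the real id/pos/lex_file/lemmata embedding tables:
import numpy as np
id_embeddings       = {7: np.array([0.1, 0.2])}
pos_embeddings      = {7: np.array([0.3])}
lex_file_embeddings = {7: np.array([0.4, 0.5])}
lemmata_embeddings  = {7: np.array([0.6])}
labels = (id_embeddings[7], pos_embeddings[7], lex_file_embeddings[7], lemmata_embeddings[7])
print(np.concatenate(labels, axis=0))   # [0.1 0.2 0.3 0.4 0.5 0.6]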