Commit c21c81f9 authored by Aileen Reichelt

Change vocab size

parent f26cb9e5
+4 −6
@@ -6,7 +6,7 @@ from itertools import chain
 from bpemb import BPEmb
 from sklearn.metrics.pairwise import cosine_similarity as cosine
 
-bpemb_de = BPEmb(lang="de", dim=300, vs=200000)
+bpemb_de = BPEmb(lang="de", dim=300, vs=100000)
 print(type(bpemb_de))
 
 # Check presence of WEAT words in vocab
@@ -32,10 +32,8 @@ def get_word_embedding(word, model):
 
 
 # Create SemEval output
-with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/\
-    test/subtask1-monolingual/output/de.test.bpemb_og.output-mean.txt", mode="w", encoding="utf-8") as output:
-    with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/\
-        test/subtask1-monolingual/data/de.test.data.txt", mode="r", encoding="utf-8") as input:
+with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/output/de.test.bpemb_og.output-mean.txt", mode="w", encoding="utf-8") as output:
+    with open("/home/aileen/heiBOX/BA/bias-mitigation-ba/semeval2017-task2/SemEval17-Task2/test/subtask1-monolingual/data/de.test.data.txt", mode="r", encoding="utf-8") as input:
         for line in input.readlines():
             word1, word2 = line.split("\t")
             embd1 = get_word_embedding(word1, bpemb_de)
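
For context, a minimal runnable sketch of what the script around these changed lines plausibly does: load the German BPEmb model with the new vocabulary size (vs=100000) and score a word pair by the cosine similarity of mean-pooled subword embeddings. The body of get_word_embedding and the example word pair are assumptions; only the function's name, the BPEmb parameters, and the mean pooling suggested by the "-mean" suffix in the output filename come from the diff.

from bpemb import BPEmb
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Load German subword embeddings; vs=100000 is the vocabulary size
# this commit switches to (down from 200000).
bpemb_de = BPEmb(lang="de", dim=300, vs=100000)

def get_word_embedding(word, model):
    # Assumed implementation: model.embed(word) returns one 300-dim
    # vector per BPE subword; averaging them yields a single fixed-size
    # word vector, matching the "-mean" output filename.
    return model.embed(word).mean(axis=0)

# Hypothetical word pair; the real script reads tab-separated pairs
# from the SemEval-2017 Task 2 data file.
embd1 = get_word_embedding("Katze", bpemb_de)
embd2 = get_word_embedding("Hund", bpemb_de)

# cosine_similarity expects 2-D inputs, hence the reshape to (1, 300).
score = cosine(embd1.reshape(1, -1), embd2.reshape(1, -1))[0][0]
print(f"{score:.4f}")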