Commit 094c3e5b authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Add script for checking fasttext vocab

parent 7e9f3739
Loading
Loading
Loading
Loading
+17 −0
Original line number Diff line number Diff line
"""Flag which names from the names list are present in the vocabulary
of the German fasttext model (cc.de.300.bin)."""

import fasttext
import pandas as pd

# Load the names list; the script reads its "name" column below.
df = pd.read_csv("../data/names_nationality_data.csv")

model = fasttext.load_model("cc.de.300.bin")

# model.words is a list of every vocabulary entry, so testing membership
# against it directly rescans the whole list for each name. Build a set
# once so that each lookup is a constant-time hash probe instead.
vocab = set(model.words)

# 1 if the name matches a vocabulary entry exactly (case-sensitive), else 0.
# Missing or non-string names never match and therefore stay 0.
df["in_fasttext_vocab"] = df["name"].isin(vocab).astype(int)

df.to_csv("../data/names_nationality_fasttext.csv", index=False)