From 094c3e5b601c15ec188bfd711a3a83bb01c22b8f Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 07:52:04 +0200
Subject: [PATCH] Add script for checking fasttext vocab

---
 check_fasttext_vocab.py                         | 22 ++++++++++++++++++++++
 ...es.py => count_name_occurrences_wikipedia.py |  0
 2 files changed, 22 insertions(+)
 create mode 100644 check_fasttext_vocab.py
 rename count_name_occurrences.py => count_name_occurrences_wikipedia.py (100%)

diff --git a/check_fasttext_vocab.py b/check_fasttext_vocab.py
new file mode 100644
index 0000000..af5c9ab
--- /dev/null
+++ b/check_fasttext_vocab.py
@@ -0,0 +1,22 @@
+"""Flag which names from the names list are in the German fasttext vocab.
+
+Reads the names/nationality CSV, adds an ``in_fasttext_vocab`` column
+(1 if the name is a word in the model's vocabulary, 0 otherwise) and
+writes the result to a new CSV.
+"""
+
+import fasttext
+import pandas as pd
+
+df = pd.read_csv("../data/names_nationality_data.csv")
+
+model = fasttext.load_model("cc.de.300.bin")
+
+# model.words is a list: build a set once so that each membership test
+# is a hash lookup instead of a scan over the whole vocabulary.
+vocab = set(model.words)
+
+# Exact, case-sensitive match; a missing name is flagged 0.
+df["in_fasttext_vocab"] = df["name"].isin(vocab).astype(int)
+
+df.to_csv("../data/names_nationality_fasttext.csv", index=False)
diff --git a/count_name_occurrences.py b/count_name_occurrences_wikipedia.py
similarity index 100%
rename from count_name_occurrences.py
rename to count_name_occurrences_wikipedia.py
-- 
GitLab