From 094c3e5b601c15ec188bfd711a3a83bb01c22b8f Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 07:52:04 +0200
Subject: [PATCH] Add script for checking fasttext vocab

---
 check_fasttext_vocab.py                         | 17 +++++++++++++++++
 ...es.py => count_name_occurrences_wikipedia.py |  0
 2 files changed, 17 insertions(+)
 create mode 100644 check_fasttext_vocab.py
 rename count_name_occurrences.py => count_name_occurrences_wikipedia.py (100%)

diff --git a/check_fasttext_vocab.py b/check_fasttext_vocab.py
new file mode 100644
index 0000000..af5c9ab
--- /dev/null
+++ b/check_fasttext_vocab.py
@@ -0,0 +1,17 @@
+"""Find out vocab counts and positions for names in
+names lists, in German fasttext model."""
+
+import fasttext
+import pandas as pd
+
+df = pd.read_csv("../data/names_nationality_data.csv")
+
+model = fasttext.load_model("cc.de.300.bin")
+# model.words is a list, so testing each name against it is a linear scan
+# per row; build a set once so every lookup is a constant-time hash probe.
+vocab = set(model.words)
+
+# New 0/1 column: 1 if the name is an entry of the model vocabulary.
+df["in_fasttext_vocab"] = df["name"].isin(vocab).astype(int)
+
+df.to_csv("../data/names_nationality_fasttext.csv", index=False)
diff --git a/count_name_occurrences.py b/count_name_occurrences_wikipedia.py
similarity index 100%
rename from count_name_occurrences.py
rename to count_name_occurrences_wikipedia.py
-- 
GitLab