From 4944a70d50351702f4eaa6b617a8a1b8ee8ad13e Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Thu, 8 Jun 2023 18:37:42 +0200
Subject: [PATCH] Add script to count name occurrences in Wiki dump

---
 count_name_occurrences.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 count_name_occurrences.py

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
new file mode 100644
index 0000000..11a57ce
--- /dev/null
+++ b/count_name_occurrences.py
@@ -0,0 +1,20 @@
+"""Check how often the most common names of each nationality
+occur in the Wikipedia snapshot used for GloVe training."""
+
+import re
+
+import pandas as pd
+
+df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality"])
+
+with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
+    wikipedia_text = f.read()
+
+# Whole-word matches only: str.count() also counted names inside longer words.
+df["occurrences_in_wikipedia"] = [
+    sum(1 for _ in re.finditer(rf"(?<!\w){re.escape(name)}(?!\w)", wikipedia_text))
+    for name in df["name"]
+]
+avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
+for nationality, avg_count in avg_occurrences.items():
+    print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")
-- 
GitLab