diff --git a/count_name_occurrences.py b/count_name_occurrences.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a57ce276b696c38ca5466d93e42113e5882964
--- /dev/null
+++ b/count_name_occurrences.py
@@ -0,0 +1,47 @@
+"""Count how often the most common names of each nationality occur
+in the Wikipedia snapshot used for GloVe training.
+
+Reads the name/nationality table and the plain-text corpus from ``../data``
+(relative to the working directory), counts whole-word, case-sensitive
+occurrences of every name, and prints the mean count per nationality.
+"""
+
+import re
+
+import pandas as pd
+
+
+def count_whole_word(text: str, word: str) -> int:
+    """Return how many times *word* occurs in *text* as a whole word.
+
+    Matching is case-sensitive.  A hit must not be directly preceded or
+    followed by a word character, so "Ali" is not counted inside "Alice"
+    or "Alina".  An empty *word* yields 0.
+    """
+    if not word:
+        return 0
+    # Lookarounds rather than a plain word-boundary anchor, so names that
+    # begin or end with punctuation are still delimited correctly.
+    pattern = re.compile(rf"(?<!\w){re.escape(word)}(?!\w)")
+    # Iterate lazily: a common name may match very often and findall()
+    # would materialise every hit in a list.
+    return sum(1 for _ in pattern.finditer(text))
+
+
+df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality"])
+
+with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
+    wikipedia_text = f.read()
+
+# Each count is a full pass over the corpus, so scan once per distinct
+# name instead of once per row.
+counts_by_name = {
+    name: count_whole_word(wikipedia_text, str(name))
+    for name in df["name"].dropna().unique()
+}
+# Rows whose name is missing get NaN here and are skipped by mean() below.
+df["occurrences_in_wikipedia"] = df["name"].map(counts_by_name)
+
+avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
+for nationality, avg_count in avg_occurrences.items():
+    print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")