From 4944a70d50351702f4eaa6b617a8a1b8ee8ad13e Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Thu, 8 Jun 2023 18:37:42 +0200
Subject: [PATCH] Add script to count name occurrences in Wiki dump

---
 count_name_occurrences.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 count_name_occurrences.py

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
new file mode 100644
index 0000000..11a57ce
--- /dev/null
+++ b/count_name_occurrences.py
@@ -0,0 +1,20 @@
+"""Check how often the most common names of each nationality
+occur in the Wikipedia snapshot used for GloVe training."""
+
+import pandas as pd
+
+df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality"])
+
+with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
+    wikipedia_text = f.read()
+
+# Scan the corpus once per distinct name instead of once per row; a name
+# listed under several nationalities would otherwise be recounted each time.
+# NOTE: str.count matches substrings, so a name is also counted when it
+# appears inside a longer word.
+name_counts = {name: wikipedia_text.count(name) for name in df["name"].unique()}
+df["occurrences_in_wikipedia"] = df["name"].map(name_counts)
+
+avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
+for nationality, avg_count in avg_occurrences.items():
+    print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")
-- 
GitLab