"""Check how often the most common names of each nationality
occur in the Wikipedia snapshot used for GloVe training.

Names are matched case-sensitively and as whole words, so a short name
such as "Ann" is not counted inside "Anna" or "MaryAnn".
"""

import pandas as pd

NAMES_CSV_PATH = "../data/names_nationality.csv"
WIKIPEDIA_CORPUS_PATH = "../data/wikipedia/wikipedia_corpus.txt"
COUNT_COLUMN = "occurrences_in_wikipedia"


def _is_word_char(char):
    """Return True if *char* can be part of a word (alphanumeric or "_")."""
    return char.isalnum() or char == "_"


def count_occurrences(text, name, *, whole_word=True):
    """Count non-overlapping, case-sensitive occurrences of *name* in *text*.

    Args:
        text: The corpus to search.
        name: The name to look for. An empty name yields 0, because
            ``str.count("")`` would otherwise report ``len(text) + 1``.
        whole_word: When True (default), a hit only counts if it is not
            directly preceded or followed by a word character. When False,
            plain substring counting (``str.count``) is used.

    Returns:
        The number of matches as an int.
    """
    if not name:
        return 0
    if not whole_word:
        return text.count(name)

    text_length = len(text)
    name_length = len(name)
    count = 0
    # str.find does the scanning in C; only the actual hits are inspected in
    # Python, which keeps this usable on a corpus held fully in memory.
    start = text.find(name)
    while start != -1:
        end = start + name_length
        starts_word = start == 0 or not _is_word_char(text[start - 1])
        ends_word = end == text_length or not _is_word_char(text[end])
        if starts_word and ends_word:
            count += 1
            # Continue after the match so that matches never overlap.
            start = text.find(name, end)
        else:
            start = text.find(name, start + 1)
    return count


def add_occurrence_counts(df, text):
    """Return a copy of *df* with an ``occurrences_in_wikipedia`` column.

    Rows whose ``name`` is missing are dropped, since there is nothing to
    search for. Each distinct name is searched only once, even if it is
    listed several times (e.g. under more than one nationality).

    Args:
        df: DataFrame with at least a ``name`` column.
        text: The corpus to search.

    Returns:
        A new DataFrame; *df* itself is not modified.
    """
    result = df.dropna(subset=["name"]).copy()
    counts_by_name = {
        name: count_occurrences(text, str(name))
        for name in result["name"].unique()
    }
    result[COUNT_COLUMN] = result["name"].map(counts_by_name).astype(int)
    return result


def main():
    """Load the names and the corpus, then print the average count per nationality."""
    df = pd.read_csv(NAMES_CSV_PATH, usecols=["name", "nationality"])

    with open(WIKIPEDIA_CORPUS_PATH, encoding="utf-8") as f:
        wikipedia_text = f.read()

    df = add_occurrence_counts(df, wikipedia_text)

    avg_occurrences = df.groupby("nationality")[COUNT_COLUMN].mean()
    for nationality, avg_count in avg_occurrences.items():
        print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")


if __name__ == "__main__":
    main()