"""Count how often common names occur in the Wikipedia corpus.

Checks how often the most common names of each nationality occur in the
Wikipedia snapshot used for GloVe training, and prints the average number of
occurrences per nationality.
"""

import re

import pandas as pd

NAMES_CSV_PATH = "../data/names_nationality.csv"
WIKIPEDIA_CORPUS_PATH = "../data/wikipedia/wikipedia_corpus.txt"
COUNT_COLUMN = "occurrences_in_wikipedia"


def count_name_occurrences(text, name, *, whole_word=True):
    """Return the number of non-overlapping occurrences of *name* in *text*.

    Args:
        text: The corpus text to search.
        name: The name to look for; matched case-sensitively.
        whole_word: When true (the default), a match only counts if it is not
            directly preceded or followed by a word character, so "Ali" is
            not counted inside "Alice". When false, plain substring counting
            (``str.count``) is used.

    Returns:
        The occurrence count; 0 for an empty name.
    """
    if not name:
        # An empty needle would "match" at every position in the text.
        return 0
    if not whole_word:
        return text.count(name)
    # Lookarounds instead of \b so names that start or end with a
    # non-word character (e.g. an apostrophe) are still delimited correctly.
    pattern = re.compile(rf"(?<!\w){re.escape(name)}(?!\w)")
    # Iterate lazily: findall() would build a list with one entry per match.
    return sum(1 for _ in pattern.finditer(text))


def add_occurrence_counts(df, text):
    """Return a copy of *df* with a per-name occurrence count column.

    Rows whose ``name`` is missing are dropped, because a missing value
    cannot be searched for. The input frame is not modified.

    Args:
        df: Frame with at least a ``name`` column.
        text: The corpus text to search.

    Returns:
        A new frame with an added ``occurrences_in_wikipedia`` column.
    """
    named = df.dropna(subset=["name"])
    # Every count is a full scan of the corpus, so scan once per unique name.
    counts_by_name = {
        name: count_name_occurrences(text, name)
        for name in named["name"].unique()
    }
    counts = named["name"].map(counts_by_name).astype("int64")
    return named.assign(**{COUNT_COLUMN: counts})


def average_by_nationality(df):
    """Return the mean occurrence count per nationality as a Series."""
    return df.groupby("nationality")[COUNT_COLUMN].mean()


def main():
    """Load the name list and the corpus, then print the averages."""
    df = pd.read_csv(NAMES_CSV_PATH, usecols=["name", "nationality"])

    # NOTE: the whole corpus is read into memory in one go.
    with open(WIKIPEDIA_CORPUS_PATH, encoding="utf-8") as f:
        wikipedia_text = f.read()

    df = add_occurrence_counts(df, wikipedia_text)

    for nationality, avg_count in average_by_nationality(df).items():
        print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")


if __name__ == "__main__":
    main()