From 98ce918416f8d31505846a1eb676cc2f8c30c5ce Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 02:54:54 +0200
Subject: [PATCH] Save updated dataframe

---
 count_name_occurrences.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
index 11a57ce..3fe1fd5 100644
--- a/count_name_occurrences.py
+++ b/count_name_occurrences.py
@@ -3,7 +3,7 @@ occurr in the Wikipedia snapshot used for GloVe training."""
 
 import pandas as pd
 
-df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality"])
+df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
 
 with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
     wikipedia_text = f.read()
@@ -15,6 +15,8 @@ for index, row in df.iterrows():
     count = wikipedia_text.count(name)
     df.at[index, "occurrences_in_wikipedia"] = count
 
+df.to_csv("../data/names_nationality_wikipedia.csv", index=False)  # same ../data dir the inputs are read from
+
 avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
 for nationality, avg_count in avg_occurrences.items():
     print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")
-- 
GitLab