From bb4d93f978af779b5c673685a4f6bf6c056400f8 Mon Sep 17 00:00:00 2001
From: reichelt <reichelt@login.cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 04:01:58 +0200
Subject: [PATCH] Fix binary-mode open(), bytes name matching, and CSV output path

---
 count_name_occurrences.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
index 73bcf9e..fc1b8f4 100644
--- a/count_name_occurrences.py
+++ b/count_name_occurrences.py
@@ -8,7 +8,7 @@ df["occurrences_in_wikipedia"] = 0
 
 CHUNK_SIZE = 1024 * 1024  # 1 MB
 
-with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f:
+with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f:
     CHUNK_NO = 1
     while True:
         print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
@@ -18,13 +18,13 @@ with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f
             break
 
         for index, row in df.iterrows():
-            name = row['name']
+            name = row['name'].encode()
             count = wikipedia_text.count(name)
             df.at[index, "occurrences_in_wikipedia"] += count
         
         CHUNK_NO += 1
 
-df.to_csv("./data/names_nationality_wikipedia.csv", index=False)
+df.to_csv("../data/names_nationality_wikipedia.csv", index=False)
 
 avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
 for nationality, avg_count in avg_occurrences.items():
-- 
GitLab