From bb4d93f978af779b5c673685a4f6bf6c056400f8 Mon Sep 17 00:00:00 2001 From: reichelt <reichelt@login.cl.uni-heidelberg.de> Date: Fri, 9 Jun 2023 04:01:58 +0200 Subject: [PATCH] Fix small errors --- count_name_occurrences.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/count_name_occurrences.py b/count_name_occurrences.py index 73bcf9e..fc1b8f4 100644 --- a/count_name_occurrences.py +++ b/count_name_occurrences.py @@ -8,7 +8,7 @@ df["occurrences_in_wikipedia"] = 0 CHUNK_SIZE = 1024 * 1024 # 1 MB -with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f: +with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f: CHUNK_NO = 1 while True: print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r") @@ -18,13 +18,13 @@ with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f break for index, row in df.iterrows(): - name = row['name'] + name = row['name'].encode() count = wikipedia_text.count(name) df.at[index, "occurrences_in_wikipedia"] += count CHUNK_NO += 1 -df.to_csv("./data/names_nationality_wikipedia.csv", index=False) +df.to_csv("../data/names_nationality_wikipedia.csv", index=False) avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean() for nationality, avg_count in avg_occurrences.items(): -- GitLab