From 25623fb7618c681b8bbb1fe455af317bee960b65 Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 03:16:43 +0200
Subject: [PATCH] Attempt to read Wiki in chunks

---
 count_name_occurrences.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
index 3fe1fd5..73bcf9e 100644
--- a/count_name_occurrences.py
+++ b/count_name_occurrences.py
@@ -4,16 +4,25 @@ occurr in the Wikipedia snapshot used for GloVe training."""
 import pandas as pd
 
 df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
+df["occurrences_in_wikipedia"] = 0
 
-with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
-    wikipedia_text = f.read()
+CHUNK_SIZE = 1024 * 1024  # characters per read; ~1 MB for mostly-ASCII text
 
-df["occurrences_in_wikipedia"] = 0
+with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
+    CHUNK_NO = 1
+    while True:
+        print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
+        wikipedia_text = f.read(CHUNK_SIZE)
+        if not wikipedia_text:
+            print("reading completed")
+            break
 
-for index, row in df.iterrows():
-    name = row['name']
-    count = wikipedia_text.count(name)
-    df.at[index, "occurrences_in_wikipedia"] = count
+        for index, row in df.iterrows():
+            name = row['name']
+            count = wikipedia_text.count(name)
+            df.at[index, "occurrences_in_wikipedia"] += count
+        
+        CHUNK_NO += 1
 
 df.to_csv("./data/names_nationality_wikipedia.csv", index=False)
 
-- 
GitLab