From 25623fb7618c681b8bbb1fe455af317bee960b65 Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Fri, 9 Jun 2023 03:16:43 +0200
Subject: [PATCH] Attempt to read Wiki in chunks

---
Review notes:
- open() rejects encoding= in binary mode ("rb"), so the corpus is now read
  in text mode and CHUNK_SIZE counts characters rather than bytes.
- Names split across two chunks are counted through a short seam at each
  chunk boundary instead of being missed.
- Names are pulled into a plain list once rather than calling df.iterrows()
  for every chunk.
- Line breaks and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch; the index line still carries the
  original blob ids.

 count_name_occurrences.py | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/count_name_occurrences.py b/count_name_occurrences.py
index 3fe1fd5..73bcf9e 100644
--- a/count_name_occurrences.py
+++ b/count_name_occurrences.py
@@ -4,15 +4,38 @@ occurr in the Wikipedia snapshot used for GloVe training."""
 import pandas as pd
 
 df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
 
-with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
-    wikipedia_text = f.read()
+CHUNK_SIZE = 1024 * 1024  # characters per read (~1 MB of mostly-ASCII text)
 
-df["occurrences_in_wikipedia"] = 0
+# Plain lists avoid rebuilding pandas row objects for every chunk.
+names = df["name"].tolist()
+counts = [0] * len(names)
+# A name of length L can straddle a chunk boundary by at most L - 1 characters.
+max_overlap = max([len(name) for name in names] + [1]) - 1
 
-for index, row in df.iterrows():
-    name = row['name']
-    count = wikipedia_text.count(name)
-    df.at[index, "occurrences_in_wikipedia"] = count
+with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
+    CHUNK_NO = 1
+    tail = ""  # last max_overlap characters read before the current chunk
+    while True:
+        print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
+        wikipedia_text = f.read(CHUNK_SIZE)
+        if not wikipedia_text:
+            print("reading completed")
+            break
+
+        for i, name in enumerate(names):
+            counts[i] += wikipedia_text.count(name)
+            overlap = len(name) - 1
+            if tail and overlap > 0:
+                # The seam is too short to hold a match lying wholly in either
+                # chunk, so it only adds matches split across the boundary.
+                seam = tail[-overlap:] + wikipedia_text[:overlap]
+                counts[i] += seam.count(name)
+
+        # Guarded because [-0:] would keep the whole string.
+        tail = (tail + wikipedia_text)[-max_overlap:] if max_overlap else ""
+        CHUNK_NO += 1
+
+df["occurrences_in_wikipedia"] = counts
 
 df.to_csv("./data/names_nationality_wikipedia.csv", index=False)
-- 
GitLab