diff --git a/count_name_occurrences.py b/count_name_occurrences.py
--- a/count_name_occurrences.py
+++ b/count_name_occurrences.py
@@ -4,16 +4,31 @@ occurr in the Wikipedia snapshot used for GloVe training."""
 
 import pandas as pd
 
 df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
+names = df["name"].tolist()
+counts = [0] * len(names)
+
+# Read the corpus piece by piece so the multi-GB snapshot never has to fit in memory.
+CHUNK_SIZE = 1024 * 1024  # characters per read, roughly 1 MB of text
 
+# Text mode decodes UTF-8 incrementally, so a multi-byte character is never
+# split between two reads (binary mode would also reject the encoding argument).
 with open("../data/wikipedia/wikipedia_corpus.txt", encoding="utf-8") as f:
-    wikipedia_text = f.read()
+    previous_chunk = ""
+    for chunk_no, chunk in enumerate(iter(lambda: f.read(CHUNK_SIZE), ""), start=1):
+        print(f"reading Wikipedia data: approx. {chunk_no} MB/8100 MB", end="\r")
+        for i, name in enumerate(names):
+            counts[i] += chunk.count(name)
+            # A name may straddle the boundary between two chunks. The seam holds
+            # len(name) - 1 characters from each side, so any match found in it
+            # crosses the boundary and was not counted in either chunk.
+            reach = len(name) - 1
+            if reach > 0 and previous_chunk:
+                seam = previous_chunk[-reach:] + chunk[:reach]
+                counts[i] += seam.count(name)
+        previous_chunk = chunk
+    print("reading completed")
 
-df["occurrences_in_wikipedia"] = 0
+df["occurrences_in_wikipedia"] = counts
 
-for index, row in df.iterrows():
-    name = row['name']
-    count = wikipedia_text.count(name)
-    df.at[index, "occurrences_in_wikipedia"] = count
-
 df.to_csv("./data/names_nationality_wikipedia.csv", index=False)