Skip to content
Snippets Groups Projects
Commit 25623fb7 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Attempt to read Wiki in chunks

parent 98ce9184
No related branches found
No related tags found
No related merge requests found
......@@ -4,16 +4,25 @@ occur in the Wikipedia snapshot used for GloVe training."""
import pandas as pd

# Number of characters requested per read. The file is opened in text mode,
# so this counts characters, not bytes (roughly 1 MB for mostly-ASCII text).
CHUNK_SIZE = 1024 * 1024
CORPUS_PATH = "../data/wikipedia/wikipedia_corpus.txt"


def read_chunks(path, chunk_size=CHUNK_SIZE):
    """Yield the text file at *path* in pieces of at most *chunk_size* characters.

    The file is opened in text mode with UTF-8 decoding; combining binary
    mode ("rb") with an ``encoding`` argument makes ``open`` raise
    ``ValueError``. A progress line is printed before every read and a
    completion message once the end of the file is reached.
    """
    with open(path, encoding="utf-8") as f:
        chunk_no = 1
        while True:
            # The 8100 MB total is a hard-coded estimate of the corpus size.
            print(f"reading Wikipedia data: approx. {chunk_no} MB/8100 MB", end="\r")
            chunk = f.read(chunk_size)
            if not chunk:
                print("reading completed")
                return
            yield chunk
            chunk_no += 1


def count_name_occurrences(names, chunks):
    """Count how often each name occurs in a text delivered as chunks.

    Args:
        names: iterable of strings to search for.
        chunks: iterable of consecutive pieces of the text.

    Returns:
        A list of ints, one per entry of *names* and in the same order.

    Occurrences that straddle a chunk boundary are counted as well: for each
    name of length k, the last k-1 characters seen before the boundary are
    joined with the first k-1 characters of the new chunk. Every match inside
    that window necessarily crosses the boundary, so nothing that was already
    counted within a single chunk is counted twice.
    """
    names = list(names)
    counts = [0] * len(names)
    # Longest stretch of already-read text any name could still need.
    max_overlap = max((len(name) for name in names), default=1) - 1
    tail = ""
    for chunk in chunks:
        for i, name in enumerate(names):
            counts[i] += chunk.count(name)
            overlap = len(name) - 1
            if overlap > 0 and tail:
                window = tail[-overlap:] + chunk[:overlap]
                counts[i] += window.count(name)
        if max_overlap <= 0:
            tail = ""
        elif len(chunk) >= max_overlap:
            tail = chunk[-max_overlap:]
        else:
            # Chunk shorter than the overlap: keep part of the older tail too.
            tail = (tail + chunk)[-max_overlap:]
    return counts


def main():
    """Add a Wikipedia occurrence count to the names table and save it as CSV."""
    df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
    df["occurrences_in_wikipedia"] = count_name_occurrences(
        df["name"], read_chunks(CORPUS_PATH)
    )
    # NOTE(review): the inputs are read from "../data/" but the result is
    # written to "./data/" - confirm which working directory is intended.
    df.to_csv("./data/names_nationality_wikipedia.csv", index=False)


if __name__ == "__main__":
    main()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment