Skip to content
Snippets Groups Projects
Commit bb4d93f9 authored by reichelt's avatar reichelt
Browse files

Fix small errors

parent 25623fb7
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ df["occurrences_in_wikipedia"] = 0
CHUNK_SIZE = 1024 * 1024 # 1 MB
with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f:
with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f:
CHUNK_NO = 1
while True:
print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
......@@ -18,13 +18,13 @@ with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f
break
for index, row in df.iterrows():
name = row['name']
name = row['name'].encode()
count = wikipedia_text.count(name)
df.at[index, "occurrences_in_wikipedia"] += count
CHUNK_NO += 1
df.to_csv("./data/names_nationality_wikipedia.csv", index=False)
df.to_csv("../data/names_nationality_wikipedia.csv", index=False)
avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
for nationality, avg_count in avg_occurrences.items():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment