Commit bb4d93f9 authored by reichelt
Browse files

Fix small errors

parent 25623fb7
Loading
Loading
Loading
Loading
+3 −3
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ df["occurrences_in_wikipedia"] = 0

CHUNK_SIZE = 1024 * 1024  # 1 MB

with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f:
with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f:
    CHUNK_NO = 1
    while True:
        print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
@@ -18,13 +18,13 @@ with open("../data/wikipedia/wikipedia_corpus.txt", "rb", encoding="utf-8") as f
            break

        for index, row in df.iterrows():
            name = row['name']
            name = row['name'].encode()
            count = wikipedia_text.count(name)
            df.at[index, "occurrences_in_wikipedia"] += count
        
        CHUNK_NO += 1

df.to_csv("./data/names_nationality_wikipedia.csv", index=False)
df.to_csv("../data/names_nationality_wikipedia.csv", index=False)

avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
for nationality, avg_count in avg_occurrences.items():