Commit 0b84d2a3 authored by Aileen Reichelt

Add preprocessing and counting scripts

parent 1b1f61c1
"""Check how often the most common names of each nationality
occurr in the Wikipedia snapshot used for GloVe training."""
occur in the Wikipedia snapshot used for GloVe training.
To be used when there is no vocab count file yet."""
import pandas as pd
......
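# The body of this script is collapsed in the diff above. What follows is a
# hedged sketch of how such a direct corpus count could look, assuming a
# preprocessed, lowercased, one-article-per-line Wikipedia text file and a
# names.txt list; the file paths and the use of collections.Counter are
# assumptions for illustration, not taken from the commit (pandas is imported
# above as pd).
from collections import Counter
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = {name.strip().lower() for name in names_file}

token_counts = Counter()
with open("english_wikipedia_preprocessed.txt", "r", encoding="utf-8") as corpus:
    for line in tqdm(corpus):
        # Only count tokens that are names of interest to bound memory use.
        token_counts.update(tok for tok in line.split() if tok in names)

# One row per name, mirroring the CSV layout of the lookup script below.
pd.DataFrame(
    {"name": sorted(names), "count": [token_counts[n] for n in sorted(names)]}
).to_csv("name_counts.csv", index=False)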
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm
with open("names.txt", "r", encoding="utf-8") as names_file:
names = names_file.readlines()
names = [name.strip().lower() for name in names]
counts = []
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
vocab = vocab_file.readlines()
for name in tqdm(names):  # linear scan over the vocabulary for each name (see dictionary sketch below)
    found = False
    for line in vocab:
        token, count = line.strip().split()
        if token == name:
            counts.append(int(count))
            found = True
            break
    if not found:
        counts.append(0)
print(len(names))
print(len(counts))
with open("name_counts.csv", "w+", encoding="utf-8") as output:
for i, name in enumerate(names):
output.write(f"{name},{counts[i]}\n")
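# The nested loop above rescans the full vocabulary once per name. An
# alternative worth considering is building a dictionary from vocab.txt once,
# which makes each lookup O(1). This is a sketch of that approach under the
# same file layout (token and count separated by whitespace, one per line);
# it is not part of the commit.
vocab_counts = {}
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = int(count)

counts = [vocab_counts.get(name, 0) for name in names]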
"""
Download Wikipedia dump using huggingface and preprocess it
using nltk tokenizer, lowercasing, punctuation removal
"""
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from datasets import load_dataset
wikipedia = load_dataset("wikipedia", "20220301.en")
wikipedia = wikipedia["train"]
with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
"w+", encoding="utf-8") as f:
for article in tqdm(wikipedia):
tokenized = word_tokenize(article["text"], language='english')
tokenized = [token.lower() for token in tokenized]
        joined = " ".join(tokenized)
        f.write(joined + "\n")
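# Note: word_tokenize requires the NLTK tokenizer models to be downloaded once
# per environment before this script runs (the resource may be named
# "punkt_tab" on newer NLTK releases):
#
#     import nltk
#     nltk.download("punkt")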