diff --git a/count_name_occurrences_wikipedia.py b/count_name_occurrences_wikipedia.py
index fc1b8f4b5a5616271793c95c0a05f0958c14fc21..2544238f4c5e00f3d8608ccec6b51d15922985b6 100644
--- a/count_name_occurrences_wikipedia.py
+++ b/count_name_occurrences_wikipedia.py
@@ -1,5 +1,6 @@
 """Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""
 
 import pandas as pd
 
diff --git a/count_names_glove_vocab.py b/count_names_glove_vocab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e952146fc3b07176d41a11858afd1441055ad8b
--- /dev/null
+++ b/count_names_glove_vocab.py
@@ -0,0 +1,29 @@
+"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
+from tqdm import tqdm
+
+with open("names.txt", "r", encoding="utf-8") as names_file:
+    names = names_file.readlines()
+
+names = [name.strip().lower() for name in names]
+
+counts = []
+
+with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
+    vocab = vocab_file.readlines()
+    for name in tqdm(names):  # linear scan over the full vocab for each name; slow but fine for ~400 names
+        found = False
+        for line in vocab:
+            token, count = line.strip().split()
+            if token == name:
+                counts.append(int(count))
+                found = True
+                break
+        if not found:
+            counts.append(0)
+
+print(len(names))  # sanity check: the two lengths should match
+print(len(counts))
+
+with open("name_counts.csv", "w+", encoding="utf-8") as output:
+    for i, name in enumerate(names):
+        output.write(f"{name},{counts[i]}\n")
diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb8806d636f5eebbc6daa2bab2d80bf63366b2ef
--- /dev/null
+++ b/preprocess_wikipedia.py
@@ -0,0 +1,20 @@
+"""
+Download Wikipedia dump using huggingface and preprocess it
+using nltk tokenizer, lowercasing, punctuation removal
+"""
+
+from tqdm import tqdm
+from nltk.tokenize import word_tokenize
+from datasets import load_dataset
+
+wikipedia = load_dataset("wikipedia", "20220301.en")
+wikipedia = wikipedia["train"]
+
+with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
+          "w+", encoding="utf-8") as f:
+    for article in tqdm(wikipedia):
+        tokenized = word_tokenize(article["text"], language='english')
+        tokenized = [token.lower() for token in tokenized]
+        joined = " ".join(tokenized)
+        f.write(joined + "\n")
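
A side note on count_names_glove_vocab.py: it scans the whole vocab list once per name, which is acceptable for ~400 names. If the lookup ever needs to scale, building a token-to-count dictionary from vocab.txt first avoids the nested loop. A minimal sketch of that variant, reusing the same names.txt, vocab.txt, and name_counts.csv paths as in the patch above:

    # Build a token -> count mapping once, then look each name up in O(1).
    with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
        vocab_counts = {}
        for line in vocab_file:
            token, count = line.strip().split()
            vocab_counts[token] = int(count)

    with open("names.txt", "r", encoding="utf-8") as names_file:
        names = [name.strip().lower() for name in names_file]

    with open("name_counts.csv", "w+", encoding="utf-8") as output:
        for name in names:
            # Names missing from the vocab get a count of 0, as in the script above.
            output.write(f"{name},{vocab_counts.get(name, 0)}\n")

Relatedly, word_tokenize in preprocess_wikipedia.py needs the NLTK punkt tokenizer data, which can be fetched once with:

    import nltk
    nltk.download("punkt")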