From 0b84d2a36c6c9cd171fc7109d1b0d18a8919bdd1 Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Tue, 23 Jan 2024 16:40:49 +0100
Subject: [PATCH] Add preprocessing and counting scripts

---
 count_name_occurrences_wikipedia.py |  3 ++-
 count_names_glove_vocab.py          | 29 +++++++++++++++++++++++++++++
 preprocess_wikipedia.py             | 19 +++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 count_names_glove_vocab.py
 create mode 100644 preprocess_wikipedia.py

diff --git a/count_name_occurrences_wikipedia.py b/count_name_occurrences_wikipedia.py
index fc1b8f4..2544238 100644
--- a/count_name_occurrences_wikipedia.py
+++ b/count_name_occurrences_wikipedia.py
@@ -1,5 +1,6 @@
 """Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""
 
 import pandas as pd
 
diff --git a/count_names_glove_vocab.py b/count_names_glove_vocab.py
new file mode 100644
index 0000000..1e95214
--- /dev/null
+++ b/count_names_glove_vocab.py
@@ -0,0 +1,29 @@
+"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
+from tqdm import tqdm
+
+with open("names.txt", "r", encoding="utf-8") as names_file:
+    names = names_file.readlines()
+
+names = [name.strip().lower() for name in names]
+
+counts = []
+
+with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
+    vocab = vocab_file.readlines()
+    for name in tqdm(names):  # linear scan per name; see the dict-based sketch below
+        found = False
+        for line in vocab:
+            token, count = line.strip().split()
+            if token == name:
+                counts.append(int(count))
+                found = True
+                break
+        if not found:
+            counts.append(0)
+
+print(len(names))   # sanity check: should match len(counts)
+print(len(counts))
+
+with open("name_counts.csv", "w+", encoding="utf-8") as output:
+    for i, name in enumerate(names):
+        output.write(f"{name},{counts[i]}\n")
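Note: the per-name scan above is O(names x vocab). A dict-based variant (a minimal
sketch, not part of the patch; it assumes the same one-entry-per-line "token count"
vocab format) builds the mapping once and looks each name up in constant time:

    # Build a token -> count mapping in a single pass over the vocab file.
    vocab_counts = {}
    with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
        for line in vocab_file:
            token, count = line.strip().split()
            vocab_counts[token] = int(count)

    # Missing names default to a count of 0, matching the script's behaviour.
    counts = [vocab_counts.get(name, 0) for name in names]
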
diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py
new file mode 100644
index 0000000..eb8806d
--- /dev/null
+++ b/preprocess_wikipedia.py
@@ -0,0 +1,19 @@
+"""
+Download Wikipedia dump using huggingface and preprocess it
+using nltk tokenizer, lowercasing, punctuation removal
+"""
+
+from tqdm import tqdm
+from nltk.tokenize import word_tokenize
+from datasets import load_dataset
+
+wikipedia = load_dataset("wikipedia", "20220301.en")
+wikipedia = wikipedia["train"]
+
+with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
+          "w+", encoding="utf-8") as f:
+    for article in tqdm(wikipedia):
+        tokenized = word_tokenize(article["text"], language="english")
+        tokenized = [token.lower() for token in tokenized]
+        joined = " ".join(tokenized)
+        f.write(joined + "\n")
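Note: word_tokenize depends on NLTK's Punkt tokenizer models, which are not bundled
with the nltk package. A one-time download (a sketch to run once before the script;
it assumes network access) avoids a LookupError at runtime:

    import nltk
    nltk.download("punkt")  # fetch the Punkt models used by word_tokenize
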
-- 
GitLab