From 0b84d2a36c6c9cd171fc7109d1b0d18a8919bdd1 Mon Sep 17 00:00:00 2001
From: Aileen Reichelt <reichelt@cl.uni-heidelberg.de>
Date: Tue, 23 Jan 2024 16:40:49 +0100
Subject: [PATCH] Add preprocessing and counting scripts

---
 count_name_occurrences_wikipedia.py |  3 ++-
 count_names_glove_vocab.py          | 29 +++++++++++++++++++++++++++++
 preprocess_wikipedia.py             | 19 +++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 count_names_glove_vocab.py
 create mode 100644 preprocess_wikipedia.py

diff --git a/count_name_occurrences_wikipedia.py b/count_name_occurrences_wikipedia.py
index fc1b8f4..2544238 100644
--- a/count_name_occurrences_wikipedia.py
+++ b/count_name_occurrences_wikipedia.py
@@ -1,5 +1,6 @@
 """Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""
 
 import pandas as pd
 
diff --git a/count_names_glove_vocab.py b/count_names_glove_vocab.py
new file mode 100644
index 0000000..1e95214
--- /dev/null
+++ b/count_names_glove_vocab.py
@@ -0,0 +1,29 @@
+"""Short helper script to look up the vocab counts of ~400 names in the GloVe vocabulary."""
+from tqdm import tqdm
+
+with open("names.txt", "r", encoding="utf-8") as names_file:
+    names = names_file.readlines()
+
+names = [name.strip().lower() for name in names]
+
+counts = []
+
+with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
+    vocab = vocab_file.readlines()
+    for name in tqdm(names):  # this is inefficient but using dictionaries doesn't work
+        found = False  # reset for each name before scanning the vocab
+        for line in vocab:
+            token, count = line.strip().split()
+            if token == name:
+                counts.append(count)
+                found = True
+                break
+        if not found:
+            counts.append(0)
+
+print(len(names))
+print(len(counts))
+
+with open("name_counts.csv", "w+", encoding="utf-8") as output:
+    for i, name in enumerate(names):
+        output.write(f"{name},{counts[i]}\n")
diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py
new file mode 100644
index 0000000..eb8806d
--- /dev/null
+++ b/preprocess_wikipedia.py
@@ -0,0 +1,19 @@
+"""
+Download a Wikipedia dump using the Hugging Face datasets library
+and preprocess it with the nltk word tokenizer and lowercasing.
+"""
+
+from tqdm import tqdm
+from nltk.tokenize import word_tokenize
+from datasets import load_dataset
+
+wikipedia = load_dataset("wikipedia", "20220301.en")
+wikipedia = wikipedia["train"]
+
+with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
+          "w+", encoding="utf-8") as f:
+    for article in tqdm(wikipedia):
+        tokenized = word_tokenize(article["text"], language="english")
+        tokenized = [token.lower() for token in tokenized]
+        joined = " ".join(tokenized)
+        f.write(joined + "\n")
-- 
GitLab