diff --git a/count_name_occurrences_wikipedia.py b/count_name_occurrences_wikipedia.py
index fc1b8f4b5a5616271793c95c0a05f0958c14fc21..2544238f4c5e00f3d8608ccec6b51d15922985b6 100644
--- a/count_name_occurrences_wikipedia.py
+++ b/count_name_occurrences_wikipedia.py
@@ -1,5 +1,6 @@
 """Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""
 
 import pandas as pd
 
diff --git a/count_names_glove_vocab.py b/count_names_glove_vocab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e952146fc3b07176d41a11858afd1441055ad8b
--- /dev/null
+++ b/count_names_glove_vocab.py
@@ -0,0 +1,29 @@
+"""Short helper script to look up the vocab counts of ~400 names in the GloVe vocabulary."""
+from tqdm import tqdm
+
+with open("names.txt", "r", encoding="utf-8") as names_file:
+    names = names_file.readlines()
+
+names = [name.strip().lower() for name in names]
+
+counts = []
+
+with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
+    vocab = vocab_file.readlines()
+    for name in tqdm(names):  # this is inefficient but using dictionaries doesn't work
+        found = False  # reset per name so misses are counted correctly
+        for line in vocab:
+            token, count = line.strip().split()
+            if token == name:
+                counts.append(count)
+                found = True
+                break
+        if not found:
+            counts.append(0)
+
+print(len(names))
+print(len(counts))
+
+with open("name_counts.csv", "w+", encoding="utf-8") as output:
+    for i, name in enumerate(names):
+        output.write(f"{name},{counts[i]}\n")
diff --git a/preprocess_wikipedia.py b/preprocess_wikipedia.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb8806d636f5eebbc6daa2bab2d80bf63366b2ef
--- /dev/null
+++ b/preprocess_wikipedia.py
@@ -0,0 +1,19 @@
+"""
+Download the Wikipedia dump via the Hugging Face datasets library
+and preprocess it with the NLTK word tokenizer and lowercasing.
+"""
+
+from tqdm import tqdm
+from nltk.tokenize import word_tokenize
+from datasets import load_dataset
+
+wikipedia = load_dataset("wikipedia", "20220301.en")
+wikipedia = wikipedia["train"]
+
+with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
+          "w+", encoding="utf-8") as f:
+    for article in tqdm(wikipedia):
+        tokenized = word_tokenize(article["text"], language="english")
+        tokenized = [token.lower() for token in tokenized]
+        joined = " ".join(tokenized)
+        f.write(joined + "\n")
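
If building a dictionary over the GloVe vocabulary is an option (the comment in count_names_glove_vocab.py suggests it was not in the original setup), the per-name scan could be replaced by a single pass over vocab.txt. A minimal sketch, not part of the patch above, assuming the same file paths and one "token count" pair per line:

    # Build the token -> count map once, then look each name up in O(1).
    with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
        vocab_counts = dict(line.split() for line in vocab_file)

    with open("names.txt", "r", encoding="utf-8") as names_file:
        names = [name.strip().lower() for name in names_file]

    # Names missing from the vocabulary get a count of 0, as in the script above.
    with open("name_counts.csv", "w", encoding="utf-8") as output:
        for name in names:
            output.write(f"{name},{vocab_counts.get(name, 0)}\n")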