Commit 0b84d2a3 authored by Aileen Reichelt

Add preprocessing and counting scripts

parent 1b1f61c1
+2 −1
"""Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""

import pandas as pd
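
Only this script's docstring and pandas import appear in the diff. As a rough sketch of what counting without a vocab file could look like (assuming a hypothetical names.csv with "name" and "nationality" columns, plus the pre-tokenized corpus produced by the preprocessing script further below):

from collections import Counter

import pandas as pd

names_df = pd.read_csv("names.csv")  # hypothetical layout: columns "name" and "nationality"
targets = set(names_df["name"].str.lower())

token_counts = Counter()
with open("english_wikipedia_preprocessed.txt", "r", encoding="utf-8") as corpus:
    for line in corpus:  # one pre-tokenized, lowercased article per line
        token_counts.update(token for token in line.split() if token in targets)

names_df["count"] = names_df["name"].str.lower().map(token_counts).fillna(0).astype(int)
print(names_df.groupby("nationality")["count"].sum())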

+29 −0
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = names_file.readlines()

names = [name.strip().lower() for name in names]

counts = []

with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab = vocab_file.readlines()
    for name in tqdm(names):  # this is inefficient but using dictionaries doesn't work
        found = False  # reset for each name
        for line in vocab:
            token, count = line.strip().split()
            if token == name:
                counts.append(int(count))  # store as a number, matching the 0 fallback
                found = True
                break
        if not found:
            counts.append(0)  # name is out of vocabulary

# sanity check: there should be one count per name
print(len(names))
print(len(counts))

with open("name_counts.csv", "w+", encoding="utf-8") as output:
    for i, name in enumerate(names):
        output.write(f"{name},{counts[i]}\n")
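
The comment in the loop above says that using dictionaries doesn't work; for reference, a minimal dictionary-based sketch, assuming each line of vocab.txt is a whitespace-separated "token count" pair (the same format the linear scan parses):

vocab_counts = {}
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = int(count)  # one O(1) lookup table instead of a scan per name

counts = [vocab_counts.get(name, 0) for name in names]  # 0 for out-of-vocabulary names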
+19 −0
"""
Download a Wikipedia dump via Hugging Face datasets and preprocess it
with the nltk tokenizer, lowercasing, and punctuation removal.
"""

from tqdm import tqdm
from nltk.tokenize import word_tokenize  # needs the "punkt" tokenizer data: nltk.download("punkt")
from datasets import load_dataset

wikipedia = load_dataset("wikipedia", "20220301.en")  # English snapshot of 1 March 2022
wikipedia = wikipedia["train"]  # the dump ships as a single "train" split

with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
          "w+", encoding="utf-8") as f:
    for article in tqdm(wikipedia):
        tokenized = word_tokenize(article["text"], language="english")
        tokenized = [token.lower() for token in tokenized]
        joined = " ".join(tokenized)  # one lowercased, pre-tokenized article per output line
        f.write(joined + "\n")
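
One caveat: the docstring lists punctuation removal, but the loop above only tokenizes and lowercases. A minimal sketch of such a filter, assuming tokens consisting entirely of ASCII punctuation should be dropped (illustrative only, not part of this commit):

import string

PUNCTUATION = set(string.punctuation)

def remove_punctuation(tokens):
    """Drop tokens made up only of ASCII punctuation, e.g. "," or "--"."""
    return [token for token in tokens if not all(ch in PUNCTUATION for ch in token)]

# inside the loop above, after lowercasing:
# tokenized = remove_punctuation(tokenized)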