Commit 0b84d2a3 authored by Aileen Reichelt

Add preprocessing and counting scripts

parent 1b1f61c1
"""Check how often the most common names of each nationality
occurr in the Wikipedia snapshot used for GloVe training."""
occur in the Wikipedia snapshot used for GloVe training.
To be used when there is no vocab count file yet."""
import pandas as pd
......
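# The body of this script is collapsed in the diff above. What follows is a
# hedged sketch of how such a direct corpus count could look, assuming a
# preprocessed, lowercased, one-article-per-line Wikipedia text file and a
# names.txt list; the file paths and the use of collections.Counter are
# assumptions for illustration, not taken from the commit (pandas is imported
# above as pd).
from collections import Counter
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = {name.strip().lower() for name in names_file}

token_counts = Counter()
with open("english_wikipedia_preprocessed.txt", "r", encoding="utf-8") as corpus:
    for line in tqdm(corpus):
        # Only count tokens that are names of interest to bound memory use.
        token_counts.update(tok for tok in line.split() if tok in names)

# One row per name, mirroring the CSV layout of the lookup script below.
pd.DataFrame(
    {"name": sorted(names), "count": [token_counts[n] for n in sorted(names)]}
).to_csv("name_counts.csv", index=False)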
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm
with open("names.txt", "r", encoding="utf-8") as names_file:
names = names_file.readlines()
names = [name.strip().lower() for name in names]
counts = []
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
vocab = vocab_file.readlines()
for name in tqdm(names):  # linear scan over the vocabulary for each name (see dictionary sketch below)
    found = False
    for line in vocab:
        token, count = line.strip().split()
        if token == name:
            counts.append(int(count))
            found = True
            break
    if not found:
        counts.append(0)
print(len(names))
print(len(counts))
with open("name_counts.csv", "w+", encoding="utf-8") as output:
for i, name in enumerate(names):
output.write(f"{name},{counts[i]}\n")
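# The nested loop above rescans the full vocabulary once per name. An
# alternative worth considering is building a dictionary from vocab.txt once,
# which makes each lookup O(1). This is a sketch of that approach under the
# same file layout (token and count separated by whitespace, one per line);
# it is not part of the commit.
vocab_counts = {}
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = int(count)

counts = [vocab_counts.get(name, 0) for name in names]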
"""
Download Wikipedia dump using huggingface and preprocess it
using nltk tokenizer, lowercasing, punctuation removal
"""
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from datasets import load_dataset
wikipedia = load_dataset("wikipedia", "20220301.en")
wikipedia = wikipedia["train"]
with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
"w+", encoding="utf-8") as f:
for article in tqdm(wikipedia):
tokenized = word_tokenize(article["text"], language='english')
tokenized = [token.lower() for token in tokenized]
        joined = " ".join(tokenized)
        f.write(joined + "\n")
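# Note: word_tokenize requires the NLTK tokenizer models to be downloaded once
# per environment before this script runs (the resource may be named
# "punkt_tab" on newer NLTK releases):
#
#     import nltk
#     nltk.download("punkt")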