Commit 0b84d2a3 authored by Aileen Reichelt

Add preprocessing and counting scripts

parent 1b1f61c1
+2 −1
"""Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""

import pandas as pd
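
Only this script's docstring and pandas import appear in the diff. As a rough sketch of what counting without a vocab file could look like (assuming a hypothetical names.csv with "name" and "nationality" columns, plus the pre-tokenized corpus produced by the preprocessing script further below):

from collections import Counter

import pandas as pd

names_df = pd.read_csv("names.csv")  # hypothetical layout: columns "name" and "nationality"
targets = set(names_df["name"].str.lower())

token_counts = Counter()
with open("english_wikipedia_preprocessed.txt", "r", encoding="utf-8") as corpus:
    for line in corpus:  # one pre-tokenized, lowercased article per line
        token_counts.update(token for token in line.split() if token in targets)

names_df["count"] = names_df["name"].str.lower().map(token_counts).fillna(0).astype(int)
print(names_df.groupby("nationality")["count"].sum())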

+29 −0
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = names_file.readlines()

names = [name.strip().lower() for name in names]

counts = []

with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab = vocab_file.readlines()
    for name in tqdm(names):  # this is inefficient but using dictionaries doesn't work
        found = False  # reset for each name
        for line in vocab:
            token, count = line.strip().split()
            if token == name:
                counts.append(int(count))  # store as a number, matching the 0 fallback
                found = True
                break
        if not found:
            counts.append(0)  # name is out of vocabulary

# sanity check: there should be one count per name
print(len(names))
print(len(counts))

with open("name_counts.csv", "w+", encoding="utf-8") as output:
    for i, name in enumerate(names):
        output.write(f"{name},{counts[i]}\n")
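
The comment in the loop above says that using dictionaries doesn't work; for reference, a minimal dictionary-based sketch, assuming each line of vocab.txt is a whitespace-separated "token count" pair (the same format the linear scan parses):

vocab_counts = {}
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = int(count)  # one O(1) lookup table instead of a scan per name

counts = [vocab_counts.get(name, 0) for name in names]  # 0 for out-of-vocabulary names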
+19 −0
"""
Download a Wikipedia dump via Hugging Face datasets and preprocess it
with the nltk tokenizer, lowercasing, and punctuation removal.
"""

from tqdm import tqdm
from nltk.tokenize import word_tokenize  # needs the "punkt" tokenizer data: nltk.download("punkt")
from datasets import load_dataset

wikipedia = load_dataset("wikipedia", "20220301.en")  # English snapshot of 1 March 2022
wikipedia = wikipedia["train"]  # the dump ships as a single "train" split

with open("/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt",
          "w+", encoding="utf-8") as f:
    for article in tqdm(wikipedia):
        tokenized = word_tokenize(article["text"], language="english")
        tokenized = [token.lower() for token in tokenized]
        joined = " ".join(tokenized)  # one lowercased, pre-tokenized article per output line
        f.write(joined + "\n")
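
One caveat: the docstring lists punctuation removal, but the loop above only tokenizes and lowercases. A minimal sketch of such a filter, assuming tokens consisting entirely of ASCII punctuation should be dropped (illustrative only, not part of this commit):

import string

PUNCTUATION = set(string.punctuation)

def remove_punctuation(tokens):
    """Drop tokens made up only of ASCII punctuation, e.g. "," or "--"."""
    return [token for token in tokens if not all(ch in PUNCTUATION for ch in token)]

# inside the loop above, after lowercasing:
# tokenized = remove_punctuation(tokenized)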