count_name_occurrences_wikipedia.py  +2 −1

 """Check how often the most common names of each nationality
-occurr in the Wikipedia snapshot used for GloVe training."""
+occur in the Wikipedia snapshot used for GloVe training.
+To be used when there is no vocab count file yet."""
 import pandas as pd
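This diff only touches the docstring, so the counting logic itself is not visible here. As a rough sketch of what such a pass could look like when no vocab count file exists, one can stream the preprocessed corpus and tally matches with a Counter; the file paths below are reused from the sibling scripts and otherwise illustrative, not taken from this diff:

import pandas as pd
from collections import Counter
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = sorted({name.strip().lower() for name in names_file})
name_set = set(names)

counts = Counter()
corpus_path = "data/wikipedia/english_wikipedia_preprocessed.txt"  # illustrative path
with open(corpus_path, "r", encoding="utf-8") as corpus:
    # One lowercased, whitespace-tokenized article per line, so split() is enough.
    for line in tqdm(corpus):
        counts.update(token for token in line.split() if token in name_set)

pd.DataFrame({"name": names, "count": [counts[n] for n in names]}).to_csv(
    "name_counts.csv", index=False
)

Streaming the corpus line by line keeps memory usage flat even for a full Wikipedia dump.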
count_names_glove_vocab.py  0 → 100644  +29 −0

"""Short helper script to look up the vocab counts of ~400 names in the GloVe vocabulary."""
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = names_file.readlines()
names = [name.strip().lower() for name in names]

counts = []
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab = vocab_file.readlines()

for name in tqdm(names):
    # Linear scan over the vocab lines for every name; inefficient,
    # but workable (a dictionary-based version is sketched below).
    # found must be reset for each name, otherwise a lookup miss
    # after a hit appends nothing and counts drifts out of sync
    # with names.
    found = False
    for line in vocab:
        token, count = line.strip().split()
        if token == name:
            counts.append(int(count))
            found = True
            break
    if not found:
        counts.append(0)

print(len(names))
print(len(counts))

with open("name_counts.csv", "w", encoding="utf-8") as output:
    for i, name in enumerate(names):
        output.write(f"{name},{counts[i]}\n")
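As referenced in the comment above, a dictionary-based lookup is both simpler and faster: a single pass over vocab.txt builds a token → count map, after which every name is an O(1) lookup. A sketch under the same assumptions about the file layout (one "token count" pair per line):

from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = [name.strip().lower() for name in names_file]

# Single pass over the vocabulary builds the lookup table.
vocab_counts = {}
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = int(count)

# Names missing from the vocabulary default to 0, as in the loop version.
counts = [vocab_counts.get(name, 0) for name in tqdm(names)]

with open("name_counts.csv", "w", encoding="utf-8") as output:
    for name, count in zip(names, counts):
        output.write(f"{name},{count}\n")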
preprocess_wikipedia.py  0 → 100644  +19 −0

"""Download the English Wikipedia dump via Hugging Face datasets and preprocess it:
NLTK word tokenization, lowercasing, and punctuation removal."""
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from datasets import load_dataset

wikipedia = load_dataset("wikipedia", "20220301.en")
wikipedia = wikipedia["train"]

OUTPUT_PATH = "/workspace/students/reichelt/BA/data/wikipedia/english_wikipedia_preprocessed.txt"
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for article in tqdm(wikipedia):
        tokenized = word_tokenize(article["text"], language="english")
        tokenized = [token.lower() for token in tokenized]
        # Drop tokens with no alphanumeric character, i.e. pure punctuation.
        tokenized = [token for token in tokenized if any(c.isalnum() for c in token)]
        joined = " ".join(tokenized)
        f.write(joined + "\n")
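One setup note: word_tokenize depends on NLTK's Punkt models, which ship separately from the library itself. A one-time download before running the script:

import nltk

# Fetch the tokenizer models that word_tokenize relies on.
nltk.download("punkt")
# On recent NLTK releases the resource is named "punkt_tab" instead:
# nltk.download("punkt_tab")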