Commit abbd48e6 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Restore general script files

parent ba247f1c
Loading
Loading
Loading
Loading
+32 −0
Original line number Diff line number Diff line
"""Check how often the most common names of each nationality
occur in the Wikipedia snapshot used for GloVe training.
To be used when there is no vocab count file yet."""

import pandas as pd

df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
df["occurrences_in_wikipedia"] = 0

CHUNK_SIZE = 1024 * 1024  # 1 MB

# Encode every name once up front instead of once per DataFrame row per chunk.
encoded_names = [(index, row["name"].encode()) for index, row in df.iterrows()]

# A name that straddles a chunk boundary would never match inside either
# chunk alone.  Carry a tail of (longest name - 1) bytes from the previous
# chunk and search across the seam.
max_name_len = max((len(name) for _, name in encoded_names), default=1)

with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f:
    CHUNK_NO = 1
    tail = b""
    while True:
        print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
        wikipedia_text = f.read(CHUNK_SIZE)
        if not wikipedia_text:
            print("reading completed")
            break

        window = tail + wikipedia_text
        for index, name in encoded_names:
            # Matches lying wholly inside the tail were already counted in
            # the previous iteration, so subtract them out.
            count = window.count(name) - tail.count(name)
            df.at[index, "occurrences_in_wikipedia"] += count

        tail = wikipedia_text[-(max_name_len - 1):] if max_name_len > 1 else b""
        CHUNK_NO += 1

df.to_csv("../data/names_nationality_wikipedia.csv", index=False)

avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
for nationality, avg_count in avg_occurrences.items():
    print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")
+29 −0
Original line number Diff line number Diff line
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = names_file.readlines()

names = [name.strip().lower() for name in names]

# Build a token -> count table in a single pass over the vocabulary, then
# answer each name with an O(1) lookup.  This replaces the original
# O(names x vocab) nested scan and also fixes a latent NameError: the old
# code consulted a `found` flag before it was first assigned, which crashed
# whenever the very first name was absent from the vocabulary.
# NOTE(review): assumes vocab tokens are unique (a dict keeps the last
# duplicate, the old scan kept the first) — confirm against vocab.txt.
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab_counts = {}
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = count

# Missing names get 0, exactly as before (counts from the file stay strings;
# both render identically in the CSV below).
counts = [vocab_counts.get(name, 0) for name in tqdm(names)]

print(len(names))
print(len(counts))

with open("name_counts.csv", "w+", encoding="utf-8") as output:
    for i, name in enumerate(names):
        output.write(f"{name},{counts[i]}\n")

crawl_persondata.py

0 → 100644
+115 −0
Original line number Diff line number Diff line
"""Crawl persondata.toolforge.org for data about first names, output results to console"""
import re
import time
import requests
from collections import Counter
from bs4 import BeautifulSoup

def read_names_from_file(file_path):
    """Read one name per line from *file_path*.

    Leading/trailing whitespace is trimmed and blank lines are skipped.
    Returns the names as a list of strings, in file order.
    """
    names = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names


def get_number_of_results(name: str) -> int:
    """For a certain name, returns how many Wikipedia articles there are.
    The name must be capitalised.

    Retries the request up to 5 times (60 s pause between attempts); after
    that, or when the result sentence cannot be parsed, returns 0.
    """
    url = f"https://persondata.toolforge.org/index.php?name={name}"

    response = None
    for attempt in range(6):  # try 5 times (on 6th, return 0)
        if attempt == 5:
            print(f"Max retries exceeded for {name}")
            return 0
        try:
            response = requests.get(url, timeout=500)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Error in request for {name}: {exc}")
            time.sleep(60)
        else:
            break

    if response.status_code == 200:
        page = BeautifulSoup(response.text, "html.parser")
        # The hit count lives in a bold "Es wurden N Personen gefunden"
        # sentence inside the first paragraph.
        result_sentence = page.find("p").find(
            "b", string=re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
        )
        if result_sentence:
            hit = re.search(r"Es wurden (\d+) Personen gefunden", result_sentence.text)
            if hit:
                return int(hit.group(1))

    return 0

def crawl_persondata(names: list) -> dict:
    """Fetch nationality mentions from persondata for every name.

    Returns a dict mapping each name with at least one search result to a
    flat list of nationality strings; names with zero results are omitted.
    """
    nationality_lists = {}

    for position, name in enumerate(names):
        hit_count = get_number_of_results(name)  # how many articles were found?
        if hit_count > 0:
            # The first 100 results use a different URL structure (no start=).
            collected = process_page(name, 0)
            # Additional pages arrive in batches of 100.
            for offset in range(100, hit_count, 100):
                collected.extend(process_page(name, offset))
            nationality_lists[name] = collected

        time.sleep(5)  # be polite: 5 second pause after each name
        # Long cooldown every 100 names.  NOTE(review): this also fires at
        # position 0, i.e. before any work — confirm that is intentional.
        if position % 100 == 0:
            time.sleep(600)

    return nationality_lists

def process_page(name: str, start: int) -> list:
    """For one given page, extracts all nationality mentions.
    Returns them in a list like [Germany, Germany, USA, Armenia, USA, ...]
    so that more calls to this function can easily be appended to this list.
    The counting of nationalities comes in a later step."""

    # Page 1 (start == 0) has no start parameter in its URL.
    if start == 0:
        url = f"https://persondata.toolforge.org/index.php?name={name}"
    else:
        url = f"https://persondata.toolforge.org/index.php?name={name}&start={start}"

    response = requests.get(url, timeout=500)

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    nationalities = []
    for info_box in soup.find_all("div", class_="infografik"):
        # img tags carry both nationality and gender information; only the
        # flag images (alt="Flagge von ...") describe nationality, and their
        # "title" attribute holds the country name.
        flag_images = info_box.find_all("img", alt=re.compile(r"Flagge von .+", re.IGNORECASE))
        for flag_image in flag_images:
            country = flag_image.get("title")
            if country:
                nationalities.append(country)

    return nationalities

def main():
    """Crawl persondata for every name in names.txt and print, per name,
    the five most frequent nationalities with their share of all mentions."""
    names = read_names_from_file("names.txt")

    crawl_results = crawl_persondata(names)

    for name, mentions in crawl_results.items():
        print(f"Name: {name}")

        tally = Counter(mentions)  # use Counter to count occurrences
        total = len(mentions)

        # Sort by count, highest first, and keep the top 5.  sorted() is
        # stable, so ties keep their original encounter order.
        ranking = sorted(tally.items(), key=lambda item: item[1], reverse=True)[:5]

        for country, hits in ranking:  # each entry is a (nationality, count) tuple
            share = (hits / total) * 100
            print(f"    {country}: {hits} occurrences ({share:.2f}%)")


if __name__ == "__main__":
    main()
+2032 −0

File added.

Preview size limit exceeded, changes collapsed.