Commit abbd48e6 authored by Aileen Reichelt's avatar Aileen Reichelt
Browse files

Restore general script files

parent ba247f1c
Loading
Loading
Loading
Loading
+32 −0
Original line number Diff line number Diff line
"""Check how often the most common names of each nationality
occur in the Wikipedia snapshot used for GloVe training.
To be used when there is no vocab count file yet."""

import pandas as pd

df = pd.read_csv("../data/names_nationality.csv", usecols=["name", "nationality", "gender"])
df["occurrences_in_wikipedia"] = 0

CHUNK_SIZE = 1024 * 1024  # 1 MB

# Encode every name once up front instead of once per DataFrame row per chunk.
encoded_names = [(index, row["name"].encode()) for index, row in df.iterrows()]

# A name that straddles a chunk boundary would never match inside either
# chunk alone.  Carry a tail of (longest name - 1) bytes from the previous
# chunk and search across the seam.
max_name_len = max((len(name) for _, name in encoded_names), default=1)

with open("../data/wikipedia/wikipedia_corpus.txt", "rb") as f:
    CHUNK_NO = 1
    tail = b""
    while True:
        print(f"reading Wikipedia data: approx. {CHUNK_NO} MB/8100 MB", end="\r")
        wikipedia_text = f.read(CHUNK_SIZE)
        if not wikipedia_text:
            print("reading completed")
            break

        window = tail + wikipedia_text
        for index, name in encoded_names:
            # Matches lying wholly inside the tail were already counted in
            # the previous iteration, so subtract them out.
            count = window.count(name) - tail.count(name)
            df.at[index, "occurrences_in_wikipedia"] += count

        tail = wikipedia_text[-(max_name_len - 1):] if max_name_len > 1 else b""
        CHUNK_NO += 1

df.to_csv("../data/names_nationality_wikipedia.csv", index=False)

avg_occurrences = df.groupby("nationality")["occurrences_in_wikipedia"].mean()
for nationality, avg_count in avg_occurrences.items():
    print(f"Nationality: {nationality}, Average Occurrences in Wikipedia: {avg_count:.2f}")
+29 −0
Original line number Diff line number Diff line
"""Short helper script to look up vocab counts of ~400 names in GloVe vocabulary"""
from tqdm import tqdm

with open("names.txt", "r", encoding="utf-8") as names_file:
    names = names_file.readlines()

names = [name.strip().lower() for name in names]

# Build a token -> count table in a single pass over the vocabulary, then
# answer each name with an O(1) lookup.  This replaces the original
# O(names x vocab) nested scan and also fixes a latent NameError: the old
# code consulted a `found` flag before it was first assigned, which crashed
# whenever the very first name was absent from the vocabulary.
# NOTE(review): assumes vocab tokens are unique (a dict keeps the last
# duplicate, the old scan kept the first) — confirm against vocab.txt.
with open("data/embeddings/glove/dd-glove/vocab.txt", "r", encoding="utf-8") as vocab_file:
    vocab_counts = {}
    for line in vocab_file:
        token, count = line.strip().split()
        vocab_counts[token] = count

# Missing names get 0, exactly as before (counts from the file stay strings;
# both render identically in the CSV below).
counts = [vocab_counts.get(name, 0) for name in tqdm(names)]

print(len(names))
print(len(counts))

with open("name_counts.csv", "w+", encoding="utf-8") as output:
    for i, name in enumerate(names):
        output.write(f"{name},{counts[i]}\n")

crawl_persondata.py

0 → 100644
+115 −0
Original line number Diff line number Diff line
"""Crawl persondata.toolforge.org for data about first names, output results to console"""
import re
import time
import requests
from collections import Counter
from bs4 import BeautifulSoup

def read_names_from_file(file_path):
    """Read one name per line from *file_path*.

    Leading/trailing whitespace is trimmed and blank lines are skipped.
    Returns the names as a list of strings, in file order.
    """
    names = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names


def get_number_of_results(name: str) -> int:
    """For a certain name, returns how many Wikipedia articles there are.
    The name must be capitalised.

    Retries the request up to 5 times (60 s pause between attempts); after
    that, or when the result sentence cannot be parsed, returns 0.
    """
    url = f"https://persondata.toolforge.org/index.php?name={name}"

    response = None
    for attempt in range(6):  # try 5 times (on 6th, return 0)
        if attempt == 5:
            print(f"Max retries exceeded for {name}")
            return 0
        try:
            response = requests.get(url, timeout=500)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Error in request for {name}: {exc}")
            time.sleep(60)
        else:
            break

    if response.status_code == 200:
        page = BeautifulSoup(response.text, "html.parser")
        # The hit count lives in a bold "Es wurden N Personen gefunden"
        # sentence inside the first paragraph.
        result_sentence = page.find("p").find(
            "b", string=re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
        )
        if result_sentence:
            hit = re.search(r"Es wurden (\d+) Personen gefunden", result_sentence.text)
            if hit:
                return int(hit.group(1))

    return 0

def crawl_persondata(names: list) -> dict:
    """Fetch nationality mentions from persondata for every name.

    Returns a dict mapping each name with at least one search result to a
    flat list of nationality strings; names with zero results are omitted.
    """
    nationality_lists = {}

    for position, name in enumerate(names):
        hit_count = get_number_of_results(name)  # how many articles were found?
        if hit_count > 0:
            # The first 100 results use a different URL structure (no start=).
            collected = process_page(name, 0)
            # Additional pages arrive in batches of 100.
            for offset in range(100, hit_count, 100):
                collected.extend(process_page(name, offset))
            nationality_lists[name] = collected

        time.sleep(5)  # be polite: 5 second pause after each name
        # Long cooldown every 100 names.  NOTE(review): this also fires at
        # position 0, i.e. before any work — confirm that is intentional.
        if position % 100 == 0:
            time.sleep(600)

    return nationality_lists

def process_page(name: str, start: int) -> list:
    """For one given page, extracts all nationality mentions.
    Returns them in a list like [Germany, Germany, USA, Armenia, USA, ...]
    so that more calls to this function can easily be appended to this list.
    The counting of nationalities comes in a later step."""

    # Page 1 (start == 0) has no start parameter in its URL.
    if start == 0:
        url = f"https://persondata.toolforge.org/index.php?name={name}"
    else:
        url = f"https://persondata.toolforge.org/index.php?name={name}&start={start}"

    response = requests.get(url, timeout=500)

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    nationalities = []
    for info_box in soup.find_all("div", class_="infografik"):
        # img tags carry both nationality and gender information; only the
        # flag images (alt="Flagge von ...") describe nationality, and their
        # "title" attribute holds the country name.
        flag_images = info_box.find_all("img", alt=re.compile(r"Flagge von .+", re.IGNORECASE))
        for flag_image in flag_images:
            country = flag_image.get("title")
            if country:
                nationalities.append(country)

    return nationalities

def main():
    """Crawl persondata for every name in names.txt and print, per name,
    the five most frequent nationalities with their share of all mentions."""
    names = read_names_from_file("names.txt")

    crawl_results = crawl_persondata(names)

    for name, mentions in crawl_results.items():
        print(f"Name: {name}")

        tally = Counter(mentions)  # use Counter to count occurrences
        total = len(mentions)

        # Sort by count, highest first, and keep the top 5.  sorted() is
        # stable, so ties keep their original encounter order.
        ranking = sorted(tally.items(), key=lambda item: item[1], reverse=True)[:5]

        for country, hits in ranking:  # each entry is a (nationality, count) tuple
            share = (hits / total) * 100
            print(f"    {country}: {hits} occurrences ({share:.2f}%)")


if __name__ == "__main__":
    main()
+2032 −0

File added.

Preview size limit exceeded, changes collapsed.