Commit 1b1f61c1 authored by Aileen Reichelt
Browse files

Create script for crawling persondata and save output

parent b11ea49c
Loading
Loading
Loading
Loading

crawl_persondata.py

0 → 100644
+115 −0
Original line number Diff line number Diff line
"""Crawl persondata.toolforge.org for nationality data about first names and print the results to the console."""
import re
import time
import requests
from collections import Counter
from bs4 import BeautifulSoup

def read_names_from_file(file_path):
    """Return the non-blank lines of a UTF-8 text file.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped.
    """
    collected = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                collected.append(cleaned)
    return collected


def get_number_of_results(name: str) -> int:
    """Return how many Wikipedia articles persondata lists for a first name.

    The name must be capitalised. The request is attempted up to five times,
    waiting 60 seconds between failed attempts.

    Args:
        name: First name to look up.

    Returns:
        The number parsed from the "Es wurden N Personen gefunden" sentence
        in the first paragraph of the page, or 0 if every attempt failed or
        the sentence could not be found.
    """
    max_attempts = 5
    count_pattern = re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            # Let requests build the query string so that names containing
            # spaces, "&", "#" or other reserved characters are URL-encoded.
            response = requests.get(
                "https://persondata.toolforge.org/index.php",
                params={"name": name},
                timeout=500,
            )
            response.raise_for_status()
            break
        except requests.RequestException as e:
            print(f"Error in request for {name}: {e}")
            response = None
            if attempt < max_attempts:
                # No point in waiting after the final failed attempt.
                time.sleep(60)

    if response is None:
        print(f"Max retries exceeded for {name}")
        return 0

    if response.status_code != 200:
        return 0

    soup = BeautifulSoup(response.text, "html.parser")

    # The result sentence is looked up in the first <p>; a page without any
    # paragraph (e.g. an unexpected error page) simply counts as "no results".
    first_paragraph = soup.find("p")
    if first_paragraph is None:
        return 0

    result_sentence = first_paragraph.find("b", string=count_pattern)
    if result_sentence is None:
        return 0

    match = count_pattern.search(result_sentence.text)
    return int(match.group(1)) if match else 0

def crawl_persondata(names: list) -> dict:
    """Collect all nationality mentions for each name.

    For every name the number of results is queried first; names with at
    least one result get all of their result pages (100 entries per page)
    processed.

    Args:
        names: First names to look up.

    Returns:
        A dict mapping each name that had results to the list of nationality
        mentions returned by ``process_page`` across all of its pages. Names
        without results are omitted.
    """
    results = {}

    # Count from 1 so the long pause happens after each full batch of 100
    # names, not right after the very first name.
    for position, name in enumerate(names, start=1):
        total_results = get_number_of_results(name)  # how many articles were found?
        if total_results > 0:
            # the first 100 results live on the page without a "start" offset
            results[name] = process_page(name, 0)

            # remaining pages, if there are more than 100 results
            for start in range(100, total_results, 100):
                results[name].extend(process_page(name, start))

        time.sleep(5)  # short delay after every name
        if position % 100 == 0:
            time.sleep(600)  # long pause after every 100 names

    return results

def process_page(name: str, start: int) -> list:
    """Extract all nationality mentions from one result page.

    The mentions are returned uncounted, e.g.
    ``[Germany, Germany, USA, Armenia, USA, ...]``, so that the results of
    several calls can simply be concatenated; counting happens in a later
    step.

    The request is attempted up to five times, waiting 60 seconds between
    failed attempts, mirroring ``get_number_of_results``.

    Args:
        name: First name to look up.
        start: Offset of the first result on the page; 0 selects the first
            page, which is requested without a ``start`` parameter.

    Returns:
        The ``title`` values of all flag images found inside
        ``div.infografik`` elements, or an empty list if the page could not
        be fetched.
    """
    max_attempts = 5

    # Let requests build the query string so the name is URL-encoded.
    params = {"name": name}
    if start != 0:
        params["start"] = start

    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(
                "https://persondata.toolforge.org/index.php",
                params=params,
                timeout=500,
            )
            response.raise_for_status()
            break
        except requests.RequestException as e:
            # A single failing page must not abort the whole crawl.
            print(f"Error in request for {name} (start={start}): {e}")
            response = None
            if attempt < max_attempts:
                time.sleep(60)

    if response is None:
        print(f"Max retries exceeded for {name} (start={start})")
        return []

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    flag_pattern = re.compile(r"Flagge von .+", re.IGNORECASE)

    nationalities = []
    for tag in soup.find_all("div", class_="infografik"):
        # img tags carry both nationality and gender info; only the flag
        # images (alt text "Flagge von ...") describe a nationality.
        for img_tag in tag.find_all("img", alt=flag_pattern):
            title = img_tag.get("title")  # "title" holds the nationality
            if title:
                nationalities.append(title)

    return nationalities

def main():
    """Crawl every name listed in names.txt and print its five most frequent nationalities."""
    crawled = crawl_persondata(read_names_from_file("names.txt"))

    for name in crawled:
        mentions = crawled[name]
        print(f"Name: {name}")

        total_entries = len(mentions)

        # most_common(5) yields (nationality, count) pairs, highest count
        # first; ties keep the order in which they were first encountered
        for nationality, count in Counter(mentions).most_common(5):
            percentage = (count / total_entries) * 100
            print(f"    {nationality}: {count} occurrences ({percentage:.2f}%)")


if __name__ == "__main__":
    main()
+2032 −0

File added.

Preview size limit exceeded, changes collapsed.