Create script for crawling persondata and save output

1b1f61c1 · Aileen Reichelt · b11ea49c · 1b1f61c1 · 1b1f61c1
Commit 1b1f61c1 authored 1 year ago by Aileen Reichelt
--- a/crawl_persondata.py
+++ b/crawl_persondata.py
+"""Crawl persondata.toolforge.org for data about first names, output results to console"""
+import re
+import time
+import requests
+from collections import Counter
+from bs4 import BeautifulSoup
+
+def read_names_from_file(file_path):
+    """Read a UTF-8 text file and return its non-blank lines.
+
+    Each returned entry has its surrounding whitespace removed; lines that
+    are empty after stripping are skipped.
+    """
+    names = []
+    with open(file_path, "r", encoding="utf-8") as handle:
+        for raw_line in handle:
+            cleaned = raw_line.strip()
+            if cleaned:
+                names.append(cleaned)
+    return names
+
+
+def get_number_of_results(
+    name: str,
+    *,
+    max_retries: int = 5,
+    retry_delay: float = 60,
+    timeout: float = 500,
+) -> int:
+    """For a certain name, return how many Wikipedia articles there are.
+
+    The number is read from the "Es wurden N Personen gefunden" sentence
+    inside the first <p> element of the persondata result page.
+    The name must be capitalised.
+
+    Args:
+        name: First name to look up.
+        max_retries: How many times the request is attempted before giving up.
+        retry_delay: Seconds to wait between two attempts.
+        timeout: Timeout in seconds passed to ``requests.get``.
+
+    Returns:
+        The reported number of persons, or 0 if every attempt failed, the
+        response status is not 200, or the result sentence is not found.
+    """
+    # One pattern for both locating the <b> tag and extracting the number,
+    # so the two steps cannot disagree about case sensitivity.
+    count_pattern = re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
+
+    response = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            # params= lets requests URL-encode the name
+            response = requests.get(
+                "https://persondata.toolforge.org/index.php",
+                params={"name": name},
+                timeout=timeout,
+            )
+            response.raise_for_status()
+            break
+        except requests.RequestException as e:
+            print(f"Error in request for {name}: {e}")
+            response = None
+            # no point in waiting after the last attempt
+            if attempt < max_retries:
+                time.sleep(retry_delay)
+
+    if response is None:
+        print(f"Max retries exceeded for {name}")
+        return 0
+
+    if response.status_code != 200:
+        return 0
+
+    soup = BeautifulSoup(response.text, "html.parser")
+    first_paragraph = soup.find("p")
+    if first_paragraph is None:  # page without any <p>: nothing to parse
+        return 0
+
+    result_sentence = first_paragraph.find("b", string=count_pattern)
+    if result_sentence is None:
+        return 0
+
+    match = count_pattern.search(result_sentence.text)
+    return int(match.group(1)) if match else 0
+
+def crawl_persondata(names: list) -> dict:
+    """Collect the nationality mentions for every name in ``names``.
+
+    For each name the number of results is looked up first; if there is at
+    least one, all result pages are processed in steps of 100 results.
+    Names without results do not appear in the returned dict.
+
+    Args:
+        names: First names to look up.
+
+    Returns:
+        A dict mapping each name that had results to the list of nationality
+        mentions gathered from all of its result pages.
+    """
+    results = {}
+
+    # count from 1 so the long pause happens after every 100th name,
+    # not right after the very first one
+    for processed, name in enumerate(names, start=1):
+        total_results = get_number_of_results(name)  # how many articles were found?
+        if total_results > 0:
+            # process the first 100 results which have different url structure
+            results[name] = process_page(name, 0)
+
+            # process additional pages if there are more than 100 results
+            for start in range(100, total_results, 100):
+                results[name].extend(process_page(name, start))
+
+        time.sleep(5)  # add 5 second delay after processing each name
+        if processed % 100 == 0:
+            time.sleep(600)  # longer break after each batch of 100 names
+
+    return results
+
+def process_page(
+    name: str,
+    start: int,
+    *,
+    max_retries: int = 5,
+    retry_delay: float = 60,
+    timeout: float = 500,
+) -> list:
+    """For one given page, extract all nationality mentions.
+
+    Returns them in a list like [Germany, Germany, USA, Armenia, USA, ...]
+    so that more calls to this function can easily be appended to this list.
+    The counting of nationalities comes in a later step.
+
+    Args:
+        name: First name whose result page is fetched.
+        start: Offset of the page; 0 requests the first page, which is
+            addressed without a ``start`` query parameter.
+        max_retries: How many times the request is attempted before giving up.
+        retry_delay: Seconds to wait between two attempts.
+        timeout: Timeout in seconds passed to ``requests.get``.
+
+    Returns:
+        The ``title`` values of all flag images on the page, or an empty list
+        if every attempt failed or the response status is not 200.
+    """
+    params = {"name": name}
+    if start != 0:
+        params["start"] = start
+
+    response = None
+    for attempt in range(1, max_retries + 1):
+        try:
+            # params= lets requests URL-encode the name
+            response = requests.get(
+                "https://persondata.toolforge.org/index.php",
+                params=params,
+                timeout=timeout,
+            )
+            break
+        except requests.RequestException as e:
+            # a single network hiccup must not abort the whole crawl
+            print(f"Error in request for {name} (start={start}): {e}")
+            if attempt < max_retries:
+                time.sleep(retry_delay)
+
+    if response is None:
+        print(f"Max retries exceeded for {name} (start={start})")
+        return []
+
+    if response.status_code != 200:
+        return []
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # img tags contain info about nationality and gender
+    # only those about nationality are wanted, distinguished by flag information
+    flag_pattern = re.compile(r"Flagge von .+", re.IGNORECASE)
+
+    nationalities = []
+    for tag in soup.find_all("div", class_="infografik"):
+        for img_tag in tag.find_all("img", alt=flag_pattern):
+            title = img_tag.get("title")  # "title" contains nationality info
+            if title:
+                nationalities.append(title)
+
+    return nationalities
+
+def main():
+    """Crawl all names listed in names.txt and print, per name, the five
+    most frequent nationalities with their counts and percentages."""
+    crawled = crawl_persondata(read_names_from_file("names.txt"))
+
+    for name, nationalities in crawled.items():
+        print(f"Name: {name}")
+
+        total_entries = len(nationalities)
+
+        # most_common(5) yields (nationality, count) pairs, highest count first
+        for nationality, count in Counter(nationalities).most_common(5):
+            percentage = (count / total_entries) * 100
+            print(f"    {nationality}: {count} occurrences ({percentage:.2f}%)")
+
+
+# Start the crawl only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/persondata_nationality_output.txt
+++ b/persondata_nationality_output.txt