Commit 1b1f61c1 authored by Aileen Reichelt

Create script for crawling persondata and save output

parent b11ea49c
"""Crawl persondata.toolforge.org for data about first names, output results to console"""
import re
import time
import requests
from collections import Counter
from bs4 import BeautifulSoup
def read_names_from_file(file_path):
with open(file_path, "r", encoding="utf-8") as f:
names = [line.strip() for line in f if line.strip()]
return names
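
# A sketch of the expected names.txt format (hypothetical sample values;
# one capitalised first name per line, as get_number_of_results requires):
#
#   Anna
#   Mehmet
#   Giovanni
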
def get_number_of_results(name: str) -> int:
    """For a given name, returns how many Wikipedia articles there are.
    The name must be capitalised."""
    url = f"https://persondata.toolforge.org/index.php?name={name}"
    for _ in range(5):  # try 5 times, then give up
        try:
            response = requests.get(url, timeout=500)
            response.raise_for_status()
            break
        except requests.RequestException as e:
            print(f"Error in request for {name}: {e}")
            time.sleep(60)
    else:
        print(f"Max retries exceeded for {name}")
        return 0
    soup = BeautifulSoup(response.text, "html.parser")
    # The site reports its hit count in German:
    # "Es wurden <n> Personen gefunden" means "<n> persons were found".
    paragraph = soup.find("p")
    if paragraph:
        result_sentence = paragraph.find(
            "b", string=re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
        )
        if result_sentence:
            match = re.search(r"Es wurden (\d+) Personen gefunden", result_sentence.text)
            if match:
                return int(match.group(1))
    return 0
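
# For example (hypothetical count): if the results page contains
# "<b>Es wurden 42 Personen gefunden</b>", get_number_of_results returns 42.
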
def crawl_persondata(names: list) -> dict:
    """Look up each name and collect all nationality mentions for it."""
    results = {}
    for i, name in enumerate(names):
        total_results = get_number_of_results(name)  # how many articles were found?
        if total_results > 0:
            # process the first 100 results, which have a different URL structure
            results[name] = process_page(name, 0)
            # process additional pages if there are more than 100 results
            for start in range(100, total_results, 100):
                results[name].extend(process_page(name, start))
        time.sleep(5)  # 5 second delay after each name to go easy on the server
        if i > 0 and i % 100 == 0:  # take a longer break after every 100 names
            time.sleep(600)
    return results
def process_page(name: str, start: int) -> list:
    """For one given result page, extracts all nationality mentions.
    Returns them in a flat list like [Germany, Germany, USA, Armenia, USA, ...]
    so that the output of further calls can easily be appended to it.
    The counting of nationalities comes in a later step."""
    if start == 0:
        url = f"https://persondata.toolforge.org/index.php?name={name}"
    else:
        url = f"https://persondata.toolforge.org/index.php?name={name}&start={start}"
    response = requests.get(url, timeout=500)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        personinfo_tags = soup.find_all("div", class_="infografik")
        nationalities = []
        for tag in personinfo_tags:
            # img tags carry both nationality and gender information;
            # only the nationality ones have a flag in their alt text
            # ("Flagge von ..." is German for "flag of ...")
            img_tags = tag.find_all("img", alt=re.compile(r"Flagge von .+", re.IGNORECASE))
            for img_tag in img_tags:
                title_attr = img_tag.get("title")  # the "title" attribute holds the nationality
                if title_attr:
                    nationalities.append(title_attr)
        return nationalities
    return []
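
# Hypothetical shape of the HTML that the selectors above assume,
# reconstructed from the code rather than verified against the live site:
#
#   <div class="infografik">
#     <img alt="Flagge von Deutschland" title="Deutschland" ...>
#   </div>
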
def main():
    names_to_search = read_names_from_file("names.txt")
    results = crawl_persondata(names_to_search)
    for name, nationalities in results.items():
        print(f"Name: {name}")
        nationality_counts = Counter(nationalities)  # use Counter to count occurrences
        total_entries = len(nationalities)
        if total_entries == 0:  # guard against names whose pages yielded no nationalities
            continue
        # most_common returns (nationality, count) tuples in descending order of count
        for nationality, count in nationality_counts.most_common(5):
            percentage = (count / total_entries) * 100
            print(f" {nationality}: {count} occurrences ({percentage:.2f}%)")


if __name__ == "__main__":
    main()
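
A minimal usage sketch, assuming the script is saved as crawl_persondata.py (the actual file name is not visible in this collapsed diff); names.txt is the input path hard-coded in main():

    pip install requests beautifulsoup4
    python crawl_persondata.py

For each crawled name, the script prints a "Name: ..." line followed by the top five nationalities with their occurrence counts and percentages.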