# crawl_persondata.py
"""Crawl persondata.toolforge.org for data about first names, output results to console."""
import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://persondata.toolforge.org/index.php"
REQUEST_TIMEOUT = 500  # seconds, passed to requests.get
MAX_ATTEMPTS = 5  # how often one page is requested before giving up
RETRY_DELAY = 60  # seconds to wait between failed attempts
PAGE_SIZE = 100  # results per page; also the step of the "start" parameter
NAME_DELAY = 5  # seconds to wait after each name
BATCH_SIZE = 100  # number of names after which a long pause is inserted
BATCH_PAUSE = 600  # seconds of that long pause

# German for "<N> persons were found"; the captured group is the hit count.
RESULT_COUNT_PATTERN = re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
# alt text ("flag of ...") that marks an <img> as a nationality flag.
FLAG_ALT_PATTERN = re.compile(r"Flagge von .+", re.IGNORECASE)


def read_names_from_file(file_path):
    """Return the non-empty, whitespace-stripped lines of a UTF-8 text file.

    Args:
        file_path: Path of the file that holds one name per line.

    Returns:
        A list of names in file order; blank lines are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [stripped for stripped in (line.strip() for line in f) if stripped]


def _fetch_html(name, start=0):
    """Download one result page for ``name`` and return its HTML.

    The name is passed through ``params`` so that requests URL-encodes it;
    characters such as ``&`` or ``#`` would otherwise corrupt the query.

    Args:
        name: The first name to look up.
        start: Offset of the first result; 0 requests the first page, which
            is addressed without a ``start`` parameter.

    Returns:
        The page text, or ``None`` if every attempt failed or the final
        response was not a plain 200.
    """
    params = {"name": name}
    if start != 0:
        params["start"] = start

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error in request for {name}: {e}")
            if attempt < MAX_ATTEMPTS:  # no point in waiting before giving up
                time.sleep(RETRY_DELAY)
        else:
            return response.text if response.status_code == 200 else None

    print(f"Max retries exceeded for {name}")
    return None


def get_number_of_results(name: str) -> int:
    """For a certain name, return how many Wikipedia articles there are.

    The name must be capitalised.

    Args:
        name: The first name to look up.

    Returns:
        The number announced in the page's result sentence, or 0 when the
        page could not be fetched or contains no such sentence.
    """
    html = _fetch_html(name)
    if html is None:
        return 0

    soup = BeautifulSoup(html, "html.parser")
    first_paragraph = soup.find("p")
    if first_paragraph is None:  # page without any <p>: nothing to parse
        return 0

    result_sentence = first_paragraph.find("b", string=RESULT_COUNT_PATTERN)
    if result_sentence is None:
        return 0

    # Same (case-insensitive) pattern as the tag lookup, so a tag that was
    # found always yields its number.
    match = RESULT_COUNT_PATTERN.search(result_sentence.text)
    return int(match.group(1)) if match else 0


def crawl_persondata(names: list) -> dict:
    """Collect the nationality mentions for every name.

    Args:
        names: The first names to look up.

    Returns:
        A dict mapping each name that has at least one result to the list of
        nationality strings gathered from all of its result pages.
    """
    results = {}
    for i, name in enumerate(names):
        total_results = get_number_of_results(name)  # how many articles were found?
        if total_results > 0:
            # the first page has a different url structure (no start offset)
            results[name] = process_page(name, 0)
            # process additional pages if there are more than one page of results
            for start in range(PAGE_SIZE, total_results, PAGE_SIZE):
                results[name].extend(process_page(name, start))

        time.sleep(NAME_DELAY)  # short delay after processing each name
        # Long pause after every BATCH_SIZE-th name. Counting from 1 avoids
        # pausing right after the very first name (index 0).
        if (i + 1) % BATCH_SIZE == 0:
            time.sleep(BATCH_PAUSE)
    return results


def process_page(name: str, start: int) -> list:
    """For one given page, extract all nationality mentions.

    Returns them in a list like [Germany, Germany, USA, Armenia, USA, ...]
    so that more calls to this function can easily be appended to this list.
    The counting of nationalities comes in a later step.

    Args:
        name: The first name to look up.
        start: Offset of the first result on the page (0 for the first page).

    Returns:
        The ``title`` values of all flag images on the page; an empty list
        when the page could not be fetched.
    """
    html = _fetch_html(name, start)
    if html is None:
        return []

    soup = BeautifulSoup(html, "html.parser")
    nationalities = []
    for tag in soup.find_all("div", class_="infografik"):
        # img tags contain info about nationality and gender; only those
        # about nationality are wanted, distinguished by the flag alt text
        for img_tag in tag.find_all("img", alt=FLAG_ALT_PATTERN):
            title = img_tag.get("title")  # "title" contains nationality info
            if title:
                nationalities.append(title)
    return nationalities


def main():
    """Crawl all names listed in names.txt and print each name's top 5 nationalities."""
    names_to_search = read_names_from_file("names.txt")
    results = crawl_persondata(names_to_search)

    for name, nationalities in results.items():
        print(f"Name: {name}")
        total_entries = len(nationalities)
        # most_common sorts by count in descending order; the loop body never
        # runs for an empty list, so the division below is safe
        for nationality, count in Counter(nationalities).most_common(5):
            percentage = (count / total_entries) * 100
            print(f" {nationality}: {count} occurrences ({percentage:.2f}%)")


if __name__ == "__main__":
    main()
# crawl_persondata.py
"""Crawl persondata.toolforge.org for data about first names, output results to console."""
import re
import time
from collections import Counter

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://persondata.toolforge.org/index.php"
REQUEST_TIMEOUT = 500  # seconds, passed to requests.get
MAX_ATTEMPTS = 5  # how often one page is requested before giving up
RETRY_DELAY = 60  # seconds to wait between failed attempts
PAGE_SIZE = 100  # results per page; also the step of the "start" parameter
NAME_DELAY = 5  # seconds to wait after each name
BATCH_SIZE = 100  # number of names after which a long pause is inserted
BATCH_PAUSE = 600  # seconds of that long pause

# German for "<N> persons were found"; the captured group is the hit count.
RESULT_COUNT_PATTERN = re.compile(r"Es wurden (\d+) Personen gefunden", re.IGNORECASE)
# alt text ("flag of ...") that marks an <img> as a nationality flag.
FLAG_ALT_PATTERN = re.compile(r"Flagge von .+", re.IGNORECASE)


def read_names_from_file(file_path):
    """Return the non-empty, whitespace-stripped lines of a UTF-8 text file.

    Args:
        file_path: Path of the file that holds one name per line.

    Returns:
        A list of names in file order; blank lines are skipped.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [stripped for stripped in (line.strip() for line in f) if stripped]


def _fetch_html(name, start=0):
    """Download one result page for ``name`` and return its HTML.

    The name is passed through ``params`` so that requests URL-encodes it;
    characters such as ``&`` or ``#`` would otherwise corrupt the query.

    Args:
        name: The first name to look up.
        start: Offset of the first result; 0 requests the first page, which
            is addressed without a ``start`` parameter.

    Returns:
        The page text, or ``None`` if every attempt failed or the final
        response was not a plain 200.
    """
    params = {"name": name}
    if start != 0:
        params["start"] = start

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error in request for {name}: {e}")
            if attempt < MAX_ATTEMPTS:  # no point in waiting before giving up
                time.sleep(RETRY_DELAY)
        else:
            return response.text if response.status_code == 200 else None

    print(f"Max retries exceeded for {name}")
    return None


def get_number_of_results(name: str) -> int:
    """For a certain name, return how many Wikipedia articles there are.

    The name must be capitalised.

    Args:
        name: The first name to look up.

    Returns:
        The number announced in the page's result sentence, or 0 when the
        page could not be fetched or contains no such sentence.
    """
    html = _fetch_html(name)
    if html is None:
        return 0

    soup = BeautifulSoup(html, "html.parser")
    first_paragraph = soup.find("p")
    if first_paragraph is None:  # page without any <p>: nothing to parse
        return 0

    result_sentence = first_paragraph.find("b", string=RESULT_COUNT_PATTERN)
    if result_sentence is None:
        return 0

    # Same (case-insensitive) pattern as the tag lookup, so a tag that was
    # found always yields its number.
    match = RESULT_COUNT_PATTERN.search(result_sentence.text)
    return int(match.group(1)) if match else 0


def crawl_persondata(names: list) -> dict:
    """Collect the nationality mentions for every name.

    Args:
        names: The first names to look up.

    Returns:
        A dict mapping each name that has at least one result to the list of
        nationality strings gathered from all of its result pages.
    """
    results = {}
    for i, name in enumerate(names):
        total_results = get_number_of_results(name)  # how many articles were found?
        if total_results > 0:
            # the first page has a different url structure (no start offset)
            results[name] = process_page(name, 0)
            # process additional pages if there are more than one page of results
            for start in range(PAGE_SIZE, total_results, PAGE_SIZE):
                results[name].extend(process_page(name, start))

        time.sleep(NAME_DELAY)  # short delay after processing each name
        # Long pause after every BATCH_SIZE-th name. Counting from 1 avoids
        # pausing right after the very first name (index 0).
        if (i + 1) % BATCH_SIZE == 0:
            time.sleep(BATCH_PAUSE)
    return results


def process_page(name: str, start: int) -> list:
    """For one given page, extract all nationality mentions.

    Returns them in a list like [Germany, Germany, USA, Armenia, USA, ...]
    so that more calls to this function can easily be appended to this list.
    The counting of nationalities comes in a later step.

    Args:
        name: The first name to look up.
        start: Offset of the first result on the page (0 for the first page).

    Returns:
        The ``title`` values of all flag images on the page; an empty list
        when the page could not be fetched.
    """
    html = _fetch_html(name, start)
    if html is None:
        return []

    soup = BeautifulSoup(html, "html.parser")
    nationalities = []
    for tag in soup.find_all("div", class_="infografik"):
        # img tags contain info about nationality and gender; only those
        # about nationality are wanted, distinguished by the flag alt text
        for img_tag in tag.find_all("img", alt=FLAG_ALT_PATTERN):
            title = img_tag.get("title")  # "title" contains nationality info
            if title:
                nationalities.append(title)
    return nationalities


def main():
    """Crawl all names listed in names.txt and print each name's top 5 nationalities."""
    names_to_search = read_names_from_file("names.txt")
    results = crawl_persondata(names_to_search)

    for name, nationalities in results.items():
        print(f"Name: {name}")
        total_entries = len(nationalities)
        # most_common sorts by count in descending order; the loop body never
        # runs for an empty list, so the division below is safe
        for nationality, count in Counter(nationalities).most_common(5):
            percentage = (count / total_entries) * 100
            print(f" {nationality}: {count} occurrences ({percentage:.2f}%)")


if __name__ == "__main__":
    main()
persondata_nationality_output.txt 0 → 100644 +2032 −0 File added. Preview size limit exceeded, changes collapsed. Show changes