Skip to content
Snippets Groups Projects
Commit 23b0368f authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Add the scraped data

parent a4e9a957
No related branches found
No related tags found
No related merge requests found
fanfics.csv 0 → 100644
This diff is collapsed.
46390654,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46393285,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46267624,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
45890962,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46390228,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
45092413,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46386010,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46381819,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
42277845,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
46381525,https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
This diff is collapsed.
36182401,Status: 429
1858767,Status: 429
35038003,Status: 429
35322460,Status: 429
31230230,Status: 429
39705159,Status: 429
28189017,Status: 429
37066318,Status: 429
42742305,Status: 429
32246428,Status: 429
32636398,Status: 429
36081928,Status: 429
46037344,Status: 429
43367985,Status: 429
30997271,Status: 429
31268831,Status: 429
28408329,Status: 429
35998642,Status: 429
35794672,Status: 429
37972471,Status: 429
7637785,Status: 429
36492010,Status: 429
31128947,Status: 429
31874278,Status: 429
34723168,Status: 429
39898800,Status: 429
32822140,Status: 429
34566796,Status: 429
32766241,Status: 429
37858894,Status: 429
38874333,Status: 429
43981135,Status: 429
35502547,Status: 429
45818023,Status: 429
33632293,Status: 429
31878871,Status: 429
26513101,Status: 429
34689322,Status: 429
31136318,Access Denied
url: https://archiveofourown.org/works?commit=Sort+and+Filter&work_search%5Bsort_column%5D=kudos_count&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&tag_id=The+Grisha+Trilogy+-+Leigh+Bardugo
num_requested_fic: -1
retreived on: 2023-04-10 18:49:27.892117
\ No newline at end of file
I am a filler, namely a chubby pink dragon subsisting on mint drops to uphold the folder structure. Of course I could be replaced with a git ignore file, but where's the fun in that? Have a mint drop!
\ No newline at end of file
url: https://archiveofourown.org/tags/The%20Grisha%20Trilogy%20-%20Leigh%20Bardugo/works
num_requested_fic: 10
retreived on: 2023-04-10 18:28:50.884297
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Scraping tool for taking text from fanfics on AO3 and putting it into a text file.
Known limitation: it doesn't work reliably yet; it only finds some fanfictions and only the first page.
"""
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import time
#---------------EDIT THIS CODE FOR SCRAPING OTHER FICS------------------------#
# Start link: the first page of the tag listing, with all the filters you want
# already applied in the query string.
page = "https://archiveofourown.org/works?commit=Sort+and+Filter&work_search%5Bsort_column%5D=kudos_count&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&tag_id=The+Grisha+Trilogy+-+Leigh+Bardugo"
nameOfFileCreated = "grishaverse.txt" # name of the exported text file (opened in append mode)
# Number of pagination <li> entries getPageList reads from the start page.
NumberOfPages = 6
#--------------------Code for scraping AO3 fanfics----------------------------#
#make a list of all the pages that fit that tag limit list
def getPageList(startLink):
    """Return the listing-page URLs to scrape, starting with ``startLink``.

    Downloads ``startLink``, finds its first ``<ol class="pagination actions">``
    element and, for each of the first ``NumberOfPages`` ``<li>`` entries that
    contains a link, appends that link (prefixed with the AO3 host) to the
    result.

    startLink -- URL of the first listing page.

    Returns a list of URLs whose first element is always ``startLink``. If the
    page has no pagination element, the list contains only ``startLink``.
    Errors raised by ``urlopen`` are not caught here.
    """
    links = [startLink]
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(startLink, headers=hdr)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response, "html.parser")
    olist = soup.find("ol", {"class": "pagination actions"})  # first navigation bar
    if olist is not None:
        # Only the first NumberOfPages entries of the navigation are considered.
        for item in olist.find_all("li")[:NumberOfPages]:
            anchor = item.find("a", href=True)
            if anchor is None:
                # Entries without a link are skipped.
                continue
            links.append("https://archiveofourown.org" + anchor['href'])
    print(links)
    return links
# Resolve the listing pages to scrape; this runs at module level and calls
# getPageList on the configured start link.
allPages = getPageList(page)
# Scrape, from each page, a link list of all the works on those pages.
# allFicList accumulates the work links across all listing pages.
allFicList = []
def getPageFics(thisPage):
    """Return the work URLs linked from one listing page.

    Downloads ``thisPage``, finds its ``<ol class="work index group">`` element
    and, for every ``<li>`` that contains
    ``div.header.module > h4.heading > a[href]``, builds the work URL from the
    AO3 host, the link target and the ``view_full_work`` / ``view_adult``
    query parameters.

    thisPage -- URL of a listing page.

    Returns a list of work URLs in page order. The list is empty when the page
    has no work list. Errors raised by ``urlopen`` are not caught here.
    """
    thisFicList = []
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(thisPage, headers=hdr)
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(req) as response:
        soup = BeautifulSoup(response, "html.parser")
    ols = soup.find("ol", {"class": "work index group"})
    entries = ols.find_all("li") if ols is not None else []
    for entry in entries:
        # Walk header -> heading -> link; an entry missing any step is skipped.
        header = entry.find("div", {"class": "header module"})
        heading = header.find("h4", {"class": "heading"}) if header is not None else None
        anchor = heading.find("a", href=True) if heading is not None else None
        if anchor is None:
            continue
        # The second query parameter must be joined with "&"; a second "?"
        # would make it part of the first parameter's value.
        thisFicList.append("https://archiveofourown.org" + anchor['href'] + "?view_full_work=true&view_adult=true")
    print(thisFicList)
    return thisFicList
# Gather the work links of every listing page into the shared list.
for thisPage in allPages:
    allFicList += getPageFics(thisPage)

# Remove duplicate links; dict keys keep the first-seen order.
allFicList = list(dict.fromkeys(allFicList))
print(len(allFicList))
#From each fic, take the body content and append it onto a text file
def scrapedFics(fic):
    """Download one work and return the concatenated text of its paragraphs.

    Waits five seconds, requests ``fic`` and joins the text of every ``<p>``
    found inside each outermost ``<div>`` carrying the ``userstuff`` class.
    The resulting text is also printed.

    fic -- URL of the work to download.

    Returns the collected text. Returns an empty string when the request fails
    or no matching ``<div>`` exists; in both cases an "Error: " line naming the
    URL is printed instead of raising.
    """
    time.sleep(5)  # pause before every request
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(fic, headers=hdr)
    try:
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as response:
            soup = BeautifulSoup(response, "html.parser")
    except OSError as err:
        # urllib's URLError and HTTPError both derive from OSError; one failed
        # download should not abort the whole run.
        print("Error: ", fic, err)
        return ""

    selector = {"class": "userstuff"}
    # Matching on the single class "userstuff" also matches divs that carry
    # additional classes (e.g. "userstuff module"), so every such div is read,
    # not only the first. Nested matches are skipped so that no paragraph is
    # collected twice.
    bodies = [
        div for div in soup.find_all("div", selector)
        if div.find_parent("div", selector) is None
    ]
    if not bodies:
        print("Error: ", fic)

    text = "".join(
        paragraph.get_text().replace("<br/>", "")
        for body in bodies
        for paragraph in body.find_all("p")
    )
    print(text)
    return text
# Append the text of every collected work to the output file. The "with" block
# guarantees the file is closed even if scraping one of the works raises.
with open(nameOfFileCreated, "a", encoding='utf8') as textFile:
    count = 0
    textt = ""  # NOTE(review): not used anywhere in the visible code - confirm and remove
    for fic in allFicList:
        print(count)  # progress indicator: index of the work being scraped
        textFile.write(scrapedFics(fic))
        count += 1
10477665,Status: 429
13764837,Status: 429
31610477,Status: 429
13411896,Status: 429
15048896,Status: 429
19968571,Status: 429
14917193,Status: 429
42146589,Status: 429
21786541,Status: 429
25657255,Status: 429
42381534,Status: 429
11844462,Status: 429
24761668,Status: 429
26089846,Status: 429
17073689,Status: 429
24429415,Status: 429
28843206,Status: 429
22210213,Status: 429
41704224,Status: 429
35175508,Status: 429
24353596,Status: 429
43029246,Status: 429
16667644,Status: 429
28251708,Status: 429
23007898,Status: 429
33805261,Status: 429
11198496,Status: 429
31644962,Status: 429
25140205,Status: 429
19181257,Status: 429
45395953,Status: 429
12834039,Status: 429
41668083,Status: 429
44015854,Status: 429
13804569,Status: 429
23387938,Status: 429
39415851,Status: 429
url: https://archiveofourown.org/works?commit=Sort+and+Filter&work_search%5Bsort_column%5D=kudos_count&work_search%5Bother_tag_names%5D=&work_search%5Bexcluded_tag_names%5D=&work_search%5Bcrossover%5D=&work_search%5Bcomplete%5D=&work_search%5Bwords_from%5D=&work_search%5Bwords_to%5D=&work_search%5Bdate_from%5D=&work_search%5Bdate_to%5D=&work_search%5Bquery%5D=&work_search%5Blanguage_id%5D=&tag_id=Throne+of+Glass+Series+-+Sarah+J*d*+Maas
num_requested_fic: -1
retreived on: 2023-04-10 19:00:22.627162
\ No newline at end of file
hello, I am a sentient cream puff instead of a professional git ignore file. I exist. (I like prime numbers)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment