Skip to content
Snippets Groups Projects
Commit d1a15ec4 authored by Nadia Arslan
Browse files

main change

parent 67045887
No related branches found
No related tags found
No related merge requests found
......@@ -20,6 +20,7 @@ DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
DE_EN = '../output/extracted_articles/de_en_articles.json'
def open_json(filename):
with open(filename, 'r') as f:
......@@ -31,7 +32,7 @@ def download_pdf(url, pdf_name, pdf_dict):
if pdf_name not in pdf_dict:
file_name = str(len(pdf_dict))
#pdf_dict[pdf_name] = file_name
pdf_dict[pdf_name] = file_name
else:
file_name = pdf_dict[pdf_name]
......@@ -44,7 +45,7 @@ def download_pdf(url, pdf_name, pdf_dict):
# get and save pdf file
pdf_file = requests. get(url, allow_redirects=True)
open(pdf_filename, 'wb').write(pdf_file.content)
pdf_dict[pdf_name] = file_name
#pdf_dict[pdf_name] = file_name
return True
except:
with open(DOWNLOAD_FAILS, 'a') as f:
......@@ -143,7 +144,7 @@ def check_pdf(urls, ID, abstract=False,ranking=[]):
pdf_name = ID+pdf
if download_pdf(pdf,pdf_name, pdf_dict):
file_name = pdf_dict[pdf_name]
soup = get_pdf_soup(OUTPUT+file_name+'.pdf')
soup = get_pdf_soup(OUTPUT+file_name+'.pdf',ID)
pdf_title = soup.title.text.strip()
if compare_titles(url_title.lower(), pdf_title.lower()):
pdf_extractor.extract(soup, ID)
......@@ -184,7 +185,7 @@ def check_keywords(urls, ID):
ranking.append((eval(urls[url]['Keyword']),url))
ranking.sort(reverse=True)
ranking = [el[1] for el in ranking]
if check_pdf(urls,ID,abstract=False,ranking):
if check_pdf(urls,ID,False,ranking):
return True
else:
try:
......@@ -217,8 +218,8 @@ def extractor(article):
def iterate(data):
de_en = open_json(DE_EN)
for i, el in enumerate(data.items()):
#if el[0] in de_en:
#continue
if el[0] in de_en:
continue
extracted = extractor(el)
if extracted == False:
with open(FAILS,'r') as f:
......@@ -237,4 +238,4 @@ if __name__ == '__main__':
pdf_dict = json.load(f)
iterate(data)
with open(PDF_DICT,'w') as f:
json.dump(pdf_dict,f)
\ No newline at end of file
json.dump(pdf_dict,f)
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment