#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 13 10:48:25 2019
@author: nadia
"""
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tika import parser as pdf_parser

import html_extractor
import pdf_extractor
INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
FAILS = '../output/extracted_articles/extraction_fails.txt'
DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
DE_EN = '../output/extracted_articles/de_en_articles.json'
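# The expected shape of the filtered_Spektrum_Links.json input, inferred from
# the accessors in the functions below (the keys are real, the example values
# are illustrative only):
#
#   {
#     "<article ID>": {
#       "Urls": {
#         "https://example.org/article": {
#           "Structure": 4,        # structural quality score of the page
#           "Abstract": true,      # whether the page exposes an abstract
#           "Keyword": 2,          # keyword-match score used for ranking
#           "En_title": "Some English title",
#           "Pdfs": ["/files/paper.pdf"]
#         }
#       }
#     }
#   }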
def open_json(filename):
    """Load a JSON file and return its parsed content."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data
def download_pdf(url, pdf_name, pdf_dict):
    """Download a PDF once, storing it under a short numeric file name.

    pdf_dict maps pdf_name (article ID + PDF URL) to that numeric name,
    so repeated runs reuse already downloaded files.
    """
    pdfs = os.listdir(OUTPUT)
    if pdf_name not in pdf_dict:
        file_name = str(len(pdf_dict))
        pdf_dict[pdf_name] = file_name
    else:
        file_name = pdf_dict[pdf_name]
    pdf_filename = OUTPUT + file_name + '.pdf'
    if pdf_filename.replace(OUTPUT, '') in pdfs:
        # PDF already exists on disk.
        return True
    try:
        # Fetch and save the PDF file.
        pdf_file = requests.get(url, allow_redirects=True)
        with open(pdf_filename, 'wb') as f:
            f.write(pdf_file.content)
        return True
    except Exception:
        with open(DOWNLOAD_FAILS, 'a') as f:
            f.write(pdf_filename + '\t' + url + '\n')
        return False
def get_pdf_soup(filename, ID):
    """Parse a downloaded PDF into an XHTML soup via Apache Tika."""
    # Disabled cache path: reuse a previously extracted HTML file if present.
    # if ID + '.html' in os.listdir('../output/extracted_articles/pdf_extraction/'):
    #     with open('../output/extracted_articles/pdf_extraction/' + ID + '.html', 'r') as f:
    #         html_doc = f.read()
    #     return BeautifulSoup(html_doc, 'html.parser')
    sysxml = pdf_parser.from_file(filename, xmlContent=True)['content']
    # Drop empty and whitespace-only paragraphs left behind by Tika.
    sysxml = re.sub(r"<p />", "", sysxml)
    sysxml = re.sub(r"<p>[\s]*\n</p>", "", sysxml)
    soup = BeautifulSoup(sysxml, 'html.parser')
    return soup
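# For reference, Tika's xmlContent output is an XHTML document roughly of the
# form below, which is why soup.title and the per-paragraph <p> filtering
# above work (sketch, not verbatim Tika output):
#
#   <html xmlns="http://www.w3.org/1999/xhtml">
#     <head><title>Paper title</title></head>
#     <body><div class="page"><p>First paragraph ...</p></div></body>
#   </html>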
def check_structure(urls, ID):
    """Extract from the best-structured URL, if it scores at least 4
    and also exposes an abstract."""
    score = (0, '')
    for url in urls.keys():
        if int(urls[url]['Structure']) > score[0] and urls[url]['Abstract'] == True:
            score = (int(urls[url]['Structure']), url)
    if score[0] < 4:
        return False
    url = score[1]
    try:
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        html_extractor.extract(soup, ID)
        return True
    except Exception:
        return False
def check_pdf(urls, ID, abstract=False, ranking=None):
    """Download candidate PDFs and extract the first one whose title
    matches the article's English title."""

    def compare_titles(title1, title2):
        # Titles with fewer than five words give too few bigrams to compare.
        if len(title1.split()) < 5 or len(title2.split()) < 5:
            return False
        title1 = re.sub(r'\|.*$', '', title1)  # drop '| site name' suffixes
        title2 = re.sub(r'[_:]', '', title2)
        title1 = re.sub(r'[_:]', '', title1)
        # Count how many word bigrams of title2 occur in title1.
        words = title2.split()
        bigrams = [words[i] + ' ' + words[i + 1] for i in range(len(words) - 1)]
        hits = sum(1 for bigram in bigrams if bigram in title1)
        return hits != 0 and hits / len(bigrams) > 0.5

    # With abstract=True, try every URL that exposes an abstract; otherwise
    # follow the keyword-ranked URL order supplied by check_keywords.
    if abstract:
        candidates = [url for url in urls if urls[url]['Abstract'] == True]
    else:
        candidates = ranking or []
    for url in candidates:
        if len(urls[url]['Pdfs']) > 0:
            url_title = urls[url]['En_title']
            pdfs = check_path(urls[url]['Pdfs'], url)
            for pdf in pdfs:
                pdf_name = ID + pdf
                # pdf_dict is the module-level name map loaded in __main__.
                if download_pdf(pdf, pdf_name, pdf_dict):
                    file_name = pdf_dict[pdf_name]
                    soup = get_pdf_soup(OUTPUT + file_name + '.pdf', ID)
                    if soup.title is None:
                        continue
                    pdf_title = soup.title.text.strip()
                    if compare_titles(url_title.lower(), pdf_title.lower()):
                        pdf_extractor.extract(soup, ID)
                        return True
    return False
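# A worked example of the bigram match with hypothetical titles:
#   compare_titles('attention is all you need | spektrum',
#                  'attention is all you need')
# strips the '| spektrum' suffix, builds the bigrams 'attention is',
# 'is all', 'all you', 'you need' from the second title, finds all four in
# the first, and returns True (4/4 > 0.5).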
def check_path(pdfs, url):
    """Turn root-relative PDF links into absolute URLs on the source host."""

    def merge_link(p, url):
        # 'https://host/path' -> ['https:', 'host', ...] -> 'https://host' + p
        url_split = [el for el in url.split('/') if len(el) != 0]
        url_foot = '//'.join(url_split[:2])
        return url_foot + p

    absolute = [merge_link(p, url) for p in pdfs if p.startswith('/')]
    rest = [p for p in pdfs if not p.startswith('/')]
    return absolute + rest
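# E.g. (hypothetical URL) merge_link('/pdf/paper.pdf',
# 'https://journal.example.org/article/42') yields
# 'https://journal.example.org/pdf/paper.pdf'; links that do not start with
# '/' are passed through unchanged.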
def check_abstract(urls, ID):
    """Fall back to HTML extraction from any URL that has an abstract."""
    for url in urls:
        if urls[url]['Abstract'] == True:
            try:
                html_doc = requests.get(url).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                html_extractor.extract(soup, ID)
                return True
            except Exception:
                continue
    return False
def check_keywords(urls, ID):
    """Last resort: try PDFs from all URLs, best keyword score first."""
    ranking = [(urls[url]['Keyword'], url) for url in urls]
    ranking.sort(reverse=True)
    ranking = [el[1] for el in ranking]
    if check_pdf(urls, ID, ranking=ranking):
        return True
    # Log the failed ranking once per article.
    fails_file = FAILS.replace('.txt', '_keywords.txt')
    lines = []
    if os.path.exists(fails_file):
        with open(fails_file, 'r') as f:
            lines = f.readlines()
    line = ID + '\t' + str(ranking) + '\n'
    if line not in lines:
        with open(fails_file, 'a') as ff:
            ff.write(line)
    return False
def extractor(article):
    """Try the extraction strategies in order of reliability."""
    ID = article[0]
    urls = article[1]['Urls']
    if check_structure(urls, ID):
        return True
    elif check_pdf(urls, ID, abstract=True):
        return True
    elif check_abstract(urls, ID):
        return True
    elif check_keywords(urls, ID):
        return True
    else:
        return False
def iterate(data):
    de_en = open_json(DE_EN)
    for i, el in enumerate(data.items()):
        # Skip articles that already have a German/English pair.
        if el[0] in de_en:
            continue
        extracted = extractor(el)
        if not extracted:
            # Log each failed article ID once.
            lines = []
            if os.path.exists(FAILS):
                with open(FAILS, 'r') as f:
                    lines = f.readlines()
            line = el[0] + '\n'
            if line not in lines:
                with open(FAILS, 'a') as ff:
                    ff.write(line)
        if i % 100 == 0:
            print(f'{i} articles of {len(data)} extracted')
if __name__ == '__main__':
    data = open_json(INPUT)
    # pdf_dict is shared as a module-level global with check_pdf and
    # persisted back to disk after the run.
    with open(PDF_DICT, 'r') as f:
        pdf_dict = json.load(f)
    iterate(data)
    with open(PDF_DICT, 'w') as f:
        json.dump(pdf_dict, f)
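# Assumed usage: the ../output directory layout referenced by the constants
# above must already exist, and tika-python needs Java available (it starts a
# local Tika server on first use). The script is then run directly:
#
#   python3 <this_script>.py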