#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 13 10:48:25 2019
@author: nadia
"""
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tika import parser as pdf_parser

import html_extractor
import pdf_extractor
INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
FAILS = '../output/extracted_articles/extraction_fails.txt'
DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
DE_EN = '../output/extracted_articles/de_en_articles.json'
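# The expected shape of the filtered_Spektrum_Links.json input, inferred from
# the accessors in the functions below (the keys are real, the example values
# are illustrative only):
#
#   {
#     "<article ID>": {
#       "Urls": {
#         "https://example.org/article": {
#           "Structure": 4,        # structural quality score of the page
#           "Abstract": true,      # whether the page exposes an abstract
#           "Keyword": 2,          # keyword-match score used for ranking
#           "En_title": "Some English title",
#           "Pdfs": ["/files/paper.pdf"]
#         }
#       }
#     }
#   }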
def open_json(filename):
    """Load a JSON file and return its parsed content."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data
def download_pdf(url, pdf_name, pdf_dict):
    """Download a PDF once, storing it under a short numeric file name.

    pdf_dict maps pdf_name (article ID + PDF URL) to that numeric name,
    so repeated runs reuse already downloaded files.
    """
    pdfs = os.listdir(OUTPUT)
    if pdf_name not in pdf_dict:
        file_name = str(len(pdf_dict))
        pdf_dict[pdf_name] = file_name
    else:
        file_name = pdf_dict[pdf_name]
    pdf_filename = OUTPUT + file_name + '.pdf'
    if pdf_filename.replace(OUTPUT, '') in pdfs:
        # PDF already exists on disk.
        return True
    try:
        # Fetch and save the PDF file.
        pdf_file = requests.get(url, allow_redirects=True)
        with open(pdf_filename, 'wb') as f:
            f.write(pdf_file.content)
        return True
    except Exception:
        with open(DOWNLOAD_FAILS, 'a') as f:
            f.write(pdf_filename + '\t' + url + '\n')
        return False
def get_pdf_soup(filename, ID):
    """Parse a downloaded PDF into an XHTML soup via Apache Tika."""
    # Disabled cache path: reuse a previously extracted HTML file if present.
    # if ID + '.html' in os.listdir('../output/extracted_articles/pdf_extraction/'):
    #     with open('../output/extracted_articles/pdf_extraction/' + ID + '.html', 'r') as f:
    #         html_doc = f.read()
    #     return BeautifulSoup(html_doc, 'html.parser')
    sysxml = pdf_parser.from_file(filename, xmlContent=True)['content']
    # Drop empty and whitespace-only paragraphs left behind by Tika.
    sysxml = re.sub(r"<p />", "", sysxml)
    sysxml = re.sub(r"<p>[\s]*\n</p>", "", sysxml)
    soup = BeautifulSoup(sysxml, 'html.parser')
    return soup
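# For reference, Tika's xmlContent output is an XHTML document roughly of the
# form below, which is why soup.title and the per-paragraph <p> filtering
# above work (sketch, not verbatim Tika output):
#
#   <html xmlns="http://www.w3.org/1999/xhtml">
#     <head><title>Paper title</title></head>
#     <body><div class="page"><p>First paragraph ...</p></div></body>
#   </html>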
def check_structure(urls, ID):
    """Extract from the best-structured URL, if it scores at least 4
    and also exposes an abstract."""
    score = (0, '')
    for url in urls.keys():
        if int(urls[url]['Structure']) > score[0] and urls[url]['Abstract'] == True:
            score = (int(urls[url]['Structure']), url)
    if score[0] < 4:
        return False
    url = score[1]
    try:
        html_doc = requests.get(url).text
        soup = BeautifulSoup(html_doc, 'html.parser')
        html_extractor.extract(soup, ID)
        return True
    except Exception:
        return False
def check_pdf(urls, ID, abstract=False, ranking=None):
    """Download candidate PDFs and extract the first one whose title
    matches the article's English title."""

    def compare_titles(title1, title2):
        # Titles with fewer than five words give too few bigrams to compare.
        if len(title1.split()) < 5 or len(title2.split()) < 5:
            return False
        title1 = re.sub(r'\|.*$', '', title1)  # drop '| site name' suffixes
        title2 = re.sub(r'[_:]', '', title2)
        title1 = re.sub(r'[_:]', '', title1)
        # Count how many word bigrams of title2 occur in title1.
        words = title2.split()
        bigrams = [words[i] + ' ' + words[i + 1] for i in range(len(words) - 1)]
        hits = sum(1 for bigram in bigrams if bigram in title1)
        return hits != 0 and hits / len(bigrams) > 0.5

    # With abstract=True, try every URL that exposes an abstract; otherwise
    # follow the keyword-ranked URL order supplied by check_keywords.
    if abstract:
        candidates = [url for url in urls if urls[url]['Abstract'] == True]
    else:
        candidates = ranking or []
    for url in candidates:
        if len(urls[url]['Pdfs']) > 0:
            url_title = urls[url]['En_title']
            pdfs = check_path(urls[url]['Pdfs'], url)
            for pdf in pdfs:
                pdf_name = ID + pdf
                # pdf_dict is the module-level name map loaded in __main__.
                if download_pdf(pdf, pdf_name, pdf_dict):
                    file_name = pdf_dict[pdf_name]
                    soup = get_pdf_soup(OUTPUT + file_name + '.pdf', ID)
                    if soup.title is None:
                        continue
                    pdf_title = soup.title.text.strip()
                    if compare_titles(url_title.lower(), pdf_title.lower()):
                        pdf_extractor.extract(soup, ID)
                        return True
    return False
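# A worked example of the bigram match with hypothetical titles:
#   compare_titles('attention is all you need | spektrum',
#                  'attention is all you need')
# strips the '| spektrum' suffix, builds the bigrams 'attention is',
# 'is all', 'all you', 'you need' from the second title, finds all four in
# the first, and returns True (4/4 > 0.5).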
def check_path(pdfs, url):
    """Turn root-relative PDF links into absolute URLs on the source host."""

    def merge_link(p, url):
        # 'https://host/path' -> ['https:', 'host', ...] -> 'https://host' + p
        url_split = [el for el in url.split('/') if len(el) != 0]
        url_foot = '//'.join(url_split[:2])
        return url_foot + p

    absolute = [merge_link(p, url) for p in pdfs if p.startswith('/')]
    rest = [p for p in pdfs if not p.startswith('/')]
    return absolute + rest
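# E.g. (hypothetical URL) merge_link('/pdf/paper.pdf',
# 'https://journal.example.org/article/42') yields
# 'https://journal.example.org/pdf/paper.pdf'; links that do not start with
# '/' are passed through unchanged.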
def check_abstract(urls, ID):
    """Fall back to HTML extraction from any URL that has an abstract."""
    for url in urls:
        if urls[url]['Abstract'] == True:
            try:
                html_doc = requests.get(url).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                html_extractor.extract(soup, ID)
                return True
            except Exception:
                continue
    return False
def check_keywords(urls, ID):
    """Last resort: try PDFs from all URLs, best keyword score first."""
    ranking = [(urls[url]['Keyword'], url) for url in urls]
    ranking.sort(reverse=True)
    ranking = [el[1] for el in ranking]
    if check_pdf(urls, ID, ranking=ranking):
        return True
    # Log the failed ranking once per article.
    fails_file = FAILS.replace('.txt', '_keywords.txt')
    lines = []
    if os.path.exists(fails_file):
        with open(fails_file, 'r') as f:
            lines = f.readlines()
    line = ID + '\t' + str(ranking) + '\n'
    if line not in lines:
        with open(fails_file, 'a') as ff:
            ff.write(line)
    return False
def extractor(article):
    """Try the extraction strategies in order of reliability."""
    ID = article[0]
    urls = article[1]['Urls']
    if check_structure(urls, ID):
        return True
    elif check_pdf(urls, ID, abstract=True):
        return True
    elif check_abstract(urls, ID):
        return True
    elif check_keywords(urls, ID):
        return True
    else:
        return False
def iterate(data):
    de_en = open_json(DE_EN)
    for i, el in enumerate(data.items()):
        # Skip articles that already have a German/English pair.
        if el[0] in de_en:
            continue
        extracted = extractor(el)
        if not extracted:
            # Log each failed article ID once.
            lines = []
            if os.path.exists(FAILS):
                with open(FAILS, 'r') as f:
                    lines = f.readlines()
            line = el[0] + '\n'
            if line not in lines:
                with open(FAILS, 'a') as ff:
                    ff.write(line)
        if i % 100 == 0:
            print(f'{i} articles of {len(data)} extracted')
if __name__ == '__main__':
    data = open_json(INPUT)
    # pdf_dict is shared as a module-level global with check_pdf and
    # persisted back to disk after the run.
    with open(PDF_DICT, 'r') as f:
        pdf_dict = json.load(f)
    iterate(data)
    with open(PDF_DICT, 'w') as f:
        json.dump(pdf_dict, f)
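# Assumed usage: the ../output directory layout referenced by the constants
# above must already exist, and tika-python needs Java available (it starts a
# local Tika server on first use). The script is then run directly:
#
#   python3 <this_script>.py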