Commit 04ccfe0b authored by nwarslan

added extraction code

parent e9e507a3
File added
File added
File added
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 15 11:47:23 2019
@author: nadia
"""
import pickle
import os
import json
from bs4 import BeautifulSoup as bs
import nltk
# english
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# german
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
DE_SUM = '../de_spektrum_summaries/de_summaries.json'
OUT ='../output/extracted_articles/'#de_en_articles.pkl
def load_pkl(filename):
with open(filename, 'rb') as f:
data = pickle.load(f)
return data
def text_file(title, sections, ID, path):
if path == 'url':
OUTPUT = OUT+'url_extraction/'
elif path == 'pdf':
OUTPUT = OUT+'pdf_extraction/'
with open(OUTPUT+ID+'.txt', 'w') as f:
f.writelines(title+'\n\n\n')
for el in sections:
if type(el)==str:
f.writelines('\n\n'+el+'\n\n')
else:
for paragraph in el:
f.writelines(paragraph+'\n')
def html_file(soup, ID, path):
if path == 'url':
OUTPUT = OUT+'url_extraction/'
elif path == 'pdf':
OUTPUT = OUT+'pdf_extraction/'
with open(OUTPUT+ID+'.html', 'w') as f:
f.write(soup.prettify())
def split_into_sentences(section, de=False):
    # wrap each sentence in <S> ... </S> tags
    if de == False:
        sents = nltk.sent_tokenize(section)
    else:
        # PunktSentenceTokenizer has to be instantiated and used via tokenize();
        # passing the text to the constructor would only train the tokenizer
        sents = PunktSentenceTokenizer().tokenize(section)
    sec = ''
    for sent in sents:
        sec += '<S> '+sent+' </S> '
    return sec
def get_de_summaries():
    # DE_SUM points to a single json file; the summaries are collected from all
    # json files in its directory (assumption: the folder holds the split summary files)
    de_dir = os.path.dirname(DE_SUM)
    de_files = [f for f in os.listdir(de_dir) if f.endswith('.json')]
    summaries = {}
    for file in de_files:
        with open(os.path.join(de_dir, file), 'r') as f:
            d = json.load(f)
        data = {}
        for el in d['Id'].keys():
            ID = d['Id'][el]
            data[ID] = {}
            data[ID]['DeTitle'] = d['Title'][el]
            data[ID]['DeUnderTitle'] = d['UnderTitle'][el]
            data[ID]['DeTeaser'] = d['Teaser'][el]
            data[ID]['DeSummary'] = d['Summary'][el]
        summaries.update(data)
    # OUTPUT: {'Id': {'DeTitle', 'DeUnderTitle', 'DeTeaser', 'DeSummary'}}
    return summaries
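# Tagging scheme produced by tag_en_text / tag_de_text, e.g.:
# <ARTICLE><TITLE>...</TITLE><HEADING>...</HEADING>
#   <SECTION><S> first sentence </S> <S> second sentence </S></SECTION></ARTICLE>
# German summaries may additionally contain <UNDERTITLE> and <TEASER> elements.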
def tag_en_text(title, sections):
txt = '<ARTICLE><TITLE>'+title+'</TITLE>'
for el in sections:
if type(el)==str:
txt += '<HEADING>'+el+'</HEADING>'
else:
for paragraph in el:
txt += '<SECTION>'+split_into_sentences(paragraph)+'</SECTION>'
txt += '</ARTICLE>'
#print(txt)
return txt
def tag_de_text(summary):
txt = '<ARTICLE><TITLE>'+summary['DeTitle']+'</TITLE>'
if len(summary['DeUnderTitle'])>0:
txt += '<UNDERTITLE>'+summary['DeUnderTitle']+'</UNDERTITLE>'
if len(summary['DeTeaser'])>0:
txt += '<TEASER>'+summary['DeTeaser']+'</TEASER>'
sections = summary['DeSummary'].split('\n')
sections = [s for s in sections if s != '']
for paragraph in sections:
txt += '<SECTION>'+split_into_sentences(paragraph)+'</SECTION>'
txt += '</ARTICLE>'
#print(txt)
return txt
def load_json(filename):
with open(filename,'r') as f:
data = json.load(f)
return data
def data(title, sections, ID):
    # get the german summaries
    with open(DE_SUM, 'r') as f:
        summaries = json.load(f)
    #de_en_articles = load_pkl(OUT+'de_en_articles.pkl')
    # start from an empty collection if de_en_articles.json does not exist yet
    if os.path.exists(OUT+'de_en_articles.json'):
        de_en_articles = load_json(OUT+'de_en_articles.json')
    else:
        de_en_articles = {}
    en_tagged_text = tag_en_text(title, sections)
    with open(OUT+ID+'_de.txt', 'w') as f:
        f.writelines(summaries[ID]['DeTitle']+'\n\n'+summaries[ID]['DeSummary'])
    de_tagged_text = tag_de_text(summaries[ID])
    de_en_articles[ID] = {'De_Summary': de_tagged_text, 'En_Article': en_tagged_text}
    with open(OUT+'de_en_articles.json', 'w') as f:
        json.dump(de_en_articles, f)
"""
if __name__ == '__main__':
data={'Id':{},'De_Summary':{},'En_Article':{}}
with open(OUT, 'wb') as f:
pickle.dump(data,f)
"""
\ No newline at end of file
@@ -33,6 +33,8 @@ if __name__ == '__main__':
de_sum_sample = load_pkl(DE_SUM+files[0])
de_sum_big = load_pkl(DE_SUM+files[1])
de_sum_small = load_pkl(DE_SUM+files[2])
print(de_sum_sample['Teaser'][0])
#print(de_sum_sample['Title'][0]+'\n\n'+de_sum_sample['UnderTitle'][0]+'\n\n'+de_sum_sample['Summary'][0])#SummaryUnderTitle
#print(len(de_sum_small))
# get rated en urls
@@ -40,7 +42,7 @@ if __name__ == '__main__':
rated_sample = json_load(EN_RATED+files[1])
rated_big = json_load(EN_RATED+files[2])
rated_small = json_load(EN_RATED+files[0])
print(len(rated_sample))
#print(len(rated_sample))
#for k in de_sum_sample['Summary']:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 3 11:20:42 2019
@author: nadia
"""
import json
from bs4 import BeautifulSoup as bs
import requests
import re
import os
INPUT = '../output/extracted_articles/html_by_structure/new/'
NO_ABSTRACT = '../output/extracted_articles/html_by_structure/new/articles_without_abstract.txt'
OUTPUT = '../output/extracted_articles/html_by_structure/new/'
#FAIL = '../output/extracted_articles/html_by_structure/new/html_extraction_request_fails.txt'
#DONE = '../output/extracted_articles/html_by_structure/new/html_extracted.txt'
def open_html(filename):
with open(filename, 'r') as f:
data = f.read()
soup = bs(data, 'html.parser')
return soup
def filter_section(section):
    # non-greedy match so that only each superscript is removed, not everything
    # between the first <sup> and the last </sup>
    section = bs(re.sub('<sup>.+?</sup>', '', str(section)), 'html.parser')
    return section
def filter_soup(soup):
#new_soup = soup
for a in soup.find_all('a'):
a.clear()
#for p in soup.find_all('p'):
#if check_paragraph(p,soup.title.text.strip())==True:
#p.clear()
for li in soup.find_all('li'):
li.clear()
return soup
def get_article(soup, html):
title = soup.title.text.strip()
print(title)
soup = filter_soup(soup)
sections = []
start = False
abstract = False
for i in range(1,7):
h = 'h' + str(i)
heads = soup.find_all(h)
for j, head in enumerate(heads):
if 'references' in head.text.lower():
start = False
elif start == True:
txt = filter_section(head.parent)
sections.append((head.text, txt))
elif 'abstract' in head.text.lower():
txt = filter_section(head.parent)
sections.append((head.text, txt))
start = True
abstract = True
if abstract == False:
with open(NO_ABSTRACT,'a') as f:
f.writelines(html+'\n')
return sections
def write_article(title, sections, filename):
with open(filename, 'w') as f:
f.writelines('##TITLE '+ title + '\n\n')
for sec in sections:
f.writelines('##HEAD ' + sec[0]+'\n\n')
f.writelines('##TEXT ' +sec[1].text.replace(sec[0], '', 1)+'\n\n')#.prettify()
#f.writelines('\n##END\n')
if __name__ == '__main__':
structures = os.listdir(INPUT)
structures = [el for el in structures if not el.endswith('.txt')]
for structure in structures:
htmls = os.listdir(INPUT+structure)
for html in htmls:
soup = open_html(INPUT+structure+ '/' + html)
title = soup.title.text.strip()
sections = get_article(soup, html)
filename = OUTPUT+ structure +'/'+ html.replace('.html','.txt')
write_article(title, sections, filename)
\ No newline at end of file
@@ -5,15 +5,15 @@ Created on Tue Dec 3 10:10:54 2019
@author: nadia
"""
import html_extractor
import json
from bs4 import BeautifulSoup as bs
import requests
import re
#import os
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190304_142037_Sample.json'
INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_142441.json'#filtered_
INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190304_142037_Sample.json'
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_142441.json'#filtered_
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_143151.json'#filtered
OUTPUT = '../output/extracted_articles/html_by_structure/new/'#'/home/nadia/Desktop/'
FAIL = '../output/extracted_articles/html_by_structure/new/html_extraction_request_fails.txt'
@@ -37,16 +37,18 @@ def select_urls(data):
try:
html_doc = requests.get(score[1]).text
soup = bs(html_doc, 'html.parser')
with open(OUTPUT+str(score[0])+'/'+i+'.html', 'w') as f:
f.write(soup.prettify())
with open(DONE, 'a') as f:
f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\t'+soup.title.text.strip()+'\n')
html_extractor.extract(soup, i)
#with open(OUTPUT+str(score[0])+'/'+i+'.html', 'w') as f:
#f.write(soup.prettify())
#with open(DONE, 'a') as f:
#f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\t'+soup.title.text.strip()+'\n')
#title = soup.title.text
#sections = get_article(soup)
#write_article(title, sections, str(score[0])+'/'+i)
except:
with open(FAIL, 'a') as f:
f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\n')
print('FAIL: ', el, score)
#with open(FAIL, 'a') as f:
#f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\n')
print(c,' of ',len(data.keys()), ' articles extracted')
c+=1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 9 09:20 2019
@author: nadia
"""
from bs4 import BeautifulSoup as bs
import re
import os
import article_to
OUTPUT = '../output/extracted_articles/url_extraction/'
def get_article(soup):
sections = []
start = False
heads = soup.find_all(re.compile('h[0-9]+'))
for j, head in enumerate(heads):
#print(head.text)
"""
TO BE IMPLEMENTED:
other stop words
(e.g.: "acknowledgements", "footnotes", "data accessibility", "authors' contributions","Funding")
"""
if 'references' in head.text.lower():
start = False
elif start == True or 'abstract' in head.text.lower() or 'introduction' in head.text.lower():
txt = filter_section(head.parent)
sections.append((head.text.strip(), txt))
start = True
return sections
def filter_section(section):
    # strip citation superscripts non-greedily so the text between them survives
    section = bs(re.sub('<sup>.+?</sup>', '', str(section)), 'html.parser')
    paragraphs = section.find_all('p')
    txt = ''
    for p in paragraphs:
        txt += p.text.strip() + '\n'
    # drop empty citation brackets that are left behind, e.g. '[ , ]' or '( ; )'
    txt = re.sub('[\[\(][,;\s]*[\]\)]','',txt)
    return txt
def filter_soup(soup):
for tag in soup.find_all(['a','li','table','figure']):
tag.clear()
for div in soup.find_all('div'):
if div.has_attr('class'):
if 'figure' in div['class']:
div.clear()
return soup
def write_article(title, sections, filename, soup):
with open(OUTPUT+filename+'.txt', 'w') as f:
f.writelines('##TITLE '+ title + '\n\n')
for sec in sections:
f.writelines('##HEAD ' + sec[0]+'\n\n')
f.writelines('##TEXT ' +sec[1].replace(sec[0], '', 1)+'\n\n')
#f.writelines('\n##END\n')
if filename+'.html' not in os.listdir(OUTPUT):
with open(OUTPUT+filename+'.html', 'w') as f:
f.write(soup.prettify())
#else: print('file exits')
"""
TO BE IMPLEMENTED:
add article text to Database ???
"""
def extract(soup, ID):
title = soup.title.text.strip()
filtered_soup = filter_soup(soup)
sections = get_article(filtered_soup)
article_to.text_file(title, sections, ID, 'url')
article_to.html_file(soup, ID, 'url')
article_to.data(title, sections, ID)
#write_article(title, sections, ID, soup)
"""
if __name__ == '__main__':
#url = 'https://www.nature.com/articles/s41598-018-22664-4'
#url = 'https://www.nature.com/articles/s41467-018-03465-9'
#url = 'http://rsbl.royalsocietypublishing.org/content/14/2/20170743'
#url = 'http://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001663'
url = 'https://peerj.com/articles/4452/'
html_doc = requests.get(url).text
soup = bs(html_doc, 'html.parser')
extract(soup,'TEST')
"""
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 13 10:48:25 2019
@author: nadia
"""
import json
from bs4 import BeautifulSoup
import requests
from tika import parser as pdf_parser
import re
import html_extractor
import pdf_extractor
import os
INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links_testset.json'
FAILS = '../output/extracted_articles/extraction_fails.txt'
DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
def open_json(filename):
with open(filename, 'r') as f:
data = json.load(f)
return data
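# pdf_dict maps a pdf identifier (article ID + pdf link) to the numeric file name
# under which the downloaded pdf is stored; it is loaded from and written back to
# pdf_dict.json in the main block.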
def download_pdf(url, pdf_name, pdf_dict):
pdfs = os.listdir(OUTPUT)
if pdf_name not in pdf_dict:
file_name = str(len(pdf_dict))
pdf_dict[pdf_name] = file_name
else:
file_name = pdf_dict[pdf_name]
pdf_filename = OUTPUT+file_name+'.pdf'
if pdf_filename.replace(OUTPUT,'') in pdfs:
#print('PDF already exists')
return True
    try:
        # download and save the pdf file
        pdf_file = requests.get(url, allow_redirects=True)
        with open(pdf_filename, 'wb') as f:
            f.write(pdf_file.content)
        return True
    except:
        with open(DOWNLOAD_FAILS, 'a') as f:
            f.writelines(pdf_filename+'\t'+url+'\n')
        return False
def get_pdf_soup(filename, ID):
"""
if ID+'.html' in os.listdir('../output/extracted_articles/pdf_extraction/'):
with open('../output/extracted_articles/pdf_extraction/'+ID+'.html','r') as f:
html_doc = f.read()
soup = BeautifulSoup(html_doc, 'html.parser')
return soup
"""
#pdfreader=PyPDF2.PdfFileReader(open(filename,'rb'))
#pdf_count=pdfreader.numPages
#print(pdf_count)
sysxml = pdf_parser.from_file(filename, xmlContent=True)['content']
sysxml = re.sub(r"<p />","",sysxml)
sysxml = re.sub(r"<p>[\s]*\n</p>","",sysxml)
soup=BeautifulSoup(sysxml,'html.parser')
#print(soup.title.text)
return soup
def check_structure(urls, ID):
score = (0,'')
for url in urls.keys():
if int(urls[url]['Structure']) > score[0] and urls[url]['Abstract']==True:
score = (int(urls[url]['Structure']), url)
if score[0] < 4:
return False
else:
url = score[1]
try:
html_doc = requests.get(url).text
soup = BeautifulSoup(html_doc, 'html.parser')
html_extractor.extract(soup, ID)
return True
except:
#print(ID, url)
#with open(FAILS,'a') as f:
#f.writelines(ID+'\t'+url+'\n')
return False
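# check_pdf downloads candidate pdfs and accepts one if more than half of the word
# bigrams of its parsed title also occur in the url's title (see compare_titles below).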
def check_pdf(urls, ID, abstract=False,ranking=[]):
def compare_titles(title1, title2):
if len(title1.split()) < 5 or len(title2.split()) < 5:
return False
#if title1 == '' or title2 == '':
#return False
title1 = re.sub('\|.*$','',title1)
title2 = re.sub('[_:]','',title2)
title1 = re.sub('[_:]','',title1)
#title2 = re.sub('\n','\s',title2)
#title1 = re.sub('\n','\s',title1)
#print(title1)
#print(title2)
#if title1 in title2 or title2 in title1:
#return True
title2 = title2.split()
title3 = []
for i in range(len(title2)-2):
title3.append(title2[i]+' '+title2[i+1])
#print(title3)
i = 0
for bigram in title3:
if bigram in title1: i+=1
if i!=0 and i/len(title3)>0.5:
return True
return False
if abstract==True:
for url in urls:
if urls[url]['Abstract']==True and len(urls[url]['Pdfs'])>0:
url_title = urls[url]['En_title']
pdfs = urls[url]['Pdfs']
pdfs = check_path(pdfs, url)
for pdf in pdfs:
pdf_name = ID+pdf
if download_pdf(pdf,pdf_name, pdf_dict):
file_name = pdf_dict[pdf_name]
soup = get_pdf_soup(OUTPUT+file_name+'.pdf',ID)
pdf_title = soup.title.text.strip()
if compare_titles(url_title.lower(), pdf_title.lower()):
pdf_extractor.extract(soup, ID)
return True
else:
for url in ranking:
if len(urls[url]['Pdfs'])>0:
url_title = urls[url]['En_title']
pdfs = urls[url]['Pdfs']
pdfs = check_path(pdfs, url)
for pdf in pdfs:
pdf_name = ID+pdf
if download_pdf(pdf,pdf_name, pdf_dict):
file_name = pdf_dict[pdf_name]
soup = get_pdf_soup(OUTPUT+file_name+'.pdf', ID)
pdf_title = soup.title.text.strip()
if compare_titles(url_title.lower(), pdf_title.lower()):
pdf_extractor.extract(soup, ID)
return True
return False
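# check_path turns relative pdf links (starting with '/') into absolute urls by
# prepending the scheme and host of the page they were found on.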
def check_path(pdfs, url):
def merge_link(p, url):
url_split = [el for el in url.split('/') if len(el) != 0]
url_foot = '//'.join(url_split[:2])
p = url_foot + p
return p
p1 = [merge_link(p,url) for p in pdfs if p.startswith('/')]
p2 = [p for p in pdfs if not p.startswith('/')]
pdfs = p1 + p2
return pdfs
def check_abstract(urls,ID):
for url in urls:
if urls[url]['Abstract']==True:
try:
html_doc = requests.get(url).text
soup = BeautifulSoup(html_doc, 'html.parser')
html_extractor.extract(soup, ID)
return True
except:
#print(ID,url)
#with open(FAILS,'a') as f:
#f.writelines(ID+'\t'+url+'\n')
continue
return False
def check_keywords(urls, ID):
    # rank the urls by their keyword score and try their pdfs in that order
    ranking = []
    for url in urls:
        ranking.append((urls[url]['Keyword'], url))
    ranking.sort(reverse=True)
    ranking = [el[1] for el in ranking]
    # pass the ranking as keyword argument, not as the 'abstract' flag
    if check_pdf(urls, ID, ranking=ranking):
        return True
    else:
        keyword_fails = FAILS.replace('.txt', '_keywords.txt')
        lines = []
        if os.path.exists(keyword_fails):
            with open(keyword_fails, 'r') as f:
                lines = f.readlines()
        with open(keyword_fails, 'a') as ff:
            line = ID+'\t'+str(ranking)+'\n'
            if line not in lines:
                ff.writelines(line)
        return False
def extractor(article):
ID = article[0]
urls = article[1]['Urls']
if check_structure(urls, ID):
return True
elif check_pdf(urls, ID, abstract=True):
return True
elif check_abstract(urls,ID):
return True
elif check_keywords(urls, ID):
return True
else: return False
def iterate(data):
    for i, el in enumerate(data.items()):
        extracted = extractor(el)
        if extracted == False:
            # log the article ID once if no extraction route succeeded
            fails = []
            if os.path.exists(FAILS):
                with open(FAILS, 'r') as f:
                    fails = f.readlines()
            with open(FAILS, 'a') as ff:
                line = el[0]+'\n'
                if line not in fails:
                    ff.writelines(line)
        if i % 100 == 0:
            print(i, ' articles of ', len(data), ' extracted')
if __name__ == '__main__':
data = open_json(INPUT)
with open(PDF_DICT,'r') as f:
pdf_dict = json.load(f)
iterate(data)
#print(pdf_dict)
with open(PDF_DICT,'w') as f:
json.dump(pdf_dict,f)
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 18:28:59 2019
@author: nadia
"""
from bs4 import BeautifulSoup as bs
import re
import os
import nltk
import article_to
# english
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# german
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
OUTPUT = '../output/extracted_articles/pdf_extraction/'
#names_and_footnotes = re.compile('([A-Za-z\.\s]+([0-9]+,?)+\s?&?){3}([A-Za-z\s]+([0-9]+,?)+)*')
#names_and_footnotes = re.compile('([A-Za-z\.-]+\s?[A-Za-z\.-]*\s?[A-Za-z\.-]*([0-9],?)+\s?&?\s?){3}([A-Za-z\.-]+\s?[A-Za-z\.-]*\s?[A-Za-z\.-]*([0-9],?)+\s?&?\s?)*')
names_and_footnotes = re.compile('([A-Za-z\.\s-]+([0-9]+,)+\s?&?){3}([A-Za-z\s-]+([0-9]+,)+)*')
figure_pattern = re.compile('^[\s\)]*(Figure|Table|Fig\s?\.|Tab\s?\.)\s[0-9].*')
#title_pattern = re.compile('^\s*([aA]bstract|[iI]ntroduction|[cC]onclusions(\sand final remarks)?|[aA]uthor\s[cC]ontributions|[aA]dditional\s[iI]nformation|[mM]ethods|[rR]esults(\s[aA]nd\s[dD]iscussion)?|[dD]iscussion|[rR]eferences|[aA]cknowledgements)')
title_pattern = re.compile('^[0-9\.:\s]*([aA]bstract|[iI]ntroduction|[cC]onclusions(\sand final remarks)?|[aA]uthor\s[cC]ontributions|[aA]dditional\s[iI]nformation|([mM]aterials?\s[aA]nd\s)?[mM]ethods|[rR]esults(\s[aA]nd\s[dD]iscussion)?|[dD]iscussion|[rR]eferences|[aA]cknowledgements)')
headings = ['Abstract','Introduction','Conclusions','Author contributions','Additional Information','Methods','Material and methods','Results and Discussion','Results','Discussion','References','Acknowledgements']
def open_soup(filename):
with open(filename, 'r') as f:
html_doc = f.read()
soup = bs(html_doc, 'html.parser')
return soup
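# check_paragraph returns True if a <p> element should be removed from the pdf html:
# the document title, very short or single-line paragraphs, figure/table captions,
# author lists with footnote numbers, affiliation blocks, and paragraphs containing
# more digits than letters.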
def check_paragraph(p, article_title):
text = p.text
#print(text)
#print('#######')
# check if paragraph is document title
text_split = text.strip().replace('\n',' ')
if text_split.replace(' ','').lower().startswith(article_title.replace(' ','').lower()): #.replace(' ','').lower()
return True
# check if paragraph is beginning of a section
if re.search(title_pattern, text):
#print(re.search(title_pattern, text).group(1))
return False
if re.search(title_pattern, text.replace(' ','').lower()):
#print(re.search(title_pattern, text).group(1))
return False
#if re.sub('^[0-9.]+\s?','',p.text.strip()).strip() in headings:
#print(re.sub('^[0-9.]+\s?','',p.text.strip()).strip())
#return False
#text_no_numbers = re.sub('^[0-9.]+\s?','',p.text.strip())
#text_no_numbers = re.sub('[0-9]\.?\s?','',text)
#for title in headings:
#if text_no_numbers.strip().lower().startswith(title.lower()):
#print(title)
#return False
# check if paragraph is long enough
pure_text = re.sub('[0-9]+','',text)
text_words = pure_text.split()
if len(text_words) <= 15:
return True
if len(text.strip().split('\n')) == 1:
return True
#check '&' occurance
if text.count('&')>=3:
return True
# check '@' occurance
if text.count('@')>=3:
return True
# check if text is subscript of figure ore table
if re.match(figure_pattern, text):
return True
# check for certain expressions
#Received 15 October 2017; Accepted 16 January 2018
#,re.compile('[Rr]eceived:?\s?[0-9]{2}[A-Za-z]*[0-9]{4}.*[Aa]ccepted:?\s?[0-9]{2}[A-Za-z]*[0-9]{4}')
expressions = [re.compile('online version of this article'), re.compile('\(email:')]
for e in expressions:
if re.search(e,text.replace('\n',' ')) :
return True
# check for expressions like
#aUSDA Forest Service Northern Research Station, 160 Holdsworth Way, Amherst, MA 01003, USA bDepartment
#of Environmental Conservation, University of Massachusetts, 160 Holdsworth Way, Amherst, MA 01003, USA
#c Earth Systems Research Center, Institute for the Study of Earth, Oceans, and Space, University of
#New Hampshire, 8 College Road, Durham, NH 03824, USA d School of Life Sciences, P.O. Box 874501,
#Arizona State University, Tempe, AZ 85287-4501, USA
research_words = ['Research Station','Research Center','Institute','University','USA','School']
count = 0
for word in research_words:
count += text.lower().count(word.lower())
if count>=3:
return True
# check for names and footnotes pattern
text_flow = ''.join(text.split('\n'))
if re.search(names_and_footnotes, text_flow):
return True
# check if paragraph has more numbers than letters
letter = re.compile('[a-zA-Z]')
number = re.compile('[0-9]')
letters, numbers, something = 0, 0, 0
for el in text:
if re.match(letter, el):
letters += 1
elif re.match(number, el):
numbers += 1
else:
something += 1
if numbers > letters:
return True
#text_flow = re.sub('[0-9,-]+','',text_flow)
#text_flow = re.sub('\(.*\)','',text_flow)
#text_flow = re.sub('\s([A-Z]\.)+','',text_flow)
# check proportion commas and chars
lower_letter_pattern = re.compile('^(\s)*[a-z]')
if re.match(lower_letter_pattern, text_flow):
return False
commas = text_flow.count(',')
#chars = len([char for char in text_flow if char != ' '])
#if commas != 0:
#if (chars-commas)/commas < 100:
#return True
#sents = len(nltk.sent_tokenize(text))
#sents = len(text_flow.split('.'))#len(nltk.sent_tokenize(text_flow))#
#if sents*2.5 <= commas:
#return True
return False
def filter_soup(soup):
#new_soup = soup
#print(soup.prettify())
for a in soup.find_all('a'):
a.clear()
for p in soup.find_all('p'):
if check_paragraph(p,soup.title.text.strip())==True:
p.clear()
for li in soup.find_all('li'):
li.clear()
return soup
def save_soup(soup, filename):
filename = OUTPUT+filename+'.html'
with open(filename,'w') as f:
f.write(soup.prettify())
def split_into_sentences(section):
sents = nltk.sent_tokenize(section)
sec = ''
for sent in sents:
sec += '<S> '+sent+' </S> '
return sec
def check_for_heading(text):
if re.search(title_pattern, text):
#print(re.search(title_pattern, text).group(1))
head = re.search(title_pattern, text).group(1)
text = re.sub(title_pattern,'', text.strip())
return (head, text)
if re.search(title_pattern, text.replace(' ','').lower()):
#print(re.search(title_pattern, text.replace(' ','').lower()).group(1))
head = re.search(title_pattern, text.replace(' ','').lower()).group(1)
text = text.strip() #text[len(head):].strip()
return (head, text)
"""
for head in headings:
text_no_numbers = re.sub('^\s*[0-9]','',text)
if text_no_numbers.strip().lower().startswith(head.lower()):
text = re.sub(title_pattern,'',text_no_numbers.strip())
return (head, text)
"""
return False
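# get_article groups the remaining paragraphs into alternating headings and
# [paragraph] lists using check_for_heading; note that text after the last matched
# heading (usually References/Acknowledgements) is not appended to the result.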
def get_article(soup):
#tit = soup.title.text.strip()
#article = []
sections = []
heading = ''
subsections = []
for p in soup.find_all('p'):
text = p.text.strip()
#print(text)
#print('------')
if text == '':
continue
text =text.replace('-\n','')
text= text.replace('\n', ' ')
head_check = check_for_heading(text)
if head_check != False:
if len(subsections)>0:
if heading != '':
sections.append(heading)
sections.append(subsections)
subsections = []
heading = head_check[0]
subsections.append(head_check[1])
else:
subsections.append(text)
sections = merge_sections(sections)
return sections
def merge_sections(sections):
for n, el in enumerate(sections):
if type(el)==str:
continue
else:
merged_sections = []
i = 0
while i<len(el):
try:
if el[i].strip()[-1] != '.':
if re.search('^[a-z]',el[i+1].strip()):
merged_sections.append(el[i]+' '+el[i+1])
i += 2
else:
merged_sections.append(el[i])
i += 1
else:
merged_sections.append(el[i])
i += 1
except:
merged_sections.append(el[i])
i += 1
sections[n] = merged_sections
return sections
def article_2_file(article, filename):
with open(OUTPUT+filename+'.txt','w') as f:
for el in article:
f.writelines(el)#+'\n'
def extract(soup,ID):
article_to.html_file(soup, ID, 'pdf')
title = soup.title.text.strip()
filtered_soup = filter_soup(soup)
sections = get_article(filtered_soup)
article_to.text_file(title, sections, ID, 'pdf')
article_to.data(title, sections, ID)
#article_2_file(article, ID)
if __name__ == '__main__':
#soup = open_soup('../output/extracted_articles/pdf_extraction/1551668.html')
soup = open_soup('../output/old_stuff/pdfs/sample/1549083.html')
extract(soup, '1549083')
"""
files = os.listdir(INFILES)
files = [f for f in files if f.endswith('.html')]
#print('number of pdf extracted html files: ', len(files))
for i, file in enumerate(files):
file_name = file.replace('.html','')
soup = open_soup(INFILES+file)
filtered_soup = filter_soup(soup)
#save_soup(filtered_soup, file)
article = extract_article(filtered_soup)
article_2_file(article, file_name)
if i%100==0:
print(i,' of ',len(files), ' extracted')
"""
\ No newline at end of file