Commit 04ccfe0b authored by nwarslan

added extraction code

parent e9e507a3
File added
File added
File added
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 15 11:47:23 2019
@author: nadia
"""
import pickle
import os
import json
from bs4 import BeautifulSoup as bs
import nltk
# english
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# german
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
DE_SUM = '../de_spektrum_summaries/de_summaries.json'
OUT ='../output/extracted_articles/'#de_en_articles.pkl
def load_pkl(filename):
with open(filename, 'rb') as f:
data = pickle.load(f)
return data
def text_file(title, sections, ID, path):
if path == 'url':
OUTPUT = OUT+'url_extraction/'
elif path == 'pdf':
OUTPUT = OUT+'pdf_extraction/'
with open(OUTPUT+ID+'.txt', 'w') as f:
f.writelines(title+'\n\n\n')
for el in sections:
if type(el)==str:
f.writelines('\n\n'+el+'\n\n')
else:
for paragraph in el:
f.writelines(paragraph+'\n')
def html_file(soup, ID, path):
if path == 'url':
OUTPUT = OUT+'url_extraction/'
elif path == 'pdf':
OUTPUT = OUT+'pdf_extraction/'
with open(OUTPUT+ID+'.html', 'w') as f:
f.write(soup.prettify())
def split_into_sentences(section, de=False):
    # wrap each sentence in <S> ... </S> tags
    if de == False:
        sents = nltk.sent_tokenize(section)
    else:
        # PunktSentenceTokenizer has to be instantiated and used via tokenize();
        # passing the text to the constructor would only train the tokenizer
        sents = PunktSentenceTokenizer().tokenize(section)
    sec = ''
    for sent in sents:
        sec += '<S> '+sent+' </S> '
    return sec
def get_de_summaries():
    # DE_SUM points to a single json file; the summaries are collected from all
    # json files in its directory (assumption: the folder holds the split summary files)
    de_dir = os.path.dirname(DE_SUM)
    de_files = [f for f in os.listdir(de_dir) if f.endswith('.json')]
    summaries = {}
    for file in de_files:
        with open(os.path.join(de_dir, file), 'r') as f:
            d = json.load(f)
        data = {}
        for el in d['Id'].keys():
            ID = d['Id'][el]
            data[ID] = {}
            data[ID]['DeTitle'] = d['Title'][el]
            data[ID]['DeUnderTitle'] = d['UnderTitle'][el]
            data[ID]['DeTeaser'] = d['Teaser'][el]
            data[ID]['DeSummary'] = d['Summary'][el]
        summaries.update(data)
    # OUTPUT: {'Id': {'DeTitle', 'DeUnderTitle', 'DeTeaser', 'DeSummary'}}
    return summaries
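# Tagging scheme produced by tag_en_text / tag_de_text, e.g.:
# <ARTICLE><TITLE>...</TITLE><HEADING>...</HEADING>
#   <SECTION><S> first sentence </S> <S> second sentence </S></SECTION></ARTICLE>
# German summaries may additionally contain <UNDERTITLE> and <TEASER> elements.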
def tag_en_text(title, sections):
txt = '<ARTICLE><TITLE>'+title+'</TITLE>'
for el in sections:
if type(el)==str:
txt += '<HEADING>'+el+'</HEADING>'
else:
for paragraph in el:
txt += '<SECTION>'+split_into_sentences(paragraph)+'</SECTION>'
txt += '</ARTICLE>'
#print(txt)
return txt
def tag_de_text(summary):
txt = '<ARTICLE><TITLE>'+summary['DeTitle']+'</TITLE>'
if len(summary['DeUnderTitle'])>0:
txt += '<UNDERTITLE>'+summary['DeUnderTitle']+'</UNDERTITLE>'
if len(summary['DeTeaser'])>0:
txt += '<TEASER>'+summary['DeTeaser']+'</TEASER>'
sections = summary['DeSummary'].split('\n')
sections = [s for s in sections if s != '']
for paragraph in sections:
txt += '<SECTION>'+split_into_sentences(paragraph)+'</SECTION>'
txt += '</ARTICLE>'
#print(txt)
return txt
def load_json(filename):
with open(filename,'r') as f:
data = json.load(f)
return data
def data(title, sections, ID):
    # get the german summaries
    with open(DE_SUM, 'r') as f:
        summaries = json.load(f)
    #de_en_articles = load_pkl(OUT+'de_en_articles.pkl')
    # start from an empty collection if de_en_articles.json does not exist yet
    if os.path.exists(OUT+'de_en_articles.json'):
        de_en_articles = load_json(OUT+'de_en_articles.json')
    else:
        de_en_articles = {}
    en_tagged_text = tag_en_text(title, sections)
    with open(OUT+ID+'_de.txt', 'w') as f:
        f.writelines(summaries[ID]['DeTitle']+'\n\n'+summaries[ID]['DeSummary'])
    de_tagged_text = tag_de_text(summaries[ID])
    de_en_articles[ID] = {'De_Summary': de_tagged_text, 'En_Article': en_tagged_text}
    with open(OUT+'de_en_articles.json', 'w') as f:
        json.dump(de_en_articles, f)
"""
if __name__ == '__main__':
data={'Id':{},'De_Summary':{},'En_Article':{}}
with open(OUT, 'wb') as f:
pickle.dump(data,f)
"""
\ No newline at end of file
@@ -33,6 +33,8 @@ if __name__ == '__main__':
de_sum_sample = load_pkl(DE_SUM+files[0])
de_sum_big = load_pkl(DE_SUM+files[1])
de_sum_small = load_pkl(DE_SUM+files[2])
print(de_sum_sample['Teaser'][0])
#print(de_sum_sample['Title'][0]+'\n\n'+de_sum_sample['UnderTitle'][0]+'\n\n'+de_sum_sample['Summary'][0])#SummaryUnderTitle
#print(len(de_sum_small))
# get rated en urls
@@ -40,7 +42,7 @@ if __name__ == '__main__':
rated_sample = json_load(EN_RATED+files[1])
rated_big = json_load(EN_RATED+files[2])
rated_small = json_load(EN_RATED+files[0])
print(len(rated_sample))
#print(len(rated_sample))
#for k in de_sum_sample['Summary']:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 3 11:20:42 2019
@author: nadia
"""
import json
from bs4 import BeautifulSoup as bs
import requests
import re
import os
INPUT = '../output/extracted_articles/html_by_structure/new/'
NO_ABSTRACT = '../output/extracted_articles/html_by_structure/new/articles_without_abstract.txt'
OUTPUT = '../output/extracted_articles/html_by_structure/new/'
#FAIL = '../output/extracted_articles/html_by_structure/new/html_extraction_request_fails.txt'
#DONE = '../output/extracted_articles/html_by_structure/new/html_extracted.txt'
def open_html(filename):
with open(filename, 'r') as f:
data = f.read()
soup = bs(data, 'html.parser')
return soup
def filter_section(section):
    # non-greedy match so that only each superscript is removed, not everything
    # between the first <sup> and the last </sup>
    section = bs(re.sub('<sup>.+?</sup>', '', str(section)), 'html.parser')
    return section
def filter_soup(soup):
#new_soup = soup
for a in soup.find_all('a'):
a.clear()
#for p in soup.find_all('p'):
#if check_paragraph(p,soup.title.text.strip())==True:
#p.clear()
for li in soup.find_all('li'):
li.clear()
return soup
def get_article(soup, html):
title = soup.title.text.strip()
print(title)
soup = filter_soup(soup)
sections = []
start = False
abstract = False
for i in range(1,7):
h = 'h' + str(i)
heads = soup.find_all(h)
for j, head in enumerate(heads):
if 'references' in head.text.lower():
start = False
elif start == True:
txt = filter_section(head.parent)
sections.append((head.text, txt))
elif 'abstract' in head.text.lower():
txt = filter_section(head.parent)
sections.append((head.text, txt))
start = True
abstract = True
if abstract == False:
with open(NO_ABSTRACT,'a') as f:
f.writelines(html+'\n')
return sections
def write_article(title, sections, filename):
with open(filename, 'w') as f:
f.writelines('##TITLE '+ title + '\n\n')
for sec in sections:
f.writelines('##HEAD ' + sec[0]+'\n\n')
f.writelines('##TEXT ' +sec[1].text.replace(sec[0], '', 1)+'\n\n')#.prettify()
#f.writelines('\n##END\n')
if __name__ == '__main__':
structures = os.listdir(INPUT)
structures = [el for el in structures if not el.endswith('.txt')]
for structure in structures:
htmls = os.listdir(INPUT+structure)
for html in htmls:
soup = open_html(INPUT+structure+ '/' + html)
title = soup.title.text.strip()
sections = get_article(soup, html)
filename = OUTPUT+ structure +'/'+ html.replace('.html','.txt')
write_article(title, sections, filename)
\ No newline at end of file
@@ -5,15 +5,15 @@ Created on Tue Dec 3 10:10:54 2019
@author: nadia
"""
import html_extractor
import json
from bs4 import BeautifulSoup as bs
import requests
import re
#import os
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190304_142037_Sample.json'
INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_142441.json'#filtered_
INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190304_142037_Sample.json'
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_142441.json'#filtered_
#INPUT = '../output/spektrum_links_output/rated_Spektrum_Links_20190502_143151.json'#filtered
OUTPUT = '../output/extracted_articles/html_by_structure/new/'#'/home/nadia/Desktop/'
FAIL = '../output/extracted_articles/html_by_structure/new/html_extraction_request_fails.txt'
@@ -37,16 +37,18 @@ def select_urls(data):
try:
html_doc = requests.get(score[1]).text
soup = bs(html_doc, 'html.parser')
with open(OUTPUT+str(score[0])+'/'+i+'.html', 'w') as f:
f.write(soup.prettify())
with open(DONE, 'a') as f:
f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\t'+soup.title.text.strip()+'\n')
html_extractor.extract(soup, i)
#with open(OUTPUT+str(score[0])+'/'+i+'.html', 'w') as f:
#f.write(soup.prettify())
#with open(DONE, 'a') as f:
#f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\t'+soup.title.text.strip()+'\n')
#title = soup.title.text
#sections = get_article(soup)
#write_article(title, sections, str(score[0])+'/'+i)
except:
with open(FAIL, 'a') as f:
f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\n')
print('FAIL: ', el, score)
#with open(FAIL, 'a') as f:
#f.writelines(str(data[el]['Id'])+'\t'+str(score[0])+'\t'+url+'\n')
print(c,' of ',len(data.keys()), ' articles extracted')
c+=1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 9 09:20 2019
@author: nadia
"""
from bs4 import BeautifulSoup as bs
import re
import os
import article_to
OUTPUT = '../output/extracted_articles/url_extraction/'
def get_article(soup):
sections = []
start = False
heads = soup.find_all(re.compile('h[0-9]+'))
for j, head in enumerate(heads):
#print(head.text)
"""
TO BE IMPLEMENTED:
other stop words
(e.g.: "acknowledgements", "footnotes", "data accessibility", "authors' contributions","Funding")
"""
if 'references' in head.text.lower():
start = False
elif start == True or 'abstract' in head.text.lower() or 'introduction' in head.text.lower():
txt = filter_section(head.parent)
sections.append((head.text.strip(), txt))
start = True
return sections
def filter_section(section):
    # strip citation superscripts non-greedily so the text between them survives
    section = bs(re.sub('<sup>.+?</sup>', '', str(section)), 'html.parser')
    paragraphs = section.find_all('p')
    txt = ''
    for p in paragraphs:
        txt += p.text.strip() + '\n'
    # drop empty citation brackets that are left behind, e.g. '[ , ]' or '( ; )'
    txt = re.sub('[\[\(][,;\s]*[\]\)]','',txt)
    return txt
def filter_soup(soup):
for tag in soup.find_all(['a','li','table','figure']):
tag.clear()
for div in soup.find_all('div'):
if div.has_attr('class'):
if 'figure' in div['class']:
div.clear()
return soup
def write_article(title, sections, filename, soup):
with open(OUTPUT+filename+'.txt', 'w') as f:
f.writelines('##TITLE '+ title + '\n\n')
for sec in sections:
f.writelines('##HEAD ' + sec[0]+'\n\n')
f.writelines('##TEXT ' +sec[1].replace(sec[0], '', 1)+'\n\n')
#f.writelines('\n##END\n')
if filename+'.html' not in os.listdir(OUTPUT):
with open(OUTPUT+filename+'.html', 'w') as f:
f.write(soup.prettify())
#else: print('file exits')
"""
TO BE IMPLEMENTED:
add article text to Database ???
"""
def extract(soup, ID):
title = soup.title.text.strip()
filtered_soup = filter_soup(soup)
sections = get_article(filtered_soup)
article_to.text_file(title, sections, ID, 'url')
article_to.html_file(soup, ID, 'url')
article_to.data(title, sections, ID)
#write_article(title, sections, ID, soup)
"""
if __name__ == '__main__':
#url = 'https://www.nature.com/articles/s41598-018-22664-4'
#url = 'https://www.nature.com/articles/s41467-018-03465-9'
#url = 'http://rsbl.royalsocietypublishing.org/content/14/2/20170743'
#url = 'http://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2001663'
url = 'https://peerj.com/articles/4452/'
html_doc = requests.get(url).text
soup = bs(html_doc, 'html.parser')
extract(soup,'TEST')
"""
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 13 10:48:25 2019
@author: nadia
"""
import json
from bs4 import BeautifulSoup
import requests
from tika import parser as pdf_parser
import re
import html_extractor
import pdf_extractor
import os
INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links_testset.json'
FAILS = '../output/extracted_articles/extraction_fails.txt'
DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
def open_json(filename):
with open(filename, 'r') as f:
data = json.load(f)
return data
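# pdf_dict maps a pdf identifier (article ID + pdf link) to the numeric file name
# under which the downloaded pdf is stored; it is loaded from and written back to
# pdf_dict.json in the main block.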
def download_pdf(url, pdf_name, pdf_dict):
pdfs = os.listdir(OUTPUT)
if pdf_name not in pdf_dict:
file_name = str(len(pdf_dict))
pdf_dict[pdf_name] = file_name
else:
file_name = pdf_dict[pdf_name]
pdf_filename = OUTPUT+file_name+'.pdf'
if pdf_filename.replace(OUTPUT,'') in pdfs:
#print('PDF already exists')
return True
    try:
        # download and save the pdf file
        pdf_file = requests.get(url, allow_redirects=True)
        with open(pdf_filename, 'wb') as f:
            f.write(pdf_file.content)
        return True
    except:
        with open(DOWNLOAD_FAILS, 'a') as f:
            f.writelines(pdf_filename+'\t'+url+'\n')
        return False
def get_pdf_soup(filename, ID):
"""
if ID+'.html' in os.listdir('../output/extracted_articles/pdf_extraction/'):
with open('../output/extracted_articles/pdf_extraction/'+ID+'.html','r') as f:
html_doc = f.read()
soup = BeautifulSoup(html_doc, 'html.parser')
return soup
"""
#pdfreader=PyPDF2.PdfFileReader(open(filename,'rb'))
#pdf_count=pdfreader.numPages
#print(pdf_count)
sysxml = pdf_parser.from_file(filename, xmlContent=True)['content']
sysxml = re.sub(r"<p />","",sysxml)
sysxml = re.sub(r"<p>[\s]*\n</p>","",sysxml)
soup=BeautifulSoup(sysxml,'html.parser')
#print(soup.title.text)
return soup
def check_structure(urls, ID):
score = (0,'')
for url in urls.keys():
if int(urls[url]['Structure']) > score[0] and urls[url]['Abstract']==True:
score = (int(urls[url]['Structure']), url)
if score[0] < 4:
return False
else:
url = score[1]
try:
html_doc = requests.get(url).text
soup = BeautifulSoup(html_doc, 'html.parser')
html_extractor.extract(soup, ID)
return True
except:
#print(ID, url)
#with open(FAILS,'a') as f:
#f.writelines(ID+'\t'+url+'\n')
return False
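# check_pdf downloads candidate pdfs and accepts one if more than half of the word
# bigrams of its parsed title also occur in the url's title (see compare_titles below).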
def check_pdf(urls, ID, abstract=False,ranking=[]):
def compare_titles(title1, title2):
if len(title1.split()) < 5 or len(title2.split()) < 5:
return False
#if title1 == '' or title2 == '':
#return False
title1 = re.sub('\|.*$','',title1)
title2 = re.sub('[_:]','',title2)
title1 = re.sub('[_:]','',title1)
#title2 = re.sub('\n','\s',title2)
#title1 = re.sub('\n','\s',title1)
#print(title1)
#print(title2)
#if title1 in title2 or title2 in title1:
#return True
title2 = title2.split()
title3 = []
for i in range(len(title2)-2):
title3.append(title2[i]+' '+title2[i+1])
#print(title3)
i = 0
for bigram in title3:
if bigram in title1: i+=1
if i!=0 and i/len(title3)>0.5:
return True
return False
if abstract==True:
for url in urls:
if urls[url]['Abstract']==True and len(urls[url]['Pdfs'])>0:
url_title = urls[url]['En_title']
pdfs = urls[url]['Pdfs']
pdfs = check_path(pdfs, url)
for pdf in pdfs:
pdf_name = ID+pdf
if download_pdf(pdf,pdf_name, pdf_dict):
file_name = pdf_dict[pdf_name]
soup = get_pdf_soup(OUTPUT+file_name+'.pdf',ID)
pdf_title = soup.title.text.strip()
if compare_titles(url_title.lower(), pdf_title.lower()):
pdf_extractor.extract(soup, ID)
return True
else:
for url in ranking:
if len(urls[url]['Pdfs'])>0:
url_title = urls[url]['En_title']
pdfs = urls[url]['Pdfs']
pdfs = check_path(pdfs, url)
for pdf in pdfs:
pdf_name = ID+pdf
if download_pdf(pdf,pdf_name, pdf_dict):
file_name = pdf_dict[pdf_name]
soup = get_pdf_soup(OUTPUT+file_name+'.pdf', ID)
pdf_title = soup.title.text.strip()
if compare_titles(url_title.lower(), pdf_title.lower()):
pdf_extractor.extract(soup, ID)
return True
return False
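# check_path turns relative pdf links (starting with '/') into absolute urls by
# prepending the scheme and host of the page they were found on.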
def check_path(pdfs, url):
def merge_link(p, url):
url_split = [el for el in url.split('/') if len(el) != 0]
url_foot = '//'.join(url_split[:2])
p = url_foot + p
return p
p1 = [merge_link(p,url) for p in pdfs if p.startswith('/')]
p2 = [p for p in pdfs if not p.startswith('/')]
pdfs = p1 + p2
return pdfs
def check_abstract(urls,ID):
for url in urls:
if urls[url]['Abstract']==True:
try:
html_doc = requests.get(url).text
soup = BeautifulSoup(html_doc, 'html.parser')
html_extractor.extract(soup, ID)
return True
except:
#print(ID,url)
#with open(FAILS,'a') as f:
#f.writelines(ID+'\t'+url+'\n')
continue
return False
def check_keywords(urls, ID):
    # rank the urls by their keyword score and try their pdfs in that order
    ranking = []
    for url in urls:
        ranking.append((urls[url]['Keyword'], url))
    ranking.sort(reverse=True)
    ranking = [el[1] for el in ranking]
    # pass the ranking as keyword argument, not as the 'abstract' flag
    if check_pdf(urls, ID, ranking=ranking):
        return True
    else:
        keyword_fails = FAILS.replace('.txt', '_keywords.txt')
        lines = []
        if os.path.exists(keyword_fails):
            with open(keyword_fails, 'r') as f:
                lines = f.readlines()
        with open(keyword_fails, 'a') as ff:
            line = ID+'\t'+str(ranking)+'\n'
            if line not in lines:
                ff.writelines(line)
        return False
def extractor(article):
ID = article[0]
urls = article[1]['Urls']
if check_structure(urls, ID):
return True
elif check_pdf(urls, ID, abstract=True):
return True
elif check_abstract(urls,ID):
return True
elif check_keywords(urls, ID):
return True
else: return False
def iterate(data):
    for i, el in enumerate(data.items()):
        extracted = extractor(el)
        if extracted == False:
            # log the article ID once if no extraction route succeeded
            fails = []
            if os.path.exists(FAILS):
                with open(FAILS, 'r') as f:
                    fails = f.readlines()
            with open(FAILS, 'a') as ff:
                line = el[0]+'\n'
                if line not in fails:
                    ff.writelines(line)
        if i % 100 == 0:
            print(i, ' articles of ', len(data), ' extracted')
if __name__ == '__main__':
data = open_json(INPUT)
with open(PDF_DICT,'r') as f:
pdf_dict = json.load(f)
iterate(data)
#print(pdf_dict)
with open(PDF_DICT,'w') as f:
json.dump(pdf_dict,f)
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 4 18:28:59 2019
@author: nadia
"""
from bs4 import BeautifulSoup as bs
import re
import os
import nltk
import article_to
# english
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# german
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
OUTPUT = '../output/extracted_articles/pdf_extraction/'
#names_and_footnotes = re.compile('([A-Za-z\.\s]+([0-9]+,?)+\s?&?){3}([A-Za-z\s]+([0-9]+,?)+)*')
#names_and_footnotes = re.compile('([A-Za-z\.-]+\s?[A-Za-z\.-]*\s?[A-Za-z\.-]*([0-9],?)+\s?&?\s?){3}([A-Za-z\.-]+\s?[A-Za-z\.-]*\s?[A-Za-z\.-]*([0-9],?)+\s?&?\s?)*')
names_and_footnotes = re.compile('([A-Za-z\.\s-]+([0-9]+,)+\s?&?){3}([A-Za-z\s-]+([0-9]+,)+)*')
figure_pattern = re.compile('^[\s\)]*(Figure|Table|Fig\s?\.|Tab\s?\.)\s[0-9].*')
#title_pattern = re.compile('^\s*([aA]bstract|[iI]ntroduction|[cC]onclusions(\sand final remarks)?|[aA]uthor\s[cC]ontributions|[aA]dditional\s[iI]nformation|[mM]ethods|[rR]esults(\s[aA]nd\s[dD]iscussion)?|[dD]iscussion|[rR]eferences|[aA]cknowledgements)')
title_pattern = re.compile('^[0-9\.:\s]*([aA]bstract|[iI]ntroduction|[cC]onclusions(\sand final remarks)?|[aA]uthor\s[cC]ontributions|[aA]dditional\s[iI]nformation|([mM]aterials?\s[aA]nd\s)?[mM]ethods|[rR]esults(\s[aA]nd\s[dD]iscussion)?|[dD]iscussion|[rR]eferences|[aA]cknowledgements)')
headings = ['Abstract','Introduction','Conclusions','Author contributions','Additional Information','Methods','Material and methods','Results and Discussion','Results','Discussion','References','Acknowledgements']
def open_soup(filename):
with open(filename, 'r') as f:
html_doc = f.read()
soup = bs(html_doc, 'html.parser')
return soup
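# check_paragraph returns True if a <p> element should be removed from the pdf html:
# the document title, very short or single-line paragraphs, figure/table captions,
# author lists with footnote numbers, affiliation blocks, and paragraphs containing
# more digits than letters.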
def check_paragraph(p, article_title):
text = p.text
#print(text)
#print('#######')
# check if paragraph is document title
text_split = text.strip().replace('\n',' ')
if text_split.replace(' ','').lower().startswith(article_title.replace(' ','').lower()): #.replace(' ','').lower()
return True
# check if paragraph is beginning of a section
if re.search(title_pattern, text):
#print(re.search(title_pattern, text).group(1))
return False
if re.search(title_pattern, text.replace(' ','').lower()):
#print(re.search(title_pattern, text).group(1))
return False
#if re.sub('^[0-9.]+\s?','',p.text.strip()).strip() in headings:
#print(re.sub('^[0-9.]+\s?','',p.text.strip()).strip())
#return False
#text_no_numbers = re.sub('^[0-9.]+\s?','',p.text.strip())
#text_no_numbers = re.sub('[0-9]\.?\s?','',text)
#for title in headings:
#if text_no_numbers.strip().lower().startswith(title.lower()):
#print(title)
#return False
# check if paragraph is long enough
pure_text = re.sub('[0-9]+','',text)
text_words = pure_text.split()
if len(text_words) <= 15:
return True
if len(text.strip().split('\n')) == 1:
return True
#check '&' occurance
if text.count('&')>=3:
return True
# check '@' occurance
if text.count('@')>=3:
return True
# check if text is subscript of figure ore table
if re.match(figure_pattern, text):
return True
# check for certain expressions
#Received 15 October 2017; Accepted 16 January 2018
#,re.compile('[Rr]eceived:?\s?[0-9]{2}[A-Za-z]*[0-9]{4}.*[Aa]ccepted:?\s?[0-9]{2}[A-Za-z]*[0-9]{4}')
expressions = [re.compile('online version of this article'), re.compile('\(email:')]
for e in expressions:
if re.search(e,text.replace('\n',' ')) :
return True
# check for expressions like
#aUSDA Forest Service Northern Research Station, 160 Holdsworth Way, Amherst, MA 01003, USA bDepartment
#of Environmental Conservation, University of Massachusetts, 160 Holdsworth Way, Amherst, MA 01003, USA
#c Earth Systems Research Center, Institute for the Study of Earth, Oceans, and Space, University of
#New Hampshire, 8 College Road, Durham, NH 03824, USA d School of Life Sciences, P.O. Box 874501,
#Arizona State University, Tempe, AZ 85287-4501, USA
research_words = ['Research Station','Research Center','Institute','University','USA','School']
count = 0
for word in research_words:
count += text.lower().count(word.lower())
if count>=3:
return True
# check for names and footnotes pattern
text_flow = ''.join(text.split('\n'))
if re.search(names_and_footnotes, text_flow):
return True
# check if paragraph has more numbers than letters
letter = re.compile('[a-zA-Z]')
number = re.compile('[0-9]')
letters, numbers, something = 0, 0, 0
for el in text:
if re.match(letter, el):
letters += 1
elif re.match(number, el):
numbers += 1
else:
something += 1
if numbers > letters:
return True
#text_flow = re.sub('[0-9,-]+','',text_flow)
#text_flow = re.sub('\(.*\)','',text_flow)
#text_flow = re.sub('\s([A-Z]\.)+','',text_flow)
# check proportion commas and chars
lower_letter_pattern = re.compile('^(\s)*[a-z]')
if re.match(lower_letter_pattern, text_flow):
return False
commas = text_flow.count(',')
#chars = len([char for char in text_flow if char != ' '])
#if commas != 0:
#if (chars-commas)/commas < 100:
#return True
#sents = len(nltk.sent_tokenize(text))
#sents = len(text_flow.split('.'))#len(nltk.sent_tokenize(text_flow))#
#if sents*2.5 <= commas:
#return True
return False
def filter_soup(soup):
#new_soup = soup
#print(soup.prettify())
for a in soup.find_all('a'):
a.clear()
for p in soup.find_all('p'):
if check_paragraph(p,soup.title.text.strip())==True:
p.clear()
for li in soup.find_all('li'):
li.clear()
return soup
def save_soup(soup, filename):
filename = OUTPUT+filename+'.html'
with open(filename,'w') as f:
f.write(soup.prettify())
def split_into_sentences(section):
sents = nltk.sent_tokenize(section)
sec = ''
for sent in sents:
sec += '<S> '+sent+' </S> '
return sec
def check_for_heading(text):
if re.search(title_pattern, text):
#print(re.search(title_pattern, text).group(1))
head = re.search(title_pattern, text).group(1)
text = re.sub(title_pattern,'', text.strip())
return (head, text)
if re.search(title_pattern, text.replace(' ','').lower()):
#print(re.search(title_pattern, text.replace(' ','').lower()).group(1))
head = re.search(title_pattern, text.replace(' ','').lower()).group(1)
text = text.strip() #text[len(head):].strip()
return (head, text)
"""
for head in headings:
text_no_numbers = re.sub('^\s*[0-9]','',text)
if text_no_numbers.strip().lower().startswith(head.lower()):
text = re.sub(title_pattern,'',text_no_numbers.strip())
return (head, text)
"""
return False
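# get_article groups the remaining paragraphs into alternating headings and
# [paragraph] lists using check_for_heading; note that text after the last matched
# heading (usually References/Acknowledgements) is not appended to the result.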
def get_article(soup):
#tit = soup.title.text.strip()
#article = []
sections = []
heading = ''
subsections = []
for p in soup.find_all('p'):
text = p.text.strip()
#print(text)
#print('------')
if text == '':
continue
text =text.replace('-\n','')
text= text.replace('\n', ' ')
head_check = check_for_heading(text)
if head_check != False:
if len(subsections)>0:
if heading != '':
sections.append(heading)
sections.append(subsections)
subsections = []
heading = head_check[0]
subsections.append(head_check[1])
else:
subsections.append(text)
sections = merge_sections(sections)
return sections
def merge_sections(sections):
for n, el in enumerate(sections):
if type(el)==str:
continue
else:
merged_sections = []
i = 0
while i<len(el):
try:
if el[i].strip()[-1] != '.':
if re.search('^[a-z]',el[i+1].strip()):
merged_sections.append(el[i]+' '+el[i+1])
i += 2
else:
merged_sections.append(el[i])
i += 1
else:
merged_sections.append(el[i])
i += 1
except:
merged_sections.append(el[i])
i += 1
sections[n] = merged_sections
return sections
def article_2_file(article, filename):
with open(OUTPUT+filename+'.txt','w') as f:
for el in article:
f.writelines(el)#+'\n'
def extract(soup,ID):
article_to.html_file(soup, ID, 'pdf')
title = soup.title.text.strip()
filtered_soup = filter_soup(soup)
sections = get_article(filtered_soup)
article_to.text_file(title, sections, ID, 'pdf')
article_to.data(title, sections, ID)
#article_2_file(article, ID)
if __name__ == '__main__':
#soup = open_soup('../output/extracted_articles/pdf_extraction/1551668.html')
soup = open_soup('../output/old_stuff/pdfs/sample/1549083.html')
extract(soup, '1549083')
"""
files = os.listdir(INFILES)
files = [f for f in files if f.endswith('.html')]
#print('number of pdf extracted html files: ', len(files))
for i, file in enumerate(files):
file_name = file.replace('.html','')
soup = open_soup(INFILES+file)
filtered_soup = filter_soup(soup)
#save_soup(filtered_soup, file)
article = extract_article(filtered_soup)
article_2_file(article, file_name)
if i%100==0:
print(i,' of ',len(files), ' extracted')
"""
\ No newline at end of file