Commit fb947d77 authored by Nadia Arslan
code
parent e04c9e6a
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 15 11:47:23 2019
@author: nadia
"""
import pickle
import os
import json
from bs4 import BeautifulSoup as bs
import nltk
# english
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# german
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import PunktSentenceTokenizer
DE_SUM = '../data/de_spektrum_summaries/de_summaries.json'
OUT = '../output/extracted_articles/'  # de_en_articles.pkl

def load_pkl(filename):
    """Load a pickled object from disk."""
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    return data

def text_file(title, sections, ID, path):
    """Write an article (title plus sections) to a plain-text file."""
    if path == 'url':
        OUTPUT = OUT + 'url_extraction/'
    elif path == 'pdf':
        OUTPUT = OUT + 'pdf_extraction/'
    else:
        raise ValueError('unknown extraction path: ' + path)
    with open(OUTPUT + ID + '.txt', 'w') as f:
        f.write(title + '\n\n\n')
        for el in sections:
            if isinstance(el, str):  # a heading
                f.write('\n\n' + el + '\n\n')
            else:  # a list of paragraph strings
                for paragraph in el:
                    f.write(paragraph + '\n')

def html_file(soup, ID, path):
    """Write a parsed BeautifulSoup document to an HTML file."""
    if path == 'url':
        OUTPUT = OUT + 'url_extraction/'
    elif path == 'pdf':
        OUTPUT = OUT + 'pdf_extraction/'
    elif path == 'not':
        OUTPUT = OUT + 'not_extracted/'
    else:
        raise ValueError('unknown extraction path: ' + path)
    with open(OUTPUT + ID + '.html', 'w') as f:
        f.write(soup.prettify())

def split_into_sentences(section, de=False):
    """Split a section into sentences and wrap each one in <S> ... </S> tags."""
    if not de:
        # English: use NLTK's pre-trained English Punkt model
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        # German: use the pre-trained German Punkt model (constructing
        # PunktSentenceTokenizer(section) would train on the text rather
        # than split it)
        sent_detector = nltk.data.load('tokenizers/punkt/german.pickle')
    sents = sent_detector.tokenize(section.strip())
    sec = ''
    for sent in sents:
        sec += '<S> ' + sent + ' </S> '
    return sec

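# Illustration (not in the original script): for English input,
#   split_into_sentences('First sentence. Second sentence.')
# returns
#   '<S> First sentence. </S> <S> Second sentence. </S> '
# i.e. one <S> ... </S> pair per detected sentence, with a trailing space.
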
def get_de_summaries():
    """Collect all German summaries into one dict keyed by article ID.

    NOTE: this function treats DE_SUM as a directory of JSON files, while
    data() below opens DE_SUM as a single file; the constant currently
    points at ../data/de_spektrum_summaries/de_summaries.json.
    """
    de_files = os.listdir(DE_SUM)
    summaries = {}
    for file in de_files:
        with open(os.path.join(DE_SUM, file), 'r') as f:
            d = json.load(f)
        data = {}
        for el in d['Id'].keys():
            ID = d['Id'][el]
            data[ID] = {}
            data[ID]['DeTitle'] = d['Title'][el]
            data[ID]['DeUnderTitle'] = d['UnderTitle'][el]
            if data[ID]['DeUnderTitle'] is None:
                data[ID]['DeUnderTitle'] = ''
            data[ID]['DeTeaser'] = d['Teaser'][el]
            data[ID]['DeSummary'] = d['Summary'][el]
        summaries.update(data)
    # OUTPUT: {'Id': {'DeTitle', 'DeUnderTitle', 'DeTeaser', 'DeSummary'}}
    return summaries

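# Illustration (an assumption inferred from the access pattern above): each
# input JSON looks like a pandas DataFrame dumped with orient='columns', e.g.
#   {"Id": {"0": "12345"}, "Title": {"0": "..."}, "UnderTitle": {"0": null},
#    "Teaser": {"0": "..."}, "Summary": {"0": "..."}}
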
def tag_en_text(title, sections):
    """Wrap an English article in <ARTICLE>/<TITLE>/<HEADING>/<SECTION> tags."""
    txt = '<ARTICLE><TITLE>' + title + '</TITLE>'
    for el in sections:
        if isinstance(el, str):  # a heading
            txt += '<HEADING>' + el + '</HEADING>'
        else:  # a list of paragraph strings
            for paragraph in el:
                txt += '<SECTION>' + split_into_sentences(paragraph) + '</SECTION>'
    txt += '</ARTICLE>'
    return txt

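# Illustration (not in the original script):
#   tag_en_text('Title', ['Heading', ['One. Two.']])
# yields a flat tagged string of the form
#   <ARTICLE><TITLE>Title</TITLE><HEADING>Heading</HEADING>
#   <SECTION><S> One. </S> <S> Two. </S> </SECTION></ARTICLE>
# (shown wrapped here; the actual return value has no line breaks)
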
def tag_de_text(summary):
    """Wrap a German summary dict in <ARTICLE>/<TITLE>/<UNDERTITLE>/<TEASER>/<SECTION> tags."""
    txt = '<ARTICLE><TITLE>' + summary['DeTitle'] + '</TITLE>'
    if summary['DeUnderTitle'] is not None and len(summary['DeUnderTitle']) > 0:
        txt += '<UNDERTITLE>' + summary['DeUnderTitle'] + '</UNDERTITLE>'
    if summary['DeTeaser'] is not None and len(summary['DeTeaser']) > 0:
        txt += '<TEASER>' + summary['DeTeaser'] + '</TEASER>'
    sections = summary['DeSummary'].split('\n')
    sections = [s for s in sections if s != '']
    for paragraph in sections:
        # de=True so the German Punkt model splits the German summary text
        txt += '<SECTION>' + split_into_sentences(paragraph, de=True) + '</SECTION>'
    txt += '</ARTICLE>'
    return txt

def load_json(filename):
    """Load a JSON object from disk."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

def data(title, sections, ID):
    """Pair the tagged English article with its tagged German summary and
    store the pair in de_en_articles.json."""
    # load the German summaries; de_summaries.json is assumed to already be
    # keyed by article ID (the format get_de_summaries() produces)
    with open(DE_SUM, 'r') as f:
        summaries = json.load(f)
    de_en_articles = load_json(OUT + 'de_en_articles.json')
    en_tagged_text = tag_en_text(title, sections)
    de_tagged_text = tag_de_text(summaries[ID])
    de_en_articles[ID] = {'De_Summary': de_tagged_text, 'En_Article': en_tagged_text}
    with open(OUT + 'de_en_articles.json', 'w') as f:
        json.dump(de_en_articles, f)

"""
if __name__ == '__main__':
data={'Id':{},'De_Summary':{},'En_Article':{}}
with open(OUT, 'wb') as f:
pickle.dump(data,f)
"""