Commit 557fa2ce authored by nwarslan's avatar nwarslan

added code

parent c472a976
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 19 09:50:27 2019
@author: nadia
"""
import json
import matplotlib.pyplot as plt

INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'

def open_json(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

def dump_json(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f)

def add_2_dict(el, d):
    # count an occurrence of el in the counter dict d
    if el in d:
        d[el] += 1
    else:
        d[el] = 1
    return d

def change_dict(d):
    # collapse keys like '3/5' to their integer prefix and merge the counts
    dd = {}
    for (k, v) in d.items():
        k = int(str(k).split('/')[0])
        if k in dd:
            dd[k] += v
        else:
            dd[k] = v
    return dd

def mk_stat(data):
    total_de = len(data)
    total_url = 0
    no_url = 0
    structure = {}
    structure_decision = 0
    keywords = {}
    pdf_count = {}
    a_with_pdf = 0
    url_with_pdf = 0
    for el in data:
        a_pdf_pointer = False
        total_url += len(data[el]['Urls'])
        if len(data[el]['Urls']) == 0:
            no_url += 1
        for url in data[el]['Urls']:
            add_2_dict(data[el]['Urls'][url]['Structure'], structure)
            add_2_dict(data[el]['Urls'][url]['Keyword'], keywords)
            add_2_dict(len(data[el]['Urls'][url]['Pdfs']), pdf_count)
            if len(data[el]['Urls'][url]['Pdfs']) != 0:
                url_with_pdf += 1
                a_pdf_pointer = True
        if a_pdf_pointer:
            a_with_pdf += 1
    keywords = change_dict(keywords)
    print('Total de articles: ', total_de)
    print('Total urls: ', total_url)
    print('Articles without url: ', no_url)
    #plot(structure)
    print('Structure: ', structure)
    print('Structure 4+: ', add(structure))
    print('Structure decision: ', structure_decision)
    #plot(pdf_count)
    print('Articles with pdf: ', a_with_pdf)
    print('URLs with pdf: ', url_with_pdf)
    print('Pdfs: ', pdf_count)
    #print(add(pdf_count))
    #plot(keywords)
    #print('Keywords: ', keywords)

def add(data):
    # sum the counts of all keys greater than 3
    i = 0
    for el in data:
        if el > 3:
            i += data[el]
    return i

def plot(data):
    # bar chart of a counter dict, keys sorted along the x-axis
    plt_list = sorted(data.items())
    plt.bar(range(len(plt_list)), [v[1] for v in plt_list], align='center')
    plt.xticks(range(len(plt_list)), [v[0] for v in plt_list])
    plt.show()

if __name__ == '__main__':
    data = open_json(INPUT)
    mk_stat(data)
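For orientation, mk_stat() expects the structure of filtered_Spektrum_Links.json sketched below. Only the key names ('Urls', 'Structure', 'Keyword', 'Pdfs') come from the lookups in the code above; the sample values are made up.

# Hypothetical sample matching the lookups in mk_stat(); only the key
# names are taken from the code above, the values are assumptions.
sample = {
    'article_1': {
        'Urls': {
            'https://example.org/paper': {
                'Structure': 4,         # page-structure category (assumed numeric)
                'Keyword': '3/5',       # ratio string, cf. change_dict()
                'Pdfs': ['paper.pdf'],  # PDF links found at this URL
            }
        }
    },
    'article_2': {'Urls': {}},          # article without any URL
}
mk_stat(sample)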
@@ -10,7 +10,7 @@ import re
 import os
 import article_to
-OUTPUT = '../output/extracted_articles/url_extraction/'
+OUTPUT = '/home/nadia/Desktop/'#'../output/extracted_articles/url_extraction/'
 def get_article(soup):
     sections = []
@@ -72,12 +72,27 @@ def extract(soup, ID):
     title = soup.title.text.strip()
     filtered_soup = filter_soup(soup)
     sections = get_article(filtered_soup)
-    article_to.text_file(title, sections, ID, 'url')
-    article_to.html_file(soup, ID, 'url')
-    article_to.data(title, sections, ID)
+    #article_to.text_file(title, sections, ID, 'url')
+    #article_to.html_file(soup, ID, 'url')
+    #article_to.data(title, sections, ID)
+    #write_article(title, sections, ID, soup)
+    """
+    #print(title)
+    #for s in sections:
+        #print(s)
+    print(sections)
+def open_soup(filename):
+    with open(filename, 'r') as f:
+        html_doc = f.read()
+    soup = bs(html_doc, 'html.parser')
+    return soup
+if __name__ == '__main__':
+    html_doc = '/home/nadia/Desktop/test.html'
+    soup = open_soup(html_doc)
+    extract(soup,'TEST')
+"""
 #url = 'https://www.nature.com/articles/s41598-018-22664-4'
 #url = 'https://www.nature.com/articles/s41467-018-03465-9'
 #url = 'http://rsbl.royalsocietypublishing.org/content/14/2/20170743'
......
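The triple-quoted string in this hunk disables the local test harness (open_soup() plus the __main__ block) together with the print(sections) debug line in a single stroke. Re-enabled, the harness reads a locally saved page and runs extract() on it; a minimal sketch, assuming bs is the BeautifulSoup alias imported at the top of the module and using the author's local test path:

def open_soup(filename):
    # Load a locally saved HTML page into a BeautifulSoup tree.
    with open(filename, 'r') as f:
        html_doc = f.read()
    return bs(html_doc, 'html.parser')

if __name__ == '__main__':
    soup = open_soup('/home/nadia/Desktop/test.html')  # author's local test file
    extract(soup, 'TEST')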
@@ -31,7 +31,7 @@ def download_pdf(url, pdf_name, pdf_dict):
     if pdf_name not in pdf_dict:
         file_name = str(len(pdf_dict))
-        pdf_dict[pdf_name] = file_name
+        #pdf_dict[pdf_name] = file_name
     else:
         file_name = pdf_dict[pdf_name]
@@ -44,6 +44,7 @@ def download_pdf(url, pdf_name, pdf_dict):
         # get and save pdf file
         pdf_file = requests.get(url, allow_redirects=True)
         open(pdf_filename, 'wb').write(pdf_file.content)
+        pdf_dict[pdf_name] = file_name
         return True
     except:
         with open(DOWNLOAD_FAILS, 'a') as f:
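Taken together, the two hunks above move the pdf_dict bookkeeping: the pdf_name-to-file_name mapping is now recorded only after requests.get() and the file write have succeeded, so a failed download no longer marks the PDF as handled and it can be retried on a later run. A minimal sketch of this record-on-success pattern, with hypothetical names:

import requests

def fetch(url, path, done):
    # Record the download in `done` only once it has actually succeeded,
    # so failures stay retryable. `done` maps paths to completion flags.
    try:
        response = requests.get(url, allow_redirects=True)
        with open(path, 'wb') as f:
            f.write(response.content)
        done[path] = True   # register on success only
        return True
    except Exception:
        return False        # nothing recorded; a later run can retry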
@@ -178,10 +179,10 @@ def check_abstract(urls,ID):
 def check_keywords(urls, ID):
     ranking = []
     for url in urls:
-        ranking.append((urls[url]['Keyword'],url))
+        ranking.append((eval(urls[url]['Keyword']),url))
     ranking.sort(reverse=True)
     ranking = [el[1] for el in ranking]
-    if check_pdf(urls,ID,ranking):
+    if check_pdf(urls,ID,ranking,abstract=False):
         return True
     else:
         with open(FAILS.replace('.txt','_keywords.txt'),'r') as f:
......
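One detail in this last hunk is worth noting: eval() turns the stored 'Keyword' ratio string (e.g. '3/5', the format implied by change_dict() in the statistics script) into a float so that the ranking sorts numerically rather than lexicographically. However, eval() on data read from disk is risky; a safer equivalent under the same format assumption:

def keyword_score(s):
    # Parse a 'matched/total' ratio string such as '3/5' into a float,
    # without handing arbitrary strings to eval().
    num, den = s.split('/')
    return int(num) / int(den)

ranking.append((keyword_score(urls[url]['Keyword']), url)) would then replace the eval() call.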