Commit 557fa2ce authored by nwarslan's avatar nwarslan

added code

parent c472a976
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 19 09:50:27 2019
@author: nadia
"""
import json
import matplotlib.pyplot as plt

INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'

def open_json(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
    return data

def dump_json(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f)

def add_2_dict(el, d):
    # count an occurrence of el in the counter dict d
    if el in d:
        d[el] += 1
    else:
        d[el] = 1
    return d

def change_dict(d):
    # collapse keys like '3/5' to their integer prefix and merge the counts
    dd = {}
    for (k, v) in d.items():
        k = int(str(k).split('/')[0])
        if k in dd:
            dd[k] += v
        else:
            dd[k] = v
    return dd

def mk_stat(data):
    total_de = len(data)
    total_url = 0
    no_url = 0
    structure = {}
    structure_decision = 0
    keywords = {}
    pdf_count = {}
    a_with_pdf = 0
    url_with_pdf = 0
    for el in data:
        a_pdf_pointer = False
        total_url += len(data[el]['Urls'])
        if len(data[el]['Urls']) == 0:
            no_url += 1
        for url in data[el]['Urls']:
            add_2_dict(data[el]['Urls'][url]['Structure'], structure)
            add_2_dict(data[el]['Urls'][url]['Keyword'], keywords)
            add_2_dict(len(data[el]['Urls'][url]['Pdfs']), pdf_count)
            if len(data[el]['Urls'][url]['Pdfs']) != 0:
                url_with_pdf += 1
                a_pdf_pointer = True
        if a_pdf_pointer:
            a_with_pdf += 1
    keywords = change_dict(keywords)
    print('Total de articles: ', total_de)
    print('Total urls: ', total_url)
    print('Articles without url: ', no_url)
    #plot(structure)
    print('Structure: ', structure)
    print('Structure 4+: ', add(structure))
    print('Structure decision: ', structure_decision)
    #plot(pdf_count)
    print('Articles with pdf: ', a_with_pdf)
    print('URLs with pdf: ', url_with_pdf)
    print('Pdfs: ', pdf_count)
    #print(add(pdf_count))
    #plot(keywords)
    #print('Keywords: ', keywords)

def add(data):
    # sum the counts of all keys greater than 3
    i = 0
    for el in data:
        if el > 3:
            i += data[el]
    return i

def plot(data):
    # bar chart of a counter dict, keys sorted along the x-axis
    plt_list = sorted(data.items())
    plt.bar(range(len(plt_list)), [v[1] for v in plt_list], align='center')
    plt.xticks(range(len(plt_list)), [v[0] for v in plt_list])
    plt.show()

if __name__ == '__main__':
    data = open_json(INPUT)
    mk_stat(data)
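For orientation, mk_stat() expects the structure of filtered_Spektrum_Links.json sketched below. Only the key names ('Urls', 'Structure', 'Keyword', 'Pdfs') come from the lookups in the code above; the sample values are made up.

# Hypothetical sample matching the lookups in mk_stat(); only the key
# names are taken from the code above, the values are assumptions.
sample = {
    'article_1': {
        'Urls': {
            'https://example.org/paper': {
                'Structure': 4,         # page-structure category (assumed numeric)
                'Keyword': '3/5',       # ratio string, cf. change_dict()
                'Pdfs': ['paper.pdf'],  # PDF links found at this URL
            }
        }
    },
    'article_2': {'Urls': {}},          # article without any URL
}
mk_stat(sample)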
@@ -10,7 +10,7 @@ import re
 import os
 import article_to
-OUTPUT = '../output/extracted_articles/url_extraction/'
+OUTPUT = '/home/nadia/Desktop/'#'../output/extracted_articles/url_extraction/'
 def get_article(soup):
     sections = []
@@ -72,12 +72,27 @@ def extract(soup, ID):
     title = soup.title.text.strip()
     filtered_soup = filter_soup(soup)
     sections = get_article(filtered_soup)
-    article_to.text_file(title, sections, ID, 'url')
-    article_to.html_file(soup, ID, 'url')
-    article_to.data(title, sections, ID)
+    #article_to.text_file(title, sections, ID, 'url')
+    #article_to.html_file(soup, ID, 'url')
+    #article_to.data(title, sections, ID)
+    #write_article(title, sections, ID, soup)
+    """
+    #print(title)
+    #for s in sections:
+        #print(s)
+    print(sections)
+def open_soup(filename):
+    with open(filename, 'r') as f:
+        html_doc = f.read()
+    soup = bs(html_doc, 'html.parser')
+    return soup
+if __name__ == '__main__':
+    html_doc = '/home/nadia/Desktop/test.html'
+    soup = open_soup(html_doc)
+    extract(soup,'TEST')
+"""
 #url = 'https://www.nature.com/articles/s41598-018-22664-4'
 #url = 'https://www.nature.com/articles/s41467-018-03465-9'
 #url = 'http://rsbl.royalsocietypublishing.org/content/14/2/20170743'
......
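The triple-quoted string in this hunk disables the local test harness (open_soup() plus the __main__ block) together with the print(sections) debug line in a single stroke. Re-enabled, the harness reads a locally saved page and runs extract() on it; a minimal sketch, assuming bs is the BeautifulSoup alias imported at the top of the module and using the author's local test path:

def open_soup(filename):
    # Load a locally saved HTML page into a BeautifulSoup tree.
    with open(filename, 'r') as f:
        html_doc = f.read()
    return bs(html_doc, 'html.parser')

if __name__ == '__main__':
    soup = open_soup('/home/nadia/Desktop/test.html')  # author's local test file
    extract(soup, 'TEST')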
@@ -31,7 +31,7 @@ def download_pdf(url, pdf_name, pdf_dict):
     if pdf_name not in pdf_dict:
         file_name = str(len(pdf_dict))
-        pdf_dict[pdf_name] = file_name
+        #pdf_dict[pdf_name] = file_name
     else:
         file_name = pdf_dict[pdf_name]
@@ -44,6 +44,7 @@ def download_pdf(url, pdf_name, pdf_dict):
         # get and save pdf file
         pdf_file = requests.get(url, allow_redirects=True)
         open(pdf_filename, 'wb').write(pdf_file.content)
+        pdf_dict[pdf_name] = file_name
         return True
     except:
         with open(DOWNLOAD_FAILS, 'a') as f:
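Taken together, the two hunks above move the pdf_dict bookkeeping: the pdf_name-to-file_name mapping is now recorded only after requests.get() and the file write have succeeded, so a failed download no longer marks the PDF as handled and it can be retried on a later run. A minimal sketch of this record-on-success pattern, with hypothetical names:

import requests

def fetch(url, path, done):
    # Record the download in `done` only once it has actually succeeded,
    # so failures stay retryable. `done` maps paths to completion flags.
    try:
        response = requests.get(url, allow_redirects=True)
        with open(path, 'wb') as f:
            f.write(response.content)
        done[path] = True   # register on success only
        return True
    except Exception:
        return False        # nothing recorded; a later run can retry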
@@ -178,10 +179,10 @@ def check_abstract(urls,ID):
 def check_keywords(urls, ID):
     ranking = []
     for url in urls:
-        ranking.append((urls[url]['Keyword'],url))
+        ranking.append((eval(urls[url]['Keyword']),url))
     ranking.sort(reverse=True)
     ranking = [el[1] for el in ranking]
-    if check_pdf(urls,ID,ranking):
+    if check_pdf(urls,ID,ranking,abstract=False):
         return True
     else:
         with open(FAILS.replace('.txt','_keywords.txt'),'r') as f:
......
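One detail in this last hunk is worth noting: eval() turns the stored 'Keyword' ratio string (e.g. '3/5', the format implied by change_dict() in the statistics script) into a float so that the ranking sorts numerically rather than lexicographically. However, eval() on data read from disk is risky; a safer equivalent under the same format assumption:

def keyword_score(s):
    # Parse a 'matched/total' ratio string such as '3/5' into a float,
    # without handing arbitrary strings to eval().
    num, den = s.split('/')
    return int(num) / int(den)

ranking.append((keyword_score(urls[url]['Keyword']), url)) would then replace the eval() call.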