Project: nwarslan / text_translation_and_summarization
Commit efb6c486 ("old"), authored 5 years ago by Nadia Arslan
Parent: 557fa2ce
No related branches, tags, or merge requests found.
1 changed file: code/main_extractor_old.py (new file, mode 100644, 238 additions, 0 deletions)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 13 10:48:25 2019
@author: nadia
"""
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tika import parser as pdf_parser

import html_extractor
import pdf_extractor
INPUT = '../output/spektrum_links_output/filtered_Spektrum_Links.json'
FAILS = '../output/extracted_articles/extraction_fails.txt'
DOWNLOAD_FAILS = '../output/extracted_articles/download_fails.txt'
WINS = '../output/extracted_articles/extraction_complete.txt'
OUTPUT = '../output/extracted_articles/pdf_extraction/pdfs/'
PDF_DICT = '../output/extracted_articles/pdf_extraction/pdfs/pdf_dict.json'
DE_EN = '../output/extracted_articles/de_en_articles.json'
def open_json(filename):
    """Load and return the contents of a JSON file."""
    with open(filename, 'r') as f:
        data = json.load(f)
    return data
def download_pdf(url, pdf_name, pdf_dict):
    """Download a PDF into OUTPUT, reusing the numeric file name tracked in pdf_dict."""
    pdfs = os.listdir(OUTPUT)
    if pdf_name not in pdf_dict:
        file_name = str(len(pdf_dict))
        pdf_dict[pdf_name] = file_name
    else:
        file_name = pdf_dict[pdf_name]
    pdf_filename = OUTPUT + file_name + '.pdf'
    if pdf_filename.replace(OUTPUT, '') in pdfs:
        #print('PDF already exists')
        return True
    try:
        # get and save the pdf file
        pdf_file = requests.get(url, allow_redirects=True)
        with open(pdf_filename, 'wb') as f:
            f.write(pdf_file.content)
        return True
    except Exception:
        # log the failed download
        with open(DOWNLOAD_FAILS, 'a') as f:
            f.writelines(pdf_filename + '\t' + url + '\n')
        return False
def get_pdf_soup(filename, ID):
    # Parse the downloaded PDF with tika and return its XHTML content as a soup.
    """
    if ID+'.html' in os.listdir('../output/extracted_articles/pdf_extraction/'):
        with open('../output/extracted_articles/pdf_extraction/'+ID+'.html','r') as f:
            html_doc = f.read()
        soup = BeautifulSoup(html_doc, 'html.parser')
        return soup
    """
    #pdfreader = PyPDF2.PdfFileReader(open(filename, 'rb'))
    #pdf_count = pdfreader.numPages
    sysxml = pdf_parser.from_file(filename, xmlContent=True)['content']
    sysxml = re.sub(r"<p />", "", sysxml)
    sysxml = re.sub(r"<p>[\s]*\n</p>", "", sysxml)
    soup = BeautifulSoup(sysxml, 'html.parser')
    #print(soup.title.text)
    return soup
def check_structure(urls, ID):
    """Pick the best-structured URL that has an abstract and extract it as HTML."""
    score = (0, '')
    for url in urls.keys():
        if int(urls[url]['Structure']) > score[0] and urls[url]['Abstract'] == True:
            score = (int(urls[url]['Structure']), url)
    if score[0] < 4:
        return False
    else:
        url = score[1]
        try:
            html_doc = requests.get(url).text
            soup = BeautifulSoup(html_doc, 'html.parser')
            html_extractor.extract(soup, ID)
            return True
        except Exception:
            #print(ID, url)
            #with open(FAILS, 'a') as f:
            #    f.writelines(ID+'\t'+url+'\n')
            return False
def check_pdf(urls, ID, abstract=False, ranking=[]):
    """Download candidate PDFs and extract the first one whose title matches the article's English title."""

    def compare_titles(title1, title2):
        # Titles match if more than half of title2's word bigrams occur in title1.
        if len(title1.split()) < 5 or len(title2.split()) < 5:
            return False
        #if title1 == '' or title2 == '':
        #    return False
        title1 = re.sub(r'\|.*$', '', title1)
        title2 = re.sub('[_:]', '', title2)
        title1 = re.sub('[_:]', '', title1)
        #title2 = re.sub('\n','\s',title2)
        #title1 = re.sub('\n','\s',title1)
        #if title1 in title2 or title2 in title1:
        #    return True
        title2 = title2.split()
        title3 = []
        for i in range(len(title2) - 2):
            title3.append(title2[i] + ' ' + title2[i + 1])
        i = 0
        for bigram in title3:
            if bigram in title1:
                i += 1
        if i != 0 and i / len(title3) > 0.5:
            return True
        return False

    if abstract == True:
        # only consider URLs that have an abstract and at least one PDF link
        for url in urls:
            if urls[url]['Abstract'] == True and len(urls[url]['Pdfs']) > 0:
                url_title = urls[url]['En_title']
                pdfs = urls[url]['Pdfs']
                pdfs = check_path(pdfs, url)
                for pdf in pdfs:
                    pdf_name = ID + pdf
                    if download_pdf(pdf, pdf_name, pdf_dict):
                        file_name = pdf_dict[pdf_name]
                        soup = get_pdf_soup(OUTPUT + file_name + '.pdf', ID)
                        pdf_title = soup.title.text.strip()
                        if compare_titles(url_title.lower(), pdf_title.lower()):
                            pdf_extractor.extract(soup, ID)
                            return True
    else:
        # try the URLs in the order given by the keyword ranking
        for url in ranking:
            if len(urls[url]['Pdfs']) > 0:
                url_title = urls[url]['En_title']
                pdfs = urls[url]['Pdfs']
                pdfs = check_path(pdfs, url)
                for pdf in pdfs:
                    pdf_name = ID + pdf
                    if download_pdf(pdf, pdf_name, pdf_dict):
                        file_name = pdf_dict[pdf_name]
                        soup = get_pdf_soup(OUTPUT + file_name + '.pdf', ID)
                        pdf_title = soup.title.text.strip()
                        if compare_titles(url_title.lower(), pdf_title.lower()):
                            pdf_extractor.extract(soup, ID)
                            return True
    return False
def check_path(pdfs, url):
    """Resolve root-relative PDF links against the scheme and host of the article URL."""
    def merge_link(p, url):
        url_split = [el for el in url.split('/') if len(el) != 0]
        url_foot = '//'.join(url_split[:2])
        p = url_foot + p
        return p
    p1 = [merge_link(p, url) for p in pdfs if p.startswith('/')]
    p2 = [p for p in pdfs if not p.startswith('/')]
    pdfs = p1 + p2
    return pdfs
def check_abstract(urls, ID):
    """Extract the first URL that provides an abstract via the HTML extractor."""
    for url in urls:
        if urls[url]['Abstract'] == True:
            try:
                html_doc = requests.get(url).text
                soup = BeautifulSoup(html_doc, 'html.parser')
                html_extractor.extract(soup, ID)
                return True
            except Exception:
                #print(ID, url)
                #with open(FAILS, 'a') as f:
                #    f.writelines(ID+'\t'+url+'\n')
                continue
    return False
def check_keywords(urls, ID):
    """Rank the URLs by keyword score and retry the PDF extraction in that order."""
    ranking = []
    for url in urls:
        ranking.append((urls[url]['Keyword'], url))
    ranking.sort(reverse=True)
    ranking = [el[1] for el in ranking]
    # pass the ranking as a keyword argument so it is not mistaken for the abstract flag
    if check_pdf(urls, ID, ranking=ranking):
        return True
    else:
        with open(FAILS.replace('.txt', '_keywords.txt'), 'r') as f:
            lines = f.readlines()
        with open(FAILS.replace('.txt', '_keywords.txt'), 'a') as ff:
            line = ID + '\t' + str(ranking) + '\n'
            if line not in lines:
                ff.writelines(line)
        return False
def extractor(article):
    """Try the extraction strategies in order: structure, PDF, abstract, keywords."""
    ID = article[0]
    urls = article[1]['Urls']
    if check_structure(urls, ID):
        return True
    elif check_pdf(urls, ID, abstract=True):
        return True
    elif check_abstract(urls, ID):
        return True
    elif check_keywords(urls, ID):
        return True
    else:
        return False
def iterate(data):
    """Run the extractor over all articles, skipping ones already in DE_EN and logging failures."""
    de_en = open_json(DE_EN)
    for i, el in enumerate(data.items()):
        if el[0] in de_en:
            continue
        extracted = extractor(el)
        if extracted == False:
            # log the article ID if no strategy succeeded
            with open(FAILS, 'r') as f:
                lines = f.readlines()
            with open(FAILS, 'a') as ff:
                line = el[0] + '\n'
                if line not in lines:
                    ff.writelines(line)
        if i % 100 == 0:
            print(i, 'articles of', len(data), 'extracted')
if __name__ == '__main__':
    data = open_json(INPUT)
    with open(PDF_DICT, 'r') as f:
        pdf_dict = json.load(f)
    iterate(data)
    #print(pdf_dict)
    with open(PDF_DICT, 'w') as f:
        json.dump(pdf_dict, f)