Commit 977dbc70
authored 3 years ago by vvye

Implement fetching of crisis dataset

parent aaec60f3
Showing 4 changed files with 104 additions and 8 deletions:

    dataset.py           +77  -6
    heideltime_util.py    +3  -1
    run.py                +1  -1
    util.py              +23  -0
dataset.py  +77 -6
@@ -29,19 +29,18 @@ import heideltime_util
 import util
 
 
-def get_timeline17_dataset(path):
+def get_timeline17_dataset():
     """
     Returns the Timeline17 dataset as a dictionary.
 
-    If cached.pkl exists in the given path, it will be loaded from there,
+    If data/in/timeline17/timeline17.pkl exists, it will be loaded from there,
     otherwise, it will be parsed from scratch (assuming the default folder structure).
 
-    :param path: The path to Timeline17's 'Data' directory.
-    :return: A dictionary containing the preprocessed data.
+    :return: A dictionary containing the dataset.
     """
 
-    path = Path(path)
-    cache_filename = path / 'cached.pkl'
+    path = Path('data/in/timeline17/Data')
+    cache_filename = Path('data/in/timeline17/timeline17.pkl')
     if os.path.exists(cache_filename):
         return pickle.load(open(cache_filename, 'rb'))
@@ -100,5 +99,77 @@ def get_timeline17_dataset(path):
     return data
 
 
+def get_crisis_dataset():
+    """
+    Returns the crisis dataset as a dictionary.
+
+    If data/in/crisis/crisis.pkl exists, it will be loaded from there,
+    otherwise, it will be parsed from scratch (assuming the default folder structure).
+
+    :return: A dictionary containing the dataset.
+    """
+
+    path = Path('data/in/crisis')
+    cache_filename = Path('data/in/crisis/crisis.pkl')
+    if os.path.exists(cache_filename):
+        return pickle.load(open(cache_filename, 'rb'))
+
+    data = {}
+
+    # go through each topic directory
+    for topic_dirname in util.subdirs(path):
+        topic_path = path / topic_dirname
+        topic_name = topic_dirname
+        if topic_name not in data:
+            data[topic_name] = {'articles': [], 'gold_timelines': {}}
+
+        # parse input articles
+        for pub_date in util.subdirs(topic_path / 'public' / 'content'):
+            date_path = topic_path / 'public' / 'content' / pub_date
+            for article_filename in util.files(date_path, extension='.cont'):
+                article_file_path = date_path / article_filename
+                print(article_file_path)
+                article = {'pub_date': pub_date, 'sentences': []}
+
+                # get sentence text
+                with util.detect_encoding_and_open(article_file_path) as f:
+                    sentences_in_article = [{'text': line.strip(), 'mentioned_dates': []}
+                                            for line in f.readlines()[1:]  # skip first line (headline)
+                                            if line.strip()]
+
+                # get date mentions using HeidelTime
+                # and add them to the sentence data
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
+                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
+                for i in range(len(sentences_in_article)):
+                    sentence = sentences_in_article[i]
+                    sentence['mentioned_dates'] = mentioned_dates_by_sentence[i]
+
+                article['sentences'] += sentences_in_article
+                data[topic_name]['articles'].append(article)
+
+        # parse gold timelines
+        for gold_timeline_filename in util.files(topic_path / 'public' / 'timelines', extension='txt'):
+            if gold_timeline_filename.startswith('.'):
+                continue
+            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            gold_timeline_name = gold_timeline_filename.split('.')[0]
+            gold_timeline = {}
+
+            with open(gold_timeline_file_path) as f:
+                lines = [line.strip() for line in f.readlines()]
+            date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
+            for date_group in date_groups:
+                date, sentences_on_date = date_group[0], date_group[1:]
+                sentences_on_date = [s.lstrip('-').strip() for s in sentences_on_date]
+                gold_timeline[date] = sentences_on_date
+
+            data[topic_name]['gold_timelines'][gold_timeline_name] = gold_timeline
+
+    pickle.dump(data, open(cache_filename, 'wb'))
+    return data
+
+
 def filter_articles_by_date(articles, start_date, end_date):
     return [a for a in articles if start_date <= a['pub_date'] <= end_date]
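For orientation, the dictionary built by get_crisis_dataset() (and cached in data/in/crisis/crisis.pkl) maps each topic to its articles and gold timelines. A minimal sketch of how downstream code might walk that structure; the field names mirror the diff above, while the loop variables and the print calls are only illustrative:

    import dataset

    # data maps topic name -> {'articles': [...], 'gold_timelines': {...}}
    data = dataset.get_crisis_dataset()

    for topic, topic_data in data.items():
        for article in topic_data['articles']:
            # article: {'pub_date': <publication-date directory name>, 'sentences': [...]}
            for sentence in article['sentences']:
                # sentence: {'text': str, 'mentioned_dates': [dates found by HeidelTime]}
                print(sentence['text'], sentence['mentioned_dates'])

        # each gold timeline maps a date string to the reference sentences for that date
        for timeline_name, gold_timeline in topic_data['gold_timelines'].items():
            print(timeline_name, len(gold_timeline), 'dates')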
heideltime_util.py  +3 -1
@@ -4,6 +4,8 @@ import subprocess
 import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
 
+import util
+
 heideltime_path = 'tools/heideltime'
 heideltime_jar_name = 'de.unihd.dbs.heideltime.standalone.jar'
 heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.DOTALL)

@@ -14,7 +16,7 @@ def mentioned_dates_by_sentence(filename, pub_date):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
-    with open(filename, encoding='utf-8') as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
         for line in f:
             g.write(escape(line))
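As an aside on the escaping step above: xml.sax.saxutils.escape replaces &, < and > with XML entities, which is presumably what keeps stray markup in the article text from breaking the <TimeML>...</TimeML> block that heideltime_root_regex later extracts. A standalone illustration (not part of the repository):

    from xml.sax.saxutils import escape

    print(escape('Prices rose 5% & <b>protests</b> spread'))
    # Prices rose 5% &amp; &lt;b&gt;protests&lt;/b&gt; spread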
run.py  +1 -1
@@ -10,7 +10,7 @@ import timeline_generation
 def main(args):
     eval_results = evaluation.ResultLogger()
 
-    data = dataset.get_timeline17_dataset('data/in/timeline17/Data')
+    data = dataset.get_crisis_dataset()
 
     for topic in data.keys():
         articles = data[topic]['articles']
util.py  +23 -0
 import os
+import chardet
 from datetime import datetime

@@ -29,3 +30,25 @@ def rank(lst, scores):
 
 def days_between(date1, date2):
     return abs((datetime.strptime(date1, '%Y-%m-%d') - datetime.strptime(date2, '%Y-%m-%d')).days)
+
+
+def detect_encoding_and_open(filename):
+    """
+    Opens a (text) file for reading.
+
+    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
+    chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
+    If no encoding works, a UnicodeDecode error is raised.
+
+    :param filename: The name of the file to be opened
+    :return: A file handle.
+    """
+
+    raw_data = open(filename, 'rb').read()
+    detected_encoding = chardet.detect(raw_data)['encoding']
+    encodings = [detected_encoding['encoding'], 'utf-8', 'ansi']
+    for encoding in encodings:
+        f = open(filename, encoding=encoding)
+        try:
+            _ = [line for line in f.readlines()]
+            return f
+        except UnicodeDecodeError:
+            f.close()
+            continue
+    raise UnicodeDecodeError
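The new helper follows a detect-then-fall-back pattern: sniff the encoding with chardet, then try a short list of encodings until one can read the whole file. Below is a minimal standalone sketch of that idea, assuming only that chardet is installed. Note that chardet.detect() returns a dict such as {'encoding': 'utf-8', 'confidence': 0.99} whose 'encoding' value may be None; the function name and the 'latin-1' last resort here are illustrative choices, not the repository's API:

    import chardet

    def open_with_detected_encoding(filename):
        """Open a text file, trying the chardet-detected encoding first, then common fallbacks."""
        raw_data = open(filename, 'rb').read()
        detected = chardet.detect(raw_data)['encoding']  # may be None if detection fails
        for encoding in filter(None, [detected, 'utf-8', 'latin-1']):
            try:
                raw_data.decode(encoding)  # validate the whole file before handing back a handle
            except (UnicodeDecodeError, LookupError):
                continue
            return open(filename, encoding=encoding)
        raise ValueError(f'could not find a working encoding for {filename}')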