kaiser / BA Timeline Summarization

Commit fa97eff1, authored 3 years ago by vvye
Implement dataset selection via argument
Parent: e79f881e
Showing 4 changed files, with 75 additions and 5 deletions:

- dataset.py: 18 additions, 0 deletions
- misc.py: 48 additions, 1 deletion
- run.py: 8 additions, 1 deletion
- timeline_generation.py: 1 addition, 3 deletions
dataset.py (+18, −0)
@@ -129,6 +129,10 @@ def get_crisis_dataset():

```python
for article_filename in util.files(date_path, extension='.cont'):
    article_file_path = date_path / article_filename

    # TODO separate sentences and tokenize (the input files are not tokenized and not one-sentence-per-line)
    # (probably best to create a temporary file with the correct format and run heideltime on that
    # and also do the first-line-skipping business there)

    # this one breaks heideltime due to character encoding shenanigans
    if topic_name == 'egypt' and '2429.htm.cont' in str(article_file_path):
        continue
```

@@ -176,5 +180,19 @@ def get_crisis_dataset():

```python
    return data


def get_entities_dataset():
    """
    Returns the ENTITIES dataset as a dictionary.
    If data/in/entities/entities.pkl exists, it will be loaded from there,
    otherwise, it will be parsed from scratch (assuming the default folder structure).

    :return: A dictionary containing the dataset.
    """
    # TODO actually build the dataset from the input files?
    cache_filename = Path('data/in/entities/entities.pkl')
    return pickle.load(open(cache_filename, 'rb'))


def filter_articles_by_date(articles, start_date, end_date):
    return [a for a in articles if start_date <= a['pub_date'] <= end_date]
```
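A quick usage sketch for the new date filter: the article dicts and dates below are made up, but the 'pub_date' key and the inclusive comparison come straight from the comprehension above.

```python
import datetime

# filter_articles_by_date as added in this commit
def filter_articles_by_date(articles, start_date, end_date):
    return [a for a in articles if start_date <= a['pub_date'] <= end_date]

# hypothetical articles; real ones would come from get_entities_dataset()'s pickle cache
articles = [
    {'pub_date': datetime.date(2011, 1, 25)},
    {'pub_date': datetime.date(2011, 3, 1)},
]

january = filter_articles_by_date(
    articles, datetime.date(2011, 1, 1), datetime.date(2011, 1, 31))
print(len(january))  # 1 — only the January article falls inside the window
```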
misc.py (+48, −1)
@@ -16,4 +16,51 @@ keywords_by_topic = {

The old closing brace of keywords_by_topic (previously the last line of the file, with no newline at end of file) is removed; 47 entity entries and a new closing brace are added:

```python
    'mj': ['michael', 'jackson'],
    'syria': ['syria', 'syrian'],
    'yemen': ['yemen'],
    'Al_Gore': ['al gore', 'gore'],
    'Angela_Merkel': ['angela merkel', 'merkel'],
    'Ariel_Sharon': ['ariel sharon', 'sharon'],
    'Arnold_Schwarzenegger': ['arnold schwarzenegger', 'schwarzenegger'],
    'Bashar_al-Assad': ['al-assad', 'bashar al-assad', 'assad'],
    'Bill_Clinton': ['bill clinton', 'clinton'],
    'Charles_Taylor': ['charles taylor', 'taylor'],
    'Chris_Brown': ['brown', 'chris brown'],
    'David_Beckham': ['beckham', 'david beckham'],
    'David_Bowie': ['bowie', 'david bowie'],
    'Dilma_Rousseff': ['dilma rousseff', 'rousseff'],
    'Dmitry_Medvedev': ['dmitry medvedev', 'medvedev'],
    'Dominique_Strauss-Kahn': ['dominique strauss-kahn', 'strauss-kahn', 'kahn'],
    'Edward_Snowden': ['edward snowden', 'snowden'],
    'Ehud_Olmert': ['ehud olmert', 'olmert'],
    'Enron': ['enron'],
    'Hamid_Karzai': ['hamid karzai', 'karzai'],
    'Hassan_Rouhani': ['hassan rouhani', 'rouhani'],
    'Hu_Jintao': ['hu jintao', 'jintao'],
    'Jacob_Zuma': ['jacob zuma', 'zuma'],
    'John_Boehner': ['boehner', 'john boehner'],
    'John_Kerry': ['john kerry', 'kerry'],
    'Julian_Assange': ['assange', 'julian assange'],
    'Lance_Armstrong': ['armstrong', 'lance armstrong'],
    'Mahmoud_Ahmadinejad': ['ahmadinejad', 'mahmoud ahmadinejad'],
    'Marco_Rubio': ['marco rubio', 'rubio'],
    'Margaret_Thatcher': ['margaret thatcher', 'thatcher'],
    'Michael_Jackson': ['jackson', 'michael jackson'],
    'Michelle_Obama': ['michelle obama', 'obama'],
    'Mitt_Romney': ['mitt romney', 'romney'],
    'Morgan_Tsvangirai': ['morgan tsvangirai', 'tsvangirai'],
    'Nawaz_Sharif': ['nawaz sharif', 'sharif'],
    'Nelson_Mandela': ['mandela', 'nelson mandela'],
    'Osama_bin_Laden': ['laden', 'osama bin laden'],
    'Oscar_Pistorius': ['oscar pistorius', 'pistorius'],
    'Phil_Spector': ['phil spector', 'spector'],
    'Prince_William': ['prince william', 'william'],
    'Robert_Mugabe': ['mugabe', 'robert mugabe'],
    'Rupert_Murdoch': ['murdoch', 'rupert murdoch'],
    'Saddam_Hussein': ['hussein', 'saddam hussein'],
    'Sarah_Palin': ['palin', 'sarah palin'],
    'Silvio_Berlusconi': ['berlusconi', 'silvio berlusconi'],
    'Steve_Jobs': ['jobs', 'steve jobs'],
    'Taliban': ['taliban'],
    'Ted_Cruz': ['cruz', 'ted cruz'],
    'Tiger_Woods': ['tiger woods', 'woods'],
    'WikiLeaks': ['wikileaks']
}
```
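The commented-out dataset.filter_articles_by_keywords call in timeline_generation.py suggests these lists drive keyword-based article filtering. A minimal sketch of that idea; article_matches is a hypothetical helper, not a function from this repository:

```python
keywords_by_topic = {
    'Edward_Snowden': ['edward snowden', 'snowden'],
    'WikiLeaks': ['wikileaks'],
}

# hypothetical matcher: an article matches a topic if any of the topic's
# keywords appears in its lowercased text
def article_matches(article_text, topic):
    text = article_text.lower()
    return any(keyword in text for keyword in keywords_by_topic[topic])

print(article_matches('Snowden left Hong Kong on Sunday.', 'Edward_Snowden'))  # True
print(article_matches('Snowden left Hong Kong on Sunday.', 'WikiLeaks'))       # False
```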
run.py (+8, −1)
@@ -10,7 +10,12 @@ import timeline_generation

The hard-coded `data = dataset.get_crisis_dataset()` is replaced by a lookup keyed on the new --dataset argument:

```python
def main(args):
    eval_results = evaluation.ResultLogger()
    data = {
        'timeline17': dataset.get_timeline17_dataset,
        'crisis': dataset.get_crisis_dataset,
        'entities': dataset.get_entities_dataset
    }[args.dataset]()
    for topic in data.keys():
        articles = data[topic]['articles']
```

@@ -44,6 +49,8 @@ def main(args):

```python
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, choices=['timeline17', 'crisis', 'entities'],
                        help='the dataset to use', required=True)
    parser.add_argument('--print_timelines', action='store_true',
                        help='whether to print the timelines to the console after generating them')
```
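The pattern above maps each --dataset choice to an uncalled loader function and only invokes the selected one, so the other datasets are never loaded. A self-contained sketch of the same dispatch; the stub loaders and the simulated command line are stand-ins for the real dataset.get_*_dataset functions:

```python
import argparse

# stand-in loaders; in the repo these are dataset.get_*_dataset
def load_timeline17(): return {'mj': {'articles': []}}
def load_crisis(): return {'egypt': {'articles': []}}
def load_entities(): return {'Edward_Snowden': {'articles': []}}

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str,
                    choices=['timeline17', 'crisis', 'entities'], required=True)
args = parser.parse_args(['--dataset', 'entities'])  # simulates: run.py --dataset entities

# dict dispatch: look up the loader by name, then call it
data = {
    'timeline17': load_timeline17,
    'crisis': load_crisis,
    'entities': load_entities,
}[args.dataset]()
print(list(data.keys()))  # ['Edward_Snowden']
```

Because the argument combines required=True with choices, argparse rejects both a missing and an unknown --dataset before any data is touched.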
timeline_generation.py (+1, −3)
The imports at the top of the file now read:

```python
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
import dataset
```

@@ -23,7 +21,7 @@ def make_timeline(articles, gold_timeline, keywords):

The call to date_selection.rank_dates_by_wilson is replaced by date_selection.rank_dates_by_mention_count:

```python
    # articles = dataset.filter_articles_by_keywords(articles, keywords)

    # select dates
    ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date, num_dates)

    # train TFIDF vectorizer on all sentences (not just the ones for this date)
    all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
```
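The comment about training the TF-IDF vectorizer on all sentences marks a deliberate choice: fitting once on the whole corpus keeps document frequencies, and thus IDF weights, comparable across dates. A minimal sketch with made-up sentences; the nested article/sentence dicts mirror the comprehension above:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

# made-up articles in the shape the comprehension above expects
articles = [
    {'sentences': [{'text': 'Protests erupted in the capital.'},
                   {'text': 'The government promised reforms.'}]},
    {'sentences': [{'text': 'Talks continued for a third day in the capital.'}]},
]
all_sentences = [sentence['text'] for article in articles
                 for sentence in article['sentences']]

# fit once on every sentence so all per-date sentence vectors
# live in the same TF-IDF space
vectorizer = TfidfVectorizer()
sentence_vectors = vectorizer.fit_transform(all_sentences)
print(sentence_vectors.shape)  # (3, vocabulary size)
```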