Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
BA Timeline Summarization
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
kaiser
BA Timeline Summarization
Commits
e1cc453c
Commit
e1cc453c
authored
3 years ago
by
vvye
Browse files
Options
Downloads
Patches
Plain Diff
Start implementing date selection from WILSON
parent
32eb4acb
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
date_selection.py
+36
-0
36 additions, 0 deletions
date_selection.py
timeline_generation.py
+1
-1
1 addition, 1 deletion
timeline_generation.py
with
37 additions
and
1 deletion
date_selection.py
+
36
−
0
View file @
e1cc453c
from
collections
import
Counter
from
datetime
import
datetime
import
igraph
def
rank_dates_by_mention_count
(
articles
,
start_date
,
end_date
):
...
...
@@ -9,3 +12,36 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
if
start_date
<=
mentioned_date
<=
end_date
:
mention_count
[
mentioned_date
]
+=
1
return
[
item
[
0
]
for
item
in
mention_count
.
most_common
()]
def rank_dates_by_wilson(articles, start_date, end_date):
    """Rank dates by PageRank over a weighted date-reference graph.

    Every (publication date -> mentioned date) reference found in the
    articles becomes a directed edge.  Its weight is the number of sentences
    that contain the reference multiplied by the distance in days between
    the two dates.  Vertices are then ordered by their weighted PageRank
    score.

    Args:
        articles: iterable of dicts, each with a 'pub_date' string in
            ``%Y-%m-%d`` format and a 'sentences' list; every sentence is a
            dict with a 'mentioned_dates' iterable of ``%Y-%m-%d`` strings.
        start_date: inclusive lower bound for mentioned dates (compared
            with ``<=`` against the mentioned date values).
        end_date: inclusive upper bound for mentioned dates.

    Returns:
        List of vertex names (dates) sorted from highest to lowest PageRank
        score.  Empty when no reference passes the filter.

    Raises:
        ValueError: if a date that ends up on an edge does not parse as
            ``%Y-%m-%d``.
    """
    # count how often each published -> mentioned pair occurs;
    # set() makes a date count at most once per sentence
    pub_to_mention_count = Counter()
    for article in articles:
        pub_date = article['pub_date']
        for sentence in article['sentences']:
            for mentioned_date in set(sentence['mentioned_dates']):
                if pub_date != mentioned_date and start_date <= mentioned_date <= end_date:
                    pub_to_mention_count[(pub_date, mentioned_date)] += 1

    # nothing to rank: avoid building (and querying) an empty graph
    if not pub_to_mention_count:
        return []

    # the edge weight for each published -> mentioned pair
    # is how often it occurs * the temporal distance between the dates
    edges = []
    for (pub_date, mentioned_date), count in pub_to_mention_count.items():
        date_diff = (datetime.strptime(pub_date, '%Y-%m-%d')
                     - datetime.strptime(mentioned_date, '%Y-%m-%d')).days
        edges.append((pub_date, mentioned_date, count * abs(date_diff)))

    # create a graph from the edge list; weights=True stores the third tuple
    # item as the 'weight' edge attribute (it is silently dropped otherwise)
    g = igraph.Graph.TupleList(edges, directed=True, weights=True)
    vertex_names = g.vs['name']

    # debugging aid:
    # igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))

    # rate vertices by weighted pagerank score
    pagerank_scores = g.pagerank(weights='weight')

    # NOTE(review): publication dates are vertices too, so the result can
    # contain dates outside [start_date, end_date] - confirm callers expect it.
    # sorted() is stable, so ties keep the graph's vertex order.
    ranked = sorted(zip(vertex_names, pagerank_scores),
                    key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked]
This diff is collapsed.
Click to expand it.
timeline_generation.py
+
1
−
1
View file @
e1cc453c
...
...
@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates
=
date_selection
.
rank_dates_by_
mention_count
(
articles
,
start_date
,
end_date
)
ranked_dates
=
date_selection
.
rank_dates_by_
wilson
(
articles
,
start_date
,
end_date
)
# train TFIDF vectorizer on all sentences (not just the ones for this date)
all_sentences
=
[
sentence
[
'
text
'
]
for
article
in
articles
for
sentence
in
article
[
'
sentences
'
]]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment