Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
B
BA Timeline Summarization
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
kaiser
BA Timeline Summarization
Commits
53a69981
Commit
53a69981
authored
3 years ago
by
vvye
Browse files
Options
Downloads
Patches
Plain Diff
Implement date uniformity
parent
ee1ca829
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
date_selection.py
+12
-7
12 additions, 7 deletions
date_selection.py
timeline_generation.py
+1
-1
1 addition, 1 deletion
timeline_generation.py
util.py
+6
-0
6 additions, 0 deletions
util.py
with
19 additions
and
8 deletions
date_selection.py
+
12
−
7
View file @
53a69981
from
collections
import
Counter
from
datetime
import
datetime
import
random
import
igraph
import
numpy
as
np
import
util
...
...
@@ -17,7 +16,7 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
return
[
item
[
0
]
for
item
in
mention_count
.
most_common
()]
def
rank_dates_by_wilson
(
articles
,
start_date
,
end_date
):
def
rank_dates_by_wilson
(
articles
,
start_date
,
end_date
,
num_dates
):
# count how often each published -> mentioned pair occurs
pub_to_mention_count
=
Counter
({})
for
article
in
articles
:
...
...
@@ -30,18 +29,24 @@ def rank_dates_by_wilson(articles, start_date, end_date):
# the edge weight for each published -> mentioned pair is (occurrence count) * (temporal distance between the dates)
edges
=
[]
for
pub_date
,
mentioned_date
in
pub_to_mention_count
.
keys
():
date_diff
=
(
datetime
.
strptime
(
pub_date
,
'
%Y-%m-%d
'
)
-
datetime
.
strptime
(
mentioned_date
,
'
%Y-%m-%d
'
)).
days
edge_weight
=
pub_to_mention_count
[(
pub_date
,
mentioned_date
)]
*
abs
(
date_diff
)
date_diff
=
util
.
days_between
(
mentioned_date
,
pub_date
)
edge_weight
=
pub_to_mention_count
[(
pub_date
,
mentioned_date
)]
*
date_diff
edges
.
append
((
pub_date
,
mentioned_date
,
edge_weight
))
# create a graph from the edge list
g
=
igraph
.
Graph
.
TupleList
(
edges
,
directed
=
True
,
edge_attrs
=
'
weight
'
)
vertex_names
=
g
.
vs
[
'
name
'
]
# igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))
# rank vertices by pagerank score
pagerank_scores
=
g
.
pagerank
(
directed
=
True
,
weights
=
g
.
es
[
'
weight
'
])
ranked_dates
=
util
.
rank
(
vertex_names
,
scores
=
pagerank_scores
)
print
(
date_uniformity
(
ranked_dates
[:
num_dates
]))
return
ranked_dates
def date_uniformity(dates):
    """Measure how evenly spaced a collection of dates is.

    The dates are put in ascending order, the gap in days between each pair
    of neighbouring dates is computed with ``util.days_between``, and the
    standard deviation of those gaps is returned. A lower value means the
    dates are spread more uniformly.

    The input is not modified; a sorted copy is used internally.

    :param dates: iterable sized collection of date values accepted by
        ``util.days_between`` (presumably '%Y-%m-%d' strings, for which
        plain sorting matches chronological order -- confirm with callers)
    :return: standard deviation of the gaps between consecutive dates, or
        NaN when there are fewer than two dates (no gap exists)
    """
    # With fewer than two dates there are no gaps; np.std([]) would still
    # yield NaN but only after emitting a RuntimeWarning, so short-circuit.
    if len(dates) < 2:
        return float('nan')

    # sorted() instead of dates.sort() so the caller's list keeps its order.
    ordered = sorted(dates)
    gaps = [util.days_between(earlier, later)
            for earlier, later in zip(ordered, ordered[1:])]
    return np.std(gaps)
This diff is collapsed.
Click to expand it.
timeline_generation.py
+
1
−
1
View file @
53a69981
...
...
@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates
=
date_selection
.
rank_dates_by_wilson
(
articles
,
start_date
,
end_date
)
ranked_dates
=
date_selection
.
rank_dates_by_wilson
(
articles
,
start_date
,
end_date
,
num_dates
)
# train TFIDF vectorizer on all sentences (not just the ones for this date)
all_sentences
=
[
sentence
[
'
text
'
]
for
article
in
articles
for
sentence
in
article
[
'
sentences
'
]]
...
...
This diff is collapsed.
Click to expand it.
util.py
+
6
−
0
View file @
53a69981
import
os
from
datetime
import
datetime
def
subdirs
(
path
):
...
...
@@ -23,3 +24,8 @@ def rank(lst, scores):
"""
sorted_indices
=
sorted
(
list
(
range
(
len
(
scores
))),
key
=
lambda
i
:
scores
[
i
],
reverse
=
True
)
return
[
lst
[
i
]
for
i
in
sorted_indices
]
def days_between(date1, date2):
    """Return the absolute number of days separating two dates.

    Both arguments are parsed with the '%Y-%m-%d' format; the order in which
    they are given does not affect the result.

    :param date1: first date as a '%Y-%m-%d' string
    :param date2: second date as a '%Y-%m-%d' string
    :return: non-negative whole number of days between the two dates
    """
    date_format = '%Y-%m-%d'
    first = datetime.strptime(date1, date_format)
    second = datetime.strptime(date2, date_format)
    delta = first - second
    return abs(delta.days)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment