Skip to content
Snippets Groups Projects
Commit 53a69981 authored by vvye's avatar vvye
Browse files

Implement date uniformity

parent ee1ca829
No related branches found
No related tags found
No related merge requests found
from collections import Counter
from datetime import datetime
import random
import igraph
import numpy as np
import util
......@@ -17,7 +16,7 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
return [item[0] for item in mention_count.most_common()]
def rank_dates_by_wilson(articles, start_date, end_date):
def rank_dates_by_wilson(articles, start_date, end_date, num_dates):
# count how often each published -> mentioned pair occurs
pub_to_mention_count = Counter({})
for article in articles:
......@@ -30,18 +29,24 @@ def rank_dates_by_wilson(articles, start_date, end_date):
# the edge weight for each published -> mentioned pair is (occurrence count) * (temporal distance between the dates)
edges = []
for pub_date, mentioned_date in pub_to_mention_count.keys():
date_diff = (datetime.strptime(pub_date, '%Y-%m-%d') - datetime.strptime(mentioned_date, '%Y-%m-%d')).days
edge_weight = pub_to_mention_count[(pub_date, mentioned_date)] * abs(date_diff)
date_diff = util.days_between(mentioned_date, pub_date)
edge_weight = pub_to_mention_count[(pub_date, mentioned_date)] * date_diff
edges.append((pub_date, mentioned_date, edge_weight))
# create a graph from the edge list
g = igraph.Graph.TupleList(edges, directed=True, edge_attrs='weight')
vertex_names = g.vs['name']
# igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))
# rank vertices by pagerank score
pagerank_scores = g.pagerank(directed=True, weights=g.es['weight'])
ranked_dates = util.rank(vertex_names, scores=pagerank_scores)
print(date_uniformity(ranked_dates[:num_dates]))
return ranked_dates
def date_uniformity(dates):
    """Return the standard deviation of the day-gaps between consecutive dates.

    A lower value means the dates are spread more evenly across the period
    (0.0 = perfectly uniform spacing).

    Args:
        dates: list of date strings in 'YYYY-MM-DD' format (any order).

    Returns:
        float: standard deviation of the gaps in days; 0.0 when fewer than
        two dates are given (np.std of an empty list would be nan and emit
        a RuntimeWarning).
    """
    if len(dates) < 2:
        # Zero or one date is trivially "uniform"; avoids nan from np.std([]).
        return 0.0
    # Sort a copy so the caller's list (e.g. a ranked date list) is untouched.
    ordered = sorted(dates)
    gaps = [util.days_between(ordered[i], ordered[i + 1])
            for i in range(len(ordered) - 1)]
    return np.std(gaps)
......@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date)
ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date, num_dates)
# train TFIDF vectorizer on all sentences (not just the ones for this date)
all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
......
import os
from datetime import datetime
def subdirs(path):
......@@ -23,3 +24,8 @@ def rank(lst, scores):
"""
sorted_indices = sorted(list(range(len(scores))), key=lambda i: scores[i], reverse=True)
return [lst[i] for i in sorted_indices]
def days_between(date1, date2):
    """Return the absolute number of days between two 'YYYY-MM-DD' date strings."""
    fmt = '%Y-%m-%d'
    first = datetime.strptime(date1, fmt)
    second = datetime.strptime(date2, fmt)
    return abs((first - second).days)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment