Skip to content
Snippets Groups Projects
Commit e1cc453c authored by vvye's avatar vvye
Browse files

Start implementing date selection from WILSON

parent 32eb4acb
No related branches found
No related tags found
No related merge requests found
from collections import Counter
from datetime import datetime
import igraph
def rank_dates_by_mention_count(articles, start_date, end_date):
......@@ -9,3 +12,36 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
if start_date <= mentioned_date <= end_date:
mention_count[mentioned_date] += 1
return [item[0] for item in mention_count.most_common()]
def rank_dates_by_wilson(articles, start_date, end_date):
    """Rank dates by PageRank over a weighted date-reference graph.

    Every (publication date -> mentioned date) reference found in the
    articles becomes a directed edge. The edge weight is the number of
    sentences containing that reference multiplied by the distance in days
    between the two dates, and the weights are used by PageRank.

    Args:
        articles: iterable of dicts with a 'pub_date' string and a
            'sentences' list; each sentence is a dict with a
            'mentioned_dates' iterable. Dates are parsed with the
            '%Y-%m-%d' format.
        start_date: lower bound (inclusive) for mentioned dates; compared
            directly against the mentioned-date values.
        end_date: upper bound (inclusive) for mentioned dates.

    Returns:
        The graph's vertex names (date strings) ordered by descending
        PageRank score, or an empty list when no reference qualifies.

    NOTE(review): only mentioned dates are filtered by the
    [start_date, end_date] range; publication dates are graph vertices too
    and can therefore appear in the result even if they fall outside it.
    """
    # count how often each published -> mentioned pair occurs
    pub_to_mention_count = Counter()
    for article in articles:
        pub_date = article['pub_date']
        for sentence in article['sentences']:
            # set(): a date repeated within one sentence is counted once
            for mentioned_date in set(sentence['mentioned_dates']):
                if pub_date != mentioned_date and start_date <= mentioned_date <= end_date:
                    pub_to_mention_count[(pub_date, mentioned_date)] += 1

    # nothing to rank: skip building an empty graph
    if not pub_to_mention_count:
        return []

    # the edge weight for each published -> mentioned pair
    # is how often it occurs * the temporal distance between the dates
    edges = []
    for (pub_date, mentioned_date), count in pub_to_mention_count.items():
        date_diff = (datetime.strptime(pub_date, '%Y-%m-%d')
                     - datetime.strptime(mentioned_date, '%Y-%m-%d')).days
        edges.append((pub_date, mentioned_date, count * abs(date_diff)))

    # create a graph from the edge list; weights=True stores the third tuple
    # element as the 'weight' edge attribute (it is dropped otherwise)
    g = igraph.Graph.TupleList(edges, directed=True, weights=True)
    vertex_names = g.vs['name']

    # rate vertices by weighted pagerank score
    pagerank_scores = g.pagerank(weights='weight')
    sorted_indices = sorted(range(len(pagerank_scores)),
                            key=lambda i: pagerank_scores[i], reverse=True)
    return [vertex_names[i] for i in sorted_indices]
......@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date)
ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date)
# train TFIDF vectorizer on all sentences (not just the ones for this date)
all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment