def rank_dates_by_wilson(articles, start_date, end_date):
    """Rank dates by weighted PageRank on a published -> mentioned date graph.

    Every (publication date, mentioned date) pair found in the articles
    becomes a directed edge from the publication date to the mentioned date.
    The edge weight is the number of sentences in which the pair occurs
    multiplied by the distance in days between the two dates.

    Args:
        articles: iterable of dicts with a 'pub_date' entry and a 'sentences'
            list; each sentence is a dict with a 'mentioned_dates' iterable.
            Dates are strings in '%Y-%m-%d' format.
        start_date: lower bound (inclusive) for mentioned dates; compared
            directly against the date strings.
        end_date: upper bound (inclusive) for mentioned dates.

    Returns:
        A list of date strings (all vertices of the graph) sorted by
        descending PageRank score; empty if no pair qualifies.

    NOTE(review): only mentioned dates are restricted to
    [start_date, end_date]; publication dates outside that range still become
    vertices and can therefore appear in the result -- confirm callers
    expect that.
    """
    date_format = '%Y-%m-%d'

    # count how often each published -> mentioned pair occurs; set() makes a
    # date that is repeated within one sentence count once for that sentence
    pub_to_mention_count = Counter()
    for article in articles:
        pub_date = article['pub_date']
        for sentence in article['sentences']:
            for mentioned_date in set(sentence['mentioned_dates']):
                if pub_date != mentioned_date and start_date <= mentioned_date <= end_date:
                    pub_to_mention_count[(pub_date, mentioned_date)] += 1

    # nothing to rank: skip building a graph altogether
    if not pub_to_mention_count:
        return []

    # the edge weight for each published -> mentioned pair
    # is how often it occurs * the temporal distance between the dates
    edges = []
    for (pub_date, mentioned_date), count in pub_to_mention_count.items():
        date_diff = (datetime.strptime(pub_date, date_format)
                     - datetime.strptime(mentioned_date, date_format)).days
        edges.append((pub_date, mentioned_date, count * abs(date_diff)))

    # weights=True stores the third tuple element as the 'weight' edge
    # attribute; without it the computed weights are silently dropped
    g = igraph.Graph.TupleList(edges, directed=True, weights=True)
    vertex_names = g.vs['name']

    # rate vertices by pagerank score, taking the edge weights into account
    pagerank_scores = g.pagerank(weights='weight')
    sorted_indices = sorted(
        range(len(pagerank_scores)),
        key=pagerank_scores.__getitem__,
        reverse=True,
    )
    return [vertex_names[i] for i in sorted_indices]
b/timeline_generation.py index 52824452dccb03e06572aacc4fa5843666e0665c..f5a8c2367c5d703c67ead7ca11b4d7410b0c1b73 100644 --- a/timeline_generation.py +++ b/timeline_generation.py @@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords): # articles = dataset.filter_articles_by_keywords(articles, keywords) # select dates - ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date) + ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date) # train TFIDF vectorizer on all sentences (not just the ones for this date) all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]