Start implementing date selection from WILSON

e1cc453c · vvye · 32eb4acb · e1cc453c · e1cc453c
Commit e1cc453c authored 3 years ago by vvye
--- a/date_selection.py
+++ b/date_selection.py
 from collections import Counter
+from datetime import datetime
+
+import igraph


 def rank_dates_by_mention_count(articles, start_date, end_date):
@@ -9,3 +12,36 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
                if start_date <= mentioned_date <= end_date:
                    mention_count[mentioned_date] += 1
    return [item[0] for item in mention_count.most_common()]
+
+
+def rank_dates_by_wilson(articles, start_date, end_date):
+    """
+    Rank dates by their weighted PageRank score in a date reference graph.
+
+    Every article contributes directed edges from its publication date to each
+    date mentioned in one of its sentences. An edge's weight is the number of
+    sentences making that reference multiplied by the distance in days between
+    the two dates.
+
+    :param articles: iterable of dicts with a 'pub_date' entry and a
+        'sentences' list whose items have a 'mentioned_dates' list; dates are
+        parsed with the format '%Y-%m-%d' below, so they are expected to be
+        strings in that format
+    :param start_date: earliest mentioned date (inclusive) to consider
+    :param end_date: latest mentioned date (inclusive) to consider
+    :return: all dates occurring in at least one edge, ordered from highest to
+        lowest PageRank score; an empty list if there are no edges at all
+    """
+    # count how often each published -> mentioned pair occurs
+    # (a date mentioned several times within one sentence only counts once)
+    pub_to_mention_count = Counter()
+    for article in articles:
+        pub_date = article['pub_date']
+        for sentence in article['sentences']:
+            for mentioned_date in set(sentence['mentioned_dates']):
+                if pub_date != mentioned_date and start_date <= mentioned_date <= end_date:
+                    pub_to_mention_count[(pub_date, mentioned_date)] += 1
+
+    # without edges the graph has no vertices (and no 'name' attribute) to rank
+    if not pub_to_mention_count:
+        return []
+
+    # the edge weight for each published -> mentioned pair
+    # is how often it occurs * the temporal distance between the dates
+    edges = []
+    for (pub_date, mentioned_date), count in pub_to_mention_count.items():
+        date_diff = (datetime.strptime(pub_date, '%Y-%m-%d') - datetime.strptime(mentioned_date, '%Y-%m-%d')).days
+        edges.append((pub_date, mentioned_date, count * abs(date_diff)))
+
+    # create a graph from the edge list; weights=True stores the third tuple
+    # item as the 'weight' edge attribute (it would be ignored otherwise)
+    g = igraph.Graph.TupleList(edges, directed=True, weights=True)
+    vertex_names = g.vs['name']
+
+    # debugging aid:
+    # igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))
+
+    # rate vertices by pagerank score, taking the edge weights into account
+    pagerank_scores = g.pagerank(weights='weight')
+    sorted_indices = sorted(range(len(pagerank_scores)), key=lambda i: pagerank_scores[i], reverse=True)
+
+    return [vertex_names[i] for i in sorted_indices]
--- a/timeline_generation.py
+++ b/timeline_generation.py
@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
    # articles = dataset.filter_articles_by_keywords(articles, keywords)

    # select dates
-    ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date)
+    ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date)

    # train TFIDF vectorizer on all sentences (not just the ones for this date)
    all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]