Skip to content
Snippets Groups Projects
Commit 53a69981 authored by vvye's avatar vvye
Browse files

Implement date uniformity

parent ee1ca829
No related branches found
No related tags found
No related merge requests found
from collections import Counter
from datetime import datetime
import random
import igraph
import numpy as np
import util
......@@ -17,7 +16,7 @@ def rank_dates_by_mention_count(articles, start_date, end_date):
return [item[0] for item in mention_count.most_common()]
def rank_dates_by_wilson(articles, start_date, end_date):
def rank_dates_by_wilson(articles, start_date, end_date, num_dates):
# count how often each published -> mentioned pair occurs
pub_to_mention_count = Counter({})
for article in articles:
......@@ -30,18 +29,24 @@ def rank_dates_by_wilson(articles, start_date, end_date):
# the edge weight for each published -> mentioned pair is (occurrence count) * (temporal distance between the dates)
edges = []
for pub_date, mentioned_date in pub_to_mention_count.keys():
date_diff = (datetime.strptime(pub_date, '%Y-%m-%d') - datetime.strptime(mentioned_date, '%Y-%m-%d')).days
edge_weight = pub_to_mention_count[(pub_date, mentioned_date)] * abs(date_diff)
date_diff = util.days_between(mentioned_date, pub_date)
edge_weight = pub_to_mention_count[(pub_date, mentioned_date)] * date_diff
edges.append((pub_date, mentioned_date, edge_weight))
# create a graph from the edge list
g = igraph.Graph.TupleList(edges, directed=True, edge_attrs='weight')
vertex_names = g.vs['name']
# igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))
# rank vertices by pagerank score
pagerank_scores = g.pagerank(directed=True, weights=g.es['weight'])
ranked_dates = util.rank(vertex_names, scores=pagerank_scores)
print(date_uniformity(ranked_dates[:num_dates]))
return ranked_dates
def date_uniformity(dates):
    """Return the standard deviation of the day-gaps between consecutive dates.

    A lower value means the dates are spread more evenly across the period
    (0.0 = perfectly uniform spacing).

    Args:
        dates: list of date strings in 'YYYY-MM-DD' format (any order).

    Returns:
        float: standard deviation of the gaps in days; 0.0 when fewer than
        two dates are given (np.std of an empty list would be nan and emit
        a RuntimeWarning).
    """
    if len(dates) < 2:
        # Zero or one date is trivially "uniform"; avoids nan from np.std([]).
        return 0.0
    # Sort a copy so the caller's list (e.g. a ranked date list) is untouched.
    ordered = sorted(dates)
    gaps = [util.days_between(ordered[i], ordered[i + 1])
            for i in range(len(ordered) - 1)]
    return np.std(gaps)
......@@ -23,7 +23,7 @@ def make_timeline(articles, gold_timeline, keywords):
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date)
ranked_dates = date_selection.rank_dates_by_wilson(articles, start_date, end_date, num_dates)
# train TFIDF vectorizer on all sentences (not just the ones for this date)
all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
......
import os
from datetime import datetime
def subdirs(path):
......@@ -23,3 +24,8 @@ def rank(lst, scores):
"""
sorted_indices = sorted(list(range(len(scores))), key=lambda i: scores[i], reverse=True)
return [lst[i] for i in sorted_indices]
def days_between(date1, date2):
    """Return the absolute number of days between two 'YYYY-MM-DD' date strings."""
    fmt = '%Y-%m-%d'
    first = datetime.strptime(date1, fmt)
    second = datetime.strptime(date2, fmt)
    return abs((first - second).days)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment