def date_uniformity(dates):
    """Measure how evenly a collection of dates is spread over time.

    The dates are put in order, the gap in days between each pair of
    neighbours is computed with ``util.days_between``, and the population
    standard deviation (``np.std`` with its default ``ddof=0``) of those gaps
    is returned. A value of 0 means every gap has the same length; larger
    values mean the dates are spaced more irregularly.

    Args:
        dates: Iterable of date strings in the format understood by
            ``util.days_between``. The input is never modified.

    Returns:
        The standard deviation of the day gaps between consecutive dates, or
        NaN when fewer than two dates are given (there is no gap to measure).
    """
    # Work on a sorted copy: an in-place ``dates.sort()`` would silently
    # reorder the caller's list. Plain string ordering is chronological as
    # long as the dates are zero-padded year-month-day strings.
    ordered = sorted(dates)

    # With zero or one date there are no gaps; ``np.std([])`` would return
    # NaN anyway but also emits a RuntimeWarning, so answer directly.
    if len(ordered) < 2:
        return float('nan')

    gaps = [util.days_between(earlier, later)
            for earlier, later in zip(ordered, ordered[1:])]
    return np.std(gaps)
def days_between(date1, date2, date_format='%Y-%m-%d'):
    """Return the absolute number of whole days between two date strings.

    The order of the arguments does not matter; the result is never negative.

    Args:
        date1: First date, as a string matching ``date_format``.
        date2: Second date, as a string matching ``date_format``.
        date_format: ``datetime.strptime`` format used to parse both strings.
            Defaults to ``'%Y-%m-%d'``.

    Returns:
        Non-negative int: the number of whole days separating the two dates.

    Raises:
        ValueError: If either string does not match ``date_format``
            (raised by ``datetime.strptime``).
    """
    first = datetime.strptime(date1, date_format)
    second = datetime.strptime(date2, date_format)
    # Take abs() of the timedelta *before* reading .days: a negative
    # timedelta floors its day count, so abs(delta.days) could differ by one
    # depending on argument order when the format includes a time of day.
    return abs(first - second).days