diff --git a/date_selection.py b/date_selection.py index f5db4631032e8a472d3eb6a087954f590d992e8f..4d3f8d51cb4f721fb1b45e510883188c23b88dae 100644 --- a/date_selection.py +++ b/date_selection.py @@ -1,3 +1,4 @@ +import math from collections import Counter import igraph @@ -37,13 +38,35 @@ def rank_dates_by_wilson(articles, start_date, end_date, num_dates): g = igraph.Graph.TupleList(edges, directed=True, edge_attrs='weight') vertex_names = g.vs['name'] - # rank vertices by pagerank score - pagerank_scores = g.pagerank(directed=True, weights=g.es['weight']) - ranked_dates = util.rank(vertex_names, scores=pagerank_scores) + # rank the dates with personalized pagerank + # (do this multiple times with different "vertex weights", depending on alpha, + # and return the result that is most uniform) + best_uniformity = math.inf + best_ranked_dates = [] + candidate_alphas = [0.01 * x for x in range(1, 100)] + for alpha in candidate_alphas: - print(date_uniformity(ranked_dates[:num_dates])) + # calculate vertex "weights" for personalized pagerank + vertex_weights = [] + start = min(vertex_names) + for date in vertex_names: + diff_to_start = util.days_between(start, date) + try: + vertex_weights.append(alpha ** -diff_to_start) + except OverflowError: + vertex_weights.append(math.inf) - return ranked_dates + # rank vertices with personalized pagerank + pagerank_scores = g.personalized_pagerank(directed=True, weights=g.es['weight'], reset=vertex_weights) + ranked_dates = util.rank(vertex_names, scores=pagerank_scores) + + # if this result is the most uniform yet, save it + uniformity = date_uniformity(ranked_dates[:num_dates]) + if uniformity < best_uniformity: + best_uniformity = uniformity + best_ranked_dates = ranked_dates + + return best_ranked_dates def date_uniformity(dates):