Refactor list ranking

7c49fcd7 · vvye · a5d0ee9d · 7c49fcd7 · 7c49fcd7 · 7c49fcd7
Commit 7c49fcd7 authored 3 years ago by vvye
--- a/date_selection.py
+++ b/date_selection.py
@@ -3,6 +3,8 @@ from datetime import datetime

 import igraph

+import util
+

 def rank_dates_by_mention_count(articles, start_date, end_date):
    mention_count = Counter({})
@@ -38,9 +40,8 @@ def rank_dates_by_wilson(articles, start_date, end_date):

    # igraph.plot(g, layout='kk', vertex_label=g.vs['name'], bbox=(3000, 3000))

-    # rate vertices by pagerank score
+    # rank vertices by pagerank score
    pagerank_scores = g.pagerank()
-    sorted_indices = sorted(list(range(len(pagerank_scores))), key=lambda i: pagerank_scores[i], reverse=True)
-    ranked_dates = [vertex_names[i] for i in sorted_indices]
+    ranked_dates = util.rank(vertex_names, scores=pagerank_scores)

    return ranked_dates
--- a/sentence_selection.py
+++ b/sentence_selection.py
@@ -3,6 +3,8 @@ from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 from scipy import sparse

+import util
+

 def candidate_sentences(articles, date, vectorizer):
    pub_sentences = sentences_published_on_date(articles, date, tolerance_days=2, num_first_sentences=5)
@@ -34,13 +36,10 @@ def candidate_sentences(articles, date, vectorizer):
    candidate_sentence_pool = ment_sentences + pub_sentences
    sent_vectors = vectorizer.transform([s['text'] for s in candidate_sentence_pool]).toarray().tolist()
    similarities = cosine_similarity([date_vector], sent_vectors)[0]
-
-    sorted_indices = sorted(list(range(len(candidate_sentence_pool))), key=lambda i: similarities[i], reverse=True)
-    sorted_sentences = [candidate_sentence_pool[i] for i in sorted_indices]
-    sorted_scores = [similarities[i] for i in sorted_indices]
+    sorted_sentences = util.rank(candidate_sentence_pool, scores=similarities)

    # only consider sentences above the "knee point"
-    cutoff_index = knee_point(sorted_scores)
+    cutoff_index = knee_point(sorted(similarities, reverse=True))
    candidates = sorted_sentences[:cutoff_index + 1]

    if not candidates:

--- a/util.py
+++ b/util.py
@@ -21,3 +21,15 @@ def contains_any(string, keywords):

 def avg(lst):
    return sum(lst) / len(lst)
+
+
+def rank(lst, scores):
+    """
+    Sorts a list by the values in another, corresponding, list.
+    :param lst: The list to be sorted.
+    :param scores: A list of values to sort by, where scores[i] is the "score" of lst[i]
+    (both lists are expected to have the same length).
+    :return: A copy of lst sorted from highest to lowest score.
+    """
+    sorted_indices = sorted(list(range(len(scores))), key=lambda i: scores[i], reverse=True)
+    return [lst[i] for i in sorted_indices]