Make summarization undershoot the length constraint instead of overshooting

c1692a7f · vvye · 8cafc46d · c1692a7f
Commit c1692a7f authored 3 years ago by vvye
--- a/summarization.py
+++ b/summarization.py
@@ -21,7 +21,7 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
        else:
            return len(selected_indices) < round(num_sentences)

-    while remaining_indices and constraint():
+    while True:

        # if the summary already has sentences, calculate the current summary vector
        if selected_indices:
@@ -44,16 +44,32 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
            similarities[i] = cosine_similarity(candidate_summary_vector, centroid)[0, 0]

        sorted_indices = sorted(remaining_indices, key=lambda i: similarities[i], reverse=True)
+
+        # go through all sentences in order from "best" (most similar to centroid) to "worst"
        for i in sorted_indices:
+            # don't consider this sentence in the next round
            remaining_indices.remove(i)
-            sentence = sentences[i]
-            if not any([kw.lower() in sentence['text'].lower() for kw in keywords]):
+            # if the sentence contains no keywords, skip it
+            if not any([kw.lower() in sentences[i]['text'].lower() for kw in keywords]):
                continue
+            # if the sentence is near-identical to the current summary, skip it
            if redundant(i, selected_indices, X):
                continue
+            # if the sentence would make the summary too long, skip it
+            if by_tokens:
+                if sum([len(sentences[k]['text'].split()) for k in (selected_indices + [i])]) > round(num_tokens):
+                    continue
+            else:
+                if len(selected_indices) + 2 > round(num_sentences):
+                    continue
+            # otherwise, select the sentence
            selected_indices.append(i)
            break

+        # if there are still sentences left, repeat this step
+        if remaining_indices:
+            continue
+
    return [sentences[i]['text'] for i in selected_indices]