diff --git a/summarization.py b/summarization.py index 74d1e8eea738faadfce76be31ae5dd87e0b46119..16254ef5e81332e9539e2c4f62e9aff3450283df 100644 --- a/summarization.py +++ b/summarization.py @@ -21,7 +21,7 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok else: return len(selected_indices) < round(num_sentences) - while remaining_indices and constraint(): + while True: # if the summary already has sentences, calculate the current summary vector if selected_indices: @@ -44,16 +44,32 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok similarities[i] = cosine_similarity(candidate_summary_vector, centroid)[0, 0] sorted_indices = sorted(remaining_indices, key=lambda i: similarities[i], reverse=True) + + # go through all sentences in order from "best" (most similar to centroid) to "worst" for i in sorted_indices: + # don't consider this sentence in the next round remaining_indices.remove(i) - sentence = sentences[i] - if not any([kw.lower() in sentence['text'].lower() for kw in keywords]): + # if the sentence contains no keywords, skip it + if not any([kw.lower() in sentences[i]['text'].lower() for kw in keywords]): continue + # if the sentence is near-identical to the current summary, skip it if redundant(i, selected_indices, X): continue + # if the sentence would make the summary too long, skip it + if by_tokens: + if sum([len(sentences[k]['text'].split()) for k in (selected_indices + [i])]) > round(num_tokens): + continue + else: + if len(selected_indices) + 2 > round(num_sentences): + continue + # otherwise, select the sentence selected_indices.append(i) break + # if there are still sentences left, repeat this step + if remaining_indices: + continue + return [sentences[i]['text'] for i in selected_indices]