Skip to content
Snippets Groups Projects
Commit c1692a7f authored by vvye
Browse files

Make summarization undershoot the length constraint instead of overshooting

parent 8cafc46d
No related branches found
No related tags found
No related merge requests found
......@@ -21,7 +21,7 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
else:
return len(selected_indices) < round(num_sentences)
while remaining_indices and constraint():
while True:
# if the summary already has sentences, calculate the current summary vector
if selected_indices:
......@@ -44,16 +44,32 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
similarities[i] = cosine_similarity(candidate_summary_vector, centroid)[0, 0]
sorted_indices = sorted(remaining_indices, key=lambda i: similarities[i], reverse=True)
# go through all sentences in order from "best" (most similar to centroid) to "worst"
for i in sorted_indices:
# don't consider this sentence in the next round
remaining_indices.remove(i)
sentence = sentences[i]
if not any([kw.lower() in sentence['text'].lower() for kw in keywords]):
# if the sentence contains no keywords, skip it
if not any([kw.lower() in sentences[i]['text'].lower() for kw in keywords]):
continue
# if the sentence is near-identical to the current summary, skip it
if redundant(i, selected_indices, X):
continue
# if the sentence would make the summary too long, skip it
if by_tokens:
if sum([len(sentences[k]['text'].split()) for k in (selected_indices + [i])]) > round(num_tokens):
continue
else:
if len(selected_indices) + 2 > round(num_sentences):
continue
# otherwise, select the sentence
selected_indices.append(i)
break
# if there are still sentences left, repeat this step
if remaining_indices:
continue
return [sentences[i]['text'] for i in selected_indices]
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment