Skip to content
Snippets Groups Projects
Commit 8afc1b76 authored by vvye
Browse files

Actually implement redundancy

parent d914ae3d
No related branches found
No related tags found
No related merge requests found
......@@ -17,6 +17,7 @@ def main(args):
sentence_shortening.shortening_length = args.shortening_length
sentence_shortening.do_resolve_unks = True
filter_keywords = args.filter_keywords
redundancy = args.redundancy_threshold
# get dataset
data = {
......@@ -36,7 +37,7 @@ def main(args):
print(f'Topic {topic}, gold timeline {gold_timeline_name}')
timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens, shorten,
filter_keywords)
filter_keywords, redundancy)
if not timeline:
print(' the generated timeline is empty - skipping evaluation of this one')
......
......@@ -5,7 +5,8 @@ from sklearn.preprocessing import normalize
import sentence_shortening
def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tokens, shorten, filter_keywords):
def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tokens, shorten, filter_keywords,
redundancy):
selected_indices = []
selected_sentences = []
remaining_indices = set(range(len(sentences)))
......@@ -62,7 +63,7 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
continue
# if the sentence is near-identical to the current summary, skip it
if redundant(i, selected_indices, X):
if redundant(i, selected_indices, X, redundancy):
continue
# if the sentence would make the summary too long, skip it
......@@ -93,10 +94,10 @@ def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tok
return selected_sentences
def redundant(new_i, selected, X, redundancy=0.9999):
    """Return True if sentence ``new_i`` is too similar to an already selected one.

    The candidate vector ``X[new_i]`` is compared against ``X[i]`` for every
    index ``i`` in ``selected``; the candidate counts as redundant as soon as
    one cosine similarity exceeds ``redundancy``.

    Args:
        new_i: Index into ``X`` of the candidate sentence.
        selected: Indices into ``X`` of the sentences already in the summary.
            With no selected sentences the candidate is never redundant.
        X: Indexable collection of sentence vectors.
            NOTE(review): presumably a 2-D (sparse) matrix with one row per
            sentence, as produced by the vectorizer - confirm against caller.
        redundancy: Similarity threshold above which the candidate is
            rejected. Defaults to 0.9999, the value that was hard-coded
            before the threshold became a parameter.

    Returns:
        True if any selected sentence is more similar than ``redundancy``,
        otherwise False.
    """
    new_x = X[new_i]
    # cosine_similarity comes from the module's imports (not visible here;
    # presumably scikit-learn's pairwise metric - confirm). Its first row is
    # compared as a whole, so this relies on that row holding one value.
    return any(
        cosine_similarity(new_x, X[i])[0] > redundancy
        for i in selected
    )
......@@ -7,7 +7,7 @@ import sentence_selection
import summarization
def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten, filter_keywords):
def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten, filter_keywords, redundancy):
timeline = {}
num_dates = len(gold_timeline)
......@@ -40,7 +40,8 @@ def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten, filter_
# build summary for date
summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens,
num_sentences=avg_num_sentences, num_tokens=avg_num_tokens,
shorten=shorten, filter_keywords=filter_keywords)
shorten=shorten, filter_keywords=filter_keywords,
redundancy=redundancy)
if not summary_for_date:
continue
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment