diff --git a/summarization.py b/summarization.py
index 141cf74c6d0f9fbd37556b525500f3976bcb1611..71ebc8bfc9f292809e43f82c120c8daf8d8fdcc2 100644
--- a/summarization.py
+++ b/summarization.py
@@ -2,10 +2,8 @@
 from scipy import sparse
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.preprocessing import normalize
-import util
-
-def summarize(sentences, vectorizer, keywords, num_sentences):
+def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tokens):
 
     selected_indices = []
     remaining_indices = set(range(len(sentences)))
 
@@ -15,7 +13,15 @@ def summarize(sentences, vectorizer, keywords, num_sentences):
     Xsum = sparse.csr_matrix(X.sum(0))
     centroid = normalize(Xsum)
 
-    while remaining_indices and len(selected_indices) < num_sentences:
+    # determine constraint for when the summary is considered complete
+    # (either checking for number of sentences or number of tokens)
+    def constraint():
+        if by_tokens:
+            return sum([len(sentences[i]['text'].split()) for i in selected_indices]) < num_tokens
+        else:
+            return len(selected_indices) < num_sentences
+
+    while remaining_indices and constraint():
 
         # if the summary already has sentences, calculate the current summary vector
         if selected_indices:
diff --git a/timeline_generation.py b/timeline_generation.py
index a74fbfd4141b35643c668b486fb5395559e599f5..438b1144ef942d05e97e88c633acfaac52d798de 100644
--- a/timeline_generation.py
+++ b/timeline_generation.py
@@ -4,6 +4,7 @@ import dataset
 import date_selection
 import sentence_selection
 import summarization
+import util
 
 
 def make_timeline(articles, gold_timeline, keywords):
@@ -11,15 +12,13 @@ def make_timeline(articles, gold_timeline, keywords):
 
     num_dates = len(gold_timeline)
     avg_num_sentences = round(sum([len(gold_timeline[date]) for date in gold_timeline]) / len(gold_timeline))
+    avg_num_tokens = round(util.avg([sum([len(s.split()) for s in gold_timeline[date]]) for date in gold_timeline]))
 
     # keep only the articles published within the gold timeline's range
     start_date = min(gold_timeline.keys())
     end_date = max(gold_timeline.keys())
     articles = dataset.filter_articles_by_date(articles, start_date, end_date)
 
-    # keep only the sentences containing at least one of the keywords
-    # articles = dataset.filter_articles_by_keywords(articles, keywords)
-
     # select dates
     ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date, num_dates)
 
@@ -39,8 +38,8 @@ def make_timeline(articles, gold_timeline, keywords):
             continue
 
         # build summary for date
-        summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords,
-                                                   num_sentences=avg_num_sentences)
+        summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=True,
+                                                   num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
 
         if not summary_for_date:
             continue