Skip to content
Snippets Groups Projects
Commit c9914899 authored by vvye's avatar vvye
Browse files

Constrain summaries by number of tokens instead of number of sentences

parent fa97eff1
No related branches found
No related tags found
No related merge requests found
......@@ -2,10 +2,8 @@ from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import util
def summarize(sentences, vectorizer, keywords, num_sentences):
def summarize(sentences, vectorizer, keywords, by_tokens, num_sentences, num_tokens):
selected_indices = []
remaining_indices = set(range(len(sentences)))
......@@ -15,7 +13,15 @@ def summarize(sentences, vectorizer, keywords, num_sentences):
Xsum = sparse.csr_matrix(X.sum(0))
centroid = normalize(Xsum)
while remaining_indices and len(selected_indices) < num_sentences:
# decide when the summary is complete: either a token budget or a
# sentence budget, depending on the by_tokens flag
def constraint():
    # sentence-count mode: keep going while fewer than num_sentences picked
    if not by_tokens:
        return len(selected_indices) < num_sentences
    # token mode: count whitespace-delimited tokens over all picked sentences
    tokens_so_far = sum(len(sentences[i]['text'].split()) for i in selected_indices)
    return tokens_so_far < num_tokens
while remaining_indices and constraint():
# if the summary already has sentences, calculate the current summary vector
if selected_indices:
......
......@@ -4,6 +4,7 @@ import dataset
import date_selection
import sentence_selection
import summarization
import util
def make_timeline(articles, gold_timeline, keywords):
......@@ -11,15 +12,13 @@ def make_timeline(articles, gold_timeline, keywords):
num_dates = len(gold_timeline)
avg_num_sentences = round(sum([len(gold_timeline[date]) for date in gold_timeline]) / len(gold_timeline))
avg_num_tokens = round(util.avg([sum([len(s.split()) for s in gold_timeline[date]]) for date in gold_timeline]))
# keep only the articles published within the gold timeline's range
start_date = min(gold_timeline.keys())
end_date = max(gold_timeline.keys())
articles = dataset.filter_articles_by_date(articles, start_date, end_date)
# keep only the sentences containing at least one of the keywords
# articles = dataset.filter_articles_by_keywords(articles, keywords)
# select dates
ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date, num_dates)
......@@ -39,8 +38,8 @@ def make_timeline(articles, gold_timeline, keywords):
continue
# build summary for date
summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords,
num_sentences=avg_num_sentences)
summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=True,
num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
if not summary_for_date:
continue
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment