Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from sklearn.feature_extraction.text import TfidfVectorizer
import sentence_selection
import summarization
def make_timeline(articles, gold_timeline, date_selection_function):
    """Build a timeline (date -> summary) sized after a gold timeline.

    The gold timeline only determines the size of the output: the number
    of dates to produce and the number of sentences requested per date
    (floor of the gold average).

    Args:
        articles: Sequence of article dicts. Each must provide a
            'pub_date' entry and a 'sentences' entry, where every
            sentence is a dict with a 'text' entry.
        gold_timeline: Mapping from date to a sized collection (its
            ``len`` is taken as the number of sentences for that date).
        date_selection_function: Callable invoked as
            ``f(articles, start_date, end_date)``; its result is iterated
            in order to pick dates (presumably best-ranked first --
            confirm against the date selection implementations).

    Returns:
        A dict mapping each selected date to the summary produced by
        ``summarization.summarize``. Contains at most
        ``len(gold_timeline)`` entries; dates without candidate
        sentences or with an empty summary are skipped. An empty dict is
        returned when ``gold_timeline`` or ``articles`` is empty.
    """
    num_dates = len(gold_timeline)

    # Nothing to build: an empty gold timeline would otherwise divide by
    # zero below, and an empty article list would make min()/max() raise.
    if num_dates == 0 or not articles:
        return {}

    avg_num_sentences = sum(len(gold_timeline[date]) for date in gold_timeline) // num_dates

    # select dates
    pub_dates = [article['pub_date'] for article in articles]
    start_date = min(pub_dates)
    end_date = max(pub_dates)
    ranked_dates = date_selection_function(articles, start_date, end_date)

    # train TFIDF vectorizer on all sentences (not just the ones for this date)
    all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    vectorizer.fit(all_sentences)

    timeline = {}
    for date in ranked_dates:
        # stop as soon as the timeline has as many dates as the gold one
        if len(timeline) >= num_dates:
            break

        # select candidate sentences for date
        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer)
        if not candidate_sentences:
            continue

        # build summary for date
        summary_for_date = summarization.summarize(candidate_sentences, num_sentences=avg_num_sentences)
        if not summary_for_date:
            continue

        timeline[date] = summary_for_date

    return timeline