timeline_generation.py

from sklearn.feature_extraction.text import TfidfVectorizer

import dataset
import date_selection
import evaluation
import sentence_selection
import summarization
import sentence_shortening


def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten):
    timeline = {}

    num_dates = len(gold_timeline)
    avg_num_sentences = evaluation.avg_num_sentences_in_timeline(gold_timeline)
    avg_num_tokens = evaluation.avg_num_tokens_in_timeline(gold_timeline)

    # keep only the articles published within the gold timeline's range
    start_date = min(gold_timeline.keys())
    end_date = max(gold_timeline.keys())
    articles = dataset.filter_articles_by_date(articles, start_date, end_date)

    # select dates
    ranked_dates = date_selection.rank_dates_by_mention_count(articles, start_date, end_date, num_dates)

    # train TFIDF vectorizer on all sentences (not just the ones for this date)
    all_sentences = [sentence['text'] for article in articles for sentence in article['sentences']]
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    vectorizer.fit(all_sentences)

    for date in ranked_dates:

        if len(timeline) >= num_dates:
            break

        # select candidate sentences for date
        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer, shorten)
        if not candidate_sentences:
            continue

        # build summary for date
        summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens,
                                                   num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
        if not summary_for_date:
            continue

        timeline[date] = summary_for_date

    # sort timeline by date
    timeline = {date: timeline[date] for date in sorted(timeline.keys())}

    return timeline


def print_timeline(timeline, indent=4, start_indent=0, file=None):
    for date in sorted(timeline.keys()):
        sentences = timeline[date]
        print(' ' * start_indent + date, file=file)
        for sentence in sentences:
            print(' ' * (indent + start_indent) + sentence, file=file)
        print('', file=file)
    print('', file=file)
    print('', file=file)
    print('', file=file)


def save_timeline_to_file(timeline, filename):
    filename.parent.mkdir(parents=True, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        print_timeline(timeline, file=f)