# sentence_selection.py
from datetime import datetime, timedelta

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def candidate_sentences(articles, date, vectorizer):
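    """Select candidate sentences for a given date.

    Pools the lead sentences of articles published on (or shortly after)
    the date with sentences that explicitly mention it, ranks the pool by
    cosine similarity to a combined TF-IDF date vector, and keeps only the
    sentences above the knee point of the sorted similarity curve.
    """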
    pub_sentences = sentences_published_on_date(articles, date, tolerance_days=2, num_first_sentences=2)
    ment_sentences = sentences_mentioning_date(articles, date)

    if not pub_sentences:
        return ment_sentences
    if not ment_sentences:
        return pub_sentences

    # get TF-IDF vectors for sentences published on the date and for sentences mentioning it
    pub_vectors = vectorizer.transform([s['text'] for s in pub_sentences])
    ment_vectors = vectorizer.transform([s['text'] for s in ment_sentences])

    # average each group into a single dense vector
    pub_mean_vector = np.asarray(pub_vectors.sum(axis=0)).ravel() / len(pub_sentences)
    ment_mean_vector = np.asarray(ment_vectors.sum(axis=0)).ravel() / len(ment_sentences)

    # date vector: sum of the two group means, restricted to terms that
    # appear in both groups (all other dimensions are zeroed out)
    date_vector = np.where((pub_mean_vector > 0) & (ment_mean_vector > 0),
                           pub_mean_vector + ment_mean_vector, 0.0)

    # select candidate sentences as those most similar to the date vector
    candidate_pool = pub_sentences + ment_sentences
    sent_vectors = vectorizer.transform([s['text'] for s in candidate_pool]).toarray()
    similarities = cosine_similarity(date_vector.reshape(1, -1), sent_vectors)[0]

    sorted_indices = sorted(range(len(candidate_pool)), key=lambda i: similarities[i], reverse=True)
    # keep the full sentence dicts so the return type matches the early-return paths above
    sorted_sentences = [candidate_pool[i] for i in sorted_indices]
    sorted_scores = [similarities[i] for i in sorted_indices]

    # only consider sentences above the "knee point"
    cutoff_index = knee_point(sorted_scores)

    return sorted_sentences[:cutoff_index + 1]


def sentences_published_on_date(articles, date, tolerance_days, num_first_sentences):
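    """Return the first num_first_sentences of each article published within
    tolerance_days after date, excluding sentences that mention the date."""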
    # Following Ghalandari et al.: a sentence is excluded here if it also
    # mentions the date (those are collected by sentences_mentioning_date).
    start_date = datetime.strptime(date, '%Y-%m-%d')
    end_date = start_date + timedelta(days=tolerance_days)
    sentences = []
    for article in articles:
        pub_date = datetime.strptime(article['pub_date'], '%Y-%m-%d')
        if start_date <= pub_date <= end_date:
            for sentence in article['sentences'][:num_first_sentences]:
                if date not in sentence['mentioned_dates']:
                    sentences.append(sentence)
    return sentences


def sentences_mentioning_date(articles, date):
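    """Return every sentence across all articles whose mentioned_dates contain date."""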
    sentences = []
    for article in articles:
        sentences += [sentence for sentence in article['sentences'] if date in sentence['mentioned_dates']]
    return sentences


def knee_point(values):
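    """Return the index of the knee of a descending curve of values.

    The knee is taken to be the point with the maximum perpendicular
    distance to the straight line joining the first and last points.
    """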
    if len(values) <= 1:
        return 0

    # coordinates of every point on the curve: (index, score)
    n_points = len(values)
    all_coords = np.vstack((range(n_points), values)).T

    # unit vector along the line from the first point to the last
    first_point = all_coords[0]
    line_vec = all_coords[-1] - first_point
    line_vec_norm = line_vec / np.sqrt(np.sum(line_vec ** 2))

    # split each point's offset from the first point into components
    # parallel and perpendicular to the line
    vec_from_first = all_coords - first_point
    scalar_prod = np.sum(vec_from_first * line_vec_norm, axis=1)
    vec_from_first_parallel = np.outer(scalar_prod, line_vec_norm)
    vec_to_line = vec_from_first - vec_from_first_parallel

    # distance to the line is the norm of the perpendicular component
    dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1))

    # the knee/elbow is the point furthest from the line
    return int(np.argmax(dist_to_line))
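

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). The article format below is an assumption
# read off the functions above: each article has a 'pub_date' string and a
# list of 'sentences', each with 'text' and 'mentioned_dates'; the vectorizer
# is any fitted sklearn TfidfVectorizer. All data is made up.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.feature_extraction.text import TfidfVectorizer

    articles = [
        {
            'pub_date': '2020-01-02',
            'sentences': [
                {'text': 'Protests erupted in the capital today.',
                 'mentioned_dates': []},
                {'text': 'Officials announced a curfew on 2020-01-01.',
                 'mentioned_dates': ['2020-01-01']},
            ],
        },
        {
            'pub_date': '2020-01-05',
            'sentences': [
                {'text': 'The unrest that began on 2020-01-01 continued.',
                 'mentioned_dates': ['2020-01-01']},
            ],
        },
    ]

    # in practice the vectorizer would be fitted on the whole corpus
    vectorizer = TfidfVectorizer().fit(
        [s['text'] for a in articles for s in a['sentences']])

    for sentence in candidate_sentences(articles, '2020-01-01', vectorizer):
        print(sentence['text'])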