from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def candidate_sentences(articles, date, vectorizer):
    pub_sentences = sentences_published_on_date(articles, date, tolerance_days=2, num_first_sentences=5)
    ment_sentences = sentences_mentioning_date(articles, date)
    if not pub_sentences:
        return ment_sentences
    if not ment_sentences:
        return pub_sentences
    # get TF-IDF vectors for the sentences published on the date and the sentences mentioning the date
    pub_vectors = vectorizer.transform([s['text'] for s in pub_sentences])
    ment_vectors = vectorizer.transform([s['text'] for s in ment_sentences])
    # sum each group to a single vector (the 1/len weights below turn these sums into means)
    pub_mean_vector = pub_vectors.sum(0).tolist()[0]
    ment_mean_vector = ment_vectors.sum(0).tolist()[0]
    # date vector: weighted average of the publication vector and the mention vector,
    # keeping only terms that occur in *both* groups
    pub_weight = 1 / len(pub_sentences)
    ment_weight = 1 / len(ment_sentences)
    date_vector = [(pub_weight * pub_mean_vector[i] + ment_weight * ment_mean_vector[i]
                    if pub_mean_vector[i] > 0 and ment_mean_vector[i] > 0
                    else 0)
                   for i in range(len(pub_mean_vector))]
    # select candidate sentences as those most similar to the date vector
    candidate_sentence_pool = ment_sentences + pub_sentences
    sent_vectors = vectorizer.transform([s['text'] for s in candidate_sentence_pool]).toarray().tolist()
    similarities = cosine_similarity([date_vector], sent_vectors)[0]
    sorted_indices = sorted(range(len(candidate_sentence_pool)), key=lambda i: similarities[i], reverse=True)
    sorted_sentences = [candidate_sentence_pool[i] for i in sorted_indices]
    sorted_scores = [similarities[i] for i in sorted_indices]
    # only consider sentences above the "knee point" of the sorted score curve
    cutoff_index = knee_point(sorted_scores)
    # return sorted_sentences[:cutoff_index + 1]
    threshold = similarities[cutoff_index]  # sic! indexes the *unsorted* similarities; sorted_scores[cutoff_index] would be the knee score
    return_value = []
    for i in range(len(candidate_sentence_pool)):
        if similarities[i] > threshold:
            return_value.append(candidate_sentence_pool[i])
    return return_value
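
# A minimal sketch of the date-vector gate used above, with a hypothetical
# three-term vocabulary and made-up weights: a term survives only if it
# carries weight in both the publication group and the mention group.
def _date_vector_demo():
    pub_mean = [0.6, 0.0, 0.2]    # summed TF-IDF weights of the publication sentences
    ment_mean = [0.3, 0.5, 0.4]   # summed TF-IDF weights of the mention sentences
    pub_w, ment_w = 1 / 2, 1 / 4  # assuming 2 publication and 4 mention sentences
    date_vec = [(pub_w * p + ment_w * m) if p > 0 and m > 0 else 0
                for p, m in zip(pub_mean, ment_mean)]
    print(date_vec)  # [0.375, 0, 0.2] -- the middle term is zeroed: only one group uses it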
def sentences_published_on_date(articles, date, tolerance_days, num_first_sentences):
    # implementation details are the same as in Ghalandari et al.:
    # a sentence is excluded from the final list if it mentions any date at all
    sentences = []
    for article in articles:
        pub_date = datetime.strptime(article['pub_date'], '%Y-%m-%d')
        start_date = datetime.strptime(date, '%Y-%m-%d')
        end_date = start_date + timedelta(days=tolerance_days)
        if start_date <= pub_date < end_date:
            for sentence in article['sentences'][:num_first_sentences]:
                if not sentence['mentioned_dates']:
                    sentences.append(sentence)
    return sentences
def sentences_mentioning_date(articles, date):
    sentences = []
    for article in articles:
        for sentence in article['sentences']:
            if date in sentence['mentioned_dates']:
                sentences.append(sentence)
    return sentences
def knee_point(values):
    if len(values) <= 1:
        return 0
    # get coordinates of all the points
    n_points = len(values)
    all_coords = np.vstack((range(n_points), values)).T
    # get the first point
    first_point = all_coords[0]
    # get vector between first and last point - this is the line
    line_vec = all_coords[-1] - all_coords[0]
    line_vec_norm = line_vec / np.sqrt(np.sum(line_vec ** 2))
    vec_from_first = all_coords - first_point
    scalar_prod = np.sum(vec_from_first * np.tile(line_vec_norm, (n_points, 1)), axis=1)
    vec_from_first_parallel = np.outer(scalar_prod, line_vec_norm)
    vec_to_line = vec_from_first - vec_from_first_parallel
    # distance to line is the norm of vec_to_line
    dist_to_line = np.sqrt(np.sum(vec_to_line ** 2, axis=1))
    # knee/elbow is the point with max distance value
    best_idx = np.argmax(dist_to_line)
    return best_idx
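
# A hedged end-to-end sketch of how the functions fit together. The article and
# sentence dict layout, the example texts, and the fitted TfidfVectorizer are
# assumptions inferred from the code above, not part of the original listing.
if __name__ == '__main__':
    from sklearn.feature_extraction.text import TfidfVectorizer

    articles = [
        {'pub_date': '2011-02-11',
         'sentences': [
             {'text': 'Hosni Mubarak announced his resignation as president of Egypt.',
              'mentioned_dates': []},
             {'text': 'Protests had filled Tahrir Square since 2011-01-25.',
              'mentioned_dates': ['2011-01-25']},
         ]},
        {'pub_date': '2011-02-12',
         'sentences': [
             {'text': 'Crowds celebrated the resignation announced on 2011-02-11.',
              'mentioned_dates': ['2011-02-11']},
         ]},
    ]
    all_texts = [s['text'] for a in articles for s in a['sentences']]
    vectorizer = TfidfVectorizer().fit(all_texts)
    # knee_point picks the index where the sorted scores level off after the drop
    print(knee_point([0.9, 0.85, 0.5, 0.1, 0.08, 0.05]))  # 3
    for s in candidate_sentences(articles, '2011-02-11', vectorizer):
        print(s['text'])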