Newer
Older
from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def candidate_sentences(articles, date, vectorizer):
    """Select candidate sentences for ``date`` from ``articles``.

    Two pools are gathered: the first two sentences of articles published
    on ``date`` or up to two days later, and all sentences that mention
    ``date``.  If either pool is empty the other one is returned as is.
    Otherwise a "date vector" is built from both pools, every pooled
    sentence is ranked by cosine similarity to it, and the ranking is
    truncated at its knee point.

    Args:
        articles: iterable of article dicts, passed through to
            ``sentences_published_on_date`` / ``sentences_mentioning_date``.
        date: date string, passed through to the same helpers.
        vectorizer: object whose ``transform(list_of_texts)`` returns a
            sparse matrix (a fitted TF-IDF vectorizer, per the original
            comments).

    Returns:
        A list of sentence dicts.  NOTE: the early-return branches return
        the sentence dicts produced by the helpers, whereas the ranked
        branch returns fresh dicts holding only the ``'text'`` key.
    """
    published = sentences_published_on_date(
        articles, date, tolerance_days=2, num_first_sentences=2)
    mentioning = sentences_mentioning_date(articles, date)

    # With only one kind of evidence there is nothing to rank against.
    if not published:
        return mentioning
    if not mentioning:
        return published

    # Vectorize both pools and collapse each one into a single summed vector.
    published_matrix = vectorizer.transform([s['text'] for s in published])
    mentioning_matrix = vectorizer.transform([s['text'] for s in mentioning])
    published_sum = published_matrix.sum(0).tolist()[0]
    mentioning_sum = mentioning_matrix.sum(0).tolist()[0]

    # Dividing each sum by its pool size turns it into a mean; the date
    # vector keeps only the dimensions that are active in BOTH pools.
    published_weight = 1 / len(published)
    mentioning_weight = 1 / len(mentioning)
    date_vector = [
        (published_weight * p + mentioning_weight * m) if p > 0 and m > 0 else 0
        for p, m in zip(published_sum, mentioning_sum)
    ]

    # Rank every pooled sentence by similarity to the date vector.
    pool_texts = [s['text'] for s in published + mentioning]
    pool_vectors = vectorizer.transform(pool_texts).toarray().tolist()
    similarities = cosine_similarity([date_vector], pool_vectors)[0]
    ranking = sorted(range(len(pool_texts)),
                     key=lambda idx: similarities[idx],
                     reverse=True)
    ranked_sentences = [{'text': pool_texts[idx]} for idx in ranking]
    ranked_scores = [similarities[idx] for idx in ranking]

    # Keep everything up to and including the knee of the score curve.
    knee = knee_point(ranked_scores)
    return ranked_sentences[:knee + 1]
def sentences_published_on_date(articles, date, tolerance_days, num_first_sentences):
    """Collect leading sentences of articles published on or shortly after ``date``.

    An article qualifies when its ``'pub_date'`` lies in the inclusive
    window ``[date, date + tolerance_days]``.  From each qualifying article
    only the first ``num_first_sentences`` sentences are considered.

    Implementation details are the same as Ghalandari et al.: a sentence is
    not included in the final list if it also mentions the date.

    Args:
        articles: iterable of dicts with a ``'pub_date'`` string in
            ``%Y-%m-%d`` format and a ``'sentences'`` list whose items
            carry a ``'mentioned_dates'`` collection.
        date: target date as a ``%Y-%m-%d`` string.
        tolerance_days: number of days after ``date`` that still count.
        num_first_sentences: how many leading sentences per article to scan.

    Returns:
        A list of the matching sentence objects (empty if none match).

    Raises:
        ValueError: if ``date`` or an article's ``'pub_date'`` does not
            match ``%Y-%m-%d``.
    """
    # The window depends only on the arguments, so compute it once.
    start_date = datetime.strptime(date, '%Y-%m-%d')
    end_date = start_date + timedelta(days=tolerance_days)

    sentences = []
    for article in articles:
        pub_date = datetime.strptime(article['pub_date'], '%Y-%m-%d')
        if not start_date <= pub_date <= end_date:
            continue
        sentences.extend(
            sentence
            for sentence in article['sentences'][:num_first_sentences]
            if date not in sentence['mentioned_dates']
        )
    # The caller treats the result as a list, so it must be returned.
    return sentences
def sentences_mentioning_date(articles, date):
    """Return every sentence, across all articles, that mentions ``date``.

    A sentence matches when ``date`` is contained in its
    ``'mentioned_dates'`` entry.  Article order and in-article sentence
    order are preserved.
    """
    return [
        sentence
        for article in articles
        for sentence in article['sentences']
        if date in sentence['mentioned_dates']
    ]
def knee_point(values):
    """Return the index of the knee/elbow of the curve ``values``.

    Each value is treated as the point ``(index, value)``.  The knee is the
    point with the largest perpendicular distance from the straight line
    joining the first and the last point.  Sequences with at most one
    element have no curve, so index 0 is returned for them.
    """
    count = len(values)
    if count <= 1:
        return 0

    # One row per point: column 0 is the position, column 1 the value.
    points = np.vstack((range(count), values)).T
    origin = points[0]

    # Unit vector along the chord from the first to the last point.
    chord = points[-1] - origin
    unit_chord = chord / np.sqrt(np.sum(chord ** 2))

    # Split each point's offset from the origin into the part running
    # along the chord and the remainder, which is perpendicular to it.
    offsets = points - origin
    along_chord = np.sum(offsets * unit_chord, axis=1)
    perpendicular = offsets - np.outer(along_chord, unit_chord)

    # The length of the perpendicular part is the distance to the chord;
    # the knee is the point where that distance peaks.
    distances = np.sqrt(np.sum(perpendicular ** 2, axis=1))
    return np.argmax(distances)