Commit 6d2894f0 authored by vvye

Change the points at which sentences are shortened

parent f7fea25f
@@ -4,12 +4,20 @@ from pathlib import Path
 import dataset
 import evaluation
 import misc
+import sentence_shortening
 import timeline_generation
 
 
 def main(args):
     eval_results = evaluation.ResultLogger()
 
+    # set up configuration
+    by_tokens = args.length_constraint == 'tokens'
+    shorten = args.shorten_sentences
+    sentence_shortening.shortening_length = args.shortening_length
+    sentence_shortening.do_resolve_unks = args.resolve_unks
+
+    # get dataset
     data = {
         'timeline17': dataset.get_timeline17_dataset,
         'crisis': dataset.get_crisis_dataset,
@@ -26,13 +34,7 @@ def main(args):
         print(f'Topic {topic}, gold timeline {gold_timeline_name}')
 
-        by_tokens = args.length_constraint == 'tokens'
-        shorten_sentences = args.shorten_sentences
-        shortening_length = args.shortening_length
-        resolve_unks = args.resolve_unks
-
-        timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens,
-                                                     shorten_sentences, shortening_length, resolve_unks)
+        timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens, shorten)
 
         if not timeline:
             print(' the generated timeline is empty - skipping evaluation of this one')
@@ -77,9 +79,9 @@ if __name__ == '__main__':
                              '(number of sentences or number of tokens)')
     parser.add_argument('--shorten_sentences',
                         type=str,
-                        choices=['never', 'before_summarization', 'after_summarization'], default='never',
-                        help='whether to apply shortening to sentences, and if so, when '
-                             '(directly before or after the summary is generated from candidate sentences)')
+                        choices=['never', 'shorten', 'shorten_and_rerank'], default='never',
+                        help='whether to apply shortening to sentences, and if so, how '
+                             '(shorten candidate sentences, or shorten them and rerank them afterwards)')
     parser.add_argument('--shortening_length',
                         type=int,
                         choices=[8, 10, 12], default=8,
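For reference, the reworked flags would be invoked roughly like this (the entry-point script name and the remaining required arguments are assumptions, since they are not visible in this diff):

python main.py --shorten_sentences shorten_and_rerank --shortening_length 10

Other options such as --length_constraint and --resolve_unks would be passed the same way.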
@@ -2,10 +2,11 @@ from datetime import datetime, timedelta
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 
+import sentence_shortening
 import util
 
 
-def candidate_sentences(articles, date, vectorizer):
+def candidate_sentences(articles, date, vectorizer, shorten):
     pub_sentences = sentences_published_on_date(articles, date, tolerance_days=2, num_first_sentences=5)
     ment_sentences = sentences_mentioning_date(articles, date)
@@ -41,6 +42,19 @@ def candidate_sentences(articles, date, vectorizer):
     cutoff_index = knee_point(sorted(similarities, reverse=True))
     candidates = sorted_sentences[:cutoff_index + 1]
 
+    # shorten candidate sentences if desired
+    if shorten != 'never':
+        candidates = [{
+            'text': sentence_shortening.shorten(sentence['text']),
+            'mentioned_dates': sentence['mentioned_dates']
+        } for sentence in candidates]
+
+    # re-rank the shortened sentences if desired
+    if shorten == 'shorten_and_rerank':
+        sent_vectors = vectorizer.transform([s['text'] for s in candidates]).toarray().tolist()
+        similarities = cosine_similarity([date_vector], sent_vectors)[0]
+        candidates = util.rank(candidates, scores=similarities)
+
     if not candidates:
         return candidate_sentence_pool
 
     return candidates
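The re-ranking step relies on util.rank, which is not shown in this commit. A minimal sketch of a compatible helper, assuming it simply returns the items ordered by descending score:

def rank(items, scores):
    # pair each item with its score and return the items sorted by score, highest first
    return [item for item, _ in sorted(zip(items, scores), key=lambda pair: pair[1], reverse=True)]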
@@ -3,20 +3,23 @@ from Bio import pairwise2
 import pickle
 import string
 
+shortening_length = 8
+do_resolve_unks = True
+
 shortened_sentences = pickle.load(open('data/in/summarized_sentences.pkl', 'rb'))
 with open('data/in/sentence_summarization_vocab.txt', encoding='utf-8') as f:
     vocab = [line.strip() for line in f.readlines() if line.strip()]
 
 
-def shorten(sentence, num_tokens, do_resolve_unks=True):
+def shorten(sentence):
     try:
-        shortened_sentence = shortened_sentences[sentence.lower()][num_tokens]
+        shortened_sentence = shortened_sentences[sentence.lower()][shortening_length]
         if do_resolve_unks:
             shortened_sentence = resolve_unks(sentence, shortened_sentence)
         return shortened_sentence
     except KeyError:
         print(sentence)
-        with open('missing-sentences_' + str(num_tokens) + '.txt', 'a', encoding='utf-8') as f:
+        with open('missing-sentences_' + str(shortening_length) + '.txt', 'a', encoding='utf-8') as f:
             f.write(sentence + '\n')
         return sentence
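With this change the shortening parameters become module-level settings rather than per-call arguments, so callers configure the module once (as main() now does) and then pass only the sentence. A minimal usage sketch:

import sentence_shortening

sentence_shortening.shortening_length = 10   # target length in tokens (8, 10 or 12 per the CLI choices)
sentence_shortening.do_resolve_unks = True   # replace UNK tokens using words from the original sentence
print(sentence_shortening.shorten('and this was on - going in egypt before january 25 .'))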
@@ -42,8 +45,7 @@ def resolve_unks(sentence, summarized_sentence):
     # find the best candidate for each UNK
     # (just take the most common item, and fall back to UNK if there are ever no candidates for some reason)
-    unk_replacements = [max(candidates, key=candidates.count) if candidates else 'UNK'
-                        for candidates in candidates_by_unk]
+    unk_replacements = [max(c, key=c.count) if c else 'UNK' for c in candidates_by_unk]
 
     # replace each UNK with its best candidate
     for i, w in enumerate(summarized_sentence):
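The rewritten one-liner keeps the original behaviour: for each UNK position it picks the most frequent candidate (max with list.count as the key) and falls back to the literal 'UNK' when the candidate list is empty. For example:

c = ['cairo', 'egypt', 'cairo']
max(c, key=c.count)  # -> 'cairo'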
@@ -54,4 +56,4 @@ def resolve_unks(sentence, summarized_sentence):
 
 if __name__ == '__main__':
-    print(shorten('and this was on - going in egypt before january 25 .', 10))
+    print(shorten('and this was on - going in egypt before january 25 .'))
@@ -6,10 +6,9 @@ import evaluation
 import sentence_selection
 import summarization
-import sentence_shortening
 import util
 
 
-def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten_sentences, shortening_length, resolve_unks):
+def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten):
     timeline = {}
     num_dates = len(gold_timeline)
@@ -35,32 +34,16 @@ def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten_sentence
             break
 
         # select candidate sentences for date
-        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer)
+        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer, shorten)
         if not candidate_sentences:
            continue
 
-        # shorten sentences if needed
-        if shorten_sentences == 'before_summarization':
-            new_candidate_sentences = []
-            for i in range(len(candidate_sentences)):
-                new_candidate_sentences.append({
-                    'text': sentence_shortening.shorten(candidate_sentences[i]['text'], shortening_length,
-                                                        resolve_unks),
-                    'mentioned_dates': candidate_sentences[i]['mentioned_dates']
-                })
-            candidate_sentences = new_candidate_sentences
-
         # build summary for date
         summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens,
                                                    num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
         if not summary_for_date:
             continue
 
-        # shorten sentences if needed
-        if shorten_sentences == 'after_summarization':
-            summary_for_date = [sentence_shortening.shorten(sentence, shortening_length, resolve_unks)
-                                for sentence in summary_for_date]
-
         timeline[date] = summary_for_date
 
     # sort timeline by date