Commit 6d2894f0 authored by vvye

Change the points at which sentences are shortened

parent f7fea25f
@@ -4,12 +4,20 @@ from pathlib import Path
 import dataset
 import evaluation
 import misc
+import sentence_shortening
 import timeline_generation
 
 
 def main(args):
     eval_results = evaluation.ResultLogger()
 
+    # set up configuration
+    by_tokens = args.length_constraint == 'tokens'
+    shorten = args.shorten_sentences
+    sentence_shortening.shortening_length = args.shortening_length
+    sentence_shortening.do_resolve_unks = args.resolve_unks
+
+    # get dataset
     data = {
         'timeline17': dataset.get_timeline17_dataset,
         'crisis': dataset.get_crisis_dataset,
@@ -26,13 +34,7 @@ def main(args):
         print(f'Topic {topic}, gold timeline {gold_timeline_name}')
 
-        by_tokens = args.length_constraint == 'tokens'
-        shorten_sentences = args.shorten_sentences
-        shortening_length = args.shortening_length
-        resolve_unks = args.resolve_unks
-
-        timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens,
-                                                     shorten_sentences, shortening_length, resolve_unks)
+        timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens, shorten)
 
         if not timeline:
             print(' the generated timeline is empty - skipping evaluation of this one')
@@ -77,9 +79,9 @@ if __name__ == '__main__':
                              '(number of sentences or number of tokens)')
     parser.add_argument('--shorten_sentences',
                         type=str,
-                        choices=['never', 'before_summarization', 'after_summarization'], default='never',
-                        help='whether to apply shortening to sentences, and if so, when '
-                             '(directly before or after the summary is generated from candidate sentences)')
+                        choices=['never', 'shorten', 'shorten_and_rerank'], default='never',
+                        help='whether to apply shortening to sentences, and if so, how '
+                             '(shorten candidate sentences, or shorten them and rerank them afterwards)')
     parser.add_argument('--shortening_length',
                         type=int,
                         choices=[8, 10, 12], default=8,
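For reference, the reworked flags would be invoked roughly like this (the entry-point script name and the remaining required arguments are assumptions, since they are not visible in this diff):

python main.py --shorten_sentences shorten_and_rerank --shortening_length 10

Other options such as --length_constraint and --resolve_unks would be passed the same way.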
@@ -2,10 +2,11 @@ from datetime import datetime, timedelta
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 
+import sentence_shortening
 import util
 
 
-def candidate_sentences(articles, date, vectorizer):
+def candidate_sentences(articles, date, vectorizer, shorten):
     pub_sentences = sentences_published_on_date(articles, date, tolerance_days=2, num_first_sentences=5)
     ment_sentences = sentences_mentioning_date(articles, date)
@@ -41,6 +42,19 @@ def candidate_sentences(articles, date, vectorizer):
     cutoff_index = knee_point(sorted(similarities, reverse=True))
     candidates = sorted_sentences[:cutoff_index + 1]
 
+    # shorten candidate sentences if desired
+    if shorten != 'never':
+        candidates = [{
+            'text': sentence_shortening.shorten(sentence['text']),
+            'mentioned_dates': sentence['mentioned_dates']
+        } for sentence in candidates]
+
+    # re-rank the shortened sentences if desired
+    if shorten == 'shorten_and_rerank':
+        sent_vectors = vectorizer.transform([s['text'] for s in candidates]).toarray().tolist()
+        similarities = cosine_similarity([date_vector], sent_vectors)[0]
+        candidates = util.rank(candidates, scores=similarities)
+
     if not candidates:
         return candidate_sentence_pool
 
     return candidates
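The re-ranking step relies on util.rank, which is not shown in this commit. A minimal sketch of a compatible helper, assuming it simply returns the items ordered by descending score:

def rank(items, scores):
    # pair each item with its score and return the items sorted by score, highest first
    return [item for item, _ in sorted(zip(items, scores), key=lambda pair: pair[1], reverse=True)]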
@@ -3,20 +3,23 @@ from Bio import pairwise2
 import pickle
 import string
 
+shortening_length = 8
+do_resolve_unks = True
+
 shortened_sentences = pickle.load(open('data/in/summarized_sentences.pkl', 'rb'))
 with open('data/in/sentence_summarization_vocab.txt', encoding='utf-8') as f:
     vocab = [line.strip() for line in f.readlines() if line.strip()]
 
 
-def shorten(sentence, num_tokens, do_resolve_unks=True):
+def shorten(sentence):
     try:
-        shortened_sentence = shortened_sentences[sentence.lower()][num_tokens]
+        shortened_sentence = shortened_sentences[sentence.lower()][shortening_length]
         if do_resolve_unks:
             shortened_sentence = resolve_unks(sentence, shortened_sentence)
         return shortened_sentence
     except KeyError:
         print(sentence)
-        with open('missing-sentences_' + str(num_tokens) + '.txt', 'a', encoding='utf-8') as f:
+        with open('missing-sentences_' + str(shortening_length) + '.txt', 'a', encoding='utf-8') as f:
             f.write(sentence + '\n')
         return sentence
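With this change the shortening parameters become module-level settings rather than per-call arguments, so callers configure the module once (as main() now does) and then pass only the sentence. A minimal usage sketch:

import sentence_shortening

sentence_shortening.shortening_length = 10   # target length in tokens (8, 10 or 12 per the CLI choices)
sentence_shortening.do_resolve_unks = True   # replace UNK tokens using words from the original sentence
print(sentence_shortening.shorten('and this was on - going in egypt before january 25 .'))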
@@ -42,8 +45,7 @@ def resolve_unks(sentence, summarized_sentence):
     # find the best candidate for each UNK
     # (just take the most common item, and fall back to UNK if there are ever no candidates for some reason)
-    unk_replacements = [max(candidates, key=candidates.count) if candidates else 'UNK'
-                        for candidates in candidates_by_unk]
+    unk_replacements = [max(c, key=c.count) if c else 'UNK' for c in candidates_by_unk]
 
     # replace each UNK with its best candidate
     for i, w in enumerate(summarized_sentence):
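The rewritten one-liner keeps the original behaviour: for each UNK position it picks the most frequent candidate (max with list.count as the key) and falls back to the literal 'UNK' when the candidate list is empty. For example:

c = ['cairo', 'egypt', 'cairo']
max(c, key=c.count)  # -> 'cairo'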
@@ -54,4 +56,4 @@ def resolve_unks(sentence, summarized_sentence):
 
 if __name__ == '__main__':
-    print(shorten('and this was on - going in egypt before january 25 .', 10))
+    print(shorten('and this was on - going in egypt before january 25 .'))
@@ -6,10 +6,9 @@ import evaluation
 import sentence_selection
 import summarization
-import sentence_shortening
 import util
 
 
-def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten_sentences, shortening_length, resolve_unks):
+def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten):
     timeline = {}
     num_dates = len(gold_timeline)
@@ -35,32 +34,16 @@ def make_timeline(articles, gold_timeline, keywords, by_tokens, shorten_sentence
             break
 
         # select candidate sentences for date
-        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer)
+        candidate_sentences = sentence_selection.candidate_sentences(articles, date, vectorizer, shorten)
         if not candidate_sentences:
            continue
 
-        # shorten sentences if needed
-        if shorten_sentences == 'before_summarization':
-            new_candidate_sentences = []
-            for i in range(len(candidate_sentences)):
-                new_candidate_sentences.append({
-                    'text': sentence_shortening.shorten(candidate_sentences[i]['text'], shortening_length,
-                                                        resolve_unks),
-                    'mentioned_dates': candidate_sentences[i]['mentioned_dates']
-                })
-            candidate_sentences = new_candidate_sentences
-
         # build summary for date
         summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens,
                                                    num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
         if not summary_for_date:
             continue
 
-        # shorten sentences if needed
-        if shorten_sentences == 'after_summarization':
-            summary_for_date = [sentence_shortening.shorten(sentence, shortening_length, resolve_unks)
-                                for sentence in summary_for_date]
-
         timeline[date] = summary_for_date
 
     # sort timeline by date