diff --git a/evaluation.py b/evaluation.py index 12e40541a6b5ebe4dfc741de99b18ac2e69b5d72..3d5fd043b0df4e31b55cc8968dcfc421f8436f34 100644 --- a/evaluation.py +++ b/evaluation.py @@ -34,16 +34,29 @@ def date_f1(timeline, gold_timeline): return (2 * precision * recall) / (precision + recall) +def avg_num_sentences_in_timeline(timeline): + return util.avg([len(timeline[date]) for date in timeline]) + + +def avg_num_tokens_in_timeline(timeline): + return util.avg([sum([len(s.split()) for s in timeline[date]]) for date in timeline]) + + class ResultLogger: def __init__(self): self.results = [] pass - def add_result(self, topic, gold_timeline_name, ar1_f, ar2_f, date_f): + def add_result(self, topic, gold_timeline_name, avg_gold_sentences, avg_system_sentences, avg_gold_tokens, + avg_system_tokens, ar1_f, ar2_f, date_f): self.results.append({ 'topic': topic, 'gold_timeline': gold_timeline_name, + 'avg_gold_sentences': avg_gold_sentences, + 'avg_system_sentences': avg_system_sentences, + 'avg_gold_tokens': avg_gold_tokens, + 'avg_system_tokens': avg_system_tokens, 'ar1_f': ar1_f, 'ar2_f': ar2_f, 'date_f1': date_f @@ -55,12 +68,20 @@ class ResultLogger: avg_date_f = util.avg([row['date_f1'] for row in self.results]) return avg_ar1_f, avg_ar2_f, avg_date_f + def average_stats(self): + avg_gold_sentences = util.avg([row['avg_gold_sentences'] for row in self.results]) + avg_system_sentences = util.avg([row['avg_system_sentences'] for row in self.results]) + avg_gold_tokens = util.avg([row['avg_gold_tokens'] for row in self.results]) + avg_system_tokens = util.avg([row['avg_system_tokens'] for row in self.results]) + return avg_gold_sentences, avg_system_sentences, avg_gold_tokens, avg_system_tokens + def print_average_scores(self): avg_ar1_f, avg_ar2_f, avg_date_f1 = self.average_scores() print(f'average AR1-F: {avg_ar1_f}\taverage AR2-F: {avg_ar2_f}\taverage Date F: {avg_date_f1}') def save_to_file(self, filename): avg_ar1_f, avg_ar2_f, avg_date_f1 = self.average_scores() + 
avg_gold_sentences, avg_system_sentences, avg_gold_tokens, avg_system_tokens = self.average_stats() with open(filename, 'w', encoding='utf-8') as f: - w = csv.DictWriter(f, ['topic', 'gold_timeline', 'ar1_f', 'ar2_f', 'date_f1'], delimiter=';') + w = csv.DictWriter(f, ['topic', 'gold_timeline', 'avg_gold_sentences', 'avg_system_sentences', 'avg_gold_tokens', 'avg_system_tokens', 'ar1_f', 'ar2_f', 'date_f1'], delimiter=';') w.writeheader() @@ -68,6 +89,10 @@ class ResultLogger: w.writerow({ 'topic': 'average', 'gold_timeline': '', + 'avg_gold_sentences': avg_gold_sentences, + 'avg_system_sentences': avg_system_sentences, + 'avg_gold_tokens': avg_gold_tokens, + 'avg_system_tokens': avg_system_tokens, 'ar1_f': avg_ar1_f, 'ar2_f': avg_ar2_f, 'date_f1': avg_date_f1 diff --git a/run.py b/run.py index 6515792e38a7ff46bf7adff7b1ad20a563ce3b14..2fa4d1cce09ff57e202d0a1117e73cd043b2f443 100644 --- a/run.py +++ b/run.py @@ -26,7 +26,9 @@ def main(args): print(f'Topic {topic}, gold timeline {gold_timeline_name}') - timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords) + by_tokens = args.timeline_length == 'tokens' + + timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens) if args.print_timelines: timeline_generation.print_timeline(timeline) @@ -38,8 +40,13 @@ def main(args): if args.evaluate: ar1_f, ar2_f = evaluation.evaluate(timeline, [gold_timeline]) date_f = evaluation.date_f1(timeline, gold_timeline) + avg_gold_sentences = evaluation.avg_num_sentences_in_timeline(gold_timeline) + avg_system_sentences = evaluation.avg_num_sentences_in_timeline(timeline) + avg_gold_tokens = evaluation.avg_num_tokens_in_timeline(gold_timeline) + avg_system_tokens = evaluation.avg_num_tokens_in_timeline(timeline) print(f'AR1-F: {ar1_f}\tAR2-F: {ar2_f}\tDate F: {date_f}') - eval_results.add_result(topic, gold_timeline_name, ar1_f, ar2_f, date_f) + eval_results.add_result(topic, gold_timeline_name, avg_gold_sentences, avg_system_sentences, + avg_gold_tokens, avg_system_tokens, ar1_f, ar2_f, date_f) if args.evaluate: eval_results.print_average_scores() @@ -51,6 +58,10 @@ if __name__ == '__main__': parser = 
argparse.ArgumentParser() parser.add_argument('--dataset', type=str, choices=['timeline17', 'crisis', 'entities'], help='the dataset to use', required=True) + parser.add_argument('--timeline_length', type=str, choices=['sentences', 'tokens'], default='sentences', + help='what constraint to impose on the length of the generated timeline ' + '(number of sentences or number of tokens)', + required=False) parser.add_argument('--print_timelines', action='store_true', help='whether to print the timelines to the console after generating them') diff --git a/timeline_generation.py b/timeline_generation.py index 438b1144ef942d05e97e88c633acfaac52d798de..039deca61d2bd6e569f7c002723e2487c878fb09 100644 --- a/timeline_generation.py +++ b/timeline_generation.py @@ -2,17 +2,18 @@ from sklearn.feature_extraction.text import TfidfVectorizer import dataset import date_selection +import evaluation import sentence_selection import summarization import util -def make_timeline(articles, gold_timeline, keywords): +def make_timeline(articles, gold_timeline, keywords, by_tokens): timeline = {} num_dates = len(gold_timeline) - avg_num_sentences = round(sum([len(gold_timeline[date]) for date in gold_timeline]) / len(gold_timeline)) - avg_num_tokens = round(util.avg([sum([len(s.split()) for s in gold_timeline[date]]) for date in gold_timeline])) + avg_num_sentences = round(evaluation.avg_num_sentences_in_timeline(gold_timeline)) + avg_num_tokens = round(evaluation.avg_num_tokens_in_timeline(gold_timeline)) # keep only the articles published within the gold timeline's range start_date = min(gold_timeline.keys()) @@ -38,7 +39,7 @@ def make_timeline(articles, gold_timeline, keywords): continue # build summary for date - summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=True, + summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens, num_sentences=avg_num_sentences, num_tokens=avg_num_tokens) if not summary_for_date: 
continue