diff --git a/evaluation.py b/evaluation.py
index 12e40541a6b5ebe4dfc741de99b18ac2e69b5d72..3d5fd043b0df4e31b55cc8968dcfc421f8436f34 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -34,16 +34,29 @@ def date_f1(timeline, gold_timeline):
     return (2 * precision * recall) / (precision + recall)
 
 
+def avg_num_sentences_in_timeline(timeline):
+    return util.avg([len(timeline[date]) for date in timeline])
+
+
+def avg_num_tokens_in_timeline(timeline):
+    return util.avg([sum([len(s.split()) for s in timeline[date]]) for date in timeline])
+
+
 class ResultLogger:
 
     def __init__(self):
         self.results = []
         pass
 
-    def add_result(self, topic, gold_timeline_name, ar1_f, ar2_f, date_f):
+    def add_result(self, topic, gold_timeline_name, avg_gold_sentences, avg_system_sentences, avg_gold_tokens,
+                   avg_system_tokens, ar1_f, ar2_f, date_f):
         self.results.append({
             'topic': topic,
             'gold_timeline': gold_timeline_name,
+            'avg_gold_sentences': avg_gold_sentences,
+            'avg_system_sentences': avg_system_sentences,
+            'avg_gold_tokens': avg_gold_tokens,
+            'avg_system_tokens': avg_system_tokens,
             'ar1_f': ar1_f,
             'ar2_f': ar2_f,
             'date_f1': date_f
@@ -55,12 +68,21 @@ class ResultLogger:
         avg_date_f = util.avg([row['date_f1'] for row in self.results])
         return avg_ar1_f, avg_ar2_f, avg_date_f
 
+    def average_stats(self):
+        avg_gold_sentences = util.avg([row['avg_gold_sentences'] for row in self.results])
+        avg_system_sentences = util.avg([row['avg_system_sentences'] for row in self.results])
+        avg_gold_tokens = util.avg([row['avg_gold_tokens'] for row in self.results])
+        avg_system_tokens = util.avg([row['avg_system_tokens'] for row in self.results])
+        return avg_gold_sentences, avg_system_sentences, avg_gold_tokens, avg_system_tokens
+
     def print_average_scores(self):
         avg_ar1_f, avg_ar2_f, avg_date_f1 = self.average_scores()
         print(f'average AR1-F: {avg_ar1_f}\taverage AR2-F: {avg_ar2_f}\taverage Date F: {avg_date_f1}')
 
     def save_to_file(self, filename):
         avg_ar1_f, avg_ar2_f, avg_date_f1 = self.average_scores()
+        avg_gold_sentences, avg_system_sentences, avg_gold_tokens, avg_system_tokens = self.average_stats()
         with open(filename, 'w', encoding='utf-8') as f:
             w = csv.DictWriter(f, ['topic', 'gold_timeline', 'ar1_f', 'ar2_f', 'date_f1'], delimiter=';')
             w.writeheader()
@@ -68,6 +90,10 @@
             w.writerow({
                 'topic': 'average',
                 'gold_timeline': '',
+                'avg_gold_sentences': avg_gold_sentences,
+                'avg_system_sentences': avg_system_sentences,
+                'avg_gold_tokens': avg_gold_tokens,
+                'avg_system_tokens': avg_system_tokens,
                 'ar1_f': avg_ar1_f,
                 'ar2_f': avg_ar2_f,
                 'date_f1': avg_date_f1
diff --git a/run.py b/run.py
index 6515792e38a7ff46bf7adff7b1ad20a563ce3b14..2fa4d1cce09ff57e202d0a1117e73cd043b2f443 100644
--- a/run.py
+++ b/run.py
@@ -26,7 +26,9 @@ def main(args):
 
             print(f'Topic {topic}, gold timeline {gold_timeline_name}')
 
-            timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords)
+            by_tokens = args.timeline_length == 'tokens'
+
+            timeline = timeline_generation.make_timeline(articles, gold_timeline, keywords, by_tokens)
 
             if args.print_timelines:
                 timeline_generation.print_timeline(timeline)
@@ -38,8 +40,13 @@ def main(args):
             if args.evaluate:
                 ar1_f, ar2_f = evaluation.evaluate(timeline, [gold_timeline])
                 date_f = evaluation.date_f1(timeline, gold_timeline)
+                avg_gold_sentences = evaluation.avg_num_sentences_in_timeline(gold_timeline)
+                avg_system_sentences = evaluation.avg_num_sentences_in_timeline(timeline)
+                avg_gold_tokens = evaluation.avg_num_tokens_in_timeline(gold_timeline)
+                avg_system_tokens = evaluation.avg_num_tokens_in_timeline(timeline)
                 print(f'AR1-F: {ar1_f}\tAR2-F: {ar2_f}\tDate F: {date_f}')
-                eval_results.add_result(topic, gold_timeline_name, ar1_f, ar2_f, date_f)
+                eval_results.add_result(topic, gold_timeline_name, avg_gold_sentences, avg_system_sentences,
+                                        avg_gold_tokens, avg_system_tokens, ar1_f, ar2_f, date_f)
 
     if args.evaluate:
         eval_results.print_average_scores()
@@ -51,6 +58,10 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--dataset', type=str, choices=['timeline17', 'crisis', 'entities'], help='the dataset to use',
                         required=True)
+    parser.add_argument('--timeline_length', type=str, choices=['sentences', 'tokens'],
+                        default='sentences',
+                        help='what constraint to impose on the length of the generated timeline '
+                             '(number of sentences or number of tokens)')
     parser.add_argument('--print_timelines',
                         action='store_true',
                         help='whether to print the timelines to the console after generating them')
diff --git a/timeline_generation.py b/timeline_generation.py
index 438b1144ef942d05e97e88c633acfaac52d798de..039deca61d2bd6e569f7c002723e2487c878fb09 100644
--- a/timeline_generation.py
+++ b/timeline_generation.py
@@ -2,17 +2,18 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 
 import dataset
 import date_selection
+import evaluation
 import sentence_selection
 import summarization
 import util
 
 
-def make_timeline(articles, gold_timeline, keywords):
+def make_timeline(articles, gold_timeline, keywords, by_tokens):
     timeline = {}
 
     num_dates = len(gold_timeline)
-    avg_num_sentences = round(sum([len(gold_timeline[date]) for date in gold_timeline]) / len(gold_timeline))
-    avg_num_tokens = round(util.avg([sum([len(s.split()) for s in gold_timeline[date]]) for date in gold_timeline]))
+    avg_num_sentences = round(evaluation.avg_num_sentences_in_timeline(gold_timeline))
+    avg_num_tokens = round(evaluation.avg_num_tokens_in_timeline(gold_timeline))
 
     # keep only the articles published within the gold timeline's range
     start_date = min(gold_timeline.keys())
@@ -38,7 +39,7 @@ def make_timeline(articles, gold_timeline, keywords):
             continue
 
         # build summary for date
-        summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=True,
+        summary_for_date = summarization.summarize(candidate_sentences, vectorizer, keywords, by_tokens=by_tokens,
                                                    num_sentences=avg_num_sentences, num_tokens=avg_num_tokens)
         if not summary_for_date:
             continue