diff --git a/dataset.py b/dataset.py
index a61d63ddbae0d662a48513e18ebea31bce8f4f0b..c80a34e4bd676ea2e8baeea8fd0dbaaf4d931a52 100644
--- a/dataset.py
+++ b/dataset.py
@@ -128,16 +128,19 @@ def get_crisis_dataset():
             date_path = topic_path / 'public' / 'content' / pub_date
             for article_filename in util.files(date_path, extension='.cont'):
                 article_file_path = date_path / article_filename
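+                # only process the single article file '2429.htm.cont'; skip all others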
+                if '2429.htm.cont' not in str(article_file_path):
+                    continue
                 print(article_file_path)
                 article = {'pub_date': pub_date, 'sentences': []}
 
                 # get sentence text
                 with util.detect_encoding_and_open(article_file_path) as f:
-                    sentences_in_article = [{
-                        'text': line.strip(),
-                        'mentioned_dates': []
-                    } for line in f.readlines()[1:]  # skip first line (headline)
-                        if line.strip()]
+                    sentences_in_article = []
+                    for line in f.readlines()[1:]:  # skip first line (headline)
+                        text = line.strip()
+                        if text:
+                            sentences_in_article.append({'text': text, 'mentioned_dates': []})
 
                 # get date mentions using HeidelTime
                 # and add them to the sentence data