diff --git a/dataset.py b/dataset.py
index a61d63ddbae0d662a48513e18ebea31bce8f4f0b..c80a34e4bd676ea2e8baeea8fd0dbaaf4d931a52 100644
--- a/dataset.py
+++ b/dataset.py
@@ -128,16 +128,17 @@ def get_crisis_dataset():
         date_path = topic_path / 'public' / 'content' / pub_date
         for article_filename in util.files(date_path, extension='.cont'):
             article_file_path = date_path / article_filename
+            if '2429.htm.cont' not in str(article_file_path):
+                continue
             print(article_file_path)
             article = {'pub_date': pub_date, 'sentences': []}
 
             # get sentence text
             with util.detect_encoding_and_open(article_file_path) as f:
-                sentences_in_article = [{
-                    'text': line.strip(),
-                    'mentioned_dates': []
-                } for line in f.readlines()[1:]  # skip first line (headline)
-                  if line.strip()]
+                sentences_in_article = []
+                for line in f.readlines()[1:]:  # skip first line (headline)
+                    if line.strip():
+                        sentences_in_article.append({'text': line.strip(), 'mentioned_dates': []})
 
             # get date mentions using HeidelTime
             # and add them to the sentence data
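
For reference, `util.detect_encoding_and_open` is defined elsewhere in the repo and is not part of this diff. Below is a minimal sketch of what such a helper might look like, assuming a byte-level detector such as the chardet package; the function name is taken from the diff, but the body is an assumption, not the project's actual implementation.

# Hypothetical sketch of util.detect_encoding_and_open; the real helper
# is not shown in this diff. Assumes the chardet package is installed.
from pathlib import Path

import chardet

def detect_encoding_and_open(path):
    """Guess a file's encoding from its raw bytes, then reopen it as text."""
    raw = Path(path).read_bytes()
    guess = chardet.detect(raw)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
    # Fall back to UTF-8 when detection fails; open() returns a text-mode
    # file object, so this works in a `with` statement like the one above.
    return open(path, encoding=guess['encoding'] or 'utf-8', errors='replace')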