diff --git a/dataset.py b/dataset.py index c80a34e4bd676ea2e8baeea8fd0dbaaf4d931a52..d1ee98beb1ac93791175b8e088be87aa82213b03 100644 --- a/dataset.py +++ b/dataset.py @@ -128,8 +128,11 @@ def get_crisis_dataset(): date_path = topic_path / 'public' / 'content' / pub_date for article_filename in util.files(date_path, extension='.cont'): article_file_path = date_path / article_filename - if '2429.htm.cont' not in str(article_file_path): + + # nah + if '2429.htm.cont' in str(article_file_path): continue + print(article_file_path) article = {'pub_date': pub_date, 'sentences': []} diff --git a/heideltime_util.py b/heideltime_util.py index 339ceb44c5c1cf324d28111754b7d14a1c314b00..def5661720512cd1e791ab75a94b0951491e7567 100644 --- a/heideltime_util.py +++ b/heideltime_util.py @@ -17,7 +17,7 @@ def mentioned_dates_by_sentence(filename, pub_date): # create a temporary copy of the file with interfering characters escaped escaped_filename = str(filename) + '.escaped' with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g: - for line in f: + for line in f.readlines(): g.write(escape(line)) # change to heideltime directory (and keep track of the path back to the root)