From e79f881e209a0978aeb9f3855b2f779ed38c24ec Mon Sep 17 00:00:00 2001 From: vvye <ekaiser.hellwege@gmail.com> Date: Fri, 24 Sep 2021 18:53:52 +0200 Subject: [PATCH] More changes necessary for crisis dataset loading --- dataset.py | 6 +++--- heideltime_util.py | 7 +++++-- util.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dataset.py b/dataset.py index 4c88173..d2e0f3c 100644 --- a/dataset.py +++ b/dataset.py @@ -129,7 +129,7 @@ def get_crisis_dataset(): for article_filename in util.files(date_path, extension='.cont'): article_file_path = date_path / article_filename - # nah + # this one breaks heideltime due to character encoding shenanigans if topic_name == 'egypt' and '2429.htm.cont' in str(article_file_path): continue @@ -145,8 +145,8 @@ def get_crisis_dataset(): # get date mentions using HeidelTime # and add them to the sentence data - mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date) - mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:] # skip first line (headline) + mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date, + skip_first_line=True) assert len(mentioned_dates_by_sentence) == len(sentences_in_article) for i in range(len(sentences_in_article)): sentence = sentences_in_article[i] diff --git a/heideltime_util.py b/heideltime_util.py index def5661..e91cd82 100644 --- a/heideltime_util.py +++ b/heideltime_util.py @@ -12,13 +12,16 @@ heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.D date_format_regex = re.compile('^\d{4}-\d{2}-\d{2}') -def mentioned_dates_by_sentence(filename, pub_date): +def mentioned_dates_by_sentence(filename, pub_date, skip_first_line=False): # create a temporary copy of the file with interfering characters escaped escaped_filename = str(filename) + '.escaped' with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g: + first_line = True for line in f.readlines(): - g.write(escape(line)) + if not skip_first_line or not first_line: + g.write(escape(line)) + first_line = False # change to heideltime directory (and keep track of the path back to the root) working_dir = os.getcwd() diff --git a/util.py b/util.py index 0c1f35c..1014987 100644 --- a/util.py +++ b/util.py @@ -35,14 +35,14 @@ def detect_encoding_and_open(filename): """ Opens a (text) file for reading. Behaves the same as the builtin open, but attempts to determine the correct encoding first. - chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails. + 'utf-8' is attempted first, followed by detecting the encoding with chardet, with 'ansi' as a fallback. If no encoding works, a UnicodeDecode error is raised. :param filename: The name of the file to be opened :return: A file handle. """ raw_data = open(filename, 'rb').read() detected_encoding = chardet.detect(raw_data)['encoding'] - encodings = [detected_encoding, 'utf-8', 'ansi'] # these seem to work + encodings = ['utf-8', detected_encoding, 'ansi'] # these seem to work for encoding in encodings: f = open(filename, encoding=encoding) try: -- GitLab