From e79f881e209a0978aeb9f3855b2f779ed38c24ec Mon Sep 17 00:00:00 2001
From: vvye <ekaiser.hellwege@gmail.com>
Date: Fri, 24 Sep 2021 18:53:52 +0200
Subject: [PATCH] More changes necessary for crisis dataset loading

---
 dataset.py         | 6 +++---
 heideltime_util.py | 7 +++++--
 util.py            | 4 ++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/dataset.py b/dataset.py
index 4c88173..d2e0f3c 100644
--- a/dataset.py
+++ b/dataset.py
@@ -129,7 +129,7 @@ def get_crisis_dataset():
             for article_filename in util.files(date_path, extension='.cont'):
                 article_file_path = date_path / article_filename
 
-                # nah
+                # skip this article: it breaks HeidelTime due to character encoding problems
                 if topic_name == 'egypt' and '2429.htm.cont' in str(article_file_path):
                     continue
 
@@ -145,8 +145,8 @@ def get_crisis_dataset():
 
                 # get date mentions using HeidelTime
                 # and add them to the sentence data
-                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
-                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date,
+                                                                                          skip_first_line=True)
                 assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
                 for i in range(len(sentences_in_article)):
                     sentence = sentences_in_article[i]
diff --git a/heideltime_util.py b/heideltime_util.py
index def5661..e91cd82 100644
--- a/heideltime_util.py
+++ b/heideltime_util.py
@@ -12,13 +12,16 @@ heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.D
 date_format_regex = re.compile('^\d{4}-\d{2}-\d{2}')
 
 
-def mentioned_dates_by_sentence(filename, pub_date):
+def mentioned_dates_by_sentence(filename, pub_date, skip_first_line=False):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
     with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+        first_line = True
         for line in f.readlines():
-            g.write(escape(line))
+            if not skip_first_line or not first_line:
+                g.write(escape(line))
+            first_line = False
 
     # change to heideltime directory (and keep track of the path back to the root)
     working_dir = os.getcwd()
diff --git a/util.py b/util.py
index 0c1f35c..1014987 100644
--- a/util.py
+++ b/util.py
@@ -35,14 +35,14 @@ def detect_encoding_and_open(filename):
     """
     Opens a (text) file for reading.
     Behaves the same as the builtin open, but attempts to determine the correct encoding first.
-    chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
+    'utf-8' is tried first, then the encoding detected by chardet, with 'ansi' as the last fallback.
     If no encoding works, a UnicodeDecode error is raised.
     :param filename: The name of the file to be opened
     :return: A file handle.
     """
     raw_data = open(filename, 'rb').read()
     detected_encoding = chardet.detect(raw_data)['encoding']
-    encodings = [detected_encoding, 'utf-8', 'ansi']  # these seem to work
+    encodings = ['utf-8', detected_encoding, 'ansi']  # these seem to work
     for encoding in encodings:
         f = open(filename, encoding=encoding)
         try:
-- 
GitLab