More changes necessary for crisis dataset loading

e79f881e · vvye · 27f37dba · e79f881e · e79f881e · e79f881e
Commit e79f881e authored 3 years ago by vvye
--- a/dataset.py
+++ b/dataset.py
@@ -129,7 +129,7 @@ def get_crisis_dataset():
            for article_filename in util.files(date_path, extension='.cont'):
                article_file_path = date_path / article_filename

-                # nah
+                # skip this article: it breaks HeidelTime because of a character-encoding problem
                if topic_name == 'egypt' and '2429.htm.cont' in str(article_file_path):
                    continue

@@ -145,8 +145,8 @@ def get_crisis_dataset():

                # get date mentions using HeidelTime
                # and add them to the sentence data
-                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
-                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date,
+                                                                                          skip_first_line=True)
                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
                for i in range(len(sentences_in_article)):
                    sentence = sentences_in_article[i]

--- a/heideltime_util.py
+++ b/heideltime_util.py
@@ -12,13 +12,16 @@ heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.D
 date_format_regex = re.compile('^\d{4}-\d{2}-\d{2}')


-def mentioned_dates_by_sentence(filename, pub_date):
+def mentioned_dates_by_sentence(filename, pub_date, skip_first_line=False):

    # create a temporary copy of the file with interfering characters escaped
    escaped_filename = str(filename) + '.escaped'
    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+        first_line = True
        for line in f.readlines():
-            g.write(escape(line))
+            if not skip_first_line or not first_line:
+                g.write(escape(line))
+            first_line = False

    # change to heideltime directory (and keep track of the path back to the root)
    working_dir = os.getcwd()

--- a/util.py
+++ b/util.py
@@ -35,14 +35,14 @@ def detect_encoding_and_open(filename):
    """
    Opens a (text) file for reading.
    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
-    chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
+    'utf-8' is tried first, then the encoding detected by chardet, and finally 'ansi' as a last fallback.
    If none of these encodings works, a UnicodeDecodeError is raised.
    :param filename: The name of the file to be opened
    :return: A file handle.
    """
    raw_data = open(filename, 'rb').read()
    detected_encoding = chardet.detect(raw_data)['encoding']
-    encodings = [detected_encoding, 'utf-8', 'ansi']  # these seem to work
+    encodings = ['utf-8', detected_encoding, 'ansi']  # these seem to work
    for encoding in encodings:
        f = open(filename, encoding=encoding)
        try: