From 977dbc703dd715e682209b62dc4512e3b737a35f Mon Sep 17 00:00:00 2001
From: vvye <ekaiser.hellwege@gmail.com>
Date: Thu, 23 Sep 2021 22:24:05 +0200
Subject: [PATCH] Implement fetching of crisis dataset

---
 dataset.py         | 83 ++++++++++++++++++++++++++++++++++++++++++----
 heideltime_util.py |  4 ++-
 run.py             |  2 +-
 util.py            | 23 +++++++++++++
 4 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/dataset.py b/dataset.py
index 4a7e968..6095752 100644
--- a/dataset.py
+++ b/dataset.py
@@ -29,19 +29,18 @@ import heideltime_util
 import util
 
 
-def get_timeline17_dataset(path):
+def get_timeline17_dataset():
     """
     Returns the Timeline17 dataset as a dictionary.
-    If cached.pkl exists in the given path, it will be loaded from there,
+    If data/in/timeline17/timeline17.pkl exists, it will be loaded from there,
     otherwise, it will be parsed from scratch (assuming the default folder structure).
 
-    :param path: The path to Timeline17's 'Data' directory.
-    :return: A dictionary containing the preprocessed data.
+    :return: A dictionary containing the dataset.
     """
 
-    path = Path(path)
+    path = Path('data/in/timeline17/Data')
 
-    cache_filename = path / 'cached.pkl'
+    cache_filename = Path('data/in/timeline17/timeline17.pkl')
     if os.path.exists(cache_filename):
         return pickle.load(open(cache_filename, 'rb'))
 
@@ -100,5 +99,77 @@ def get_timeline17_dataset(path):
     return data
 
+
+def get_crisis_dataset():
+    """
+    Returns the crisis dataset as a dictionary.
+    If data/in/crisis/crisis.pkl exists, it will be loaded from there,
+    otherwise, it will be parsed from scratch (assuming the default folder structure).
+
+    :return: A dictionary containing the dataset.
+    """
+
+    path = Path('data/in/crisis')
+
+    cache_filename = Path('data/in/crisis/crisis.pkl')
+    if os.path.exists(cache_filename):
+        return pickle.load(open(cache_filename, 'rb'))
+
+    data = {}
+
+    # go through each topic directory
+    for topic_dirname in util.subdirs(path):
+        topic_path = path / topic_dirname
+        topic_name = topic_dirname
+        if topic_name not in data:
+            data[topic_name] = {'articles': [], 'gold_timelines': {}}
+
+        # parse input articles
+        for pub_date in util.subdirs(topic_path / 'public' / 'content'):
+            date_path = topic_path / 'public' / 'content' / pub_date
+            for article_filename in util.files(date_path, extension='.cont'):
+                article_file_path = date_path / article_filename
+                print(article_file_path)
+                article = {'pub_date': pub_date, 'sentences': []}
+
+                # get sentence text
+                with util.detect_encoding_and_open(article_file_path) as f:
+                    sentences_in_article = [{
+                        'text': line.strip(),
+                        'mentioned_dates': []
+                    } for line in f.readlines()[1:]  # skip first line (headline)
+                      if line.strip()]
+
+                # get date mentions using HeidelTime
+                # and add them to the sentence data
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
+                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
+                for i in range(len(sentences_in_article)):
+                    sentence = sentences_in_article[i]
+                    sentence['mentioned_dates'] = mentioned_dates_by_sentence[i]
+
+                article['sentences'] += sentences_in_article
+                data[topic_name]['articles'].append(article)
+
+        # parse gold timelines
+        for gold_timeline_filename in util.files(topic_path / 'public' / 'timelines', extension='txt'):
+            if gold_timeline_filename.startswith('.'):
+                continue
+            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            gold_timeline_name = gold_timeline_filename.split('.')[0]
+            gold_timeline = {}
+            with open(gold_timeline_file_path) as f:
+                lines = [line.strip() for line in f.readlines()]
+            date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
+            for date_group in date_groups:
+                date, sentences_on_date = date_group[0], date_group[1:]
+                sentences_on_date = [s.lstrip('-').strip() for s in sentences_on_date]
+                gold_timeline[date] = sentences_on_date
+            data[topic_name]['gold_timelines'][gold_timeline_name] = gold_timeline
+
+    pickle.dump(data, open(cache_filename, 'wb'))
+    return data
+
 
 def filter_articles_by_date(articles, start_date, end_date):
     return [a for a in articles if start_date <= a['pub_date'] <= end_date]
diff --git a/heideltime_util.py b/heideltime_util.py
index f149b20..0638d41 100644
--- a/heideltime_util.py
+++ b/heideltime_util.py
@@ -4,6 +4,8 @@ import subprocess
 import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
 
+import util
+
 heideltime_path = 'tools/heideltime'
 heideltime_jar_name = 'de.unihd.dbs.heideltime.standalone.jar'
 heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.DOTALL)
@@ -14,7 +16,7 @@ def mentioned_dates_by_sentence(filename, pub_date):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
-    with open(filename, encoding='utf-8') as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
         for line in f:
             g.write(escape(line))
 
diff --git a/run.py b/run.py
index f4944db..ec758aa 100644
--- a/run.py
+++ b/run.py
@@ -10,7 +10,7 @@ import timeline_generation
 
 def main(args):
     eval_results = evaluation.ResultLogger()
-    data = dataset.get_timeline17_dataset('data/in/timeline17/Data')
+    data = dataset.get_crisis_dataset()
 
     for topic in data.keys():
         articles = data[topic]['articles']
diff --git a/util.py b/util.py
index b2af446..88daf5b 100644
--- a/util.py
+++ b/util.py
@@ -1,4 +1,5 @@
 import os
+import chardet
 from datetime import datetime
 
 
@@ -29,3 +30,25 @@ def rank(lst, scores):
 
 def days_between(date1, date2):
     return abs((datetime.strptime(date1, '%Y-%m-%d') - datetime.strptime(date2, '%Y-%m-%d')).days)
+
+def detect_encoding_and_open(filename):
+    """
+    Opens a (text) file for reading.
+    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
+    chardet is used to detect the encoding, with 'utf-8' and 'cp1252' as fallbacks in case the detection fails.
+    If no encoding works, a ValueError is raised.
+    :param filename: The name of the file to be opened.
+    :return: A file handle opened with a working encoding.
+    """
+    raw_data = open(filename, 'rb').read()
+    detected_encoding = chardet.detect(raw_data)['encoding']  # may be None if detection fails
+    encodings = [e for e in (detected_encoding, 'utf-8', 'cp1252') if e]
+    for encoding in encodings:
+        try:
+            # check that the whole file can be decoded with this encoding
+            with open(filename, encoding=encoding) as f:
+                f.read()
+        except UnicodeDecodeError:
+            continue
+        return open(filename, encoding=encoding)  # reopen so the caller reads from the start
+    raise ValueError(f'no suitable encoding found for {filename}')
--
GitLab
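
For reference, a rough sketch of how the dictionary returned by get_crisis_dataset() is laid out, based on the code in dataset.py and on how run.py consumes it. The loop is illustrative only; the actual topic names, dates, and sentence contents depend on the files under data/in/crisis.

    import dataset

    data = dataset.get_crisis_dataset()
    for topic, topic_data in data.items():
        # one entry per parsed article file
        for article in topic_data['articles']:
            pub_date = article['pub_date']                # taken from the directory name
            for sentence in article['sentences']:
                text = sentence['text']                   # sentence text, headline skipped
                mentioned = sentence['mentioned_dates']   # dates found by HeidelTime for this sentence
        # one gold timeline per file in <topic>/public/timelines
        for timeline_name, timeline in topic_data['gold_timelines'].items():
            for date, sentences_on_date in timeline.items():
                pass                                      # gold sentences grouped by date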
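
The date_groups line in the gold-timeline parsing relies on itertools.groupby to split the timeline file on separator lines made up of dashes (implied by the '^-+$' regex and the lstrip('-') call). A minimal, self-contained illustration; the two entries below are invented and only assume that format:

    import itertools
    import re

    # invented example: a date line, sentence lines, then a dash separator
    lines = [
        '2011-01-25',
        'Protests break out across the country.',
        '--------------------',
        '2011-02-11',
        'The president steps down.',
        '--------------------',
    ]

    # separator lines match '^-+$' and are dropped ('if not x');
    # consecutive non-separator lines end up in one group per date
    date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]

    assert date_groups == [
        ['2011-01-25', 'Protests break out across the country.'],
        ['2011-02-11', 'The president steps down.'],
    ]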