Commit 977dbc70 authored by vvye

Implement fetching of crisis dataset

parent aaec60f3
@@ -29,19 +29,18 @@ import heideltime_util
 import util
 
 
-def get_timeline17_dataset(path):
+def get_timeline17_dataset():
     """
     Returns the Timeline17 dataset as a dictionary.
-    If cached.pkl exists in the given path, it will be loaded from there,
+    If data/in/timeline17/timeline17.pkl exists, it will be loaded from there,
     otherwise, it will be parsed from scratch (assuming the default folder structure).
 
-    :param path: The path to Timeline17's 'Data' directory.
-    :return: A dictionary containing the preprocessed data.
+    :return: A dictionary containing the dataset.
     """
-    path = Path(path)
-    cache_filename = path / 'cached.pkl'
+    path = Path('data/in/timeline17/Data')
+    cache_filename = Path('data/in/timeline17/timeline17.pkl')
 
     if os.path.exists(cache_filename):
         return pickle.load(open(cache_filename, 'rb'))
@@ -100,5 +99,77 @@ def get_timeline17_dataset(path):
     return data
 
 
+def get_crisis_dataset():
+    """
+    Returns the crisis dataset as a dictionary.
+    If data/in/crisis/crisis.pkl exists, it will be loaded from there,
+    otherwise, it will be parsed from scratch (assuming the default folder structure).
+
+    :return: A dictionary containing the dataset.
+    """
+    path = Path('data/in/crisis')
+    cache_filename = Path('data/in/crisis/crisis.pkl')
+
+    if os.path.exists(cache_filename):
+        return pickle.load(open(cache_filename, 'rb'))
+
+    data = {}
+
+    # go through each topic directory
+    for topic_dirname in util.subdirs(path):
+        topic_path = path / topic_dirname
+        topic_name = topic_dirname
+        if topic_name not in data:
+            data[topic_name] = {'articles': [], 'gold_timelines': {}}
+
+        # parse input articles
+        for pub_date in util.subdirs(topic_path / 'public' / 'content'):
+            date_path = topic_path / 'public' / 'content' / pub_date
+            for article_filename in util.files(date_path, extension='.cont'):
+                article_file_path = date_path / article_filename
+                print(article_file_path)
+                article = {'pub_date': pub_date, 'sentences': []}
+
+                # get sentence text
+                with util.detect_encoding_and_open(article_file_path) as f:
+                    sentences_in_article = [{
+                        'text': line.strip(),
+                        'mentioned_dates': []
+                    } for line in f.readlines()[1:]  # skip first line (headline)
+                      if line.strip()]
+
+                # get date mentions using HeidelTime
+                # and add them to the sentence data
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
+                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
+                for i in range(len(sentences_in_article)):
+                    sentence = sentences_in_article[i]
+                    sentence['mentioned_dates'] = mentioned_dates_by_sentence[i]
+
+                article['sentences'] += sentences_in_article
+                data[topic_name]['articles'].append(article)
+
+        # parse gold timelines
+        for gold_timeline_filename in util.files(topic_path / 'public' / 'timelines', extension='.txt'):
+            if gold_timeline_filename.startswith('.'):
+                continue
+            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            gold_timeline_name = gold_timeline_filename.split('.')[0]
+            gold_timeline = {}
+
+            with open(gold_timeline_file_path) as f:
+                lines = [line.strip() for line in f.readlines()]
+
+            # group lines between dashed separator lines; each group is one date
+            date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
+            for date_group in date_groups:
+                date, sentences_on_date = date_group[0], date_group[1:]
+                sentences_on_date = [s.lstrip('-').strip() for s in sentences_on_date]
+                gold_timeline[date] = sentences_on_date
+
+            data[topic_name]['gold_timelines'][gold_timeline_name] = gold_timeline
+
+    pickle.dump(data, open(cache_filename, 'wb'))
+    return data
+
+
 def filter_articles_by_date(articles, start_date, end_date):
     return [a for a in articles if start_date <= a['pub_date'] <= end_date]
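For orientation, here is a rough sketch of the dictionary get_crisis_dataset returns, plus a minimal demonstration of the groupby step that splits a gold timeline file into per-date blocks. The topic name, dates, and sentence text are made up for illustration; the real topic keys are the directory names under data/in/crisis.

    # Illustrative shape of the returned data (hypothetical 'egypt' topic).
    data = {
        'egypt': {
            'articles': [
                {
                    'pub_date': '2011-01-25',
                    'sentences': [
                        {'text': 'Protests broke out in Cairo on Tuesday.',
                         'mentioned_dates': ['2011-01-25']},
                    ],
                },
            ],
            'gold_timelines': {
                'timeline1': {'2011-01-25': ['Protests erupt across Egypt.']},
            },
        },
    }

    # Articles can then be narrowed to a date window (ISO date strings
    # compare correctly as plain strings):
    january_articles = filter_articles_by_date(
        data['egypt']['articles'], '2011-01-01', '2011-01-31')

    # The gold timeline parsing relies on itertools.groupby to drop the
    # dashed separator lines and keep the blocks between them:
    import itertools
    import re

    lines = ['2011-01-25', '- Protests erupt.', '--------', '2011-02-11', '- Mubarak resigns.']
    date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
    # -> [['2011-01-25', '- Protests erupt.'], ['2011-02-11', '- Mubarak resigns.']]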
@@ -4,6 +4,8 @@ import subprocess
 import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
 
+import util
+
 heideltime_path = 'tools/heideltime'
 heideltime_jar_name = 'de.unihd.dbs.heideltime.standalone.jar'
 heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.DOTALL)
@@ -14,7 +16,7 @@ def mentioned_dates_by_sentence(filename, pub_date):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
-    with open(filename, encoding='utf-8') as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
         for line in f:
             g.write(escape(line))
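The escaping step matters because HeidelTime's output is TimeML, i.e. XML: raw &, < and > characters in the article text would otherwise corrupt the markup. A minimal illustration using the same xml.sax.saxutils helper:

    from xml.sax.saxutils import escape

    print(escape('AT&T shares fell <5% on Monday'))
    # AT&amp;T shares fell &lt;5% on Monday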
@@ -10,7 +10,7 @@ import timeline_generation
 def main(args):
     eval_results = evaluation.ResultLogger()
 
-    data = dataset.get_timeline17_dataset('data/in/timeline17/Data')
+    data = dataset.get_crisis_dataset()
 
     for topic in data.keys():
         articles = data[topic]['articles']
 import os
+import chardet
 from datetime import datetime
@@ -29,3 +30,25 @@ def rank(lst, scores):
 
 def days_between(date1, date2):
     return abs((datetime.strptime(date1, '%Y-%m-%d') - datetime.strptime(date2, '%Y-%m-%d')).days)
+
+
+def detect_encoding_and_open(filename):
+    """
+    Opens a (text) file for reading.
+    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
+    chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
+    If no encoding works, a UnicodeDecodeError is raised.
+
+    :param filename: The name of the file to be opened
+    :return: A file handle.
+    """
+    with open(filename, 'rb') as raw:
+        raw_data = raw.read()
+    detected_encoding = chardet.detect(raw_data)['encoding']  # may be None if detection fails
+    encodings = [e for e in (detected_encoding, 'utf-8', 'ansi') if e]
+    for encoding in encodings:
+        try:
+            f = open(filename, encoding=encoding)
+            f.read()   # force a full decode to surface encoding errors
+            f.seek(0)  # rewind so the caller reads from the beginning
+            return f
+        except LookupError:  # 'ansi' is only a valid codec name on Windows
+            continue
+        except UnicodeDecodeError:
+            f.close()
+    raise UnicodeDecodeError('unknown', raw_data, 0, len(raw_data), 'no suitable encoding found')
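A small usage sketch of the new helper; the file path is hypothetical, following the crisis dataset layout above. Since the function returns an ordinary file handle, it works as a context manager:

    # Hypothetical usage: open an article file whose encoding is unknown.
    with detect_encoding_and_open('data/in/crisis/egypt/public/content/2011-01-25/1.cont') as f:
        headline = f.readline().strip()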