From 977dbc703dd715e682209b62dc4512e3b737a35f Mon Sep 17 00:00:00 2001
From: vvye <ekaiser.hellwege@gmail.com>
Date: Thu, 23 Sep 2021 22:24:05 +0200
Subject: [PATCH] Implement fetching of crisis dataset

---
 dataset.py         | 83 ++++++++++++++++++++++++++++++++++++++++++----
 heideltime_util.py |  4 ++-
 run.py             |  2 +-
 util.py            | 23 +++++++++++++
 4 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/dataset.py b/dataset.py
index 4a7e968..6095752 100644
--- a/dataset.py
+++ b/dataset.py
@@ -29,19 +29,18 @@ import heideltime_util
 import util
 
 
-def get_timeline17_dataset(path):
+def get_timeline17_dataset():
     """
     Returns the Timeline17 dataset as a dictionary.
-    If cached.pkl exists in the given path, it will be loaded from there,
+    If data/in/timeline17/timeline17.pkl exists, it will be loaded from there,
     otherwise, it will be parsed from scratch (assuming the default folder structure).
 
-    :param path: The path to Timeline17's 'Data' directory.
-    :return: A dictionary containing the preprocessed data.
+    :return: A dictionary containing the dataset.
     """
 
-    path = Path(path)
+    path = Path('data/in/timeline17/Data')
 
-    cache_filename = path / 'cached.pkl'
+    cache_filename = Path('data/in/timeline17/timeline17.pkl')
     if os.path.exists(cache_filename):
         return pickle.load(open(cache_filename, 'rb'))
 
@@ -100,5 +99,77 @@ def get_timeline17_dataset(path):
     return data
 
+
+def get_crisis_dataset():
+    """
+    Returns the crisis dataset as a dictionary.
+    If data/in/crisis/crisis.pkl exists, it will be loaded from there,
+    otherwise, it will be parsed from scratch (assuming the default folder structure).
+
+    :return: A dictionary containing the dataset.
+    """
+
+    path = Path('data/in/crisis')
+
+    cache_filename = Path('data/in/crisis/crisis.pkl')
+    if os.path.exists(cache_filename):
+        return pickle.load(open(cache_filename, 'rb'))
+
+    data = {}
+
+    # go through each topic directory
+    for topic_dirname in util.subdirs(path):
+        topic_path = path / topic_dirname
+        topic_name = topic_dirname
+        if topic_name not in data:
+            data[topic_name] = {'articles': [], 'gold_timelines': {}}
+
+        # parse input articles
+        for pub_date in util.subdirs(topic_path / 'public' / 'content'):
+            date_path = topic_path / 'public' / 'content' / pub_date
+            for article_filename in util.files(date_path, extension='.cont'):
+                article_file_path = date_path / article_filename
+                print(article_file_path)
+                article = {'pub_date': pub_date, 'sentences': []}
+
+                # get sentence text
+                with util.detect_encoding_and_open(article_file_path) as f:
+                    sentences_in_article = [{
+                        'text': line.strip(),
+                        'mentioned_dates': []
+                    } for line in f.readlines()[1:]  # skip first line (headline)
+                      if line.strip()]
+
+                # get date mentions using HeidelTime
+                # and add them to the sentence data
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
+                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
+                for i in range(len(sentences_in_article)):
+                    sentence = sentences_in_article[i]
+                    sentence['mentioned_dates'] = mentioned_dates_by_sentence[i]
+
+                article['sentences'] += sentences_in_article
+                data[topic_name]['articles'].append(article)
+
+        # parse gold timelines
+        for gold_timeline_filename in util.files(topic_path / 'public' / 'timelines', extension='txt'):
+            if gold_timeline_filename.startswith('.'):
+                continue
+            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            gold_timeline_name = gold_timeline_filename.split('.')[0]
+            gold_timeline = {}
+            with open(gold_timeline_file_path) as f:
+                lines = [line.strip() for line in f.readlines()]
+            date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
+            for date_group in date_groups:
+                date, sentences_on_date = date_group[0], date_group[1:]
+                sentences_on_date = [s.lstrip('-').strip() for s in sentences_on_date]
+                gold_timeline[date] = sentences_on_date
+            data[topic_name]['gold_timelines'][gold_timeline_name] = gold_timeline
+
+    pickle.dump(data, open(cache_filename, 'wb'))
+    return data
+
 
 def filter_articles_by_date(articles, start_date, end_date):
     return [a for a in articles if start_date <= a['pub_date'] <= end_date]
diff --git a/heideltime_util.py b/heideltime_util.py
index f149b20..0638d41 100644
--- a/heideltime_util.py
+++ b/heideltime_util.py
@@ -4,6 +4,8 @@ import subprocess
 import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
 
+import util
+
 heideltime_path = 'tools/heideltime'
 heideltime_jar_name = 'de.unihd.dbs.heideltime.standalone.jar'
 heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.DOTALL)
@@ -14,7 +16,7 @@ def mentioned_dates_by_sentence(filename, pub_date):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
-    with open(filename, encoding='utf-8') as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
         for line in f:
             g.write(escape(line))
 
diff --git a/run.py b/run.py
index f4944db..ec758aa 100644
--- a/run.py
+++ b/run.py
@@ -10,7 +10,7 @@ import timeline_generation
 
 def main(args):
     eval_results = evaluation.ResultLogger()
-    data = dataset.get_timeline17_dataset('data/in/timeline17/Data')
+    data = dataset.get_crisis_dataset()
 
     for topic in data.keys():
         articles = data[topic]['articles']
diff --git a/util.py b/util.py
index b2af446..88daf5b 100644
--- a/util.py
+++ b/util.py
@@ -1,4 +1,5 @@
 import os
+import chardet
 from datetime import datetime
 
 
@@ -29,3 +30,25 @@ def rank(lst, scores):
 
 def days_between(date1, date2):
     return abs((datetime.strptime(date1, '%Y-%m-%d') - datetime.strptime(date2, '%Y-%m-%d')).days)
+
+def detect_encoding_and_open(filename):
+    """
+    Opens a (text) file for reading.
+    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
+    chardet is used to detect the encoding, with 'utf-8' and 'cp1252' as fallbacks in case the detection fails.
+    If no encoding works, a ValueError is raised.
+    :param filename: The name of the file to be opened.
+    :return: A file handle opened with a working encoding.
+    """
+    raw_data = open(filename, 'rb').read()
+    detected_encoding = chardet.detect(raw_data)['encoding']  # may be None if detection fails
+    encodings = [e for e in (detected_encoding, 'utf-8', 'cp1252') if e]
+    for encoding in encodings:
+        try:
+            # check that the whole file can be decoded with this encoding
+            with open(filename, encoding=encoding) as f:
+                f.read()
+        except UnicodeDecodeError:
+            continue
+        return open(filename, encoding=encoding)  # reopen so the caller reads from the start
+    raise ValueError(f'no suitable encoding found for {filename}')
--
GitLab
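
For reference, a rough sketch of how the dictionary returned by get_crisis_dataset() is laid out, based on the code in dataset.py and on how run.py consumes it. The loop is illustrative only; the actual topic names, dates, and sentence contents depend on the files under data/in/crisis.

    import dataset

    data = dataset.get_crisis_dataset()
    for topic, topic_data in data.items():
        # one entry per parsed article file
        for article in topic_data['articles']:
            pub_date = article['pub_date']                # taken from the directory name
            for sentence in article['sentences']:
                text = sentence['text']                   # sentence text, headline skipped
                mentioned = sentence['mentioned_dates']   # dates found by HeidelTime for this sentence
        # one gold timeline per file in <topic>/public/timelines
        for timeline_name, timeline in topic_data['gold_timelines'].items():
            for date, sentences_on_date in timeline.items():
                pass                                      # gold sentences grouped by date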
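
The date_groups line in the gold-timeline parsing relies on itertools.groupby to split the timeline file on separator lines made up of dashes (implied by the '^-+$' regex and the lstrip('-') call). A minimal, self-contained illustration; the two entries below are invented and only assume that format:

    import itertools
    import re

    # invented example: a date line, sentence lines, then a dash separator
    lines = [
        '2011-01-25',
        'Protests break out across the country.',
        '--------------------',
        '2011-02-11',
        'The president steps down.',
        '--------------------',
    ]

    # separator lines match '^-+$' and are dropped ('if not x');
    # consecutive non-separator lines end up in one group per date
    date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]

    assert date_groups == [
        ['2011-01-25', 'Protests break out across the country.'],
        ['2011-02-11', 'The president steps down.'],
    ]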