Commit 977dbc70 authored by vvye

Implement fetching of crisis dataset

parent aaec60f3
@@ -29,19 +29,18 @@ import heideltime_util
 import util
 
 
-def get_timeline17_dataset(path):
+def get_timeline17_dataset():
     """
     Returns the Timeline17 dataset as a dictionary.
-    If cached.pkl exists in the given path, it will be loaded from there,
+    If data/in/timeline17/timeline17.pkl exists, it will be loaded from there,
     otherwise, it will be parsed from scratch (assuming the default folder structure).
 
-    :param path: The path to Timeline17's 'Data' directory.
-    :return: A dictionary containing the preprocessed data.
+    :return: A dictionary containing the dataset.
     """
-    path = Path(path)
-    cache_filename = path / 'cached.pkl'
+    path = Path('data/in/timeline17/Data')
+    cache_filename = Path('data/in/timeline17/timeline17.pkl')
 
     if os.path.exists(cache_filename):
         return pickle.load(open(cache_filename, 'rb'))
@@ -100,5 +99,77 @@ def get_timeline17_dataset(path):
     return data
 
 
+def get_crisis_dataset():
+    """
+    Returns the crisis dataset as a dictionary.
+    If data/in/crisis/crisis.pkl exists, it will be loaded from there,
+    otherwise, it will be parsed from scratch (assuming the default folder structure).
+
+    :return: A dictionary containing the dataset.
+    """
+    path = Path('data/in/crisis')
+    cache_filename = Path('data/in/crisis/crisis.pkl')
+
+    if os.path.exists(cache_filename):
+        return pickle.load(open(cache_filename, 'rb'))
+
+    data = {}
+
+    # go through each topic directory
+    for topic_dirname in util.subdirs(path):
+        topic_path = path / topic_dirname
+        topic_name = topic_dirname
+        if topic_name not in data:
+            data[topic_name] = {'articles': [], 'gold_timelines': {}}
+
+        # parse input articles
+        for pub_date in util.subdirs(topic_path / 'public' / 'content'):
+            date_path = topic_path / 'public' / 'content' / pub_date
+            for article_filename in util.files(date_path, extension='.cont'):
+                article_file_path = date_path / article_filename
+                print(article_file_path)
+                article = {'pub_date': pub_date, 'sentences': []}
+
+                # get sentence text
+                with util.detect_encoding_and_open(article_file_path) as f:
+                    sentences_in_article = [{
+                        'text': line.strip(),
+                        'mentioned_dates': []
+                    } for line in f.readlines()[1:]  # skip first line (headline)
+                      if line.strip()]
+
+                # get date mentions using HeidelTime
+                # and add them to the sentence data
+                mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
+                mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:]  # skip first line (headline)
+                assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
+                for i in range(len(sentences_in_article)):
+                    sentence = sentences_in_article[i]
+                    sentence['mentioned_dates'] = mentioned_dates_by_sentence[i]
+
+                article['sentences'] += sentences_in_article
+                data[topic_name]['articles'].append(article)
+
+        # parse gold timelines
+        for gold_timeline_filename in util.files(topic_path / 'public' / 'timelines', extension='.txt'):
+            if gold_timeline_filename.startswith('.'):
+                continue
+            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            gold_timeline_name = gold_timeline_filename.split('.')[0]
+            gold_timeline = {}
+
+            with open(gold_timeline_file_path) as f:
+                lines = [line.strip() for line in f.readlines()]
+
+            # group lines between dashed separator lines; each group is one date
+            date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
+            for date_group in date_groups:
+                date, sentences_on_date = date_group[0], date_group[1:]
+                sentences_on_date = [s.lstrip('-').strip() for s in sentences_on_date]
+                gold_timeline[date] = sentences_on_date
+
+            data[topic_name]['gold_timelines'][gold_timeline_name] = gold_timeline
+
+    pickle.dump(data, open(cache_filename, 'wb'))
+    return data
+
+
 def filter_articles_by_date(articles, start_date, end_date):
     return [a for a in articles if start_date <= a['pub_date'] <= end_date]
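For orientation, here is a rough sketch of the dictionary get_crisis_dataset returns, plus a minimal demonstration of the groupby step that splits a gold timeline file into per-date blocks. The topic name, dates, and sentence text are made up for illustration; the real topic keys are the directory names under data/in/crisis.

    # Illustrative shape of the returned data (hypothetical 'egypt' topic).
    data = {
        'egypt': {
            'articles': [
                {
                    'pub_date': '2011-01-25',
                    'sentences': [
                        {'text': 'Protests broke out in Cairo on Tuesday.',
                         'mentioned_dates': ['2011-01-25']},
                    ],
                },
            ],
            'gold_timelines': {
                'timeline1': {'2011-01-25': ['Protests erupt across Egypt.']},
            },
        },
    }

    # Articles can then be narrowed to a date window (ISO date strings
    # compare correctly as plain strings):
    january_articles = filter_articles_by_date(
        data['egypt']['articles'], '2011-01-01', '2011-01-31')

    # The gold timeline parsing relies on itertools.groupby to drop the
    # dashed separator lines and keep the blocks between them:
    import itertools
    import re

    lines = ['2011-01-25', '- Protests erupt.', '--------', '2011-02-11', '- Mubarak resigns.']
    date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
    # -> [['2011-01-25', '- Protests erupt.'], ['2011-02-11', '- Mubarak resigns.']]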
@@ -4,6 +4,8 @@ import subprocess
 import xml.etree.ElementTree as ET
 from xml.sax.saxutils import escape
 
+import util
+
 heideltime_path = 'tools/heideltime'
 heideltime_jar_name = 'de.unihd.dbs.heideltime.standalone.jar'
 heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.DOTALL)
@@ -14,7 +16,7 @@ def mentioned_dates_by_sentence(filename, pub_date):
 
     # create a temporary copy of the file with interfering characters escaped
     escaped_filename = str(filename) + '.escaped'
-    with open(filename, encoding='utf-8') as f, open(escaped_filename, 'w', encoding='utf-8') as g:
+    with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
         for line in f:
             g.write(escape(line))
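The escaping step matters because HeidelTime's output is TimeML, i.e. XML: raw &, < and > characters in the article text would otherwise corrupt the markup. A minimal illustration using the same xml.sax.saxutils helper:

    from xml.sax.saxutils import escape

    print(escape('AT&T shares fell <5% on Monday'))
    # AT&amp;T shares fell &lt;5% on Monday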
@@ -10,7 +10,7 @@ import timeline_generation
 def main(args):
     eval_results = evaluation.ResultLogger()
 
-    data = dataset.get_timeline17_dataset('data/in/timeline17/Data')
+    data = dataset.get_crisis_dataset()
 
     for topic in data.keys():
         articles = data[topic]['articles']
 import os
+import chardet
 from datetime import datetime
@@ -29,3 +30,25 @@ def rank(lst, scores):
 
 def days_between(date1, date2):
     return abs((datetime.strptime(date1, '%Y-%m-%d') - datetime.strptime(date2, '%Y-%m-%d')).days)
+
+
+def detect_encoding_and_open(filename):
+    """
+    Opens a (text) file for reading.
+    Behaves the same as the builtin open, but attempts to determine the correct encoding first.
+    chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
+    If no encoding works, a UnicodeDecodeError is raised.
+
+    :param filename: The name of the file to be opened
+    :return: A file handle.
+    """
+    with open(filename, 'rb') as raw:
+        raw_data = raw.read()
+    detected_encoding = chardet.detect(raw_data)['encoding']  # may be None if detection fails
+    encodings = [e for e in (detected_encoding, 'utf-8', 'ansi') if e]
+    for encoding in encodings:
+        try:
+            f = open(filename, encoding=encoding)
+            f.read()   # force a full decode to surface encoding errors
+            f.seek(0)  # rewind so the caller reads from the beginning
+            return f
+        except LookupError:  # 'ansi' is only a valid codec name on Windows
+            continue
+        except UnicodeDecodeError:
+            f.close()
+    raise UnicodeDecodeError('unknown', raw_data, 0, len(raw_data), 'no suitable encoding found')
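A small usage sketch of the new helper; the file path is hypothetical, following the crisis dataset layout above. Since the function returns an ordinary file handle, it works as a context manager:

    # Hypothetical usage: open an article file whose encoding is unknown.
    with detect_encoding_and_open('data/in/crisis/egypt/public/content/2011-01-25/1.cont') as f:
        headline = f.readline().strip()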