Skip to content
Snippets Groups Projects
Commit e79f881e authored by vvye's avatar vvye
Browse files

More changes necessary for crisis dataset loading

parent 27f37dba
No related branches found
No related tags found
No related merge requests found
......@@ -129,7 +129,7 @@ def get_crisis_dataset():
for article_filename in util.files(date_path, extension='.cont'):
article_file_path = date_path / article_filename
# nah
# this one breaks heideltime due to character encoding shenanigans
if topic_name == 'egypt' and '2429.htm.cont' in str(article_file_path):
continue
......@@ -145,8 +145,8 @@ def get_crisis_dataset():
# get date mentions using HeidelTime
# and add them to the sentence data
mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date)
mentioned_dates_by_sentence = mentioned_dates_by_sentence[1:] # skip first line (headline)
mentioned_dates_by_sentence = heideltime_util.mentioned_dates_by_sentence(article_file_path, pub_date,
skip_first_line=True)
assert len(mentioned_dates_by_sentence) == len(sentences_in_article)
for i in range(len(sentences_in_article)):
sentence = sentences_in_article[i]
......
......@@ -12,13 +12,16 @@ heideltime_root_regex = re.compile('<TimeML>(.*?)</TimeML>', re.MULTILINE | re.D
date_format_regex = re.compile('^\d{4}-\d{2}-\d{2}')
def mentioned_dates_by_sentence(filename, pub_date):
def mentioned_dates_by_sentence(filename, pub_date, skip_first_line=False):
# create a temporary copy of the file with interfering characters escaped
escaped_filename = str(filename) + '.escaped'
with util.detect_encoding_and_open(filename) as f, open(escaped_filename, 'w', encoding='utf-8') as g:
first_line = True
for line in f.readlines():
g.write(escape(line))
if not skip_first_line or not first_line:
g.write(escape(line))
first_line = False
# change to heideltime directory (and keep track of the path back to the root)
working_dir = os.getcwd()
......
......@@ -35,14 +35,14 @@ def detect_encoding_and_open(filename):
"""
Opens a (text) file for reading.
Behaves the same as the builtin open, but attempts to determine the correct encoding first.
chardet is used to detect the encoding, with 'utf-8' and 'ansi' as fallbacks in case the detection fails.
'utf-8' is attempted first, followed by detecting the encoding with chardet, with 'ansi' as a fallback.
If no encoding works, a UnicodeDecodeError is raised.
:param filename: The name of the file to be opened
:return: A file handle.
"""
raw_data = open(filename, 'rb').read()
detected_encoding = chardet.detect(raw_data)['encoding']
encodings = [detected_encoding, 'utf-8', 'ansi'] # these seem to work
encodings = ['utf-8', detected_encoding, 'ansi'] # these seem to work
for encoding in encodings:
f = open(filename, encoding=encoding)
try:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment