From 5f9d59e809201611e3de34bf36cdf161c27d980d Mon Sep 17 00:00:00 2001 From: vvye <ekaiser.hellwege@gmail.com> Date: Fri, 24 Sep 2021 12:52:31 +0200 Subject: [PATCH] Fix another bug in open function --- dataset.py | 3 ++- util.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dataset.py b/dataset.py index 6095752..88e3274 100644 --- a/dataset.py +++ b/dataset.py @@ -156,9 +156,10 @@ def get_crisis_dataset(): if gold_timeline_filename.startswith('.'): continue gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename + print(gold_timeline_file_path) gold_timeline_name = gold_timeline_filename.split('.')[0] gold_timeline = {} - with open(gold_timeline_file_path) as f: + with util.detect_encoding_and_open(gold_timeline_file_path) as f: lines = [line.strip() for line in f.readlines()] date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x] for date_group in date_groups: diff --git a/util.py b/util.py index cf5d63d..0c1f35c 100644 --- a/util.py +++ b/util.py @@ -42,11 +42,12 @@ def detect_encoding_and_open(filename): """ raw_data = open(filename, 'rb').read() detected_encoding = chardet.detect(raw_data)['encoding'] - encodings = [detected_encoding, 'utf-8', 'ansi'] + encodings = [detected_encoding, 'utf-8', 'ansi'] # these seem to work for encoding in encodings: f = open(filename, encoding=encoding) try: _ = [line for line in f.readlines()] + f.seek(0) return f except UnicodeDecodeError: f.close() -- GitLab