From 5f9d59e809201611e3de34bf36cdf161c27d980d Mon Sep 17 00:00:00 2001
From: vvye <ekaiser.hellwege@gmail.com>
Date: Fri, 24 Sep 2021 12:52:31 +0200
Subject: [PATCH] Rewind file in detect_encoding_and_open and use it for gold timelines

---
 dataset.py | 3 ++-
 util.py    | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/dataset.py b/dataset.py
index 6095752..88e3274 100644
--- a/dataset.py
+++ b/dataset.py
@@ -156,9 +156,10 @@ def get_crisis_dataset():
             if gold_timeline_filename.startswith('.'):
                 continue
             gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            print(gold_timeline_file_path)
             gold_timeline_name = gold_timeline_filename.split('.')[0]
             gold_timeline = {}
-            with open(gold_timeline_file_path) as f:
+            with util.detect_encoding_and_open(gold_timeline_file_path) as f:
                 lines = [line.strip() for line in f.readlines()]
                 date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
                 for date_group in date_groups:
diff --git a/util.py b/util.py
index cf5d63d..0c1f35c 100644
--- a/util.py
+++ b/util.py
@@ -42,11 +42,12 @@ def detect_encoding_and_open(filename):
     """
     raw_data = open(filename, 'rb').read()
     detected_encoding = chardet.detect(raw_data)['encoding']
-    encodings = [detected_encoding, 'utf-8', 'ansi']
+    encodings = [detected_encoding, 'utf-8', 'ansi']  # these seem to work
     for encoding in encodings:
         f = open(filename, encoding=encoding)
         try:
             _ = [line for line in f.readlines()]
+            f.seek(0)
             return f
         except UnicodeDecodeError:
             f.close()
-- 
GitLab