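Fix another bug in the open function (util.detect_encoding_and_open): rewind the file after the trial read, and use the helper for gold timelines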

5f9d59e8 · vvye · cb2faec3 · 5f9d59e8 · 5f9d59e8
Commit 5f9d59e8 authored 3 years ago by vvye
--- a/dataset.py
+++ b/dataset.py
@@ -156,9 +156,10 @@ def get_crisis_dataset():
            if gold_timeline_filename.startswith('.'):
                continue
            gold_timeline_file_path = topic_path / 'public' / 'timelines' / gold_timeline_filename
+            print(gold_timeline_file_path)
            gold_timeline_name = gold_timeline_filename.split('.')[0]
            gold_timeline = {}
-            with open(gold_timeline_file_path) as f:
+            with util.detect_encoding_and_open(gold_timeline_file_path) as f:
                lines = [line.strip() for line in f.readlines()]
                date_groups = [list(y) for x, y in itertools.groupby(lines, lambda z: re.match('^-+$', z)) if not x]
                for date_group in date_groups:

--- a/util.py
+++ b/util.py
@@ -42,11 +42,12 @@ def detect_encoding_and_open(filename):
    """
    raw_data = open(filename, 'rb').read()
    detected_encoding = chardet.detect(raw_data)['encoding']
-    encodings = [detected_encoding, 'utf-8', 'ansi']
+    encodings = [detected_encoding, 'utf-8', 'ansi']  # these seem to work
    for encoding in encodings:
        f = open(filename, encoding=encoding)
        try:
            _ = [line for line in f.readlines()]
+            f.seek(0)
            return f
        except UnicodeDecodeError:
            f.close()