Skip to content
Snippets Groups Projects
Commit 51642aa1 authored by vvye
Browse files

What is even going on

parent e97c777b
No related branches found
No related tags found
No related merge requests found
......@@ -128,16 +128,17 @@ def get_crisis_dataset():
date_path = topic_path / 'public' / 'content' / pub_date
for article_filename in util.files(date_path, extension='.cont'):
article_file_path = date_path / article_filename
if '2429.htm.cont' not in str(article_file_path):
continue
print(article_file_path)
article = {'pub_date': pub_date, 'sentences': []}
# get sentence text
with util.detect_encoding_and_open(article_file_path) as f:
sentences_in_article = [{
'text': line.strip(),
'mentioned_dates': []
} for line in f.readlines()[1:] # skip first line (headline)
if line.strip()]
sentences_in_article = []
for line in f.readlines()[1:]:
if line.strip():
sentences_in_article.append({'text': line.strip(), 'mentioned_dates': []})
# get date mentions using HeidelTime
# and add them to the sentence data
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment