Skip to content
Snippets Groups Projects
Commit 303aa71e authored by Simon Will
Browse files

Make extracting verses more robust

parent e5c8d74e
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,7 @@
import logging
import re
import os.path
import traceback
from bs4 import BeautifulSoup
from unidecode import unidecode
......@@ -115,10 +116,15 @@ def separate_punctuation(tokens):
def reconstruct_verse_text_from_reading(reading):
    """Rebuild the text of a verse from the tokens of *reading*.

    A buffer of spaces is created whose length is the end offset
    (``span[1]``) of the last token. Each token's ``text`` is then
    written into the slice ``span[0]:span[1]`` of that buffer, so any
    position not covered by a token stays a space.

    The function is deliberately best-effort: if anything goes wrong
    while rebuilding the text, the error and its traceback are printed
    and an empty string is returned instead of raising to the caller.

    :param reading: object exposing a ``tokens`` sequence; every token
        must provide ``text`` and an indexable ``span`` whose items 0
        and 1 are used as slice bounds.
    :return: the reconstructed verse text, or ``''`` when there are no
        tokens or the reconstruction failed.
    """
    try:
        tokens = reading.tokens
        if not tokens:
            # Nothing to reconstruct. This is not an error, so do not
            # print a traceback for it.
            return ''
        codepoints = [' '] * tokens[-1].span[1]
        for token in tokens:
            # Assigning a string to a list slice inserts its characters
            # one by one.
            codepoints[token.span[0]:token.span[1]] = token.text
    except Exception:
        print('ERROR reconstructing verse from reading {!r}'
              .format(reading))
        traceback.print_exc()
        codepoints = []
    return ''.join(codepoints)
......
......@@ -4,6 +4,7 @@
import argparse
import json
import os
import traceback
import allzweckmesser as azm
......@@ -11,17 +12,23 @@ import allzweckmesser as azm
def main(hypotactic_dir, top_out_dir, meters=['hexameter']):
    """Write the verses of every corpus document to JSON, one file per meter.

    The corpus is loaded from *hypotactic_dir*. For each document a
    directory named after ``document.title`` is created below
    *top_out_dir*, and for each requested meter the matching lines are
    converted to verse dicts and dumped to ``<meter>.json`` in that
    directory. No file is written for a meter without matching lines.

    Failures are isolated per document: an exception while processing
    one document is printed together with its traceback, and processing
    continues with the next document.

    :param hypotactic_dir: directory the corpus is loaded from.
    :param top_out_dir: directory below which the per-document output
        directories are created.
    :param meters: iterable of meter names to extract. The default list
        is only iterated, never mutated, in this function.
    """
    corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir)
    for document in corpus.documents:
        print('Processing {}'.format(document.title))
        try:
            doc_out_dir = os.path.join(top_out_dir, document.title)
            os.makedirs(doc_out_dir, exist_ok=True)
            for meter in meters:
                verses_for_meter = [
                    azm.corpus.HypotacticLine(line).verse.to_dict()
                    for line in document.get_lines_with_meter([meter])
                ]
                # Skip meters without any verses so that no empty JSON
                # files are created.
                if verses_for_meter:
                    with open(os.path.join(doc_out_dir,
                                           '{}.json'.format(meter)),
                              'w') as f:
                        json.dump(verses_for_meter, f)
        except Exception:
            # Best-effort: report the failing document and carry on.
            print('ERROR at document {}'.format(document.title))
            traceback.print_exc()
def parse_args_and_main():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment