Skip to content
Snippets Groups Projects
Commit 303aa71e authored by Simon Will's avatar Simon Will
Browse files

Make extracting verses more robust

parent e5c8d74e
No related branches found
No related tags found
No related merge requests found
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import logging import logging
import re import re
import os.path import os.path
import traceback
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from unidecode import unidecode from unidecode import unidecode
...@@ -115,10 +116,15 @@ def separate_punctuation(tokens): ...@@ -115,10 +116,15 @@ def separate_punctuation(tokens):
def reconstruct_verse_text_from_reading(reading):
    """Rebuild the text of a verse from the tokens of a reading.

    Each token's ``text`` is written into a space-filled character buffer
    at the position given by ``token.span``; positions not covered by any
    token stay as spaces.

    Args:
        reading: Object with a ``tokens`` sequence. Every token provides
            ``span`` (indexable, ``span[0]`` = start, ``span[1]`` = end)
            and ``text`` (a string).

    Returns:
        The reconstructed text. An empty string is returned when there are
        no tokens, or when reconstruction fails (in which case the error
        and its traceback are printed instead of being raised).
    """
    try:
        tokens = reading.tokens
        # Size the buffer by the furthest span end rather than by the last
        # token's end, so the result does not depend on token order.
        # ``default=0`` makes an empty token list yield an empty text
        # instead of an IndexError.
        length = max((token.span[1] for token in tokens), default=0)
        codepoints = [' '] * length
        for token in tokens:
            # Slice assignment spreads the string into single characters.
            codepoints[token.span[0]:token.span[1]] = token.text
    except Exception:
        # Deliberately best-effort: one malformed reading must not abort
        # the extraction of a whole document.
        print('ERROR reconstructing verse from reading {!r}'
              .format(reading))
        traceback.print_exc()
        codepoints = []
    return ''.join(codepoints)
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import argparse import argparse
import json import json
import os import os
import traceback
import allzweckmesser as azm import allzweckmesser as azm
...@@ -11,17 +12,23 @@ import allzweckmesser as azm ...@@ -11,17 +12,23 @@ import allzweckmesser as azm
def main(hypotactic_dir, top_out_dir, meters=('hexameter',)):
    """Dump the verses of every corpus document to per-meter JSON files.

    For each document of the corpus loaded from ``hypotactic_dir`` a
    directory named after the document title is created below
    ``top_out_dir``. For every requested meter that has at least one line
    in the document, the verses are written to ``<meter>.json`` there.

    A failure while processing one document is printed together with its
    traceback and does not stop the processing of the remaining documents.

    Args:
        hypotactic_dir: Directory passed to
            ``azm.corpus.HypotacticCorpus.from_directory``.
        top_out_dir: Directory below which the per-document output
            directories are created.
        meters: Iterable of meter names to extract.
    """
    corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir)
    for document in corpus.documents:
        print('Processing {}'.format(document.title))
        try:
            doc_out_dir = os.path.join(top_out_dir, document.title)
            os.makedirs(doc_out_dir, exist_ok=True)
            for meter in meters:
                verses_for_meter = [
                    azm.corpus.HypotacticLine(line).verse.to_dict()
                    for line in document.get_lines_with_meter([meter])
                ]
                # Do not create empty files for meters the document
                # does not contain.
                if not verses_for_meter:
                    continue
                out_path = os.path.join(doc_out_dir,
                                        '{}.json'.format(meter))
                with open(out_path, 'w') as f:
                    json.dump(verses_for_meter, f)
        except Exception:
            # Best-effort batch job: report the broken document and
            # carry on with the next one.
            print('ERROR at document {}'.format(document.title))
            traceback.print_exc()
def parse_args_and_main(): def parse_args_and_main():
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment