Commit 303aa71e authored by Simon Will
Browse files

Make extracting verses more robust

parent e5c8d74e
Loading
Loading
Loading
Loading
+10 −4
Original line number Diff line number Diff line
@@ -3,6 +3,7 @@
import logging
import re
import os.path
import traceback

from bs4 import BeautifulSoup
from unidecode import unidecode
@@ -115,10 +116,15 @@ def separate_punctuation(tokens):


def reconstruct_verse_text_from_reading(reading):
    """Rebuild the verse text of *reading* from its tokens.

    Each token in ``reading.tokens`` is written into a space-filled buffer
    at the position given by its ``span`` (a ``(start, end)`` pair), so the
    gaps between tokens come out as blanks. The buffer is sized from the
    end offset of the last token.

    This is deliberately best-effort: if anything goes wrong (for example
    the reading has no tokens at all), the error and its traceback are
    printed and an empty string is returned instead of raising.

    :param reading: object exposing ``tokens``; every token needs ``span``
        and ``text`` attributes.
    :return: the reconstructed text, or ``''`` if reconstruction failed.
    """
    # NOTE: the previous implementation sized the buffer outside of any
    # error handling via ``[t for t in ... if not t.is_punct()].span[1]``,
    # which reads ``.span`` on a list and therefore always raised.
    # Everything that can fail now lives inside the ``try``.
    try:
        codepoints = [' '] * reading.tokens[-1].span[1]
        for token in reading.tokens:
            start, end = token.span[0], token.span[1]
            # Slice assignment spreads the token's characters over its span.
            codepoints[start:end] = token.text
    except Exception:
        print('ERROR reconstructing verse from reading {!r}'
              .format(reading))
        traceback.print_exc()
        codepoints = []
    return ''.join(codepoints)


+18 −11
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@
import argparse
import json
import os
import traceback

import allzweckmesser as azm

@@ -11,6 +12,8 @@ import allzweckmesser as azm
def main(hypotactic_dir, top_out_dir, meters=['hexameter']):
    corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir)
    for document in corpus.documents:
        print('Processing {}'.format(document.title))
        try:
            doc_out_dir = os.path.join(top_out_dir, document.title)
            os.makedirs(doc_out_dir, exist_ok=True)
            for meter in meters:
@@ -19,9 +22,13 @@ def main(hypotactic_dir, top_out_dir, meters=['hexameter']):
                    for line in document.get_lines_with_meter([meter])
                ]
                if verses_for_meter:
                with open(os.path.join(doc_out_dir, '{}.json'.format(meter)),
                    with open(os.path.join(doc_out_dir,
                                           '{}.json'.format(meter)),
                              'w') as f:
                        json.dump(verses_for_meter, f)
        except Exception:
            print('ERROR at document {}'.format(document.title))
            traceback.print_exc()


def parse_args_and_main():