Loading allzweckmesser/corpus.py +10 −4 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ import logging import re import os.path import traceback from bs4 import BeautifulSoup from unidecode import unidecode Loading Loading @@ -115,10 +116,15 @@ def separate_punctuation(tokens): def reconstruct_verse_text_from_reading(reading): last_non_punct_token = [t for t in reading.tokens if not t.is_punct()] codepoints = [' ' for _ in range(last_non_punct_token.span[1])] try: codepoints = [' ' for _ in range(reading.tokens[-1].span[1])] for token in reading.tokens: codepoints[token.span[0]:token.span[1]] = token.text except Exception: print('ERROR reconstructing verse from reading {!r}' .format(reading)) traceback.print_exc() codepoints = [] return ''.join(codepoints) Loading scripts/extract_verses_by_meters.py +18 −11 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ import argparse import json import os import traceback import allzweckmesser as azm Loading @@ -11,6 +12,8 @@ import allzweckmesser as azm def main(hypotactic_dir, top_out_dir, meters=['hexameter']): corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir) for document in corpus.documents: print('Processing {}'.format(document.title)) try: doc_out_dir = os.path.join(top_out_dir, document.title) os.makedirs(doc_out_dir, exist_ok=True) for meter in meters: Loading @@ -19,9 +22,13 @@ def main(hypotactic_dir, top_out_dir, meters=['hexameter']): for line in document.get_lines_with_meter([meter]) ] if verses_for_meter: with open(os.path.join(doc_out_dir, '{}.json'.format(meter)), with open(os.path.join(doc_out_dir, '{}.json'.format(meter)), 'w') as f: json.dump(verses_for_meter, f) except Exception: print('ERROR at document {}'.format(document.title)) traceback.print_exc() def parse_args_and_main(): Loading Loading
allzweckmesser/corpus.py +10 −4 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ import logging import re import os.path import traceback from bs4 import BeautifulSoup from unidecode import unidecode Loading Loading @@ -115,10 +116,15 @@ def separate_punctuation(tokens): def reconstruct_verse_text_from_reading(reading): last_non_punct_token = [t for t in reading.tokens if not t.is_punct()] codepoints = [' ' for _ in range(last_non_punct_token.span[1])] try: codepoints = [' ' for _ in range(reading.tokens[-1].span[1])] for token in reading.tokens: codepoints[token.span[0]:token.span[1]] = token.text except Exception: print('ERROR reconstructing verse from reading {!r}' .format(reading)) traceback.print_exc() codepoints = [] return ''.join(codepoints) Loading
scripts/extract_verses_by_meters.py +18 −11 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ import argparse import json import os import traceback import allzweckmesser as azm Loading @@ -11,6 +12,8 @@ import allzweckmesser as azm def main(hypotactic_dir, top_out_dir, meters=['hexameter']): corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir) for document in corpus.documents: print('Processing {}'.format(document.title)) try: doc_out_dir = os.path.join(top_out_dir, document.title) os.makedirs(doc_out_dir, exist_ok=True) for meter in meters: Loading @@ -19,9 +22,13 @@ def main(hypotactic_dir, top_out_dir, meters=['hexameter']): for line in document.get_lines_with_meter([meter]) ] if verses_for_meter: with open(os.path.join(doc_out_dir, '{}.json'.format(meter)), with open(os.path.join(doc_out_dir, '{}.json'.format(meter)), 'w') as f: json.dump(verses_for_meter, f) except Exception: print('ERROR at document {}'.format(document.title)) traceback.print_exc() def parse_args_and_main(): Loading