Loading allzweckmesser/meters.py +20 −12 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ class Meter: AEOLIC_BASE = r'(?:(–)(–)|(–)(⏑)|(⏑)(–))' ALL_METERS = { 'Catalectic Dactylic Hexameter': Meter( 'hexameter': Meter( 'Catalectic Dactylic Hexameter', r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(⏑|–)', conditions=[ Loading @@ -76,52 +76,60 @@ ALL_METERS = { [('mora', 10, 'Penthemimeral')], [('mora', 16, 'Bucolic Diaeresis')] ], short_name='6da‸', short_name='hexameter', id=0 ), 'Dactylic Pentameter': Meter( 'pentameter': Meter( 'Dactylic Pentameter', r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(–)(⏑⏑)(–)(⏑⏑)(⏑|–)', breaks=[[('mora', 5, 'Middle diaeresis')]], short_name='3da‸3da‸', short_name='pentameter', id=1 ), 'Iambic Trimeter': Meter( 'ia6': Meter( 'Iambic Trimeter', r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)', breaks=[ [('element', 4, 'After fourth element')], [('element', 8, 'After eighth element')] ], short_name='3ia', short_name='ia6', id=2 ), 'Iambic Senarius': Meter( 'senarii': Meter( 'Iambic Senarius', r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)', short_name='6ia', short_name='senarii', id=3 ), 'Sapphic Hendecasyllable': Meter( 'sap hen': Meter( 'Sapphic Hendecasyllable', r'(–)(–|⏑)(–)(–|⏑)(–)(⏑)(⏑)(–)(⏑)(–)(⏑|–)', conditions={}, short_name='sap hen', id=4 ), 'Adoneus': Meter( 'adoneus': Meter( 'Adoneus', r'(–)(⏑⏑)(–)(⏑|–)', short_name='adoneus', id=5 ), 'Phalaecian Hendecasyllable': Meter( 'hendecasyllables': Meter( 'Phalaecian Hendecasyllable', AEOLIC_BASE + r'(–)(⏑)(⏑)(–)(⏑)(–)(⏑)(–)(⏑|–)', breaks=[[('element', 6, 'After sixth element')]], short_name='hen', short_name='hendecasyllables', id=6 ), 'scazon': Meter( 'Choliamb', r'(⏑|–)(–)(⏑)(–)(⏑|–)(–)(⏑)(–)(⏑)(–)(–)(⏑|–)', breaks=[[('element', 5, 'After sixth element')], [('element', 7, 'After sixth element')]], short_name='scazon', id=7 ), } Loading allzweckmesser/model.py +2 −2 Original line number Diff line number Diff line 
Loading @@ -73,7 +73,7 @@ class Syllable: else: self.text = syllable self.span = span self.id = idx or None self.id = idx if idx is not None else None self.syllable_length = syllable_length self.vowel_length = vowel_length self.phenomena = phenomena or dict() Loading @@ -82,7 +82,7 @@ class Syllable: def from_json(cls, json_file): raw = check_format(json_file) idx = raw['id'] idx = raw['id'] if 'id' in raw else 0 span = raw['span'] text = raw['syllable'] syllable_length = raw.get('syllable_length') Loading scripts/create_corpus.py 0 → 100644 +59 −0 Original line number Diff line number Diff line #!/usr/bin/python3 # -*- coding: utf-8 -*- import argparse import json import os import sys import traceback import allzweckmesser as azm def main(corpus_specification, top_in_dir, top_out_dir): spec = json.load(open(corpus_specification)) os.makedirs(top_out_dir, exist_ok=True) for split in ['train', 'dev', 'test']: print('Split {}'.format(split)) split_file = os.path.join(top_out_dir, split) instances = [] for meter_name, paths in spec[split].items(): print('Processing meter {}'.format(meter_name)) meter = azm.meters.ALL_METERS[meter_name] for path in paths: print(' Processing path {}'.format(path)) with open(os.path.join(top_in_dir, path)) as f: verses = [] for v_dict in json.load(f): try: verse = azm.model.Verse.from_json(v_dict) except Exception: print('ERROR in verse {!r}'.format(v_dict), file=sys.stderr) traceback.print_exc() verse = None if verse: verses.append(verse) for verse in verses: print(' Processing verse {}'.format(verse.text)) matches = meter.match_reading(verse.readings[0]) instances.append([meter_name, verse, matches]) with open(split_file, 'w') as f: json.dump(instances, f) def parse_args_and_main(): d = 'Create a train-dev-test split corpus from a corpus specfication' parser = argparse.ArgumentParser(description=d) parser.add_argument('corpus_specification', help='JSON specifying the splits and their files') parser.add_argument('top_in_dir', help='Top 
level directory of the extracted json files') parser.add_argument('top_out_dir', help='Top level directory of the created JSON files') args = parser.parse_args() main(**vars(args)) if __name__ == '__main__': parse_args_and_main() Loading
allzweckmesser/meters.py +20 −12 Original line number Diff line number Diff line Loading @@ -64,7 +64,7 @@ class Meter: AEOLIC_BASE = r'(?:(–)(–)|(–)(⏑)|(⏑)(–))' ALL_METERS = { 'Catalectic Dactylic Hexameter': Meter( 'hexameter': Meter( 'Catalectic Dactylic Hexameter', r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(⏑|–)', conditions=[ Loading @@ -76,52 +76,60 @@ ALL_METERS = { [('mora', 10, 'Penthemimeral')], [('mora', 16, 'Bucolic Diaeresis')] ], short_name='6da‸', short_name='hexameter', id=0 ), 'Dactylic Pentameter': Meter( 'pentameter': Meter( 'Dactylic Pentameter', r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(–)(⏑⏑)(–)(⏑⏑)(⏑|–)', breaks=[[('mora', 5, 'Middle diaeresis')]], short_name='3da‸3da‸', short_name='pentameter', id=1 ), 'Iambic Trimeter': Meter( 'ia6': Meter( 'Iambic Trimeter', r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)', breaks=[ [('element', 4, 'After fourth element')], [('element', 8, 'After eighth element')] ], short_name='3ia', short_name='ia6', id=2 ), 'Iambic Senarius': Meter( 'senarii': Meter( 'Iambic Senarius', r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)', short_name='6ia', short_name='senarii', id=3 ), 'Sapphic Hendecasyllable': Meter( 'sap hen': Meter( 'Sapphic Hendecasyllable', r'(–)(–|⏑)(–)(–|⏑)(–)(⏑)(⏑)(–)(⏑)(–)(⏑|–)', conditions={}, short_name='sap hen', id=4 ), 'Adoneus': Meter( 'adoneus': Meter( 'Adoneus', r'(–)(⏑⏑)(–)(⏑|–)', short_name='adoneus', id=5 ), 'Phalaecian Hendecasyllable': Meter( 'hendecasyllables': Meter( 'Phalaecian Hendecasyllable', AEOLIC_BASE + r'(–)(⏑)(⏑)(–)(⏑)(–)(⏑)(–)(⏑|–)', breaks=[[('element', 6, 'After sixth element')]], short_name='hen', short_name='hendecasyllables', id=6 ), 'scazon': Meter( 'Choliamb', r'(⏑|–)(–)(⏑)(–)(⏑|–)(–)(⏑)(–)(⏑)(–)(–)(⏑|–)', breaks=[[('element', 5, 'After sixth element')], [('element', 7, 'After sixth element')]], short_name='scazon', id=7 ), } Loading
allzweckmesser/model.py +2 −2 Original line number Diff line number Diff line Loading @@ -73,7 +73,7 @@ class Syllable: else: self.text = syllable self.span = span self.id = idx or None self.id = idx if idx is not None else None self.syllable_length = syllable_length self.vowel_length = vowel_length self.phenomena = phenomena or dict() Loading @@ -82,7 +82,7 @@ class Syllable: def from_json(cls, json_file): raw = check_format(json_file) idx = raw['id'] idx = raw['id'] if 'id' in raw else 0 span = raw['span'] text = raw['syllable'] syllable_length = raw.get('syllable_length') Loading
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Create a train/dev/test corpus from a JSON corpus specification.

Script path in the repository: scripts/create_corpus.py
"""

import argparse
import json
import os
import sys
import traceback

import allzweckmesser as azm

# Names of the splits that must be present as keys in the specification.
_SPLITS = ('train', 'dev', 'test')


def _load_verses(path):
    """Parse the verse dicts stored in the JSON file at *path*.

    Verses for which ``Verse.from_json`` raises are reported on stderr
    (message plus traceback) and skipped, so one malformed entry does not
    abort the whole run.

    :param path: Path of a JSON file containing an iterable of verse dicts.
    :return: List of the successfully parsed (and truthy) verses.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale.
    with open(path, encoding='utf-8') as f:
        verse_dicts = json.load(f)

    verses = []
    for v_dict in verse_dicts:
        try:
            verse = azm.model.Verse.from_json(v_dict)
        except Exception:
            # Deliberate best effort: log the offending entry and move on.
            print('ERROR in verse {!r}'.format(v_dict), file=sys.stderr)
            traceback.print_exc()
            continue
        # Truthiness check kept from the original code (it also filters
        # out a ``None`` result).
        if verse:
            verses.append(verse)
    return verses


def main(corpus_specification, top_in_dir, top_out_dir):
    """Build one output file per split from the corpus specification.

    The specification is a JSON object with the keys ``train``, ``dev``
    and ``test``; each maps a meter name (a key of
    ``azm.meters.ALL_METERS``) to a list of file paths relative to
    *top_in_dir*.  For every verse in those files the first reading is
    matched against the meter, and ``[meter_name, verse, matches]`` is
    collected.  The collected instances of a split are dumped as JSON to
    ``<top_out_dir>/<split>``.

    :param corpus_specification: Path of the JSON specification file.
    :param top_in_dir: Directory the paths in the specification are
        relative to.
    :param top_out_dir: Directory the split files are written to; it is
        created if it does not exist.
    :raises KeyError: If a split is missing from the specification or a
        meter name is not a key of ``ALL_METERS``.
    """
    # Use a context manager so the specification file is closed promptly.
    with open(corpus_specification, encoding='utf-8') as f:
        spec = json.load(f)

    os.makedirs(top_out_dir, exist_ok=True)

    for split in _SPLITS:
        print('Split {}'.format(split))
        split_file = os.path.join(top_out_dir, split)
        instances = []
        for meter_name, paths in spec[split].items():
            print('Processing meter {}'.format(meter_name))
            meter = azm.meters.ALL_METERS[meter_name]
            for path in paths:
                print(' Processing path {}'.format(path))
                for verse in _load_verses(os.path.join(top_in_dir, path)):
                    print(' Processing verse {}'.format(verse.text))
                    matches = meter.match_reading(verse.readings[0])
                    instances.append([meter_name, verse, matches])
        # NOTE(review): json.dump raises TypeError for objects it cannot
        # serialize; whether Verse and the match results are serializable
        # cannot be determined from this file -- confirm.
        with open(split_file, 'w', encoding='utf-8') as f:
            json.dump(instances, f)


def parse_args_and_main():
    """Parse the command line arguments and run :func:`main` with them."""
    d = 'Create a train-dev-test split corpus from a corpus specfication'
    parser = argparse.ArgumentParser(description=d)
    parser.add_argument('corpus_specification',
                        help='JSON specifying the splits and their files')
    parser.add_argument('top_in_dir',
                        help='Top level directory of the extracted json files')
    parser.add_argument('top_out_dir',
                        help='Top level directory of the created JSON files')
    args = parser.parse_args()
    # The positional argument names match main()'s parameter names.
    main(**vars(args))


if __name__ == '__main__':
    parse_args_and_main()