Commit 70c924a2 authored by Simon Will's avatar Simon Will
Browse files

Add corpus creation script

parent ca1d55ca
Loading
Loading
Loading
Loading
+20 −12
Original line number Diff line number Diff line
@@ -64,7 +64,7 @@ class Meter:
AEOLIC_BASE = r'(?:(–)(–)|(–)(⏑)|(⏑)(–))'

ALL_METERS = {
    'Catalectic Dactylic Hexameter': Meter(
    'hexameter': Meter(
        'Catalectic Dactylic Hexameter',
        r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(⏑⏑|–)(⏑|–)',
        conditions=[
@@ -76,52 +76,60 @@ ALL_METERS = {
            [('mora', 10, 'Penthemimeral')],
            [('mora', 16, 'Bucolic Diaeresis')]
        ],
        short_name='6da‸',
        short_name='hexameter',
        id=0
    ),
    'Dactylic Pentameter': Meter(
    'pentameter': Meter(
        'Dactylic Pentameter',
        r'(–)(⏑⏑|–)(–)(⏑⏑|–)(–)(–)(⏑⏑)(–)(⏑⏑)(⏑|–)',
        breaks=[[('mora', 5, 'Middle diaeresis')]],
        short_name='3da‸3da‸',
        short_name='pentameter',
        id=1
    ),
    'Iambic Trimeter': Meter(
    'ia6': Meter(
        'Iambic Trimeter',
        r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)',
        breaks=[
            [('element', 4, 'After fourth element')],
            [('element', 8, 'After eighth element')]
        ],
        short_name='3ia',
        short_name='ia6',
        id=2
    ),
    'Iambic Senarius': Meter(
    'senarii': Meter(
        'Iambic Senarius',
        r'(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑|⏑⏑|–)(⏑⏑|–)(⏑)(⏑|–)',
        short_name='6ia',
        short_name='senarii',
        id=3
    ),
    'Sapphic Hendecasyllable': Meter(
    'sap hen': Meter(
        'Sapphic Hendecasyllable',
        r'(–)(–|⏑)(–)(–|⏑)(–)(⏑)(⏑)(–)(⏑)(–)(⏑|–)',
        conditions={},
        short_name='sap hen',
        id=4
    ),
    'Adoneus': Meter(
    'adoneus': Meter(
        'Adoneus',
        r'(–)(⏑⏑)(–)(⏑|–)',
        short_name='adoneus',
        id=5
    ),
    'Phalaecian Hendecasyllable': Meter(
    'hendecasyllables': Meter(
        'Phalaecian Hendecasyllable',
        AEOLIC_BASE + r'(–)(⏑)(⏑)(–)(⏑)(–)(⏑)(–)(⏑|–)',
        breaks=[[('element', 6, 'After sixth element')]],
        short_name='hen',
        short_name='hendecasyllables',
        id=6
    ),
    'scazon': Meter(
        'Choliamb',
        r'(⏑|–)(–)(⏑)(–)(⏑|–)(–)(⏑)(–)(⏑)(–)(–)(⏑|–)',
        breaks=[[('element', 5, 'After sixth element')],
                [('element', 7, 'After sixth element')]],
        short_name='scazon',
        id=7
    ),
}


+2 −2
Original line number Diff line number Diff line
@@ -73,7 +73,7 @@ class Syllable:
        else:
            self.text = syllable
            self.span = span
            self.id = idx or None
            self.id = idx if idx is not None else None
            self.syllable_length = syllable_length
            self.vowel_length = vowel_length
            self.phenomena = phenomena or dict()
@@ -82,7 +82,7 @@ class Syllable:
    def from_json(cls, json_file):
        raw = check_format(json_file)

        idx = raw['id']
        idx = raw['id'] if 'id' in raw else 0
        span = raw['span']
        text = raw['syllable']
        syllable_length = raw.get('syllable_length')
+59 −0
Original line number Diff line number Diff line
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import argparse
import json
import os
import sys
import traceback

import allzweckmesser as azm


def _load_verses(json_path):
    """Parse all verses stored in the JSON file at *json_path*.

    The file is expected to contain a JSON array; every element is handed
    to ``azm.model.Verse.from_json``.  Elements that fail to parse are
    reported on stderr (with a traceback) and skipped, so a single broken
    verse does not abort the whole corpus build.  Falsy parse results are
    skipped as well.

    Returns:
        A list of the successfully parsed verse objects, in file order.
    """
    verses = []
    with open(json_path) as f:
        for v_dict in json.load(f):
            try:
                verse = azm.model.Verse.from_json(v_dict)
            except Exception:
                # Deliberate best-effort: log the offending input and go on.
                print('ERROR in verse {!r}'.format(v_dict),
                      file=sys.stderr)
                traceback.print_exc()
                continue
            if verse:
                verses.append(verse)
    return verses


def main(corpus_specification, top_in_dir, top_out_dir):
    """Build the train/dev/test corpus files described by a specification.

    Args:
        corpus_specification: Path to a JSON file.  It must have the keys
            ``'train'``, ``'dev'`` and ``'test'``, each mapping a meter name
            (a key of ``azm.meters.ALL_METERS``) to a list of paths.
        top_in_dir: Directory the paths in the specification are relative to.
        top_out_dir: Directory that receives one output file per split
            (named ``train``, ``dev`` and ``test``).  It is created if it
            does not exist yet.

    Raises:
        KeyError: If a split is missing from the specification or a meter
            name is not present in ``azm.meters.ALL_METERS``.
    """
    # Use a context manager so the specification file is closed promptly
    # instead of being left to the garbage collector.
    with open(corpus_specification) as spec_file:
        spec = json.load(spec_file)
    os.makedirs(top_out_dir, exist_ok=True)
    for split in ['train', 'dev', 'test']:
        print('Split {}'.format(split))
        split_file = os.path.join(top_out_dir, split)
        instances = []
        for meter_name, paths in spec[split].items():
            print('Processing meter {}'.format(meter_name))
            meter = azm.meters.ALL_METERS[meter_name]
            for path in paths:
                print('  Processing path {}'.format(path))
                # All verses of a file are parsed (and parse errors
                # reported) before any of them is matched.
                for verse in _load_verses(os.path.join(top_in_dir, path)):
                    print('    Processing verse {}'.format(verse.text))
                    # Only the first reading of each verse is matched
                    # against the meter.
                    matches = meter.match_reading(verse.readings[0])
                    instances.append([meter_name, verse, matches])
        # NOTE(review): the verse objects and match results are passed to
        # json.dump unchanged -- confirm they are JSON-serializable.
        with open(split_file, 'w') as f:
            json.dump(instances, f)


def parse_args_and_main():
    """Read the three positional command-line arguments and run ``main``.

    The arguments are, in order: the corpus specification file, the top
    level input directory and the top level output directory.
    """
    arg_parser = argparse.ArgumentParser(
        description=('Create a train-dev-test split corpus from a corpus '
                     'specfication')
    )
    positional_arguments = (
        ('corpus_specification',
         'JSON specifying the splits and their files'),
        ('top_in_dir',
         'Top level directory of the extracted json files'),
        ('top_out_dir',
         'Top level directory of the created JSON files'),
    )
    for arg_name, help_text in positional_arguments:
        arg_parser.add_argument(arg_name, help=help_text)

    namespace = arg_parser.parse_args()
    main(
        corpus_specification=namespace.corpus_specification,
        top_in_dir=namespace.top_in_dir,
        top_out_dir=namespace.top_out_dir,
    )


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    parse_args_and_main()