Skip to content
Snippets Groups Projects
create_feature_vectors.py 2.94 KiB
Newer Older
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import argparse
import json
import sys
import traceback

from unidecode import unidecode

import allzweckmesser as azm


def main(meter_reference_verses, outfile, meters=None):
    """Create labelled feature vectors for reference verses and dump them.

    Every reference verse is scanned. Each reading-meter combination
    returned by ``azm.meters.get_reading_meter_combinations`` is turned into
    a feature vector and labelled 1 if both its meter and its schema agree
    with the reference, otherwise 0. The result is written to ``outfile``
    as a JSON list of ``[verse_text, ref_meter, instances]`` entries.

    Verses whose scan raises an exception are reported on stderr and left
    out of the output.

    Args:
        meter_reference_verses: Sized iterable of
            ``(ref_meter, ref_verse, correct)`` triples. ``ref_verse`` must
            provide ``text`` and a non-empty ``readings``; ``ref_meter`` must
            provide ``short_name``. The third element is not used here.
        outfile: Path of the JSON file to write.
        meters: Meter names to consider, looked up in
            ``azm.meters.ALL_METERS``; unknown names are silently ignored.
            ``None`` (the default) means ``['hexameter']``.
    """
    # ``None`` instead of a list literal avoids a shared mutable default and
    # also covers callers that pass ``meters=None`` explicitly.
    if meters is None:
        meters = ['hexameter']
    known_meters = [
        azm.meters.ALL_METERS[meter]
        for meter in meters
        if meter in azm.meters.ALL_METERS
    ]
    scanner = azm.scanner.Scanner()
    total_instances = len(meter_reference_verses)
    out = []
    for i, (ref_meter, ref_verse, _correct) in enumerate(
            meter_reference_verses, 1):
        print('Processing verse {} ({}/{})'
              .format(ref_verse.text, i, total_instances))
        # The first reading of the reference verse is the gold standard.
        ref_schema = ref_verse.readings[0].get_schema()
        try:
            analysis = scanner.scan_verses([unidecode(ref_verse.text)])[0]
        except Exception:
            print('ERROR when scanning verse {!r}'.format(ref_verse),
                  file=sys.stderr)
            traceback.print_exc()
            # Without an analysis there is nothing to featurize; carrying on
            # would reuse the previous verse's analysis (or hit a NameError
            # on the first verse).
            continue

        instances = []
        reading_meter_combinations = (
            azm.meters.get_reading_meter_combinations(
                analysis.readings, known_meters
            )
        )
        for reading, meter, rmfeatures in reading_meter_combinations:
            features = azm.features.combine_features(
                reading.features, rmfeatures)
            # A feature vector gets a correct label if the schema matches
            # the reference reading's schema and the meter matches the
            # reference meter.
            reading_is_correct = int(
                meter.short_name == ref_meter.short_name
                and reading.get_schema() == ref_schema
            )
            instances.append((features, reading_is_correct))
        # NOTE(review): ref_meter is read via ``.short_name`` above and is
        # also handed to json.dump below -- confirm it is JSON-serializable.
        out.append((ref_verse.text, ref_meter, instances))

    with open(outfile, 'w') as f:
        json.dump(out, f, indent=2)


def read_infile(infile):
    """Load the reference verses from a JSON file.

    The file must contain a JSON list of ``[meter, verse_dict, correct]``
    triples. Each ``verse_dict`` is converted with
    ``azm.model.Verse.from_json``; ``meter`` and ``correct`` are passed
    through unchanged.

    Args:
        infile: Path of the JSON file to read.

    Returns:
        A list of ``(meter, verse, correct)`` tuples in file order.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default
    # encoding, which differs between platforms.
    with open(infile, encoding='utf-8') as f:
        records = json.load(f)
    return [
        (meter, azm.model.Verse.from_json(verse_dict), correct)
        for meter, verse_dict, correct in records
    ]


def parse_args_and_main():
    """Parse the command line, load the reference verses and run ``main``.

    Positional arguments are the input JSON file and the output JSON file.
    ``--meters``/``-m`` takes one or more meter names and falls back to
    ``['hexameter']`` when omitted.
    """
    d = 'Generate feature vectors for reading-meter combinations'
    parser = argparse.ArgumentParser(description=d)
    # An explicit default is required: without it argparse yields None, which
    # would be forwarded to main() and override main's own default.
    parser.add_argument('--meters', '-m', nargs='+', default=['hexameter'],
                        help='Meters to consider when scanning.')
    parser.add_argument('infile',
                        help='JSON file containing the reference verses')
    parser.add_argument('outfile',
                        help='JSON file for the output')
    args = parser.parse_args()
    main(
        meter_reference_verses=read_infile(args.infile),
        outfile=args.outfile,
        meters=args.meters,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    parse_args_and_main()