#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Extract the meters that occur in a Hypotactic corpus into a JSON file."""

import argparse
from collections import defaultdict
import json

import allzweckmesser as azm


def _record(stats, meter, title, count):
    """Add ``title`` and ``count`` occurrences to the entry of ``meter``."""
    entry = stats[meter]
    entry[0].add(title)
    entry[1] += count


def _collect_meters(corpus):
    """Gather per-meter statistics from every document of ``corpus``.

    Every entry of an element's ``class`` attribute other than the
    structural marker (``'poem'`` or ``'line'``) is treated as a meter
    name.

    Returns:
        A ``(poem_meters, line_meters)`` pair.  Both map a meter name to
        a two-item list ``[set of document titles, occurrence count]``.
    """
    poem_meters = defaultdict(lambda: [set(), 0])
    line_meters = defaultdict(lambda: [set(), 0])

    for document in corpus.documents:
        title = document.title
        print('Processing {}'.format(title))

        has_poems = False
        for poem in document.get_poems():
            has_poems = True
            meters = [c for c in poem.attrs['class'] if c != 'poem']
            if not meters:
                continue
            # The line count does not depend on the meter, so compute it
            # once per poem instead of once per meter.
            line_count = len(poem.find_all(name='div', class_='line'))
            for meter in meters:
                _record(poem_meters, meter, title, 1)
                # Every line of the poem is counted under the poem's meter.
                _record(line_meters, meter, title, line_count)

        # Lines are inspected individually only when the document yielded
        # no poems at all.
        if not has_poems:
            for line in document.get_lines():
                for meter in (c for c in line.attrs['class'] if c != 'line'):
                    _record(line_meters, meter, title, 1)

    return poem_meters, line_meters


def _serializable(stats):
    """Return ``stats`` as a plain dict with the title sets as sorted lists.

    Sets are not JSON serializable and iterate in an order that can change
    between runs; sorting makes the output file reproducible.
    ``key=str`` keeps the sort from failing should a title not be a string
    (NOTE(review): titles are presumably always strings -- confirm).
    """
    return {meter: [sorted(titles, key=str), count]
            for meter, (titles, count) in stats.items()}


def main(hypotactic_dir, outfile):
    """Extract all meters from a Hypotactic corpus and dump them as JSON.

    Args:
        hypotactic_dir: Top level directory of the Hypotactic corpus.
        outfile: Path of the JSON file to write.  It holds an object with
            the keys ``'poem_meters'`` and ``'line_meters'``; each maps a
            meter name to ``[list of document titles, occurrence count]``.
    """
    corpus = azm.corpus.HypotacticCorpus.from_directory(hypotactic_dir)
    poem_meters, line_meters = _collect_meters(corpus)

    print('Meters: {}'
          .format(set(poem_meters.keys()).union(line_meters.keys())))

    obj = {'poem_meters': _serializable(poem_meters),
           'line_meters': _serializable(line_meters)}
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=2)


def parse_args_and_main():
    """Parse the command line arguments and run :func:`main` with them."""
    d = 'Extract occurring meters from a Hypotactic corpus'
    parser = argparse.ArgumentParser(description=d)
    parser.add_argument('hypotactic_dir',
                        help='Top level directory of the Hypotactic corpus')
    parser.add_argument('outfile',
                        help='JSON file to save the extracted meters in')
    args = parser.parse_args()
    # The argument names match the parameter names of main().
    main(**vars(args))


if __name__ == '__main__':
    parse_args_and_main()