diff --git a/allzweckmesser/scanner.py b/allzweckmesser/scanner.py
index 97ced9b6537d64edb4bef9956fa3fd67546b6a2b..1be933c3c8420efbfc95bfcac765d1762631b006 100644
--- a/allzweckmesser/scanner.py
+++ b/allzweckmesser/scanner.py
@@ -90,6 +90,30 @@ def tokenize(plain_verse: str) -> List[Token]:
     return tokens
 
 
+def blow_up_accented(accented):
+    matches = list(re.finditer(r'[_^]{2}', accented))
+    if matches:
+        # Generate blueprint, keeping the text before, between,
+        # and after the ambiguous length marks.
+        blueprint = [accented[:matches[0].start()]]
+        for m, next_m in zip(matches, matches[1:]):
+            blueprint.append('{}' + accented[m.end():next_m.start()])
+        blueprint.append('{}' + accented[matches[-1].end():])
+        blueprint = ''.join(blueprint)
+
+        # Fill blueprint with variants of accented form.
+        combinations = product([0, 1], repeat=len(matches))
+        blown_up = []
+        for combi in combinations:
+            format_args = ['_' if i == 1 else '^'
+                           for i in combi]
+            blown_up.append(blueprint.format(*format_args))
+    else:
+        # The accented form is unambiguous.
+        blown_up = [accented]
+    return blown_up
+
+
 def condense_analyses(
         analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]:
     """Condense analyses objects into a nested dict representation.
@@ -101,13 +125,14 @@ def condense_analyses(
     """
     condensed = {}
     for a in analyses:
-        if a.accented in condensed:
-            if a.lemma in condensed[a.accented]:
-                condensed[a.accented][a.lemma].add(a.morphtag)
+        for accented in blow_up_accented(a.accented):
+            if accented in condensed:
+                if a.lemma in condensed[accented]:
+                    condensed[accented][a.lemma].add(a.morphtag)
+                else:
+                    condensed[accented][a.lemma] = {a.morphtag}
             else:
-                condensed[a.accented][a.lemma] = {a.morphtag}
-        else:
-            condensed[a.accented] = {a.lemma: {a.morphtag}}
+                condensed[accented] = {a.lemma: {a.morphtag}}
     return condensed
 
 
@@ -292,7 +317,6 @@ def make_elisions(verse):
     return verse
 
 
-
 def parse_verse(verse):
     """Annotates syllable lengths based on positional_lengthening and muta
     cum liquida
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
index fa75abd3c8261535f7d255dd36ef8106b9baed2c..29e7c4a2df524d3ef47df42141cdfad29419e8f3 100644
--- a/tests/test_scanner.py
+++ b/tests/test_scanner.py
@@ -50,10 +50,30 @@ def test_get_clitic():
     assert azm.scanner.get_clitic('querela') == ('querela', None)
 
 
+def test_condense_analyses():
+    ancilla_analyses = {
+        FormAnalysis(form='ancilla', morphtag='n-s---fb-',
+                     lemma='ancilla', accented='ancilla_'),
+        FormAnalysis(form='ancilla', morphtag='n-s---fn-',
+                     lemma='ancilla', accented='ancilla'),
+        FormAnalysis(form='ancilla', morphtag='n-s---fv-',
+                     lemma='ancilla', accented='ancilla')
+    }
+    condensed = azm.scanner.condense_analyses(ancilla_analyses)
+    assert isinstance(condensed, dict)
+    assert all(isinstance(accented, str)
+               and isinstance(lemma_to_morphtags, dict)
+               for accented, lemma_to_morphtags in condensed.items())
+    assert condensed == {
+        'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
+        'ancilla_': {'ancilla': {'n-s---fb-'}}
+    }
+
+
 def test_lemmatize(tokenized_verses, word_list):
     readings = azm.scanner.lemmatize(word_list,
                                      azm.model.Reading(tokenized_verses[0]))
-    assert len(readings) == 1
+    assert len(readings) == 2
     assert all(len(r) == len(tokenized_verses[0]) for r in readings)
 
     readings = azm.scanner.lemmatize(word_list,
@@ -71,23 +91,3 @@ def test_multiply_readings(tokenized_verses):
     multiplied_readings = azm.scanner.multiply_readings(readings, 4)
     assert len(multiplied_readings) == 4
     assert all(len(reading) == reading_len for reading in multiplied_readings)
-
-
-def test_condense_analyses():
-    ancilla_analyses = {
-        FormAnalysis(form='ancilla', morphtag='n-s---fb-',
-                     lemma='ancilla', accented='ancilla_'),
-        FormAnalysis(form='ancilla', morphtag='n-s---fn-',
-                     lemma='ancilla', accented='ancilla'),
-        FormAnalysis(form='ancilla', morphtag='n-s---fv-',
-                     lemma='ancilla', accented='ancilla')
-    }
-    condensed = azm.scanner.condense_analyses(ancilla_analyses)
-    assert isinstance(condensed, dict)
-    assert all(isinstance(accented, str)
-               and isinstance(lemma_to_morphtags, dict)
-               for accented, lemma_to_morphtags in condensed.items())
-    assert condensed == {
-        'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
-        'ancilla_': {'ancilla': {'n-s---fb-'}}
-    }
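
Note: a quick sanity check of the new blow_up_accented helper (illustration
only, not part of the change set). It assumes scanner.py already imports `re`
and `itertools.product`; 'volucri_^s' and 'a_^bc_^' are made-up accented forms
with word-internal '[_^]{2}' ambiguity marks:

    >>> blow_up_accented('ancilla_')    # no '[_^]{2}' pair: unambiguous
    ['ancilla_']
    >>> blow_up_accented('volucri_^s')  # one ambiguous vowel: two variants
    ['volucri^s', 'volucri_s']
    >>> blow_up_accented('a_^bc_^')     # two ambiguous vowels: four variants
    ['a^bc^', 'a^bc_', 'a_bc^', 'a_bc_']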