Skip to content
Snippets Groups Projects
Commit 6db95753 authored by Simon Will's avatar Simon Will
Browse files

Correctly handle ambiguous morpheus forms; fixes #29

parent 7919f9e4
No related branches found
No related tags found
No related merge requests found
......@@ -90,6 +90,28 @@ def tokenize(plain_verse: str) -> List[Token]:
return tokens
def blow_up_accented(accented: str) -> List[str]:
    """Expand an accented form with ambiguous markers into all variants.

    An ambiguous position is any two consecutive characters drawn from
    ``_`` and ``^`` (e.g. ``_^``).  Each such position is replaced by a
    single ``^`` or a single ``_``, and every combination over all
    ambiguous positions is generated.  All other text of the form --
    before, between and after the ambiguous positions -- is kept as is.

    NOTE(review): presumably ``_`` marks a long and ``^`` a short vowel
    in the Morpheus output -- confirm against the Morpheus documentation.

    Args:
        accented: The accented form as delivered by the analysis.

    Returns:
        A list of unambiguous accented forms.  If ``accented`` contains
        no ambiguous position, the list holds just ``accented`` itself.
        Otherwise it holds ``2 ** k`` forms for ``k`` ambiguous
        positions, ordered with ``^`` before ``_`` at each position
        (leftmost position varying slowest).
    """
    # Splitting on the ambiguous markers keeps *all* surrounding text:
    # the part before the first marker, the parts between markers and
    # the part after the last one.
    parts = re.split(r'[_^]{2}', accented)
    if len(parts) == 1:
        # The accented form is unambiguous.
        return [accented]

    head, tail = parts[0], parts[1:]
    blown_up = []
    for combi in product('^_', repeat=len(tail)):
        # Interleave the chosen markers with the text following them.
        # Plain joining (instead of str.format) keeps literal braces in
        # the form from being misread as format fields.
        pieces = [head]
        for marker, text in zip(combi, tail):
            pieces.append(marker)
            pieces.append(text)
        blown_up.append(''.join(pieces))
    return blown_up
def condense_analyses(
analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]:
"""Condense analyses objects into a nested dict representation.
......@@ -101,13 +123,14 @@ def condense_analyses(
"""
condensed = {}
for a in analyses:
if a.accented in condensed:
if a.lemma in condensed[a.accented]:
condensed[a.accented][a.lemma].add(a.morphtag)
for accented in blow_up_accented(a.accented):
if accented in condensed:
if a.lemma in condensed[accented]:
condensed[accented][a.lemma].add(a.morphtag)
else:
condensed[accented][a.lemma] = {a.morphtag}
else:
condensed[a.accented][a.lemma] = {a.morphtag}
else:
condensed[a.accented] = {a.lemma: {a.morphtag}}
condensed[accented] = {a.lemma: {a.morphtag}}
return condensed
......@@ -292,7 +315,6 @@ def make_elisions(verse):
return verse
def parse_verse(verse):
"""Annotates syllable lengths based on positional_lengthening and muta cum liquida
......
......@@ -50,10 +50,30 @@ def test_get_clitic():
assert azm.scanner.get_clitic('querela') == ('querela', None)
def test_condense_analyses():
ancilla_analyses = {
FormAnalysis(form='ancilla', morphtag='n-s---fb-',
lemma='ancilla', accented='ancilla_'),
FormAnalysis(form='ancilla', morphtag='n-s---fn-',
lemma='ancilla', accented='ancilla'),
FormAnalysis(form='ancilla', morphtag='n-s---fv-',
lemma='ancilla', accented='ancilla')
}
condensed = azm.scanner.condense_analyses(ancilla_analyses)
assert isinstance(condensed, dict)
assert all(isinstance(accented, str)
and isinstance(lemma_to_morphtags, dict)
for accented, lemma_to_morphtags in condensed.items())
assert condensed == {
'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
'ancilla_': {'ancilla': {'n-s---fb-'}}
}
def test_lemmatize(tokenized_verses, word_list):
readings = azm.scanner.lemmatize(word_list,
azm.model.Reading(tokenized_verses[0]))
assert len(readings) == 1
assert len(readings) == 2
assert all(len(r) == len(tokenized_verses[0]) for r in readings)
readings = azm.scanner.lemmatize(word_list,
......@@ -71,23 +91,3 @@ def test_multiply_readings(tokenized_verses):
multiplied_readings = azm.scanner.multiply_readings(readings, 4)
assert len(multiplied_readings) == 4
assert all(len(reading) == reading_len for reading in multiplied_readings)
def test_condense_analyses():
ancilla_analyses = {
FormAnalysis(form='ancilla', morphtag='n-s---fb-',
lemma='ancilla', accented='ancilla_'),
FormAnalysis(form='ancilla', morphtag='n-s---fn-',
lemma='ancilla', accented='ancilla'),
FormAnalysis(form='ancilla', morphtag='n-s---fv-',
lemma='ancilla', accented='ancilla')
}
condensed = azm.scanner.condense_analyses(ancilla_analyses)
assert isinstance(condensed, dict)
assert all(isinstance(accented, str)
and isinstance(lemma_to_morphtags, dict)
for accented, lemma_to_morphtags in condensed.items())
assert condensed == {
'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
'ancilla_': {'ancilla': {'n-s---fb-'}}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment