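"""Tests for the azm scanner and word-list modules."""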
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import azm.db
import azm.model
import azm.scanner
import azm.wordlist

FormAnalysis = azm.db.FormAnalysis

# An in-memory SQLite database keeps the tests self-contained.
DB_CONFIG = {'dialect': 'sqlite', 'file': ':memory:'}
ENGINE = create_engine(azm.db.get_db_uri(DB_CONFIG))
SESSION_FACTORY = sessionmaker(bind=ENGINE)
azm.db.BASE.metadata.create_all(ENGINE)
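
# A WordList wired to the in-memory database session factory defined above.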
@pytest.fixture()
def word_list():
    return azm.wordlist.WordList(session_factory=SESSION_FACTORY)

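# Four Latin verses, pre-tokenized with azm.scanner.tokenize().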
@pytest.fixture()
def tokenized_verses():
    verses = [
        'nunc dum tibi lubet licetque pota perde rem',
        'antehac est habitus parcus nec magis continens',
        "clamavit moriens lingua: 'Corinna, vale!'",
        'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
    ]
    return [azm.scanner.tokenize(verse) for verse in verses]

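# tokenize() should split each verse into word and punctuation tokens whose
# .text attributes match the expected lists below.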
def test_tokenize(tokenized_verses):
    expected = [
        ['nunc', 'dum', 'tibi', 'lubet', 'licetque', 'pota', 'perde', 'rem'],
        ['antehac', 'est', 'habitus', 'parcus', 'nec', 'magis', 'continens'],
        ['clamavit', 'moriens', 'lingua', ':', "'", 'Corinna', ',', 'vale',
         '!', "'"],
        ['an', ',', 'quod', 'ubique', ',', 'tuum', 'est', '?', 'tua', 'sunt',
         'Heliconia', 'Tempe', '?']
    ]
    for tokens, expected_tokens in zip(tokenized_verses, expected):
        plain_tokens = [t.text for t in tokens]
        assert plain_tokens == expected_tokens

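# get_clitic() should split an enclitic such as -que or -ne off a token,
# returning (stem, clitic), or (token, None) when no clitic is present.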
def test_get_clitic():
    assert azm.scanner.get_clitic('licetque') == ('licet', 'que')
    assert azm.scanner.get_clitic('Estne') == ('Est', 'ne')
    assert azm.scanner.get_clitic('querela') == ('querela', None)

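# lemmatize() should return a list of readings, each one covering every token
# of the verse it was built from.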
def test_lemmatize(tokenized_verses, word_list):
    readings = azm.scanner.lemmatize(word_list,
                                     azm.model.Reading(tokenized_verses[0]))
    assert len(readings) == 1
    assert all(len(r) == len(tokenized_verses[0]) for r in readings)

    readings = azm.scanner.lemmatize(word_list,
                                     azm.model.Reading(tokenized_verses[2]))
    assert len(readings) == 4
    assert all(len(r) == len(tokenized_verses[2]) for r in readings)

# TODO: Make this test more detailed.
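# multiply_readings() should expand a list of readings to the requested
# number of readings, each the same length as the original.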
def test_multiply_readings(tokenized_verses):
    tokens = tokenized_verses[0]
    reading_len = len(tokens)
    readings = [azm.model.Reading(tokens)]
    assert len(readings) == 1
    multiplied_readings = azm.scanner.multiply_readings(readings, 4)
    assert len(multiplied_readings) == 4
    assert all(len(reading) == reading_len for reading in multiplied_readings)

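# condense_analyses() should collapse a set of FormAnalysis records into a
# nested dict keyed by accented form, then lemma, with sets of morphtags as
# the values.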
def test_condense_analyses():
    ancilla_analyses = {
        FormAnalysis(form='ancilla', morphtag='n-s---fb-',
                     lemma='ancilla', accented='ancilla_'),
        FormAnalysis(form='ancilla', morphtag='n-s---fn-',
                     lemma='ancilla', accented='ancilla'),
        FormAnalysis(form='ancilla', morphtag='n-s---fv-',
                     lemma='ancilla', accented='ancilla')
    }
    condensed = azm.scanner.condense_analyses(ancilla_analyses)
    assert isinstance(condensed, dict)
    assert all(isinstance(accented, str)
               and isinstance(lemma_to_morphtags, dict)
               for accented, lemma_to_morphtags in condensed.items())
    assert condensed == {
        'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
        'ancilla_': {'ancilla': {'n-s---fb-'}}
    }