# -*- coding: utf-8 -*-
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import allzweckmesser as azm

FormAnalysis = azm.db.FormAnalysis

# Shared in-memory SQLite database for all tests in this module.
DB_CONFIG = {'dialect': 'sqlite', 'file': ':memory:'}
ENGINE = create_engine(azm.db.get_db_uri(DB_CONFIG))
SESSION_FACTORY = sessionmaker(bind=ENGINE)
azm.db.BASE.metadata.create_all(ENGINE)


@pytest.fixture
def word_list():
    word_list = azm.wordlist.WordList(session_factory=SESSION_FACTORY)
    return word_list


@pytest.fixture
def tokenized_verses():
    verses = [
        'nunc dum tibi lubet licetque pota perde rem',
        'antehac est habitus parcus nec magis continens',
        "clamavit moriens lingua: 'Corinna, vale!'",
        'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
    ]
    return [azm.scanner.tokenize(verse) for verse in verses]


def test_tokenize(tokenized_verses):
    expected = [
        ['nunc', 'dum', 'tibi', 'lubet', 'licetque', 'pota', 'perde', 'rem'],
        ['antehac', 'est', 'habitus', 'parcus', 'nec', 'magis', 'continens'],
        ['clamavit', 'moriens', 'lingua', ':', "'", 'Corinna', ',', 'vale',
         '!', "'"],
        ['an', ',', 'quod', 'ubique', ',', 'tuum', 'est', '?', 'tua', 'sunt',
         'Heliconia', 'Tempe', '?'],
    ]
    for tokens, expected_tokens in zip(tokenized_verses, expected):
        plain_tokens = [t.text for t in tokens]
        assert plain_tokens == expected_tokens


def test_get_clitic():
    assert azm.scanner.get_clitic('licetque') == ('licet', 'que')
    assert azm.scanner.get_clitic('Estne') == ('Est', 'ne')
    assert azm.scanner.get_clitic('querela') == ('querela', None)


def test_condense_analyses():
    ancilla_analyses = {
        FormAnalysis(form='ancilla', morphtag='n-s---fb-', lemma='ancilla',
                     accented='ancilla_'),
        FormAnalysis(form='ancilla', morphtag='n-s---fn-', lemma='ancilla',
                     accented='ancilla'),
        FormAnalysis(form='ancilla', morphtag='n-s---fv-', lemma='ancilla',
                     accented='ancilla'),
    }
    condensed = azm.scanner.condense_analyses(ancilla_analyses)
    assert isinstance(condensed, dict)
    assert all(
        isinstance(accented, str) and isinstance(lemma_to_morphtags, dict)
        for accented, lemma_to_morphtags in condensed.items()
    )
    assert condensed == {
        'ancilla': {'ancilla': {'n-s---fn-', 'n-s---fv-'}},
        'ancilla_': {'ancilla': {'n-s---fb-'}},
    }


def test_lemmatize(tokenized_verses, word_list):
    readings = azm.scanner.lemmatize(
        word_list, azm.model.Reading(tokenized_verses[0]))
    assert len(readings) == 2
    assert all(len(r) == len(tokenized_verses[0]) for r in readings)

    readings = azm.scanner.lemmatize(
        word_list, azm.model.Reading(tokenized_verses[2]))
    assert len(readings) == 4
    assert all(len(r) == len(tokenized_verses[2]) for r in readings)


# TODO: Make this test more detailed.
def test_multiply_readings(tokenized_verses):
    tokens = tokenized_verses[0]
    reading_len = len(tokens)
    readings = [azm.model.Reading(tokens)]
    assert len(readings) == 1

    multiplied_readings = azm.scanner.multiply_readings(readings, 4)
    assert len(multiplied_readings) == 4
    assert all(len(reading) == reading_len for reading in multiplied_readings)
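

# One possible direction for the TODO above: a minimal sketch of a more
# detailed check. It assumes that multiply_readings copies the token
# sequence verbatim into each returned reading, and that a Reading is
# iterable over its tokens (len(reading) works above, which makes this
# plausible, but both are assumptions about azm.model.Reading rather
# than documented behavior).
def test_multiply_readings_token_texts(tokenized_verses):
    tokens = tokenized_verses[0]
    multiplied_readings = azm.scanner.multiply_readings(
        [azm.model.Reading(tokens)], 3)
    assert len(multiplied_readings) == 3
    expected_texts = [t.text for t in tokens]
    for reading in multiplied_readings:
        # Assumption: iterating a Reading yields its tokens in verse order.
        assert [t.text for t in reading] == expected_texts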