Skip to content
Snippets Groups Projects
scanner.py 6.43 KiB
Newer Older
Simon Will's avatar
Simon Will committed
# -*- coding: utf-8 -*-

Simon Will's avatar
Simon Will committed
import copy
import re
from typing import Dict, List, Optional, Set, Tuple
Simon Will's avatar
Simon Will committed

from .db import FormAnalysis
from .model import Reading, Token, Verse
Simon Will's avatar
Simon Will committed
from .wordlist import WordList

# A handful of Latin verses. None of the functions below reads this
# list (scan_verses builds its own local list); presumably it is
# sample input for manual experiments.
verses = [
    "nunc dum tibi lubet licetque pota perde rem",
    "antehac est habitus parcus nec magis continens",
    "clamavit moriens lingua: 'Corinna, vale!'",
    "an, quod ubique, tuum est? tua sunt Heliconia Tempe?",
]

# Suffixes that get_clitic tries to split off a token. They are tried
# in this order and the first match wins, so 'que' and 'qve' have to
# come before the shorter 'ue' and 've' they end in.
CLITICS = ["que", "qve", "ue", "ve", "ne"]


Simon Will's avatar
Simon Will committed
def get_clitic(token: str) -> Tuple[str, Optional[str]]:
    """Split a clitic from the token if possible.

    The suffixes in ``CLITICS`` are tried in order and the first one
    the token ends with decides the result. A token that consists of
    nothing but that suffix (e.g. the word "ne") is reported as having
    no clitic, because splitting it would leave an empty word.

    :param token: A token that may contain a clitic.
    :return: A tuple of token without clitic and clitic, if a clitic
        was found. Or a tuple of the original token and None if no
        clitic was found.
    """
    for clitic in CLITICS:
        if token.endswith(clitic):
            bare = token[:-len(clitic)]
            if not bare:
                # The whole token is the suffix: there is no host word
                # the clitic could be attached to.
                break
            return bare, clitic
    return token, None


def multiply_readings(readings: List[Reading],
                      n: int) -> List[Reading]:
    """Copy the readings n - 1 times.

    The passed list is extended in place and also returned. The copies
    are appended block-wise: after the call the list consists of the
    original readings followed by n - 1 blocks, each of which holds
    one copy of every original reading in the original order. Every
    copy is a new Reading built from shallow copies of the original's
    tokens. For n <= 1 nothing is appended.

    :param readings: The readings that are to be multiplied.
    :param n: The number with which to multiply.
    :return: n times as many readings as they were before.
    """
    # Snapshot the originals so that the copies appended below are not
    # themselves copied again.
    originals = list(readings)
    for _ in range(n - 1):
        # TODO: Think about moving this to Reading in model.py
        readings.extend(
            Reading([copy.copy(token) for token in original.tokens])
            for original in originals
        )
    return readings
def tokenize(plain_verse: str) -> List[Token]:
    """Tokenize a verse.

    This function first splits on whitespace and then further on
    punctuation. Punctuation marks are regarded as tokens and are
    therefore included in the list of returned tokens; every
    punctuation character becomes a token of its own. Non-word
    characters and the underscore count as punctuation at both ends
    of a word.

    Each token is created with its text and a ``(start, end)`` tuple
    of character offsets into ``plain_verse``.

    :param plain_verse: The verse that is to be tokenized.
    :return: A list of the found tokens.
    """
    tokens = []
    i = 0  # Index into the whole verse.
    # Splitting on every single whitespace character (not on runs)
    # keeps i in step with the verse: each separator advances i by one.
    for chunk in re.split(r'\s', plain_verse):
        if chunk:
            # Peel the punctuation off both ends. The trailing run is
            # searched only in what is left after the leading run, so
            # a chunk made up entirely of punctuation is not emitted
            # twice.
            lead = re.match(r'[\W_]*', chunk).group()
            rest = chunk[len(lead):]
            trail_match = re.search(r'[\W_]+$', rest)
            trail = trail_match.group() if trail_match else ''
            word = rest[:len(rest) - len(trail)]

            # Add Tokens for the punctuation before the word.
            for c in lead:
                tokens.append(Token(c, (i, i + 1)))
                i += 1

            # Add a Token for the word itself, unless the chunk was
            # punctuation only.
            if word:
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)

            # Add Tokens for the punctuation after the word.
            for c in trail:
                tokens.append(Token(c, (i, i + 1)))
                i += 1
        # Account for the whitespace character that ended this chunk.
        i += 1
    return tokens


def condense_analyses(
        analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]:
    """Group analyses by accented form and lemma.

    :param analyses: The analyses that are to be grouped. Of every
        analysis the attributes accented, lemma and morphtag are read.
    :return: A nested dict. The outer keys are the accented forms,
        the inner keys are the lemmas found for that accented form
        and each value is the set of morphtags found for that
        combination.
    """
    by_accented = {}
    for analysis in analyses:
        # Create the missing levels on the fly, then record the tag.
        by_lemma = by_accented.setdefault(analysis.accented, {})
        morphtags = by_lemma.setdefault(analysis.lemma, set())
        morphtags.add(analysis.morphtag)
    return by_accented


def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
    """Find different possible readings by analyzing the word forms.

    This function analyzes the word forms in the verse and creates
    readings for all possible combinations of accented versions of the
    words. E.g. if two words occur with more than one accented
    version, say one with two accented versions and the other with
    three accented versions, a total of six readings will be
    generated.

    Punctuation tokens and tokens without any analysis are taken over
    unchanged. If a form has no analysis as a whole, the lookup is
    repeated without a trailing clitic; in that case the clitic is
    stored on the token of the passed-in reading itself.

    :param word_list: The word list to look up the word forms.
    :param reading: A basic reading of a verse that is to be analyzed.
    :return: A list of readings of the verse that differ with respect
        to the accented versions for the forms.

    """
    token_alternatives = []
    for token in reading.tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                # Unknown as a whole: retry without a trailing clitic.
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)

        # One entry per accented version of this token, or the token
        # itself if there is nothing to choose from.
        alternatives = []
        if analyses:
            condensed_analyses = condense_analyses(analyses)
            for accented, lemma_to_morphtags in condensed_analyses.items():
                # The token should not have any syllables at this
                # point so that the question of copy vs deepcopy
                # does not even arise.
                t = copy.copy(token)
                t.accented = accented
                t.lemma_to_morphtags = lemma_to_morphtags
                alternatives.append(t)
        else:
            alternatives.append(token)
        token_alternatives.append(alternatives)

    # Build the cross product token by token: multiply_readings
    # appends its copies block-wise, so the i-th block of
    # orig_readings_len readings receives the i-th alternative.
    readings = [Reading()]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        for i, alternative in enumerate(alternatives):
            start = i * orig_readings_len
            for expanded in readings[start:start + orig_readings_len]:
                expanded.append_token(alternative)
    return readings
Simon Will's avatar
Simon Will committed
class Scanner:
    """Turn plain verses into Verse objects with lemmatized readings."""

    def __init__(self):
        # The word list every verse is looked up in.
        self.word_list = WordList()

    def scan_verses(self, plain_verses: List[str]):
        """Tokenize and lemmatize the given verses.

        :param plain_verses: The verses that are to be scanned, one
            string per verse.
        :return: A list with one Verse per given verse, in the given
            order. Each Verse is created from the plain verse and the
            readings that lemmatize found for it.
        """
        base_readings = [Reading(tokens=tokenize(v)) for v in plain_verses]
        # Not called "verses" so the module-level list of that name is
        # not shadowed.
        scanned_verses = [
            Verse(verse=v, readings=lemmatize(self.word_list, br))
            for v, br in zip(plain_verses, base_readings)
        ]
        return scanned_verses