import copy
import re
from typing import List

from .model import Token
from .wordlist import WordList
verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]
CLITICS = ['que', 'qve', 'ue', 've', 'ne']
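# Note that order matters: 'que' must be tried before 'ue', otherwise
# e.g. 'virumque' would be split as ('virumq', 'ue').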
def get_clitic(token):
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    return token, None
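# A quick sanity check (a sketch; the expected splits follow from the
# CLITICS list above):
#
#   get_clitic('virumque')  # -> ('virum', 'que')
#   get_clitic('arma')      # -> ('arma', None)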
def multiply_readings(readings: List[List[Token]],
                      n: int) -> List[List[Token]]:
    """Copy the readings n - 1 times.

    :param readings: The readings that are to be multiplied.
    :param n: The factor by which to multiply.
    :return: n times as many readings as there were before.
    """
    orig_readings_len = len(readings)
    for _ in range(n - 1):
        for i in range(orig_readings_len):
            new_reading = [copy.copy(token)
                           for token in readings[i]]
            readings.append(new_reading)
    return readings
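# For example (a sketch; t1 and t2 stand for arbitrary Token objects):
#
#   multiply_readings([[t1], [t2]], 2)
#   # -> [[t1], [t2], [copy of t1], [copy of t2]]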
def tokenize(plain_verse):
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'[\W_]+$', token)
            if post_punct_match:
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        i += 1
    return tokens
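# For example (a sketch; spans are (start, end) indices into the verse):
#
#   tokenize("Corinna, vale!")
#   # -> Token('Corinna', (0, 7)), Token(',', (7, 8)),
#   #    Token('vale', (9, 13)), Token('!', (13, 14))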
def lemmatize_verses(word_list, tokens):
    token_alternatives = []
    for token in tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)
        alternatives = []
        if analyses:
            for a in analyses:
                # The token should not have any syllables at this
                # point, so the question of copy vs. deepcopy does
                # not even arise.
                t = copy.copy(token)
                t.analysis = a
                alternatives.append(t)
        else:
            alternatives.append(token)
        token_alternatives.append(alternatives)
    readings = [[]]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        for i, token in enumerate(alternatives):
            start = i * orig_readings_len
            for reading in readings[start:start + orig_readings_len]:
                reading.append(token)
    return readings
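# The reading expansion amounts to a Cartesian product over the per-token
# alternatives. A sketch with two tokens, where the first is ambiguous
# (analyses a1, a2) and the second is not (b1):
#
#   token_alternatives = [[a1, a2], [b1]]
#   # readings grows: [[]] -> [[a1], [a2]] -> [[a1, b1], [a2, b1]]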
class Scanner:
    def __init__(self, plain_verses):
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]