import copy
import re

from .model import Token
from .wordlist import WordList

verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]

# Enclitics that may be attached to the end of a word; 'qve' and 'ue'
# cover the interchangeable u/v spellings.
CLITICS = ['que', 'qve', 'ue', 've', 'ne']

def get_clitic(token):
    # Split an enclitic off the end of a token. The else-branch of the
    # for loop only runs when no clitic matched.
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    else:
        return token, None

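# A quick, illustrative check of get_clitic (these words are not from
# the sample verses): an attached -que is split off, and a word without
# a clitic passes through unchanged.
assert get_clitic('armaque') == ('arma', 'que')
assert get_clitic('arma') == ('arma', None)
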
def multiply_readings(readings, n):
    # Return the readings repeated n times. Each repetition must be a
    # distinct list: with `readings * n` the copies would share their
    # inner lists, and appending a token to one reading would append it
    # to all of its copies as well.
    return [list(reading) for _ in range(n) for reading in readings]

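# Toy example of the multiplication: two readings times two give four
# independent lists, ordered as [r0, r1, r0, r1].
assert multiply_readings([['a'], ['b']], 2) == [['a'], ['b'], ['a'], ['b']]
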
def tokenize(plain_verse):
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'\W+$', token)
            if post_punct_match:
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        # Account for the whitespace separator after this token.
        i += 1
    return tokens

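# tokenize() on the third sample verse: punctuation turns into
# one-character Tokens on either side of the words. Only the .text
# attribute is checked here; the (start, end) span is the second
# constructor argument, whose attribute name is not shown above.
demo_tokens = tokenize("clamavit moriens lingua: 'Corinna, vale!'")
assert [t.text for t in demo_tokens][:5] == \
    ['clamavit', 'moriens', 'lingua', ':', "'"]
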
class Scanner:
    def __init__(self, plain_verses):
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]
        # lemmatize_verses() needs a word list to look analyses up in;
        # this assumes WordList can be constructed without arguments.
        self.wordlist = WordList()

    def lemmatize_verses(self, tokens):
        token_alternatives = []
        for token in tokens:
            analyses = self.wordlist.analyze(token.text)
            if not analyses:
                # No direct hit; try again with any enclitic removed.
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = self.wordlist.analyze(bare)
            alternatives = []
            if analyses:
                for a in analyses:
                    # The token should not have any syllables at this
                    # point so that the question of copy vs deepcopy
                    # does not even arise.
                    t = copy.copy(token)
                    t.analysis = a
                    alternatives.append(t)
            else:
                alternatives.append(token)
            token_alternatives.append(alternatives)
        # Generate readings from token_alternatives: every combination
        # of one alternative per token, built up by repeatedly
        # multiplying the readings found so far.
        readings = [[]]
        for alternatives in token_alternatives:
            prev_len_readings = len(readings)
            readings = multiply_readings(readings, len(alternatives))
            for i, token in enumerate(alternatives):
                start = i * prev_len_readings
                for reading in readings[start:start + prev_len_readings]:
                    reading.append(token)
        return readings
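
# End-to-end sketch over the sample verses. This assumes the .model and
# .wordlist modules are importable and that WordList.analyze() returns a
# (possibly empty) list of analyses for a word string; it is a usage
# outline rather than a test.
if __name__ == '__main__':
    scanner = Scanner(verses)
    for tokens in scanner.tokenized_verses:
        readings = scanner.lemmatize_verses(tokens)
        print(len(readings), 'reading(s) for:',
              ' '.join(t.text for t in tokens))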