# scanner.py

# -*- coding: utf-8 -*-

import copy
import re
from typing import Dict, List, Set, Tuple
from itertools import product

from .db import FormAnalysis
from .model import Reading, Syllable, Token, Verse, Phenomenon
from .wordlist import WordList

CLITICS = ['que', 'qve', 'ue', 've', 'ne']


def _monosyllable(text, vowel_length):
    """Build a SPECIAL_CASES entry for a word that is one heavy syllable.

    The returned callable takes the token and yields a one-element list
    whose syllable reuses the token's own span object and always has
    ``syllable_length=2``.
    """
    def build(token):
        return [Syllable(text, token.span,
                         syllable_length=2, vowel_length=vowel_length)]
    return build


def _polysyllable(*parts):
    """Build a SPECIAL_CASES entry for a word with a fixed syllabification.

    Every part is ``(text, start, end, syllable_length, vowel_length)``
    where ``start`` and ``end`` are offsets relative to the first
    character of the token.
    """
    def build(token):
        return [
            Syllable(text, [token.span[0] + start, token.span[0] + end],
                     syllable_length=syllable_length,
                     vowel_length=vowel_length)
            for text, start, end, syllable_length, vowel_length in parts
        ]
    return build


# Maps a lowercased word form to a callable that receives the token and
# returns its hand-written list of syllables.
SPECIAL_CASES = {
    # Positional lengthening for historical reasons
    # TODO: But there are also hīc and hōc, which should have vowel_length=2.
    'hic': _monosyllable("hic", 1),
    'hoc': _monosyllable("hoc", 1),
}

# Forms with diphthong: a single syllable with a long vowel.
SPECIAL_CASES.update({
    form: _monosyllable(form, 2)
    for form in ('ceu', 'cui', 'ei', 'hei', 'heic', 'heus', 'heu',
                 'huic', 'hui', 'neu', 'seu')
})

# Forms with a fixed split into two syllables.
SPECIAL_CASES.update({
    'cuiquam': _polysyllable(("cui", 0, 3, 2, 2), ("quam", 3, 7, 1, 1)),
    # NOTE(review): the second syllable is spelled "quam" although the
    # key is written with "qv" -- confirm this normalisation is intended.
    'cuiqvam': _polysyllable(("cui", 0, 3, 2, 2), ("quam", 3, 7, 1, 1)),
    # NOTE(review): "que" is marked long (2/2) here, unlike "quam" in
    # 'cuiquam' -- confirm this is intended.
    'cuique': _polysyllable(("cui", 0, 3, 2, 2), ("que", 3, 6, 2, 2)),
    'deinde': _polysyllable(("deind", 0, 5, 2, 2), ("e", 5, 6, 1, 1)),
    'proinde': _polysyllable(("proind", 0, 6, 2, 2), ("e", 6, 7, 1, 1)),
    'necnon': _polysyllable(("nec", 0, 3, 2, 1), ("non", 3, 6, 2, 2)),
})


def get_clitic(token: str) -> Tuple[str, str]:
    """Separate a trailing clitic from a word form.

    The entries of ``CLITICS`` are tried in list order and the first
    one the token ends with is split off.

    :param token: The word form to inspect.
    :return: ``(stem, clitic)`` when the token ends with one of the
        known clitics, otherwise ``(token, None)``.
    """
    found = next(
        (candidate for candidate in CLITICS if token.endswith(candidate)),
        None,
    )
    if found is None:
        return token, None
    return token[:-len(found)], found


def multiply_readings(readings: List[Reading],
                      n: int) -> List[Reading]:
    """Grow a list of readings to n times its size, in place.

    The readings present on entry are duplicated ``n - 1`` times. Each
    duplicate is a new ``Reading`` built from deep copies of the
    original's tokens; the duplicates are appended block-wise, so the
    result is ``originals + copies + copies + ...``.

    :param readings: The readings to multiply. This list is extended
        in place.
    :param n: The multiplication factor. Values below 2 add nothing.
    :return: The same list object that was passed in.
    """
    # Snapshot the originals so the list can be extended while copying.
    originals = tuple(readings)
    for _ in range(n - 1):
        # TODO: Think about moving this to Reading in model.py
        readings.extend(
            Reading([copy.deepcopy(token) for token in original.tokens])
            for original in originals
        )
    return readings


def tokenize(plain_verse: str) -> List[Token]:
    """Tokenize a verse.

    The verse is split on every single whitespace character. Each
    non-empty chunk is then divided into leading punctuation, the word
    itself and trailing punctuation. Every punctuation character
    becomes a token of its own; the word becomes one token. The spans
    of the tokens are character offsets into ``plain_verse``.

    A chunk that consists of punctuation only yields just its
    punctuation tokens: no empty word token is produced and no
    character is emitted twice.

    :param plain_verse: The verse that is to be tokenized.
    :return: A list of the found tokens in verse order.
    """
    tokens = []
    i = 0  # Index into the whole verse.
    for chunk in re.split(r'\s', plain_verse):
        if chunk:
            leading_match = re.search(r'^\W+', chunk)
            leading = leading_match.group() if leading_match else ''

            # Look for trailing punctuation only in what is left after
            # the leading punctuation, so that a chunk made up entirely
            # of punctuation is not consumed by both patterns.
            remainder = chunk[len(leading):]
            trailing_match = re.search(r'[\W_]+$', remainder)
            trailing = trailing_match.group() if trailing_match else ''
            word = remainder[:len(remainder) - len(trailing)]

            # Add Tokens for the punctuation before the word.
            for c in leading:
                tokens.append(Token(c, (i, i + 1)))
                i += 1

            # Add a Token for the word itself, if there is one.
            if word:
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)

            # Add Tokens for the punctuation after the word.
            for c in trailing:
                tokens.append(Token(c, (i, i + 1)))
                i += 1
        # Account for the whitespace character that ended this chunk.
        i += 1
    return tokens


def blow_up_accented(accented):
    """Expand an accented form with ambiguous length marks.

    A pair of consecutive length marks (any two characters out of
    ``_`` and ``^``) is treated as one ambiguous position. Each such
    position is replaced once by ``^`` and once by ``_``, and all
    combinations across the positions are generated.

    :param accented: The accented form, possibly containing ambiguous
        double marks.
    :return: A list of accented forms. With ``k`` ambiguous positions
        it has ``2 ** k`` entries, ordered with ``^`` before ``_`` and
        the leftmost position varying slowest. A form without
        ambiguous positions is returned unchanged as a single-element
        list.
    """
    # Splitting keeps every stretch of text between two ambiguous
    # positions, so nothing in the middle of the form is lost.
    segments = re.split(r'[_^]{2}', accented)
    ambiguous_count = len(segments) - 1
    if ambiguous_count == 0:
        # The accented form is unambiguous.
        return [accented]

    blown_up = []
    for marks in product('^_', repeat=ambiguous_count):
        parts = [segments[0]]
        for mark, segment in zip(marks, segments[1:]):
            parts.append(mark)
            parts.append(segment)
        blown_up.append(''.join(parts))
    return blown_up


def condense_analyses(
        analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]:
    """Group analyses by accented form and lemma.

    Every analysis is first expanded with ``blow_up_accented``, so an
    analysis with ambiguous length marks contributes to several
    accented forms.

    :param analyses: The analyses that are to be condensed.
    :return: A nested dict. The outer keys are accented forms, the
        inner keys are lemmas and the innermost sets hold the
        morphtags recorded for that form and lemma.
    """
    by_form = {}
    for analysis in analyses:
        for form in blow_up_accented(analysis.accented):
            morphtags_by_lemma = by_form.setdefault(form, {})
            morphtags = morphtags_by_lemma.setdefault(analysis.lemma, set())
            morphtags.add(analysis.morphtag)
    return by_form


def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
    """Find different possible readings by analyzing the word forms.

    This function analyzes the word forms in the verse and creates
    readings for all possible combinations of accented versions of the
    words. E.g. if two words occur with more than one accented
    version, say one with two accented versions and the other with
    three accented versions, a total of six readings will be
    generated.

    A word form that is not found as a whole is looked up again
    without a trailing clitic (see ``get_clitic``); in that case the
    clitic is stored on the token as ``token.clitic``.

    Every returned reading owns its tokens: each token is deep-copied
    before it is appended, so later per-reading changes (syllables,
    ids, phenomena) cannot leak from one reading into another.

    :param word_list: The word list to look up the word forms.
    :param reading: A basic reading of a verse that is to be analyzed.
    :return: A list of readings of the verse that differ with respect
        to the accented versions for the forms.

    """
    token_alternatives = []
    for token in reading.tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                bare, clitic = get_clitic(token.text)
                if clitic:
                    # NOTE(review): the clitic is recorded even when the
                    # bare form turns out to be unknown as well.
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)

        alternatives = []
        if analyses:
            condensed_analyses = condense_analyses(analyses)
            for accented, lemma_to_morphtags in condensed_analyses.items():
                # One alternative per accented form; a shallow copy is
                # enough here because every reading receives its own
                # deep copy below.
                alternative = copy.copy(token)
                alternative.accented = accented
                alternative.lemma_to_morphtags = lemma_to_morphtags
                alternatives.append(alternative)
        else:
            alternatives.append(token)
        token_alternatives.append(alternatives)

    readings = [Reading()]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        # multiply_readings appends the copies block-wise, so block i
        # of the list takes the i-th alternative.
        for i, alternative in enumerate(alternatives):
            start = i * orig_readings_len
            for target in readings[start:start + orig_readings_len]:
                # Never share one token object between readings.
                target.append_token(copy.deepcopy(alternative))

    return readings


def get_syllables_for_accented_form(token):
    """Syllabify a token on the basis of its accented form.

    The accented form (plus the clitic, if the token has one) is split
    at its vowels and diphthongs. For tokens whose text starts with an
    upper-case letter, ``eu`` and ``yi`` count as diphthongs as well.
    A ``u`` directly after ``q`` is not treated as a vowel. Consonants
    before the first vowel belong to the first syllable; all other
    consonants stay with the vowel that precedes them.

    A vowel is long (length 2) when it is a diphthong or is followed
    by ``_``; otherwise it is short (length 1). The syllable length is
    initialised to the vowel length. Length marks are removed from the
    syllable texts.

    :param token: The token to syllabify; its ``text``, ``accented``,
        ``clitic`` and ``span`` attributes are read.
    :return: The list of syllables, with spans starting at
        ``token.span[0]``.
    """
    if token.text[0].isupper():
        vowel_pattern = r'((?<!q)(?:ae|oe|au|eu|yi|[aeiouy])[_^]?)'
    else:
        vowel_pattern = r'((?<!q)(?:ae|oe|au|[aeiouy])[_^]?)'

    form = token.accented
    if token.clitic:
        form = token.accented + token.clitic

    vowels = 'AEIOUYaeiouy'
    result = []
    start = token.span[0]
    text = ''
    length = 1
    seen_vowel = False

    pieces = re.split(vowel_pattern, form, flags=re.IGNORECASE)
    for piece in filter(None, pieces):
        unmarked = piece.rstrip('_^')
        if piece[0] not in vowels:
            # Consonants always extend the syllable being built.
            text += unmarked
            continue

        if seen_vowel:
            # A second vowel closes the syllable collected so far.
            result.append(Syllable(syllable=text,
                                   span=[start, start + len(text)],
                                   idx=None,
                                   vowel_length=length,
                                   syllable_length=length))
            start += len(text)
            text = unmarked
        else:
            # First vowel of the word: keep the consonants before it.
            text += unmarked
            seen_vowel = True

        is_long = len(piece) > 1 and piece[1] in vowels + '_'
        length = 2 if is_long else 1

    if text:
        # Whatever is left forms the last syllable.
        result.append(Syllable(syllable=text,
                               span=[start, start + len(text)],
                               idx=None,
                               vowel_length=length,
                               syllable_length=length))
    return result


def get_syllables_for_unknown_form(token):
    """Syllabify a token for which no accented form is available.

    Stolen from Jonathan (insert proper citation here)

    The lowercased token text is split at its vowels and diphthongs.
    Consonants before the first vowel belong to the first syllable;
    all other consonants are attached to the vowel that precedes
    them. Syllables containing two adjacent vowels (i.e. a diphthong)
    get vowel and syllable length 2.

    For tokens whose text starts with an upper-case letter, ``eu`` and
    ``yi`` count as diphthongs as well (the same rule that
    ``get_syllables_for_accented_form`` applies), and a final ``-oe``
    or ``-oen`` is split into two syllables instead of being read as
    the diphthong ``oe``.

    :param token: The token to syllabify; its ``text`` and ``span``
        attributes are read.
    :return: The list of syllables, with spans starting at
        ``token.span[0]``.
    """
    # The capitalisation has to be read BEFORE lowercasing; testing the
    # lowercased string with isupper() can never be true.
    capitalized = token.text[:1].isupper()
    strng = token.text.lower()

    if capitalized:
        vowel_regex = "(ae|oe|au|eu|yi|[aeiouy])"
    else:
        vowel_regex = "(ae|au|oe|[aeiouy])"
    chunks = [chunk for chunk in re.split(vowel_regex, strng) if chunk != ""]
    y = []

    # Counter j: even j: consonants are appended to the last entry of y,
    #            odd j: vowels start a new entry after the consonants.
    # To keep in mind: does the word start with a vowel?
    # NOTE(review): the character class below does not contain 'y', so
    # a chunk 'y' is handled like a consonant here.
    j = -1
    fluff = 0

    for ch in chunks:
        j += 1
        if j == 0:
            if re.match("[^aeiou]", chunks[0]):
                # The word starts with consonants; the first vowel is
                # attached to them in the next step.
                fluff = 1
                y.append(ch)
            else:
                y.append(ch)
                j += 1
        elif j == 1 and fluff == 1:
            y[0] += chunks[1]
        else:
            if j % 2 == 0:
                if re.match("[^aeiou]", ch):
                    y[-1] += ch
                else:
                    # Two vowel chunks in a row: start a new syllable
                    # and skip the consonant slot.
                    y.append(ch)
                    j += 1
            else:
                y.append(ch)

    res = []
    length = token.span[0]
    for x in y:
        res.append(Syllable(x, [length, length + len(x)]))
        length += len(x)

    # Special case for capitalized forms ending in -oe / -oen: the final
    # 'e' or 'en' is a syllable of its own. The last syllable is split
    # in front of it; both halves keep correct, non-empty spans.
    if capitalized and res and re.search("oen?$", strng):
        last = res[-1]
        tail_len = 2 if strng.endswith("n") else 1
        split_at = last.span[1] - tail_len
        head_text = last.text[:-tail_len]
        tail_text = last.text[-tail_len:]
        res[-1] = Syllable(head_text, [last.span[0], split_at])
        res.append(Syllable(tail_text, [split_at, last.span[1]]))

    # Two adjacent vowels inside one syllable form a diphthong.
    for syll in res:
        if re.search(r'[aeiuoy]{2}', syll.text):
            syll.vowel_length = 2
            syll.syllable_length = 2

    return res


def join_u_diphthong_syllables(token, syllables):
    """Merge a syllable ending in ``ngu`` with the vowel that follows it.

    For each pair of neighbouring syllables, the token text up to the
    end of the first syllable is inspected. If it ends in ``ngu`` and
    the second syllable starts with one of ``a e i o y``, the second
    syllable is folded into the first: its text is appended, the span
    end is taken over, and a vowel length of 2 is propagated as vowel
    and syllable length 2. After a merge the scan continues two
    positions further on.

    :param token: The token the syllables belong to; its ``text`` and
        ``span`` attributes are read.
    :param syllables: The syllables of the token. The list and its
        elements are modified in place.
    :return: The same list object that was passed in.
    """
    idx = 0
    while idx + 1 < len(syllables):
        current = syllables[idx]
        follower = syllables[idx + 1]
        text_so_far = token.text[:current.span[1] - token.span[0]]
        if not (text_so_far.endswith('ngu')
                and follower.text[0] in 'aeioy'):
            idx += 1
            continue

        current.text += follower.text
        current.span[1] = follower.span[1]
        if follower.vowel_length == 2:
            current.vowel_length = 2
            current.syllable_length = 2
        del syllables[idx + 1]
        idx += 2
    return syllables


def get_syllables_for_token(token: Token):
    """Syllabify a single token with the first applicable strategy.

    The strategies are tried in this order: the hand-written
    ``SPECIAL_CASES`` (looked up by the lowercased text), the accented
    form if the token has one, and finally the heuristic for unknown
    forms. Punctuation tokens without an accented form get no
    syllables. The name of the strategy used is stored in
    ``token.syllables_provider``. The result is always passed through
    ``join_u_diphthong_syllables``.

    :param token: The token to syllabify.
    :return: The list of syllables for the token.
    """
    special_case = SPECIAL_CASES.get(token.text.lower())
    if special_case is not None:
        syllables = special_case(token)
        token.syllables_provider = 'SPECIAL_CASES'
    elif token.accented:
        syllables = get_syllables_for_accented_form(token)
        token.syllables_provider = 'get_syllables_for_accented_form'
    elif token.is_punct():
        syllables = []
    else:
        syllables = get_syllables_for_unknown_form(token)
        token.syllables_provider = 'get_syllables_for_unknown_form'
    return join_u_diphthong_syllables(token, syllables)


def get_syllables(reading):
    """Syllabify all tokens of a reading and number the syllables.

    Each token gets its ``syllables`` attribute set. The syllables are
    numbered consecutively across the whole reading, starting at 0;
    the number is stored as the syllable's ``id``.

    :param reading: The reading whose tokens are to be syllabified.
    :return: The same reading object.
    """
    next_id = 0
    for token in reading.tokens:
        token.syllables = get_syllables_for_token(token)
        for syllable_id, syllable in enumerate(token.syllables,
                                               start=next_id):
            syllable.id = syllable_id
        next_id += len(token.syllables)
    return reading


def muta_cum_liquida(verse):
    """Mark syllables that are followed by muta cum liquida.

    The verse text is searched with a regular expression. For every
    match, and in every reading, the first syllable whose span contains
    the start of the match gets a ``'muta cum liquida'`` entry in its
    ``phenomena``; the entry's ``chars`` are the consonant cluster
    captured by the first group of the match.

    :param verse: The verse to annotate; its ``text`` and ``readings``
        are read and the syllables of the readings are modified.
    """
    mcl_regex = re.compile(
        r'[aeiouvy](([bpsckgdt]|(qu)|(qv))[h\W]*[lrmn])([aeiouvy]|[.?!]|$)',
        flags=re.IGNORECASE
    )
    for match in mcl_regex.finditer(verse.text):
        vowel_position = match.start()
        for reading in verse.readings:
            # Only the first syllable covering the position is tagged.
            hit = next(
                (syllable
                 for token in reading.tokens
                 for syllable in token.syllables
                 if syllable.span[0] <= vowel_position < syllable.span[1]),
                None,
            )
            if hit is not None:
                hit.phenomena['muta cum liquida'] = Phenomenon(
                    chars=match.group(1)
                )


def positional_lengthening(verse):
    """Mark syllables that are long by position.

    The verse text is searched with a regular expression. For every
    match, and in every reading, the first syllable whose span contains
    the start of the match gets syllable length 2 and a
    ``'positional lengthening'`` entry in its ``phenomena``; the
    entry's ``chars`` are the consonants captured by the first group of
    the match.

    :param verse: The verse to annotate; its ``text`` and ``readings``
        are read and the syllables of the readings are modified.
    """
    pl_regex = re.compile(
        r'[aeiouvy](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
        flags=re.IGNORECASE
    )
    for match in pl_regex.finditer(verse.text):
        vowel_position = match.start()
        for reading in verse.readings:
            # Only the first syllable covering the position is changed.
            hit = next(
                (syllable
                 for token in reading.tokens
                 for syllable in token.syllables
                 if syllable.span[0] <= vowel_position < syllable.span[1]),
                None,
            )
            if hit is None:
                continue
            hit.syllable_length = 2
            hit.phenomena['positional lengthening'] = Phenomenon(
                chars=match.group(1)
            )


def make_elisions(verse):
    """Mark elided word-final syllables in all readings of a verse.

    For every word token that is not the last token of its reading,
    the token's last syllable is compared with the first syllable of
    the next word token (punctuation in between is skipped). If the
    former ends in a vowel, optionally followed by ``m`` or ``h``, and
    the latter starts with a vowel, optionally preceded by ``h``, the
    former gets an ``'elision'`` entry in its ``phenomena`` and
    syllable length 0. When no further word token follows, the rest of
    the reading is skipped.

    :param verse: The verse whose readings are to be annotated.
    :return: The same verse object.
    """
    for reading in verse.readings:
        tokens = reading.tokens
        for idx, token in enumerate(tokens):
            if token.is_punct() or idx >= len(tokens) - 1:
                continue

            final_syllable = token.syllables[-1]
            following_word = next(
                (tokens[k] for k in range(idx + 1, len(tokens))
                 if not tokens[k].is_punct()),
                None,
            )
            if following_word is None:
                # Only punctuation follows: nothing left to elide in
                # this reading.
                break
            initial_syllable = following_word.syllables[0]

            ending = re.search(r'[aeiouy][mh]*$', final_syllable.text,
                               flags=re.IGNORECASE)
            if ending and re.search(r'^h?[aeiouy]', initial_syllable.text,
                                    flags=re.IGNORECASE):
                # Elision!
                final_syllable.phenomena['elision'] = Phenomenon(
                    omitted=ending.group()
                )
                final_syllable.syllable_length = 0
    return verse


def parse_verse(verse):
    """Annotate syllable lengths based on positional lengthening and muta
    cum liquida.

    After both phenomena have been marked on the syllables, every
    reading is turned into a string of length codes, one character per
    syllable:

    * the last syllable of the reading is always ``2``;
    * a syllable with muta cum liquida is ambiguous, unless it also
      has positional lengthening whose characters contain a space
      (then it is ``2``);
    * a syllable with positional lengthening only is ``2``;
    * any other syllable keeps its current length (``0``, ``1`` or
      ``2``).

    A reading with ``k`` ambiguous syllables is expanded into
    ``2 ** k`` readings, one for every combination of short and long.
    When an ambiguous syllable is read as short although it also has
    positional lengthening, that phenomenon is marked as overruled by
    muta cum liquida.

    :param verse: The verse to process. Its ``readings`` are replaced
        by the expanded list of deep-copied readings.
    :return: The same verse object.
    """
    positional_lengthening(verse)

    muta_cum_liquida(verse)

    new_readings = []

    for reading in verse.readings:
        syllables = [syllable for token in reading.tokens
                     for syllable in token.syllables]
        last_id = len(syllables) - 1
        codes = []
        mcl_count = 0
        for syllable in syllables:
            phenomena = syllable.phenomena
            if syllable.id == last_id:
                codes.append('2')
            elif 'muta cum liquida' in phenomena:
                if ('positional lengthening' in phenomena
                        and ' ' in phenomena['positional lengthening'].chars):
                    codes.append('2')
                else:
                    # Placeholder that is filled with '1' or '2' below.
                    codes.append('{}')
                    mcl_count += 1
            elif 'positional lengthening' in phenomena:
                codes.append('2')
            elif syllable.syllable_length == 0:
                codes.append('0')
            elif syllable.syllable_length == 1:
                codes.append('1')
            elif syllable.syllable_length == 2:
                codes.append('2')
            else:
                # Unknown length: emit a neutral code so that the codes
                # stay aligned with the syllables. The syllable itself
                # is left untouched.
                codes.append('?')
        abstract = ''.join(codes)

        if mcl_count > 0:
            new_abstracts = [
                abstract.format(*combi)
                for combi in product(['1', '2'], repeat=mcl_count)
            ]
            # One reading per blueprint, i.e. 2 ** mcl_count in total.
            reading_copies = multiply_readings([reading],
                                               len(new_abstracts))
        else:
            new_abstracts = [abstract]
            reading_copies = [reading]

        for blueprint, new_reading in zip(new_abstracts, reading_copies):
            reading_syllables = (s for token in new_reading.tokens
                                 for s in token.syllables)
            for code, s in zip(blueprint, reading_syllables):
                if code == '1':
                    s.syllable_length = 1
                    if ('positional lengthening' in s.phenomena
                            and 'muta cum liquida' in s.phenomena):
                        (s.phenomena['positional lengthening']
                         .overruled_by) = 'muta cum liquida'
                elif code == '2':
                    s.syllable_length = 2

            new_readings.append(copy.deepcopy(new_reading))

    verse.readings = new_readings
    return verse


class Scanner:
    """Runs the scanning pipeline on plain verses using one word list."""

    def __init__(self):
        self.word_list = WordList()

    def scan_verses(self, plain_verses: List[str]):
        """Tokenize, lemmatize and prosodically annotate plain verses.

        All verses are tokenized first and lemmatized afterwards. Each
        resulting verse is then processed in three steps: its readings
        are syllabified, syllable lengths are derived with
        ``parse_verse`` and elisions are marked with ``make_elisions``.

        :param plain_verses: The verses as plain strings.
        :return: A list with one ``Verse`` per input string.
        """
        base_readings = []
        for plain_verse in plain_verses:
            base_readings.append(Reading(tokens=tokenize(plain_verse)))

        verses = []
        for plain_verse, base_reading in zip(plain_verses, base_readings):
            readings = lemmatize(self.word_list, base_reading)
            verses.append(Verse(verse=plain_verse, readings=readings))

        for verse in verses:
            for reading in verse.readings:
                get_syllables(reading)
            parse_verse(verse)
            make_elisions(verse)
        return verses