# -*- coding: utf-8 -*-
import copy
import re
from typing import List

from .model import Token
from .wordlist import WordList

verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]

CLITICS = ['que', 'qve', 'ue', 've', 'ne']


def get_clitic(token):
    """Split off a trailing clitic and return the pair (bare, clitic).

    If the token does not end in one of the known clitics, return it
    unchanged together with None.
    """
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    return token, None


def multiply_readings(readings: List[List[Token]],
                      n: int) -> List[List[Token]]:
    """Copy the readings n - 1 times.

    :param readings: The readings that are to be multiplied.
    :param n: The factor by which to multiply.
    :return: n times as many readings as there were before.
    """
    orig_readings_len = len(readings)
    for _ in range(n - 1):
        for i in range(orig_readings_len):
            new_reading = [copy.copy(token) for token in readings[i]]
            readings.append(new_reading)
    return readings


def tokenize(plain_verse):
    """Split a verse into Tokens, one per word or punctuation mark."""
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a word.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'[\W_]+$', token)
            if post_punct_match:
                # Add a Token for the word itself.
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a word.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                # Add a Token for the word itself.
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        # Account for the whitespace character that was split on.
        i += 1
    return tokens


def lemmatize_verses(word_list, tokens):
    """Look up all analyses for the tokens and build the readings.

    Every combination of one analysis per token yields one reading.
    """
    token_alternatives = []
    for token in tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                # Retry the lookup without a potential clitic.
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)
        if analyses:
            alternatives = []
            for a in analyses:
                # The token should not have any syllables at this
                # point so that the question of copy vs deepcopy
                # does not even arise.
                t = copy.copy(token)
                t.analysis = a
                alternatives.append(t)
        else:
            alternatives = [token]
        token_alternatives.append(alternatives)

    # Build the cross product of all per-token alternatives.
    readings = [[]]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        for i, token in enumerate(alternatives):
            start = i * orig_readings_len
            for reading in readings[start:start + orig_readings_len]:
                reading.append(token)
    return readings


class Scanner:

    def __init__(self, plain_verses):
        self.word_list = WordList()
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]
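

# --- Illustrative usage (a sketch, not part of the original module) ---
# This demo only exercises tokenize() on the sample verses defined
# above; running lemmatize_verses() as well would require a populated
# WordList.  Token.text is the only attribute assumed here, inferred
# from its use in lemmatize_verses().  Because of the relative imports,
# run this as a module, e.g. `python -m <package>.<thismodule>`.
if __name__ == '__main__':
    for verse in verses:
        tokens = tokenize(verse)
        # Print the token texts, words and punctuation marks alike.
        print([t.text for t in tokens])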