# -*- coding: utf-8 -*-
"""Tokenization and lemmatization scaffolding for Latin verse scansion."""

import copy  # BUG FIX: copy.copy() is used in Scanner.lemmatize_verses but was never imported.
import re

from .model import Token
from .wordlist import WordList

# Sample verses used during development.
verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]

# Enclitic particles that may be attached to the end of a Latin word.
CLITICS = ['que', 'qve', 'ue', 've', 'ne']


def get_clitic(token):
    """Split a trailing enclitic off *token*.

    Returns a ``(stem, clitic)`` pair; ``clitic`` is ``None`` when no
    known enclitic from CLITICS is attached.
    """
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    # No clitic matched; the original used a redundant for/else here.
    return token, None


def tokenize(plain_verse):
    """Split *plain_verse* into Token objects with (start, end) spans.

    Each word and each individual punctuation character becomes its own
    Token; spans index into the original verse string.
    """
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'\W+$', token)
            if post_punct_match:
                # Add a Token for the word itself.
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                # Add a Token for the word itself.
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        # Advance past the single whitespace separator consumed by
        # re.split(r'\s', ...); runs even for empty split results so
        # consecutive spaces keep the index aligned.
        i += 1
    return tokens


class Scanner:
    """Scans verses: tokenizes them and looks up morphological analyses."""

    def __init__(self, plain_verses):
        self.wordlist = WordList()
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]

    def lemmatize_verses(self, tokens):
        """Collect analysis alternatives for each token.

        For tokens with no direct analysis, retries after splitting off a
        trailing enclitic. Tokens that still have no analysis are kept
        unanalyzed as their own single alternative.
        """
        token_alternatives = []
        for token in tokens:
            analyses = self.wordlist.analyze(token)
            if not analyses:
                bare, clitic = get_clitic(token)
                if clitic:
                    token.clitic = clitic
                    # BUG FIX: was `self.wordlist(bare)`; presumably meant the
                    # same analyze() call as above — TODO confirm WordList has
                    # no __call__ that this relied on.
                    analyses = self.wordlist.analyze(bare)
            # BUG FIX: `alternatives` was only bound inside `if analyses:`,
            # so the no-analysis path raised NameError on append.
            alternatives = []
            if analyses:
                for a in analyses:
                    t = copy.copy(token)
                    t.analysis = a
                    alternatives.append(t)
            else:
                alternatives.append(token)
            token_alternatives.append(alternatives)
        # TODO: Generate readings from token_alternatives.