import copy
import re

from .model import Token
from .wordlist import WordList

verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]

# Enclitics that may be attached to the end of a word; 'qve' and 'ue'
# cover the interchangeable u/v spellings.
CLITICS = ['que', 'qve', 'ue', 've', 'ne']

def get_clitic(token):
    # Split an enclitic off the end of a token. The else-branch of the
    # for loop only runs when no clitic matched.
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    else:
        return token, None

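# A quick, illustrative check of get_clitic (these words are not from
# the sample verses): an attached -que is split off, and a word without
# a clitic passes through unchanged.
assert get_clitic('armaque') == ('arma', 'que')
assert get_clitic('arma') == ('arma', None)
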
def multiply_readings(readings, n):
    # Return the readings repeated n times. Each repetition must be a
    # distinct list: with `readings * n` the copies would share their
    # inner lists, and appending a token to one reading would append it
    # to all of its copies as well.
    return [list(reading) for _ in range(n) for reading in readings]

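# Toy example of the multiplication: two readings times two give four
# independent lists, ordered as [r0, r1, r0, r1].
assert multiply_readings([['a'], ['b']], 2) == [['a'], ['b'], ['a'], ['b']]
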
def tokenize(plain_verse):
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'\W+$', token)
            if post_punct_match:
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        # Account for the whitespace separator after this token.
        i += 1
    return tokens

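# tokenize() on the third sample verse: punctuation turns into
# one-character Tokens on either side of the words. Only the .text
# attribute is checked here; the (start, end) span is the second
# constructor argument, whose attribute name is not shown above.
demo_tokens = tokenize("clamavit moriens lingua: 'Corinna, vale!'")
assert [t.text for t in demo_tokens][:5] == \
    ['clamavit', 'moriens', 'lingua', ':', "'"]
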
class Scanner:
    def __init__(self, plain_verses):
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]
        # lemmatize_verses() needs a word list to look analyses up in;
        # this assumes WordList can be constructed without arguments.
        self.wordlist = WordList()

    def lemmatize_verses(self, tokens):
        token_alternatives = []
        for token in tokens:
            analyses = self.wordlist.analyze(token.text)
            if not analyses:
                # No direct hit; try again with any enclitic removed.
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = self.wordlist.analyze(bare)
            alternatives = []
            if analyses:
                for a in analyses:
                    # The token should not have any syllables at this
                    # point so that the question of copy vs deepcopy
                    # does not even arise.
                    t = copy.copy(token)
                    t.analysis = a
                    alternatives.append(t)
            else:
                alternatives.append(token)
            token_alternatives.append(alternatives)
        # Generate readings from token_alternatives: every combination
        # of one alternative per token, built up by repeatedly
        # multiplying the readings found so far.
        readings = [[]]
        for alternatives in token_alternatives:
            prev_len_readings = len(readings)
            readings = multiply_readings(readings, len(alternatives))
            for i, token in enumerate(alternatives):
                start = i * prev_len_readings
                for reading in readings[start:start + prev_len_readings]:
                    reading.append(token)
        return readings
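
# End-to-end sketch over the sample verses. This assumes the .model and
# .wordlist modules are importable and that WordList.analyze() returns a
# (possibly empty) list of analyses for a word string; it is a usage
# outline rather than a test.
if __name__ == '__main__':
    scanner = Scanner(verses)
    for tokens in scanner.tokenized_verses:
        readings = scanner.lemmatize_verses(tokens)
        print(len(readings), 'reading(s) for:',
              ' '.join(t.text for t in tokens))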