import copy
import re
from typing import List

from .model import Token
from .wordlist import WordList
verses = [
    'nunc dum tibi lubet licetque pota perde rem',
    'antehac est habitus parcus nec magis continens',
    "clamavit moriens lingua: 'Corinna, vale!'",
    'an, quod ubique, tuum est? tua sunt Heliconia Tempe?',
]
CLITICS = ['que', 'qve', 'ue', 've', 'ne']
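# Note that order matters: 'que' must be tried before 'ue', otherwise
# e.g. 'virumque' would be split as ('virumq', 'ue').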
def get_clitic(token):
    for clitic in CLITICS:
        if token.endswith(clitic):
            return token[:-len(clitic)], clitic
    return token, None
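# A quick sanity check (a sketch; the expected splits follow from the
# CLITICS list above):
#
#   get_clitic('virumque')  # -> ('virum', 'que')
#   get_clitic('arma')      # -> ('arma', None)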
def multiply_readings(readings: List[List[Token]],
                      n: int) -> List[List[Token]]:
    """Copy the readings n - 1 times.

    :param readings: The readings that are to be multiplied.
    :param n: The factor by which to multiply.
    :return: n times as many readings as there were before.
    """
    orig_readings_len = len(readings)
    for _ in range(n - 1):
        for i in range(orig_readings_len):
            new_reading = [copy.copy(token)
                           for token in readings[i]]
            readings.append(new_reading)
    return readings
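# For example (a sketch; t1 and t2 stand for arbitrary Token objects):
#
#   multiply_readings([[t1], [t2]], 2)
#   # -> [[t1], [t2], [copy of t1], [copy of t2]]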
def tokenize(plain_verse):
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'[\W_]+$', token)
            if post_punct_match:
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        i += 1
    return tokens
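# For example (a sketch; spans are (start, end) indices into the verse):
#
#   tokenize("Corinna, vale!")
#   # -> Token('Corinna', (0, 7)), Token(',', (7, 8)),
#   #    Token('vale', (9, 13)), Token('!', (13, 14))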
def lemmatize_verses(word_list, tokens):
    token_alternatives = []
    for token in tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)
        alternatives = []
        if analyses:
            for a in analyses:
                # The token should not have any syllables at this
                # point, so the question of copy vs. deepcopy does
                # not even arise.
                t = copy.copy(token)
                t.analysis = a
                alternatives.append(t)
        else:
            alternatives.append(token)
        token_alternatives.append(alternatives)
    readings = [[]]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        for i, token in enumerate(alternatives):
            start = i * orig_readings_len
            for reading in readings[start:start + orig_readings_len]:
                reading.append(token)
    return readings
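# The reading expansion amounts to a Cartesian product over the per-token
# alternatives. A sketch with two tokens, where the first is ambiguous
# (analyses a1, a2) and the second is not (b1):
#
#   token_alternatives = [[a1, a2], [b1]]
#   # readings grows: [[]] -> [[a1], [a2]] -> [[a1, b1], [a2, b1]]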
class Scanner:
    def __init__(self, plain_verses):
        self.plain_verses = plain_verses
        self.tokenized_verses = [tokenize(v) for v in self.plain_verses]