import copy
import re
from itertools import product
from typing import Dict, List, Optional, Set, Tuple

from .db import FormAnalysis
from .model import Reading, Syllable, Token, Verse, Phenomenon
from .wordlist import WordList

CLITICS = ['que', 'qve', 'ue', 've', 'ne']
"""Split a clitic from the token if possible.
:param token: A token that may contain a clitic.
:return: A tuple of token without clitic and clitic, if a clitic
was found. Or a tuple of the original token and None if no
clitic was found.
"""
for clitic in CLITICS:
if token.endswith(clitic):
return token[:-len(clitic)], clitic
else:
return token, None
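# Illustrative use of get_clitic (the values follow from CLITICS above):
#     get_clitic('arma')     -> ('arma', None)
#     get_clitic('virumque') -> ('virum', 'que')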
def multiply_readings(readings: List[Reading],
                      n: int) -> List[Reading]:
    """Copy the readings n - 1 times.

    :param readings: The readings that are to be multiplied.
    :param n: The factor by which to multiply.
    :return: n times as many readings as there were before.
    """
    orig_readings_len = len(readings)
    for _ in range(n - 1):
        for i in range(orig_readings_len):
            # TODO: Think about moving this to Reading in model.py
            new_reading = Reading(
                [copy.deepcopy(token) for token in readings[i].tokens])
            readings.append(new_reading)
    return readings
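# Illustrative: multiply_readings([r1, r2], 3) returns
# [r1, r2, r1', r2', r1'', r2''], where the primed readings hold deep
# copies of the original readings' tokens.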
def tokenize(plain_verse: str) -> List[Token]:
    """Tokenize a verse.

    This function first splits on whitespace and then further on
    punctuation. Punctuation marks are regarded as tokens and are
    therefore included in the list of returned tokens.

    :param plain_verse: The verse that is to be tokenized.
    :return: A list of the found tokens.
    """
    tokens = []
    i = 0  # Index into the whole verse.
    for token in re.split(r'\s', plain_verse):
        if token:
            # Add Tokens for the punctuation before a token.
            pre_punct_match = re.search(r'^\W+', token)
            if pre_punct_match:
                for c in pre_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
                pre_punct_end = pre_punct_match.end()
            else:
                pre_punct_end = 0
            post_punct_match = re.search(r'[\W_]+$', token)
            if post_punct_match:
                word = token[pre_punct_end:post_punct_match.start()]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
                # Add Tokens for the punctuation after a token.
                for c in post_punct_match.group():
                    tokens.append(Token(c, (i, i + 1)))
                    i += 1
            else:
                word = token[pre_punct_end:]
                tokens.append(Token(word, (i, i + len(word))))
                i += len(word)
        i += 1
    return tokens
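# Illustrative: punctuation becomes its own Token and the spans index
# into the plain verse, e.g.
#     tokenize('arma virumque cano,')
#     -> [Token('arma', (0, 4)), Token('virumque', (5, 13)),
#         Token('cano', (14, 18)), Token(',', (18, 19))]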
def condense_analyses(
analyses: Set[FormAnalysis]) -> Dict[str, Dict[str, Set[str]]]:
"""Condense analyses objects into a nested dict representation.
:param analyses: The analyses that are to be condensed.
:return: A condensed version of the analyses. The keys in the
outer dict are the accented forms, the keys in the inner dict
are lemmas and the strings in the set are the morphtags.
"""
condensed = {}
for a in analyses:
if a.accented in condensed:
if a.lemma in condensed[a.accented]:
condensed[a.accented][a.lemma].add(a.morphtag)
else:
condensed[a.accented][a.lemma] = {a.morphtag}
else:
condensed[a.accented] = {a.lemma: {a.morphtag}}
return condensed
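# Illustrative shape of the condensed result for a form with two
# accented variants (placeholder names):
#     {'accented1': {'lemma1': {'morphtag1', 'morphtag2'}},
#      'accented2': {'lemma2': {'morphtag3'}}}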
def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
    """Find different possible readings by analyzing the word forms.

    This function analyzes the word forms in the verse and creates
    readings for all possible combinations of accented versions of the
    words. E.g. if two words occur with more than one accented
    version, say one with two accented versions and the other with
    three accented versions, a total of six readings will be
    generated.

    :param word_list: The word list to look up the word forms.
    :param reading: A basic reading of a verse that is to be analyzed.
    :return: A list of readings of the verse that differ with respect
        to the accented versions of the forms.
    """
    token_alternatives = []
    for token in reading.tokens:
        if token.is_punct():
            analyses = None
        else:
            analyses = word_list.analyze(token.text)
            if not analyses:
                bare, clitic = get_clitic(token.text)
                if clitic:
                    token.clitic = clitic
                    analyses = word_list.analyze(bare)
        alternatives = []
        if analyses:
            condensed_analyses = condense_analyses(analyses)
            for accented, lemma_to_morphtags in condensed_analyses.items():
                # The token should not have any syllables at this
                # point so that the question of copy vs deepcopy
                # does not even arise.
                t = copy.copy(token)
                t.accented = accented
                t.lemma_to_morphtags = lemma_to_morphtags
                alternatives.append(t)
        else:
            alternatives.append(token)
        token_alternatives.append(alternatives)

    readings = [Reading()]
    for alternatives in token_alternatives:
        orig_readings_len = len(readings)
        readings = multiply_readings(readings, len(alternatives))
        for i, token in enumerate(alternatives):
            start = i * orig_readings_len
            for reading in readings[start:start + orig_readings_len]:
                reading.append_token(token)
    return readings
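# Illustrative: with token_alternatives = [[a1, a2], [b1, b2, b3]] the
# readings grow to 2 and then to 6, and the variant tokens are appended
# slice by slice, yielding [a1, b1], [a2, b1], [a1, b2], ..., [a2, b3].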
def get_syllables_for_token(token: Token):
syllables = []
if token.accented:
        regex = (
            r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
            # Assumed condition: the pattern above, which also treats
            # 'eu' and 'yi' as diphthongs, is only used when one of
            # them occurs in the accented form.
            if 'eu' in token.accented.lower() or 'yi' in token.accented.lower()
            else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'
)
accented = (token.accented + token.clitic
if token.clitic
else token.accented)
chunks = [
chunk
for chunk in re.split(regex, accented, flags=re.IGNORECASE)
if chunk
]
syll_start = token.span[0]
syll_text = ''
syll_vowel_length = 1
syll_has_vowel = False
        for i, c in enumerate(chunks):
            if c[0] in 'aeiouy':
                if syll_has_vowel:
                    # Syllable already has a vowel.
                    # Add the current syllable and begin a new one.
                    syll = Syllable(syllable=syll_text,
                                    span=[syll_start,
                                          syll_start + len(syll_text)],
                                    idx=None,
                                    vowel_length=syll_vowel_length,
                                    syllable_length=syll_vowel_length)
                    syllables.append(syll)
                    # Begin info for new syllable.
                    # Assumed: advance the start index past the
                    # syllable that was just added.
                    syll_start += len(syll_text)
                    syll_text = c.rstrip('_^')
                else:
                    # Syllable has no vowel yet.
                    syll_text += c.rstrip('_^')
                syll_has_vowel = True
                syll_vowel_length = (
                    2 if len(c) > 1 and c[1] in 'aeiouy_' else 1
                )
            else:
                syll_text += c.rstrip('_^')
if syll_text:
# Add the last syllable.
syll = Syllable(syllable=syll_text,
span=[syll_start, syll_start + len(syll_text)],
idx=None,
vowel_length=syll_vowel_length,
syllable_length=syll_vowel_length)
syllables.append(syll)
else:
if not token.is_punct():
syllables = [Syllable(syllable=token.text, span=token.span, idx=None,
vowel_length=1, syllable_length=1)]
return syllables
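# Illustrative: for an accented form 'arma' the split yields the chunks
# ['a', 'rm', 'a'], which are grouped into the syllables 'arm' and 'a'
# (consonants attach to the preceding vowel).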
def get_syllables(reading):
for token in reading.tokens:
token.syllables = get_syllables_for_token(token)
def muta_cum_liquida(verse):
    mcl_regex = re.compile(
        r'[aeiouv](([bpsckgdt]|(qu)|(qv)),?\s?[lrmn])([aeiouv]|[.?!]|$)',
        flags=re.IGNORECASE)
    if re.search(mcl_regex, verse.text):
        matches = re.finditer(mcl_regex, verse.text)
        for match in matches:
            for reading in verse.readings:
                for token in reading.tokens:
                    for syllable in token.syllables:
                        if syllable.span[0] <= match.start() < syllable.span[1]:
                            syllable.phenomena['muta cum liquida'] = Phenomenon(
                                chars=match.group(1))
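# Illustrative: in a verse containing 'patrem', the 'tr' after the 'a'
# matches, so the syllable covering that position receives a
# 'muta cum liquida' Phenomenon with chars='tr'.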
def positional_lengthening(verse):
    pl_regex = re.compile(
        r'[aeiouv](((([bcdfgjklmnprstvwxz]|(qu)),?\s?){2,})|[xz])',
        flags=re.IGNORECASE)
    if re.search(pl_regex, verse.text):
        matches = re.finditer(pl_regex, verse.text)
        for match in matches:
            for reading in verse.readings:
                for token in reading.tokens:
                    for syllable in token.syllables:
                        if syllable.span[0] <= match.start() < syllable.span[1]:
                            syllable.phenomena['positional lengthening'] = Phenomenon(
                                chars=match.group(1))
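# Illustrative: in 'et nox', the 'o' is followed by 'x', so the matching
# syllable receives a 'positional lengthening' Phenomenon with
# chars='x'; for 'est' the chars would be 'st'.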
def make_elisions(verse):
    for reading in verse.readings:
        for i, token in enumerate(reading.tokens):
            if not token.is_punct() and i < len(reading.tokens) - 1:
                # Assumed: the candidate for elision is the token's
                # last syllable.
                this_syllable = token.syllables[-1]
                j = i
                for j in range(i + 1, len(reading.tokens)):
                    if not reading.tokens[j].is_punct():
                        next_syllable = reading.tokens[j].syllables[0]
                        break
                else:
                    # No succeeding syllable has been found.
                    # Break the for and continue with the next reading.
                    break
                m = re.search(r'[aeiouy][mh]*$', this_syllable.text)
                if m:
                    if re.search(r'^h?[aeiouy]', next_syllable.text):
                        # Elision!
                        this_syllable.phenomena['elision'] = Phenomenon(
                            omitted=m.group())
                        this_syllable.syllable_length = 0
    return verse
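# Illustrative: in 'monstrum horrendum', the final syllable of
# 'monstrum' ends in vowel + 'm' and the next word begins with
# 'h' + vowel, so that syllable gets an 'elision' Phenomenon with
# omitted='um' and its syllable_length is set to 0.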
def parse_verse(verse):
    """Annotate syllable lengths based on positional lengthening and
    muta cum liquida.
    """
    positional_lengthening(verse)
    muta_cum_liquida(verse)
    new_readings = []
    for reading in verse.readings:
syllables = [syllable for token in reading.tokens
for syllable in token.syllables]
abstract = str()
mcl_count = 0
        for syllable in syllables:
            # Assumed first branch: a naturally long vowel is scanned
            # as long.
            if syllable.vowel_length == 2:
                abstract += '2'
            elif 'muta cum liquida' in syllable.phenomena:
                if ('positional lengthening' in syllable.phenomena
                        and ' ' in syllable.phenomena['positional lengthening'].chars):
                    # The cluster spans a word boundary, so muta cum
                    # liquida cannot shorten the syllable.
                    abstract += '2'
                else:
                    # Ambiguous syllable: leave a placeholder to be
                    # filled with every combination of '1' and '2'.
                    abstract += '{}'
                    mcl_count += 1
            elif 'positional lengthening' in syllable.phenomena:
                abstract += '2'
            elif syllable.syllable_length == 0:
                abstract += '0'
            elif syllable.syllable_length == 1:
                abstract += '1'
            elif syllable.syllable_length == 2:
                abstract += '2'
        if mcl_count > 0:
            new_abstracts = list()
            combinations = list(product(['1', '2'], repeat=mcl_count))
            for combi in combinations:
                new_abstracts.append(abstract.format(*combi))
            # One reading copy is needed per combination of lengths.
            reading_copies = multiply_readings([reading], 2 ** mcl_count)
        else:
            new_abstracts = [abstract]
            reading_copies = [reading]
for i in range(len(new_abstracts)):
blueprint = new_abstracts[i]
new_reading = reading_copies[i]
syll_id = 0
for token in new_reading.tokens:
for s in token.syllables:
if blueprint[syll_id] == "1":
s.syllable_length = 1
if 'positional lengthening' in s.phenomena and 'muta cum liquida' in s.phenomena:
s.phenomena['positional lengthening'].overruled_by = 'muta cum liquida'
elif blueprint[syll_id] == "2":
s.syllable_length = 2
syll_id += 1
new_readings.append(copy.deepcopy(new_reading))
#print("In: "+abstract)
#print("Out: "+"".join([str(s.syllable_length) for t in new_reading.tokens for s in t.syllables]))
verse.readings = new_readings
return verse
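# Illustrative: a reading whose syllables produce the abstract '2{}12'
# (one ambiguous muta cum liquida syllable) is expanded into the
# blueprints '2112' and '2212', and one reading copy is produced per
# blueprint, with the ambiguous syllable scanned short or long.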
def scan_verses(self, plain_verses: List[str]):
base_readings = [Reading(tokens=tokenize(v)) for v in plain_verses]
verses = [
Verse(verse=v, readings=lemmatize(self.word_list, br))
for v, br in zip(plain_verses, base_readings)
]
for verse in verses:
for reading in verse.readings:
get_syllables(reading)