def get_syllables_for_accented_form(token):
    """Syllabify a token using its accented (macron-annotated) form.

    The accented form marks long vowels with ``_`` and short vowels
    with ``^``.  The form (plus clitic, if any) is split into
    vowel-group and consonant chunks; each vowel group opens a new
    syllable and following consonants attach to it.  Spans are
    verse-absolute, i.e. offset by ``token.span[0]``.

    :param token: token with a non-empty ``accented`` attribute
        (``clitic``, ``text`` and ``span`` are also read).
    :return: list of Syllable objects covering the accented form.
    """
    syllables = []
    # Capitalized tokens (proper names) also admit the Greek
    # diphthongs eu and yi; (?<!q) keeps the u of qu- from being read
    # as a vowel.
    regex = (
        r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
        if token.text[0].isupper()
        else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'
    )
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [
        chunk
        for chunk in re.split(regex, accented, flags=re.IGNORECASE)
        if chunk
    ]

    def vowel_length(chunk):
        # Long (2) if the chunk is a diphthong or carries a macron '_';
        # a bare vowel or a breve '^' is short (1).
        return 2 if len(chunk) > 1 and chunk[1] in 'aeiouy_' else 1

    syll_start = token.span[0]
    syll_text = ''
    syll_vowel_length = 1
    syll_has_vowel = False
    for chunk in chunks:
        if chunk[0] in 'aeiouy':
            if syll_has_vowel:
                # Syllable already has a vowel: flush it and begin a
                # new syllable with this vowel chunk.
                syllables.append(
                    Syllable(syllable=syll_text,
                             span=[syll_start,
                                   syll_start + len(syll_text)],
                             idx=None,
                             vowel_length=syll_vowel_length,
                             syllable_length=syll_vowel_length)
                )
                syll_start += len(syll_text)
                syll_text = chunk.rstrip('_^')
            else:
                # First vowel of the current syllable.
                syll_text += chunk.rstrip('_^')
                syll_has_vowel = True
            # FIX: recompute the vowel length for EVERY vowel chunk.
            # The original set it only for the first vowel of the word,
            # so every later syllable inherited a stale length (e.g.
            # the final long o_ of "amo_" came out short).
            syll_vowel_length = vowel_length(chunk)
        else:
            # Consonant chunk: attach to the current syllable.
            syll_text += chunk.rstrip('_^')

    if syll_text:
        # Flush the last syllable.
        syllables.append(
            Syllable(syllable=syll_text,
                     span=[syll_start, syll_start + len(syll_text)],
                     idx=None,
                     vowel_length=syll_vowel_length,
                     syllable_length=syll_vowel_length)
        )
    return syllables
def get_syllables_for_unknown_form(token):
    """Heuristically syllabify a token with no known accented form.

    Adapted from Jonathan's syllabification routine (TODO: add proper
    citation).  A handful of irregular words are handled by lookup;
    everything else is split into vowel/consonant chunks which are then
    grouped into syllables.  Syllables containing two adjacent vowels
    are treated as diphthongs and marked long.

    :param token: token to syllabify; ``token.text`` and ``token.span``
        are read.
    :return: list of Syllable objects with verse-absolute spans.
    """
    strng = token.text.lower()
    start = token.span[0]

    # Irregular words the generic chunking below would get wrong.
    monosyllables = {
        "cui": "cui", "cvi": "cui",
        "seu": "seu", "neu": "neu", "heu": "heu", "ceu": "ceu",
        "huic": "huic", "ei": "ei", "hei": "hei", "heus": "heus",
    }
    if strng in monosyllables:
        return [Syllable(monosyllables[strng], token.span)]

    bisyllables = {
        "cuiqve": ("cui", "qve"), "cviqve": ("cui", "qve"),
        "cuiqvam": ("cui", "qvam"), "cviqvam": ("cui", "qvam"),
        "proinde": ("proind", "e"),
        "necnon": ("nec", "non"),
    }
    if strng in bisyllables:
        first, second = bisyllables[strng]
        split = start + len(first)
        return [Syllable(first, [start, split]),
                Syllable(second, [split, split + len(second)])]
    # End special cases.

    # FIX: the original tested strng.isupper() AFTER lower-casing, so
    # the Greek-diphthong branch (eu, yi) was unreachable.  Check the
    # raw text's capitalization instead, consistent with
    # get_syllables_for_accented_form.
    if token.text[0].isupper():
        vowel_groups = "(ae|oe|au|eu|yi|[aeiouy])"
    else:
        vowel_groups = "(ae|au|oe|[aeiouy])"
    chunks = [chunk for chunk in re.split(vowel_groups, strng) if chunk]

    # Group the chunks into syllables.  Counter j is even while
    # consonants are being collected onto the previous syllable and odd
    # right after a vowel group was appended; fluff flags a word-initial
    # consonant cluster, which is glued onto the first vowel group.
    # FIX: 'y' counts as a vowel in the split patterns above, so the
    # consonant tests use [^aeiouy] instead of the original [^aeiou].
    groups = []
    j = -1
    fluff = 0
    for chunk in chunks:
        j += 1
        if j == 0:
            if re.match("[^aeiouy]", chunks[0]):
                fluff = 1
            else:
                j += 1
            groups.append(chunk)
        elif j == 1 and fluff == 1:
            groups[0] += chunks[1]
        elif j % 2 == 0:
            if re.match("[^aeiouy]", chunk):
                groups[-1] += chunk
            else:
                groups.append(chunk)
                j += 1
        else:
            groups.append(chunk)

    # FIX: spans started at 0 in the original; make them verse-absolute
    # (offset by token.span[0]) like the special cases above and like
    # get_syllables_for_accented_form, so that span matching in the
    # phenomenon annotators works.
    res = []
    pos = start
    for text in groups:
        res.append(Syllable(text, [pos, pos + len(text)]))
        pos += len(text)

    # Greek names in -oe / -oen ("Alcinoe"): the final o and e are
    # separate syllables, not a diphthong.
    if re.search("oen?$", strng) and token.text[0].isupper():
        # NOTE(review): like the original, this assumes the last
        # syllable is exactly "oe"/"oen"; a longer final syllable would
        # lose its onset here -- confirm against the corpus.
        last_span = res[-1].span
        res[-1] = Syllable("o", [last_span[0], last_span[0] + 1])
        # FIX: read the span end BEFORE replacing res[-1]; the original
        # read it afterwards, producing a zero-width span for the
        # appended syllable.
        if strng.endswith("n"):
            res.append(Syllable("en", [last_span[0] + 1, last_span[1]]))
        else:
            res.append(Syllable("e", [last_span[0] + 1, last_span[1]]))

    # Two adjacent vowels in one syllable form a diphthong: long.
    for syll in res:
        if re.search(r'[aeiuoy]{2}', syll.text):
            syll.vowel_length = 2
            syll.syllable_length = 2

    return res
def get_syllables_for_token(token: Token):
    """Return the syllables of *token*.

    Tokens with a known accented form are syllabified from that form;
    punctuation yields no syllables; anything else falls back to the
    heuristic for unknown forms.
    """
    if token.accented:
        return get_syllables_for_accented_form(token)
    if token.is_punct():
        return []
    return get_syllables_for_unknown_form(token)
def positional_lengthening(verse):
    """Mark positionally lengthened syllables in every reading.

    A vowel followed by a cluster of two or more consonants, or by x or
    z, makes its syllable long by position.  For each such spot in the
    verse text, the first syllable covering it in each reading receives
    ``syllable_length = 2`` and a 'positional lengthening' phenomenon
    recording the consonant cluster.
    """
    pattern = re.compile(
        r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
        flags=re.IGNORECASE
    )
    for match in pattern.finditer(verse.text):
        pos = match.start()
        for reading in verse.readings:
            # First syllable (in token order) whose span covers the
            # match position, if any.
            affected = next(
                (syll
                 for tok in reading.tokens
                 for syll in tok.syllables
                 if syll.span[0] <= pos < syll.span[1]),
                None,
            )
            if affected is None:
                continue
            affected.syllable_length = 2
            affected.phenomena['positional lengthening'] = Phenomenon(
                chars=match.group(1)
            )
""" positional_lengthening(verse) - + muta_cum_liquida(verse) new_readings = list() - + for reading in verse.readings: - syllables = [syllable for token in reading.tokens + syllables = [syllable for token in reading.tokens for syllable in token.syllables] abstract = str() mcl_count = 0 @@ -441,7 +466,9 @@ def parse_verse(verse): if syllable.id == len(syllables) - 1: abstract += '2' elif 'muta cum liquida' in syllable.phenomena: - if 'positional lengthening' in syllable.phenomena and ' ' in syllable.phenomena['positional lengthening'].chars: + if ('positional lengthening' in syllable.phenomena + and ' ' in (syllable.phenomena['positional lengthening'] + .chars)): abstract += '2' else: abstract += '{}' @@ -457,15 +484,15 @@ def parse_verse(verse): if mcl_count > 0: new_abstracts = list() - combinations = list(product(['1','2'],repeat=mcl_count)) + combinations = list(product(['1', '2'], repeat=mcl_count)) for combi in combinations: new_abstracts.append(abstract.format(*combi)) - + reading_copies = multiply_readings([reading], (mcl_count)*2) else: new_abstracts = [abstract] reading_copies = [reading] - + for i in range(len(new_abstracts)): blueprint = new_abstracts[i] new_reading = reading_copies[i] @@ -473,18 +500,18 @@ def parse_verse(verse): syll_id = 0 for token in new_reading.tokens: for s in token.syllables: - if blueprint[syll_id] == "1": + if blueprint[syll_id] == '1': s.syllable_length = 1 - if 'positional lengthening' in s.phenomena and 'muta cum liquida' in s.phenomena: - s.phenomena['positional lengthening'].overruled_by = 'muta cum liquida' - elif blueprint[syll_id] == "2": + if ('positional lengthening' in s.phenomena + and 'muta cum liquida' in s.phenomena): + (s.phenomena['positional lengthening'] + .overruled_by) = 'muta cum liquida' + elif blueprint[syll_id] == '2': s.syllable_length = 2 syll_id += 1 - + new_readings.append(copy.deepcopy(new_reading)) - #print("In: "+abstract) - #print("Out: "+"".join([str(s.syllable_length) for t in 
new_reading.tokens for s in t.syllables])) - + verse.readings = new_readings return verse