# Words whose syllabification is fixed by hand, keyed by the lowercased token
# text.  The "cv..." spellings deliberately map to syllable text "cui...",
# exactly as the original special cases did.
_KNOWN_SYLLABIFICATIONS = {
    "cui": ("cui",),
    "cvi": ("cui",),
    "cuiqve": ("cui", "qve"),
    "cviqve": ("cui", "qve"),
    "proinde": ("proind", "e"),
    "cuiqvam": ("cui", "qvam"),
    "cviqvam": ("cui", "qvam"),
    "seu": ("seu",),
    "neu": ("neu",),
    "heu": ("heu",),
    "huic": ("huic",),
    "ei": ("ei",),
    "hei": ("hei",),
    "ceu": ("ceu",),
    "heus": ("heus",),
}

# The capturing group makes re.split() keep the vowels/diphthongs as chunks.
_VOWEL_SPLIT_RE = re.compile(r"(ae|au|oe|[aeiouy])")


def _split_unknown_word(word: str) -> list:
    """Split a lowercased word into syllable strings with a vowel heuristic.

    The word is cut at every vowel or diphthong ("ae", "au", "oe").  A
    leading consonant cluster is joined with the chunk that follows it; every
    later consonant cluster is attached to the syllable before it.

    Returns a list of strings whose concatenation is ``word`` (empty list for
    an empty word).
    """
    chunks = [chunk for chunk in _VOWEL_SPLIT_RE.split(word) if chunk]
    if not chunks:
        return []

    parts = [chunks[0]]
    rest = chunks[1:]
    # Does the word begin with a vowel?  If not, the initial cluster is
    # glued to the next chunk so that it does not form a syllable of its own.
    if chunks[0][0] not in "aeiou" and rest:
        parts[0] += rest[0]
        rest = rest[1:]

    # Replaces the original parity counter: when True, a chunk that does not
    # start with one of "aeiou" is appended to the previous syllable; right
    # after such an attachment the next chunk always opens a new syllable.
    # NOTE: "y" is a split point above but counts as a non-vowel here; this
    # mirrors the original behaviour and is kept unchanged.
    may_attach = True
    for chunk in rest:
        if may_attach and chunk[0] not in "aeiou":
            parts[-1] += chunk
            may_attach = False
        else:
            parts.append(chunk)
            may_attach = True
    return parts


def get_unknown_syllables(token):
    """Guess the syllables of a token that has no known analysis.

    Adapted from code by Jonathan (TODO: insert proper citation here).

    The lowercased token text is first looked up in a small table of
    hand-written syllabifications; otherwise it is split heuristically at
    its vowels.

    :param token: object providing ``text`` (str) and ``span`` (indexable,
        ``span[0]`` being the token's start offset).
    :return: list of ``Syllable`` objects in textual order.  Spans are
        ``[start, end]`` offsets on the same scale as ``token.span``; a
        hand-listed one-syllable word reuses ``token.span`` itself.
    """
    word = token.text.lower()

    known = _KNOWN_SYLLABIFICATIONS.get(word)
    if known is not None and len(known) == 1:
        return [Syllable(known[0], token.span)]

    # NOTE(review): the original additionally had a branch guarded by
    # ``isupper()`` (extra diphthongs "eu"/"yi" and splitting a final
    # "oe"/"oen" into two syllables).  It tested the already lowercased
    # string, so it could not trigger for ordinary Latin text, and it has
    # been dropped.  If that handling is wanted it has to be reintroduced
    # with a check on the original token text.
    parts = known if known is not None else _split_unknown_word(word)

    syllables = []
    # Start at the token's own offset so that the spans agree with the ones
    # produced for the hand-listed words (the original started at 0 here).
    start = token.span[0]
    for part in parts:
        end = start + len(part)
        syllables.append(Syllable(part, [start, end]))
        start = end
    return syllables