Skip to content
Snippets Groups Projects
Commit d152f9e6 authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Add unknown word splitter

parent 7919f9e4
No related branches found
No related tags found
No related merge requests found
......@@ -166,6 +166,105 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
return readings
# Hand-written syllabifications for words the generic splitter gets wrong.
# Keys are lower-cased token texts, values are the syllable texts in order.
# The "cvi..." spellings are deliberately emitted with the text "cui".
_SPECIAL_SYLLABLES = {
    "cui": ("cui",),
    "cvi": ("cui",),
    "cuiqve": ("cui", "qve"),
    "cviqve": ("cui", "qve"),
    "proinde": ("proind", "e"),
    "cuiqvam": ("cui", "qvam"),
    "cviqvam": ("cui", "qvam"),
    "seu": ("seu",),
    "neu": ("neu",),
    "heu": ("heu",),
    "huic": ("huic",),
    "ei": ("ei",),
    "hei": ("hei",),
    "ceu": ("ceu",),
    "heus": ("heus",),
}

# Splits a word into alternating non-vowel / vowel chunks.  The capturing
# group makes re.split keep the vowel chunks; "ae", "au" and "oe" are kept
# together as one chunk.
_VOWEL_SPLIT = re.compile(r"(ae|au|oe|[aeiouy])")

# Matches a chunk that does NOT start with one of a, e, i, o, u.
# NOTE(review): "y" is split off as a vowel above but counts as a
# non-vowel here; kept as in the original, confirm whether intended.
_NON_VOWEL_START = re.compile(r"[^aeiou]")


def _group_chunks(chunks):
    """Merge alternating non-vowel/vowel chunks into syllable texts."""
    groups = []
    # Even positions are slots where a non-vowel chunk is expected, odd
    # positions are slots where a vowel chunk is expected.  The counter is
    # bumped an extra step whenever a vowel turns up in a non-vowel slot
    # (word starts with a vowel, or two vowel chunks are adjacent).
    position = -1
    leading_consonant = False
    for chunk in chunks:
        position += 1
        if position == 0:
            groups.append(chunk)
            if _NON_VOWEL_START.match(chunk):
                leading_consonant = True
            else:
                position += 1
        elif position == 1 and leading_consonant:
            # First vowel chunk joins the word-initial consonants.
            groups[0] += chunk
        elif position % 2 == 1:
            # Vowel slot: start a new syllable.
            groups.append(chunk)
        elif _NON_VOWEL_START.match(chunk):
            # Consonants close the syllable that precedes them.
            groups[-1] += chunk
        else:
            # A vowel chunk directly after another one: new syllable.
            groups.append(chunk)
            position += 1
    return groups


def _spanned_syllables(texts, start):
    """Build consecutive Syllable objects for ``texts`` beginning at ``start``."""
    syllables = []
    offset = start
    for text in texts:
        end = offset + len(text)
        syllables.append(Syllable(text, [offset, end]))
        offset = end
    return syllables


def get_unknown_syllables(token: "Token") -> "List[Syllable]":
    """Split a token into syllables using spelling rules only.

    Adapted from code by Jonathan (proper citation still to be added).

    The token text is lower-cased.  A small table of exceptional words is
    consulted first; every other word is cut into non-vowel and vowel chunks
    which are then merged so that each syllable is a vowel chunk plus the
    consonants following it (the first syllable also takes any word-initial
    consonants).

    Args:
        token: Object providing ``text`` (the word) and ``span`` (an
            indexable pair whose first item is the token's start offset).

    Returns:
        A list of ``Syllable`` objects built as ``Syllable(text, span)``.
        Spans are expressed in the same coordinates as ``token.span``.  A
        word without any vowel yields a single syllable; an empty text
        yields an empty list.
    """
    word = token.text.lower()

    special = _SPECIAL_SYLLABLES.get(word)
    if special is not None:
        if len(special) == 1:
            # Monosyllables take over the token's own span object.
            return [Syllable(special[0], token.span)]
        return _spanned_syllables(special, token.span[0])

    # NOTE(review): the original also had branches guarded by
    # ``strng.isupper()`` (extra "eu"/"yi" diphthongs and a split of a
    # word-final "oe"/"oen").  They were tested after lower-casing, so they
    # never influenced the result, and have been left out.  If special
    # handling of upper-case tokens is wanted, it has to be decided on
    # ``token.text`` before lower-casing.
    chunks = [chunk for chunk in _VOWEL_SPLIT.split(word) if chunk != ""]

    # Spans start at the token's own offset so that they agree with the
    # spans produced for the special cases above (the original counted
    # from 0 here).
    return _spanned_syllables(_group_chunks(chunks), token.span[0])
def get_syllables_for_token(token: Token):
syllables = []
if token.accented:
......@@ -222,8 +321,7 @@ def get_syllables_for_token(token: Token):
syllables.append(syll)
else:
if not token.is_punct():
syllables = [Syllable(syllable=token.text, span=token.span, idx=None,
vowel_length=1, syllable_length=1)]
syllables = get_unknown_syllables(token)
return syllables
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment