Commit 6f84f3e0 authored by Simon Will

Clean up code in scanner.py a bit

parent 42b7ffc6
@@ -189,171 +189,188 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:
     return readings
 
 
-def get_unknown_syllables(token):
+def get_syllables_for_accented_form(token):
+    syllables = []
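+    # Split the accented form into vowel/diphthong chunks (each optionally
+    # carrying a length mark '_' or '^') and consonant chunks. A 'u' directly
+    # after 'q' is not treated as a vowel, and words with an initial capital
+    # also treat 'eu' and 'yi' as diphthongs.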
+    regex = (
+        r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
+        if token.text[0].isupper()
+        else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'
+    )
+    accented = (token.accented + token.clitic
+                if token.clitic
+                else token.accented)
+    chunks = [
+        chunk
+        for chunk in re.split(regex, accented, flags=re.IGNORECASE)
+        if chunk
+    ]
+    syll_start = token.span[0]
+    syll_text = ''
+    syll_vowel_length = 1
+    syll_has_vowel = False
+    for i, c in enumerate(chunks):
+        if c[0] in 'aeiouy':
+            if syll_has_vowel:
+                # Syllable already has a vowel.
+                # Add the current syllable and begin a new one.
+                syll = Syllable(syllable=syll_text,
+                                span=[syll_start,
+                                      syll_start + len(syll_text)],
+                                idx=None,
+                                vowel_length=syll_vowel_length,
+                                syllable_length=syll_vowel_length)
+                syllables.append(syll)
+                # Begin info for new syllable.
+                syll_start = syll_start + len(syll_text)
+                syll_text = c.rstrip('_^')
+            else:
+                # Syllable has no vowel yet.
+                syll_text += c.rstrip('_^')
+            syll_has_vowel = True
+            syll_vowel_length = (
+                2 if len(c) > 1 and c[1] in 'aeiouy_' else 1
+            )
+        else:
+            syll_text += c.rstrip('_^')
+    if syll_text:
+        # Add the last syllable.
+        syll = Syllable(syllable=syll_text,
+                        span=[syll_start, syll_start + len(syll_text)],
+                        idx=None,
+                        vowel_length=syll_vowel_length,
+                        syllable_length=syll_vowel_length)
+        syllables.append(syll)
+    return syllables
+
+
+def get_syllables_for_unknown_form(token):
     """Stolen from Jonathan (insert proper citation here)."""
-    strng=token.text
-    strng=strng.lower()
-    # special cases
-    if strng=="cui" or strng=="cvi":
-        syll=Syllable("cui", token.span)
+    strng = token.text
+    strng = strng.lower()
+    # Special cases
+    if strng == "cui" or strng == "cvi":
+        syll = Syllable("cui", token.span)
         return [syll]
-    if strng=="cuiqve" or strng=="cviqve":
-        syll1=Syllable("cui", [token.span[0]+0,token.span[0]+3])
-        syll2=Syllable("qve", [token.span[0]+3,token.span[0]+6])
+    if strng == "cuiqve" or strng == "cviqve":
+        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
+        syll2 = Syllable("qve", [token.span[0] + 3, token.span[0] + 6])
         return [syll1, syll2]
-    if strng=="proinde":
-        syll1=Syllable("proind", [token.span[0],token.span[0]+6])
-        syll2=Syllable("e", [token.span[0]+6,token.span[0]+7])
+    if strng == "proinde":
+        syll1 = Syllable("proind", [token.span[0], token.span[0] + 6])
+        syll2 = Syllable("e", [token.span[0] + 6, token.span[0] + 7])
         return [syll1, syll2]
-    if strng=="cuiqvam" or strng=="cviqvam":
-        syll1=Syllable("cui", [token.span[0]+0,token.span[0]+3])
-        syll2=Syllable("qvam", [token.span[0]+3,token.span[0]+7])
+    if strng == "cuiqvam" or strng == "cviqvam":
+        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
+        syll2 = Syllable("qvam", [token.span[0] + 3, token.span[0] + 7])
        return [syll1, syll2]
-    if strng=="necnon":
-        syll1=Syllable("nec", [token.span[0]+0,token.span[0]+3])
-        syll2=Syllable("non", [token.span[0]+3,token.span[0]+6])
+    if strng == "necnon":
+        syll1 = Syllable("nec", [token.span[0] + 0, token.span[0] + 3])
+        syll2 = Syllable("non", [token.span[0] + 3, token.span[0] + 6])
         return [syll1, syll2]
-    if strng=="seu":
-        syll=Syllable("seu", token.span)
+    if strng == "seu":
+        syll = Syllable("seu", token.span)
         return [syll]
-    if strng=="neu":
-        syll=Syllable("neu", token.span)
+    if strng == "neu":
+        syll = Syllable("neu", token.span)
         return [syll]
-    if strng=="heu":
-        syll=Syllable("heu", token.span)
+    if strng == "heu":
+        syll = Syllable("heu", token.span)
         return [syll]
-    if strng=="huic":
-        syll=Syllable("huic", token.span)
+    if strng == "huic":
+        syll = Syllable("huic", token.span)
         return [syll]
-    if strng=="ei":
-        syll=Syllable("ei", token.span)
+    if strng == "ei":
+        syll = Syllable("ei", token.span)
         return [syll]
-    if strng=="hei":
-        syll=Syllable("hei", token.span)
+    if strng == "hei":
+        syll = Syllable("hei", token.span)
         return [syll]
-    if strng=="ceu":
-        syll=Syllable("ceu", token.span)
+    if strng == "ceu":
+        syll = Syllable("ceu", token.span)
         return [syll]
-    if strng=="heus":
-        syll=Syllable("heus", token.span)
+    if strng == "heus":
+        syll = Syllable("heus", token.span)
         return [syll]
-    #end special cases
+    # End special cases
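+    # Split the remaining word into vowel/diphthong and consonant chunks;
+    # all-caps words additionally treat 'eu' and 'yi' as diphthongs.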
     if strng.isupper():
-        chunks=[chunk for chunk in re.split("(ae|oe|au|eu|yi|[aeiouy])", strng.lower()) if chunk!=""]
+        chunks = [
+            chunk
+            for chunk
+            in re.split("(ae|oe|au|eu|yi|[aeiouy])", strng.lower())
+            if chunk != ""
+        ]
     else:
-        chunks=[chunk for chunk in re.split("(ae|au|oe|[aeiouy])", strng.lower()) if chunk!=""]
-    y=[]
+        chunks = [
+            chunk
+            for chunk
+            in re.split("(ae|au|oe|[aeiouy])", strng.lower())
+            if chunk != ""
+        ]
+    y = []
     # Counter j: for even j, consonant chunks are appended to y;
     # for odd j, vowel chunks are appended to the consonants.
     # Watch out: does the word start with a vowel?
-    j=-1
-    fluff=0
+    j = -1
+    fluff = 0
     for ch in chunks:
-        j+=1
-        if j==0:
+        j += 1
+        if j == 0:
             if re.match("[^aeiou]", chunks[0]):
-                fluff=1
+                fluff = 1
                 y.append(ch)
             else:
                 y.append(ch)
-                j+=1
-        elif j==1 and fluff==1:
-            y[0]+=chunks[1]
+                j += 1
+        elif j == 1 and fluff == 1:
+            y[0] += chunks[1]
         else:
-            if j%2==0:
+            if j % 2 == 0:
                 if re.match("[^aeiou]", ch):
-                    y[-1]+=ch
+                    y[-1] += ch
                 else:
                     y.append(ch)
-                    j+=1
+                    j += 1
             else:
                 y.append(ch)
     res = list()
     length = 0
     for x in y:
         res.append(Syllable(x, [length, length+len(x)]))
-        length+=(len(x))
+        length += (len(x))
     # special cases again
     if re.search("oen?$", strng) and strng.isupper():
-        res[-1]=Syllable("o", [res[-1].span[0], res[-1].span[0]+1])
+        res[-1] = Syllable("o", [res[-1].span[0], res[-1].span[0]+1])
         if strng.endswith("n"):
-            res.append(Syllable("en", [res[-1].span[0]+1, res[-1].span[1]]))
+            res.append(Syllable("en", [res[-1].span[0] + 1, res[-1].span[1]]))
         else:
-            res.append(Syllable("e",[res[-1].span[0]+1, res[-1].span[1]]))
+            res.append(Syllable("e", [res[-1].span[0] + 1, res[-1].span[1]]))
     for syll in res:
         if re.search(r'[aeiuoy]{2}', syll.text):
             syll.vowel_length = 2
             syll.syllable_length = 2
     return res
 
 
 def get_syllables_for_token(token: Token):
     syllables = []
     if token.accented:
-        regex = (
-            r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
-            if token.text[0].isupper()
-            else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'
-        )
-        accented = (token.accented + token.clitic
-                    if token.clitic
-                    else token.accented)
-        chunks = [
-            chunk
-            for chunk in re.split(regex, accented, flags=re.IGNORECASE)
-            if chunk
-        ]
-        syll_start = token.span[0]
-        syll_text = ''
-        syll_vowel_length = 1
-        syll_has_vowel = False
-        for i, c in enumerate(chunks):
-            if c[0] in 'aeiouy':
-                if syll_has_vowel:
-                    # Syllable already has a vowel.
-                    # Add the current syllable and begin a new one.
-                    syll = Syllable(syllable=syll_text,
-                                    span=[syll_start,
-                                          syll_start + len(syll_text)],
-                                    idx=None,
-                                    vowel_length=syll_vowel_length,
-                                    syllable_length=syll_vowel_length)
-                    syllables.append(syll)
-                    # Begin info for new syllable.
-                    syll_start = syll_start + len(syll_text)
-                    syll_text = c.rstrip('_^')
-                else:
-                    # Syllable has no vowel yet.
-                    syll_text += c.rstrip('_^')
-                syll_has_vowel = True
-                syll_vowel_length = (
-                    2 if len(c) > 1 and c[1] in 'aeiouy_' else 1
-                )
-            else:
-                syll_text += c.rstrip('_^')
-        if syll_text:
-            # Add the last syllable.
-            syll = Syllable(syllable=syll_text,
-                            span=[syll_start, syll_start + len(syll_text)],
-                            idx=None,
-                            vowel_length=syll_vowel_length,
-                            syllable_length=syll_vowel_length)
-            syllables.append(syll)
+        syllables = get_syllables_for_accented_form(token)
     else:
         if not token.is_punct():
-            syllables = get_unknown_syllables(token)
+            syllables = get_syllables_for_unknown_form(token)
     return syllables
@@ -368,8 +385,10 @@ def get_syllables(reading):
 def muta_cum_liquida(verse):
-    mcl_regex = re.compile(r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)', flags=re.IGNORECASE)
+    mcl_regex = re.compile(
+        r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)',
+        flags=re.IGNORECASE
+    )
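+    # A vowel before a stop (or qu/qv) followed by a liquid or nasal may scan
+    # either short or long, so only record the phenomenon here and let
+    # parse_verse try both lengths.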
     if re.search(mcl_regex, verse.text):
         matches = re.finditer(mcl_regex, verse.text)
@@ -378,24 +397,29 @@ def muta_cum_liquida(verse):
                 for token in reading.tokens:
                     for syllable in token.syllables:
-                        if syllable.span[0]<= match.start() < syllable.span[1]:
-                            syllable.phenomena['muta cum liquida'] = Phenomenon(chars=match.group(1))
+                        if syllable.span[0] <= match.start() < syllable.span[1]:
+                            mcl = Phenomenon(chars=match.group(1))
+                            syllable.phenomena['muta cum liquida'] = mcl
 
 
 def positional_lengthening(verse):
-    pl_regex = re.compile(r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])', flags=re.IGNORECASE)
-    if re.search(pl_regex, verse.text):
-        matches = re.finditer(pl_regex, verse.text)
-        for match in matches:
-            for reading in verse.readings:
-                for token in reading.tokens:
-                    for syllable in token.syllables:
-                        if syllable.span[0]<= match.start() < syllable.span[1]:
-                            syllable.syllable_length = 2
-                            syllable.phenomena['positional lengthening'] = Phenomenon(chars=match.group(1))
+    pl_regex = re.compile(
+        r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
+        flags=re.IGNORECASE
+    )
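+    # A vowel followed by two or more consonants (a trailing 'h' is ignored
+    # and 'qu' counts as a single consonant), or by 'x' or 'z', is long by
+    # position; the cluster may span a word boundary.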
+    for match in re.finditer(pl_regex, verse.text):
+        for reading in verse.readings:
+            for token in reading.tokens:
+                break_ = False
+                for syllable in token.syllables:
+                    if syllable.span[0] <= match.start() < syllable.span[1]:
+                        syllable.syllable_length = 2
+                        pl = Phenomenon(chars=match.group(1))
+                        syllable.phenomena['positional lengthening'] = pl
+                        break_ = True
+                        break
+                if break_:
+                    break
 
 
 def make_elisions(verse):
@@ -417,23 +441,24 @@ def make_elisions(verse):
             if m:
                 if re.search(r'^h?[aeiouy]', next_syllable.text):
                     # Elision!
-                    this_syllable.phenomena['elision'] = Phenomenon(omitted=m.group())
+                    elision = Phenomenon(omitted=m.group())
+                    this_syllable.phenomena['elision'] = elision
                     this_syllable.syllable_length = 0
     return verse
 
 
 def parse_verse(verse):
"""Annotates syllable lengths based on positional_lengthening and muta cum liquida
"""Annotate syllable lengths based on positional_lengthening and muta
cum liquida.
"""
     positional_lengthening(verse)
     muta_cum_liquida(verse)
     new_readings = list()
     for reading in verse.readings:
-        syllables = [syllable for token in reading.tokens
+        syllables = [syllable for token in reading.tokens
                      for syllable in token.syllables]
         abstract = str()
         mcl_count = 0
@@ -441,7 +466,9 @@ def parse_verse(verse):
             if syllable.id == len(syllables) - 1:
                 abstract += '2'
             elif 'muta cum liquida' in syllable.phenomena:
-                if 'positional lengthening' in syllable.phenomena and ' ' in syllable.phenomena['positional lengthening'].chars:
+                if ('positional lengthening' in syllable.phenomena
+                        and ' ' in (syllable.phenomena['positional lengthening']
+                                    .chars)):
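+                    # A lengthening cluster that spans a word boundary always
+                    # makes the syllable long; otherwise leave a '{}'
+                    # placeholder so both scansions can be tried below.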
                     abstract += '2'
                 else:
                     abstract += '{}'
@@ -457,15 +484,15 @@ def parse_verse(verse):
         if mcl_count > 0:
             new_abstracts = list()
-            combinations = list(product(['1','2'],repeat=mcl_count))
+            combinations = list(product(['1', '2'], repeat=mcl_count))
             for combi in combinations:
                 new_abstracts.append(abstract.format(*combi))
             reading_copies = multiply_readings([reading], (mcl_count)*2)
         else:
             new_abstracts = [abstract]
             reading_copies = [reading]
         for i in range(len(new_abstracts)):
             blueprint = new_abstracts[i]
             new_reading = reading_copies[i]
@@ -473,18 +500,18 @@
             syll_id = 0
             for token in new_reading.tokens:
                 for s in token.syllables:
-                    if blueprint[syll_id] == "1":
+                    if blueprint[syll_id] == '1':
                         s.syllable_length = 1
-                        if 'positional lengthening' in s.phenomena and 'muta cum liquida' in s.phenomena:
-                            s.phenomena['positional lengthening'].overruled_by = 'muta cum liquida'
-                    elif blueprint[syll_id] == "2":
+                        if ('positional lengthening' in s.phenomena
+                                and 'muta cum liquida' in s.phenomena):
+                            (s.phenomena['positional lengthening']
+                             .overruled_by) = 'muta cum liquida'
+                    elif blueprint[syll_id] == '2':
                         s.syllable_length = 2
                     syll_id += 1
             new_readings.append(copy.deepcopy(new_reading))
             #print("In: "+abstract)
             #print("Out: "+"".join([str(s.syllable_length) for t in new_reading.tokens for s in t.syllables]))
     verse.readings = new_readings
     return verse