Messerschleifer / Allzweckmesser · Commits · 6f84f3e0

Commit 6f84f3e0, authored 6 years ago by Simon Will

Clean up code in scanner.py a bit

Parent: 42b7ffc6
No related branches, tags, or merge requests found.

Showing 1 changed file: allzweckmesser/scanner.py (+171 −144)
@@ -189,171 +189,188 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:

    return readings


def get_syllables_for_accented_form(token):
    syllables = []
    regex = (r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
             if token.text[0].isupper()
             else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)')
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [chunk
              for chunk in re.split(regex, accented, flags=re.IGNORECASE)
              if chunk]
    syll_start = token.span[0]
    syll_text = ''
    syll_vowel_length = 1
    syll_has_vowel = False
    for i, c in enumerate(chunks):
        if c[0] in 'aeiouy':
            if syll_has_vowel:
                # Syllable already has a vowel.
                # Add the current syllable and begin a new one.
                syll = Syllable(syllable=syll_text,
                                span=[syll_start, syll_start + len(syll_text)],
                                idx=None,
                                vowel_length=syll_vowel_length,
                                syllable_length=syll_vowel_length)
                syllables.append(syll)
                # Begin info for new syllable.
                syll_start = syll_start + len(syll_text)
                syll_text = c.rstrip('_^')
            else:
                # Syllable has no vowel yet.
                syll_text += c.rstrip('_^')
            syll_has_vowel = True
            syll_vowel_length = (2
                                 if len(c) > 1 and c[1] in 'aeiouy_'
                                 else 1)
        else:
            syll_text += c.rstrip('_^')
    if syll_text:
        # Add the last syllable.
        syll = Syllable(syllable=syll_text,
                        span=[syll_start, syll_start + len(syll_text)],
                        idx=None,
                        vowel_length=syll_vowel_length,
                        syllable_length=syll_vowel_length)
        syllables.append(syll)
    return syllables


def get_syllables_for_unknown_form(token):
    """Stolen from Jonathan (insert proper citation here)."""
    strng = token.text
    strng = strng.lower()
    # Special cases
    if strng == "cui" or strng == "cvi":
        syll = Syllable("cui", token.span)
        return [syll]
    if strng == "cuiqve" or strng == "cviqve":
        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("qve", [token.span[0] + 3, token.span[0] + 6])
        return [syll1, syll2]
    if strng == "proinde":
        syll1 = Syllable("proind", [token.span[0], token.span[0] + 6])
        syll2 = Syllable("e", [token.span[0] + 6, token.span[0] + 7])
        return [syll1, syll2]
    if strng == "cuiqvam" or strng == "cviqvam":
        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("qvam", [token.span[0] + 3, token.span[0] + 7])
        return [syll1, syll2]
    if strng == "necnon":
        syll1 = Syllable("nec", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("non", [token.span[0] + 3, token.span[0] + 6])
        return [syll1, syll2]
    if strng == "seu":
        syll = Syllable("seu", token.span)
        return [syll]
    if strng == "neu":
        syll = Syllable("neu", token.span)
        return [syll]
    if strng == "heu":
        syll = Syllable("heu", token.span)
        return [syll]
    if strng == "huic":
        syll = Syllable("huic", token.span)
        return [syll]
    if strng == "ei":
        syll = Syllable("ei", token.span)
        return [syll]
    if strng == "hei":
        syll = Syllable("hei", token.span)
        return [syll]
    if strng == "ceu":
        syll = Syllable("ceu", token.span)
        return [syll]
    if strng == "heus":
        syll = Syllable("heus", token.span)
        return [syll]
    # End special cases
    if strng.isupper():
        chunks = [chunk
                  for chunk in re.split("(ae|oe|au|eu|yi|[aeiouy])",
                                        strng.lower())
                  if chunk != ""]
    else:
        chunks = [chunk
                  for chunk in re.split("(ae|au|oe|[aeiouy])",
                                        strng.lower())
                  if chunk != ""]
    y = []
    # Counter j. Even j: consonants are appended to y;
    # odd j: vowels are appended to the consonants.
    # Mind whether the word starts with a vowel.
    j = -1
    fluff = 0
    for ch in chunks:
        j += 1
        if j == 0:
            if re.match("[^aeiou]", chunks[0]):
                fluff = 1
                y.append(ch)
            else:
                y.append(ch)
                j += 1
        elif j == 1 and fluff == 1:
            y[0] += chunks[1]
            j += 1
        else:
            if j % 2 == 0:
                if re.match("[^aeiou]", ch):
                    y[-1] += ch
                else:
                    y.append(ch)
                    j += 1
            else:
                y.append(ch)
    res = list()
    length = 0
    for x in y:
        res.append(Syllable(x, [length, length + len(x)]))
        length += len(x)
    # special cases again
    if re.search("oen?$", strng) and strng.isupper():
        res[-1] = Syllable("o", [res[-1].span[0], res[-1].span[0] + 1])
        if strng.endswith("n"):
            res.append(Syllable("en",
                                [res[-1].span[0] + 1, res[-1].span[1]]))
        else:
            res.append(Syllable("e",
                                [res[-1].span[0] + 1, res[-1].span[1]]))
    for syll in res:
        if re.search(r'[aeiuoy]{2}', syll.text):
            syll.vowel_length = 2
            syll.syllable_length = 2
    return res


def get_syllables_for_token(token: Token):
    syllables = []
    if token.accented:
        syllables = get_syllables_for_accented_form(token)
    else:
        if not token.is_punct():
            syllables = get_syllables_for_unknown_form(token)
    return syllables
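For orientation, here is a minimal standalone sketch of the vowel-group split that get_syllables_for_accented_form relies on. It uses only the standard re module; the word 'si_dera' and the printed chunk list are illustrative, and the real Token/Syllable classes from scanner.py are not reproduced here.

import re

# Lowercase-branch pattern from the hunk above: vowel groups (including
# diphthongs), optionally followed by a macron '_' or breve '^' marker.
regex = r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'

accented = 'si_dera'   # hypothetical accented form with one marked long vowel
chunks = [chunk
          for chunk in re.split(regex, accented, flags=re.IGNORECASE)
          if chunk]
print(chunks)          # ['s', 'i_', 'd', 'e', 'r', 'a']

Consonant chunks then accumulate onto the current syllable until the next vowel group starts a new one, which is exactly the loop shown above.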
@@ -368,8 +385,10 @@ def get_syllables(reading):

def muta_cum_liquida(verse):
    mcl_regex = re.compile(
        r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)',
        flags=re.IGNORECASE
    )
    if re.search(mcl_regex, verse.text):
        matches = re.finditer(mcl_regex, verse.text)
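A small, self-contained illustration of what the muta cum liquida pattern picks up; the verse text is invented and only the standard re module is used.

import re

mcl_regex = re.compile(
    r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)',
    flags=re.IGNORECASE
)

# A stop followed by a liquid after a vowel, as in 'patris' and 'agri'.
for m in mcl_regex.finditer('et patris agri'):
    print(m.start(), m.group(1))   # prints "4 tr" and "10 gr"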
@@ -378,24 +397,29 @@ def muta_cum_liquida(verse):

                for token in reading.tokens:
                    for syllable in token.syllables:
                        if syllable.span[0] <= match.start() < syllable.span[1]:
                            mcl = Phenomenon(chars=match.group(1))
                            syllable.phenomena['muta cum liquida'] = mcl


def positional_lengthening(verse):
    pl_regex = re.compile(
        r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
        flags=re.IGNORECASE
    )
    for match in re.finditer(pl_regex, verse.text):
        for reading in verse.readings:
            for token in reading.tokens:
                break_ = False
                for syllable in token.syllables:
                    if syllable.span[0] <= match.start() < syllable.span[1]:
                        syllable.syllable_length = 2
                        pl = Phenomenon(chars=match.group(1))
                        syllable.phenomena['positional lengthening'] = pl
                        break_ = True
                        break
                if break_:
                    break


def make_elisions(verse):
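Similarly, a hedged sketch of the positional lengthening pattern on invented input: a vowel followed by two or more consonants, or by x/z, triggers a match.

import re

pl_regex = re.compile(
    r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
    flags=re.IGNORECASE
)

for text in ('est', 'rex'):
    m = pl_regex.search(text)
    print(text, m.group(1) if m else None)   # prints "est st" and "rex x"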
@@ -417,23 +441,24 @@ def make_elisions(verse):

                if m:
                    if re.search(r'^h?[aeiouy]', next_syllable.text):
                        # Elision!
                        elision = Phenomenon(omitted=m.group())
                        this_syllable.phenomena['elision'] = elision
                        this_syllable.syllable_length = 0
    return verse


def parse_verse(verse):
    """Annotate syllable lengths based on positional_lengthening and muta
    cum liquida.
    """
    positional_lengthening(verse)
    muta_cum_liquida(verse)
    new_readings = list()
    for reading in verse.readings:
        syllables = [syllable for token in reading.tokens
                     for syllable in token.syllables]
        abstract = str()
        mcl_count = 0
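The elision test in this hunk only checks whether the following syllable starts with an (optionally h-preceded) vowel; m comes from context elided above and stands for some earlier match. A tiny sketch with a plain string standing in for the syllable object:

import re

next_syllable_text = 'ho'   # hypothetical text of the following syllable
if re.search(r'^h?[aeiouy]', next_syllable_text):
    print('elision')        # the next syllable opens with h + vowel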
@@ -441,7 +466,9 @@ def parse_verse(verse):

            if syllable.id == len(syllables) - 1:
                abstract += '2'
            elif 'muta cum liquida' in syllable.phenomena:
                if ('positional lengthening' in syllable.phenomena
                        and ' ' in (syllable.phenomena
                                    ['positional lengthening'].chars)):
                    abstract += '2'
                else:
                    abstract += '{}'
@@ -457,15 +484,15 @@ def parse_verse(verse):

        if mcl_count > 0:
            new_abstracts = list()
            combinations = list(product(['1', '2'], repeat=mcl_count))
            for combi in combinations:
                new_abstracts.append(abstract.format(*combi))
            reading_copies = multiply_readings([reading], (mcl_count) * 2)
        else:
            new_abstracts = [abstract]
            reading_copies = [reading]
        for i in range(len(new_abstracts)):
            blueprint = new_abstracts[i]
            new_reading = reading_copies[i]
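How the product/format expansion behaves, as a standalone sketch; the abstract string and counts are invented, and multiply_readings itself is not shown in this diff.

from itertools import product

abstract = '2{}11{}2'   # hypothetical blueprint with two ambiguous syllables
mcl_count = 2
combinations = list(product(['1', '2'], repeat=mcl_count))
new_abstracts = [abstract.format(*combi) for combi in combinations]
print(new_abstracts)    # ['211112', '211122', '221112', '221122']

Each '{}' marks a muta cum liquida syllable whose length is ambiguous, so every combination of '1' and '2' yields one candidate scansion blueprint.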
@@ -473,18 +500,18 @@ def parse_verse(verse):

            syll_id = 0
            for token in new_reading.tokens:
                for s in token.syllables:
                    if blueprint[syll_id] == '1':
                        s.syllable_length = 1
                        if ('positional lengthening' in s.phenomena
                                and 'muta cum liquida' in s.phenomena):
                            (s.phenomena['positional lengthening']
                             .overruled_by) = 'muta cum liquida'
                    elif blueprint[syll_id] == '2':
                        s.syllable_length = 2
                    syll_id += 1
            new_readings.append(copy.deepcopy(new_reading))
            #print("In: "+abstract)
            #print("Out: "+"".join([str(s.syllable_length) for t in new_reading.tokens for s in t.syllables]))
    verse.readings = new_readings
    return verse