Messerschleifer / Allzweckmesser · Commits · 6f84f3e0

Commit 6f84f3e0, authored 6 years ago by Simon Will

Clean up code in scanner.py a bit

Parent: 42b7ffc6
No related branches, tags, or merge requests found.

Showing 1 changed file: allzweckmesser/scanner.py (+171 −144)
@@ -189,171 +189,188 @@ def lemmatize(word_list: WordList, reading: Reading) -> List[Reading]:

    return readings


def get_syllables_for_accented_form(token):
    syllables = []
    regex = (r'((?<!q)(?:ua|ue|ae|oe|au|eu|yi|[aeiouy])[_^]?)'
             if token.text[0].isupper()
             else r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)')
    accented = (token.accented + token.clitic
                if token.clitic
                else token.accented)
    chunks = [chunk
              for chunk in re.split(regex, accented, flags=re.IGNORECASE)
              if chunk]
    syll_start = token.span[0]
    syll_text = ''
    syll_vowel_length = 1
    syll_has_vowel = False
    for i, c in enumerate(chunks):
        if c[0] in 'aeiouy':
            if syll_has_vowel:
                # Syllable already has a vowel.
                # Add the current syllable and begin a new one.
                syll = Syllable(syllable=syll_text,
                                span=[syll_start, syll_start + len(syll_text)],
                                idx=None,
                                vowel_length=syll_vowel_length,
                                syllable_length=syll_vowel_length)
                syllables.append(syll)
                # Begin info for new syllable.
                syll_start = syll_start + len(syll_text)
                syll_text = c.rstrip('_^')
            else:
                # Syllable has no vowel yet.
                syll_text += c.rstrip('_^')
            syll_has_vowel = True
            syll_vowel_length = (2
                                 if len(c) > 1 and c[1] in 'aeiouy_'
                                 else 1)
        else:
            syll_text += c.rstrip('_^')
    if syll_text:
        # Add the last syllable.
        syll = Syllable(syllable=syll_text,
                        span=[syll_start, syll_start + len(syll_text)],
                        idx=None,
                        vowel_length=syll_vowel_length,
                        syllable_length=syll_vowel_length)
        syllables.append(syll)
    return syllables


def get_syllables_for_unknown_form(token):
    """Stolen from Jonathan (insert proper citation here)."""
    strng = token.text
    strng = strng.lower()
    # Special cases
    if strng == "cui" or strng == "cvi":
        syll = Syllable("cui", token.span)
        return [syll]
    if strng == "cuiqve" or strng == "cviqve":
        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("qve", [token.span[0] + 3, token.span[0] + 6])
        return [syll1, syll2]
    if strng == "proinde":
        syll1 = Syllable("proind", [token.span[0], token.span[0] + 6])
        syll2 = Syllable("e", [token.span[0] + 6, token.span[0] + 7])
        return [syll1, syll2]
    if strng == "cuiqvam" or strng == "cviqvam":
        syll1 = Syllable("cui", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("qvam", [token.span[0] + 3, token.span[0] + 7])
        return [syll1, syll2]
    if strng == "necnon":
        syll1 = Syllable("nec", [token.span[0] + 0, token.span[0] + 3])
        syll2 = Syllable("non", [token.span[0] + 3, token.span[0] + 6])
        return [syll1, syll2]
    if strng == "seu":
        syll = Syllable("seu", token.span)
        return [syll]
    if strng == "neu":
        syll = Syllable("neu", token.span)
        return [syll]
    if strng == "heu":
        syll = Syllable("heu", token.span)
        return [syll]
    if strng == "huic":
        syll = Syllable("huic", token.span)
        return [syll]
    if strng == "ei":
        syll = Syllable("ei", token.span)
        return [syll]
    if strng == "hei":
        syll = Syllable("hei", token.span)
        return [syll]
    if strng == "ceu":
        syll = Syllable("ceu", token.span)
        return [syll]
    if strng == "heus":
        syll = Syllable("heus", token.span)
        return [syll]
    # End special cases
    if strng.isupper():
        chunks = [chunk
                  for chunk in re.split("(ae|oe|au|eu|yi|[aeiouy])",
                                        strng.lower())
                  if chunk != ""]
    else:
        chunks = [chunk
                  for chunk in re.split("(ae|au|oe|[aeiouy])",
                                        strng.lower())
                  if chunk != ""]
    y = []
    # Counter j. Even j: consonants are appended to y;
    # odd j: vowels are appended to the consonants.
    # Mind whether the word starts with a vowel.
    j = -1
    fluff = 0
    for ch in chunks:
        j += 1
        if j == 0:
            if re.match("[^aeiou]", chunks[0]):
                fluff = 1
                y.append(ch)
            else:
                y.append(ch)
                j += 1
        elif j == 1 and fluff == 1:
            y[0] += chunks[1]
            j += 1
        else:
            if j % 2 == 0:
                if re.match("[^aeiou]", ch):
                    y[-1] += ch
                else:
                    y.append(ch)
                    j += 1
            else:
                y.append(ch)
    res = list()
    length = 0
    for x in y:
        res.append(Syllable(x, [length, length + len(x)]))
        length += len(x)
    # special cases again
    if re.search("oen?$", strng) and strng.isupper():
        res[-1] = Syllable("o", [res[-1].span[0], res[-1].span[0] + 1])
        if strng.endswith("n"):
            res.append(Syllable("en",
                                [res[-1].span[0] + 1, res[-1].span[1]]))
        else:
            res.append(Syllable("e",
                                [res[-1].span[0] + 1, res[-1].span[1]]))
    for syll in res:
        if re.search(r'[aeiuoy]{2}', syll.text):
            syll.vowel_length = 2
            syll.syllable_length = 2
    return res


def get_syllables_for_token(token: Token):
    syllables = []
    if token.accented:
        syllables = get_syllables_for_accented_form(token)
    else:
        if not token.is_punct():
            syllables = get_syllables_for_unknown_form(token)
    return syllables
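For orientation, here is a minimal standalone sketch of the vowel-group split that get_syllables_for_accented_form relies on. It uses only the standard re module; the word 'si_dera' and the printed chunk list are illustrative, and the real Token/Syllable classes from scanner.py are not reproduced here.

import re

# Lowercase-branch pattern from the hunk above: vowel groups (including
# diphthongs), optionally followed by a macron '_' or breve '^' marker.
regex = r'((?<!q)(?:ua|ue|ae|oe|au|[aeiouy])[_^]?)'

accented = 'si_dera'   # hypothetical accented form with one marked long vowel
chunks = [chunk
          for chunk in re.split(regex, accented, flags=re.IGNORECASE)
          if chunk]
print(chunks)          # ['s', 'i_', 'd', 'e', 'r', 'a']

Consonant chunks then accumulate onto the current syllable until the next vowel group starts a new one, which is exactly the loop shown above.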
@@ -368,8 +385,10 @@ def get_syllables(reading):

def muta_cum_liquida(verse):
    mcl_regex = re.compile(
        r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)',
        flags=re.IGNORECASE
    )
    if re.search(mcl_regex, verse.text):
        matches = re.finditer(mcl_regex, verse.text)
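A small, self-contained illustration of what the muta cum liquida pattern picks up; the verse text is invented and only the standard re module is used.

import re

mcl_regex = re.compile(
    r'[aeiouv](([bpsckgdt]|(qu)|(qv))\W*[lrmn])([aeiouv]|[.?!]|$)',
    flags=re.IGNORECASE
)

# A stop followed by a liquid after a vowel, as in 'patris' and 'agri'.
for m in mcl_regex.finditer('et patris agri'):
    print(m.start(), m.group(1))   # prints "4 tr" and "10 gr"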
@@ -378,24 +397,29 @@ def muta_cum_liquida(verse):

                for token in reading.tokens:
                    for syllable in token.syllables:
                        if syllable.span[0] <= match.start() < syllable.span[1]:
                            mcl = Phenomenon(chars=match.group(1))
                            syllable.phenomena['muta cum liquida'] = mcl


def positional_lengthening(verse):
    pl_regex = re.compile(
        r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
        flags=re.IGNORECASE
    )
    for match in re.finditer(pl_regex, verse.text):
        for reading in verse.readings:
            for token in reading.tokens:
                break_ = False
                for syllable in token.syllables:
                    if syllable.span[0] <= match.start() < syllable.span[1]:
                        syllable.syllable_length = 2
                        pl = Phenomenon(chars=match.group(1))
                        syllable.phenomena['positional lengthening'] = pl
                        break_ = True
                        break
                if break_:
                    break


def make_elisions(verse):
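Similarly, a hedged sketch of the positional lengthening pattern on invented input: a vowel followed by two or more consonants, or by x/z, triggers a match.

import re

pl_regex = re.compile(
    r'[aeiouv](((([bcdfgjklmnprstvwxz]h?|(qu))\W*){2,})|[xz])',
    flags=re.IGNORECASE
)

for text in ('est', 'rex'):
    m = pl_regex.search(text)
    print(text, m.group(1) if m else None)   # prints "est st" and "rex x"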
@@ -417,23 +441,24 @@ def make_elisions(verse):

                if m:
                    if re.search(r'^h?[aeiouy]', next_syllable.text):
                        # Elision!
                        elision = Phenomenon(omitted=m.group())
                        this_syllable.phenomena['elision'] = elision
                        this_syllable.syllable_length = 0
    return verse


def parse_verse(verse):
    """Annotate syllable lengths based on positional_lengthening and muta
    cum liquida.
    """
    positional_lengthening(verse)
    muta_cum_liquida(verse)
    new_readings = list()
    for reading in verse.readings:
        syllables = [syllable for token in reading.tokens
                     for syllable in token.syllables]
        abstract = str()
        mcl_count = 0
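The elision test in this hunk only checks whether the following syllable starts with an (optionally h-preceded) vowel; m comes from context elided above and stands for some earlier match. A tiny sketch with a plain string standing in for the syllable object:

import re

next_syllable_text = 'ho'   # hypothetical text of the following syllable
if re.search(r'^h?[aeiouy]', next_syllable_text):
    print('elision')        # the next syllable opens with h + vowel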
@@ -441,7 +466,9 @@ def parse_verse(verse):

            if syllable.id == len(syllables) - 1:
                abstract += '2'
            elif 'muta cum liquida' in syllable.phenomena:
                if ('positional lengthening' in syllable.phenomena
                        and ' ' in (syllable.phenomena
                                    ['positional lengthening'].chars)):
                    abstract += '2'
                else:
                    abstract += '{}'
@@ -457,15 +484,15 @@ def parse_verse(verse):

        if mcl_count > 0:
            new_abstracts = list()
            combinations = list(product(['1', '2'], repeat=mcl_count))
            for combi in combinations:
                new_abstracts.append(abstract.format(*combi))
            reading_copies = multiply_readings([reading], (mcl_count) * 2)
        else:
            new_abstracts = [abstract]
            reading_copies = [reading]
        for i in range(len(new_abstracts)):
            blueprint = new_abstracts[i]
            new_reading = reading_copies[i]
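How the product/format expansion behaves, as a standalone sketch; the abstract string and counts are invented, and multiply_readings itself is not shown in this diff.

from itertools import product

abstract = '2{}11{}2'   # hypothetical blueprint with two ambiguous syllables
mcl_count = 2
combinations = list(product(['1', '2'], repeat=mcl_count))
new_abstracts = [abstract.format(*combi) for combi in combinations]
print(new_abstracts)    # ['211112', '211122', '221112', '221122']

Each '{}' marks a muta cum liquida syllable whose length is ambiguous, so every combination of '1' and '2' yields one candidate scansion blueprint.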
@@ -473,18 +500,18 @@ def parse_verse(verse):

            syll_id = 0
            for token in new_reading.tokens:
                for s in token.syllables:
                    if blueprint[syll_id] == '1':
                        s.syllable_length = 1
                        if ('positional lengthening' in s.phenomena
                                and 'muta cum liquida' in s.phenomena):
                            (s.phenomena['positional lengthening']
                             .overruled_by) = 'muta cum liquida'
                    elif blueprint[syll_id] == '2':
                        s.syllable_length = 2
                    syll_id += 1
            new_readings.append(copy.deepcopy(new_reading))
            #print("In: "+abstract)
            #print("Out: "+"".join([str(s.syllable_length) for t in new_reading.tokens for s in t.syllables]))
    verse.readings = new_readings
    return verse