Commit 30b9355b authored by Simon Will
Browse files

Unidecode syllables and tokens in corpus.py

parent 2cceee07
Loading
Loading
Loading
Loading
+3 −2
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import re
import os.path

from bs4 import BeautifulSoup
+from unidecode import unidecode

from .model import Reading, Syllable, Token, Verse

@@ -29,7 +30,7 @@ def get_reading_from_line_element(element):
        syllables = []
        token_text = token_tag.text
        token = Token(
-            token=token_text,
+            token=unidecode(token_text),
            span=[span_begin, span_begin + len(token_text)]
        )

@@ -48,7 +49,7 @@ def get_reading_from_line_element(element):
                )
            syllable = Syllable(
                idx=idx,
-                syllable=syllable_text,
+                syllable=unidecode(syllable_text),
                span=[span_begin, span_begin + len(syllable_text)],
                syllable_length=syllable_length,
                vowel_length=None