Skip to content
Snippets Groups Projects
Commit 3574bdf4 authored by Simon Will
Browse files

Improve Hypotactic reading and add tests

parent 0bd0a301
No related branches found
No related tags found
No related merge requests found
# -*- coding: utf-8 -*-
import logging
import re
import os.path
from bs4 import BeautifulSoup
......@@ -19,49 +21,103 @@ BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN"
"""
class HypotacticLine:
def get_reading_from_line_element(element):
    """Build a Reading from the word and syllable spans inside ``element``.

    Each ``<span class="word">`` found in ``element`` becomes one Token and
    each ``<span class="syll">`` inside that word becomes one Syllable.
    Spans are ``[begin, end]`` character offsets that start at 0 for the
    first word; the offset is advanced by one after every word to stand in
    for the space between tokens.

    Args:
        element: Object providing ``find_all`` whose results expose
            ``text`` and ``attrs`` (the tests pass a parsed BeautifulSoup
            document).

    Returns:
        A Reading holding the tokens in document order.

    Raises:
        ValueError: If a syllable tag has none of the classes ``long``,
            ``short`` or ``elided``.
    """
    # Checked in this order; the first class present on the tag decides.
    length_by_class = (('long', 2), ('short', 1), ('elided', 0))

    tokens = []
    offset = 0
    syllable_idx = 0
    for word_tag in element.find_all(name='span', class_='word'):
        word_text = word_tag.text
        token = Token(
            token=word_text,
            span=[offset, offset + len(word_text)]
        )

        syllables = []
        for syll_tag in word_tag.find_all(name='span', class_='syll'):
            syll_text = syll_tag.text
            for class_name, length in length_by_class:
                if class_name in syll_tag.attrs['class']:
                    syllable_length = length
                    break
            else:
                raise ValueError(
                    'Could not determine syllable length of syllable {!r}'
                    .format(syll_tag)
                )

            syll_end = offset + len(syll_text)
            syllables.append(Syllable(
                idx=syllable_idx,
                syllable=syll_text,
                span=[offset, syll_end],
                syllable_length=syllable_length,
                vowel_length=None
            ))
            syllable_idx += 1
            offset = syll_end

        # Skip one position to simulate the space between two tokens.
        offset += 1
        token.syllables = syllables
        tokens.append(token)

    return Reading(tokens=tokens)
def separate_punctuation(tokens):
    """Split leading and trailing punctuation off every token, in place.

    For each token whose text is an optional run of punctuation, a run of
    word characters and another optional run of punctuation, every
    punctuation character becomes its own single-character Token placed
    directly before or after the word. The word token's ``text`` and
    ``span`` are narrowed to the word itself, and the text and span of its
    first and last syllable are trimmed to match.

    Tokens that do not fit this shape (for example punctuation in the
    middle of the word) are left untouched and a warning is logged.

    Args:
        tokens: Mutable list of token objects exposing ``text``, ``span``
            (a two-element ``[begin, end]`` list) and ``syllables`` (a list
            of objects exposing ``text`` and a mutable ``span``).

    Returns:
        The same list object that was passed in, with punctuation tokens
        inserted.
    """
    # Compiled once per call; written as a single raw string so that ``\W``
    # is never interpreted as a (invalid) string escape.
    punct_re = re.compile(
        r'^(?P<pre_punct>[\W_]*)(?P<non_punct>\w*)(?P<post_punct>[\W_]*)$'
    )

    i = 0
    while i < len(tokens):
        token = tokens[i]
        m = punct_re.match(token.text)
        if m is None:
            logging.warning('{!r} does not match the punctuation regex.'
                            .format(token))
            i += 1
            continue

        pre = m.group('pre_punct')
        post = m.group('post_punct')
        token_begin = token.span[0]
        # The pattern is anchored at the start, so the word begins right
        # after the leading punctuation.
        word_begin = token_begin + len(pre)
        word_end = token.span[1] - len(post)

        # One single-character token per punctuation mark, keeping order.
        pre_tokens = [
            Token(c, [token_begin + k, token_begin + k + 1])
            for k, c in enumerate(pre)
        ]
        post_tokens = [
            Token(c, [word_end + k, word_end + k + 1])
            for k, c in enumerate(post)
        ]
        tokens[i:i + 1] = pre_tokens + [token] + post_tokens

        # Remove the punctuation from the original token and from its
        # syllables.
        token.text = m.group('non_punct')
        token.span = [word_begin, word_end]
        # A punctuated token without syllables has nothing to trim; guard
        # against indexing into an empty list.
        if pre and token.syllables:
            first = token.syllables[0]
            first.text = first.text[len(pre):]
            first.span[0] = word_begin
        if post and token.syllables:
            last = token.syllables[-1]
            last.text = last.text[:-len(post)]
            last.span[1] = word_end

        # Continue with the token that follows everything just placed.
        i += len(pre_tokens) + 1 + len(post_tokens)
    return tokens
for syllable_tag in token_tag.children:
syllable_text = syllable_tag.text
if 'long' in syllable_tag.attrs['class']:
syllable_length = 2
elif 'short' in syllable_tag.attrs['class']:
syllable_length = 1
elif 'elided' in syllable_tag.attrs['class']:
syllable_length = 0
else:
raise ValueError(
'Could not determine syllable length of syllable {!r}'
.format(syllable_tag)
)
syllable = Syllable(
idx=idx,
syllable=syllable_text,
span=(span_begin, span_begin + len(syllable_text)),
syllable_length=syllable_length,
vowel_length=None
)
idx += 1
syllables.append(syllable)
span_begin += len(syllable_text)
token.syllables = syllables
tokens.append(token)
class HypotacticLine:
self.reading = Reading(tokens=tokens)
def __init__(self, element):
    """Store the line element and derive its reading.

    Args:
        element: The line element handed to
            ``get_reading_from_line_element``.
    """
    self.element = element
    self.reading = get_reading_from_line_element(element)
    # The reading lives on the instance; a bare ``reading`` name does not
    # exist in this scope, so go through ``self`` when splitting
    # punctuation off the tokens.
    self.reading.tokens = separate_punctuation(self.reading.tokens)
class HypotacticDocument:
......@@ -138,7 +194,7 @@ class HypotacticCorpus:
for line in doc.get_lines_with_meter(meters)
)
def save_lines(self, file_handle, lines, title='Saved Poems',
def save_html_tags(self, file_handle, tags, title='Saved Poems',
base_html=BASE_HTML, pretty=False):
soup = BeautifulSoup(base_html, self.parser)
......@@ -148,8 +204,8 @@ class HypotacticCorpus:
latin = soup.new_tag('div')
latin.attrs['class'] = 'latin'
for line in lines:
latin.append(line)
for tag in tags:
latin.append(tag)
soup.find(name='body').append(latin)
if pretty:
......
......@@ -13,7 +13,7 @@ def main(hypotactic_dir, outfile, limit=0, title='Lines', meters=tuple()):
if limit:
line_generator = itertools.islice(line_generator, limit)
with open(outfile, 'w') as f:
corpus.save_lines(f, line_generator, title=title)
corpus.save_html_tags(f, line_generator, title=title)
def parse_args_and_main():
......
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import pytest
import allzweckmesser as azm
@pytest.fixture
def aratea_element():
    """Return one verse line, marked up with word and syllable spans, parsed
    with the ``lxml`` parser.

    The line is:
    'vertitur: [hanc nemo certo tibi dicere possit,'
    """
    markup = (
        '<div class="line"><span class="word"><span class="syll'
        ' long">ver</span><span class="syll short">ti</span><span'
        ' class="syll short">tur:</span></span><span class="word"><span'
        ' class="syll long">[hanc</span></span><span class="word"><span'
        ' class="syll long">nē</span><span class="syll'
        ' long">mō</span></span><span class="word"><span class="syll'
        ' long">cer</span><span class="syll long">tō</span></span><span'
        ' class="word"><span class="syll short">ti</span><span class="syll'
        ' short">bi</span></span><span class="word"><span class="syll'
        ' long">dī</span><span class="syll short">ce</span><span class="syll'
        ' short">re</span></span><span class="word"><span class="syll'
        ' long">pos</span><span class="syll long">sit,</span></span></div>'
    )
    return BeautifulSoup(markup, 'lxml')
def test_get_reading_from_line_element(aratea_element):
    """Check the tokens, spans and syllables read from the fixture line.

    The fixture encodes 'vertitur: [hanc nemo certo tibi dicere possit,'.
    At this stage punctuation is still attached to its neighbouring word
    and consecutive tokens are one offset apart.
    """
    reading = azm.corpus.get_reading_from_line_element(aratea_element)

    # (text, span) for every token, in line order.
    expected_tokens = [
        ('vertitur:', [0, 9]),
        ('[hanc', [10, 15]),
        ('nēmō', [16, 20]),
        ('certō', [21, 26]),
        ('tibi', [27, 31]),
        ('dīcere', [32, 38]),
        ('possit,', [39, 46]),
    ]
    assert len(reading.tokens) == len(expected_tokens)
    for token, (text, span) in zip(reading.tokens, expected_tokens):
        assert token.text == text
        assert token.span == span

    first_syllables = [syll.text for syll in reading.tokens[0].syllables]
    assert first_syllables == ['ver', 'ti', 'tur:']
def test_separate_punctuation(aratea_element):
    """Check that punctuation becomes separate tokens with adjusted spans.

    The fixture encodes 'vertitur: [hanc nemo certo tibi dicere possit,'.
    After separation ':', '[' and ',' are tokens of their own, the words
    they were attached to lose them from text, span and syllables, and
    tokens without punctuation keep their syllables unchanged.
    """
    reading = azm.corpus.get_reading_from_line_element(aratea_element)
    reading.tokens = azm.corpus.separate_punctuation(reading.tokens)

    # (text, span) for every token, in line order.
    expected_tokens = [
        ('vertitur', [0, 8]),
        (':', [8, 9]),
        ('[', [10, 11]),
        ('hanc', [11, 15]),
        ('nēmō', [16, 20]),
        ('certō', [21, 26]),
        ('tibi', [27, 31]),
        ('dīcere', [32, 38]),
        ('possit', [39, 45]),
        (',', [45, 46]),
    ]
    assert len(reading.tokens) == len(expected_tokens)
    for token, (text, span) in zip(reading.tokens, expected_tokens):
        assert token.text == text
        assert token.span == span

    # Token index -> (text, span) of each of its syllables. Punctuation
    # tokens carry no syllables; 'nēmō' had no punctuation, so its
    # syllables are exactly those of the fixture markup.
    expected_syllables = {
        0: [('ver', [0, 3]), ('ti', [3, 5]), ('tur', [5, 8])],
        1: [],
        2: [],
        3: [('hanc', [11, 15])],
        4: [('nē', [16, 18]), ('mō', [18, 20])],
    }
    for index, expected in expected_syllables.items():
        actual = [
            (syll.text, syll.span)
            for syll in reading.tokens[index].syllables
        ]
        assert actual == expected
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment