Skip to content
Snippets Groups Projects
Commit 1be725b2 authored by Simon Will's avatar Simon Will
Browse files

Add module for reading Hypotactic Corpus

parent 5ba4bba0
No related branches found
No related tags found
No related merge requests found
# -*- coding: utf-8 -*-
import os.path
from bs4 import BeautifulSoup
from .model import Reading, Syllable, Token
class HypotacticLine:
    """Convert one line element of a Hypotactic document into a ``Reading``.

    The children of ``element`` are treated as token tags and the children
    of every token tag as syllable tags.  Each syllable tag must carry one
    of the CSS classes ``long``, ``short`` or ``elided``.
    """

    # Recognised syllable classes and the length stored for each.  The pairs
    # are tried in this order, so ``long`` wins if several are present.
    _LENGTH_BY_CLASS = (('long', 2), ('short', 1), ('elided', 0))

    def __init__(self, element):
        """Build ``self.reading`` from the markup in ``element``.

        Args:
            element: Parsed markup node whose ``children`` are token tags;
                every child must offer ``text`` and ``children``, and
                every grandchild ``text`` and ``attrs``.  Presumably a
                bs4 ``Tag`` for one ``div.line`` -- confirm with callers.

        Raises:
            ValueError: If a syllable tag has none of the recognised
                classes, including when it has no ``class`` attribute.
        """
        self.element = element
        tokens = []
        # Running character offset.  It advances only by syllable text, so
        # spans index into the concatenation of all syllable texts.
        offset = 0
        # Syllables are numbered consecutively across the whole line.
        syllable_idx = 0
        for token_tag in element.children:
            token_text = token_tag.text
            token = Token(
                token=token_text,
                span=(offset, offset + len(token_text))
            )
            syllables = []
            for syllable_tag in token_tag.children:
                syllable_text = syllable_tag.text
                syllable_length = self._syllable_length(syllable_tag)
                syllable_end = offset + len(syllable_text)
                syllables.append(Syllable(
                    idx=syllable_idx,
                    syllable=syllable_text,
                    span=(offset, syllable_end),
                    syllable_length=syllable_length,
                    vowel_length=None
                ))
                syllable_idx += 1
                offset = syllable_end
            token.syllables = syllables
            tokens.append(token)
        self.reading = Reading(tokens=tokens)

    @classmethod
    def _syllable_length(cls, syllable_tag):
        """Return 2, 1 or 0 for a long, short or elided syllable tag."""
        # ``.get`` keeps a tag without any ``class`` attribute on the
        # descriptive ValueError path instead of raising a bare KeyError.
        css_classes = syllable_tag.attrs.get('class', ())
        for css_class, length in cls._LENGTH_BY_CLASS:
            if css_class in css_classes:
                return length
        raise ValueError(
            'Could not determine syllable length of syllable {!r}'
            .format(syllable_tag)
        )
class HypotacticDocument:
    """One corpus file parsed with BeautifulSoup."""

    def __init__(self, file_path, parser='lxml', encoding='utf-8'):
        """Read and parse the file at ``file_path``.

        Args:
            file_path: Path of the file to parse.
            parser: Parser name handed to ``BeautifulSoup``.
            encoding: Text encoding used to read the file.  An explicit
                default keeps parsing independent of the platform locale;
                pass ``None`` to fall back to the locale encoding.
                NOTE(review): assumes the corpus files are UTF-8 --
                confirm against the actual corpus.
        """
        with open(file_path, encoding=encoding) as f:
            self.root = BeautifulSoup(f, parser)
        # Whatever the parsed tree exposes as ``title``.
        self.title = self.root.title

    def get_poems(self, filters=()):
        """Yield every ``div.poem`` element accepted by all ``filters``.

        Args:
            filters: Callables taking a poem element; a poem is yielded
                only if each of them returns a truthy value.
        """
        for poem in self.root.find_all(name='div', class_='poem'):
            if all(fil(poem) for fil in filters):
                yield poem

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield every ``div.line`` element of the selected poems.

        Args:
            line_filters: Callables taking a line element; a line is
                yielded only if each of them returns a truthy value.
            poem_filters: Filters forwarded to ``get_poems`` to choose
                the poems whose lines are searched.
        """
        for poem in self.get_poems(poem_filters):
            for line in poem.find_all(name='div', class_='line'):
                if all(fil(line) for fil in line_filters):
                    yield line
class HypotacticCorpus:
    """A set of ``HypotacticDocument`` objects queried as one corpus."""

    def __init__(self, file_paths, parser='lxml'):
        """Parse every file in ``file_paths`` into a document.

        Args:
            file_paths: Paths of the files to load, one document each.
            parser: Parser name forwarded to each ``HypotacticDocument``.
        """
        self.file_paths = file_paths
        self.parser = parser
        self.documents = [HypotacticDocument(p, parser=parser)
                          for p in file_paths]

    @classmethod
    def from_directory(cls, directory, *args, **kwargs):
        """Create a corpus from the regular files directly in ``directory``.

        Subdirectories are skipped, because handing them to ``open`` would
        fail.  Names are sorted because ``os.listdir`` returns them in
        arbitrary order, which made the document order vary between runs.

        Args:
            directory: Directory whose files are loaded (not recursive).
            *args: Extra positional arguments passed on to the constructor.
            **kwargs: Extra keyword arguments passed on to the constructor.
        """
        file_paths = []
        for basename in sorted(os.listdir(directory)):
            path = os.path.abspath(os.path.join(directory, basename))
            if os.path.isfile(path):
                file_paths.append(path)
        return cls(file_paths, *args, **kwargs)

    def get_poems(self, filters=()):
        """Yield the poems of all documents, in document order.

        Args:
            filters: Forwarded unchanged to each document's ``get_poems``.
        """
        for doc in self.documents:
            yield from doc.get_poems(filters)

    def get_lines(self, line_filters=(), poem_filters=()):
        """Yield the lines of all documents, in document order.

        Args:
            line_filters: Forwarded unchanged to each document's
                ``get_lines``.
            poem_filters: Forwarded unchanged to each document's
                ``get_lines``.
        """
        for doc in self.documents:
            yield from doc.get_lines(line_filters, poem_filters)
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment