Skip to content
Snippets Groups Projects
Commit 79644a03 authored by Simon Will's avatar Simon Will
Browse files

Fix various things in corpus.py

parent 1be725b2
No related branches found
No related tags found
No related merge requests found
from . import config, db, meters, model, scan, scanner, wordlist from . import config, corpus, db, meters, model, scan, scanner, wordlist
...@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup ...@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup
from .model import Reading, Syllable, Token from .model import Reading, Syllable, Token
# XHTML skeleton used as the default ``base_html`` argument of ``save_lines``.
# It has an empty <body> and already carries a <title> element in its <head>.
BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-type" >
<title>Plautus Amphitruo</title>
</head>
<body>
</body>
</html>
"""
class HypotacticLine: class HypotacticLine:
...@@ -14,7 +27,7 @@ class HypotacticLine: ...@@ -14,7 +27,7 @@ class HypotacticLine:
tokens = [] tokens = []
span_begin = 0 span_begin = 0
idx = 0 idx = 0
for token_tag in element.children: for token_tag in element.find_all(name='span', class_='word'):
syllables = [] syllables = []
token_text = token_tag.text token_text = token_tag.text
token = Token( token = Token(
...@@ -56,21 +69,25 @@ class HypotacticDocument: ...@@ -56,21 +69,25 @@ class HypotacticDocument:
def __init__(self, file_path, parser='lxml'): def __init__(self, file_path, parser='lxml'):
with open(file_path) as f: with open(file_path) as f:
self.root = BeautifulSoup(f, parser) try:
self.title = self.root.title self.root = BeautifulSoup(f, parser)
self.title = self.root.title
def get_poems(self, filters=()): except Exception as e:
print('Exception {!r} when parsing file {!r}'
.format(e, file_path))
self.title = None
def get_poems(self, filters=tuple()):
yield from ( yield from (
p poem
for p in self.root.find_all(name='div', class_='poem') for poem in self.root.find_all(name='div', class_='poem')
if all(fil(p) for fil in filters) if all(fil(poem) for fil in filters)
) )
def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()):
yield from ( yield from (
line line
for poem in self.get_poems(poem_filters) for line in self.root.find_all(name='div', class_='line')
for line in poem.find_all(name='div', class_='line')
if all(fil(line) for fil in line_filters) if all(fil(line) for fil in line_filters)
) )
...@@ -89,16 +106,37 @@ class HypotacticCorpus: ...@@ -89,16 +106,37 @@ class HypotacticCorpus:
for basename in os.listdir(directory)] for basename in os.listdir(directory)]
return cls(file_paths, *args, **kwargs) return cls(file_paths, *args, **kwargs)
def get_poems(self, filters=()): def get_poems(self, filters=tuple()):
yield from ( yield from (
p poem
for doc in self.documents for doc in self.documents
for p in doc.get_poems(filters) for poem in doc.get_poems(filters)
) )
def get_lines(self, line_filters=(), poem_filters=()): def get_lines(self, line_filters=tuple()):
yield from ( yield from (
p line
for doc in self.documents for doc in self.documents
for p in doc.get_lines(line_filters, poem_filters) for line in doc.get_lines(line_filters)
) )
def get_lines_with_meter(self, meters):
    """Yield the lines whose ``class`` attribute names one of *meters*.

    :param meters: iterable of meter names. A line is kept when at least
        one name is ``in`` the line's ``attrs['class']`` value.
    :return: generator over the matching lines, as produced by
        ``self.get_lines`` with a single filter.
    """
    # Materialize once. The filter below runs for every line, so a
    # one-shot iterator (generator, ``iter(...)``, ``map``) would be
    # exhausted after the first line and silently reject all later ones.
    wanted = tuple(meters)

    def has_wanted_meter(line):
        classes = line.attrs['class']
        return any(meter in classes for meter in wanted)

    yield from self.get_lines([has_wanted_meter])
def save_lines(self, file_handle, lines, title='Saved Poems',
               base_html=BASE_HTML):
    """Wrap *lines* in an HTML document and write it to *file_handle*.

    The document is parsed from *base_html*. Its ``<title>`` is set to
    *title*, and all *lines* are placed inside a ``<div class="latin">``
    appended to the ``<body>``.

    :param file_handle: object with a ``write`` method that receives the
        prettified document as one string.
    :param lines: iterable of elements to append to the container div.
    :param title: text for the document's ``<title>`` element.
    :param base_html: markup of the skeleton document. It must yield a
        ``<head>`` and a ``<body>`` once parsed.
    """
    # NOTE(review): ``self.parser`` is not set anywhere in the visible
    # code; presumably it is assigned in ``__init__`` -- confirm.
    soup = BeautifulSoup(base_html, self.parser)

    # Reuse a title element the skeleton already provides (the default
    # BASE_HTML has one) instead of appending a second <title> to <head>.
    head = soup.find(name='head')
    title_tag = head.find(name='title')
    if title_tag is None:
        title_tag = soup.new_tag('title')
        head.append(title_tag)
    title_tag.string = title

    latin = soup.new_tag('div')
    latin.attrs['class'] = 'latin'
    # NOTE(review): appending an element that belongs to another tree
    # re-parents it, so the source documents presumably lose these
    # lines -- confirm callers do not reuse the corpus afterwards.
    for line in lines:
        latin.append(line)
    soup.find(name='body').append(latin)

    file_handle.write(soup.prettify())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment