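Fix various things in corpus.py

Summary of the changes shown in the diff below:
- Import the `corpus` module in the package `__init__.py`.
- Add a `BASE_HTML` template constant to corpus.py.
- `HypotacticLine`: iterate only over `span` elements with class `word` instead of over all children of the line element.
- `HypotacticDocument.__init__`: catch exceptions raised while parsing, print a message naming the file, and set `title` to `None`.
- `get_lines` (document and corpus): drop the `poem_filters` parameter and search the whole document for `div` elements with class `line`.
- `HypotacticCorpus`: add `get_lines_with_meter` and `save_lines`.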

79644a03 · Simon Will · 1be725b2 · 79644a03 · 79644a03
Commit 79644a03 authored 6 years ago by Simon Will
--- a/allzweckmesser/__init__.py
+++ b/allzweckmesser/__init__.py
-from . import config, db, meters, model, scan, scanner, wordlist
+from . import config, corpus, db, meters, model, scan, scanner, wordlist
--- a/allzweckmesser/corpus.py
+++ b/allzweckmesser/corpus.py
@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup
 from .model import Reading, Syllable, Token
+BASE_HTML = """<!DOCTYPE html PUBLIC"-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta content="text/html; charset=utf-8" http-equiv="Content-type" >
+<title>Plautus Amphitruo</title>
+</head>
+<body>
+</body>
+</html>
+"""
 class HypotacticLine:
@@ -14,7 +27,7 @@ class HypotacticLine:
        tokens = []
        span_begin = 0
        idx = 0
-        for token_tag in element.children:
+        for token_tag in element.find_all(name='span', class_='word'):
            syllables = []
            token_text = token_tag.text
            token = Token(
@@ -56,21 +69,25 @@ class HypotacticDocument:
    def __init__(self, file_path, parser='lxml'):
        with open(file_path) as f:
-            self.root = BeautifulSoup(f, parser)
+            try:
-        self.title = self.root.title
+                self.root = BeautifulSoup(f, parser)
+                self.title = self.root.title
-    def get_poems(self, filters=()):
+            except Exception as e:
+                print('Exception {!r} when parsing file {!r}'
+                      .format(e, file_path))
+                self.title = None
+    def get_poems(self, filters=tuple()):
        yield from (
-            p
+            poem
-            for p in self.root.find_all(name='div', class_='poem')
+            for poem in self.root.find_all(name='div', class_='poem')
-            if all(fil(p) for fil in filters)
+            if all(fil(poem) for fil in filters)
        )
-    def get_lines(self, line_filters=(), poem_filters=()):
+    def get_lines(self, line_filters=tuple()):
        yield from (
            line
-            for poem in self.get_poems(poem_filters)
+            for line in self.root.find_all(name='div', class_='line')
-            for line in poem.find_all(name='div', class_='line')
            if all(fil(line) for fil in line_filters)
        )
@@ -89,16 +106,37 @@ class HypotacticCorpus:
                      for basename in os.listdir(directory)]
        return cls(file_paths, *args, **kwargs)
-    def get_poems(self, filters=()):
+    def get_poems(self, filters=tuple()):
        yield from (
-            p
+            poem
            for doc in self.documents
-            for p in doc.get_poems(filters)
+            for poem in doc.get_poems(filters)
        )
-    def get_lines(self, line_filters=(), poem_filters=()):
+    def get_lines(self, line_filters=tuple()):
        yield from (
-            p
+            line
            for doc in self.documents
-            for p in doc.get_lines(line_filters, poem_filters)
+            for line in doc.get_lines(line_filters)
        )
+    def get_lines_with_meter(self, meters):
+        filters = [lambda line: any((meter in line.attrs['class'])
+                                    for meter in meters)]
+        yield from self.get_lines(filters)
+    def save_lines(self, file_handle, lines, title='Saved Poems',
+                   base_html=BASE_HTML):
+        soup = BeautifulSoup(base_html, self.parser)
+        title_tag = soup.new_tag('title')
+        title_tag.string = title
+        soup.find(name='head').append(title_tag)
+        latin = soup.new_tag('div')
+        latin.attrs['class'] = 'latin'
+        for line in lines:
+            latin.append(line)
+        soup.find(name='body').append(latin)
+        file_handle.write(soup.prettify())