Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Allzweckmesser
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Messerschleifer
Allzweckmesser
Commits
79644a03
Commit
79644a03
authored
6 years ago
by
Simon Will
Browse files
Options
Downloads
Patches
Plain Diff
Fix various things in corpus.py
parent
1be725b2
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
allzweckmesser/__init__.py
+1
-1
1 addition, 1 deletion
allzweckmesser/__init__.py
allzweckmesser/corpus.py
+55
-17
55 additions, 17 deletions
allzweckmesser/corpus.py
with
56 additions
and
18 deletions
allzweckmesser/__init__.py
+
1
−
1
View file @
79644a03
from
.
import
config
,
db
,
meters
,
model
,
scan
,
scanner
,
wordlist
from
.
import
config
,
corpus
,
db
,
meters
,
model
,
scan
,
scanner
,
wordlist
This diff is collapsed.
Click to expand it.
allzweckmesser/corpus.py
+
55
−
17
View file @
79644a03
...
@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup
...
@@ -6,6 +6,19 @@ from bs4 import BeautifulSoup
from
.model
import
Reading
,
Syllable
,
Token
from
.model
import
Reading
,
Syllable
,
Token
BASE_HTML
=
"""
<!DOCTYPE html PUBLIC
"
-//W3C//DTD XHTML 1.0 Strict//EN
"
"
http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
"
>
<html xmlns=
"
http://www.w3.org/1999/xhtml
"
>
<head>
<meta content=
"
text/html; charset=utf-8
"
http-equiv=
"
Content-type
"
>
<title>Plautus Amphitruo</title>
</head>
<body>
</body>
</html>
"""
class
HypotacticLine
:
class
HypotacticLine
:
...
@@ -14,7 +27,7 @@ class HypotacticLine:
...
@@ -14,7 +27,7 @@ class HypotacticLine:
tokens
=
[]
tokens
=
[]
span_begin
=
0
span_begin
=
0
idx
=
0
idx
=
0
for
token_tag
in
element
.
children
:
for
token_tag
in
element
.
find_all
(
name
=
'
span
'
,
class_
=
'
word
'
)
:
syllables
=
[]
syllables
=
[]
token_text
=
token_tag
.
text
token_text
=
token_tag
.
text
token
=
Token
(
token
=
Token
(
...
@@ -56,21 +69,25 @@ class HypotacticDocument:
...
@@ -56,21 +69,25 @@ class HypotacticDocument:
def
__init__
(
self
,
file_path
,
parser
=
'
lxml
'
):
def
__init__
(
self
,
file_path
,
parser
=
'
lxml
'
):
with
open
(
file_path
)
as
f
:
with
open
(
file_path
)
as
f
:
self
.
root
=
BeautifulSoup
(
f
,
parser
)
try
:
self
.
title
=
self
.
root
.
title
self
.
root
=
BeautifulSoup
(
f
,
parser
)
self
.
title
=
self
.
root
.
title
def
get_poems
(
self
,
filters
=
()):
except
Exception
as
e
:
print
(
'
Exception {!r} when parsing file {!r}
'
.
format
(
e
,
file_path
))
self
.
title
=
None
def
get_poems
(
self
,
filters
=
tuple
()):
yield
from
(
yield
from
(
p
p
oem
for
p
in
self
.
root
.
find_all
(
name
=
'
div
'
,
class_
=
'
poem
'
)
for
p
oem
in
self
.
root
.
find_all
(
name
=
'
div
'
,
class_
=
'
poem
'
)
if
all
(
fil
(
p
)
for
fil
in
filters
)
if
all
(
fil
(
p
oem
)
for
fil
in
filters
)
)
)
def
get_lines
(
self
,
line_filters
=
(),
poem_filters
=
()):
def
get_lines
(
self
,
line_filters
=
tuple
()):
yield
from
(
yield
from
(
line
line
for
poem
in
self
.
get_poems
(
poem_filters
)
for
line
in
self
.
root
.
find_all
(
name
=
'
div
'
,
class_
=
'
line
'
)
for
line
in
poem
.
find_all
(
name
=
'
div
'
,
class_
=
'
line
'
)
if
all
(
fil
(
line
)
for
fil
in
line_filters
)
if
all
(
fil
(
line
)
for
fil
in
line_filters
)
)
)
...
@@ -89,16 +106,37 @@ class HypotacticCorpus:
...
@@ -89,16 +106,37 @@ class HypotacticCorpus:
for
basename
in
os
.
listdir
(
directory
)]
for
basename
in
os
.
listdir
(
directory
)]
return
cls
(
file_paths
,
*
args
,
**
kwargs
)
return
cls
(
file_paths
,
*
args
,
**
kwargs
)
def
get_poems
(
self
,
filters
=
()):
def
get_poems
(
self
,
filters
=
tuple
()):
yield
from
(
yield
from
(
p
p
oem
for
doc
in
self
.
documents
for
doc
in
self
.
documents
for
p
in
doc
.
get_poems
(
filters
)
for
p
oem
in
doc
.
get_poems
(
filters
)
)
)
def
get_lines
(
self
,
line_filters
=
(),
poem_filters
=
()):
def
get_lines
(
self
,
line_filters
=
tuple
()):
yield
from
(
yield
from
(
p
line
for
doc
in
self
.
documents
for
doc
in
self
.
documents
for
p
in
doc
.
get_lines
(
line_filters
,
poem_filters
)
for
line
in
doc
.
get_lines
(
line_filters
)
)
)
def get_lines_with_meter(self, meters):
    """Yield every line of the corpus that carries one of *meters*.

    A line qualifies when at least one entry of ``meters`` is contained
    in the value of the line's ``class`` attribute.

    :param meters: iterable of meter names to look for. It is copied
        into a tuple first, so a one-shot iterator is accepted as well.
    :return: generator over the matching lines, as produced by
        ``self.get_lines``.
    """
    # The filter is called once per line, so the meters have to be
    # walkable repeatedly; a bare iterator would already be used up
    # after the first line and every later line would be rejected.
    wanted = tuple(meters)

    def has_wanted_meter(line):
        return any(meter in line.attrs['class'] for meter in wanted)

    yield from self.get_lines([has_wanted_meter])
def save_lines(self, file_handle, lines, title='Saved Poems',
               base_html=BASE_HTML):
    """Write *lines* into an HTML skeleton and dump it to *file_handle*.

    The skeleton ``base_html`` is parsed, its title is set to ``title``,
    all ``lines`` are collected in one ``<div class="latin">`` that is
    appended to the ``<body>``, and the prettified document is written
    out.

    :param file_handle: object with a ``write`` method accepting the
        prettified markup.
    :param lines: iterable of elements to append to the ``latin`` div.
    :param title: text for the document's ``<title>`` element.
    :param base_html: markup of the skeleton document; it needs a
        ``<head>`` and a ``<body>`` element.
    """
    # NOTE(review): relies on ``self.parser`` being set elsewhere on the
    # instance; that assignment is not visible here -- confirm.
    soup = BeautifulSoup(base_html, self.parser)

    # Reuse a title that the skeleton already brings along (the default
    # BASE_HTML does) instead of adding a second <title> to the head.
    head = soup.find(name='head')
    title_tag = head.find(name='title')
    if title_tag is None:
        title_tag = soup.new_tag('title')
        head.append(title_tag)
    title_tag.string = title

    latin = soup.new_tag('div')
    latin.attrs['class'] = 'latin'
    # NOTE(review): bs4 re-parents a tag that already belongs to a tree,
    # so this presumably removes each line from the document it came
    # from -- confirm that this side effect is intended.
    for line in lines:
        latin.append(line)
    soup.find(name='body').append(latin)

    file_handle.write(soup.prettify())
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment