Refactor, introduce "Document" abstraction (e6e40feb) · Commits · Fachschaft / coliverter

src/coliverter/convert.py

+7 −12

Original line number	Diff line number	Diff line
		@@ -4,13 +4,11 @@ from typing import Annotated

		from multiprocessing import Pool

		from coliverter.documents.Document import Document
		from coliverter.files.FileFormat import FileFormat
		from coliverter.cli.typer import typer
		from coliverter.files.read import read
		from coliverter.files.write import write
		from coliverter.steps.html_to_pdf import html_to_pdf

		from coliverter.steps.md_to_html import md_to_html


		from typer import Argument, Option
		@@ -39,16 +37,13 @@ def _convert_single(
		markdown_path: Path \| None, output_path: Path \| None, output_format: FileFormat
		) -> None:
		markdown = read(markdown_path)
		html = md_to_html(markdown)

		if output_format == FileFormat.HTML:
		write(html, output_path)
		return
		document = Document.from_markdown(markdown)

		pdf = html_to_pdf(html)
		if output_format == FileFormat.PDF:
		write(pdf, output_path)
		return
		match output_format:
		case FileFormat.HTML:
		write(document.html, output_path)
		case FileFormat.PDF:
		write(document.pdf, output_path)


		def _convert_single_guessing_output_path(

src/coliverter/documents/Document.py

0 → 100644

+58 −0

Original line number	Diff line number	Diff line
		from dataclasses import dataclass
		from typing import Self

		import pandoc
		from pandoc.types import Pandoc
		from weasyprint import HTML

		from coliverter.documents.DocumentType import DocumentType
		from coliverter.documents.transform.insert_table_of_contents import (
		insert_table_of_contents,
		)
		from coliverter.resources.pandoc.get_filter import get_filter

		# Pandoc extensions that are joined with "+" and appended to the `commonmark_x`
		# input format when a markdown string is parsed.
		ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists")
		# Options handed to pandoc when reading the markdown source.
		INPUT_OPTIONS = ("--strip-comments",)
		# Options handed to pandoc when writing HTML.
		# NOTE: get_filter() is called once, when this module is imported, to fill in
		# the lua filter entry.
		OUTPUT_OPTIONS = (
		"--wrap=none",
		"--standalone",
		"--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css",
		"--variable=lang=de",
		f"--lua-filter={get_filter('heading-to-title')}",
		)


		@dataclass(frozen=True)
		class Document:
		    """
		    A parsed document: its pandoc tree together with the document type it is treated as.

		    Instances are normally built with `from_markdown`. The `html` and `pdf` properties
		    render the stored tree anew on every access.
		    """

		    content: Pandoc
		    type: DocumentType

		    @classmethod
		    def from_markdown(
		        cls, markdown: str, maybe_type: DocumentType | None = None
		    ) -> Self:
		        """
		        Parse a markdown string into a document.

		        Pass `maybe_type` to force a document type (useful when detection does not work or
		        the source should deliberately be interpreted differently). When it is not given,
		        the type is guessed from the parsed content.
		        """
		        input_format = "commonmark_x+" + "+".join(ACTIVE_MARKDOWN_EXTENSIONS)
		        parsed = pandoc.read(
		            source=markdown,
		            format=input_format,
		            options=INPUT_OPTIONS,
		        )

		        # Each transformation receives the tree and hands back the tree to continue with.
		        for apply_transformation in (insert_table_of_contents,):
		            parsed = apply_transformation(parsed)

		        document_type = maybe_type or DocumentType.guess(parsed)
		        return cls(content=parsed, type=document_type)

		    @property
		    def html(self) -> str:
		        """The document written by pandoc as HTML, using OUTPUT_OPTIONS."""
		        rendered = pandoc.write(self.content, format="html", options=OUTPUT_OPTIONS)
		        return rendered

		    @property
		    def pdf(self) -> bytes:
		        """The document as PDF bytes, produced by WeasyPrint from the `html` rendering."""
		        html_source = self.html
		        return HTML(string=html_source).write_pdf()

src/coliverter/documents/DocumentType.py

+2 −2

Original line number	Diff line number	Diff line
		@@ -10,9 +10,9 @@ class DocumentType(StrEnum):
		OTHER = auto()

		@classmethod
		def guess(cls, document: Pandoc) -> Self:
		def guess(cls, content: Pandoc) -> Self:
		headers = tuple(
		element for element in pandoc.iter(document) if isinstance(element, Header)
		element for element in pandoc.iter(content) if isinstance(element, Header)
		)

		if not headers:

src/coliverter/steps/__init__.py→src/coliverter/documents/transform/__init__.py

+0 −0

File moved.

src/coliverter/steps/md_to_html.py→src/coliverter/documents/transform/insert_table_of_contents.py

+7 −34

Original line number	Diff line number	Diff line
		import pandoc
		from pandoc.types import Block, BulletList, Header, Pandoc, Plain, Para, Str, Emph

		from coliverter.documents.transform.replace import replace
		import pandoc

		from coliverter.documents.DocumentType import DocumentType
		from coliverter.resources.pandoc.get_filter import get_filter
		from coliverter.tree.Node import Node


		ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists")
		INPUT_OPTIONS = ("--strip-comments",)
		OUTPUT_OPTIONS = (
		"--wrap=none",
		"--standalone",
		"--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css",
		"--variable=lang=de",
		f"--lua-filter={get_filter('heading-to-title')}",
		)
		TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")])


		@@ -41,28 +32,10 @@ def _get_table_of_contents(document: Pandoc) -> BulletList:
		return make_bullet_list(headers_tree)


		def _replace(document: Pandoc, old: Block, new: Block) -> None:
		old_locations = tuple(
		path[-1] for element, path in pandoc.iter(document, path=True) if element == old
		)

		for parent, index in old_locations:
		parent[index] = new


		def md_to_html(markdown: str) -> str:
		def insert_table_of_contents(content: Pandoc) -> Pandoc:
		"""
		Use pandoc to convert the given Markdown string to an HTML string.
		Replace [[_TOC_]] with a table of contents whenever it appears in the content.

		:param markdown: A string containing text in Markdown
		:return: The same text in HTML
		This works in-place, i.e. `content` is modified. We nevertheless return it for convenience.
		"""
		document = pandoc.read(
		source=markdown,
		format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}",
		options=INPUT_OPTIONS,
		)

		_replace(document, TOC_PLACEHOLDER, _get_table_of_contents(document))

		return pandoc.write(document, format="html", options=OUTPUT_OPTIONS)
		return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content))