Loading src/coliverter/convert.py +7 −12 Original line number Diff line number Diff line Loading @@ -4,13 +4,11 @@ from typing import Annotated from multiprocessing import Pool from coliverter.documents.Document import Document from coliverter.files.FileFormat import FileFormat from coliverter.cli.typer import typer from coliverter.files.read import read from coliverter.files.write import write from coliverter.steps.html_to_pdf import html_to_pdf from coliverter.steps.md_to_html import md_to_html from typer import Argument, Option Loading Loading @@ -39,16 +37,13 @@ def _convert_single( markdown_path: Path | None, output_path: Path | None, output_format: FileFormat ) -> None: markdown = read(markdown_path) html = md_to_html(markdown) if output_format == FileFormat.HTML: write(html, output_path) return document = Document.from_markdown(markdown) pdf = html_to_pdf(html) if output_format == FileFormat.PDF: write(pdf, output_path) return match output_format: case FileFormat.HTML: write(document.html, output_path) case FileFormat.PDF: write(document.pdf, output_path) def _convert_single_guessing_output_path( Loading src/coliverter/documents/Document.py 0 → 100644 +58 −0 Original line number Diff line number Diff line from dataclasses import dataclass from typing import Self import pandoc from pandoc.types import Pandoc from weasyprint import HTML from coliverter.documents.DocumentType import DocumentType from coliverter.documents.transform.insert_table_of_contents import ( insert_table_of_contents, ) from coliverter.resources.pandoc.get_filter import get_filter ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists") INPUT_OPTIONS = ("--strip-comments",) OUTPUT_OPTIONS = ( "--wrap=none", "--standalone", "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css", "--variable=lang=de", f"--lua-filter={get_filter('heading-to-title')}", ) @dataclass(frozen=True) class Document: content: Pandoc type: DocumentType 
@classmethod def from_markdown( cls, markdown: str, maybe_type: DocumentType | None = None ) -> Self: """ Create a document from a markdown string. The document type can either be explicitly provided (if the detection does not work, or you want to force the given markdown source to be interpreted differently, which is sometimes necessary), or is guessed from the document's content. """ content = pandoc.read( source=markdown, format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}", options=INPUT_OPTIONS, ) # Apply the transformations we want to do transformations = (insert_table_of_contents,) for transform in transformations: content = transform(content) return cls(content=content, type=maybe_type or DocumentType.guess(content)) @property def pdf(self) -> bytes: return HTML(string=self.html).write_pdf() @property def html(self) -> str: return pandoc.write(self.content, format="html", options=OUTPUT_OPTIONS) src/coliverter/documents/DocumentType.py +2 −2 Original line number Diff line number Diff line Loading @@ -10,9 +10,9 @@ class DocumentType(StrEnum): OTHER = auto() @classmethod def guess(cls, document: Pandoc) -> Self: def guess(cls, content: Pandoc) -> Self: headers = tuple( element for element in pandoc.iter(document) if isinstance(element, Header) element for element in pandoc.iter(content) if isinstance(element, Header) ) if not headers: Loading src/coliverter/steps/__init__.py→src/coliverter/documents/transform/__init__.py +0 −0 File moved. 
src/coliverter/steps/md_to_html.py→src/coliverter/documents/transform/insert_table_of_contents.py +7 −34 Original line number Diff line number Diff line import pandoc from pandoc.types import Block, BulletList, Header, Pandoc, Plain, Para, Str, Emph from coliverter.documents.transform.replace import replace import pandoc from coliverter.documents.DocumentType import DocumentType from coliverter.resources.pandoc.get_filter import get_filter from coliverter.tree.Node import Node ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists") INPUT_OPTIONS = ("--strip-comments",) OUTPUT_OPTIONS = ( "--wrap=none", "--standalone", "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css", "--variable=lang=de", f"--lua-filter={get_filter('heading-to-title')}", ) TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")]) Loading @@ -41,28 +32,10 @@ def _get_table_of_contents(document: Pandoc) -> BulletList: return make_bullet_list(headers_tree) def _replace(document: Pandoc, old: Block, new: Block) -> None: old_locations = tuple( path[-1] for element, path in pandoc.iter(document, path=True) if element == old ) for parent, index in old_locations: parent[index] = new def md_to_html(markdown: str) -> str: def insert_table_of_contents(content: Pandoc) -> Pandoc: """ Use pandoc to convert the given Markdown string to an HTML string. Replace [[_TOC_]] with a table of contents whenever it appears in the content. :param markdown: A string containing text in Markdown :return: The same text in HTML This works in-place, i.e. `content` is modified. We nevertheless return it for convenience. 
""" document = pandoc.read( source=markdown, format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}", options=INPUT_OPTIONS, ) _replace(document, TOC_PLACEHOLDER, _get_table_of_contents(document)) return pandoc.write(document, format="html", options=OUTPUT_OPTIONS) return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content)) Loading
src/coliverter/convert.py +7 −12 Original line number Diff line number Diff line Loading @@ -4,13 +4,11 @@ from typing import Annotated from multiprocessing import Pool from coliverter.documents.Document import Document from coliverter.files.FileFormat import FileFormat from coliverter.cli.typer import typer from coliverter.files.read import read from coliverter.files.write import write from coliverter.steps.html_to_pdf import html_to_pdf from coliverter.steps.md_to_html import md_to_html from typer import Argument, Option Loading Loading @@ -39,16 +37,13 @@ def _convert_single( markdown_path: Path | None, output_path: Path | None, output_format: FileFormat ) -> None: markdown = read(markdown_path) html = md_to_html(markdown) if output_format == FileFormat.HTML: write(html, output_path) return document = Document.from_markdown(markdown) pdf = html_to_pdf(html) if output_format == FileFormat.PDF: write(pdf, output_path) return match output_format: case FileFormat.HTML: write(document.html, output_path) case FileFormat.PDF: write(document.pdf, output_path) def _convert_single_guessing_output_path( Loading
# src/coliverter/documents/Document.py
"""A Markdown source parsed with pandoc, plus its HTML and PDF renderings."""

from dataclasses import dataclass
from typing import Self

import pandoc
from pandoc.types import Pandoc
from weasyprint import HTML

from coliverter.documents.DocumentType import DocumentType
from coliverter.documents.transform.insert_table_of_contents import (
    insert_table_of_contents,
)
from coliverter.resources.pandoc.get_filter import get_filter

# Extensions appended to the `commonmark_x` input format when reading Markdown.
ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists")

# Options handed to `pandoc.read`.
INPUT_OPTIONS = ("--strip-comments",)

# Options handed to `pandoc.write` when producing HTML.
OUTPUT_OPTIONS = (
    "--wrap=none",
    "--standalone",
    "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css",
    "--variable=lang=de",
    f"--lua-filter={get_filter('heading-to-title')}",
)


@dataclass(frozen=True)
class Document:
    """
    A pandoc document tree paired with the kind of document it represents.

    Instances are normally built with `Document.from_markdown`; the `html`
    and `pdf` properties render the tree on every access.
    """

    # The pandoc tree, with all transformations already applied.
    content: Pandoc
    # The kind of document, either given explicitly or guessed from `content`.
    type: DocumentType

    @classmethod
    def from_markdown(
        cls, markdown: str, maybe_type: DocumentType | None = None
    ) -> Self:
        """
        Build a document from Markdown source text.

        The source is parsed with pandoc and then run through the configured
        transformations (currently only the table-of-contents insertion).

        :param markdown: The Markdown source.
        :param maybe_type: Forces the document type. Pass it when the automatic
            detection gets it wrong or the source should deliberately be
            interpreted differently; when omitted, the type is guessed from
            the parsed content.
        :return: The document holding the transformed tree and its type.
        """
        input_format = f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}"
        tree = pandoc.read(
            source=markdown,
            format=input_format,
            options=INPUT_OPTIONS,
        )

        # Every transformation receives the tree and hands back the tree to
        # continue with, so they are simply chained in order.
        for apply_transformation in (insert_table_of_contents,):
            tree = apply_transformation(tree)

        # The guess runs on the already transformed tree.
        document_type = maybe_type or DocumentType.guess(tree)
        return cls(content=tree, type=document_type)

    @property
    def html(self) -> str:
        """The document written out by pandoc as HTML, using `OUTPUT_OPTIONS`."""
        return pandoc.write(self.content, format="html", options=OUTPUT_OPTIONS)

    @property
    def pdf(self) -> bytes:
        """The document as PDF, produced by passing the HTML rendering to WeasyPrint."""
        markup = self.html
        return HTML(string=markup).write_pdf()
src/coliverter/documents/DocumentType.py +2 −2 Original line number Diff line number Diff line Loading @@ -10,9 +10,9 @@ class DocumentType(StrEnum): OTHER = auto() @classmethod def guess(cls, document: Pandoc) -> Self: def guess(cls, content: Pandoc) -> Self: headers = tuple( element for element in pandoc.iter(document) if isinstance(element, Header) element for element in pandoc.iter(content) if isinstance(element, Header) ) if not headers: Loading
src/coliverter/steps/md_to_html.py→src/coliverter/documents/transform/insert_table_of_contents.py +7 −34 Original line number Diff line number Diff line import pandoc from pandoc.types import Block, BulletList, Header, Pandoc, Plain, Para, Str, Emph from coliverter.documents.transform.replace import replace import pandoc from coliverter.documents.DocumentType import DocumentType from coliverter.resources.pandoc.get_filter import get_filter from coliverter.tree.Node import Node ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists") INPUT_OPTIONS = ("--strip-comments",) OUTPUT_OPTIONS = ( "--wrap=none", "--standalone", "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css", "--variable=lang=de", f"--lua-filter={get_filter('heading-to-title')}", ) TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")]) Loading @@ -41,28 +32,10 @@ def _get_table_of_contents(document: Pandoc) -> BulletList: return make_bullet_list(headers_tree) def _replace(document: Pandoc, old: Block, new: Block) -> None: old_locations = tuple( path[-1] for element, path in pandoc.iter(document, path=True) if element == old ) for parent, index in old_locations: parent[index] = new def md_to_html(markdown: str) -> str: def insert_table_of_contents(content: Pandoc) -> Pandoc: """ Use pandoc to convert the given Markdown string to an HTML string. Replace [[_TOC_]] with a table of contents whenever it appears in the content. :param markdown: A string containing text in Markdown :return: The same text in HTML This works in-place, i.e. `content` is modified. We nevertheless return it for convenience. 
""" document = pandoc.read( source=markdown, format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}", options=INPUT_OPTIONS, ) _replace(document, TOC_PLACEHOLDER, _get_table_of_contents(document)) return pandoc.write(document, format="html", options=OUTPUT_OPTIONS) return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content))