Verified commit e6e40feb, authored by Jakob Moser
Browse files

Refactor, introduce "Document" abstraction

parent 5679767b
Loading
Loading
Loading
Loading
Loading
+7 −12
Original line number Diff line number Diff line
@@ -4,13 +4,11 @@ from typing import Annotated

from multiprocessing import Pool

from coliverter.documents.Document import Document
from coliverter.files.FileFormat import FileFormat
from coliverter.cli.typer import typer
from coliverter.files.read import read
from coliverter.files.write import write
from coliverter.steps.html_to_pdf import html_to_pdf

from coliverter.steps.md_to_html import md_to_html


from typer import Argument, Option
@@ -39,16 +37,13 @@ def _convert_single(
    markdown_path: Path | None, output_path: Path | None, output_format: FileFormat
) -> None:
    markdown = read(markdown_path)
    html = md_to_html(markdown)

    if output_format == FileFormat.HTML:
        write(html, output_path)
        return
    document = Document.from_markdown(markdown)

    pdf = html_to_pdf(html)
    if output_format == FileFormat.PDF:
        write(pdf, output_path)
        return
    match output_format:
        case FileFormat.HTML:
            write(document.html, output_path)
        case FileFormat.PDF:
            write(document.pdf, output_path)


def _convert_single_guessing_output_path(
+58 −0
Original line number Diff line number Diff line
from dataclasses import dataclass
from typing import Self

import pandoc
from pandoc.types import Pandoc
from weasyprint import HTML

from coliverter.documents.DocumentType import DocumentType
from coliverter.documents.transform.insert_table_of_contents import (
    insert_table_of_contents,
)
from coliverter.resources.pandoc.get_filter import get_filter

# Pandoc extensions enabled on top of the `commonmark_x` input format. They are
# joined with "+" and appended to the format name passed to `pandoc.read`.
ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists")
# Extra pandoc command-line options used when reading the Markdown source.
INPUT_OPTIONS = ("--strip-comments",)
# Pandoc command-line options used when writing the HTML output.
OUTPUT_OPTIONS = (
    "--wrap=none",
    "--standalone",
    # The stylesheet is referenced by URL, so the generated HTML links to a remote resource.
    "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css",
    "--variable=lang=de",
    # Evaluated once, when this module is imported.
    # NOTE(review): presumably get_filter returns the path of a bundled Lua filter — confirm.
    f"--lua-filter={get_filter('heading-to-title')}",
)


@dataclass(frozen=True)
class Document:
    """
    A parsed document (as a pandoc syntax tree) together with its document type.

    Instances are usually created with `Document.from_markdown`. The `html` and `pdf`
    renderings are derived from `content` on every access; nothing is cached.
    """

    # The pandoc syntax tree of the document.
    content: Pandoc
    # The kind of document, either given explicitly or guessed from `content`.
    type: DocumentType

    @classmethod
    def from_markdown(
        cls, markdown: str, maybe_type: DocumentType | None = None
    ) -> Self:
        """
        Create a document from a markdown string. The document type can either be explicitly provided
        (if the detection does not work, or you want to force the given markdown source to be interpreted
        differently, which is sometimes necessary), or is guessed from the document's content.

        :param markdown: A string containing text in Markdown
        :param maybe_type: The document type to use, or None to guess it from the parsed content
        :return: A document holding the parsed and transformed content
        """
        content = pandoc.read(
            source=markdown,
            format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}",
            options=INPUT_OPTIONS,
        )

        # Apply the transformations we want to do, in order. Each one receives the
        # result of the previous one.
        transformations = (insert_table_of_contents,)
        for transform in transformations:
            content = transform(content)

        # Only guess when no type was passed at all. Checking for None explicitly (instead
        # of relying on truthiness) makes sure an explicitly provided type is never discarded.
        # The guess is made on the transformed content.
        document_type = (
            maybe_type if maybe_type is not None else DocumentType.guess(content)
        )

        return cls(content=content, type=document_type)

    @property
    def pdf(self) -> bytes:
        """
        The document rendered as PDF, produced by passing the HTML rendering to WeasyPrint.
        """
        return HTML(string=self.html).write_pdf()

    @property
    def html(self) -> str:
        """
        The document rendered as HTML, written by pandoc using OUTPUT_OPTIONS.
        """
        return pandoc.write(self.content, format="html", options=OUTPUT_OPTIONS)
+2 −2
Original line number Diff line number Diff line
@@ -10,9 +10,9 @@ class DocumentType(StrEnum):
    OTHER = auto()

    @classmethod
    def guess(cls, document: Pandoc) -> Self:
    def guess(cls, content: Pandoc) -> Self:
        headers = tuple(
            element for element in pandoc.iter(document) if isinstance(element, Header)
            element for element in pandoc.iter(content) if isinstance(element, Header)
        )

        if not headers:
+7 −34
Original line number Diff line number Diff line
import pandoc
from pandoc.types import Block, BulletList, Header, Pandoc, Plain, Para, Str, Emph

from coliverter.documents.transform.replace import replace
import pandoc

from coliverter.documents.DocumentType import DocumentType
from coliverter.resources.pandoc.get_filter import get_filter
from coliverter.tree.Node import Node


ACTIVE_MARKDOWN_EXTENSIONS = ("autolink_bare_uris", "emoji", "task_lists")
INPUT_OPTIONS = ("--strip-comments",)
OUTPUT_OPTIONS = (
    "--wrap=none",
    "--standalone",
    "--css=https://fachschaft.cl.uni-heidelberg.de/wp-content/themes/fscoli-next-master/typography.css",
    "--variable=lang=de",
    f"--lua-filter={get_filter('heading-to-title')}",
)
TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")])


@@ -41,28 +32,10 @@ def _get_table_of_contents(document: Pandoc) -> BulletList:
    return make_bullet_list(headers_tree)


def _replace(document: Pandoc, old: Block, new: Block) -> None:
    """
    Overwrite every element of `document` that compares equal to `old` with `new`.

    This works in-place, i.e. `document` is modified and nothing is returned.
    """
    # Collect all (holder, index) locations first, so the tree is only
    # modified after the traversal has finished.
    targets = []
    for element, path in pandoc.iter(document, path=True):
        if element == old:
            targets.append(path[-1])

    for holder, position in targets:
        holder[position] = new


def md_to_html(markdown: str) -> str:
def insert_table_of_contents(content: Pandoc) -> Pandoc:
    """
    Use pandoc to convert the given Markdown string to an HTML string.
    Replace [[_TOC_]] with a table of contents whenever it appears in the content.

    :param markdown: A string containing text in Markdown
    :return: The same text in HTML
    This works in-place, i.e. `content` is modified. We nevertheless return it for convenience.
    """
    document = pandoc.read(
        source=markdown,
        format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}",
        options=INPUT_OPTIONS,
    )

    _replace(document, TOC_PLACEHOLDER, _get_table_of_contents(document))

    return pandoc.write(document, format="html", options=OUTPUT_OPTIONS)
    return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content))
Loading