Verified commit d026e458, authored by Jakob Moser
Browse files

Add manual --document-type parameter

parent f7e6d557
Loading
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
[project]
name = "coliverter"
version = "0.4.1"
version = "0.4.2"
authors = [
  { name="Jakob Moser", email="moser@cl.uni-heidelberg.de" },
]
+18 −6
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ from typing import Annotated
from multiprocessing import Pool

from coliverter.documents.Document import Document
from coliverter.documents.DocumentType import DocumentType
from coliverter.files.FileFormat import FileFormat
from coliverter.cli.typer import typer
from coliverter.files.read import read
@@ -34,10 +35,13 @@ def _get_output_format(


def _convert_single(
    markdown_path: Path | None, output_path: Path | None, output_format: FileFormat
    markdown_path: Path | None,
    output_path: Path | None,
    output_format: FileFormat,
    document_type: DocumentType | None,
) -> None:
    markdown = read(markdown_path)
    document = Document.from_markdown(markdown)
    document = Document.from_markdown(markdown, document_type)

    match output_format:
        case FileFormat.HTML:
@@ -51,6 +55,7 @@ def _convert_single_guessing_output_path(
    markdown_dir_path: Path,
    output_dir_path: Path,
    output_format: FileFormat,
    document_type: DocumentType | None,
) -> None:
    guessed_out_file_path = output_dir_path / (
        markdown_file_path.relative_to(markdown_dir_path).with_suffix(
@@ -61,9 +66,7 @@ def _convert_single_guessing_output_path(
    guessed_out_file_path.parent.mkdir(parents=True, exist_ok=True)

    _convert_single(
        markdown_file_path,
        guessed_out_file_path,
        output_format,
        markdown_file_path, guessed_out_file_path, output_format, document_type
    )


@@ -87,6 +90,12 @@ def convert(
            help="Format of the output file. If None, infer from extension in output path. If inferring is not possible, default to html."
        ),
    ] = None,
    document_type: Annotated[
        DocumentType | None,
        Option(
            help="Type of the document. If None, guess it. Can be used to influence how content is transformed during conversion."
        ),
    ] = None,
) -> None:
    """
    Convert Markdown files into other formats, using the organization identity of Fachschaft Computerlinguistik.
@@ -109,7 +118,10 @@ def convert(
                    repeat(markdown_path),
                    repeat(output_path),
                    repeat(definitely_output_format),
                    repeat(document_type),
                ),
            )
    else:
        _convert_single(markdown_path, output_path, definitely_output_format)
        _convert_single(
            markdown_path, output_path, definitely_output_format, document_type
        )
+3 −2
Original line number Diff line number Diff line
@@ -41,13 +41,14 @@ class Document:
            format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}",
            options=INPUT_OPTIONS,
        )
        document_type = maybe_type or DocumentType.guess(content)

        # Apply the transformations we want to do
        transformations = (insert_table_of_contents,)
        for transform in transformations:
            content = transform(content)
            content = transform(content, document_type)

        return cls(content=content, type=maybe_type or DocumentType.guess(content))
        return cls(content=content, type=document_type)

    @property
    def pdf(self) -> bytes:
+8 −5
Original line number Diff line number Diff line
@@ -9,9 +9,7 @@ from coliverter.tree.Node import Node
TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")])


def _get_table_of_contents(document: Pandoc) -> BulletList:
    document_type = DocumentType.guess(document)

def _get_table_of_contents(document: Pandoc, document_type: DocumentType) -> BulletList:
    headers = tuple(
        element for element in pandoc.iter(document) if isinstance(element, Header)
    )
@@ -32,10 +30,15 @@ def _get_table_of_contents(document: Pandoc) -> BulletList:
    return make_bullet_list(headers_tree)


def insert_table_of_contents(content: Pandoc) -> Pandoc:
def insert_table_of_contents(content: Pandoc, document_type: DocumentType) -> Pandoc:
    """
    Replace [[_TOC_]] with a table of contents whenever it appears in the content.

    This works in-place, i.e. `content` is modified. We nevertheless return it for convenience.

    :param content: The document contents
    :param document_type: The type of the document (necessary as it might require a special table of contents format)
    """
    return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content))
    return replace(
        content, TOC_PLACEHOLDER, _get_table_of_contents(content, document_type)
    )