Add manual --document-type parameter (d026e458) · Commits · Fachschaft / coliverter

pyproject.toml

+1 −1

Original line number	Diff line number	Diff line
		[project]
		name = "coliverter"
		version = "0.4.1"
		version = "0.4.2"
		authors = [
		{ name="Jakob Moser", email="moser@cl.uni-heidelberg.de" },
		]

src/coliverter/convert.py

+18 −6

Original line number	Diff line number	Diff line
		@@ -5,6 +5,7 @@ from typing import Annotated
		from multiprocessing import Pool

		from coliverter.documents.Document import Document
		from coliverter.documents.DocumentType import DocumentType
		from coliverter.files.FileFormat import FileFormat
		from coliverter.cli.typer import typer
		from coliverter.files.read import read
		@@ -34,10 +35,13 @@ def _get_output_format(


		def _convert_single(
		markdown_path: Path | None, output_path: Path | None, output_format: FileFormat
		markdown_path: Path | None,
		output_path: Path | None,
		output_format: FileFormat,
		document_type: DocumentType | None,
		) -> None:
		markdown = read(markdown_path)
		document = Document.from_markdown(markdown)
		document = Document.from_markdown(markdown, document_type)

		match output_format:
		case FileFormat.HTML:
		@@ -51,6 +55,7 @@ def _convert_single_guessing_output_path(
		markdown_dir_path: Path,
		output_dir_path: Path,
		output_format: FileFormat,
		document_type: DocumentType | None,
		) -> None:
		guessed_out_file_path = output_dir_path / (
		markdown_file_path.relative_to(markdown_dir_path).with_suffix(
		@@ -61,9 +66,7 @@ def _convert_single_guessing_output_path(
		guessed_out_file_path.parent.mkdir(parents=True, exist_ok=True)

		_convert_single(
		markdown_file_path,
		guessed_out_file_path,
		output_format,
		markdown_file_path, guessed_out_file_path, output_format, document_type
		)


		@@ -87,6 +90,12 @@ def convert(
		help="Format of the output file. If None, infer from extension in output path. If inferring is not possible, default to html."
		),
		] = None,
		document_type: Annotated[
		DocumentType | None,
		Option(
		help="Type of the document. If None, guess it. Can be used to influence how content is transformed during conversion."
		),
		] = None,
		) -> None:
		"""
		Convert Markdown files into other formats, using the organization identity of Fachschaft Computerlinguistik.
		@@ -109,7 +118,10 @@ def convert(
		repeat(markdown_path),
		repeat(output_path),
		repeat(definitely_output_format),
		repeat(document_type),
		),
		)
		else:
		_convert_single(markdown_path, output_path, definitely_output_format)
		_convert_single(
		markdown_path, output_path, definitely_output_format, document_type
		)

src/coliverter/documents/Document.py

+3 −2

Original line number	Diff line number	Diff line
		@@ -41,13 +41,14 @@ class Document:
		format=f"commonmark_x+{'+'.join(ACTIVE_MARKDOWN_EXTENSIONS)}",
		options=INPUT_OPTIONS,
		)
		document_type = maybe_type or DocumentType.guess(content)

		# Apply the transformations we want to do
		transformations = (insert_table_of_contents,)
		for transform in transformations:
		content = transform(content)
		content = transform(content, document_type)

		return cls(content=content, type=maybe_type or DocumentType.guess(content))
		return cls(content=content, type=document_type)

		@property
		def pdf(self) -> bytes:

src/coliverter/documents/transform/insert_table_of_contents.py

+8 −5

Original line number	Diff line number	Diff line
		@@ -9,9 +9,7 @@ from coliverter.tree.Node import Node
		TOC_PLACEHOLDER = Para([Str("[["), Emph([Str("TOC")]), Str("]]")])


		def _get_table_of_contents(document: Pandoc) -> BulletList:
		document_type = DocumentType.guess(document)

		def _get_table_of_contents(document: Pandoc, document_type: DocumentType) -> BulletList:
		headers = tuple(
		element for element in pandoc.iter(document) if isinstance(element, Header)
		)
		@@ -32,10 +30,15 @@ def _get_table_of_contents(document: Pandoc) -> BulletList:
		return make_bullet_list(headers_tree)


		def insert_table_of_contents(content: Pandoc) -> Pandoc:
		def insert_table_of_contents(content: Pandoc, document_type: DocumentType) -> Pandoc:
		"""
		Replace [[_TOC_]] with a table of contents whenever it appears in the content.

		This works in-place, i.e. `content` is modified. We nevertheless return it for convenience.

		:param content: The document contents
		:param document_type: The type of the document (necessary as it might require a special table of contents format)
		"""
		return replace(content, TOC_PLACEHOLDER, _get_table_of_contents(content))
		return replace(
		content, TOC_PLACEHOLDER, _get_table_of_contents(content, document_type)
		)