Skip to content

Commit

Permalink
feat: add docling cli (#75)
Browse files Browse the repository at this point in the history
* chore: add simple convert script

Signed-off-by: Peter Staar <[email protected]>

* reformatted all

Signed-off-by: Peter Staar <[email protected]>

* reformatted all

Signed-off-by: Peter Staar <[email protected]>

* added default arg

Signed-off-by: Peter Staar <[email protected]>

* use typer for the docling CLI

Signed-off-by: Michele Dolfi <[email protected]>

* describe output when saving

Signed-off-by: Michele Dolfi <[email protected]>

* add tests for CLI

Signed-off-by: Michele Dolfi <[email protected]>

* add export options

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
PeterStaar-IBM and dolfim-ibm authored Sep 13, 2024
1 parent 8aa476c commit 9899078
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 1 deletion.
Empty file added docling/cli/__init__.py
Empty file.
257 changes: 257 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
import importlib
import json
import logging
import time
import warnings
from enum import Enum
from pathlib import Path
from typing import Annotated, Iterable, List, Optional

import typer
from pydantic import AnyUrl
from rich.console import Console

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

# Silence UserWarning/FutureWarning noise raised from these third-party modules
# so that it does not clutter the CLI output.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

_log = logging.getLogger(__name__)

# Error messages are printed on stderr so they never mix with regular stdout output.
err_console = Console(stderr=True)


# The Typer application object that the CLI commands below are registered on.
app = typer.Typer(
    name="Docling",
    no_args_is_help=True,
    add_completion=False,
    pretty_exceptions_enable=False,
)


def version_callback(value: bool):
    """Print the installed versions of Docling and its companion packages, then exit.

    Used as the eager callback of the ``--version`` option.

    Args:
        value: Truthy when ``--version`` was passed; otherwise the callback
            does nothing and returns ``None``.

    Raises:
        typer.Exit: Always raised after the versions were printed, so that no
            further CLI processing happens.
    """
    if not value:
        return

    # ``import importlib`` alone does not guarantee that the ``metadata``
    # submodule is loaded; import it explicitly instead of relying on some
    # other package having imported it as a side effect.
    from importlib import metadata

    packages = (
        ("Docling", "docling"),
        ("Docling Core", "docling-core"),
        ("Docling IBM Models", "docling-ibm-models"),
        ("Docling Parse", "docling-parse"),
    )
    # Resolve every version before printing anything, so a missing package
    # fails before partial output is produced.
    versions = [(label, metadata.version(dist)) for label, dist in packages]
    for label, dist_version in versions:
        print(f"{label} version: {dist_version}")
    raise typer.Exit()


# Define an enum for the backend options
class Backend(str, Enum):
    """PDF parsing backends that can be selected for the ``convert`` command.

    The ``str`` mixin makes each member compare equal to its plain string value.
    """

    PYPDFIUM2 = "pypdfium2"  # ``convert`` uses PyPdfiumDocumentBackend for this value
    DOCLING = "docling"  # ``convert`` uses DoclingParseDocumentBackend for this value


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
    export_json: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
):
    """Write every successfully converted document to ``output_dir``.

    Each enabled format produces one file named after the stem of the input
    file (``<stem>.json``, ``<stem>.txt``, ``<stem>.md``, ``<stem>.doctags``).
    Because only the stem is used, two inputs sharing a stem write to the
    same output path. Failed conversions are logged as warnings and skipped.
    A summary with the processed and failed counts is logged at the end.

    Args:
        conv_results: Conversion results to export; iterated exactly once.
        output_dir: Directory the files are written to. It is not created here.
        export_json: Write the ``render_as_dict()`` output serialized as JSON.
        export_md: Write the ``render_as_markdown()`` output.
        export_txt: Write the ``render_as_text()`` output.
        export_doctags: Write the ``render_as_doctags()`` output.
    """
    success_count = 0
    failure_count = 0

    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        success_count += 1
        doc_filename = conv_res.input.file.stem

        # One row per export format: (enabled, file suffix, log label, renderer).
        # The renderers are only called for enabled formats.
        exports = (
            # Deep Search document JSON format
            (
                export_json,
                "json",
                "JSON",
                lambda: json.dumps(conv_res.render_as_dict()),
            ),
            (export_txt, "txt", "Text", conv_res.render_as_text),
            (export_md, "md", "Markdown", conv_res.render_as_markdown),
            (export_doctags, "doctags", "Doc Tags", conv_res.render_as_doctags),
        )

        for enabled, suffix, label, render in exports:
            if not enabled:
                continue
            fname = output_dir / f"{doc_filename}.{suffix}"
            # Always write UTF-8: the platform default encoding (e.g. cp1252
            # on Windows) cannot represent all characters found in documents.
            with fname.open("w", encoding="utf-8") as fp:
                _log.info(f"writing {label} output to {fname}")
                fp.write(render())

    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
    )


@app.command(no_args_is_help=True)
def convert(
    input_sources: Annotated[
        List[Path],
        typer.Argument(
            ...,
            metavar="source",
            help="PDF files to convert. Directories are also accepted.",
        ),
    ],
    export_json: Annotated[
        bool,
        typer.Option(
            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
        ),
    ] = False,
    export_md: Annotated[
        bool,
        typer.Option(
            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
        ),
    ] = True,
    export_txt: Annotated[
        bool,
        typer.Option(
            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
        ),
    ] = False,
    export_doctags: Annotated[
        bool,
        typer.Option(
            ...,
            "--doctags/--no-doctags",
            help="If enabled the document is exported as Doc Tags.",
        ),
    ] = False,
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    """Convert PDF documents and export them in the selected formats."""
    # NOTE: Typer shows the docstring above as the command help text, so it is
    # kept short and user-facing; implementation notes live in comments.
    # ``version`` is not read here: it is fully handled by ``version_callback``.
    logging.basicConfig(level=logging.INFO)

    # Expand the given sources into the flat list of documents to convert.
    input_doc_paths: List[Path] = []
    for source in input_sources:
        if not source.exists():
            err_console.print(
                f"[red]Error: The input file {source} does not exist.[/red]"
            )
            raise typer.Abort()
        if source.is_dir():
            # Recursively collect PDFs, matching the extension case-insensitively.
            # ``Path.glob(..., case_sensitive=...)`` only exists on Python >= 3.12,
            # so the extension is compared manually to stay compatible.
            input_doc_paths.extend(
                path
                for path in source.rglob("*")
                if path.is_file() and path.name.lower().endswith(".pdf")
            )
        else:
            input_doc_paths.append(source)

    ###########################################################################

    # Build the pipeline options and pick the PDF backend from the CLI flags.

    # Normalizing through ``Backend(...)`` also accepts the plain string value
    # when this function is called programmatically.
    selected_backend = Backend(backend)
    pdf_backends = {
        Backend.PYPDFIUM2: PyPdfiumDocumentBackend,
        Backend.DOCLING: DoclingParseDocumentBackend,
    }

    pipeline_options = PipelineOptions()
    # Honour the --ocr/--no-ocr flag for every backend.
    pipeline_options.do_ocr = ocr
    pipeline_options.do_table_structure = True
    # Cell matching is enabled for every combination except PyPdfium without
    # OCR (kept from the original configuration matrix).
    pipeline_options.table_structure_options.do_cell_matching = (
        ocr or selected_backend != Backend.PYPDFIUM2
    )

    doc_converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backends[selected_backend],
    )

    ###########################################################################

    # Define input files
    conversion_input = DocumentConversionInput.from_paths(input_doc_paths)

    start_time = time.time()

    conv_results = doc_converter.convert(conversion_input)

    output.mkdir(parents=True, exist_ok=True)
    export_documents(
        conv_results,
        output_dir=output,
        export_json=export_json,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
    )

    # Measured after the export so the time spent writing files is included.
    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")


# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
30 changes: 29 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ torchvision = [
{version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
]
typer = "^0.12.5"

[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
Expand Down Expand Up @@ -95,6 +96,10 @@ examples = [
"langchain-text-splitters",
]


[tool.poetry.scripts]
docling = "docling.cli.main:app"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Expand Down
20 changes: 20 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typer.testing import CliRunner

from docling.cli.main import app

# Shared Typer test runner used by the tests below to invoke the CLI app.
runner = CliRunner()


def test_cli_help():
    """Invoking the app with ``--help`` must exit with status code 0."""
    help_args = ["--help"]
    assert runner.invoke(app, help_args).exit_code == 0


def test_cli_version():
    """Invoking the app with ``--version`` must exit with status code 0."""
    version_args = ["--version"]
    assert runner.invoke(app, version_args).exit_code == 0


def test_cli_convert():
    """Converting the sample PDF succeeds and writes the default Markdown export.

    The output is directed to a temporary directory so the test does not
    leave converted files behind in the current working directory.
    """
    # Local imports keep this test self-contained.
    import tempfile
    from pathlib import Path

    source = Path("./tests/data/2305.03393v1-pg9.pdf")
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_dir = Path(tmp_dir)
        result = runner.invoke(app, [str(source), "--output", str(output_dir)])
        assert result.exit_code == 0
        # Markdown export is enabled by default, so the file must exist.
        assert (output_dir / f"{source.stem}.md").exists()

0 comments on commit 9899078

Please sign in to comment.