feat: adding txt and doctags output (#68)

* feat: adding txt and doctags output
  Signed-off-by: Peter Staar <[email protected]>
* cleaned up the export
  Signed-off-by: Peter Staar <[email protected]>
* Fix datamodel usage for Figure
  Signed-off-by: Christoph Auer <[email protected]>
* updated all the examples to deal with new rendering
  Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
DS4SD · Sep 10, 2024 · bdfdfbf · bdfdfbf
1 parent cd5b629
commit bdfdfbf
Show file tree

Hide file tree

Showing 7 changed files with 784 additions and 1,173 deletions.
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -11,6 +11,7 @@
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -279,7 +280,7 @@ def make_spans(cell):
                     ),
                 )
                 figures.append(
-                    BaseCell(
+                    Figure(
                         prov=[
                             Prov(
                                 bbox=target_bbox,
@@ -312,8 +313,76 @@ def make_spans(cell):
     def render_as_dict(self):
         return self.output.model_dump(by_alias=True, exclude_none=True)
 
-    def render_as_markdown(self):
-        return self.output.export_to_markdown()
+    def render_as_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=strict_text,
+        )
+
+    def render_as_text(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+        ],
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=True,
+        )
+
+    def render_as_doctags(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        return self.output.export_to_document_tokens(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            page_tagging=page_tagging,
+            location_tagging=location_tagging,
+            location_dimensions=location_dimensions,
+            add_new_line=add_new_line,
+        )
 
     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)

diff --git a/docling/utils/export.py b/docling/utils/export.py
@@ -163,8 +163,12 @@ def _process_page():
         content_md = doc.export_to_markdown(
             main_text_start=start_ix, main_text_stop=end_ix
         )
+        # No page-tagging since we only process one page at a time
+        content_dt = doc.export_to_document_tokens(
+            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+        )
 
-        return content_text, content_md, page_cells, page_segments, page
+        return content_text, content_md, content_dt, page_cells, page_segments, page
 
     for ix, orig_item in enumerate(doc.main_text):
 

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -30,9 +30,18 @@ def export_documents(
             with (output_dir / f"{doc_filename}.json").open("w") as fp:
                 fp.write(json.dumps(conv_res.render_as_dict()))
 
+            # Export Text format:
+            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
+                fp.write(conv_res.render_as_text())
+
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w") as fp:
                 fp.write(conv_res.render_as_markdown())
+
+            # Export Document Tags format:
+            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
+                fp.write(conv_res.render_as_doctags())
+
         elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
             _log.info(
                 f"Document {conv_res.input.file} was partially converted with the following errors:"

diff --git a/examples/custom_convert.py b/examples/custom_convert.py
@@ -31,9 +31,18 @@ def export_documents(
             with (output_dir / f"{doc_filename}.json").open("w") as fp:
                 fp.write(json.dumps(conv_res.render_as_dict()))
 
+            # Export Text format:
+            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
+                fp.write(conv_res.render_as_text())
+
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w") as fp:
                 fp.write(conv_res.render_as_markdown())
+
+            # Export Document Tags format:
+            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
+                fp.write(conv_res.render_as_doctags())
+
         else:
             _log.info(f"Document {conv_res.input.file} failed to convert.")
             failure_count += 1

diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py
@@ -51,6 +51,7 @@ def main():
         for (
             content_text,
             content_md,
+            content_dt,
             page_cells,
             page_segments,
             page,
@@ -71,6 +72,7 @@ def main():
                     "cells": page_cells,
                     "contents": content_text,
                     "contents_md": content_md,
+                    "contents_dt": content_dt,
                     "segments": page_segments,
                     "extra": {
                         "page_num": page.page_no + 1,