Skip to content

Commit

Permalink
feat: adding txt and doctags output (#68)
Browse files Browse the repository at this point in the history
* feat: adding txt and doctags output

Signed-off-by: Peter Staar <[email protected]>

* cleaned up the export

Signed-off-by: Peter Staar <[email protected]>

* Fix datamodel usage for Figure

Signed-off-by: Christoph Auer <[email protected]>

* updated all the examples to deal with new rendering

Signed-off-by: Peter Staar <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
PeterStaar-IBM and cau-git authored Sep 10, 2024
1 parent cd5b629 commit bdfdfbf
Show file tree
Hide file tree
Showing 7 changed files with 784 additions and 1,173 deletions.
75 changes: 72 additions & 3 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell
from docling_core.types.doc.base import Figure
from pydantic import BaseModel
from typing_extensions import deprecated

Expand Down Expand Up @@ -279,7 +280,7 @@ def make_spans(cell):
),
)
figures.append(
BaseCell(
Figure(
prov=[
Prov(
bbox=target_bbox,
Expand Down Expand Up @@ -312,8 +313,76 @@ def make_spans(cell):
def render_as_dict(self):
return self.output.model_dump(by_alias=True, exclude_none=True)

def render_as_markdown(self):
return self.output.export_to_markdown()
def render_as_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    strict_text: bool = False,
) -> str:
    """Render the converted document as Markdown.

    Thin wrapper: every argument is forwarded by keyword, unchanged, to
    ``self.output.export_to_markdown``, and its result is returned.

    Args:
        delim: Forwarded as ``delim``; defaults to two newline characters.
        main_text_start: Forwarded as ``main_text_start``; presumably the
            index of the first main-text item to export (confirm against
            the exporter).
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through as-is.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            ``None`` (the default), a fresh list containing ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"``, ``"caption"`` and
            ``"table"`` is used. An explicitly passed list, including an
            empty one, is forwarded untouched.
        strict_text: Forwarded as ``strict_text``.

    Returns:
        Whatever ``self.output.export_to_markdown`` returns.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would
        # be one shared object, so a mutation by the exporter (or by a
        # caller holding the reference) would leak into later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=strict_text,
    )

def render_as_text(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
) -> str:
    """Render the converted document as plain text.

    Delegates to ``self.output.export_to_markdown`` with ``strict_text``
    fixed to ``True``; all other arguments are forwarded by keyword,
    unchanged.

    Args:
        delim: Forwarded as ``delim``; defaults to two newline characters.
        main_text_start: Forwarded as ``main_text_start``; presumably the
            index of the first main-text item to export (confirm against
            the exporter).
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through as-is.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            ``None`` (the default), a fresh list containing ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"`` and ``"caption"`` is
            used. An explicitly passed list, including an empty one, is
            forwarded untouched.

    Returns:
        Whatever ``self.output.export_to_markdown`` returns.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would
        # be one shared object, so a mutation by the exporter (or by a
        # caller holding the reference) would leak into later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=True,
    )

def render_as_doctags(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    """Render the converted document in the document-tags format.

    Thin wrapper: every argument is forwarded by keyword, unchanged, to
    ``self.output.export_to_document_tokens``, and its result is returned.

    Args:
        delim: Forwarded as ``delim``; defaults to two newline characters.
        main_text_start: Forwarded as ``main_text_start``; presumably the
            index of the first main-text item to export (confirm against
            the exporter).
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through as-is.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            ``None`` (the default), a fresh list containing ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"``, ``"caption"``,
            ``"table"`` and ``"figure"`` is used. An explicitly passed
            list, including an empty one, is forwarded untouched.
        page_tagging: Forwarded as ``page_tagging``.
        location_tagging: Forwarded as ``location_tagging``.
        location_dimensions: Forwarded as ``location_dimensions``;
            defaults to ``(100, 100)``.
        add_new_line: Forwarded as ``add_new_line``.

    Returns:
        Whatever ``self.output.export_to_document_tokens`` returns.
    """
    if main_text_labels is None:
        # Build the default per call: a list literal in the signature would
        # be one shared object, so a mutation by the exporter (or by a
        # caller holding the reference) would leak into later calls.
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]
    return self.output.export_to_document_tokens(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        page_tagging=page_tagging,
        location_tagging=location_tagging,
        location_dimensions=location_dimensions,
        add_new_line=add_new_line,
    )

def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,)
Expand Down
6 changes: 5 additions & 1 deletion docling/utils/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,12 @@ def _process_page():
content_md = doc.export_to_markdown(
main_text_start=start_ix, main_text_stop=end_ix
)
# No page tagging, since we only process one page at a time
content_dt = doc.export_to_document_tokens(
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
)

return content_text, content_md, page_cells, page_segments, page
return content_text, content_md, content_dt, page_cells, page_segments, page

for ix, orig_item in enumerate(doc.main_text):

Expand Down
9 changes: 9 additions & 0 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,18 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.render_as_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
fp.write(conv_res.render_as_doctags())

elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {conv_res.input.file} was partially converted with the following errors:"
Expand Down
9 changes: 9 additions & 0 deletions examples/custom_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,18 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())

# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.render_as_markdown())

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
fp.write(conv_res.render_as_doctags())

else:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
Expand Down
2 changes: 2 additions & 0 deletions examples/export_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def main():
for (
content_text,
content_md,
content_dt,
page_cells,
page_segments,
page,
Expand All @@ -71,6 +72,7 @@ def main():
"cells": page_cells,
"contents": content_text,
"contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments,
"extra": {
"page_num": page.page_no + 1,
Expand Down
Loading

0 comments on commit bdfdfbf

Please sign in to comment.