From 12b6417f519e650d5ab0fb8d54f1dde923823d96 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Tue, 14 Jan 2025 12:53:52 +0100 Subject: [PATCH] move logic in BaseTextImageEnrichmentModel Signed-off-by: Michele Dolfi --- docling/datamodel/base_models.py | 8 +++++ docling/models/base_model.py | 23 ++++++++++-- .../examples/develop_formula_understanding.py | 35 ++++--------------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 1df5152a..6dac4672 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -7,6 +7,7 @@ PictureDataType, Size, TableCell, + TextItem, ) from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location DocumentStream, @@ -201,6 +202,13 @@ class AssembledUnit(BaseModel): headers: List[PageElement] = [] +class TextImageEnrichmentElement(BaseModel): + element: TextItem + image: Image + + model_config = ConfigDict(arbitrary_types_allowed=True) + + class Page(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) diff --git a/docling/models/base_model.py b/docling/models/base_model.py index 050fdd28..7cc764b6 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from typing import Any, Generic, Iterable, Optional -from docling_core.types.doc import DoclingDocument, NodeItem +from docling_core.types.doc import DoclingDocument, NodeItem, TextItem from typing_extensions import TypeVar -from docling.datamodel.base_models import Page +from docling.datamodel.base_models import Page, TextImageEnrichmentElement from docling.datamodel.document import ConversionResult @@ -46,3 +46,22 @@ def prepare_element( if self.is_processable(doc=conv_res.document, element=element): return element return None + + +class BaseTextImageEnrichmentModel(GenericEnrichmentModel[TextImageEnrichmentElement]): + + images_scale: float + + def prepare_element( + self, conv_res: ConversionResult, element: NodeItem + ) -> Optional[TextImageEnrichmentElement]: + if not self.is_processable(doc=conv_res.document, element=element): + return None + + assert isinstance(element, TextItem) + element_prov = element.prov[0] + page_ix = element_prov.page_no - 1 + cropped_image = conv_res.pages[page_ix].get_image( + scale=self.images_scale, cropbox=element_prov.bbox + ) + return TextImageEnrichmentElement(element=element, image=cropped_image) diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index d9b435e1..82f61887 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -3,14 +3,11 @@ from typing import Iterable from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem -from PIL import Image as PILImage -from pydantic import BaseModel, ConfigDict -from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import ConversionResult +from docling.datamodel.base_models import InputFormat, TextImageEnrichmentElement from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.models.base_model import BaseEnrichmentModel, GenericEnrichmentModel +from docling.models.base_model import BaseTextImageEnrichmentModel from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline @@ -18,16 +15,9 @@ class ExampleFormulaUPipelineOptions(PdfPipelineOptions): do_formula_understanding: bool = True -class FormulaEnrichmentElement(BaseModel): - element: TextItem - image: PILImage.Image - - model_config = ConfigDict(arbitrary_types_allowed=True) - - -class ExampleFormulaUEnrichmentModel(GenericEnrichmentModel[FormulaEnrichmentElement]): - - images_scale: float = 2.6 +# A new enrichment model using both the document element and its image as input +class ExampleFormulaUEnrichmentModel(BaseTextImageEnrichmentModel): + images_scale = 2.6 def __init__(self, enabled: bool): self.enabled = enabled @@ -39,21 +29,8 @@ def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool: and element.label == DocItemLabel.FORMULA ) - def prepare_element( - self, conv_res: ConversionResult, element: NodeItem - ) -> FormulaEnrichmentElement: - if self.is_processable(doc=conv_res.document, element=element): - assert isinstance(element, TextItem) - element_prov = element.prov[0] - page_ix = element_prov.page_no - 1 - cropped_image = conv_res.pages[page_ix].get_image( - scale=self.images_scale, cropbox=element_prov.bbox - ) - - return FormulaEnrichmentElement(element=element, image=cropped_image) - def __call__( - self, doc: DoclingDocument, element_batch: Iterable[FormulaEnrichmentElement] + self, doc: DoclingDocument, element_batch: Iterable[TextImageEnrichmentElement] ) -> Iterable[NodeItem]: if not self.enabled: return