Skip to content

Commit

Permalink
move logic in BaseTextImageEnrichmentModel
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Jan 14, 2025
1 parent 3611335 commit 12b6417
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 31 deletions.
8 changes: 8 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
PictureDataType,
Size,
TableCell,
TextItem,
)
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
DocumentStream,
Expand Down Expand Up @@ -201,6 +202,13 @@ class AssembledUnit(BaseModel):
headers: List[PageElement] = []


class TextImageEnrichmentElement(BaseModel):
    """Bundle of a text item and an image, used as input to an enrichment model.

    Attributes:
        element: The ``TextItem`` the enrichment applies to.
        image: The image accompanying ``element``.
    """

    # ``Image`` is not a pydantic-native type, so arbitrary types must be
    # allowed for the ``image`` field to be accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    element: TextItem
    image: Image


class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

Expand Down
23 changes: 21 additions & 2 deletions docling/models/base_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional

from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc import DoclingDocument, NodeItem, TextItem
from typing_extensions import TypeVar

from docling.datamodel.base_models import Page
from docling.datamodel.base_models import Page, TextImageEnrichmentElement
from docling.datamodel.document import ConversionResult


Expand Down Expand Up @@ -46,3 +46,22 @@ def prepare_element(
if self.is_processable(doc=conv_res.document, element=element):
return element
return None


class BaseTextImageEnrichmentModel(GenericEnrichmentModel[TextImageEnrichmentElement]):
    """Enrichment-model base whose inputs pair a text item with its cropped page image.

    ``images_scale`` is only declared here, without a value; a concrete
    subclass has to assign it before ``prepare_element`` is used.
    """

    # Scale forwarded to ``get_image`` when cropping the element from its page.
    images_scale: float

    def prepare_element(
        self, conv_res: ConversionResult, element: NodeItem
    ) -> Optional[TextImageEnrichmentElement]:
        """Build the enrichment input for ``element``, or ``None`` if it is skipped.

        Args:
            conv_res: Conversion result providing the document and its pages.
            element: Candidate document node.

        Returns:
            A ``TextImageEnrichmentElement`` holding ``element`` and the page
            crop for its first provenance entry, or ``None`` when
            ``is_processable`` rejects the element.
        """
        if self.is_processable(doc=conv_res.document, element=element):
            # Narrows the type for checkers; processable elements are expected
            # to be text items.
            assert isinstance(element, TextItem)

            # Only the first provenance entry is used to locate the element.
            first_prov = element.prov[0]
            # ``page_no`` is shifted down by one to index ``conv_res.pages``.
            source_page = conv_res.pages[first_prov.page_no - 1]
            crop = source_page.get_image(
                scale=self.images_scale, cropbox=first_prov.bbox
            )
            return TextImageEnrichmentElement(element=element, image=crop)

        return None
35 changes: 6 additions & 29 deletions docs/examples/develop_formula_understanding.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,21 @@
from typing import Iterable

from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
from PIL import Image as PILImage
from pydantic import BaseModel, ConfigDict

from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.base_models import InputFormat, TextImageEnrichmentElement
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.base_model import BaseEnrichmentModel, GenericEnrichmentModel
from docling.models.base_model import BaseTextImageEnrichmentModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


class ExampleFormulaUPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options extended with a formula-understanding switch."""

    # Enabled by default. NOTE(review): presumably read by the example pipeline
    # to enable the formula enrichment model - the consumer is not visible here.
    do_formula_understanding: bool = True


class FormulaEnrichmentElement(BaseModel):
    """Bundle of a text item and a PIL image, used as input to the formula model.

    Attributes:
        element: The ``TextItem`` the enrichment applies to.
        image: The PIL image accompanying ``element``.
    """

    # ``PILImage.Image`` is not a pydantic-native type, so arbitrary types must
    # be allowed for the ``image`` field to be accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    element: TextItem
    image: PILImage.Image


class ExampleFormulaUEnrichmentModel(GenericEnrichmentModel[FormulaEnrichmentElement]):

images_scale: float = 2.6
# A new enrichment model using both the document element and its image as input
class ExampleFormulaUEnrichmentModel(BaseTextImageEnrichmentModel):
images_scale = 2.6

def __init__(self, enabled: bool):
self.enabled = enabled
Expand All @@ -39,21 +29,8 @@ def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
and element.label == DocItemLabel.FORMULA
)

def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> FormulaEnrichmentElement:
if self.is_processable(doc=conv_res.document, element=element):
assert isinstance(element, TextItem)
element_prov = element.prov[0]
page_ix = element_prov.page_no - 1
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=element_prov.bbox
)

return FormulaEnrichmentElement(element=element, image=cropped_image)

def __call__(
self, doc: DoclingDocument, element_batch: Iterable[FormulaEnrichmentElement]
self, doc: DoclingDocument, element_batch: Iterable[TextImageEnrichmentElement]
) -> Iterable[NodeItem]:
if not self.enabled:
return
Expand Down

0 comments on commit 12b6417

Please sign in to comment.