From 468019012453fb1cf4e531d09e768d8240607299 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 21 Nov 2024 16:27:46 +0100
Subject: [PATCH] Only compute images if we need them

---
 prodigy_pdf/spans.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/prodigy_pdf/spans.py b/prodigy_pdf/spans.py
index 736c2e8..0ec34f9 100644
--- a/prodigy_pdf/spans.py
+++ b/prodigy_pdf/spans.py
@@ -6,8 +6,8 @@
 import pypdfium2 as pdfium
 import srsly
 from docling_core.types.doc.labels import DocItemLabel
-from prodigy.components.preprocess import resolve_labels
 from prodigy.components.db import connect
+from prodigy.components.preprocess import resolve_labels
 from prodigy.components.stream import Stream, _source_is_dataset, get_stream
 from prodigy.core import Arg, recipe
 from prodigy.errors import RecipeError
@@ -166,7 +166,7 @@ def get_full_stream(self) -> StreamType:
             blocks.append({"view_id": "image", "spans": []})
         for file_path in self.paths:
             doc = self.layout(file_path)
-            images = pdf_to_images(file_path)
+            images = pdf_to_images(file_path) if not self.hide_preview else None
             pages = []
             for i, (page_layout, page_spans) in enumerate(
                 doc._.get(self.layout.attrs.doc_pages)
@@ -174,7 +174,6 @@ def get_full_stream(self) -> StreamType:
                 headings, disabled = get_special_tokens(doc, disable=self.disable)
                 page = {
                     "text": SEPARATOR.join(span.text for span in page_spans),
-                    "image": images[i],
                     "tokens": get_layout_tokens(
                         doc[page_spans[0].start : page_spans[-1].end],
                         headings=headings,
@@ -185,6 +184,8 @@ def get_full_stream(self) -> StreamType:
                     "view_id": "blocks",
                     "config": {"blocks": blocks},
                 }
+                if not self.hide_preview and images:
+                    page["image"] = images[i]
                 pages.append(page)
                 if self.split_pages:
                     meta = {"title": file_path.stem, "page": page_layout.page_no}
@@ -203,21 +204,21 @@ def get_focus_stream(self) -> StreamType:
                 for span in page_spans:
                     if span.label_ not in self.focus:
                         continue
-                    span_layout = span._.get(self.layout.attrs.span_layout)
-                    image_spans = []
-                    if span_layout:
-                        image_spans.append(
-                            {
-                                "x": span_layout.x,
-                                "y": span_layout.y,
-                                "width": span_layout.width,
-                                "height": span_layout.height,
-                                "color": "magenta",
-                                "id": span.id,
-                            }
-                        )
                     blocks = [{"view_id": self.view_id}]
                     if not self.hide_preview:
+                        span_layout = span._.get(self.layout.attrs.span_layout)
+                        image_spans = []
+                        if span_layout:
+                            image_spans.append(
+                                {
+                                    "x": span_layout.x,
+                                    "y": span_layout.y,
+                                    "width": span_layout.width,
+                                    "height": span_layout.height,
+                                    "color": "magenta",
+                                    "id": span.id,
+                                }
+                            )
                         blocks.append({"view_id": "image", "spans": image_spans})
                     eg = {
                         "text": span.text,