Skip to content

Commit

Permalink
Only compute images if we need them
Browse files: browse the repository at this point in the history
  • Loading branch information
ines committed Nov 21, 2024
1 parent c118c69 commit 4680190
Showing 1 changed file with 17 additions and 16 deletions.
33 changes: 17 additions & 16 deletions prodigy_pdf/spans.py
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@
import pypdfium2 as pdfium
import srsly
from docling_core.types.doc.labels import DocItemLabel
from prodigy.components.preprocess import resolve_labels
from prodigy.components.db import connect
from prodigy.components.preprocess import resolve_labels
from prodigy.components.stream import Stream, _source_is_dataset, get_stream
from prodigy.core import Arg, recipe
from prodigy.errors import RecipeError
@@ -166,15 +166,14 @@ def get_full_stream(self) -> StreamType:
blocks.append({"view_id": "image", "spans": []})
for file_path in self.paths:
doc = self.layout(file_path)
images = pdf_to_images(file_path)
images = pdf_to_images(file_path) if not self.hide_preview else None
pages = []
for i, (page_layout, page_spans) in enumerate(
doc._.get(self.layout.attrs.doc_pages)
):
headings, disabled = get_special_tokens(doc, disable=self.disable)
page = {
"text": SEPARATOR.join(span.text for span in page_spans),
"image": images[i],
"tokens": get_layout_tokens(
doc[page_spans[0].start : page_spans[-1].end],
headings=headings,
@@ -185,6 +184,8 @@ def get_full_stream(self) -> StreamType:
"view_id": "blocks",
"config": {"blocks": blocks},
}
if not self.hide_preview and images:
page["image"] = images[i]
pages.append(page)
if self.split_pages:
meta = {"title": file_path.stem, "page": page_layout.page_no}
@@ -203,21 +204,21 @@ def get_focus_stream(self) -> StreamType:
for span in page_spans:
if span.label_ not in self.focus:
continue
span_layout = span._.get(self.layout.attrs.span_layout)
image_spans = []
if span_layout:
image_spans.append(
{
"x": span_layout.x,
"y": span_layout.y,
"width": span_layout.width,
"height": span_layout.height,
"color": "magenta",
"id": span.id,
}
)
blocks = [{"view_id": self.view_id}]
if not self.hide_preview:
span_layout = span._.get(self.layout.attrs.span_layout)
image_spans = []
if span_layout:
image_spans.append(
{
"x": span_layout.x,
"y": span_layout.y,
"width": span_layout.width,
"height": span_layout.height,
"color": "magenta",
"id": span.id,
}
)
blocks.append({"view_id": "image", "spans": image_spans})
eg = {
"text": span.text,
Expand Down

0 comments on commit 4680190

Please sign in to comment.