From 4d3a4646b180784d19b19a032a3bda53939b2ca3 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 13 Nov 2024 14:41:14 +0100
Subject: [PATCH 1/2] Auto-format and support pages

---
 prodigy_pdf/__init__.py | 145 ++++++++++++++++++++++++++--------------
 setup.cfg               |   2 +-
 2 files changed, 95 insertions(+), 52 deletions(-)

diff --git a/prodigy_pdf/__init__.py b/prodigy_pdf/__init__.py
index 406cc52..d5618bb 100644
--- a/prodigy_pdf/__init__.py
+++ b/prodigy_pdf/__init__.py
@@ -1,13 +1,12 @@
-from typing import List, Dict
 import base64
 from io import BytesIO
 from pathlib import Path
-from PIL import Image
+from typing import Dict, List
 
-import pytesseract
 import pypdfium2 as pdfium
-
-from prodigy import recipe, set_hashes, ControllerComponentsDict
+import pytesseract
+from PIL import Image
+from prodigy import ControllerComponentsDict, recipe, set_hashes
 from prodigy.components.stream import Stream, get_stream
 from prodigy.util import msg, split_string
 
@@ -21,20 +20,35 @@ def page_to_image(page: pdfium.PdfPage) -> str:
     return f"data:image/png;base64,{img_str.decode('utf-8')}"
 
 
-def generate_pdf_pages(pdf_paths: List[Path]):
+def generate_pdf_pages(pdf_paths: List[Path], split_pages: bool = False):
     """Generate dictionaries that contain an image for each page in the PDF"""
     for pdf_path in pdf_paths:
         pdf = pdfium.PdfDocument(pdf_path)
         n_pages = len(pdf)
+        pages = []
         for page_number in range(n_pages):
-            page = pdf.get_page(page_number)
-            yield set_hashes({
-                "image": page_to_image(page), 
+            pdf_page = pdf.get_page(page_number)
+            page = {
+                "image": page_to_image(pdf_page),
+                "path": str(pdf_path),
                 "meta": {
+                    "title": pdf_path.name,
                     "page": page_number,
-                    "path": str(pdf_path)
+                },
+            }
+            if split_pages:
+                yield set_hashes(page)
+            else:
+                page["view_id"] = "image_manual"
+                pages.append(page)
+        if not split_pages:
+            yield set_hashes(
+                {
+                    "pages": pages,
+                    "meta": {"title": pdf_path.name},
+                    "config": {"view_id": "pages"},
                 }
-            })
+            )
         pdf.close()
 
 
@@ -44,14 +58,16 @@ def generate_pdf_pages(pdf_paths: List[Path]):
     dataset=("Dataset to save answers to", "positional", None, str),
     pdf_folder=("Folder with PDFs to annotate", "positional", None, Path),
     labels=("Comma seperated labels to use", "option", "l", str),
-    remove_base64=("Remove base64-encoded image data", "flag", "R", bool)
+    remove_base64=("Remove base64-encoded image data", "flag", "R", bool),
+    split_pages=("View pages as separate tasks", "flag", "S", bool),
     # fmt: on
 )
 def pdf_image_manual(
     dataset: str,
     pdf_folder: Path,
-    labels:str,
-    remove_base64:bool=False
+    labels: str,
+    remove_base64: bool = False,
+    split_pages: bool = False,
 ) -> ControllerComponentsDict:
     """Turns pdfs into images in order to annotate them."""
     # Read in stream as a list for progress bar.
@@ -60,39 +76,54 @@ def pdf_image_manual(
     pdf_paths = list(Path(pdf_folder).glob("*.pdf"))
     if len(pdf_paths) == 0:
         msg.fail("Did not find any .pdf files in folder.")
-    source = Stream.from_iterable(pdf_paths).apply(generate_pdf_pages)
+    source = generate_pdf_pages(pdf_paths, split_pages=split_pages)
+    stream = Stream.from_iterable(source)
 
     def before_db(examples):
         # Remove all data URIs before storing example in the database
         for eg in examples:
-            if eg["image"].startswith("data:"):
+            if eg.get("image", "").startswith("data:"):
                 del eg["image"]
+            for page in eg.get("pages", []):
+                if page["image"].startswith("data:"):
+                    del page["image"]
         return examples
 
-    color = ["#00ffff", "#ff00ff", "#00ff7f", "#ff6347", "#00bfff",
-             "#ffa500", "#ff69b4", "#7fffd4", "#ffd700", "#ffdab9", "#adff2f", 
-             "#d2b48c", "#dcdcdc", "#ffff00", ]
+    color = [
+        "#00ffff",
+        "#ff00ff",
+        "#00ff7f",
+        "#ff6347",
+        "#00bfff",
+        "#ffa500",
+        "#ff69b4",
+        "#7fffd4",
+        "#ffd700",
+        "#ffdab9",
+        "#adff2f",
+        "#d2b48c",
+        "#dcdcdc",
+        "#ffff00",
+    ]
 
     return {
         "dataset": dataset,
-        "stream": source,
+        "stream": stream,
         "before_db": before_db if remove_base64 else None,
         "view_id": "image_manual",
         "config": {
             "labels": labels.split(","),
             "image_manual_stroke_width": 2,
             "custom_theme": {
-                "labels": {
-                    lab: color[i] for i, lab in enumerate(labels.split(","))
-                }
-            }
+                "labels": {lab: color[i % len(color)] for i, lab in enumerate(labels.split(","))}
+            },
         },
     }
 
 
 def page_to_cropped_image(pil_page: Image, span: Dict, scale: int):
-    left, upper = span['x'], span['y']
-    right, lower = left + span['width'], upper + span['height']
+    left, upper = span["x"], span["y"]
+    right, lower = left + span["width"], upper + span["height"]
     scaled = (left * scale, upper * scale, right * scale, lower * scale)
     cropped = pil_page.crop(scaled)
     with BytesIO() as buffered:
@@ -101,11 +132,11 @@ def page_to_cropped_image(pil_page: Image, span: Dict, scale: int):
     return cropped, f"data:image/png;base64,{img_str.decode('utf-8')}"
 
 
-def fold_ocr_dashes(ocr_input:str) -> str:
+def fold_ocr_dashes(ocr_input: str) -> str:
     """
     OCR might literally add dashes at the end of the line to indicate
     continuation of the word. This can be fine in some cases, but this
-    function can fold it all into a single string. 
+    function can fold it all into a single string.
     """
     new = ""
     for line in ocr_input.split("\n"):
@@ -113,18 +144,26 @@ def fold_ocr_dashes(ocr_input:str) -> str:
         if line.rfind("-") == -1:
             newline = line + " "
         else:
-            newline = line[:line.rfind("-")]
+            newline = line[: line.rfind("-")]
         new += newline
     return new.strip()
 
 
-def _validate_ocr_example(ex: Dict):
-    if 'meta' not in ex:
-        raise ValueError(f"It seems the `meta` key is missing from an example: {ex}. Did you annotate this data with `pdf.image.manual`?")
-    if 'path' not in ex['meta']:
-        raise ValueError(f"It seems the `path` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?")
-    if 'page' not in ex['meta']:
-        raise ValueError(f"It seems the `page` key is missing from an example metadata: {ex}. Did you annotate this data with `pdf.image.manual`?")
+def _validate_ocr_example(stream):
+    for eg in stream:
+        if "meta" not in eg:
+            raise ValueError(
+                "It seems the `meta` key is missing from an example. Did you annotate this data with `pdf.image.manual`?"
+            )
+        if "path" not in eg:
+            raise ValueError(
+                "It seems the `path` key is missing from an example. Did you annotate this data with `pdf.image.manual`?"
+            )
+        if "page" not in eg["meta"]:
+            raise ValueError(
+                "It seems the `page` key is missing from an example metadata. Did you annotate this data with `pdf.image.manual`?"
+            )
+        yield eg
 
 
 @recipe(
@@ -136,7 +175,7 @@ def _validate_ocr_example(ex: Dict):
     scale=("Zoom scale. Increase above 3 to upscale the image for OCR.", "option", "s", int),
     remove_base64=("Remove base64-encoded image data", "flag", "R", bool),
     fold_dashes=("Removes dashes at the end of a textline and folds them with the next term.", "flag", "f", bool),
-    autofocus=("Autofocus on the transcript UI", "flag", "af", bool)
+    autofocus=("Autofocus on the transcript UI", "flag", "af", bool),
     # fmt: on
 )
 def pdf_ocr_correct(
@@ -144,41 +183,45 @@ def pdf_ocr_correct(
     source: str,
     labels: str,
     scale: int = 3,
-    remove_base64:bool=False,
-    fold_dashes:bool = False,
-    autofocus: bool = False
+    remove_base64: bool = False,
+    fold_dashes: bool = False,
+    autofocus: bool = False,
 ) -> ControllerComponentsDict:
     """Applies OCR to annotated segments and gives a textbox for corrections."""
     stream = get_stream(source)
 
     def new_stream(stream):
         for ex in stream:
-            useful_spans = [span for span in ex.get('spans', []) if span['label'] in labels]
+            useful_spans = [
+                span for span in ex.get("spans", []) if span["label"] in labels
+            ]
             if useful_spans:
-                _validate_ocr_example(ex)
-                pdf = pdfium.PdfDocument(ex['meta']['path'])
-                page = pdf.get_page(ex['meta']['page'])
+                # Validation is handled by the _validate_ocr_example stream wrapper.
+                pdf = pdfium.PdfDocument(ex["path"])
+                page = pdf.get_page(ex["meta"]["page"])
                 pil_page = page.render(scale=scale).to_pil()
             for annot in useful_spans:
-                cropped, img_str = page_to_cropped_image(pil_page, span=annot, scale=scale)
+                cropped, img_str = page_to_cropped_image(
+                    pil_page, span=annot, scale=scale
+                )
                 annot["image"] = img_str
                 annot["text"] = pytesseract.image_to_string(cropped)
                 if fold_dashes:
                     annot["text"] = fold_ocr_dashes(annot["text"])
                 annot["transcription"] = annot["text"]
-                
+
                 # passing through metadata, in order to connect OCR text & bounding boxes to the pdf images via path
                 # example usecase, finetuning a layoutLM on custom data
                 # see details here https://support.prodi.gy/t/pdf-ocr-image-annotation-metadata-feature-suggestion/7211
                 annot["meta"] = ex["meta"]
-                
+
                 text_input_fields = {
                     "field_rows": 12,
                     "field_label": "Transcript",
                     "field_id": "transcription",
                     "field_autofocus": autofocus,
                 }
-                del annot['id']
+                del annot["id"]
                 yield set_hashes({**annot, **text_input_fields})
 
     def before_db(examples):
@@ -187,15 +230,15 @@ def before_db(examples):
             if eg["image"].startswith("data:"):
                 del eg["image"]
         return examples
-    
+
     blocks = [{"view_id": "classification"}, {"view_id": "text_input"}]
+    stream.apply(_validate_ocr_example)
+    stream.apply(new_stream)
 
     return {
         "dataset": dataset,
-        "stream": new_stream(stream),
+        "stream": stream,
         "before_db": before_db if remove_base64 else None,
         "view_id": "blocks",
-        "config": {
-            "blocks": blocks
-        },
+        "config": {"blocks": blocks},
     }
diff --git a/setup.cfg b/setup.cfg
index 7983b00..ae12b2f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-version = 0.2.2
+version = 0.3.0
 description = Recipes for PDF annotation
 url = https://github.com/explosion/prodigy-pdf
 author = Explosion

From b9f8cd98505e1750d9a18debcb9c1495495c3280 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 18 Nov 2024 13:22:32 +0100
Subject: [PATCH 2/2] Fix test

---
 tests/test_basics.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/test_basics.py b/tests/test_basics.py
index bb94ce9..c225b6e 100644
--- a/tests/test_basics.py
+++ b/tests/test_basics.py
@@ -1,14 +1,15 @@
 from pathlib import Path
-from prodigy_pdf import generate_pdf_pages, fold_ocr_dashes, pdf_image_manual
+
+from prodigy_pdf import fold_ocr_dashes, generate_pdf_pages, pdf_image_manual
 
 
 def test_generate_pdf_pages():
     # We know this one PDF has six pages.
     paths = Path("tests/pdfs").glob("*.pdf")
-    pages = list(generate_pdf_pages(paths))
+    pages = list(generate_pdf_pages(paths, split_pages=True))
     assert len(pages) == 6
     for page in pages:
-        assert "data" in page['image']
+        assert "data" in page["image"]
 
 
 def test_fold_dashes():