From ba004548452d2b579c8b3d8b15bd7b90ab5f00b5 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Tue, 12 Nov 2024 12:10:01 -0500 Subject: [PATCH] Prefer tesserocr over easyocr, if available When setting up our ingestion pipeline, explicitly check if tesserocr is available and Docling can load it. If so, prefer that. Otherwise, attempt the same for EasyOCR. If neither can load, log an error and disable optical character recognition. Fixes #352 Signed-off-by: Ben Browning --- requirements.txt | 2 +- src/instructlab/sdg/utils/chunkers.py | 43 +++++++++++++++++++--- tests/test_chunkers.py | 52 +++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7183da22..d10012c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling>=2.4.2,<3.0.0 +docling[tesserocr]>=2.4.2,<3.0.0 GitPython>=3.1.42,<4.0.0 httpx>=0.25.0,<1.0.0 instructlab-schema>=0.4.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 8946b8b7..19fe3acb 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -12,12 +12,19 @@ from datasets import Dataset from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PdfPipelineOptions, + TesseractOcrOptions, +) from docling.document_converter import ( ConversionStatus, DocumentConverter, PdfFormatOption, ) +from docling.models.easyocr_model import EasyOcrModel +from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from langchain_text_splitters import Language, RecursiveCharacterTextSplitter from tabulate import tabulate @@ 
-35,6 +42,44 @@ def _num_chars_from_tokens(num_tokens) -> int:
     return int(num_tokens * 4)  # 1 token ~ 4 English character
 
 
+def resolve_ocr_options() -> "OcrOptions | None":
+    """
+    Choose the OCR backend used when converting PDF documents.
+
+    tesserocr is preferred; EasyOCR, kept on the CPU, is the fallback. A
+    backend is selected when its Docling model can be constructed without
+    raising ImportError.
+
+    Returns:
+        The options object of the first backend that loads, or None when
+        neither loads. In that case an error is logged and callers should
+        treat None as "OCR disabled".
+    """
+    easyocr_options = EasyOcrOptions()
+    # Keep easyocr models on the CPU instead of GPU
+    easyocr_options.use_gpu = False
+    # Ordered by preference. The model classes are read from module globals
+    # on each call, so patching them on this module takes effect.
+    candidates = (
+        ("tesserocr", TesseractOcrModel, TesseractOcrOptions()),
+        ("easyocr", EasyOcrModel, easyocr_options),
+    )
+    for backend, model_cls, ocr_options in candidates:
+        try:
+            # Only the side effect matters: construction raises ImportError
+            # when the backend cannot be loaded.
+            model_cls(True, ocr_options)
+        except ImportError as err:
+            logger.debug("OCR backend %s is unavailable: %s", backend, err)
+        else:
+            return ocr_options
+    # Neither backend loaded, so don't use any OCR
+    logger.error(
+        "Failed to load Tesseract and EasyOCR - disabling optical character recognition in PDF documents"
+    )
+    return None
+
+
 class FileTypes(Enum):
     MD = ".md"
     PDF = ".pdf"
@@ -212,9 +257,14 @@ def chunk_documents(self) -> List:
             return []
 
         model_artifacts_path = StandardPdfPipeline.download_models_hf()
-        pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
-        # Keep OCR models on the CPU instead of GPU
-        pipeline_options.ocr_options.use_gpu = False
+        pipeline_options = PdfPipelineOptions(
+            artifacts_path=model_artifacts_path,
+            do_ocr=False,
+        )
+        ocr_options = resolve_ocr_options()
+        if ocr_options is not None:
+            pipeline_options.do_ocr = True
+            pipeline_options.ocr_options = ocr_options
         converter = DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py
index 0d327982..ff93e031 100644
--- a/tests/test_chunkers.py
+++ b/tests/test_chunkers.py
@@ -2,9 +2,11 @@
 
 # Standard
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 import tempfile
 
 # Third Party
+from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions
 import pytest
 
 # First Party
@@ -13,6 +15,7 @@
     DocumentChunker,
     FileTypes,
     TextSplitChunker,
+    resolve_ocr_options,
 )
 
 # Local
@@ -86,3 +89,52 @@ def
test_chunker_factory_empty_filetype(documents_dir):
                 output_dir=temp_dir,
                 tokenizer_model_name="instructlab/merlinite-7b-lab",
             )
+
+
+def test_resolve_ocr_options_is_not_none():
+    """
+    Test that resolve_ocr_options does not return None, which means it
+    found a valid OCR library on the machine running this test
+    """
+    ocr_options = resolve_ocr_options()
+    assert ocr_options is not None
+
+
+@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
+def test_resolve_ocr_options_prefers_tesserocr(mock_tesseract):
+    """
+    Ensure resolve_ocr_options defaults to tesserocr if we're able
+    to load that library without error.
+    """
+    mock_tesseract.return_value = MagicMock()
+    ocr_options = resolve_ocr_options()
+    assert isinstance(ocr_options, TesseractOcrOptions)
+
+
+@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
+@patch("instructlab.sdg.utils.chunkers.EasyOcrModel")
+def test_resolve_ocr_options_falls_back_to_easyocr(mock_easyocr, mock_tesseract):
+    """
+    Ensure resolve_ocr_options falls back to easyocr if we cannot
+    load tesserocr. EasyOcrModel is mocked so only the ordering is tested.
+    """
+    mock_tesseract.side_effect = ImportError("mock import error")
+    ocr_options = resolve_ocr_options()
+    assert isinstance(ocr_options, EasyOcrOptions)
+
+
+@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
+@patch("instructlab.sdg.utils.chunkers.EasyOcrModel")
+@patch("logging.Logger.error")
+def test_resolve_ocr_options_none_found_logs_error(
+    mock_logger, mock_easyocr, mock_tesseract
+):
+    """
+    If we cannot load tesserocr or easyocr, ensure resolve_ocr_options logs an error
+    so that users are aware optical character recognition in PDFs will be disabled.
+    """
+    mock_tesseract.side_effect = ImportError("mock import error")
+    mock_easyocr.side_effect = ImportError("mock import error")
+    ocr_options = resolve_ocr_options()
+    assert ocr_options is None
+    mock_logger.assert_called()