diff --git a/requirements.txt b/requirements.txt index 7183da22..d10012c5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling>=2.4.2,<3.0.0 +docling[tesserocr]>=2.4.2,<3.0.0 GitPython>=3.1.42,<4.0.0 httpx>=0.25.0,<1.0.0 instructlab-schema>=0.4.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 8946b8b7..19fe3acb 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -12,12 +12,19 @@ from datasets import Dataset from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PdfPipelineOptions, + TesseractOcrOptions, +) from docling.document_converter import ( ConversionStatus, DocumentConverter, PdfFormatOption, ) +from docling.models.easyocr_model import EasyOcrModel +from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline from langchain_text_splitters import Language, RecursiveCharacterTextSplitter from tabulate import tabulate @@ -35,6 +42,29 @@ def _num_chars_from_tokens(num_tokens) -> int: return int(num_tokens * 4) # 1 token ~ 4 English character +def resolve_ocr_options() -> OcrOptions: + # First, attempt to use tesserocr + try: + ocr_options = TesseractOcrOptions() + _ = TesseractOcrModel(True, ocr_options) + return ocr_options + except ImportError: + # No tesserocr, so try something else + pass + try: + ocr_options = EasyOcrOptions() + # Keep easyocr models on the CPU instead of GPU + ocr_options.use_gpu = False + _ = EasyOcrModel(True, ocr_options) + return ocr_options + except ImportError: + # no easyocr either, so don't use any OCR + logger.error( + "Failed to load Tesseract and EasyOCR - disabling optical character recognition in PDF documents" + ) + return None + + class FileTypes(Enum): MD = ".md" PDF = ".pdf" @@ -212,9 +242,14 @@ def chunk_documents(self) -> List: return [] model_artifacts_path = StandardPdfPipeline.download_models_hf() - pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path) - # Keep OCR models on the CPU instead of GPU - pipeline_options.ocr_options.use_gpu = False + pipeline_options = PdfPipelineOptions( + artifacts_path=model_artifacts_path, + do_ocr=False, + ) + ocr_options = resolve_ocr_options() + if ocr_options is not None: + pipeline_options.do_ocr = True + pipeline_options.ocr_options = ocr_options converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 0d327982..ff93e031 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -2,9 +2,11 @@ # Standard from pathlib import Path +from unittest.mock import MagicMock, patch import tempfile # Third Party +from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions import pytest # First Party @@ -13,6 +15,7 @@ DocumentChunker, FileTypes, TextSplitChunker, + resolve_ocr_options, ) # Local @@ -86,3 +89,52 @@ def test_chunker_factory_empty_filetype(documents_dir): output_dir=temp_dir, tokenizer_model_name="instructlab/merlinite-7b-lab", ) + + +def test_resolve_ocr_options_is_not_none(): + """ + Test that resolve_ocr_options does not return None, which means it + found a valid OCR library on the machine running this test + """ + ocr_options = resolve_ocr_options() + assert ocr_options is not None + + +@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel") +def test_resolve_ocr_options_prefers_tessserocr(mock_tesseract): + """ + Ensure resolve_ocr_options defaults to tesserocr if we're able + to load that library without error. + """ + mock_tesseract.return_value = MagicMock() + ocr_options = resolve_ocr_options() + assert isinstance(ocr_options, TesseractOcrOptions) + + +@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel") +def test_resolve_ocr_options_falls_back_to_easyocr(mock_tesseract): + """ + Ensure resolve_ocr_options falls back to easyocr if we cannot + load tesserocr. + """ + mock_tesseract.side_effect = ImportError("mock import error") + ocr_options = resolve_ocr_options() + assert isinstance(ocr_options, EasyOcrOptions) + + +@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel") +@patch("instructlab.sdg.utils.chunkers.EasyOcrModel") +@patch("logging.Logger.error") +def test_resolve_ocr_options_none_found_logs_error( + mock_logger, mock_easyocr, mock_tesseract +): + """ + If we cannot load tesserocr or easyocr, ensure + resolve_ocr_options logs an error so that users are aware optical + character recognition in PDFs will be disabled. + """ + mock_tesseract.side_effect = ImportError("mock import error") + mock_easyocr.side_effect = ImportError("mock import error") + ocr_options = resolve_ocr_options() + assert ocr_options is None + mock_logger.assert_called()