Skip to content

Commit

Permalink
Prefer tesserocr over easyocr, if available
Browse files Browse the repository at this point in the history
When setting up our ingestion pipeline, explicitly check if tesserocr
is available and Docling can load it. If so, prefer that. Otherwise,
attempt the same for EasyOCR. If neither can load, log an error and
disable optical character recognition.

Fixes #352

Signed-off-by: Ben Browning <[email protected]>
  • Loading branch information
bbrowning committed Nov 12, 2024
1 parent b6f07a8 commit ba00454
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 5 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
docling>=2.4.2,<3.0.0
docling[tesserocr]>=2.4.2,<3.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
Expand Down
43 changes: 39 additions & 4 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,19 @@
from datasets import Dataset
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrOptions,
PdfPipelineOptions,
TesseractOcrOptions,
)
from docling.document_converter import (
ConversionStatus,
DocumentConverter,
PdfFormatOption,
)
from docling.models.easyocr_model import EasyOcrModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from tabulate import tabulate
Expand All @@ -35,6 +42,29 @@ def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character


def resolve_ocr_options() -> OcrOptions:
    """
    Pick the OCR options for the first OCR engine that can be loaded.

    tesserocr is tried first, then EasyOCR. An engine counts as available
    when constructing its docling model class does not raise ImportError.

    Returns:
        A TesseractOcrOptions instance if TesseractOcrModel loads, otherwise
        an EasyOcrOptions instance (with use_gpu set to False) if
        EasyOcrModel loads, otherwise None after logging an error.

    NOTE(review): the return annotation does not mention None even though
    None is returned when neither engine loads - callers must check for it.
    """
    # Preferred engine: tesserocr. The model is built only as an
    # availability probe and then thrown away.
    # NOTE(review): the first positional argument (True) is presumably the
    # model's "enabled" flag - confirm against the docling API.
    try:
        tesseract_options = TesseractOcrOptions()
        TesseractOcrModel(True, tesseract_options)
    except ImportError:
        # No tesserocr, so fall through to the next candidate
        pass
    else:
        return tesseract_options

    # Fallback engine: EasyOCR
    try:
        easyocr_options = EasyOcrOptions()
        # Keep easyocr models on the CPU instead of GPU
        easyocr_options.use_gpu = False
        EasyOcrModel(True, easyocr_options)
    except ImportError:
        # Neither engine could be loaded, so OCR gets disabled
        logger.error(
            "Failed to load Tesseract and EasyOCR - disabling optical character recognition in PDF documents"
        )
        return None
    else:
        return easyocr_options


class FileTypes(Enum):
MD = ".md"
PDF = ".pdf"
Expand Down Expand Up @@ -212,9 +242,14 @@ def chunk_documents(self) -> List:
return []

model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
# Keep OCR models on the CPU instead of GPU
pipeline_options.ocr_options.use_gpu = False
pipeline_options = PdfPipelineOptions(
artifacts_path=model_artifacts_path,
do_ocr=False,
)
ocr_options = resolve_ocr_options()
if ocr_options is not None:
pipeline_options.do_ocr = True
pipeline_options.ocr_options = ocr_options
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
Expand Down
52 changes: 52 additions & 0 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@

# Standard
from pathlib import Path
from unittest.mock import MagicMock, patch
import tempfile

# Third Party
from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions
import pytest

# First Party
Expand All @@ -13,6 +15,7 @@
DocumentChunker,
FileTypes,
TextSplitChunker,
resolve_ocr_options,
)

# Local
Expand Down Expand Up @@ -86,3 +89,52 @@ def test_chunker_factory_empty_filetype(documents_dir):
output_dir=temp_dir,
tokenizer_model_name="instructlab/merlinite-7b-lab",
)


def test_resolve_ocr_options_is_not_none():
    """
    resolve_ocr_options should hand back real options (not None) here.

    Nothing is mocked, so this passes only when at least one OCR library
    can actually be loaded on the machine running the test.
    """
    assert resolve_ocr_options() is not None


@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
def test_resolve_ocr_options_prefers_tessserocr(mock_tesseract):
    """
    When the tesserocr model can be constructed without error,
    resolve_ocr_options must return Tesseract options.
    """
    # Constructing the patched model succeeds and yields a dummy object
    mock_tesseract.return_value = MagicMock()
    resolved = resolve_ocr_options()
    assert isinstance(resolved, TesseractOcrOptions)


@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
def test_resolve_ocr_options_falls_back_to_easyocr(mock_tesseract):
    """
    When loading tesserocr raises ImportError, resolve_ocr_options must
    return EasyOCR options instead.

    Only the Tesseract model is patched; the EasyOCR model is the real one,
    so this test needs EasyOCR to be loadable in the test environment.
    """
    # Simulate tesserocr being unavailable
    mock_tesseract.side_effect = ImportError("mock import error")
    resolved = resolve_ocr_options()
    assert isinstance(resolved, EasyOcrOptions)


@patch("instructlab.sdg.utils.chunkers.TesseractOcrModel")
@patch("instructlab.sdg.utils.chunkers.EasyOcrModel")
@patch("logging.Logger.error")
def test_resolve_ocr_options_none_found_logs_error(
    mock_logger, mock_easyocr, mock_tesseract
):
    """
    With both tesserocr and easyocr failing to load, resolve_ocr_options
    must return None and log an error, so users know that optical
    character recognition in PDFs has been disabled.
    """
    # Mock arguments arrive bottom-up relative to the decorator order.
    # Make both OCR model constructors fail as if the libraries were missing.
    for unavailable_model in (mock_tesseract, mock_easyocr):
        unavailable_model.side_effect = ImportError("mock import error")
    resolved = resolve_ocr_options()
    assert resolved is None
    mock_logger.assert_called()

0 comments on commit ba00454

Please sign in to comment.