Skip to content

Commit

Permalink
Merge pull request #347 from bbrowning/docling_v2_bump
Browse files (browse the repository at this point in the history)
Move to Docling v2 APIs
  • Loading branch information
mergify[bot] authored Nov 7, 2024
2 parents 7af918a + 54ae021 commit baf4c30
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 16 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
click>=8.1.7,<9.0.0
datasets>=2.18.0,<3.0.0
-docling>=1.15.0,<2.0.0
+docling>=2.3.0,<3.0.0
GitPython>=3.1.42,<4.0.0
httpx>=0.25.0,<1.0.0
instructlab-schema>=0.4.0
Expand Down
29 changes: 20 additions & 9 deletions src/instructlab/sdg/utils/chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,15 @@

# Third Party
from datasets import Dataset
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import ConversionStatus, DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import (
ConversionStatus,
DocumentConverter,
PdfFormatOption,
)
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
from tabulate import tabulate
from transformers import AutoTokenizer
Expand Down Expand Up @@ -210,10 +217,14 @@ def chunk_documents(self) -> List:
if self.document_paths == []:
return []

model_artifacts_path = DocumentConverter.download_models_hf()
converter = DocumentConverter(artifacts_path=model_artifacts_path)
inputs = DocumentConversionInput.from_paths(self.filepaths)
parsed_documents = converter.convert(inputs)
model_artifacts_path = StandardPdfPipeline.download_models_hf()
pipeline_options = PdfPipelineOptions(artifacts_path=model_artifacts_path)
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
parsed_documents = converter.convert_all(self.filepaths)

docling_artifacts_path = self.export_documents(parsed_documents)

Expand Down Expand Up @@ -539,7 +550,7 @@ def build_chunks_from_docling_json(
document_chunks.append("\n\n".join(current_buffer))
return document_chunks

def export_documents(self, converted_docs: Iterable[ConvertedDocument]):
def export_documents(self, converted_docs: Iterable[ConversionResult]):
"""Write converted documents to json files
Check for successful conversions and write those to the docling artifacts directory.
Expand All @@ -559,11 +570,11 @@ def export_documents(self, converted_docs: Iterable[ConvertedDocument]):

# Export Deep Search document JSON format:
with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(doc.render_as_dict()))
fp.write(json.dumps(doc.legacy_document.export_to_dict()))

# Export Markdown format:
with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown())
fp.write(doc.legacy_document.export_to_markdown())
else:
logger.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1
Expand Down
8 changes: 5 additions & 3 deletions src/instructlab/sdg/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

# Third Party
from datasets import Dataset
from docling_parse.docling_parse import pdf_parser # pylint: disable=no-name-in-module

# pylint: disable=no-name-in-module
from docling_parse.docling_parse import pdf_parser_v1
from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
from instructlab.schema.taxonomy import (
TaxonomyMessageFormat,
Expand All @@ -25,7 +27,7 @@
from .chunkers import DocumentChunker

# Initialize the pdf parser
PDFParser = pdf_parser()
PDFParser = pdf_parser_v1()

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -165,7 +167,7 @@ def _get_documents(
)

elif file_path.lower().endswith(".pdf"):
# Process PDF files using docling_parse's pdf_parser
# Process PDF files using docling_parse's pdf_parser_v1
doc_key = f"key_{os.path.basename(file_path)}" # Unique document key
logger.info(f"Loading PDF document from {file_path}")

Expand Down
3 changes: 0 additions & 3 deletions tests/test_chunkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@
import tempfile

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import ConversionStatus, DocumentConverter
import pytest

# First Party
Expand Down

0 comments on commit baf4c30

Please sign in to comment.