From f86f99a15a7cdb3c2198fa8e0b8ee8f823abb072 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Fri, 27 Sep 2024 17:24:42 -0400 Subject: [PATCH] wip --- src/instructlab/sdg/utils/docprocessor.py | 44 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/src/instructlab/sdg/utils/docprocessor.py b/src/instructlab/sdg/utils/docprocessor.py index 7f9aca38..d61294d4 100644 --- a/src/instructlab/sdg/utils/docprocessor.py +++ b/src/instructlab/sdg/utils/docprocessor.py @@ -2,7 +2,7 @@ # Standard from pathlib import Path -from typing import List +from typing import List, Iterable import json import logging import re @@ -13,8 +13,9 @@ import re # Third Party -from docling.datamodel.base_models import PipelineOptions -from docling.document_converter import DocumentConverter +from docling.datamodel.base_models import PipelineOptions +from docling.datamodel.document import ConvertedDocument +from docling.document_converter import DocumentConverter, ConversionStatus from langchain_text_splitters import Language, RecursiveCharacterTextSplitter # Third Party @@ -574,13 +575,16 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name """) converter = DocumentConverter(pipeline_options=PipelineOptions()) - parsed_pdfs = [converter.convert_single(d) for d in pdf_docs] + parsed_pdfs = converter.convert(pdf_docs) print(f"THIS IS KHALED: {parsed_pdfs=}") - parsed_dicts = [p.render_as_dict() for p in parsed_pdfs] docling_jsons_path = DOC_FILEPATH / "docling-jsons" docling_jsons_path.mkdir(parents=True, exist_ok=True) + export_documents(parsed_pdfs, docling_jsons_path) + parsed_dicts = [p.render_as_dict() for p in parsed_pdfs] + + # TODO name files better for i, pd in enumerate(parsed_dicts): fp = docling_jsons_path / f"docling_{i}.json" @@ -600,3 +604,33 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name print(f"THIS IS KHALED: {type(chunk_pdfs)=}") return chunked_pdfs + + +def export_documents( + 
converted_docs: Iterable[ConvertedDocument], + output_dir: Path, +): + output_dir.mkdir(parents=True, exist_ok=True) + + success_count = 0 + failure_count = 0 + + for doc in converted_docs: + if doc.status == ConversionStatus.SUCCESS: + success_count += 1 + doc_filename = doc.input.file.stem + + # Export Deep Search document JSON format: + with (output_dir / f"{doc_filename}.json").open("w") as fp: + fp.write(json.dumps(doc.render_as_dict())) + + # Export Markdown format: + with (output_dir / f"{doc_filename}.md").open("w") as fp: + fp.write(doc.render_as_markdown()) + else: + logger.info(f"Document {doc.input.file} failed to convert.") + failure_count += 1 + + logger.info( + f"Processed {success_count + failure_count} docs, of which {failure_count} failed" + )