Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
khaledsulayman committed Sep 27, 2024
1 parent 63df047 commit f86f99a
Showing 1 changed file with 39 additions and 5 deletions.
44 changes: 39 additions & 5 deletions src/instructlab/sdg/utils/docprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Standard
from pathlib import Path
from typing import List
from typing import List, Iterable
import json
import logging
import re
Expand All @@ -13,8 +13,9 @@
import re

Check warning on line 13 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

W0404: Reimport 're' (imported line 8) (reimported)

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import PipelineOptions

Check warning on line 16 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

C0303: Trailing whitespace (trailing-whitespace)

Check failure on line 16 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from docling.datamodel.document import ConvertedDocument

Check failure on line 17 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.datamodel.document' (import-error)
from docling.document_converter import DocumentConverter, ConversionStatus

Check failure on line 18 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.document_converter' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Third Party
Expand Down Expand Up @@ -574,13 +575,16 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name
""")

converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = [converter.convert_single(d) for d in pdf_docs]
parsed_pdfs = converter.convert(d)

Check failure on line 578 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0602: Undefined variable 'd' (undefined-variable)
print(f"THIS IS KHALED: {parsed_pdfs=}")
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = DOC_FILEPATH / "docling-jsons"
docling_jsons_path.mkdir(parents=True, exist_ok=True)

export_documents(parsed_pdfs, docling_jsons_path)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]


# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"
Expand All @@ -600,3 +604,33 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name
print(f"THIS IS KHALED: {type(chunk_pdfs)=}")

return chunked_pdfs


def export_documents(
    converted_docs: Iterable[ConvertedDocument],
    output_dir: Path,
):
    """Write each successfully converted document into *output_dir*.

    For every document whose conversion succeeded, two sibling files are
    written, named after the input file's stem: a Deep Search JSON export
    (``<stem>.json``) and a Markdown export (``<stem>.md``). Failed
    conversions are logged and counted but never raise — this is a
    best-effort batch export.

    Args:
        converted_docs: Docling conversion results to export.
        output_dir: Target directory; created (with parents) if missing.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for doc in converted_docs:
        if doc.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = doc.input.file.stem

            # Export Deep Search document JSON format.
            # Encoding pinned to UTF-8 so output does not depend on the
            # platform's default; json.dump streams straight to the file.
            with (output_dir / f"{doc_filename}.json").open(
                "w", encoding="utf-8"
            ) as fp:
                json.dump(doc.render_as_dict(), fp)

            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open(
                "w", encoding="utf-8"
            ) as fp:
                fp.write(doc.render_as_markdown())
        else:
            # Lazy %-style args: the message is only formatted if INFO
            # logging is actually enabled.
            logger.info("Document %s failed to convert.", doc.input.file)
            failure_count += 1

    logger.info(
        "Processed %d docs, of which %d failed",
        success_count + failure_count,
        failure_count,
    )

0 comments on commit f86f99a

Please sign in to comment.