Skip to content

Commit

Permalink
wip
Browse files: browse the repository at this point in the history
  • Loading branch information
khaledsulayman committed Sep 27, 2024
1 parent 63df047 commit d1e076e
Showing 1 changed file with 39 additions and 5 deletions.
44 changes: 39 additions & 5 deletions src/instructlab/sdg/utils/docprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Standard
from pathlib import Path
from typing import List
from typing import List, Iterable
import json
import logging
import re
Expand All @@ -13,8 +13,9 @@
import re

Check warning on line 13 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

W0404: Reimport 're' (imported line 8) (reimported)

# Third Party
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import PipelineOptions

Check warning on line 16 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

C0303: Trailing whitespace (trailing-whitespace)

Check failure on line 16 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.datamodel.base_models' (import-error)
from docling.datamodel.document import ConvertedDocument

Check failure on line 17 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.datamodel.document' (import-error)
from docling.document_converter import DocumentConverter, ConversionStatus

Check failure on line 18 in src/instructlab/sdg/utils/docprocessor.py

View workflow job for this annotation

GitHub Actions / pylint

E0401: Unable to import 'docling.document_converter' (import-error)
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Third Party
Expand Down Expand Up @@ -574,13 +575,16 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name
""")

converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = [converter.convert_single(d) for d in pdf_docs]
parsed_pdfs = converter.convert(filepaths)
print(f"THIS IS KHALED: {parsed_pdfs=}")
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = DOC_FILEPATH / "docling-jsons"
docling_jsons_path.mkdir(parents=True, exist_ok=True)

export_documents(parsed_pdfs, docling_jsons_path)
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]


# TODO name files better
for i, pd in enumerate(parsed_dicts):
fp = docling_jsons_path / f"docling_{i}.json"
Expand All @@ -600,3 +604,33 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name
print(f"THIS IS KHALED: {type(chunk_pdfs)=}")

return chunked_pdfs


def export_documents(
    converted_docs: Iterable[ConvertedDocument],
    output_dir: Path,
) -> None:
    """Write every successfully converted document into ``output_dir``.

    For each document whose ``status`` equals ``ConversionStatus.SUCCESS``,
    two files named after the stem of the document's input file are written:

    * ``<stem>.json`` -- the result of ``render_as_dict()`` serialized as JSON
    * ``<stem>.md`` -- the result of ``render_as_markdown()``

    Documents with any other status are logged and skipped. A summary with
    the total and failed counts is logged at the end.

    Args:
        converted_docs: Conversion results to export; iterated once.
        output_dir: Destination directory. It is created (including parents)
            when it does not exist yet.

    Note:
        Files are opened in ``"w"`` mode, so two inputs sharing the same stem
        overwrite each other's output.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    for doc in converted_docs:
        # Guard clause: anything that is not a success is only counted/logged.
        if doc.status != ConversionStatus.SUCCESS:
            logger.info("Document %s failed to convert.", doc.input.file)
            failure_count += 1
            continue

        success_count += 1
        doc_filename = doc.input.file.stem

        # Export Deep Search document JSON format.
        # The encoding is pinned so the output never depends on the locale
        # default of the machine running the export.
        json_path = output_dir / f"{doc_filename}.json"
        with json_path.open("w", encoding="utf-8") as fp:
            fp.write(json.dumps(doc.render_as_dict()))

        # Export Markdown format.
        # Rendered text may contain non-ASCII characters; without an explicit
        # UTF-8 encoding the write can fail on non-UTF-8 locales.
        markdown_path = output_dir / f"{doc_filename}.md"
        with markdown_path.open("w", encoding="utf-8") as fp:
            fp.write(doc.render_as_markdown())

    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info(
        "Processed %d docs, of which %d failed",
        success_count + failure_count,
        failure_count,
    )

0 comments on commit d1e076e

Please sign in to comment.