Skip to content

Commit

Permalink
wip
Browse files (browse the repository at this point in the history)
  • Loading branch information
khaledsulayman committed Sep 27, 2024
1 parent c1fd1d4 commit 63df047
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/instructlab/sdg/utils/docprocessor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -361,6 +361,7 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
Dataset: Dataset object.
"""
logger.info(f"Processing parsed docling json file: {json_fp}")
print(f"THIS IS KHALED: {json_fp=}")
with open(json_fp, "r", encoding="utf-8") as f:
data = json.load(f)

Expand Down Expand Up @@ -574,6 +575,7 @@ def chunk_pdfs(pdf_docs: List, filepaths: List, leaf_node_path: Path, model_name

converter = DocumentConverter(pipeline_options=PipelineOptions())
parsed_pdfs = [converter.convert_single(d) for d in pdf_docs]
print(f"THIS IS KHALED: {parsed_pdfs=}")
parsed_dicts = [p.render_as_dict() for p in parsed_pdfs]

docling_jsons_path = DOC_FILEPATH / "docling-jsons"
Expand Down

0 comments on commit 63df047

Please sign in to comment.