diff --git a/src/instructlab/sdg/utils/chunking.py b/src/instructlab/sdg/utils/chunking.py index 6048aa75..6ccc07e6 100644 --- a/src/instructlab/sdg/utils/chunking.py +++ b/src/instructlab/sdg/utils/chunking.py @@ -138,10 +138,11 @@ def chunk_pdfs(pdf_docs: List, leaf_node_path: Path): parsed_pdfs = converter.convert(pdf_docs) parsed_dicts = [p.render_as_dict() for p in parsed_pdfs] - docling_jsons_path = Path("TODO") + docling_jsons_path = Path("~/docling-jsonls") - for pd in parsed_dicts: - fp = docling_jsons_path / "TODO.jsonl" + # TODO name files better + for i, pd in enumerate(parsed_dicts): + fp = docling_jsons_path / f"docling_{i}.jsonl" with open(fp, "w") as jsonl_file: for entry in pd: