Skip to content

Commit 68b104b

Browse files
committed
improved naming of pdf chunks
1 parent 160b2f3 commit 68b104b

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

backend/src/app/preprocessing/pipeline/steps/text/init/extract_content_in_html_from_pdf_docs.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,9 @@ def __split_large_pdf_into_chunks(
3636
src.close()
3737
return None
3838

39+
# Calculate the number of digits needed for zero-padding
40+
total_digits = len(str(total_pages))
41+
3942
# If yes, we proceed to split the PDF and save the chunks to disk in the project repo
4043
out_dir = input_doc.parent
4144
logger.info(
@@ -47,7 +50,8 @@ def __split_large_pdf_into_chunks(
4750
for i in range(num_splits):
4851
start_page = i * max_pages_per_chunk + 1
4952
end_page = min((i + 1) * max_pages_per_chunk, total_pages)
50-
page_range_str = f"{start_page}-{end_page}"
53+
# Format page range with zero-padding
54+
page_range_str = f"{start_page:0{total_digits}}-{end_page:0{total_digits}}"
5155
output_fn = out_dir / f"{input_doc.stem}_pages_{page_range_str}.pdf"
5256
try:
5357
# Create a new PDF for the chunk

0 commit comments

Comments
 (0)