44import asyncio
55import tempfile
66import copy
7+ import io
78
89import runpod
910
1415from mineru .backend .pipeline .pipeline_middle_json_mkcontent import union_make as pipeline_union_make
1516from mineru .backend .pipeline .model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
1617
18+ from pypdf import PdfReader , PdfWriter
19+
class TimeoutError(Exception):
    """Module-specific timeout error.

    NOTE: within this module the name shadows Python's built-in
    ``TimeoutError``; this class derives directly from ``Exception`` and is
    not an ``OSError`` subclass like the built-in.
    Presumably raised when a job exceeds its allowed time -- confirm
    against the call sites.
    """
1922
20- def convert_to_markdown (pdf_bytes , lang = "en" , parse_method = "auto" , formula_enable = True , table_enable = True ):
def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
    """Return PDF bytes containing at most the first ``max_pages`` pages.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        max_pages: Maximum number of leading pages to keep. ``None`` or a
            non-positive value disables trimming.

    Returns:
        ``pdf_bytes`` itself when trimming is disabled or the document
        already has no more than ``max_pages`` pages; otherwise the bytes of
        a newly written PDF holding only the first ``max_pages`` pages.
    """
    if max_pages is None or max_pages <= 0:
        return pdf_bytes

    reader = PdfReader(io.BytesIO(pdf_bytes))

    # Nothing to cut: hand back the caller's bytes untouched instead of
    # re-serializing the whole document through PdfWriter.
    if len(reader.pages) <= max_pages:
        return pdf_bytes

    writer = PdfWriter()
    # The guard above guarantees the document has more than max_pages pages,
    # so every index in this range is valid.
    for page_index in range(max_pages):
        writer.add_page(reader.pages[page_index])

    with io.BytesIO() as output_buffer:
        writer.write(output_buffer)
        return output_buffer.getvalue()
39+
40+ def convert_to_markdown (pdf_bytes , lang = "en" , parse_method = "auto" , formula_enable = True , table_enable = True , max_pages = None ):
2141 """Convert PDF bytes to markdown - returns only the markdown string"""
2242
2343 try :
44+ # Optionally limit to first N pages
45+ if max_pages is not None :
46+ try :
47+ max_pages_int = int (max_pages )
48+ except Exception :
49+ raise Exception ("Invalid max_pages value; must be an integer" )
50+ pdf_bytes = _trim_pdf_to_max_pages (pdf_bytes , max_pages_int )
51+
2452 # Analyze the PDF
2553 infer_results , all_image_lists , all_pdf_docs , lang_list_result , ocr_enabled_list = pipeline_doc_analyze (
2654 [pdf_bytes ],
@@ -77,6 +105,7 @@ async def handler(event):
77105 filename = input_data .get ("filename" )
78106 timeout = input_data .get ("timeout" )
79107 created_at = input_data .get ("created_at" )
108+ max_pages = input_data .get ("max_pages" )
80109
81110 # Processing options
82111 lang = input_data .get ("lang" , "en" )
@@ -103,6 +132,15 @@ async def handler(event):
103132 if not filename .lower ().endswith ('.pdf' ):
104133 return {"error" : "Only PDF files supported" , "status" : "ERROR" }
105134
135+ # Validate max_pages if provided
136+ if max_pages is not None :
137+ try :
138+ max_pages = int (max_pages )
139+ if max_pages <= 0 :
140+ return {"error" : "max_pages must be a positive integer" , "status" : "ERROR" }
141+ except Exception :
142+ return {"error" : "Invalid max_pages; must be an integer" , "status" : "ERROR" }
143+
106144 # Process PDF
107145 pdf_bytes = base64 .b64decode (base64_content )
108146
@@ -112,7 +150,8 @@ async def handler(event):
112150 lang = lang ,
113151 parse_method = parse_method ,
114152 formula_enable = formula_enable ,
115- table_enable = table_enable
153+ table_enable = table_enable ,
154+ max_pages = max_pages
116155 )
117156
118157 return {"markdown" : md_content , "status" : "SUCCESS" }
0 commit comments