22import os
33import time
44import tempfile
5+ import copy
6+ import io
57
68import runpod
79
810
11+ from pypdf import PdfReader , PdfWriter
12+
class TimeoutError(Exception):
    """Module-specific timeout error.

    Note: within this module the name shadows Python's built-in
    ``TimeoutError``; this class derives directly from ``Exception``.
    """
1115
1216
13- def convert_to_markdown (pdf_bytes , lang = "en" , parse_method = "auto" , formula_enable = True , table_enable = True ):
def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
    """Return PDF bytes containing at most the first ``max_pages`` pages.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        max_pages: Maximum number of leading pages to keep. ``None`` or a
            non-positive value disables trimming.

    Returns:
        ``pdf_bytes`` unchanged when trimming is disabled or the document
        already has ``max_pages`` pages or fewer; otherwise the bytes of a
        newly written PDF holding only the first ``max_pages`` pages.
    """
    if max_pages is None or max_pages <= 0:
        return pdf_bytes

    reader = PdfReader(io.BytesIO(pdf_bytes))
    if len(reader.pages) <= max_pages:
        # Nothing to cut: hand back the caller's bytes instead of paying for
        # a rebuild that would only produce a re-serialized copy.
        return pdf_bytes

    writer = PdfWriter()
    for page_index in range(max_pages):
        writer.add_page(reader.pages[page_index])

    output_buffer = io.BytesIO()
    writer.write(output_buffer)
    return output_buffer.getvalue()
33+
34+ def convert_to_markdown (pdf_bytes , lang = "en" , parse_method = "auto" , formula_enable = True , table_enable = True , max_pages = None ):
1435 """Convert PDF bytes to markdown - returns only the markdown string"""
1536 try :
1637 # Lazy imports to avoid import-time signal handling in non-main threads
@@ -20,6 +41,15 @@ def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enabl
2041 from mineru .backend .pipeline .pipeline_middle_json_mkcontent import union_make as pipeline_union_make
2142 from mineru .backend .pipeline .model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
2243
44+ # Optionally limit to first N pages
45+ if max_pages is not None :
46+ try :
47+ max_pages_int = int (max_pages )
48+ except Exception :
49+ raise Exception ("Invalid max_pages value; must be an integer" )
50+ pdf_bytes = _trim_pdf_to_max_pages (pdf_bytes , max_pages_int )
51+
52+ # Analyze the PDF
2353 infer_results , all_image_lists , all_pdf_docs , lang_list_result , ocr_enabled_list = pipeline_doc_analyze (
2454 [pdf_bytes ], [lang ], parse_method = parse_method , formula_enable = formula_enable , table_enable = table_enable
2555 )
@@ -74,6 +104,11 @@ def handler(event):
74104 base64_content = input_data .get ("file_content" )
75105 filename = input_data .get ("filename" )
76106
107+ timeout = input_data .get ("timeout" )
108+ created_at = input_data .get ("created_at" )
109+ max_pages = input_data .get ("max_pages" )
110+
111+ # Processing options
77112 lang = input_data .get ("lang" , "en" )
78113 parse_method = input_data .get ("parse_method" , "auto" )
79114 formula_enable = input_data .get ("formula_enable" , True )
@@ -85,6 +120,16 @@ def handler(event):
85120 if not filename .lower ().endswith ('.pdf' ):
86121 return {"error" : "Only PDF files supported" , "status" : "ERROR" }
87122
123+ # Validate max_pages if provided
124+ if max_pages is not None :
125+ try :
126+ max_pages = int (max_pages )
127+ if max_pages <= 0 :
128+ return {"error" : "max_pages must be a positive integer" , "status" : "ERROR" }
129+ except Exception :
130+ return {"error" : "Invalid max_pages; must be an integer" , "status" : "ERROR" }
131+
132+ # Process PDF
88133 pdf_bytes = base64 .b64decode (base64_content )
89134
90135 md_content = convert_to_markdown_dispatch (
@@ -93,6 +138,7 @@ def handler(event):
93138 parse_method = parse_method ,
94139 formula_enable = formula_enable ,
95140 table_enable = table_enable ,
141+ max_pages = max_pages
96142 )
97143
98144 return {"markdown" : md_content , "status" : "SUCCESS" }
0 commit comments