22import os
33import time
44import tempfile
5- import copy
6- from concurrent .futures import ProcessPoolExecutor , TimeoutError as FuturesTimeoutError
75
86import runpod
97
10- # New mineru imports
11- from mineru .data .data_reader_writer import FileBasedDataWriter
12- from mineru .utils .enum_class import MakeMode
13- from mineru .backend .pipeline .pipeline_analyze import doc_analyze as pipeline_doc_analyze
14- from mineru .backend .pipeline .pipeline_middle_json_mkcontent import union_make as pipeline_union_make
15- from mineru .backend .pipeline .model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
16- from mineru .backend .vlm .vlm_analyze import doc_analyze as vlm_doc_analyze
17- from mineru .backend .vlm .vlm_middle_json_mkcontent import union_make as vlm_union_make
188
class TimeoutError(Exception):
    """Raised when PDF processing exceeds its allotted time.

    NOTE: this intentionally keeps the original name, which shadows the
    builtin ``TimeoutError`` (an ``OSError`` subclass) inside this module.
    Code in this file that catches ``TimeoutError`` therefore catches this
    class, not the builtin one.
    """
2111
12+
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True):
    """Convert PDF bytes to markdown using the MinerU pipeline backend.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        lang: Language hint forwarded to the pipeline analyzer.
        parse_method: Parse method forwarded to the pipeline analyzer.
        formula_enable: Forwarded to the analyzer and to the middle-JSON step.
        table_enable: Forwarded to the analyzer.

    Returns:
        Whatever ``pipeline_union_make`` produces for ``MakeMode.MM_MD``
        (presumably the markdown string -- confirm against MinerU docs).

    Raises:
        Exception: Any failure (including a failed MinerU import) is re-raised
            as a plain ``Exception`` whose message starts with
            ``"Error converting PDF to markdown: "``; the original exception
            is attached as ``__cause__``.
    """
    try:
        # Lazy imports to avoid import-time signal handling in non-main threads
        from mineru.data.data_reader_writer import FileBasedDataWriter
        from mineru.utils.enum_class import MakeMode
        from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
        from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
        from mineru.backend.pipeline.model_json_to_middle_json import (
            result_to_middle_json as pipeline_result_to_middle_json,
        )

        # The analyzer works on batches; we submit a batch of exactly one document.
        infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
            [pdf_bytes], [lang], parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable
        )

        # Unwrap the single-document batch.
        model_list = infer_results[0]
        images_list = all_image_lists[0]
        pdf_doc = all_pdf_docs[0]
        _lang = lang_list_result[0]
        _ocr_enable = ocr_enabled_list[0]

        # The image writer targets a throwaway directory that is deleted when
        # the ``with`` block exits; only the returned value survives.
        with tempfile.TemporaryDirectory() as temp_dir:
            image_writer = FileBasedDataWriter(temp_dir)
            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, formula_enable
            )
            pdf_info = middle_json["pdf_info"]
            return pipeline_union_make(pdf_info, MakeMode.MM_MD, "images")
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise Exception(f"Error converting PDF to markdown: {e}") from e
5842
43+
def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=None):
    """Convert PDF bytes to markdown using a MinerU VLM backend.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        backend: Backend name; a leading ``"vlm-"`` is stripped before the
            value is handed to ``vlm_doc_analyze``.
        server_url: Optional server URL forwarded unchanged to
            ``vlm_doc_analyze``.

    Returns:
        Whatever ``vlm_union_make`` produces for ``MakeMode.MM_MD``
        (presumably the markdown string -- confirm against MinerU docs).

    Unlike the pipeline variant, exceptions raised here propagate unwrapped.
    """
    # Lazy imports to avoid import-time signal handling in non-main threads
    from mineru.data.data_reader_writer import FileBasedDataWriter
    from mineru.utils.enum_class import MakeMode
    from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

    # "vlm-sglang-engine" -> "sglang-engine"; names without the prefix pass through.
    normalized_backend = backend[4:] if backend.startswith("vlm-") else backend

    # The image writer targets a throwaway directory that is deleted when the
    # ``with`` block exits; only the returned value survives.
    with tempfile.TemporaryDirectory() as temp_dir:
        image_writer = FileBasedDataWriter(temp_dir)
        middle_json, _ = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=normalized_backend, server_url=server_url
        )
        pdf_info = middle_json["pdf_info"]
        return vlm_union_make(pdf_info, MakeMode.MM_MD, "images")
7560
7661
def convert_to_markdown_dispatch(pdf_bytes, **kwargs):
    """Dispatch to the pipeline or VLM engine based on env ``MINERU_BACKEND``.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        **kwargs: Pipeline options (``lang``, ``parse_method``, ...). They are
            forwarded only to the pipeline backend; the VLM path ignores them.

    Returns:
        The result of ``convert_to_markdown_vlm`` when ``MINERU_BACKEND`` is
        ``"vlm-sglang-engine"`` (case-insensitive, surrounding whitespace
        ignored), otherwise the result of ``convert_to_markdown``.
    """
    # Strip as well as lower-case: a stray space or newline in the env value
    # would otherwise silently select the pipeline backend.
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").strip().lower()
    if backend_env == "vlm-sglang-engine":
        # Unset -> None, which is passed straight through to the VLM call.
        server_url = os.getenv("MINERU_SGLANG_SERVER_URL")
        return convert_to_markdown_vlm(pdf_bytes, backend=backend_env, server_url=server_url)
    return convert_to_markdown(pdf_bytes, **kwargs)
8869
8970
def _convert_entry(args_tuple):
    """Top-level helper for subprocess execution.

    Args:
        args_tuple: ``(pdf_bytes, backend_env, server_url, lang, parse_method,
            formula_enable, table_enable)`` packed into one tuple so the call
            can be submitted to a worker process as a single argument.

    Returns:
        The result of ``convert_to_markdown_dispatch`` for those arguments.
    """
    (
        pdf_bytes,
        backend_env,
        server_url,
        lang,
        parse_method,
        formula_enable,
        table_enable,
    ) = args_tuple

    # Re-apply the backend selection in this process's environment, because
    # the dispatcher reads it from os.environ. Falsy values leave it as is.
    if backend_env:
        os.environ["MINERU_BACKEND"] = backend_env
    if server_url:
        os.environ["MINERU_SGLANG_SERVER_URL"] = server_url

    return convert_to_markdown_dispatch(
        pdf_bytes,
        lang=lang,
        parse_method=parse_method,
        formula_enable=formula_enable,
        table_enable=table_enable,
    )
112-
113-
def _timeout_worker(conn, args_tuple):
    """Child-process entry point: run the conversion and send ``(ok, payload)`` over ``conn``."""
    try:
        try:
            message = (True, _convert_entry(args_tuple))
        except Exception as exc:
            message = (False, exc)
        try:
            conn.send(message)
        except Exception:
            # The payload could not be pickled; fall back to its text form so
            # the parent still learns what went wrong.
            conn.send((False, Exception(str(message[1]))))
    finally:
        conn.close()


def convert_to_markdown_with_timeout(
    pdf_bytes,
    timeout_seconds=None,
    *,
    backend_env: str | None,
    server_url: str | None,
    lang: str,
    parse_method: str,
    formula_enable: bool,
    table_enable: bool,
):
    """Run conversion in a separate process with an optional timeout.

    Keeps conversion in the calling process/thread when no timeout is
    requested, so libraries that require main-thread signals keep working.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        timeout_seconds: Maximum wall-clock seconds to wait. ``None``, ``0`` or
            a negative value means "no timeout, run inline".
        backend_env: Value to export as ``MINERU_BACKEND`` (skipped if falsy).
        server_url: Value to export as ``MINERU_SGLANG_SERVER_URL`` (skipped
            if falsy).
        lang, parse_method, formula_enable, table_enable: Forwarded to
            ``convert_to_markdown_dispatch``.

    Returns:
        The conversion result from ``convert_to_markdown_dispatch``.

    Raises:
        TimeoutError: The worker did not finish within ``timeout_seconds``.
            (The name resolves to the ``TimeoutError`` in module scope.)
        RuntimeError: The worker process died without reporting a result.
        Exception: Any exception raised by the conversion itself.
    """
    if not timeout_seconds or timeout_seconds <= 0:
        # Ensure env is applied for the inline run as well.
        if backend_env:
            os.environ["MINERU_BACKEND"] = backend_env
        if server_url:
            os.environ["MINERU_SGLANG_SERVER_URL"] = server_url
        return convert_to_markdown_dispatch(
            pdf_bytes,
            lang=lang,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )

    # Local import keeps this block self-contained.
    import multiprocessing

    args_tuple = (
        pdf_bytes,
        backend_env,
        server_url,
        lang,
        parse_method,
        formula_enable,
        table_enable,
    )

    # A dedicated process (rather than ``with ProcessPoolExecutor``) is used
    # because the executor's context manager waits for the running task on
    # exit, which would block the caller past the timeout.
    receiver, sender = multiprocessing.Pipe(duplex=False)
    worker = multiprocessing.Process(target=_timeout_worker, args=(sender, args_tuple))
    worker.start()
    # Parent keeps only the read end so it sees EOF if the child dies early.
    sender.close()
    try:
        if not receiver.poll(timeout_seconds):
            raise TimeoutError(f"PDF processing timed out after {timeout_seconds} seconds")
        try:
            succeeded, payload = receiver.recv()
        except EOFError:
            raise RuntimeError("PDF conversion worker exited without returning a result") from None
    finally:
        # Reclaim the worker whether it finished, failed, or overran.
        if worker.is_alive():
            worker.terminate()
        worker.join()
        receiver.close()

    if succeeded:
        return payload
    raise payload
158-
159-
def handler(event):
    """Main serverless handler - returns only markdown (synchronous).

    Args:
        event: Job payload. ``event["input"]`` is read for:
            ``file_content`` (base64-encoded PDF), ``filename`` (must end in
            ``.pdf``, case-insensitive), and the optional ``lang`` (default
            ``"en"``), ``parse_method`` (default ``"auto"``),
            ``formula_enable`` and ``table_enable`` (both default ``True``).

    Returns:
        ``{"markdown": ..., "status": "SUCCESS"}`` on success, otherwise
        ``{"error": <message>, "status": "ERROR"}``. This function never
        raises: every exception is converted into an error response.
    """
    try:
        # ``or {}`` also covers an explicit ``"input": None`` payload, which
        # would otherwise fail on the ``.get`` calls below.
        input_data = event.get("input") or {}
        base64_content = input_data.get("file_content")
        filename = input_data.get("filename")

        lang = input_data.get("lang", "en")
        parse_method = input_data.get("parse_method", "auto")
        formula_enable = input_data.get("formula_enable", True)
        table_enable = input_data.get("table_enable", True)

        # Validate before doing any expensive work.
        if not base64_content or not filename:
            return {"error": "Missing file_content or filename", "status": "ERROR"}

        if not filename.lower().endswith(".pdf"):
            return {"error": "Only PDF files supported", "status": "ERROR"}

        pdf_bytes = base64.b64decode(base64_content)

        md_content = convert_to_markdown_dispatch(
            pdf_bytes=pdf_bytes,
            lang=lang,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )

        return {"markdown": md_content, "status": "SUCCESS"}
    except Exception as e:
        # Top-level boundary: report the failure to the caller instead of
        # letting the exception escape.
        return {"error": str(e), "status": "ERROR"}
218101
102+
if __name__ == "__main__":
    # Script entry point: register `handler` with the RunPod serverless
    # runtime. Nothing here runs when the module is merely imported.
    print("Starting RunPod serverless handler...")
    runpod.serverless.start({"handler": handler})
0 commit comments