Skip to content

Commit 2267206

Browse files
committed
simplify
1 parent 91911fd commit 2267206

File tree

1 file changed

+26
-142
lines changed

1 file changed

+26
-142
lines changed

app/serverless.py

Lines changed: 26 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -2,220 +2,104 @@
22
import os
33
import time
44
import tempfile
5-
import copy
6-
from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
75

86
import runpod
97

10-
# New mineru imports
11-
from mineru.data.data_reader_writer import FileBasedDataWriter
12-
from mineru.utils.enum_class import MakeMode
13-
from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
14-
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
15-
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
16-
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
17-
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
188

199
class TimeoutError(Exception):
    """Module-specific timeout error.

    NOTE(review): this name shadows Python's builtin ``TimeoutError`` within
    this module. Unlike the builtin, it derives directly from ``Exception``
    (not ``OSError``), so ``except OSError`` will not catch it.
    """
2111

12+
2213
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True):
    """Convert PDF bytes to markdown using the pipeline backend.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        lang: Language hint forwarded to ``pipeline_doc_analyze``.
        parse_method: Parse method forwarded to ``pipeline_doc_analyze``.
        formula_enable: Forwarded to the analysis and middle-JSON steps.
        table_enable: Forwarded to the analysis step.

    Returns:
        Whatever ``pipeline_union_make`` produces in ``MakeMode.MM_MD`` mode
        (presumably the markdown string - confirm against mineru).

    Raises:
        Exception: Any failure (including a failed mineru import) is wrapped
            as ``"Error converting PDF to markdown: <reason>"``; the original
            exception is kept as ``__cause__``.
    """
    try:
        # Lazy imports to avoid import-time signal handling in non-main threads
        from mineru.data.data_reader_writer import FileBasedDataWriter
        from mineru.utils.enum_class import MakeMode
        from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze
        from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
        from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json

        # The analyzer takes batches; a one-element batch is submitted here.
        infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
            [pdf_bytes], [lang], parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable
        )

        # Unwrap the single document from each per-batch result list.
        model_list = infer_results[0]
        images_list = all_image_lists[0]
        pdf_doc = all_pdf_docs[0]
        _lang = lang_list_result[0]
        _ocr_enable = ocr_enabled_list[0]

        # The image writer targets a throwaway directory that is removed when
        # the block exits; only the generated markdown is returned.
        with tempfile.TemporaryDirectory() as temp_dir:
            image_writer = FileBasedDataWriter(temp_dir)
            middle_json = pipeline_result_to_middle_json(
                model_list, images_list, pdf_doc, image_writer, _lang, _ocr_enable, formula_enable
            )
            pdf_info = middle_json["pdf_info"]
            return pipeline_union_make(pdf_info, MakeMode.MM_MD, "images")
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise Exception(f"Error converting PDF to markdown: {str(e)}") from e
5842

43+
5944
def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=None):
    """Convert PDF bytes to markdown using VLM backends; returns markdown string.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        backend: Backend name; a leading ``"vlm-"`` prefix is stripped before
            the value is passed to ``vlm_doc_analyze``.
        server_url: Optional server URL forwarded unchanged to
            ``vlm_doc_analyze``.

    Returns:
        Whatever ``vlm_union_make`` produces in ``MakeMode.MM_MD`` mode.

    Raises:
        ImportError: If the mineru package cannot be imported. Unlike
            ``convert_to_markdown``, errors are not wrapped here.
    """
    # Lazy imports to avoid import-time signal handling in non-main threads
    from mineru.data.data_reader_writer import FileBasedDataWriter
    from mineru.utils.enum_class import MakeMode
    from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
    from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make

    # "vlm-sglang-engine" -> "sglang-engine"; names without the prefix pass
    # through untouched. Presumably this is the form vlm_doc_analyze expects.
    normalized_backend = backend[4:] if backend.startswith("vlm-") else backend

    # The image writer targets a throwaway directory that is removed when the
    # block exits; only the generated markdown is returned.
    with tempfile.TemporaryDirectory() as temp_dir:
        image_writer = FileBasedDataWriter(temp_dir)
        middle_json, _ = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=normalized_backend, server_url=server_url
        )
        pdf_info = middle_json["pdf_info"]
        return vlm_union_make(pdf_info, MakeMode.MM_MD, "images")
7560

7661

7762
def convert_to_markdown_dispatch(pdf_bytes, **kwargs):
    """Dispatch to pipeline or VLM engine based on env MINERU_BACKEND.

    ``MINERU_BACKEND`` is read on every call, trimmed of surrounding
    whitespace and lower-cased. Only the exact value ``"vlm-sglang-engine"``
    selects the VLM path; any other value (or an unset variable) uses the
    pipeline converter.

    Args:
        pdf_bytes: Raw bytes of a single PDF document.
        **kwargs: Options forwarded to ``convert_to_markdown`` on the pipeline
            path. They are ignored on the VLM path.

    Returns:
        The result of the selected converter.
    """
    # strip() tolerates stray whitespace in the deployment's env configuration.
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").strip().lower()
    if backend_env == "vlm-sglang-engine":
        # May be None when MINERU_SGLANG_SERVER_URL is unset.
        server_url = os.getenv("MINERU_SGLANG_SERVER_URL")
        return convert_to_markdown_vlm(pdf_bytes, backend=backend_env, server_url=server_url)
    return convert_to_markdown(pdf_bytes, **kwargs)
8869

8970

90-
def _convert_entry(args_tuple):
91-
"""Top-level helper for subprocess execution."""
92-
(
93-
pdf_bytes,
94-
backend_env,
95-
server_url,
96-
lang,
97-
parse_method,
98-
formula_enable,
99-
table_enable,
100-
) = args_tuple
101-
if backend_env:
102-
os.environ["MINERU_BACKEND"] = backend_env
103-
if server_url:
104-
os.environ["MINERU_SGLANG_SERVER_URL"] = server_url
105-
return convert_to_markdown_dispatch(
106-
pdf_bytes,
107-
lang=lang,
108-
parse_method=parse_method,
109-
formula_enable=formula_enable,
110-
table_enable=table_enable,
111-
)
112-
113-
114-
def convert_to_markdown_with_timeout(
115-
pdf_bytes,
116-
timeout_seconds=None,
117-
*,
118-
backend_env: str | None,
119-
server_url: str | None,
120-
lang: str,
121-
parse_method: str,
122-
formula_enable: bool,
123-
table_enable: bool,
124-
):
125-
"""Run conversion in a separate process with an optional timeout.
126-
Keeps conversion in the main thread when no timeout is requested.
127-
"""
128-
# If no timeout, run inline in the main process/thread to allow libraries that require main-thread signals.
129-
if not timeout_seconds or timeout_seconds <= 0:
130-
# Ensure env is applied for inline run as well
131-
if backend_env:
132-
os.environ["MINERU_BACKEND"] = backend_env
133-
if server_url:
134-
os.environ["MINERU_SGLANG_SERVER_URL"] = server_url
135-
return convert_to_markdown_dispatch(
136-
pdf_bytes,
137-
lang=lang,
138-
parse_method=parse_method,
139-
formula_enable=formula_enable,
140-
table_enable=table_enable,
141-
)
142-
143-
args_tuple = (
144-
pdf_bytes,
145-
backend_env,
146-
server_url,
147-
lang,
148-
parse_method,
149-
formula_enable,
150-
table_enable,
151-
)
152-
with ProcessPoolExecutor(max_workers=1) as executor:
153-
future = executor.submit(_convert_entry, args_tuple)
154-
try:
155-
return future.result(timeout=timeout_seconds)
156-
except FuturesTimeoutError:
157-
raise TimeoutError(f"PDF processing timed out after {timeout_seconds} seconds")
158-
159-
16071
def handler(event):
    """Serverless entry point: decode a base64 PDF and return its markdown.

    Reads these keys from ``event["input"]``:
        file_content: Base64-encoded PDF (required).
        filename: Original file name; must end in ``.pdf`` (required).
        lang: Defaults to ``"en"``.
        parse_method: Defaults to ``"auto"``.
        formula_enable: Defaults to ``True``.
        table_enable: Defaults to ``True``.

    Returns:
        ``{"markdown": <content>, "status": "SUCCESS"}`` on success, otherwise
        ``{"error": <message>, "status": "ERROR"}``. Exceptions are never
        propagated to the caller; they are reported through the error dict.
    """
    try:
        # `or {}` also covers an event whose "input" key is present but null,
        # so it reaches the normal "missing" response below.
        input_data = event.get("input") or {}
        base64_content = input_data.get("file_content")
        filename = input_data.get("filename")

        lang = input_data.get("lang", "en")
        parse_method = input_data.get("parse_method", "auto")
        formula_enable = input_data.get("formula_enable", True)
        table_enable = input_data.get("table_enable", True)

        if not base64_content or not filename:
            return {"error": "Missing file_content or filename", "status": "ERROR"}

        # The check is by file extension only; the bytes are not inspected.
        if not filename.lower().endswith('.pdf'):
            return {"error": "Only PDF files supported", "status": "ERROR"}

        pdf_bytes = base64.b64decode(base64_content)

        md_content = convert_to_markdown_dispatch(
            pdf_bytes=pdf_bytes,
            lang=lang,
            parse_method=parse_method,
            formula_enable=formula_enable,
            table_enable=table_enable,
        )

        return {"markdown": md_content, "status": "SUCCESS"}
    except Exception as e:
        # Top-level boundary: every failure becomes an error payload.
        return {"error": str(e), "status": "ERROR"}
218101

102+
219103
if __name__ == "__main__":
    # Script entry point: register `handler` with the RunPod serverless
    # runtime. Nothing here runs when the module is merely imported.
    print("Starting RunPod serverless handler...")
    runpod.serverless.start({"handler": handler})

0 commit comments

Comments
 (0)