Skip to content

Commit 10d7d62

Browse files
committed
wip
1 parent e951de3 commit 10d7d62

File tree

1 file changed

+96
-6
lines changed

1 file changed

+96
-6
lines changed

app/serverless.py

Lines changed: 96 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import tempfile
55
import copy
66
import io
7+
import asyncio
78

89
import runpod
910

@@ -118,13 +119,101 @@ def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=N
118119
return vlm_union_make(pdf_info, MakeMode.MM_MD, "images")
119120

120121

121-
def convert_to_markdown_dispatch(pdf_bytes, **kwargs):
122-
"""Dispatch to pipeline or VLM engine based on env MINERU_BACKEND."""
122+
def _convert_to_markdown_via_aio(
123+
pdf_bytes: bytes,
124+
filename: str,
125+
*,
126+
lang: str = "en",
127+
backend: str = "pipeline",
128+
parse_method: str = "auto",
129+
formula_enable: bool = True,
130+
table_enable: bool = True,
131+
server_url: str | None = None,
132+
max_pages: int | None = None,
133+
) -> str:
134+
"""Use MinerU's aio_do_parse to produce markdown and return its content."""
135+
# Lazy import to keep module import light
136+
from mineru.cli.common import aio_do_parse
137+
138+
# Map max_pages to end_page_id semantics (inclusive end index)
139+
start_page_id = 0
140+
end_page_id = None
141+
if max_pages is not None:
142+
try:
143+
max_pages_int = int(max_pages)
144+
if max_pages_int > 0:
145+
end_page_id = max_pages_int - 1
146+
except Exception:
147+
raise Exception("Invalid max_pages value; must be an integer")
148+
149+
with tempfile.TemporaryDirectory() as output_dir:
150+
# Run async parse
151+
async def _run():
152+
await aio_do_parse(
153+
output_dir=output_dir,
154+
pdf_file_names=[filename],
155+
pdf_bytes_list=[pdf_bytes],
156+
p_lang_list=[lang],
157+
backend=backend,
158+
parse_method=parse_method,
159+
formula_enable=formula_enable,
160+
table_enable=table_enable,
161+
server_url=server_url,
162+
f_draw_layout_bbox=False,
163+
f_draw_span_bbox=False,
164+
f_dump_md=True,
165+
f_dump_middle_json=False,
166+
f_dump_model_output=False,
167+
f_dump_orig_pdf=False,
168+
f_dump_content_list=False,
169+
start_page_id=start_page_id,
170+
end_page_id=end_page_id,
171+
)
172+
173+
asyncio.run(_run())
174+
175+
# Locate markdown file
176+
parse_subdir = parse_method if backend.startswith("pipeline") else "vlm"
177+
parse_dir = os.path.join(output_dir, filename, parse_subdir)
178+
md_path = os.path.join(parse_dir, f"{filename}.md")
179+
if not os.path.exists(md_path):
180+
raise Exception("Markdown output not found after parsing")
181+
with open(md_path, "r", encoding="utf-8") as f:
182+
return f.read()
183+
184+
185+
def convert_to_markdown_dispatch(pdf_bytes, filename=None, **kwargs):
    """Convert a PDF to markdown using the backend named by env MINERU_BACKEND.

    Both pipeline and VLM backends are routed through the aio_do_parse-based
    helper. The backend name is read from ``MINERU_BACKEND`` (default
    ``"pipeline"``, lower-cased) and the server URL from
    ``MINERU_SGLANG_SERVER_URL``.

    Only the ``lang``, ``parse_method``, ``formula_enable``, ``table_enable``
    and ``max_pages`` keyword arguments are read; any other keyword argument
    is ignored. When ``filename`` is ``None`` the name ``"document"`` is used.
    """
    selected_backend = os.getenv("MINERU_BACKEND", "pipeline").lower()
    sglang_url = os.getenv("MINERU_SGLANG_SERVER_URL")

    options = {
        "lang": kwargs.get("lang", "en"),
        "parse_method": kwargs.get("parse_method", "auto"),
        "formula_enable": kwargs.get("formula_enable", True),
        "table_enable": kwargs.get("table_enable", True),
        "max_pages": kwargs.get("max_pages"),
    }

    # VLM backends always use the "vlm" parse method, whatever the caller asked for.
    if selected_backend.startswith("vlm"):
        options["parse_method"] = "vlm"

    return _convert_to_markdown_via_aio(
        pdf_bytes,
        "document" if filename is None else filename,
        backend=selected_backend,
        server_url=sglang_url,
        **options,
    )
216+
128217

129218

130219
def handler(event):
@@ -163,6 +252,7 @@ def handler(event):
163252

164253
md_content = convert_to_markdown_dispatch(
165254
pdf_bytes=pdf_bytes,
255+
filename=os.path.splitext(os.path.basename(filename))[0] if filename else "document",
166256
lang=lang,
167257
parse_method=parse_method,
168258
formula_enable=formula_enable,

0 commit comments

Comments
 (0)