|
4 | 4 | import tempfile |
5 | 5 | import copy |
6 | 6 | import io |
| 7 | +import asyncio |
7 | 8 |
|
8 | 9 | import runpod |
9 | 10 |
|
@@ -118,13 +119,101 @@ def convert_to_markdown_vlm(pdf_bytes, backend="vlm-sglang-engine", server_url=N |
118 | 119 | return vlm_union_make(pdf_info, MakeMode.MM_MD, "images") |
119 | 120 |
|
120 | 121 |
|
121 | | -def convert_to_markdown_dispatch(pdf_bytes, **kwargs): |
122 | | - """Dispatch to pipeline or VLM engine based on env MINERU_BACKEND.""" |
| 122 | +def _convert_to_markdown_via_aio( |
| 123 | + pdf_bytes: bytes, |
| 124 | + filename: str, |
| 125 | + *, |
| 126 | + lang: str = "en", |
| 127 | + backend: str = "pipeline", |
| 128 | + parse_method: str = "auto", |
| 129 | + formula_enable: bool = True, |
| 130 | + table_enable: bool = True, |
| 131 | + server_url: str | None = None, |
| 132 | + max_pages: int | None = None, |
| 133 | +) -> str: |
| 134 | + """Use MinerU's aio_do_parse to produce markdown and return its content.""" |
| 135 | + # Lazy import to keep module import light |
| 136 | + from mineru.cli.common import aio_do_parse |
| 137 | + |
| 138 | + # Map max_pages to end_page_id semantics (inclusive end index) |
| 139 | + start_page_id = 0 |
| 140 | + end_page_id = None |
| 141 | + if max_pages is not None: |
| 142 | + try: |
| 143 | + max_pages_int = int(max_pages) |
| 144 | + if max_pages_int > 0: |
| 145 | + end_page_id = max_pages_int - 1 |
| 146 | + except Exception: |
| 147 | + raise Exception("Invalid max_pages value; must be an integer") |
| 148 | + |
| 149 | + with tempfile.TemporaryDirectory() as output_dir: |
| 150 | + # Run async parse |
| 151 | + async def _run(): |
| 152 | + await aio_do_parse( |
| 153 | + output_dir=output_dir, |
| 154 | + pdf_file_names=[filename], |
| 155 | + pdf_bytes_list=[pdf_bytes], |
| 156 | + p_lang_list=[lang], |
| 157 | + backend=backend, |
| 158 | + parse_method=parse_method, |
| 159 | + formula_enable=formula_enable, |
| 160 | + table_enable=table_enable, |
| 161 | + server_url=server_url, |
| 162 | + f_draw_layout_bbox=False, |
| 163 | + f_draw_span_bbox=False, |
| 164 | + f_dump_md=True, |
| 165 | + f_dump_middle_json=False, |
| 166 | + f_dump_model_output=False, |
| 167 | + f_dump_orig_pdf=False, |
| 168 | + f_dump_content_list=False, |
| 169 | + start_page_id=start_page_id, |
| 170 | + end_page_id=end_page_id, |
| 171 | + ) |
| 172 | + |
| 173 | + asyncio.run(_run()) |
| 174 | + |
| 175 | + # Locate markdown file |
| 176 | + parse_subdir = parse_method if backend.startswith("pipeline") else "vlm" |
| 177 | + parse_dir = os.path.join(output_dir, filename, parse_subdir) |
| 178 | + md_path = os.path.join(parse_dir, f"{filename}.md") |
| 179 | + if not os.path.exists(md_path): |
| 180 | + raise Exception("Markdown output not found after parsing") |
| 181 | + with open(md_path, "r", encoding="utf-8") as f: |
| 182 | + return f.read() |
| 183 | + |
| 184 | + |
def convert_to_markdown_dispatch(pdf_bytes, filename=None, **kwargs):
    """Convert a PDF to markdown using the backend named by MINERU_BACKEND.

    Both pipeline and VLM backends go through ``_convert_to_markdown_via_aio``
    (MinerU's ``aio_do_parse``) to match the official MinerU entrypoints.

    Environment:
        MINERU_BACKEND: backend name, lower-cased; defaults to ``"pipeline"``.
        MINERU_SGLANG_SERVER_URL: forwarded as ``server_url`` (may be unset).

    Args:
        pdf_bytes: Raw bytes of the PDF to convert.
        filename: Document name used for the output lookup. ``None`` or an
            empty string falls back to ``"document"``.
        **kwargs: Optional ``lang``, ``parse_method``, ``formula_enable``,
            ``table_enable`` and ``max_pages``; other keys are ignored.

    Returns:
        The markdown text produced for the document.
    """
    backend_env = os.getenv("MINERU_BACKEND", "pipeline").lower()
    server_url = os.getenv("MINERU_SGLANG_SERVER_URL")

    # VLM backends always use the "vlm" parse method, regardless of the caller.
    if backend_env.startswith("vlm"):
        parse_method = "vlm"
    else:
        parse_method = kwargs.get("parse_method", "auto")

    return _convert_to_markdown_via_aio(
        pdf_bytes,
        # An empty name would yield an output path with an empty component
        # and a file called ".md", so treat it like a missing name.
        filename or "document",
        lang=kwargs.get("lang", "en"),
        backend=backend_env,
        parse_method=parse_method,
        formula_enable=kwargs.get("formula_enable", True),
        table_enable=kwargs.get("table_enable", True),
        server_url=server_url,
        max_pages=kwargs.get("max_pages"),
    )
| 216 | + |
128 | 217 |
|
129 | 218 |
|
130 | 219 | def handler(event): |
@@ -163,6 +252,7 @@ def handler(event): |
163 | 252 |
|
164 | 253 | md_content = convert_to_markdown_dispatch( |
165 | 254 | pdf_bytes=pdf_bytes, |
| 255 | + filename=os.path.splitext(os.path.basename(filename))[0] if filename else "document", |
166 | 256 | lang=lang, |
167 | 257 | parse_method=parse_method, |
168 | 258 | formula_enable=formula_enable, |
|
0 commit comments