Skip to content

Commit 19b800c

Browse files
committed
Merge branch 'main' into feat/vlm
2 parents 2267206 + 3a87b5f commit 19b800c

File tree

3 files changed

+51
-3
lines changed

3 files changed

+51
-3
lines changed

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ COPY pyproject.toml poetry.lock ./
2525

2626
ENV PATH="/root/.local/bin:$PATH"
2727
RUN poetry config virtualenvs.in-project true && \
28+
poetry lock --no-interaction && \
2829
poetry install --no-interaction --no-root && \
2930
rm -rf /root/.cache/pypoetry && \
3031
rm -rf /root/.cache/pip

app/serverless.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,36 @@
22
import os
33
import time
44
import tempfile
5+
import copy
6+
import io
57

68
import runpod
79

810

11+
from pypdf import PdfReader, PdfWriter
12+
913
class TimeoutError(Exception):
    """Module-specific timeout error.

    NOTE: this name shadows Python's built-in ``TimeoutError`` (an ``OSError``
    subclass) for the rest of this module. An ``except TimeoutError`` written
    here therefore catches only this class, not timeouts raised by the
    standard library.

    NOTE(review): presumably raised when PDF processing exceeds its time
    budget; the raise sites are not visible in this hunk, so confirm.
    """
1115

1216

13-
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True):
17+
def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
    """Return PDF bytes containing at most the first ``max_pages`` pages.

    Args:
        pdf_bytes: The serialized PDF document.
        max_pages: Maximum number of leading pages to keep. ``None`` or a
            value ``<= 0`` disables trimming.

    Returns:
        ``pdf_bytes`` itself when trimming is disabled or the document
        already has ``max_pages`` pages or fewer; otherwise a newly written
        PDF holding only the first ``max_pages`` pages.

    Raises:
        Whatever ``pypdf`` raises when ``pdf_bytes`` cannot be parsed or the
        trimmed document cannot be written.
    """
    if max_pages is None or max_pages <= 0:
        return pdf_bytes

    reader = PdfReader(io.BytesIO(pdf_bytes))

    # Nothing to cut: hand back the caller's bytes untouched instead of
    # re-serialising the whole document through PdfWriter for no benefit.
    if len(reader.pages) <= max_pages:
        return pdf_bytes

    writer = PdfWriter()
    for page_index in range(max_pages):
        writer.add_page(reader.pages[page_index])

    output_buffer = io.BytesIO()
    writer.write(output_buffer)
    return output_buffer.getvalue()
33+
34+
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True, max_pages=None):
1435
"""Convert PDF bytes to markdown - returns only the markdown string"""
1536
try:
1637
# Lazy imports to avoid import-time signal handling in non-main threads
@@ -20,6 +41,15 @@ def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enabl
2041
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
2142
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
2243

44+
# Optionally limit to first N pages
45+
if max_pages is not None:
46+
try:
47+
max_pages_int = int(max_pages)
48+
except Exception:
49+
raise Exception("Invalid max_pages value; must be an integer")
50+
pdf_bytes = _trim_pdf_to_max_pages(pdf_bytes, max_pages_int)
51+
52+
# Analyze the PDF
2353
infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
2454
[pdf_bytes], [lang], parse_method=parse_method, formula_enable=formula_enable, table_enable=table_enable
2555
)
@@ -74,6 +104,11 @@ def handler(event):
74104
base64_content = input_data.get("file_content")
75105
filename = input_data.get("filename")
76106

107+
timeout = input_data.get("timeout")
108+
created_at = input_data.get("created_at")
109+
max_pages = input_data.get("max_pages")
110+
111+
# Processing options
77112
lang = input_data.get("lang", "en")
78113
parse_method = input_data.get("parse_method", "auto")
79114
formula_enable = input_data.get("formula_enable", True)
@@ -85,6 +120,16 @@ def handler(event):
85120
if not filename.lower().endswith('.pdf'):
86121
return {"error": "Only PDF files supported", "status": "ERROR"}
87122

123+
# Validate max_pages if provided
124+
if max_pages is not None:
125+
try:
126+
max_pages = int(max_pages)
127+
if max_pages <= 0:
128+
return {"error": "max_pages must be a positive integer", "status": "ERROR"}
129+
except Exception:
130+
return {"error": "Invalid max_pages; must be an integer", "status": "ERROR"}
131+
132+
# Process PDF
88133
pdf_bytes = base64.b64decode(base64_content)
89134

90135
md_content = convert_to_markdown_dispatch(
@@ -93,6 +138,7 @@ def handler(event):
93138
parse_method=parse_method,
94139
formula_enable=formula_enable,
95140
table_enable=table_enable,
141+
max_pages=max_pages
96142
)
97143

98144
return {"markdown": md_content, "status": "SUCCESS"}

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ authors = [
88
readme = "README.md"
99
requires-python = ">=3.10,<3.14"
1010
dependencies = [
11-
"mineru[pipeline,vlm,sglang] (>=2.1.11,<3.0.0)",
12-
"runpod (>=1.7.12,<2.0.0)"
11+
"mineru[pipeline] (>=2.1.11,<3.0.0)",
12+
"runpod (>=1.7.12,<2.0.0)",
13+
"pypdf (>=4.2.0,<6.0.0)"
1314
]
1415

1516

0 commit comments

Comments
 (0)