Skip to content

Commit b57fa2e

Browse files
committed
Nick: max pages
1 parent 288252a commit b57fa2e

File tree

2 files changed

+43
-3
lines changed

2 files changed

+43
-3
lines changed

app/serverless.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import asyncio
55
import tempfile
66
import copy
7+
import io
78

89
import runpod
910

@@ -14,13 +15,40 @@
1415
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as pipeline_union_make
1516
from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
1617

18+
from pypdf import PdfReader, PdfWriter
19+
1720
class TimeoutError(Exception):
    """Module-specific timeout error derived directly from ``Exception``.

    NOTE(review): this name shadows the builtin ``TimeoutError`` (an
    ``OSError`` subclass) inside this module, so ``except TimeoutError``
    here matches only this class. Presumably it signals that processing
    exceeded the requested timeout; confirm against the raising code.
    """
1922

20-
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True):
23+
def _trim_pdf_to_max_pages(pdf_bytes: bytes, max_pages: int) -> bytes:
24+
"""Return a new PDF bytes object with at most the first max_pages pages."""
25+
if max_pages is None or max_pages <= 0:
26+
return pdf_bytes
27+
28+
input_buffer = io.BytesIO(pdf_bytes)
29+
reader = PdfReader(input_buffer)
30+
31+
writer = PdfWriter()
32+
pages_to_write = min(max_pages, len(reader.pages))
33+
for page_index in range(pages_to_write):
34+
writer.add_page(reader.pages[page_index])
35+
36+
output_buffer = io.BytesIO()
37+
writer.write(output_buffer)
38+
return output_buffer.getvalue()
39+
40+
def convert_to_markdown(pdf_bytes, lang="en", parse_method="auto", formula_enable=True, table_enable=True, max_pages=None):
2141
"""Convert PDF bytes to markdown - returns only the markdown string"""
2242

2343
try:
44+
# Optionally limit to first N pages
45+
if max_pages is not None:
46+
try:
47+
max_pages_int = int(max_pages)
48+
except Exception:
49+
raise Exception("Invalid max_pages value; must be an integer")
50+
pdf_bytes = _trim_pdf_to_max_pages(pdf_bytes, max_pages_int)
51+
2452
# Analyze the PDF
2553
infer_results, all_image_lists, all_pdf_docs, lang_list_result, ocr_enabled_list = pipeline_doc_analyze(
2654
[pdf_bytes],
@@ -77,6 +105,7 @@ async def handler(event):
77105
filename = input_data.get("filename")
78106
timeout = input_data.get("timeout")
79107
created_at = input_data.get("created_at")
108+
max_pages = input_data.get("max_pages")
80109

81110
# Processing options
82111
lang = input_data.get("lang", "en")
@@ -103,6 +132,15 @@ async def handler(event):
103132
if not filename.lower().endswith('.pdf'):
104133
return {"error": "Only PDF files supported", "status": "ERROR"}
105134

135+
# Validate max_pages if provided
136+
if max_pages is not None:
137+
try:
138+
max_pages = int(max_pages)
139+
if max_pages <= 0:
140+
return {"error": "max_pages must be a positive integer", "status": "ERROR"}
141+
except Exception:
142+
return {"error": "Invalid max_pages; must be an integer", "status": "ERROR"}
143+
106144
# Process PDF
107145
pdf_bytes = base64.b64decode(base64_content)
108146

@@ -112,7 +150,8 @@ async def handler(event):
112150
lang=lang,
113151
parse_method=parse_method,
114152
formula_enable=formula_enable,
115-
table_enable=table_enable
153+
table_enable=table_enable,
154+
max_pages=max_pages
116155
)
117156

118157
return {"markdown": md_content, "status": "SUCCESS"}

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ readme = "README.md"
99
requires-python = ">=3.10,<3.14"
1010
dependencies = [
1111
"mineru[pipeline] (>=2.1.11,<3.0.0)",
12-
"runpod (>=1.7.12,<2.0.0)"
12+
"runpod (>=1.7.12,<2.0.0)",
13+
"pypdf (>=4.2.0,<6.0.0)"
1314
]
1415

1516

0 commit comments

Comments
 (0)