Commit 19481c9

inference: add CLI overrides for fps/total_pixels and vLLM memory knobs
1 parent: 8743e1b


1 file changed (+26 −1): scripts/inference.py (mode changed 100755 → 100644)
@@ -148,6 +148,20 @@ def main():
         type=str,
         help="Output directory for debugging",
     )
+    # Adding the next two arguments to improve performance on the GPU
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.60,
+        help="Target fraction of GPU memory vLLM can use for model + KV cache",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=1024,
+        help="Maximum sequence length for sizing KV cache",
+    )
+
     args = parser.parse_args()

     images: list[str] = args.images or []
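
For context, the two new flags are plain argparse options; below is a minimal standalone sketch of how they parse, kept outside the script. The flag names and defaults are taken from the diff above, while the 0.80 / 2048 values are arbitrary examples.

import argparse

# Same flag names and defaults as in the diff; everything else is omitted.
parser = argparse.ArgumentParser()
parser.add_argument("--gpu-memory-utilization", type=float, default=0.60)
parser.add_argument("--max-model-len", type=int, default=1024)

# argparse maps dashes to underscores, so the script later reads
# args.gpu_memory_utilization and args.max_model_len.
args = parser.parse_args(["--gpu-memory-utilization", "0.80", "--max-model-len", "2048"])
print(args.gpu_memory_utilization, args.max_model_len)  # 0.8 2048
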
@@ -204,8 +218,11 @@ def main():
         revision=args.revision,
         limit_mm_per_prompt={"image": len(images), "video": len(videos)},
         enforce_eager=True,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        max_model_len=args.max_model_len,
     )

+
     # Process inputs
     processor: transformers.Qwen2_5_VLProcessor = (
         transformers.AutoProcessor.from_pretrained(args.model)
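
Both keyword arguments are standard knobs on vLLM's LLM constructor: gpu_memory_utilization caps the fraction of GPU memory vLLM claims for model weights plus KV cache, and max_model_len bounds the sequence length the KV cache is sized for. A minimal standalone sketch is shown below, independent of this script; the model id is only an illustrative guess, not taken from the commit.

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # illustrative model id, not from the diff
    enforce_eager=True,                    # skip CUDA graph capture
    gpu_memory_utilization=0.60,           # claim at most ~60% of GPU memory
    max_model_len=1024,                    # size the KV cache for 1024-token sequences
)

outputs = llm.generate(["Describe the scene."], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
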
@@ -239,14 +256,22 @@ def main():
         "mm_processor_kwargs": video_kwargs,
     }
     outputs = llm.generate([llm_inputs], sampling_params=sampling_params)
+
     print(SEPARATOR)
+    full_texts = []
     for output in outputs[0].outputs:
         output_text = output.text
+        full_texts.append(output_text)
         print("Assistant:")
         print(textwrap.indent(output_text.rstrip(), " "))
         print(SEPARATOR)

-    result, _ = extract_tagged_text(output_text)
+    # Debug: show raw length so we know if it's really short
+    print(f"[DEBUG] Total outputs: {len(full_texts)}")
+    print(f"[DEBUG] Last output length: {len(full_texts[-1]) if full_texts else 0}")
+
+    result, _ = extract_tagged_text(full_texts[-1])
+
     if args.verbose and result:
         pprint_dict(result, "Result")
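
The [DEBUG] prints only report character counts; to tell a genuinely short answer apart from one cut off by the token budget, vLLM also records a finish_reason on every completion. A small helper one could add for that is sketched below; it is not part of the commit and simply expects the RequestOutput object the script already has as outputs[0].

def report_completions(request_output):
    # Each entry in RequestOutput.outputs is a CompletionOutput carrying the
    # generated text and a finish_reason ("stop", "length", "abort", ...).
    for i, completion in enumerate(request_output.outputs):
        note = " (hit the token limit)" if completion.finish_reason == "length" else ""
        print(f"[DEBUG] output {i}: {len(completion.text)} chars, "
              f"finish_reason={completion.finish_reason}{note}")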
