@@ -148,6 +148,20 @@ def main():
         type=str,
         help="Output directory for debugging",
     )
+    # Add the next two arguments to improve performance on the GPU
+    parser.add_argument(
+        "--gpu-memory-utilization",
+        type=float,
+        default=0.60,
+        help="Target fraction of GPU memory vLLM can use for model + KV cache",
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=1024,
+        help="Maximum sequence length used to size the KV cache",
+    )
+
     args = parser.parse_args()

     images: list[str] = args.images or []
@@ -204,8 +218,11 @@ def main():
         revision=args.revision,
         limit_mm_per_prompt={"image": len(images), "video": len(videos)},
         enforce_eager=True,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        max_model_len=args.max_model_len,
     )

+
     # Process inputs
     processor: transformers.Qwen2_5_VLProcessor = (
         transformers.AutoProcessor.from_pretrained(args.model)
@@ -239,14 +256,22 @@ def main():
239256 "mm_processor_kwargs" : video_kwargs ,
240257 }
241258 outputs = llm .generate ([llm_inputs ], sampling_params = sampling_params )
259+
242260 print (SEPARATOR )
261+ full_texts = []
243262 for output in outputs [0 ].outputs :
244263 output_text = output .text
264+ full_texts .append (output_text )
245265 print ("Assistant:" )
246266 print (textwrap .indent (output_text .rstrip (), " " ))
247267 print (SEPARATOR )
248268
249- result , _ = extract_tagged_text (output_text )
269+ # Debug: show raw length so we know if it’s really short
270+ print (f"[DEBUG] Total outputs: { len (full_texts )} " )
271+ print (f"[DEBUG] Last output length: { len (full_texts [- 1 ]) if full_texts else 0 } " )
272+
273+ result , _ = extract_tagged_text (full_texts [- 1 ])
274+
250275 if args .verbose and result :
251276 pprint_dict (result , "Result" )
252277