Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 31 additions & 30 deletions verl/utils/dataset/rl_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,48 +188,50 @@ def maybe_filter_out_long_prompts(self, dataframe: datasets.Dataset = None):
video_key = self.video_key

if processor is not None:
from verl.utils.dataset.vision_utils import process_image, process_video
# Use the same vision processing function as Agent Loop for consistency
from qwen_vl_utils import process_vision_info

def doc2len(doc) -> int:

def doc2len(doc) -> int:
try:
messages = self._build_messages(doc)
# pass tool schemas if available so the processor can format prompts
apply_kwargs = dict(**self.apply_chat_template_kwargs)
if self.tool_schemas is not None:
apply_kwargs["tools"] = self.tool_schemas

raw_prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False, **apply_kwargs
# Use the same vision processing as Agent Loop (process_vision_info)
# Use processor's actual patch_size instead of config default
actual_patch_size = self.processor.image_processor.patch_size if hasattr(self.processor, 'image_processor') else self.image_patch_size
images, videos = process_vision_info(
messages, image_patch_size=actual_patch_size, return_video_metadata=True
)
if image_key in doc and doc[image_key]:
images = [
process_image(image, image_patch_size=self.image_patch_size) for image in doc[image_key]
]
else:
images = None

if video_key in doc and doc[video_key]:
videos, video_metadata = zip(
*[
process_video(
video, image_patch_size=self.image_patch_size, return_video_metadata=True
)
for video in doc[video_key]
],
strict=True,
)

# Extract video metadata (same as Agent Loop)
if videos:
videos, video_metadatas = zip(*videos, strict=True)
videos = list(videos)
video_metadata = list(video_metadata)
videos_kwargs = {"video_metadata": video_metadata, "do_sample_frames": False}
video_metadatas = list(video_metadatas)
else:
videos = None
videos_kwargs = {}
video_metadatas = None

return len(
processor(text=[raw_prompt], images=images, videos=videos, videos_kwargs=videos_kwargs)[
"input_ids"
][0]
raw_prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=True, tokenize=False, **apply_kwargs
)

# Match Agent Loop's processor call exactly
prompt_len = len(
processor(
text=[raw_prompt],
images=images,
videos=videos,
video_metadatas=video_metadatas,
return_tensors="pt",
do_sample_frames=False,
)["input_ids"][0]
)

return prompt_len
except Exception:
print("Error processing one of the samples, skipping...")
traceback.print_exc()
Expand Down Expand Up @@ -257,7 +259,6 @@ def doc2len(doc) -> int:
desc=f"Filtering prompts longer than {self.max_prompt_length} tokens",
)

print(f"filter dataset len: {len(dataframe)}")
return dataframe

def resume_dataset_state(self):
Expand Down