@@ -1,4 +1,4 @@
-# Copyright 2024 Bytedance Ltd. and/or its affiliates
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,118 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
14- """
15- Contains utilities/classes for on-policy distillation
16- """
1714
18- from typing import Union , Optional , Callable , Any
19- from enum import Enum
20- from omegaconf import DictConfig
21- from verl .workers .config import ActorConfig
2215import torch
23- import torch .nn .functional as F
24- from tensordict import TensorDict
25- from verl .utils import tensordict_utils as tu
26-
27-
28-
29- class Stage (Enum ):
30- """
31- Stages for PPO training
32- """
33- OLD_LOG_PROB = "old_log_prob"
34- REF_LOG_PROB = "ref_log_prob"
35- ACTOR_UPDATE = "actor_update"
36-
37- @classmethod
38- def get_topk_keys (cls , stage : Union [str , "Stage" ]):
39- if isinstance (stage , str ):
40- stage = cls (stage )
41- return f"{ stage .value } _topk_log_probs" , f"{ stage .value } _topk_indices"
42-
43-
44- def topk_logprobs_from_logits (logits : torch .Tensor , k : int , compute_both : bool , topk_indices : Optional [torch .Tensor ] = None ) -> tuple [torch .Tensor , torch .Tensor ]:
45- logprobs = F .log_softmax (logits , dim = - 1 )
46-
47- needs_dedupe = False
48- if compute_both :
49- if topk_indices is None or topk_indices .shape [- 1 ] == k :
50- should_compute_topk = True
51- elif topk_indices .shape [- 1 ] == 2 * k :
52- should_compute_topk = False
53- else :
54- raise ValueError (f"{ topk_indices .shape = } is not expected with { k = } " )
55- else :
56- if topk_indices is None :
57- should_compute_topk = True
58- elif topk_indices .shape [- 1 ] == k :
59- should_compute_topk = False
60- else :
61- raise ValueError (f"{ topk_indices .shape = } is not expected with { k = } " )
62-
63-
64- topk_logprobs_ls = []
65- topk_logprobs_indices_ls = []
66-
67- # Gather logits for provided indices.
68- if topk_indices is not None :
69- topk_logprobs = torch .gather (logprobs , dim = - 1 , index = topk_indices )
70- topk_logprobs_ls .append (topk_logprobs )
71- topk_logprobs_indices_ls .append (topk_indices )
72-
73- # Compute top-k logprobs.
74- if should_compute_topk :
75- topk_logprobs , topk_indices = torch .topk (logprobs , k = k , dim = - 1 )
76- topk_logprobs_ls .append (topk_logprobs )
77- topk_logprobs_indices_ls .append (topk_indices )
78-
79- topk_logprobs = torch .cat (topk_logprobs_ls , dim = - 1 )
80- topk_indices = torch .cat (topk_logprobs_indices_ls , dim = - 1 )
81-
82- # If top-k have been provided AND new top-k have been computed, we need to deduplicate the indices and logprobs.
83- if needs_dedupe :
84-
85- # Make sure indices are sorted so that we can identify duplicates.
86- topk_indices_diff = topk_indices .diff (dim = - 1 )
87- if topk_indices_diff .lt (0 ).any ():
88- topk_indices , sort_indices = topk_indices .sort (dim = - 1 )
89- topk_logprobs = torch .gather (topk_logprobs , dim = - 1 , index = sort_indices )
90- topk_indices_diff = topk_indices .diff (dim = - 1 )
91-
92- # Find duplicate indices and set their prob to ~0.
93- if topk_indices_diff .eq (0 ).any ():
94- index_diffs = torch .nn .functional .pad (topk_indices_diff , (0 , 1 ), value = 1 )
95- dupe_mask = index_diffs .eq (0 )
96- topk_logprobs [dupe_mask ] = - torch .inf
97-
98- return topk_logprobs , topk_indices
99-
100- def compute_topk_outputs (logits : torch .Tensor , batch : TensorDict , cu_seqlens : torch .Tensor ):
101- """
102- TODO: Docstring for compute_topk_outputs
103- """
104- stage = batch ["stage" ]
105- topk_logprobs , topk_indices = topk_logprobs_from_logits (logits = logits , k = 2 , compute_both = True , topk_indices = batch .get ("topk_indices" , None ))
106- topk_logprobs_key , topk_indices_key = Stage .get_topk_keys (stage )
107- output = {
108- topk_logprobs_key : torch .nested .nested_tensor_from_jagged (topk_logprobs .squeeze (0 ), cu_seqlens ),
109- topk_indices_key : torch .nested .nested_tensor_from_jagged (topk_indices .squeeze (0 ), cu_seqlens ),
110- }
111- return output
112-
113- def gather_topk_outputs (stage : Stage , output : TensorDict ):
114- """
115- TODO: Docstring for gather_topk_outputs
116- """
117- topk_logprobs_key , topk_indices_key = Stage .get_topk_keys (stage )
118- topk_logprobs = tu .get (output , topk_logprobs_key )
119- if topk_logprobs is not None :
120- return {
121- topk_logprobs_key : topk_logprobs .float (),
122- topk_indices_key : tu .get (output , topk_indices_key ),
123- }
124- else :
125- return {}
16+ from typing import Callable , Optional , Any
17+ from omegaconf import DictConfig
18+ from verl .workers .config import DistillationConfig
12619
12720# TODO: Update args
12821DistillationLossFn = Callable [
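
For context on the code deleted in the hunk above: `topk_logprobs_from_logits` builds the union of externally supplied top-k indices and a freshly computed top-k over the logits, then sorts by index and pushes duplicated entries to log-probability -inf (note that in the removed version the dedupe branch is guarded by `needs_dedupe`, which is initialized to False and never set). Below is a minimal, self-contained sketch of that idea; the function name, the toy shapes, and the unconditional deduplication are assumptions of the sketch, not part of verl's API.

# Illustrative sketch only, not the removed implementation.
import torch
import torch.nn.functional as F

def topk_union_logprobs(logits, k, provided_indices=None):
    logprobs = F.log_softmax(logits, dim=-1)
    lp_chunks, idx_chunks = [], []
    if provided_indices is not None:
        # Reuse indices chosen elsewhere (e.g. by the teacher policy).
        lp_chunks.append(torch.gather(logprobs, -1, provided_indices))
        idx_chunks.append(provided_indices)
    own_lp, own_idx = torch.topk(logprobs, k=k, dim=-1)
    lp_chunks.append(own_lp)
    idx_chunks.append(own_idx)
    lp = torch.cat(lp_chunks, dim=-1)
    idx = torch.cat(idx_chunks, dim=-1)
    # Deduplicate: sort by index, then mask repeated entries to prob ~0.
    idx, order = idx.sort(dim=-1)
    lp = torch.gather(lp, -1, order)
    dupes = F.pad(idx.diff(dim=-1), (0, 1), value=1).eq(0)
    return lp.masked_fill(dupes, float("-inf")), idx

logits = torch.randn(1, 5, 32)                 # (batch, seq_len, vocab)
teacher_idx = torch.randint(0, 32, (1, 5, 2))  # indices provided by another stage
lp, idx = topk_union_logprobs(logits, k=2, provided_indices=teacher_idx)
print(lp.shape, idx.shape)                     # both torch.Size([1, 5, 4])
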
@@ -132,7 +25,7 @@ def gather_topk_outputs(stage: Stage, output: TensorDict):
         torch.Tensor,  # advantages
         torch.Tensor,  # response_mask
         str,  # loss_agg_mode
-        Optional[DictConfig | ActorConfig],  # config
+        Optional[DictConfig | DistillationConfig],  # config
         torch.Tensor | None,  # rollout_log_probs
     ],
     tuple[torch.Tensor, dict[str, Any]],
@@ -174,14 +67,14 @@ def get_distillation_loss_fn(name):
         )
     return DISTILLATION_LOSS_REGISTRY[loss_name]

-from verl.workers.config import DistillationConfig
-
 @register_distillation_loss("student_kl_topk")  # type: ignore[arg-type]
 def compute_distillation_loss_student_kl_topk(
     teacher_log_probs: torch.Tensor,
     student_log_probs: torch.Tensor,
     teacher_topk_logprobs: torch.Tensor,
     student_topk_logprobs: torch.Tensor,
+    teacher_topk_indices: torch.Tensor,
+    student_topk_indices: torch.Tensor,
     response_mask: torch.Tensor,
     config: DistillationConfig,
     loss_agg_mode: str = "token-mean",
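
The hunk above registers the loss via `register_distillation_loss` and looks it up through `get_distillation_loss_fn`, which returns `DISTILLATION_LOSS_REGISTRY[loss_name]`; the registry itself sits outside the shown hunks. A minimal sketch of the registry pattern these names imply, assuming the decorator simply records the function in a module-level dict (only the registry, decorator, and lookup names come from the diff, the rest is illustrative):

from typing import Any, Callable

DISTILLATION_LOSS_REGISTRY: dict[str, Callable[..., Any]] = {}

def register_distillation_loss(name: str):
    """Register a distillation loss function under `name`."""
    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        DISTILLATION_LOSS_REGISTRY[name] = fn
        return fn
    return decorator

def get_distillation_loss_fn(loss_name: str):
    """Look up a registered loss, failing loudly on unknown names."""
    if loss_name not in DISTILLATION_LOSS_REGISTRY:
        raise ValueError(
            f"Unknown distillation loss {loss_name!r}; registered: {list(DISTILLATION_LOSS_REGISTRY)}"
        )
    return DISTILLATION_LOSS_REGISTRY[loss_name]
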
@@ -198,6 +91,12 @@ def compute_distillation_loss_student_kl_topk(
             Top-k log-probabilities of actions under the teacher policy, shape (batch_size, response_length, topk).
         student_topk_logprobs (torch.Tensor):
             Top-k log-probabilities of actions under the student policy, shape (batch_size, response_length, topk).
+        teacher_topk_indices (torch.Tensor):
+            Top-k action indices under the teacher policy, shape (batch_size, response_length, topk).
+        student_topk_indices (torch.Tensor):
+            Top-k action indices under the student policy, shape (batch_size, response_length, topk).
+        response_mask (torch.Tensor):
+            Mask indicating which tokens to include in the loss, shape (batch_size, response_length).
         config: `(verl.trainer.config.DistillationConfig)`:
             config for the actor.
         loss_agg_mode (str, optional):
@@ -207,6 +106,7 @@ def compute_distillation_loss_student_kl_topk(
         loss_agg_mode (str, optional):
             Aggregation mode for `agg_loss`. Defaults to "token-mean".
     """
+    breakpoint()
     assert config is not None
     topk = config.topk
     if teacher_topk_logprobs.shape[-1] != topk or student_topk_logprobs.shape[-1] != topk:
@@ -220,4 +120,6 @@ def compute_distillation_loss_student_kl_topk(
     # "actor/ppo_kl": ppo_kl.detach().item(),
     # "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(),
     # }
-    return distillation_loss, distillation_metrics
+    return distillation_loss, distillation_metrics
+
+
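
The body of `compute_distillation_loss_student_kl_topk` lies mostly outside the hunks above, so only the signature and docstring changes are visible: the commit threads `teacher_topk_indices` and `student_topk_indices` through, presumably so each policy's log-probabilities can be gathered on the other's top-k support. As a reading aid, here is an illustrative sketch of what a "student KL over top-k" loss can look like, under the assumptions that both log-prob tensors are already aligned on a shared top-k support, that the divergence is KL(student || teacher), and that aggregation is "token-mean"; the helper name and these semantics are assumptions of the sketch, not taken from the commit.

import torch

def student_kl_topk_sketch(
    teacher_topk_logprobs: torch.Tensor,  # (B, T, k), teacher log-probs on the shared top-k tokens
    student_topk_logprobs: torch.Tensor,  # (B, T, k), student log-probs on the same tokens
    response_mask: torch.Tensor,          # (B, T), 1.0 for response tokens that count
) -> tuple[torch.Tensor, dict]:
    # Per-token reverse KL restricted to the top-k support:
    #   sum_k p_student * (log p_student - log p_teacher)
    student_probs = student_topk_logprobs.exp()
    per_token_kl = (student_probs * (student_topk_logprobs - teacher_topk_logprobs)).sum(dim=-1)
    # "token-mean" aggregation over valid response tokens.
    loss = (per_token_kl * response_mask).sum() / response_mask.sum().clamp(min=1.0)
    return loss, {"distill/student_kl_topk": loss.detach().item()}

# Toy usage with random distributions truncated to their first k entries.
B, T, k = 2, 4, 2
teacher = torch.log_softmax(torch.randn(B, T, 8), dim=-1)[..., :k]
student = torch.log_softmax(torch.randn(B, T, 8), dim=-1)[..., :k]
loss, metrics = student_kl_topk_sketch(teacher, student, torch.ones(B, T))
print(loss.item(), metrics)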