From 6a823afe764c64a77f5c9ce3bb9f917789f31ee9 Mon Sep 17 00:00:00 2001
From: Ville Kuosmanen
Date: Mon, 2 Sep 2024 10:49:12 +0100
Subject: [PATCH 1/2] feat(act): add a config option for adding a bottleneck
 layer after encoding joint states

---
 .../common/policies/act/configuration_act.py  |   2 +
 lerobot/common/policies/act/modeling_act.py   |  16 ++-
 .../configs/policy/act_real_bottleneck.yaml   | 110 ++++++++++++++++++
 3 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 lerobot/configs/policy/act_real_bottleneck.yaml

diff --git a/lerobot/common/policies/act/configuration_act.py b/lerobot/common/policies/act/configuration_act.py
index a86c359c9..4d0851f2b 100644
--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -76,6 +76,7 @@ class ACTConfig:
             documentation in the policy class).
         latent_dim: The VAE's latent dimension.
         n_vae_encoder_layers: The number of transformer layers to use for the VAE's encoder.
+        use_joint_state_bottleneck_layer: When true, adds a bottleneck layer after encoding the joint states to try to reduce overfitting to proprioception.
         temporal_ensemble_coeff: Coefficient for the exponential weighting scheme to apply for temporal
             ensembling. Defaults to None which means temporal ensembling is not used. `n_action_steps` must be
             1 when using this feature, as inference needs to happen at every step to form an ensemble. For
@@ -135,6 +136,7 @@
     use_vae: bool = True
     latent_dim: int = 32
     n_vae_encoder_layers: int = 4
+    use_joint_state_bottleneck_layer: bool = False
 
     # Inference.
     # Note: the value used in ACT when temporal ensembling is enabled is 0.01.
diff --git a/lerobot/common/policies/act/modeling_act.py b/lerobot/common/policies/act/modeling_act.py
index 3427c4829..5d97343b2 100644
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -37,7 +37,6 @@
 from lerobot.common.policies.act.configuration_act import ACTConfig
 from lerobot.common.policies.normalize import Normalize, Unnormalize
 
-
 class ACTPolicy(
     nn.Module,
     PyTorchModelHubMixin,
@@ -300,9 +299,18 @@ def __init__(self, config: ACTConfig):
             self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
             # Projection layer for joint-space configuration to hidden dimension.
             if self.use_robot_state:
-                self.vae_encoder_robot_state_input_proj = nn.Linear(
-                    config.input_shapes["observation.state"][0], config.dim_model
-                )
+                if config.use_joint_state_bottleneck_layer:
+                    self.vae_encoder_robot_state_input_proj = nn.Sequential(
+                        nn.Linear(config.input_shapes["observation.state"][0], config.dim_model),  # first linear layer
+                        nn.ReLU(),  # activation function after the first layer
+                        nn.Linear(config.dim_model, 4),  # bottleneck layer with reduced dimensionality
+                        nn.ReLU(),  # activation function after the bottleneck layer
+                        nn.Linear(4, config.dim_model),  # expand back to the original desired dimensionality
+                    )
+                else:
+                    self.vae_encoder_robot_state_input_proj = nn.Linear(
+                        config.input_shapes["observation.state"][0], config.dim_model
+                    )
             # Projection layer for action (joint-space target) to hidden dimension.
             self.vae_encoder_action_input_proj = nn.Linear(
                 config.output_shapes["action"][0], config.dim_model
diff --git a/lerobot/configs/policy/act_real_bottleneck.yaml b/lerobot/configs/policy/act_real_bottleneck.yaml
new file mode 100644
index 000000000..0b8022265
--- /dev/null
+++ b/lerobot/configs/policy/act_real_bottleneck.yaml
@@ -0,0 +1,110 @@
+# @package _global_
+
+# The `act_real_bottleneck.yaml` adds a bottleneck layer to try to reduce overfitting on joint states.
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_real_bottleneck \
+#   env=dora_aloha_real
+# ```
+
+seed: 1000
+dataset_repo_id: lerobot/aloha_static_vinh_cup
+
+override_dataset_stats:
+  observation.images.cam_right_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_left_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_high:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_low:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+
+training:
+  offline_steps: 100000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 20000
+  save_checkpoint: true
+
+  batch_size: 8
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+# See `configuration_act.py` for more details.
+policy:
+  name: act
+
+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100  # chunk_size
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.cam_right_wrist: [3, 480, 640]
+    observation.images.cam_left_wrist: [3, 480, 640]
+    observation.images.cam_high: [3, 480, 640]
+    observation.images.cam_low: [3, 480, 640]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.cam_right_wrist: mean_std
+    observation.images.cam_left_wrist: mean_std
+    observation.images.cam_high: mean_std
+    observation.images.cam_low: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
+  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
+  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: true
+
+  # Inference.
+  temporal_ensemble_coeff: null
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0

From 919b493cd307ae9831c61a652fdcfd65edf3cc51 Mon Sep 17 00:00:00 2001
From: Ville Kuosmanen
Date: Mon, 2 Sep 2024 11:06:50 +0100
Subject: [PATCH 2/2] add false to other act configs

---
 lerobot/configs/policy/act.yaml               | 1 +
 lerobot/configs/policy/act_koch_real.yaml     | 1 +
 lerobot/configs/policy/act_real.yaml          | 1 +
 lerobot/configs/policy/act_real_no_state.yaml | 1 +
 4 files changed, 4 insertions(+)

diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml
index 28883936a..23176f834 100644
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -73,6 +73,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
diff --git a/lerobot/configs/policy/act_koch_real.yaml b/lerobot/configs/policy/act_koch_real.yaml
index fd4bf3b59..845ccbf6d 100644
--- a/lerobot/configs/policy/act_koch_real.yaml
+++ b/lerobot/configs/policy/act_koch_real.yaml
@@ -93,6 +93,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_momentum: null
diff --git a/lerobot/configs/policy/act_real.yaml b/lerobot/configs/policy/act_real.yaml
index 058104f4d..b7cd9f53c 100644
--- a/lerobot/configs/policy/act_real.yaml
+++ b/lerobot/configs/policy/act_real.yaml
@@ -105,6 +105,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
diff --git a/lerobot/configs/policy/act_real_no_state.yaml b/lerobot/configs/policy/act_real_no_state.yaml
index 082610503..d51c1b1c2 100644
--- a/lerobot/configs/policy/act_real_no_state.yaml
+++ b/lerobot/configs/policy/act_real_no_state.yaml
@@ -101,6 +101,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
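
For reference, here is a minimal standalone sketch of the bottleneck projection this patch wires into the VAE encoder. The dimensions are illustrative: `state_dim=14` and the batch size are assumptions for this sketch, while `dim_model=512` and the bottleneck width of 4 come from the default config and the patched code.

```python
import torch
from torch import nn

# Illustrative values; in practice state_dim would come from
# config.input_shapes["observation.state"][0] and dim_model from ACTConfig.
state_dim, dim_model, bottleneck_dim = 14, 512, 4

# Same structure as the patched vae_encoder_robot_state_input_proj: project
# the joint states up to the model width, squeeze them through a 4-dim
# bottleneck, then expand back, limiting how much proprioceptive detail
# reaches the VAE encoder.
proj = nn.Sequential(
    nn.Linear(state_dim, dim_model),
    nn.ReLU(),
    nn.Linear(dim_model, bottleneck_dim),
    nn.ReLU(),
    nn.Linear(bottleneck_dim, dim_model),
)

states = torch.randn(8, state_dim)  # (batch, state_dim)
out = proj(states)
print(out.shape)  # torch.Size([8, 512])
```

Note that the bottleneck width is hard-coded to 4 in the patch rather than exposed as a config field alongside `use_joint_state_bottleneck_layer`.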