feat(act): add a config option for adding a bottleneck layer after encoding joint states #402

Draft · wants to merge 2 commits into base: main
2 changes: 2 additions & 0 deletions lerobot/common/policies/act/configuration_act.py
@@ -76,6 +76,7 @@ class ACTConfig:
            documentation in the policy class).
        latent_dim: The VAE's latent dimension.
        n_vae_encoder_layers: The number of transformer layers to use for the VAE's encoder.
        use_joint_state_bottleneck_layer: When true, replaces the VAE encoder's joint-state input
            projection with a bottleneck MLP to help reduce overfitting to proprioception.
        temporal_ensemble_coeff: Coefficient for the exponential weighting scheme to apply for temporal
            ensembling. Defaults to None which means temporal ensembling is not used. `n_action_steps` must be
            1 when using this feature, as inference needs to happen at every step to form an ensemble. For
@@ -135,6 +136,7 @@ class ACTConfig:
    use_vae: bool = True
    latent_dim: int = 32
    n_vae_encoder_layers: int = 4
    use_joint_state_bottleneck_layer: bool = False

    # Inference.
    # Note: the value used in ACT when temporal ensembling is enabled is 0.01.
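Since `ACTConfig` is a plain dataclass, the new flag can also be toggled programmatically rather than through a yaml config. A minimal sketch, assuming the defaults shown in this file:

```python
from lerobot.common.policies.act.configuration_act import ACTConfig

# Enable the bottleneck on an otherwise-default ACT configuration.
config = ACTConfig(use_joint_state_bottleneck_layer=True)
assert config.use_vae and config.latent_dim == 32  # other defaults are unchanged
```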
16 changes: 12 additions & 4 deletions lerobot/common/policies/act/modeling_act.py
@@ -37,7 +37,6 @@
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.normalize import Normalize, Unnormalize


class ACTPolicy(
    nn.Module,
    PyTorchModelHubMixin,
@@ -300,9 +299,18 @@ def __init__(self, config: ACTConfig):
            self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
            # Projection layer for joint-space configuration to hidden dimension.
            if self.use_robot_state:
                if config.use_joint_state_bottleneck_layer:
                    self.vae_encoder_robot_state_input_proj = nn.Sequential(
                        # Project the joint state up to the hidden dimension.
                        nn.Linear(config.input_shapes["observation.state"][0], config.dim_model),
                        nn.ReLU(),
                        # Squeeze through a low-dimensional bottleneck to limit the
                        # proprioceptive information available to the encoder.
                        nn.Linear(config.dim_model, 4),
                        nn.ReLU(),
                        # Expand back to the hidden dimension.
                        nn.Linear(4, config.dim_model),
                    )
                else:
                    self.vae_encoder_robot_state_input_proj = nn.Linear(
                        config.input_shapes["observation.state"][0], config.dim_model
                    )
            # Projection layer for action (joint-space target) to hidden dimension.
            self.vae_encoder_action_input_proj = nn.Linear(
                config.output_shapes["action"][0], config.dim_model
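As a sanity check on the shapes involved, here is a standalone sketch of the two projection variants above. The 14-dim joint state and batch size are illustrative assumptions (e.g. a bimanual ALOHA setup), not values fixed by this diff:

```python
import torch
from torch import nn

state_dim, dim_model = 14, 512  # assumed for illustration; dim_model matches the configs below

# Existing behavior: a single linear projection.
plain_proj = nn.Linear(state_dim, dim_model)

# This PR's variant: squeeze through a 4-dim bottleneck before expanding back.
bottleneck_proj = nn.Sequential(
    nn.Linear(state_dim, dim_model),
    nn.ReLU(),
    nn.Linear(dim_model, 4),
    nn.ReLU(),
    nn.Linear(4, dim_model),
)

x = torch.randn(8, state_dim)  # a batch of joint states
assert plain_proj(x).shape == bottleneck_proj(x).shape == (8, dim_model)
```

Both variants produce an embedding of the same shape, so the rest of the VAE encoder is unaffected; the 4-dim squeeze simply caps how much proprioceptive information can pass through.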
1 change: 1 addition & 0 deletions lerobot/configs/policy/act.yaml
@@ -73,6 +73,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_koch_real.yaml
@@ -93,6 +93,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_momentum: null
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_real.yaml
@@ -105,6 +105,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null
110 changes: 110 additions & 0 deletions lerobot/configs/policy/act_real_bottleneck.yaml
@@ -0,0 +1,110 @@
# @package _global_

# The `act_real_bottleneck.yaml` config enables a bottleneck layer after the joint-state projection
# to help reduce overfitting to proprioception.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
# policy=act_real_bottleneck \
# env=dora_aloha_real
# ```

seed: 1000
dataset_repo_id: lerobot/aloha_static_vinh_cup

override_dataset_stats:
  observation.images.cam_right_wrist:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_left_wrist:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_high:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_low:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)

training:
  offline_steps: 100000
  online_steps: 0
  eval_freq: -1
  save_freq: 20000
  save_checkpoint: true

  batch_size: 8
  lr: 1e-5
  lr_backbone: 1e-5
  weight_decay: 1e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"

eval:
  n_episodes: 50
  batch_size: 50

# See `configuration_act.py` for more details.
policy:
  name: act

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100 # chunk_size
  n_action_steps: 100

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.cam_right_wrist: [3, 480, 640]
    observation.images.cam_left_wrist: [3, 480, 640]
    observation.images.cam_high: [3, 480, 640]
    observation.images.cam_low: [3, 480, 640]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.images.cam_right_wrist: mean_std
    observation.images.cam_left_wrist: mean_std
    observation.images.cam_high: mean_std
    observation.images.cam_low: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # Architecture.
  # Vision backbone.
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  # Transformer layers.
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
  n_decoder_layers: 1
  # VAE.
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: true

  # Inference.
  temporal_ensemble_coeff: null

  # Training and loss computation.
  dropout: 0.1
  kl_weight: 10.0
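To verify which projection an instantiated policy actually uses, one can presumably inspect the module directly. A sketch assuming the `ACTPolicy`/`ACT` structure and attribute names shown in the `modeling_act.py` diff above:

```python
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.act.modeling_act import ACTPolicy

policy = ACTPolicy(ACTConfig(use_joint_state_bottleneck_layer=True))
# With the flag enabled this prints the three-layer bottleneck MLP;
# with the default (False) it prints a single nn.Linear.
print(policy.model.vae_encoder_robot_state_input_proj)
```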
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_real_no_state.yaml
@@ -101,6 +101,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null