From 6a823afe764c64a77f5c9ce3bb9f917789f31ee9 Mon Sep 17 00:00:00 2001
From: Ville Kuosmanen
Date: Mon, 2 Sep 2024 10:49:12 +0100
Subject: [PATCH 1/2] feat(act): add a config option for adding a bottleneck
 layer after encoding joint states

---
 .../common/policies/act/configuration_act.py  |   2 +
 lerobot/common/policies/act/modeling_act.py   |  16 ++-
 .../configs/policy/act_real_bottleneck.yaml   | 110 ++++++++++++++++++
 3 files changed, 124 insertions(+), 4 deletions(-)
 create mode 100644 lerobot/configs/policy/act_real_bottleneck.yaml

diff --git a/lerobot/common/policies/act/configuration_act.py b/lerobot/common/policies/act/configuration_act.py
index a86c359c9..4d0851f2b 100644
--- a/lerobot/common/policies/act/configuration_act.py
+++ b/lerobot/common/policies/act/configuration_act.py
@@ -76,6 +76,7 @@ class ACTConfig:
             documentation in the policy class).
         latent_dim: The VAE's latent dimension.
         n_vae_encoder_layers: The number of transformer layers to use for the VAE's encoder.
+        use_joint_state_bottleneck_layer: When true, adds a bottleneck layer after encoding the joint states to try to reduce overfitting to proprioception.
         temporal_ensemble_coeff: Coefficient for the exponential weighting scheme to apply for temporal
             ensembling. Defaults to None which means temporal ensembling is not used. `n_action_steps` must be
             1 when using this feature, as inference needs to happen at every step to form an ensemble. For
@@ -135,6 +136,7 @@
     use_vae: bool = True
     latent_dim: int = 32
     n_vae_encoder_layers: int = 4
+    use_joint_state_bottleneck_layer: bool = False
 
     # Inference.
     # Note: the value used in ACT when temporal ensembling is enabled is 0.01.
diff --git a/lerobot/common/policies/act/modeling_act.py b/lerobot/common/policies/act/modeling_act.py
index 3427c4829..5d97343b2 100644
--- a/lerobot/common/policies/act/modeling_act.py
+++ b/lerobot/common/policies/act/modeling_act.py
@@ -37,7 +37,6 @@
 from lerobot.common.policies.act.configuration_act import ACTConfig
 from lerobot.common.policies.normalize import Normalize, Unnormalize
 
-
 class ACTPolicy(
     nn.Module,
     PyTorchModelHubMixin,
@@ -300,9 +299,18 @@ def __init__(self, config: ACTConfig):
             self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
             # Projection layer for joint-space configuration to hidden dimension.
             if self.use_robot_state:
-                self.vae_encoder_robot_state_input_proj = nn.Linear(
-                    config.input_shapes["observation.state"][0], config.dim_model
-                )
+                if config.use_joint_state_bottleneck_layer:
+                    self.vae_encoder_robot_state_input_proj = nn.Sequential(
+                        nn.Linear(config.input_shapes["observation.state"][0], config.dim_model),  # first linear layer
+                        nn.ReLU(),  # activation function after the first layer
+                        nn.Linear(config.dim_model, 4),  # bottleneck layer with reduced dimensionality
+                        nn.ReLU(),  # activation function after the bottleneck layer
+                        nn.Linear(4, config.dim_model),  # expand back to the original desired dimensionality
+                    )
+                else:
+                    self.vae_encoder_robot_state_input_proj = nn.Linear(
+                        config.input_shapes["observation.state"][0], config.dim_model
+                    )
             # Projection layer for action (joint-space target) to hidden dimension.
             self.vae_encoder_action_input_proj = nn.Linear(
                 config.output_shapes["action"][0], config.dim_model
diff --git a/lerobot/configs/policy/act_real_bottleneck.yaml b/lerobot/configs/policy/act_real_bottleneck.yaml
new file mode 100644
index 000000000..0b8022265
--- /dev/null
+++ b/lerobot/configs/policy/act_real_bottleneck.yaml
@@ -0,0 +1,110 @@
+# @package _global_
+
+# The `act_real_bottleneck.yaml` adds a bottleneck layer to try to reduce overfitting on joint states.
+#
+# Example of usage for training:
+# ```bash
+# python lerobot/scripts/train.py \
+#   policy=act_real_bottleneck \
+#   env=dora_aloha_real
+# ```
+
+seed: 1000
+dataset_repo_id: lerobot/aloha_static_vinh_cup
+
+override_dataset_stats:
+  observation.images.cam_right_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_left_wrist:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_high:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+  observation.images.cam_low:
+    # stats from imagenet, since we use a pretrained vision model
+    mean: [[[0.485]], [[0.456]], [[0.406]]]  # (c,1,1)
+    std: [[[0.229]], [[0.224]], [[0.225]]]  # (c,1,1)
+
+training:
+  offline_steps: 100000
+  online_steps: 0
+  eval_freq: -1
+  save_freq: 20000
+  save_checkpoint: true
+
+  batch_size: 8
+  lr: 1e-5
+  lr_backbone: 1e-5
+  weight_decay: 1e-4
+  grad_clip_norm: 10
+  online_steps_between_rollouts: 1
+
+  delta_timestamps:
+    action: "[i / ${fps} for i in range(${policy.chunk_size})]"
+
+eval:
+  n_episodes: 50
+  batch_size: 50
+
+# See `configuration_act.py` for more details.
+policy:
+  name: act
+
+  # Input / output structure.
+  n_obs_steps: 1
+  chunk_size: 100  # chunk_size
+  n_action_steps: 100
+
+  input_shapes:
+    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
+    observation.images.cam_right_wrist: [3, 480, 640]
+    observation.images.cam_left_wrist: [3, 480, 640]
+    observation.images.cam_high: [3, 480, 640]
+    observation.images.cam_low: [3, 480, 640]
+    observation.state: ["${env.state_dim}"]
+  output_shapes:
+    action: ["${env.action_dim}"]
+
+  # Normalization / Unnormalization
+  input_normalization_modes:
+    observation.images.cam_right_wrist: mean_std
+    observation.images.cam_left_wrist: mean_std
+    observation.images.cam_high: mean_std
+    observation.images.cam_low: mean_std
+    observation.state: mean_std
+  output_normalization_modes:
+    action: mean_std
+
+  # Architecture.
+  # Vision backbone.
+  vision_backbone: resnet18
+  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
+  replace_final_stride_with_dilation: false
+  # Transformer layers.
+  pre_norm: false
+  dim_model: 512
+  n_heads: 8
+  dim_feedforward: 3200
+  feedforward_activation: relu
+  n_encoder_layers: 4
+  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
+  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
+  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
+  n_decoder_layers: 1
+  # VAE.
+  use_vae: true
+  latent_dim: 32
+  n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: true
+
+  # Inference.
+  temporal_ensemble_coeff: null
+
+  # Training and loss computation.
+  dropout: 0.1
+  kl_weight: 10.0

From 919b493cd307ae9831c61a652fdcfd65edf3cc51 Mon Sep 17 00:00:00 2001
From: Ville Kuosmanen
Date: Mon, 2 Sep 2024 11:06:50 +0100
Subject: [PATCH 2/2] add false to other act configs

---
 lerobot/configs/policy/act.yaml               | 1 +
 lerobot/configs/policy/act_koch_real.yaml     | 1 +
 lerobot/configs/policy/act_real.yaml          | 1 +
 lerobot/configs/policy/act_real_no_state.yaml | 1 +
 4 files changed, 4 insertions(+)

diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml
index 28883936a..23176f834 100644
--- a/lerobot/configs/policy/act.yaml
+++ b/lerobot/configs/policy/act.yaml
@@ -73,6 +73,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
diff --git a/lerobot/configs/policy/act_koch_real.yaml b/lerobot/configs/policy/act_koch_real.yaml
index fd4bf3b59..845ccbf6d 100644
--- a/lerobot/configs/policy/act_koch_real.yaml
+++ b/lerobot/configs/policy/act_koch_real.yaml
@@ -93,6 +93,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_momentum: null
diff --git a/lerobot/configs/policy/act_real.yaml b/lerobot/configs/policy/act_real.yaml
index 058104f4d..b7cd9f53c 100644
--- a/lerobot/configs/policy/act_real.yaml
+++ b/lerobot/configs/policy/act_real.yaml
@@ -105,6 +105,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
diff --git a/lerobot/configs/policy/act_real_no_state.yaml b/lerobot/configs/policy/act_real_no_state.yaml
index 082610503..d51c1b1c2 100644
--- a/lerobot/configs/policy/act_real_no_state.yaml
+++ b/lerobot/configs/policy/act_real_no_state.yaml
@@ -101,6 +101,7 @@ policy:
   use_vae: true
   latent_dim: 32
   n_vae_encoder_layers: 4
+  use_joint_state_bottleneck_layer: false
 
   # Inference.
   temporal_ensemble_coeff: null
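
For reference, here is a minimal standalone sketch of the bottleneck projection this patch wires into the VAE encoder. The dimensions are illustrative: `state_dim=14` and the batch size are assumptions for this sketch, while `dim_model=512` and the bottleneck width of 4 come from the default config and the patched code.

```python
import torch
from torch import nn

# Illustrative values; in practice state_dim would come from
# config.input_shapes["observation.state"][0] and dim_model from ACTConfig.
state_dim, dim_model, bottleneck_dim = 14, 512, 4

# Same structure as the patched vae_encoder_robot_state_input_proj: project
# the joint states up to the model width, squeeze them through a 4-dim
# bottleneck, then expand back, limiting how much proprioceptive detail
# reaches the VAE encoder.
proj = nn.Sequential(
    nn.Linear(state_dim, dim_model),
    nn.ReLU(),
    nn.Linear(dim_model, bottleneck_dim),
    nn.ReLU(),
    nn.Linear(bottleneck_dim, dim_model),
)

states = torch.randn(8, state_dim)  # (batch, state_dim)
out = proj(states)
print(out.shape)  # torch.Size([8, 512])
```

Note that the bottleneck width is hard-coded to 4 in the patch rather than exposed as a config field alongside `use_joint_state_bottleneck_layer`.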