feat(act): add a config option for adding a bottleneck layer after encoding joint states #402

Draft · wants to merge 2 commits into base: main
2 changes: 2 additions & 0 deletions lerobot/common/policies/act/configuration_act.py
@@ -76,6 +76,7 @@ class ACTConfig:
            documentation in the policy class).
        latent_dim: The VAE's latent dimension.
        n_vae_encoder_layers: The number of transformer layers to use for the VAE's encoder.
        use_joint_state_bottleneck_layer: When true, replaces the VAE encoder's joint-state input
            projection with a bottleneck MLP to help reduce overfitting to proprioception.
        temporal_ensemble_coeff: Coefficient for the exponential weighting scheme to apply for temporal
            ensembling. Defaults to None which means temporal ensembling is not used. `n_action_steps` must be
            1 when using this feature, as inference needs to happen at every step to form an ensemble. For
@@ -135,6 +136,7 @@ class ACTConfig:
    use_vae: bool = True
    latent_dim: int = 32
    n_vae_encoder_layers: int = 4
    use_joint_state_bottleneck_layer: bool = False

    # Inference.
    # Note: the value used in ACT when temporal ensembling is enabled is 0.01.
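Since `ACTConfig` is a plain dataclass, the new flag can also be toggled programmatically rather than through a yaml config. A minimal sketch, assuming the defaults shown in this file:

```python
from lerobot.common.policies.act.configuration_act import ACTConfig

# Enable the bottleneck on an otherwise-default ACT configuration.
config = ACTConfig(use_joint_state_bottleneck_layer=True)
assert config.use_vae and config.latent_dim == 32  # other defaults are unchanged
```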
16 changes: 12 additions & 4 deletions lerobot/common/policies/act/modeling_act.py
@@ -37,7 +37,6 @@
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.normalize import Normalize, Unnormalize


class ACTPolicy(
    nn.Module,
    PyTorchModelHubMixin,
@@ -300,9 +299,18 @@ def __init__(self, config: ACTConfig):
            self.vae_encoder_cls_embed = nn.Embedding(1, config.dim_model)
            # Projection layer for joint-space configuration to hidden dimension.
            if self.use_robot_state:
                if config.use_joint_state_bottleneck_layer:
                    self.vae_encoder_robot_state_input_proj = nn.Sequential(
                        # Project the joint state up to the hidden dimension.
                        nn.Linear(config.input_shapes["observation.state"][0], config.dim_model),
                        nn.ReLU(),
                        # Squeeze through a low-dimensional bottleneck to limit the
                        # proprioceptive information available to the encoder.
                        nn.Linear(config.dim_model, 4),
                        nn.ReLU(),
                        # Expand back to the hidden dimension.
                        nn.Linear(4, config.dim_model),
                    )
                else:
                    self.vae_encoder_robot_state_input_proj = nn.Linear(
                        config.input_shapes["observation.state"][0], config.dim_model
                    )
            # Projection layer for action (joint-space target) to hidden dimension.
            self.vae_encoder_action_input_proj = nn.Linear(
                config.output_shapes["action"][0], config.dim_model
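As a sanity check on the shapes involved, here is a standalone sketch of the two projection variants above. The 14-dim joint state and batch size are illustrative assumptions (e.g. a bimanual ALOHA setup), not values fixed by this diff:

```python
import torch
from torch import nn

state_dim, dim_model = 14, 512  # assumed for illustration; dim_model matches the configs below

# Existing behavior: a single linear projection.
plain_proj = nn.Linear(state_dim, dim_model)

# This PR's variant: squeeze through a 4-dim bottleneck before expanding back.
bottleneck_proj = nn.Sequential(
    nn.Linear(state_dim, dim_model),
    nn.ReLU(),
    nn.Linear(dim_model, 4),
    nn.ReLU(),
    nn.Linear(4, dim_model),
)

x = torch.randn(8, state_dim)  # a batch of joint states
assert plain_proj(x).shape == bottleneck_proj(x).shape == (8, dim_model)
```

Both variants produce an embedding of the same shape, so the rest of the VAE encoder is unaffected; the 4-dim squeeze simply caps how much proprioceptive information can pass through.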
1 change: 1 addition & 0 deletions lerobot/configs/policy/act.yaml
@@ -73,6 +73,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_koch_real.yaml
@@ -93,6 +93,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_momentum: null
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_real.yaml
@@ -105,6 +105,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null
110 changes: 110 additions & 0 deletions lerobot/configs/policy/act_real_bottleneck.yaml
@@ -0,0 +1,110 @@
# @package _global_

# The `act_real_bottleneck.yaml` config enables a bottleneck layer after the joint-state projection
# to help reduce overfitting to proprioception.
#
# Example of usage for training:
# ```bash
# python lerobot/scripts/train.py \
# policy=act_real_bottleneck \
# env=dora_aloha_real
# ```

seed: 1000
dataset_repo_id: lerobot/aloha_static_vinh_cup

override_dataset_stats:
  observation.images.cam_right_wrist:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_left_wrist:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_high:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)
  observation.images.cam_low:
    # stats from imagenet, since we use a pretrained vision model
    mean: [[[0.485]], [[0.456]], [[0.406]]] # (c,1,1)
    std: [[[0.229]], [[0.224]], [[0.225]]] # (c,1,1)

training:
  offline_steps: 100000
  online_steps: 0
  eval_freq: -1
  save_freq: 20000
  save_checkpoint: true

  batch_size: 8
  lr: 1e-5
  lr_backbone: 1e-5
  weight_decay: 1e-4
  grad_clip_norm: 10
  online_steps_between_rollouts: 1

  delta_timestamps:
    action: "[i / ${fps} for i in range(${policy.chunk_size})]"

eval:
  n_episodes: 50
  batch_size: 50

# See `configuration_act.py` for more details.
policy:
  name: act

  # Input / output structure.
  n_obs_steps: 1
  chunk_size: 100 # chunk_size
  n_action_steps: 100

  input_shapes:
    # TODO(rcadene, alexander-soare): add variables for height and width from the dataset/env?
    observation.images.cam_right_wrist: [3, 480, 640]
    observation.images.cam_left_wrist: [3, 480, 640]
    observation.images.cam_high: [3, 480, 640]
    observation.images.cam_low: [3, 480, 640]
    observation.state: ["${env.state_dim}"]
  output_shapes:
    action: ["${env.action_dim}"]

  # Normalization / Unnormalization
  input_normalization_modes:
    observation.images.cam_right_wrist: mean_std
    observation.images.cam_left_wrist: mean_std
    observation.images.cam_high: mean_std
    observation.images.cam_low: mean_std
    observation.state: mean_std
  output_normalization_modes:
    action: mean_std

  # Architecture.
  # Vision backbone.
  vision_backbone: resnet18
  pretrained_backbone_weights: ResNet18_Weights.IMAGENET1K_V1
  replace_final_stride_with_dilation: false
  # Transformer layers.
  pre_norm: false
  dim_model: 512
  n_heads: 8
  dim_feedforward: 3200
  feedforward_activation: relu
  n_encoder_layers: 4
  # Note: Although the original ACT implementation has 7 for `n_decoder_layers`, there is a bug in the code
  # that means only the first layer is used. Here we match the original implementation by setting this to 1.
  # See this issue https://github.com/tonyzhaozh/act/issues/25#issue-2258740521.
  n_decoder_layers: 1
  # VAE.
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: true

  # Inference.
  temporal_ensemble_coeff: null

  # Training and loss computation.
  dropout: 0.1
  kl_weight: 10.0
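To verify which projection an instantiated policy actually uses, one can presumably inspect the module directly. A sketch assuming the `ACTPolicy`/`ACT` structure and attribute names shown in the `modeling_act.py` diff above:

```python
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.act.modeling_act import ACTPolicy

policy = ACTPolicy(ACTConfig(use_joint_state_bottleneck_layer=True))
# With the flag enabled this prints the three-layer bottleneck MLP;
# with the default (False) it prints a single nn.Linear.
print(policy.model.vae_encoder_robot_state_input_proj)
```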
1 change: 1 addition & 0 deletions lerobot/configs/policy/act_real_no_state.yaml
@@ -101,6 +101,7 @@ policy:
  use_vae: true
  latent_dim: 32
  n_vae_encoder_layers: 4
  use_joint_state_bottleneck_layer: false

  # Inference.
  temporal_ensemble_coeff: null