stepjam
diff --git a/‎README.md‎
Lines changed: 6 additions & 0 deletions b/‎README.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎arm/qte/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎arm/qte/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎arm/qte/launch_utils.py‎
Lines changed: 72 additions & 0 deletions b/‎arm/qte/launch_utils.py‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎arm/qte/networks.py‎
Lines changed: 200 additions & 0 deletions b/‎arm/qte/networks.py‎
Lines changed: 200 additions & 0 deletions
@@ -5,6 +5,7 @@ Codebase of Q-attention, coarse-to-fine Q-attention, and other variants. Code fr
 - [Q-attention: Enabling Efficient Learning for Vision-based Robotic Manipulation](https://arxiv.org/abs/2105.14829) (ARM system)
 - [Coarse-to-Fine Q-attention: Efficient Learning for Visual Robotic Manipulation via Discretisation](https://arxiv.org/abs/2106.12534) (C2F-ARM system)
 - [Coarse-to-Fine Q-attention with Learned Path Ranking](https://arxiv.org/abs/2204.01571) (C2F-ARM+LPR system)
+- [Coarse-to-Fine Q-attention with Tree Expansion](https://arxiv.org/abs/2204.12471) (C2F-ARM+QTE system)
 
 ![task grid image missing](readme_files/variants.png)
 
@@ -42,3 +43,8 @@ To launch **C2F-ARM+LPR**:
 ```bash
 python launch.py method=LPR rlbench.task=take_lid_off_saucepan rlbench.demo_path=/mnt/my/save/dir framework.gpu=0
 ```
+
+To launch **C2F-ARM+QTE**:
+```bash
+python launch.py method=QTE rlbench.task=take_lid_off_saucepan rlbench.demo_path=/mnt/my/save/dir framework.gpu=0
+```
@@ -0,0 +1 @@
+import arm.qte.launch_utils
@@ -0,0 +1,72 @@
+from omegaconf import DictConfig
+
+from arm.qte.networks import Qattention3DNet
+from arm.qte.qattention_agent import QAttentionAgent
+from arm.c2farm.qattention_stack_agent import QAttentionStackAgent
+from arm.preprocess_agent import PreprocessAgent
+
+
+def create_agent(cfg: DictConfig, env, depth_0bounds=None, cam_resolution=None):
+    VOXEL_FEATS = 3
+    LATENT_SIZE = 64
+    cam_resolution = cam_resolution or [128, 128]
+
+    include_prev_layer = False
+
+    num_rotation_classes = int(360. // cfg.method.rotation_resolution)
+    qattention_agents = []
+    for depth, vox_size in enumerate(cfg.method.voxel_sizes):
+        last = depth == len(cfg.method.voxel_sizes) - 1
+        unet3d = Qattention3DNet(
+            in_channels=VOXEL_FEATS + 3 + 1 + 3,
+            out_channels=1,
+            voxel_size=vox_size,
+            timesteps=cfg.replay.timesteps,
+            out_dense=((num_rotation_classes * 3) + 2) if last else 0,
+            kernels=LATENT_SIZE,
+            norm=None if 'None' in cfg.method.norm else cfg.method.norm,
+            dense_feats=128,
+            activation=cfg.method.activation,
+            low_dim_size=env.low_dim_state_len,
+            include_prev_layer=include_prev_layer and depth > 0)
+
+
+        qattention_agent = QAttentionAgent(
+            layer=depth,
+            coordinate_bounds=depth_0bounds,
+            unet3d=unet3d,
+            camera_names=cfg.rlbench.cameras,
+            voxel_size=vox_size,
+            bounds_offset=cfg.method.bounds_offset[depth - 1] if depth > 0 else None,
+            image_crop_size=cfg.method.image_crop_size,
+            tau=cfg.method.tau,
+            lr=cfg.method.lr,
+            lambda_trans_qreg=cfg.method.lambda_trans_qreg,
+            lambda_rot_qreg=cfg.method.lambda_rot_qreg,
+            include_low_dim_state=True,
+            image_resolution=cam_resolution,
+            batch_size=cfg.replay.batch_size,
+            timesteps=cfg.replay.timesteps,
+            voxel_feature_size=VOXEL_FEATS,
+            exploration_strategy=cfg.method.exploration_strategy,
+            lambda_weight_l2=cfg.method.lambda_weight_l2,
+            num_rotation_classes=num_rotation_classes,
+            rotation_resolution=cfg.method.rotation_resolution,
+            grad_clip=0.01,
+            gamma=0.99,
+            tree_search_breadth=cfg.method.tree_search_breadth,
+            tree_during_update=cfg.method.tree_during_update,
+            tree_during_act=cfg.method.tree_during_act
+        )
+        qattention_agents.append(qattention_agent)
+
+    for i in range(len(qattention_agents) - 1):
+        qattention_agents[i].give_next_layer_qattention(qattention_agents[i + 1])
+
+    rotation_agent = QAttentionStackAgent(
+        qattention_agents=qattention_agents,
+        rotation_resolution=cfg.method.rotation_resolution,
+        camera_names=cfg.rlbench.cameras,
+    )
+    preprocess_agent = PreprocessAgent(pose_agent=rotation_agent)
+    return preprocess_agent
@@ -0,0 +1,200 @@
+import torch
+import torch.nn as nn
+
+from arm.network_utils import Conv3DInceptionBlock, DenseBlock, SpatialSoftmax3D, \
+    Conv3DInceptionBlockUpsampleBlock, Conv3DBlock
+
+
+class Qattention3DNet(nn.Module):
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 out_dense: int,
+                 voxel_size: int,
+                 low_dim_size: int,
+                 kernels: int,
+                 timesteps: int,
+                 norm: str = None,
+                 activation: str = 'relu',
+                 dense_feats: int = 32,
+                 include_prev_layer = False,):
+        super(Qattention3DNet, self).__init__()
+        self._in_channels = in_channels
+        self._out_channels = out_channels
+        self._norm = norm
+        self._activation = activation
+        self._kernels = kernels
+        self._timesteps = timesteps
+        self._low_dim_size = low_dim_size * timesteps
+        self._build_calls = 0
+        self._voxel_size = voxel_size
+        self._dense_feats = dense_feats
+        self._out_dense = out_dense
+        self._include_prev_layer = include_prev_layer
+
+    def build(self):
+        use_residual = False
+        self._build_calls += 1
+        if self._build_calls != 1:
+            raise RuntimeError('Build needs to be called once.')
+
+        spatial_size = self._voxel_size
+        self._input_preprocess = Conv3DInceptionBlock(
+            self._in_channels, self._kernels, norm=self._norm,
+            activation=self._activation)
+
+        d0_ins = self._input_preprocess.out_channels * self._timesteps
+        if self._include_prev_layer:
+            PREV_VOXEL_CHANNELS = 0
+            self._input_preprocess_prev_layer = Conv3DInceptionBlock(
+                self._in_channels + PREV_VOXEL_CHANNELS, self._kernels, norm=self._norm,
+                activation=self._activation)
+            d0_ins += self._input_preprocess_prev_layer.out_channels
+
+        if self._low_dim_size > 0:
+            self._proprio_preprocess = DenseBlock(
+                self._low_dim_size, self._kernels, None, self._activation)
+            d0_ins += self._kernels
+
+        self._down0 = Conv3DInceptionBlock(
+            d0_ins, self._kernels, norm=self._norm,
+            activation=self._activation, residual=use_residual)
+        self._ss0 = SpatialSoftmax3D(
+            spatial_size, spatial_size, spatial_size,
+            self._down0.out_channels)
+        spatial_size //= 2
+        self._down1 = Conv3DInceptionBlock(
+            self._down0.out_channels, self._kernels * 2, norm=self._norm,
+            activation=self._activation, residual=use_residual)
+        self._ss1 = SpatialSoftmax3D(
+            spatial_size, spatial_size, spatial_size,
+            self._down1.out_channels)
+        spatial_size //= 2
+
+        flat_size = self._down0.out_channels * 4 + self._down1.out_channels * 4
+
+        k1 = self._down1.out_channels
+        if self._voxel_size > 8:
+            k1 += self._kernels
+            self._down2 = Conv3DInceptionBlock(
+                self._down1.out_channels, self._kernels * 4, norm=self._norm,
+                activation=self._activation,  residual=use_residual)
+            flat_size += self._down2.out_channels * 4
+            self._ss2 = SpatialSoftmax3D(
+                spatial_size, spatial_size, spatial_size,
+                self._down2.out_channels)
+            spatial_size //= 2
+            k2 = self._down2.out_channels
+            if self._voxel_size > 16:
+                k2 *= 2
+                self._down3 = Conv3DInceptionBlock(
+                    self._down2.out_channels, self._kernels, norm=self._norm,
+                    activation=self._activation, residual=use_residual)
+                flat_size += self._down3.out_channels * 4
+                self._ss3 = SpatialSoftmax3D(
+                    spatial_size, spatial_size, spatial_size,
+                    self._down3.out_channels)
+                self._up3 = Conv3DInceptionBlockUpsampleBlock(
+                    self._kernels, self._kernels, 2, norm=self._norm,
+                    activation=self._activation, residual=use_residual)
+            self._up2 = Conv3DInceptionBlockUpsampleBlock(
+                k2, self._kernels, 2, norm=self._norm,
+                activation=self._activation, residual=use_residual)
+
+        self._up1 = Conv3DInceptionBlockUpsampleBlock(
+            k1, self._kernels, 2, norm=self._norm,
+            activation=self._activation, residual=use_residual)
+
+        self._global_maxp = nn.AdaptiveMaxPool3d(1)
+        self._local_maxp = nn.MaxPool3d(3, 2, padding=1)
+        self._final = Conv3DBlock(
+            self._kernels * 2, self._kernels, kernel_sizes=3,
+            strides=1, norm=self._norm, activation=self._activation)
+        self._final2 = Conv3DBlock(
+            self._kernels, self._out_channels, kernel_sizes=3,
+            strides=1, norm=None, activation=None)
+
+        self._ss_final = SpatialSoftmax3D(
+            self._voxel_size, self._voxel_size, self._voxel_size,
+            self._kernels)
+        flat_size += self._kernels * 4
+
+        if self._out_dense > 0:
+            self._dense0 = DenseBlock(
+                flat_size, self._dense_feats, None, self._activation)
+            self._dense1 = DenseBlock(
+                self._dense_feats, self._dense_feats, None, self._activation)
+            self._dense2 = DenseBlock(
+                self._dense_feats, self._out_dense, None, None)
+
+    def forward(self, ins, proprio, prev_layer_voxel_grid):
+        b, t, _, d, h, w = ins.shape
+        x = torch.cat([self._input_preprocess(x_) for x_ in ins.unbind(1)], 1)
+
+        if self._include_prev_layer:
+            y = self._input_preprocess_prev_layer(prev_layer_voxel_grid)
+            x = torch.cat([x, y], dim=1)
+
+        if self._low_dim_size > 0:
+            p = self._proprio_preprocess(proprio)
+            p = p.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).repeat(
+                1, 1, d, h, w)
+            x = torch.cat([x, p], dim=1)
+
+        d0 = self._down0(x)
+        ss0 = self._ss0(d0)
+        maxp0 = self._global_maxp(d0).view(b, -1)
+        d1 = u = self._down1(self._local_maxp(d0))
+        ss1 = self._ss1(d1)
+        maxp1 = self._global_maxp(d1).view(b, -1)
+
+        feats = [ss0, maxp0, ss1, maxp1]
+
+        if self._voxel_size > 8:
+            d2 = u = self._down2(self._local_maxp(d1))
+            feats.extend([self._ss2(d2), self._global_maxp(d2).view(b, -1)])
+            if self._voxel_size > 16:
+                d3 = self._down3(self._local_maxp(d2))
+                feats.extend([self._ss3(d3), self._global_maxp(d3).view(b, -1)])
+                u3 = self._up3(d3)
+                u = torch.cat([d2, u3], dim=1)
+            u2 = self._up2(u)
+            u = torch.cat([d1, u2], dim=1)
+
+        u1 = self._up1(u)
+        f1 = self._final(torch.cat([d0, u1], dim=1))
+        trans = self._final2(f1)
+
+        feats.extend([self._ss_final(f1), self._global_maxp(f1).view(b, -1)])
+
+        self.latent_dict = {
+            'd0': d0.mean(-1).mean(-1).mean(-1),
+            'd1': d1.mean(-1).mean(-1).mean(-1),
+            'u1': u1.mean(-1).mean(-1).mean(-1),
+            'trans_out': trans,
+        }
+
+        rot_and_grip_out = None
+        if self._out_dense > 0:
+            dense0 = self._dense0(torch.cat(feats, 1))
+            dense1 = self._dense1(dense0)
+            rot_and_grip_out = self._dense2(dense1)
+            self.latent_dict.update({
+                'dense0': dense0,
+                'dense1': dense1,
+                'dense2': rot_and_grip_out,
+            })
+
+        if self._voxel_size > 8:
+            self.latent_dict.update({
+                'd2': d2.mean(-1).mean(-1).mean(-1),
+                'u2': u2.mean(-1).mean(-1).mean(-1),
+            })
+        if self._voxel_size > 16:
+            self.latent_dict.update({
+                'd3': d3.mean(-1).mean(-1).mean(-1),
+                'u3': u3.mean(-1).mean(-1).mean(-1),
+            })
+
+        return trans, rot_and_grip_out