Merge pull request #399 from facebookresearch/yvsriram/ovmm_im_fixes

yvsriram · web-flow · commit 89b212fd1490 · 2023-09-29T16:58:47.000-04:00
Get 2d association for IM working in OVMM envs
diff --git a/projects/habitat_ovmm/configs/agent/heuristic_instance_tracking_agent.yaml b/projects/habitat_ovmm/configs/agent/heuristic_instance_tracking_agent.yaml
@@ -24,6 +24,7 @@ SEMANTIC_MAP:
   dilate_size: 3
   dilate_iter: 1
   record_instance_ids: True
+  instance_association: map_overlap
   max_instances: 0
 
 SKILLS:
diff --git a/src/home_robot/home_robot/agent/objectnav_agent/objectnav_agent.py b/src/home_robot/home_robot/agent/objectnav_agent/objectnav_agent.py
@@ -293,6 +293,8 @@ def reset_vectorized(self):
         self.last_poses = [np.zeros(3)] * self.num_environments
         self.semantic_map.init_map_and_pose()
         self.episode_panorama_start_steps = self.panorama_start_steps
+        if self.record_instance_ids:
+            self.instance_memory.reset()
         self.planner.reset()
 
     def reset_vectorized_for_env(self, e: int):
@@ -302,6 +304,8 @@ def reset_vectorized_for_env(self, e: int):
         self.last_poses[e] = np.zeros(3)
         self.semantic_map.init_map_and_pose_for_env(e)
         self.episode_panorama_start_steps = self.panorama_start_steps
+        if self.record_instance_ids:
+            self.instance_memory.reset_for_env(e)
         self.planner.reset()
 
     # ---------------------------------------------------------------------
diff --git a/src/home_robot/home_robot/mapping/semantic/categorical_2d_semantic_map_module.py b/src/home_robot/home_robot/mapping/semantic/categorical_2d_semantic_map_module.py
@@ -70,7 +70,7 @@ def __init__(
         evaluate_instance_tracking: bool = False,
         instance_memory: Optional[InstanceMemory] = None,
         max_instances: int = 0,
-        instance_association: str = "bbox_iou",
+        instance_association: str = "map_overlap",
         dilation_for_instances: int = 5,
         padding_for_instance_overlap: int = 5,
     ):
@@ -816,6 +816,7 @@ def _update_global_map_instances_for_one_channel(
             extended_dilated_local_map,
             global_instances_within_local,
             max_instance_id,
+            torch.unique(extended_local_map),
         )
 
         # Update the global map with the associated instances from the local map
@@ -838,21 +839,23 @@ def _get_local_to_global_instance_mapping(
         extended_local_labels: Tensor,
         global_instances_within_local: Tensor,
         max_instance_id: int,
+        local_instance_ids: Tensor,
     ) -> dict:
         """
         Creates a mapping of local instance IDs to global instance IDs.
 
         Args:
             extended_local_labels: Labels of instances in the extended local map.
             global_instances_within_local: Instances from the global map within the local map's region.
-
+            max_instance_id: The number of instance ids that are used up
+            local_instance_ids: The local instance ids for which local to global mapping is to be determined
         Returns:
             A mapping of local instance IDs to global instance IDs.
         """
         instance_mapping = {}
 
         # Associate instances in the local map with corresponding instances in the global map
-        for local_instance_id in torch.unique(extended_local_labels):
+        for local_instance_id in local_instance_ids:
             if local_instance_id == 0:
                 # ignore 0 as it does not correspond to an instance
                 continue
@@ -879,7 +882,7 @@ def _get_local_to_global_instance_mapping(
             self.instance_memory.add_view_to_instance(
                 env_id, int(local_instance_id.item()), global_instance_id
             )
-        instance_mapping[0] = 0
+        instance_mapping[0.0] = 0
         return instance_mapping
 
     def _update_global_map_instances(
diff --git a/src/home_robot/home_robot/mapping/semantic/instance_tracking_modules.py b/src/home_robot/home_robot/mapping/semantic/instance_tracking_modules.py
@@ -26,7 +26,7 @@
     get_box_bounds_from_verts,
     get_box_verts_from_bounds,
 )
-from home_robot.utils.image import dilate_or_erode_mask
+from home_robot.utils.image import dilate_or_erode_mask, interpolate_image
 from home_robot.utils.point_cloud_torch import get_bounds
 from home_robot.utils.voxel import drop_smallest_weight_points
 
@@ -76,6 +76,10 @@ class InstanceView:
     cam_to_world: Tensor = None
     """[4,4] Tensor pose matrix mapping camera space to world space"""
 
+    # Where did we observe this from
+    pose: Tensor = None
+    """ Base pose of the robot when this view was collected"""
+
     @cached_property
     def object_coverage(self):
         return float(self.mask.sum()) / self.mask.size
@@ -271,7 +275,7 @@ def __init__(
         self,
         num_envs: int,
         du_scale: int,
-        instance_association: str = "bbox_iou",
+        instance_association: str = "map_overlap",
         instance_association_within_class: bool = True,
         iou_threshold: float = 0.8,
         global_box_nms_thresh: float = 0.0,
@@ -647,7 +651,8 @@ def associate_instances_to_memory(self):
                         match_within_category=self.instance_association_within_class,
                     )
                     if global_instance_id is None:
-                        global_instance_id = len(self.instances[env_id])
+                        # start ids from 1
+                        global_instance_id = len(self.instances[env_id]) + 1
                     self.add_view_to_instance(
                         env_id, local_instance_id, global_instance_id
                     )
@@ -658,7 +663,8 @@ def associate_instances_to_memory(self):
                         match_within_category=self.instance_association_within_class,
                     )
                     if global_instance_id is None:
-                        global_instance_id = len(self.instances[env_id])
+                        # start ids from 1
+                        global_instance_id = len(self.instances[env_id]) + 1
                     self.add_view_to_instance(
                         env_id, local_instance_id, global_instance_id
                     )
@@ -752,6 +758,7 @@ def process_instances_for_env(
         mask_out_object: bool = True,
         background_instance_label: int = 0,
         valid_points: Optional[Tensor] = None,
+        pose: Optional[Tensor] = None,
     ):
         """
         Process instance information in the current frame and add instance views to the list of unprocessed views for future association.
@@ -764,7 +771,7 @@ def process_instances_for_env(
             instance_seg (Tensor): [H, W] tensor of instance ids at each pixel
             point_cloud (Tensor): Point cloud data in world coordinates.
             image (Tensor): [3, H, W] RGB image
-            pose: 4x4 camera_space_to_world transform
+            cam_to_world: 4x4 camera_space_to_world transform
             instance_classes (Optional[Tensor]): [K,] class ids for each instance in instance seg
                 class_int = instance_classes[instance_id]
             instance_scores (Optional[Tensor]): [K,] detection confidences for each instance in instance_seg
@@ -773,6 +780,7 @@ def process_instances_for_env(
                 # If false does it not save crops? Not black background?
             background_class_label(int): id indicating background points in instance_seg. That view is not saved. (default = 0)
             valid_points (Tensor): [H, W] boolean tensor indicating valid points in the pointcloud
+            pose: (Optional[Tensor]): base pose of the agent at this timestep
         Note:
             - The method creates instance views for detected instances within the provided data.
             - If a semantic segmentation tensor is provided, each instance is associated with a semantic category.
@@ -788,35 +796,29 @@ def process_instances_for_env(
         ), "Ensure that RGB images are channels-first and in the right format."
 
         self.unprocessed_views[env_id] = {}
-        # append image to list of images
+        # append image to list of images; move tensors to cpu to prevent memory from blowing up
         if self.images[env_id] is None:
-            self.images[env_id] = image.unsqueeze(0)
+            self.images[env_id] = image.unsqueeze(0).detach().cpu()
         else:
             self.images[env_id] = torch.cat(
-                [self.images[env_id], image.unsqueeze(0)], dim=0
+                [self.images[env_id], image.unsqueeze(0).detach().cpu()], dim=0
             )
         if self.point_cloud[env_id] is None:
-            self.point_cloud[env_id] = point_cloud.unsqueeze(0)
+            self.point_cloud[env_id] = point_cloud.unsqueeze(0).detach().cpu()
         else:
             self.point_cloud[env_id] = torch.cat(
-                [self.point_cloud[env_id], point_cloud.unsqueeze(0)], dim=0
+                [self.point_cloud[env_id], point_cloud.unsqueeze(0).detach().cpu()],
+                dim=0,
             )
 
-        # Valid opints
+        # Valid points
         if valid_points is None:
-            valid_points = torch.full(
-                image.shape[:, 0], True, dtype=torch.bool, device=image.device
+            valid_points = torch.full_like(
+                image[0], True, dtype=torch.bool, device=image.device
             )
         if self.du_scale != 1:
-            valid_points_downsampled = (
-                torch.nn.functional.interpolate(
-                    valid_points.unsqueeze(0).unsqueeze(0).float(),
-                    scale_factor=1 / self.du_scale,
-                    mode="nearest",
-                )
-                .squeeze(0)
-                .squeeze(0)
-                .bool()
+            valid_points_downsampled = interpolate_image(
+                valid_points, scale_factor=1 / self.du_scale
             )
         else:
             valid_points_downsampled = valid_points
@@ -863,20 +865,15 @@ def process_instances_for_env(
 
             # TODO: If we use du_scale, we should apply this at the beginning to speed things up
             if self.du_scale != 1:
-                # downsample mask by du_scale using "NEAREST"
-                instance_mask_downsampled = (
-                    torch.nn.functional.interpolate(
-                        instance_mask.unsqueeze(0).unsqueeze(0).float(),
-                        scale_factor=1 / self.du_scale,
-                        mode="nearest",
-                    )
-                    .squeeze(0)
-                    .squeeze(0)
-                    .bool()
+                instance_mask_downsampled = interpolate_image(
+                    instance_mask, scale_factor=1 / self.du_scale
+                )
+                image_downsampled = interpolate_image(
+                    image, scale_factor=1 / self.du_scale
                 )
-
             else:
                 instance_mask_downsampled = instance_mask
+                image_downsampled = image
 
             # Erode instance masks for point cloud
             # TODO: We can do erosion and masking on the downsampled/cropped image to avoid unnecessary computation
@@ -912,7 +909,9 @@ def process_instances_for_env(
                 instance_mask_downsampled & valid_points_downsampled
             )
             point_cloud_instance = point_cloud[point_mask_downsampled]
-            point_cloud_rgb_instance = image.permute(1, 2, 0)[point_mask_downsampled]
+            point_cloud_rgb_instance = image_downsampled.permute(1, 2, 0)[
+                point_mask_downsampled
+            ]
 
             n_points = point_mask_downsampled.sum()
             n_mask = instance_mask_downsampled.sum()
@@ -941,6 +940,7 @@ def process_instances_for_env(
                         category_id=category_id,
                         score=score,
                         bounds=bounds,  # .cpu().numpy(),
+                        pose=pose,
                     )
                     # append instance view to list of instance views
                     self.unprocessed_views[env_id][instance_id.item()] = instance_view
@@ -970,9 +970,10 @@ def process_instances(
         self,
         instance_channels: Tensor,
         point_cloud: Tensor,
-        pose: torch.Tensor,
         image: Tensor,
+        cam_to_world: Optional[Tensor] = None,
         semantic_channels: Optional[Tensor] = None,
+        pose: Optional[Tensor] = None,
     ):
         """
         Process instance information across environments and associate instance views with global instances.
@@ -1005,9 +1006,10 @@ def process_instances(
                 env_id,
                 instance_segs[env_id],
                 point_cloud[env_id],
-                pose[env_id],
                 image[env_id],
+                cam_to_world=cam_to_world[env_id] if cam_to_world is not None else None,
                 semantic_seg=semantic_seg,
+                pose=pose[env_id] if pose is not None else None,
             )
         self.associate_instances_to_memory()
 
diff --git a/src/home_robot/home_robot/mapping/voxel/voxel.py b/src/home_robot/home_robot/mapping/voxel/voxel.py
@@ -348,12 +348,13 @@ def add(
             instance_seg=instance,
             point_cloud=full_world_xyz.reshape(H, W, 3),
             image=rgb.permute(2, 0, 1),
-            cam_to_world=base_pose,
+            cam_to_world=camera_pose,
             instance_classes=instance_classes,
             instance_scores=instance_scores,
             mask_out_object=False,  # Save the whole image here? Or is this with background?
             background_instance_label=self.background_instance_label,
             valid_points=valid_depth,
+            pose=base_pose,
         )
         self.instances.associate_instances_to_memory()
 
diff --git a/src/home_robot/home_robot/utils/image.py b/src/home_robot/home_robot/utils/image.py
@@ -348,3 +348,28 @@ def get_cropped_image_with_padding(self, image, bbox, padding: float = 1.0):
         x:x2,
     ]
     return cropped_image
+
+
+def interpolate_image(image: Tensor, scale_factor: float = 1.0, mode: str = "nearest"):
+    """
+    Resizes `image` by `scale_factor` with `torch.nn.functional.interpolate`, temporarily adding a batch dimension (and a channel dimension for 2D inputs).
+    Returns a boolean tensor: the input is cast to float, all size-1 dims of the result are squeezed, and the result is cast to bool. NOTE(review): process_instances_for_env also passes the RGB image here, so its values collapse to True/False -- confirm this is intended.
+    image (Tensor): image of shape [3, H, W] or [H, W]
+    scale_factor (float): multiplier for spatial size
+    mode (str): algorithm for interpolation: 'nearest' (default), 'bicubic' or other interpolation modes at https://pytorch.org/docs/stable/generated/torch.nn.functional.interpolate.html
+    """
+
+    if len(image.shape) == 2:  # 2D input: add a leading channel dim first
+        image = image.unsqueeze(0)
+
+    image_downsampled = (
+        torch.nn.functional.interpolate(
+            image.unsqueeze(0).float(),  # add the batch dim; interpolate is run on a float tensor
+            scale_factor=scale_factor,
+            mode=mode,
+        )
+        .squeeze()  # no dim argument: removes every size-1 dim, not only the ones added above
+        .squeeze()
+        .bool()  # output dtype is always bool, whatever the input dtype was
+    )
+    return image_downsampled
diff --git a/src/home_robot_sim/home_robot_sim/env/habitat_objectnav_env/visualizer.py b/src/home_robot_sim/home_robot_sim/env/habitat_objectnav_env/visualizer.py
@@ -501,7 +501,7 @@ def _visualize_instance_counts(
         '"""
         num_instances_per_category = defaultdict(int)
         num_views_per_instance = defaultdict(list)
-        for instance_id, instance in instance_memory.instance_views[0].items():
+        for instance_id, instance in instance_memory.instances[0].items():
             num_instances_per_category[instance.category_id.item()] += 1
             num_views_per_instance[instance.category_id.item()].append(
                 len(instance.instance_views)