
Commit d790113

Improve JIT backend implementation by adding label compaction
1 parent 01ff01b commit d790113
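
In short, the compaction pass added here marks which labels still occur after label resolution, turns that mask into a dense remap table with a prefix sum, rewrites every pixel's label through the table, and re-derives the per-batch maximum label with an atomic max. Below is a minimal CPU-side sketch of the relabeling step only, assuming a precomputed `remap` table; the function name is illustrative, and the real implementation is the `compact_labels` CubeCL kernel in the diff further down.

// Illustrative sequential sketch only; the actual work happens per pixel in the
// `compact_labels` kernel. `remap[l]` maps an old (sparse) label `l` to its new
// dense label; label 0 is background and is left untouched.
fn relabel_cpu(labels: &mut [u32], remap: &[u32]) -> u32 {
    let mut max_label = 0u32;
    for l in labels.iter_mut() {
        if *l != 0 {
            let new_label = remap[*l as usize];
            *l = new_label;
            // The kernel tracks this with Atomic::max on a per-batch counter.
            max_label = max_label.max(new_label);
        }
    }
    max_label
}

The per-label stats (area, bounding boxes) are then gathered into the same dense index space by the `compact_stats` kernel.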

File tree: 11 files changed, +332 -64 lines


Cargo.lock

-15
Some generated files are not rendered by default.

Cargo.toml
+4 -4

@@ -153,11 +153,11 @@ ahash = { version = "0.8.11", default-features = false }
 portable-atomic-util = { version = "0.2.4", features = ["alloc"] }
 
 ### For the main burn branch. ###
-cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "e0734dadca994b02b7dce3b77a575edb1fb2232e" }
-cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "e0734dadca994b02b7dce3b77a575edb1fb2232e" }
+# cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "e0734dadca994b02b7dce3b77a575edb1fb2232e" }
+# cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "e0734dadca994b02b7dce3b77a575edb1fb2232e" }
 ### For local development. ###
-# cubecl = { path = "../cubecl/crates/cubecl", default-features = false }
-# cubecl-common = { path = "../cubecl/crates/cubecl-common", default-features = false }
+cubecl = { path = "../cubecl/crates/cubecl", default-features = false }
+cubecl-common = { path = "../cubecl/crates/cubecl-common", default-features = false }
 ### For the release. ###
 # cubecl = { version = "0.4.0", default-features = false }
 # cubecl-common = { version = "0.4.0", default-features = false }

crates/burn-jit/src/ops/base.rs
+2 -1

@@ -136,7 +136,8 @@ pub(crate) fn expand<R: JitRuntime>(tensor: JitTensor<R>, target_shape: Shape) -
     }
 }
 
-pub(crate) fn reshape<R: JitRuntime>(tensor: JitTensor<R>, shape: Shape) -> JitTensor<R> {
+/// Reshape a jit tensor to a new shape
+pub fn reshape<R: JitRuntime>(tensor: JitTensor<R>, shape: Shape) -> JitTensor<R> {
     // TODO: Not force standard layout all the time (improve performance).
     let tensor = kernel::into_contiguous(tensor);
 
crates/burn-vision/Cargo.toml
+1 -1

@@ -25,7 +25,7 @@ tch = ["burn-tch"]
 # Test features
 cpu = ["export-tests"]
 cuda = ["jit-backend", "export-tests"]
-vulkan = ["burn-wgpu/vulkan", "wgpu"]
+vulkan = ["burn-wgpu/vulkan", "jit-backend", "export-tests"]
 wgpu = ["jit-backend", "export-tests"]
 
 [dependencies]

crates/burn-vision/src/backends/jit/connected_components/hardware_accelerated.rs
+33 -23

@@ -4,17 +4,17 @@
 //! DASIP, 2018
 
 use crate::{
-    backends::jit::connected_components::stats_from_opts, ConnectedStatsOptions,
-    ConnectedStatsPrimitive, Connectivity,
+    backends::jit::{connected_components::stats_from_opts, prefix_sum::prefix_sum},
+    ConnectedStatsOptions, ConnectedStatsPrimitive, Connectivity,
 };
 use burn_jit::{
     kernel,
     ops::{into_data_sync, numeric::zeros_device},
     tensor::JitTensor,
     BoolElement, FloatElement, IntElement, JitBackend, JitRuntime,
 };
-use burn_tensor::ops::IntTensorOps;
-use cubecl::{calculate_cube_count_elemwise, prelude::*, Feature};
+use burn_tensor::{ops::IntTensorOps, Shape};
+use cubecl::{prelude::*, Feature};
 
 const BLOCK_H: u32 = 4;
 
@@ -380,6 +380,7 @@ fn analysis<I: Int, BT: CubePrimitive>(
         while label != u32::cast_from(labels[b_offs + label]) - 1 {
             label = u32::cast_from(labels[b_offs + label]) - 1;
         }
+        label += 1;
 
         Atomic::add(&area[b_offs + label], I::cast_from(count));
 
@@ -397,13 +398,17 @@ fn analysis<I: Int, BT: CubePrimitive>(
         label = plane_broadcast(label, UNIT_POS_X - s_dist);
 
         if p {
-            labels[labels_index] = I::cast_from(label + 1);
+            labels[labels_index] = I::cast_from(label);
         }
     }
 }
 
 #[cube(launch)]
-fn compact_labels<I: Int>(labels: &mut Tensor<I>, remap: &Tensor<I>) {
+fn compact_labels<I: Int>(
+    labels: &mut Tensor<I>,
+    remap: &Tensor<I>,
+    max_label: &Tensor<Atomic<I>>,
+) {
     let batch = ABSOLUTE_POS_Z;
     let x = ABSOLUTE_POS_X;
     let y = ABSOLUTE_POS_Y;
@@ -416,7 +421,9 @@ fn compact_labels<I: Int>(labels: &mut Tensor<I>, remap: &Tensor<I>) {
 
     let label = u32::cast_from(labels[labels_pos]);
     if label != 0 {
-        labels[labels_pos] = remap[label];
+        let new_label = remap[label];
+        labels[labels_pos] = new_label;
+        Atomic::max(&max_label[batch], new_label);
     }
 }
 
@@ -433,11 +440,9 @@ fn compact_stats<I: Int>(
     bottom: &Tensor<I>,
     bottom_new: &mut Tensor<I>,
     remap: &Tensor<I>,
-    max_label: u32,
-    #[comptime] opts: ConnectedStatsOptions,
 ) {
     let label = ABSOLUTE_POS_X;
-    if label > max_label {
+    if label >= remap.len() {
         terminate!();
     }
 
@@ -448,12 +453,12 @@ fn compact_stats<I: Int>(
     let new_label = u32::cast_from(remap[label]);
 
     area_new[new_label] = area;
-    if opts.bounds_enabled {
-        top_new[new_label] = top[label];
-        left_new[new_label] = left[label];
-        right_new[new_label] = right[label];
-        bottom_new[new_label] = bottom[label];
-    }
+    // This should be gated but there's a problem with the Eq bound only being implemented for tuples
+    // up to 12 elems, so I can't pass the opts. It's not unsafe, but potentially unnecessary work.
+    top_new[new_label] = top[label];
+    left_new[new_label] = left[label];
+    right_new[new_label] = right[label];
+    bottom_new[new_label] = bottom[label];
 }
 
 #[allow(clippy::type_complexity)]
@@ -525,7 +530,7 @@ pub fn hardware_accelerated<R: JitRuntime, F: FloatElement, I: IntElement, BT: B
         batches as u32,
     );
 
-    let stats = stats_from_opts(labels.clone(), stats_opt);
+    let mut stats = stats_from_opts(labels.clone(), stats_opt);
 
     if stats_opt == ConnectedStatsOptions::none() {
         relabeling::launch::<I, BT, R>(
@@ -553,28 +558,35 @@ pub fn hardware_accelerated<R: JitRuntime, F: FloatElement, I: IntElement, BT: B
     if stats_opt.compact_labels {
        let max_labels = into_data_sync::<R, I>(stats.max_label.clone()).convert::<u32>();
        let max_label = *max_labels.as_slice::<u32>().unwrap().iter().max().unwrap() as usize;
-       let sliced = kernel::slice::<R, I>(stats.area.clone(), &[0..batches, 0..max_label + 1]);
+       let sliced = kernel::slice::<R, I>(
+           stats.area.clone(),
+           &[0..batches, 0..(max_label + 1).next_multiple_of(4)],
+       );
        let present = JitBackend::<R, F, I, BT>::int_not_equal_elem(sliced, I::new(0));
-       let relabel = JitBackend::<R, F, I, BT>::int_prefix_sum(present);
+       let present = kernel::cast::<R, BT, I>(present);
+       let relabel = prefix_sum::<R, I>(present);
 
        let cube_dim = CubeDim::default();
        let cube_count = CubeCount::new_3d(
            (cols as u32).div_ceil(cube_dim.x),
            (rows as u32).div_ceil(cube_dim.y),
            batches as u32,
        );
-       compact_labels::launch(
+       stats.max_label =
+           zeros_device::<R, I>(client.clone(), device.clone(), Shape::new([batches]));
+       compact_labels::launch::<I, R>(
           &client,
          cube_count,
           cube_dim,
           labels.as_tensor_arg::<I>(1),
           relabel.as_tensor_arg::<I>(1),
+          stats.max_label.as_tensor_arg::<I>(1),
       );
 
       let cube_dim = CubeDim::new_1d(256);
       let cube_count =
           CubeCount::new_3d((rows * cols).div_ceil(256) as u32, 1, batches as u32);
-      compact_stats::launch(
+      compact_stats::launch::<I, R>(
          &client,
          cube_count,
          cube_dim,
@@ -589,8 +601,6 @@ pub fn hardware_accelerated<R: JitRuntime, F: FloatElement, I: IntElement, BT: B
          stats.bottom.copy().as_tensor_arg::<I>(1),
          stats.bottom.as_tensor_arg::<I>(1),
          relabel.as_tensor_arg::<I>(1),
-         ScalarArg::new(max_label as u32),
-         stats_opt,
      );
    }
 }
@@ -1,2 +1,6 @@
 mod connected_components;
 mod ops;
+
+/// Should eventually make this a full op, but the kernel is too specialized on ints and plane ops
+/// to really use it in a general case. Needs more work to use as a normal tensor method.
+mod prefix_sum;
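
The remap table consumed by `compact_labels` and `compact_stats` above comes out of this `prefix_sum` module: an inclusive scan over a "label is present" mask assigns consecutive new values to the surviving labels. A tiny sequential illustration of that idea, assuming `present[l]` is 1 whenever `area[l] != 0`; the actual kernel is the int-specialized plane-op scan the module comment refers to, not this loop.

fn main() {
    // present[l] = 1 if any pixel still carries label l, else 0 (label 0 is background).
    let present = [0u32, 1, 0, 0, 1, 1, 0, 1]; // old labels 1, 4, 5 and 7 survive
    let mut remap = [0u32; 8];
    let mut running = 0;
    for (i, &p) in present.iter().enumerate() {
        running += p; // inclusive prefix sum
        remap[i] = running;
    }
    // Gaps are squeezed out: old 1 -> 1, 4 -> 2, 5 -> 3, 7 -> 4.
    assert_eq!(remap, [0, 1, 1, 1, 2, 3, 3, 4]);
}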

crates/burn-vision/src/backends/jit/ops.rs
+4 -4

@@ -45,8 +45,8 @@ where
 impl<B: FusionBackend + VisionOps<B>> VisionOps<Self> for Fusion<B> {
     fn connected_components(img: BoolTensor<Self>, conn: Connectivity) -> IntTensor<Self> {
         let batches = img.shape[0];
-        let height = img.shape[2];
-        let width = img.shape[3];
+        let height = img.shape[1];
+        let width = img.shape[2];
         let client = img.client.clone();
 
         #[derive(derive_new::new)]
@@ -92,8 +92,8 @@ impl<B: FusionBackend + VisionOps<B>> VisionOps<Self> for Fusion<B> {
         opts: ConnectedStatsOptions,
     ) -> (IntTensor<Self>, ConnectedStatsPrimitive<Self>) {
         let batches = img.shape[0];
-        let height = img.shape[2];
-        let width = img.shape[3];
+        let height = img.shape[1];
+        let width = img.shape[2];
         let client = img.client.clone();
 
         #[derive(derive_new::new)]
