Manually fuse presence and prefix sum

tracel-ai · Feb 2, 2025 · e3ec085
1 parent 15c431c
commit e3ec085
Show file tree

Hide file tree

Showing 4 changed files with 14 additions and 11 deletions.
diff --git a/crates/burn-vision/src/backends/jit/connected_components/hardware_accelerated.rs b/crates/burn-vision/src/backends/jit/connected_components/hardware_accelerated.rs
@@ -4,8 +4,8 @@
 //! DASIP, 2018
 
 use crate::{
-    backends::jit::{connected_components::stats_from_opts, prefix_sum::prefix_sum},
-    ConnectedStatsOptions, ConnectedStatsPrimitive, Connectivity,
+    backends::jit::connected_components::stats_from_opts, ConnectedStatsOptions,
+    ConnectedStatsPrimitive, Connectivity,
 };
 use burn_jit::{
     kernel,
@@ -16,6 +16,8 @@ use burn_jit::{
 use burn_tensor::{ops::IntTensorOps, Shape};
 use cubecl::{prelude::*, Feature};
 
+use super::prefix_sum::prefix_sum;
+
 const BLOCK_H: u32 = 4;
 
 #[cube]
@@ -563,9 +565,7 @@ pub fn hardware_accelerated<R: JitRuntime, F: FloatElement, I: IntElement, BT: B
                 stats.area.clone(),
                 &[0..batches, 0..(max_label + 1).next_multiple_of(4)],
             );
-            let present = JitBackend::<R, F, I, BT>::int_not_equal_elem(sliced, I::new(0));
-            let present = kernel::cast::<R, BT, I>(present);
-            let relabel = prefix_sum::<R, I>(present);
+            let relabel = prefix_sum::<R, I>(sliced);
 
             let cube_dim = CubeDim::default();
             let cube_count = CubeCount::new_3d(

diff --git a/crates/burn-vision/src/backends/jit/connected_components/mod.rs b/crates/burn-vision/src/backends/jit/connected_components/mod.rs
@@ -1,5 +1,9 @@
 mod hardware_accelerated;
 
+/// Should eventually make this a full op, but the kernel is too specialized on ints and plane ops
+/// to really use it in a general case. Needs more work to use as a normal tensor method.
+mod prefix_sum;
+
 use burn_jit::{
     ops::numeric::{full_device, zeros_device},
     tensor::JitTensor,

diff --git a/crates/burn-vision/src/backends/jit/prefix_sum.rs b/crates/burn-vision/src/backends/jit/connected_components/prefix_sum.rs
@@ -61,7 +61,8 @@ fn prefix_sum_kernel<I: Int>(
 
         if part_id < cube_count_x - 1 {
             for k in 0..vec4_spt {
-                let mut scan = scan_in[i + scan_offs];
+                // Manually fuse not_equal and cast
+                let mut scan = Line::cast_from(scan_in[i + scan_offs].not_equal(Line::new(zero)));
                 let x = scan[0];
                 scan[1] += x;
                 let y = scan[1];
@@ -76,7 +77,9 @@ fn prefix_sum_kernel<I: Int>(
         if part_id == cube_count_x - 1 {
             for k in 0..vec4_spt {
                 if i < scan_in.shape(1) {
-                    let mut scan = scan_in[i + scan_offs];
+                    // Manually fuse not_equal and cast
+                    let mut scan =
+                        Line::cast_from(scan_in[i + scan_offs].not_equal(Line::new(zero)));
                     let x = scan[0];
                     scan[1] += x;
                     let y = scan[1];

diff --git a/crates/burn-vision/src/backends/jit/mod.rs b/crates/burn-vision/src/backends/jit/mod.rs
@@ -1,6 +1,2 @@
 mod connected_components;
 mod ops;
-
-/// Should eventually make this a full op, but the kernel is too specialized on ints and plane ops
-/// to really use it in a general case. Needs more work to use as a normal tensor method.
-mod prefix_sum;