Feat/improve fusion #2773

Open
wants to merge 29 commits into base: main

Changes from all commits (29 commits)
1c4fbb0  WIP (nathanielsimard, Jan 22, 2025)
06cb156  WIP (nathanielsimard, Jan 22, 2025)
e8493af  WIP testing (nathanielsimard, Jan 23, 2025)
4a2d03e  Very wip (nathanielsimard, Jan 24, 2025)
c83782b  WIP works better (nathanielsimard, Jan 28, 2025)
96d2ff0  Fix vectorization (nathanielsimard, Jan 28, 2025)
c431a43  Still debug (nathanielsimard, Jan 28, 2025)
4aeb900  Fix some problems (nathanielsimard, Jan 29, 2025)
e43bef0  Fix other broadcast issues (nathanielsimard, Jan 29, 2025)
19b01f4  Fix another bug, but still very wip (nathanielsimard, Jan 30, 2025)
f3b3459  WIP Works (nathanielsimard, Feb 1, 2025)
f80b722  Cleanup (nathanielsimard, Feb 1, 2025)
f86fbcb  Support broadcasted vectorization (nathanielsimard, Feb 1, 2025)
11550b4  Cleanup (nathanielsimard, Feb 1, 2025)
5e14374  Still some bugs (nathanielsimard, Feb 1, 2025)
b104e78  Fix multi vectorization broadcasting fused (nathanielsimard, Feb 1, 2025)
3eabb6c  Add fuse settings (nathanielsimard, Feb 2, 2025)
10fc217  Fix broadcast issue (nathanielsimard, Feb 2, 2025)
dcf563d  Fix performance (nathanielsimard, Feb 2, 2025)
4471ea3  Some cleanup (nathanielsimard, Feb 2, 2025)
b9bf504  Big refactoring (nathanielsimard, Feb 2, 2025)
6ce7c3e  Add reshape optimization (nathanielsimard, Feb 3, 2025)
f0c82e2  Merge branch 'main' into feat/fuse-reshape (nathanielsimard, Feb 3, 2025)
e127325  Cleanup (nathanielsimard, Feb 3, 2025)
1a84818  Add some docs (nathanielsimard, Feb 3, 2025)
5b25f18  Update cubecl ref (nathanielsimard, Feb 3, 2025)
651bb4c  Clippy + Fmt (nathanielsimard, Feb 3, 2025)
4ce7993  Add vulkan in example (nathanielsimard, Feb 3, 2025)
12de765  WIP (nathanielsimard, Feb 4, 2025)
17 changes: 1 addition & 16 deletions Cargo.lock

Some generated files are not rendered by default.

8 changes: 4 additions & 4 deletions Cargo.toml
@@ -153,11 +153,11 @@ ahash = { version = "0.8.11", default-features = false }
portable-atomic-util = { version = "0.2.4", features = ["alloc"] }

### For the main burn branch. ###
cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "a172f6760052bef392e6f0e44e912460960f2c1b" }
cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "a172f6760052bef392e6f0e44e912460960f2c1b" }
# cubecl = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "276b8db0b5402492cc72013ce8da9b63be3a165f" }
# cubecl-common = { git = "https://github.com/tracel-ai/cubecl", default-features = false, rev = "276b8db0b5402492cc72013ce8da9b63be3a165f" }
### For local development. ###
# cubecl = { path = "../cubecl/crates/cubecl", default-features = false }
# cubecl-common = { path = "../cubecl/crates/cubecl-common", default-features = false }
cubecl = { path = "../cubecl/crates/cubecl", default-features = false }
cubecl-common = { path = "../cubecl/crates/cubecl-common", default-features = false }
### For the release. ###
# cubecl = { version = "0.4.0", default-features = false }
# cubecl-common = { version = "0.4.0", default-features = false }
3 changes: 1 addition & 2 deletions backend-comparison/benches/matmul_fused.rs
@@ -26,8 +26,7 @@ impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
}

fn execute(&self, (lhs, rhs, bias): Self::Args) {
let bias = bias.unsqueeze();
gelu(relu(lhs.matmul(rhs)) + bias);
let _output = gelu(relu(lhs.matmul(rhs)) + bias.unsqueeze());
}

fn prepare(&self) -> Self::Args {
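For context, the chain this benchmark measures is exactly the kind of elementwise tail (relu, bias add, gelu) that the fused backend can fold into the matmul output. A minimal standalone sketch, assuming the burn umbrella crate with the wgpu feature enabled; the shapes are arbitrary and not taken from the benchmark:

```rust
use burn::backend::Wgpu;
use burn::tensor::activation::{gelu, relu};
use burn::tensor::{Distribution, Tensor};

fn main() {
    type B = Wgpu;
    let device = Default::default();

    let lhs = Tensor::<B, 3>::random([2, 64, 64], Distribution::Default, &device);
    let rhs = Tensor::<B, 3>::random([2, 64, 64], Distribution::Default, &device);
    let bias = Tensor::<B, 2>::random([64, 64], Distribution::Default, &device);

    // The elementwise tail (relu, bias add, gelu) is what the fusion backend
    // can merge into the matmul's output kernel.
    let _output = gelu(relu(lhs.matmul(rhs)) + bias.unsqueeze());
}
```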
3 changes: 3 additions & 0 deletions crates/burn-core/src/lib.rs
@@ -59,12 +59,15 @@ extern crate alloc;
pub type TestBackend = burn_ndarray::NdArray<f32>;

#[cfg(all(test, feature = "test-tch"))]
/// Backend for test cases
pub type TestBackend = burn_tch::LibTorch<f32>;

#[cfg(all(test, feature = "test-wgpu"))]
/// Backend for test cases
pub type TestBackend = burn_wgpu::Wgpu;

#[cfg(all(test, feature = "test-cuda"))]
/// Backend for test cases
pub type TestBackend = burn_cuda::Cuda;

/// Backend for autodiff test cases
1 change: 0 additions & 1 deletion crates/burn-core/src/nn/linear.rs
@@ -77,7 +77,6 @@ impl<B: Backend> Linear<B> {

let weight = self.weight.val().unsqueeze();
let bias = self.bias.as_ref().map(|b| b.val().unsqueeze());

let output = input.matmul(weight);

match bias {
25 changes: 11 additions & 14 deletions crates/burn-core/src/nn/transformer/decoder.rs
@@ -455,8 +455,9 @@ impl<B: Backend> TransformerDecoder<B> {

#[cfg(test)]
mod tests {
use burn_tensor::Device;

use super::*;
use crate::tensor::Distribution;
use crate::{nn::attention::generate_autoregressive_mask, TestBackend};

#[test]
@@ -481,20 +481,16 @@ mod tests {
}

fn test_autoregressive(config: TransformerDecoderConfig) {
let device = Default::default();
let device: Device<TestBackend> = Default::default();
let [batch_size, seq_length, d_model] = [3, 4, config.d_model];
let transformer = config.init(&device);

let memory = Tensor::<TestBackend, 3>::random(
[batch_size, seq_length, d_model],
Distribution::Default,
&device,
);
let target = Tensor::<TestBackend, 3>::random(
[batch_size, seq_length, d_model],
Distribution::Default,
&device,
);
let transformer = config.init::<TestBackend>(&device);

let memory = Tensor::arange(0..(batch_size * seq_length * d_model) as i64, &device)
.float()
.reshape([batch_size, seq_length, d_model]);
let target = Tensor::arange(0..(batch_size * seq_length * d_model) as i64, &device)
.float()
.reshape([batch_size, seq_length, d_model]);
let mask_attn = generate_autoregressive_mask(batch_size, seq_length, &target.device());
let input = TransformerDecoderInput::new(target.clone(), memory.clone())
.target_mask_attn(mask_attn);
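The reworked test above replaces random inputs with deterministic ones built from arange followed by reshape, presumably so that failures reproduce identically across runs. A standalone sketch of that pattern, assuming the burn umbrella crate with the ndarray feature; the shapes mirror the test and the model itself is omitted:

```rust
use burn::backend::NdArray;
use burn::tensor::{Int, Tensor};

type B = NdArray<f32>;

fn main() {
    let device = Default::default();
    let [batch_size, seq_length, d_model] = [3_usize, 4, 8];

    // Deterministic input: 0, 1, 2, ... reshaped to [batch, seq, d_model],
    // unlike Distribution::Default, which changes on every run.
    let memory = Tensor::<B, 1, Int>::arange(0..(batch_size * seq_length * d_model) as i64, &device)
        .float()
        .reshape([batch_size, seq_length, d_model]);

    assert_eq!(memory.dims(), [3, 4, 8]);
}
```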
8 changes: 4 additions & 4 deletions crates/burn-fusion/src/ops/boolean.rs
@@ -17,8 +17,8 @@ use burn_tensor::{
BaseOperationDescription, BinaryOperationDescription, BoolOperationDescription,
CatOperationDescription, ExpandOperationDescription, FlipOperationDescription,
HandleContainer, OperationDescription, PermuteOperationDescription,
RepeatDimOperationDescription, ReshapeDescription, SliceAssignOperationDescription,
SliceOperationDescription, SwapDimsDescription, UnaryOperationDescription,
RepeatDimOperationDescription, SliceAssignOperationDescription, SliceOperationDescription,
SwapDimsDescription, UnaryOperationDescription,
},
Device, Shape,
};
@@ -182,7 +182,7 @@ impl<B: FusionBackend> BoolTensorOps<Self> for Fusion<B> {
fn bool_reshape(tensor: BoolTensor<Self>, shape: Shape) -> BoolTensor<Self> {
#[derive(new)]
struct ReshapeDimsOps<B: FusionBackend> {
desc: ReshapeDescription,
desc: UnaryOperationDescription,
_b: PhantomData<B>,
}

Expand All @@ -197,7 +197,7 @@ impl<B: FusionBackend> BoolTensorOps<Self> for Fusion<B> {
let stream = tensor.stream;
let out = tensor.client.tensor_uninitialized(shape.dims, DType::Bool);

let desc = ReshapeDescription {
let desc = UnaryOperationDescription {
input: tensor.into_description(),
out: out.to_description_out(),
};
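Here, and in the float and int variants below, the dedicated ReshapeDescription is replaced by the generic UnaryOperationDescription: no reshape-specific payload is needed because the output TensorDescription already carries the target shape. A simplified sketch of that idea, using stand-in types rather than the real burn-tensor ones:

```rust
// Stand-ins for illustration only; the real TensorDescription and
// UnaryOperationDescription live in burn-tensor.
#[derive(Clone, Debug)]
struct TensorDescription {
    id: u64,
    shape: Vec<usize>,
}

#[derive(Clone, Debug)]
struct UnaryOperationDescription {
    input: TensorDescription,
    out: TensorDescription,
}

fn describe_reshape(input: TensorDescription, new_shape: Vec<usize>) -> UnaryOperationDescription {
    // The output description records the target shape, so a plain
    // input/out pair is enough to describe a reshape.
    let out = TensorDescription {
        id: input.id + 1,
        shape: new_shape,
    };
    UnaryOperationDescription { input, out }
}

fn main() {
    let input = TensorDescription { id: 0, shape: vec![2, 3, 4] };
    let desc = describe_reshape(input, vec![6, 4]);
    assert_eq!(desc.out.shape, vec![6, 4]);
}
```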
4 changes: 2 additions & 2 deletions crates/burn-fusion/src/ops/float.rs
@@ -650,7 +650,7 @@ impl<B: FusionBackend> FloatTensorOps<Self> for Fusion<B> {
fn float_reshape(tensor: FloatTensor<Self>, shape: Shape) -> FloatTensor<Self> {
#[derive(new)]
struct ReshapeDimsOps<B: FusionBackend> {
desc: ReshapeDescription,
desc: UnaryOperationDescription,
_b: PhantomData<B>,
}

@@ -666,7 +666,7 @@ impl<B: FusionBackend> FloatTensorOps<Self> for Fusion<B> {
let dtype = tensor.dtype;
let out = tensor.client.tensor_uninitialized(shape.dims, dtype);

let desc = ReshapeDescription {
let desc = UnaryOperationDescription {
input: tensor.into_description(),
out: out.to_description_out(),
};
4 changes: 2 additions & 2 deletions crates/burn-fusion/src/ops/int.rs
@@ -103,7 +103,7 @@ impl<B: FusionBackend> IntTensorOps<Self> for Fusion<B> {
fn int_reshape(tensor: IntTensor<Self>, shape: Shape) -> IntTensor<Self> {
#[derive(new)]
struct ReshapeDimsOps<B: FusionBackend> {
desc: ReshapeDescription,
desc: UnaryOperationDescription,
_b: PhantomData<B>,
}

@@ -120,7 +120,7 @@ impl<B: FusionBackend> IntTensorOps<Self> for Fusion<B> {
.client
.tensor_uninitialized(shape.dims, B::IntElem::dtype());

let desc = ReshapeDescription {
let desc = UnaryOperationDescription {
input: tensor.into_description(),
out: out.to_description_out(),
};
36 changes: 32 additions & 4 deletions crates/burn-fusion/src/stream/context.rs
@@ -39,12 +39,9 @@ pub struct Context<'a, H> {
pub scalar_u8: &'a Vec<u8>,
}

#[derive(Default)]
pub(crate) struct OperationConverter {
tensors_relative2global: HashMap<TensorId, TensorDescription>,
tensors_global2relative: HashMap<TensorId, TensorDescription>,
/// Only useful to create new shape ID.
/// You should use tensor descriptions to retrieve the proper shape.
shapes_global2relative: HashMap<usize, usize>,
scalar_f32: Vec<f32>,
scalar_f16: Vec<f16>,
@@ -59,6 +56,32 @@ pub(crate) struct OperationConverter {
scalar_u8: Vec<u8>,
}

impl Default for OperationConverter {
fn default() -> Self {
let mut val = Self {
tensors_relative2global: Default::default(),
tensors_global2relative: Default::default(),
shapes_global2relative: Default::default(),
scalar_f32: Default::default(),
scalar_f16: Default::default(),
scalar_bf16: Default::default(),
scalar_i64: Default::default(),
scalar_i32: Default::default(),
scalar_i16: Default::default(),
scalar_i8: Default::default(),
scalar_u64: Default::default(),
scalar_u32: Default::default(),
scalar_u16: Default::default(),
scalar_u8: Default::default(),
};

// global 1 is always shape id 0.
val.shapes_global2relative.insert(1, 0);

val
}
}

/// Fork of a [context](Context) which owns its data.
pub struct ContextOwned<H> {
tensors: HashMap<TensorId, TensorDescription>,
@@ -180,7 +203,11 @@ impl OperationConverter {
pub(crate) fn clear(&mut self) {
self.tensors_relative2global.clear();
self.tensors_global2relative.clear();

self.shapes_global2relative.clear();
// global 1 is always shape id 0.
self.shapes_global2relative.insert(1, 0);

self.scalar_f32.clear();
self.scalar_f16.clear();
self.scalar_bf16.clear();
@@ -1126,7 +1153,7 @@ impl RelativeOps for BaseOperationDescription {
BaseOperationDescription::ToDevice(desc.to_relative(converter))
}
BaseOperationDescription::Reshape(desc) => {
BaseOperationDescription::Reshape(ReshapeDescription {
BaseOperationDescription::Reshape(UnaryOperationDescription {
input: desc.input.to_relative(converter),
out: desc.out.to_relative(converter),
})
@@ -1241,6 +1268,7 @@ impl RelativeOps for TensorDescription {
// We never saw this dim value before, therefore we create a new ID.
let dim_id = converter.shapes_global2relative.len();
relative_shape.push(dim_id);

converter.shapes_global2relative.insert(*dim, dim_id);
}
}
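The notable change above is that the global-to-relative shape map now pins dimension value 1 to relative id 0, both in Default and after clear, presumably so that broadcast axes (size 1) always map to the same relative id no matter when they are first seen. A self-contained sketch of the mapping idea; this is an illustration, not the actual OperationConverter:

```rust
use std::collections::HashMap;

// Stripped-down version of the global-to-relative shape mapping,
// with dimension value 1 pinned to relative id 0.
struct ShapeConverter {
    shapes_global2relative: HashMap<usize, usize>,
}

impl Default for ShapeConverter {
    fn default() -> Self {
        let mut shapes_global2relative = HashMap::new();
        // Global 1 is always shape id 0.
        shapes_global2relative.insert(1, 0);
        Self { shapes_global2relative }
    }
}

impl ShapeConverter {
    fn to_relative(&mut self, shape: &[usize]) -> Vec<usize> {
        shape
            .iter()
            .map(|dim| {
                // Unseen dim values get the next free id; known values reuse theirs.
                let next_id = self.shapes_global2relative.len();
                *self.shapes_global2relative.entry(*dim).or_insert(next_id)
            })
            .collect()
    }
}

fn main() {
    // Two different broadcasted shapes map their size-1 axis to the same id (0),
    // so the relative descriptions they produce stay comparable.
    assert_eq!(ShapeConverter::default().to_relative(&[32, 1, 64]), vec![1, 0, 2]);
    assert_eq!(ShapeConverter::default().to_relative(&[8, 1, 16]), vec![1, 0, 2]);
}
```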
13 changes: 11 additions & 2 deletions crates/burn-jit/src/fusion/elemwise/builder.rs
@@ -2,7 +2,7 @@ use burn_fusion::OptimizationBuilder;

use crate::{
fusion::{
on_write::{builder::FuseOnWriteBuilder, ir::ElemwisePrecision},
on_write::{builder::FuseOnWriteBuilder, ir::ElemwisePrecision, settings::FuseSettings},
JitOptimization,
},
JitRuntime,
@@ -23,7 +23,16 @@ impl<R: JitRuntime> ElementWiseBuilder<R> {
let max_bindings = props.hardware_properties().max_bindings;

Self {
builder: FuseOnWriteBuilder::new(max_bindings, bool_precision),
builder: FuseOnWriteBuilder::new(
max_bindings,
bool_precision,
FuseSettings {
broadcast: true,
output_shape_updates: true,
mix_vectorization: true,
inplace: true,
},
),
device,
}
}
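The element-wise builder now constructs its FuseOnWriteBuilder with an explicit FuseSettings value. A sketch of what such a settings struct might look like; the field names come from this diff, while the doc text is an interpretation rather than the upstream documentation:

```rust
/// Hypothetical shape of the fusion settings added in this PR (field names from the diff).
#[derive(Clone, Copy, Debug)]
pub struct FuseSettings {
    /// Whether operations with broadcasted operands may join the fused kernel.
    pub broadcast: bool,
    /// Whether the fused output shape may be updated as more operations are registered.
    pub output_shape_updates: bool,
    /// Whether different vectorization factors may be mixed within one fused kernel.
    pub mix_vectorization: bool,
    /// Whether the fused kernel may reuse an input buffer for its output.
    pub inplace: bool,
}

fn main() {
    // Mirrors the values passed by ElementWiseBuilder in the diff above.
    let settings = FuseSettings {
        broadcast: true,
        output_shape_updates: true,
        mix_vectorization: true,
        inplace: true,
    };
    println!("{settings:?}");
}
```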