diff --git a/backend-comparison/benches/reduce.rs b/backend-comparison/benches/reduce.rs
index 94d298d8a1..df365f2306 100644
--- a/backend-comparison/benches/reduce.rs
+++ b/backend-comparison/benches/reduce.rs
@@ -18,7 +18,6 @@ struct ReduceBenchmark {
 impl ReduceBenchmark {
     pub fn new(instruction: Instruction, device: B::Device) -> Self {
         let shape = Shape::new([4096, 512, 64]);
-        // let shape = Shape::new([128, 128, 64]);
         let tensor = Tensor::random(shape.clone(), Distribution::Default, &device);
         Self {
             instruction,
diff --git a/crates/burn-jit/src/kernel/reduce/base.rs b/crates/burn-jit/src/kernel/reduce/base.rs
index 8b6279efac..9ec677ee93 100644
--- a/crates/burn-jit/src/kernel/reduce/base.rs
+++ b/crates/burn-jit/src/kernel/reduce/base.rs
@@ -9,7 +9,7 @@ use burn_tensor::{Shape, TensorData};
 pub use cubecl::reduce::instructions::{ArgMax, ArgMin, Mean, Prod, Sum};
 use cubecl::reduce::shared_sum;
 
-/// Specialize reduce function to computhe the sum of all elements of the `input` tensor and return
+/// Specialized reduce function to compute the sum of all elements of the `input` tensor and return
 /// the value into a single-element tensor of shape `1 x 1 x 1 x ...` with the same rank as `input`.
 ///
 /// This is expected to be faster for larger tensors than calling [reduce] with the `Sum` instruction.
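
For context, here is a minimal sketch of exercising the full-tensor sum that the corrected doc comment describes, using burn's public tensor API (`Tensor::random`, `Tensor::sum`). The function name `sum_all` is illustrative, and whether a given call dispatches to the `shared_sum` kernel imported in `base.rs` depends on the backend and tensor size; the internal function under the doc comment is not shown in this hunk.

```rust
use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};

// Minimal sketch (assumption: public tensor API, not the internal kernel).
// Sums every element of a tensor shaped like the one ReduceBenchmark uses.
fn sum_all<B: Backend>(device: &B::Device) -> Tensor<B, 1> {
    let shape = Shape::new([4096, 512, 64]); // same shape as the benchmark
    let input = Tensor::<B, 3>::random(shape, Distribution::Default, device);
    // Reduces over every dimension; large inputs like this are the case the
    // specialized sum path is expected to accelerate over a plain `Sum` reduce.
    input.sum()
}
```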