tracel-ai
diff --git a/‎burn-wgpu/src/codegen/compilation.rs
+261 b/‎burn-wgpu/src/codegen/compilation.rs
+261
diff --git a/‎burn-wgpu/src/codegen/dialect/gpu/algorithm.rs
+57 b/‎burn-wgpu/src/codegen/dialect/gpu/algorithm.rs
+57
diff --git a/‎burn-wgpu/src/codegen/dialect/gpu/body.rs
-7 b/‎burn-wgpu/src/codegen/dialect/gpu/body.rs
-7
@@ -0,0 +1,261 @@
+use super::dialect::gpu;
+use crate::codegen::dialect::gpu::{
+    Binding, ComputeShader, Elem, Item, Location, Variable, Vectorization, Visibility,
+    WorkgroupSize,
+};
+
+/// The compilation struct allows you to create a [compute shader](ComputeShader) based on
+/// [compilation info](CompilationInfo) and [compilation settings](CompilationSettings).
+///
+/// The binding lists start empty and are filled while compiling.
+#[derive(Clone)]
+pub struct Compilation {
+    /// Description of the shader to build (inputs, outputs, body and inplace mappings).
+    info: CompilationInfo,
+    /// Bindings registered for array inputs.
+    input_bindings: Vec<Binding>,
+    /// Bindings registered for outputs that are written to a new array.
+    output_bindings: Vec<Binding>,
+    /// Named bindings registered for scalar inputs, stored as `(name, binding)` pairs.
+    named_bindings: Vec<(String, Binding)>,
+}
+
+/// The information necessary to compile a [compute shader](ComputeShader).
+#[derive(Clone)]
+pub struct CompilationInfo {
+    /// Inputs of the shader; each one is turned into a binding during compilation.
+    pub inputs: Vec<InputInfo>,
+    /// Outputs of the shader; each one produces a global write at the end of the scope.
+    pub outputs: Vec<OutputInfo>,
+    /// Scope holding the operations of the shader; it becomes the body of the shader.
+    pub scope: gpu::Scope,
+    /// Candidate input/output pairs for inplace execution.
+    ///
+    /// They are only applied when inplace is enabled in the
+    /// [compilation settings](CompilationSettings).
+    pub mappings: Vec<InplaceMapping>,
+}
+
+/// Indicates that an output can be written into the buffer of an input instead of a new array.
+#[derive(new, Clone, Copy)]
+pub struct InplaceMapping {
+    /// Input position.
+    ///
+    /// Used as an index into the registered input bindings.
+    pub pos_input: usize,
+    /// Output position.
+    ///
+    /// Used as an index into the [outputs](CompilationInfo::outputs).
+    pub pos_output: usize,
+}
+
+/// Settings controlling how a [compilation](Compilation) produces its
+/// [compute shader](ComputeShader).
+///
+/// Start from `CompilationSettings::default()` and adjust it with the builder methods.
+#[derive(Default)]
+pub struct CompilationSettings {
+    /// Vectorization applied to the scope and to the items of array inputs and outputs.
+    vectorization: Vectorization,
+    /// Whether the [inplace mappings](InplaceMapping) of the compilation info may be applied.
+    inplace_available: bool,
+    /// Workgroup size forwarded to the compiled shader.
+    workgroup_size: WorkgroupSize,
+}
+
+impl CompilationSettings {
+    /// Select the vectorization used when compiling the shader.
+    #[allow(dead_code)]
+    pub fn vectorize(self, vectorization: Vectorization) -> Self {
+        Self {
+            vectorization,
+            ..self
+        }
+    }
+
+    /// Allow (or forbid) the shader to reuse input arrays as outputs.
+    ///
+    /// Notes:
+    ///
+    /// Enabling this is not a guarantee that inputs will be reused: it only has an effect when
+    /// [inplace mappings](InplaceMapping) are provided in the [compilation info](CompilationInfo).
+    pub fn inplace(self, available: bool) -> Self {
+        Self {
+            inplace_available: available,
+            ..self
+        }
+    }
+
+    /// Set the workgroup size of the compiled shader.
+    #[allow(dead_code)] // Only used for fusion for now.
+    pub fn workgroup_size(self, workgroup_size: WorkgroupSize) -> Self {
+        Self {
+            workgroup_size,
+            ..self
+        }
+    }
+}
+
+/// Information related to an input.
+#[derive(Clone)]
+pub enum InputInfo {
+    /// An array input, registered as a storage binding with the given item type and visibility.
+    Array { item: Item, visibility: Visibility },
+    /// Scalar inputs of the given element type, registered as a named storage binding
+    /// (`scalars_<elem>`) whose size is `size`.
+    Scalar { elem: Elem, size: usize },
+}
+
+/// Information related to an output.
+///
+/// In both variants, `local` is the index of the local variable whose value is written and
+/// `item` is its type (before vectorization is applied).
+#[derive(Clone)]
+pub enum OutputInfo {
+    /// Write the local variable to a new array.
+    ///
+    /// This will create a new binding in the [compute shader](ComputeShader).
+    Array { item: Item, local: u16 },
+    /// Write the local variable to an existing input binding.
+    ///
+    /// `input` is the index of the global input array that receives the value.
+    Input { item: Item, input: u16, local: u16 },
+}
+
+impl Compilation {
+    /// Creates a compilation for the given [info](CompilationInfo), with no binding
+    /// registered yet.
+    pub fn new(info: CompilationInfo) -> Self {
+        Self {
+            info,
+            input_bindings: Vec::new(),
+            output_bindings: Vec::new(),
+            named_bindings: Vec::new(),
+        }
+    }
+
+    /// Builds the [compute shader](ComputeShader) according to the provided
+    /// [settings](CompilationSettings).
+    ///
+    /// The scope is vectorized first, then inputs and outputs are registered. The named
+    /// bindings of the shader always start with the `info` binding, followed by the ones
+    /// registered for scalar inputs.
+    pub fn compile(mut self, settings: CompilationSettings) -> ComputeShader {
+        self.info.scope.vectorize(settings.vectorization);
+
+        self.register_inputs(&settings);
+        self.register_outputs(&settings);
+
+        let info_binding = Binding {
+            item: Item::Scalar(Elem::UInt),
+            visibility: Visibility::Read,
+            location: Location::Storage,
+            // The length is deliberately left out, since specifying it would force a new
+            // kernel for each tensor rank.
+            size: None,
+        };
+
+        let mut named = Vec::with_capacity(self.named_bindings.len() + 1);
+        named.push(("info".to_string(), info_binding));
+        named.extend(self.named_bindings);
+
+        ComputeShader {
+            inputs: self.input_bindings,
+            outputs: self.output_bindings,
+            named,
+            workgroup_size: settings.workgroup_size,
+            body: self.info.scope,
+            num_workgroups: true,
+            global_invocation_id: true,
+        }
+    }
+
+    /// Consumes the inputs of the compilation info and creates their bindings.
+    ///
+    /// Array inputs become input bindings (vectorized, with booleans stored as unsigned
+    /// integers); scalar inputs become named bindings.
+    fn register_inputs(&mut self, settings: &CompilationSettings) {
+        for input in self.info.inputs.drain(..) {
+            match input {
+                InputInfo::Scalar { elem, size } => {
+                    let elem = bool_elem(elem);
+                    let name = format!("scalars_{}", elem);
+                    let binding = Binding {
+                        item: Item::Scalar(elem),
+                        visibility: Visibility::Read,
+                        location: Location::Storage,
+                        size: Some(size),
+                    };
+
+                    self.named_bindings.push((name, binding));
+                }
+                InputInfo::Array { item, visibility } => {
+                    let binding = Binding {
+                        item: bool_item(item.vectorize(settings.vectorization)),
+                        visibility,
+                        location: Location::Storage,
+                        size: None,
+                    };
+
+                    self.input_bindings.push(binding);
+                }
+            }
+        }
+    }
+
+    /// Consumes the outputs of the compilation info and registers the global writes.
+    ///
+    /// When inplace is available, the inplace mappings are applied first so that eligible
+    /// outputs are redirected to input bindings before any output binding is created.
+    fn register_outputs(&mut self, settings: &CompilationSettings) {
+        if settings.inplace_available {
+            // Take the mappings out of `self` so that `self` can be mutated while iterating.
+            for mapping in core::mem::take(&mut self.info.mappings) {
+                self.register_inplace_mapping(mapping);
+            }
+        }
+
+        // Position of the next output array binding.
+        let mut position = 0;
+
+        for output in self.info.outputs.drain(..) {
+            let depth = self.info.scope.depth;
+
+            match output {
+                OutputInfo::Input { item, input, local } => {
+                    let vectorized = item.vectorize(settings.vectorization);
+                    let source = Variable::Local(local, vectorized, depth);
+                    let target = Variable::GlobalInputArray(input, bool_item(vectorized));
+
+                    self.info.scope.write_global(source, target);
+                }
+                OutputInfo::Array { item, local } => {
+                    let vectorized = item.vectorize(settings.vectorization);
+                    let adapted = bool_item(vectorized);
+
+                    self.output_bindings.push(Binding {
+                        item: adapted,
+                        visibility: Visibility::ReadWrite,
+                        location: Location::Storage,
+                        size: None,
+                    });
+
+                    let source = Variable::Local(local, vectorized, depth);
+                    let target = Variable::GlobalOutputArray(position, adapted);
+
+                    self.info.scope.write_global(source, target);
+                    position += 1;
+                }
+            }
+        }
+    }
+
+    /// Redirects the output designated by `mapping` to the input designated by `mapping`.
+    ///
+    /// Nothing happens when the output position doesn't exist or when the output already
+    /// writes to an input.
+    fn register_inplace_mapping(&mut self, mapping: InplaceMapping) {
+        let InplaceMapping {
+            pos_input,
+            pos_output,
+        } = mapping;
+
+        let output = match self.info.outputs.get_mut(pos_output) {
+            Some(output) => output,
+            None => return, // No output to update.
+        };
+
+        let (output_item, local) = match *output {
+            OutputInfo::Array { item, local } => (item, local),
+            OutputInfo::Input { .. } => return, // Output already updated.
+        };
+
+        let item = if let Some(binding) = self.input_bindings.get_mut(pos_input) {
+            // The input is now written as well.
+            binding.visibility = Visibility::ReadWrite;
+            // Inputs modified inplace should be read without any specified layout.
+            self.info
+                .scope
+                .update_read(pos_input as u16, gpu::ReadingStrategy::Plain);
+
+            // Reuse the item of the input binding, since the output item can be different
+            // (i.e inplace boolean operations on float bindings).
+            binding.item
+        } else {
+            output_item
+        };
+
+        // Update the output.
+        *output = OutputInfo::Input {
+            item,
+            input: pos_input as u16,
+            local,
+        };
+    }
+}
+
+/// Returns the same item shape with its element type adapted by [`bool_elem`].
+fn bool_item(ty: Item) -> Item {
+    match ty {
+        Item::Scalar(inner) => Item::Scalar(bool_elem(inner)),
+        Item::Vec2(inner) => Item::Vec2(bool_elem(inner)),
+        Item::Vec3(inner) => Item::Vec3(bool_elem(inner)),
+        Item::Vec4(inner) => Item::Vec4(bool_elem(inner)),
+    }
+}
+
+/// Maps the boolean element type to an unsigned integer; any other element is returned as is.
+fn bool_elem(elem: Elem) -> Elem {
+    // U32 are used for bool tensors
+    if matches!(elem, Elem::Bool) {
+        Elem::UInt
+    } else {
+        elem
+    }
+}
@@ -0,0 +1,57 @@
+use super::{
+    gpu, Elem, Item, Metadata, Operator, ReadGlobalAlgo, ReadGlobalWithLayoutAlgo, Scope, Variable,
+};
+use crate::codegen::dialect::gpu::BinaryOperator;
+
+impl ReadGlobalAlgo {
+    /// Registers in `scope` the operation that indexes the global variable with the
+    /// invocation id and stores the result in the output variable.
+    pub fn expand(self, scope: &mut Scope) {
+        let indexing = BinaryOperator {
+            lhs: self.global,
+            rhs: Variable::Id,
+            out: self.out,
+        };
+
+        scope.register(Operator::Index(indexing));
+    }
+}
+
+impl ReadGlobalWithLayoutAlgo {
+    /// Registers in `scope` the operations that read `global` at an index computed from the
+    /// invocation id, the strides of `layout` and the strides and shape of `global`, and
+    /// store the value read in `out`.
+    pub fn expand(self, scope: &mut Scope) {
+        let out = self.out;
+        let tensor = self.global;
+        let layout = self.layout;
+        let index_item_ty = Item::Scalar(Elem::UInt);
+        // Accumulator for the index used to read `tensor`.
+        let index_local = scope.create_local(index_item_ty);
+        let zero: Variable = 0u32.into();
+        let id = Variable::Id;
+        // Number of elements held by one item of the global variable (its vector width).
+        let offset: Variable = match self.global.item() {
+            Item::Vec4(_) => 4u32,
+            Item::Vec3(_) => 3u32,
+            Item::Vec2(_) => 2u32,
+            Item::Scalar(_) => 1u32,
+        }
+        .into();
+
+        gpu!(scope, index_local = zero);
+        // For every dimension `i` in `0..Rank`, accumulate
+        // `((id * offset) / stride_layout[i]) % shape[i] * stride[i]` into `index_local`.
+        gpu!(
+            scope,
+            range(zero, Variable::Rank).for_each(|i, scope| {
+                let stride = scope.create_local(index_item_ty);
+                let stride_layout = scope.create_local(index_item_ty);
+                let shape = scope.create_local(index_item_ty);
+                let tmp = scope.create_local(index_item_ty);
+
+                gpu!(scope, stride = stride(tensor, i));
+                gpu!(scope, shape = shape(tensor, i));
+                gpu!(scope, stride_layout = stride(layout, i));
+
+                // NOTE(review): `id * offset` presumably converts the id of a vectorized item
+                // into the position of its first element - confirm.
+                gpu!(scope, tmp = id * offset);
+                gpu!(scope, tmp = tmp / stride_layout);
+                gpu!(scope, tmp = tmp % shape);
+                gpu!(scope, tmp = tmp * stride);
+                gpu!(scope, index_local = index_local + tmp);
+            })
+        );
+
+        // Divide by the vector width again before indexing `tensor`.
+        gpu!(scope, index_local = index_local / offset);
+        gpu!(scope, out = tensor[index_local]);
+    }
+}