Commit 691e037

Author: Zhu Jiale (committed)

complex number multiplication that supports 3D RoPE Triton kernel
1 parent c7245de commit 691e037

File tree

12 files changed: +942 -0 lines changed


aiter/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -74,3 +74,4 @@ def getLogger():
 from .ops.gradlib import *
 from .ops.trans_ragged_layout import *
 from . import mla
+from .ops.groupnorm import *

aiter/install_mode

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+develop

aiter/jit/optCompilerConfig.json

Lines changed: 12 additions & 0 deletions
@@ -913,5 +913,17 @@
         ],
         "verbose": "False",
         "blob_gen_cmd": "''"
+    },
+    "module_groupnorm": {
+        "srcs": [
+            "f'{AITER_CSRC_DIR}/pybind/groupnorm_pybind.cu'",
+            "f'{AITER_CSRC_DIR}/kernels/groupnorm.cu'"
+        ],
+        "flags_extra_cc": [],
+        "flags_extra_hip": [],
+        "extra_ldflags": "None",
+        "extra_include": [],
+        "verbose": "True",
+        "blob_gen_cmd": "''"
     }
 }

aiter/ops/groupnorm.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@ (new file)
from ..jit.core import compile_ops
import torch
from typing import Optional


@compile_ops("module_groupnorm")
def _groupnorm_run(
    input: torch.Tensor,
    num_groups: int,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float,
) -> torch.Tensor:
    """Placeholder function, will be replaced by JIT."""
    pass


class GroupNorm(torch.nn.Module):
    def __init__(
        self,
        num_groups: int,
        num_channels: int,
        eps: float = 1e-5,
        affine: bool = True,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        self.num_groups = num_groups
        self.eps = eps
        self.affine = affine

        if affine:
            self.weight = torch.nn.Parameter(
                torch.ones(num_channels, device=device, dtype=dtype)
            )
            self.bias = torch.nn.Parameter(
                torch.zeros(num_channels, device=device, dtype=dtype)
            )
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x: torch.Tensor, use_torch: bool = False) -> torch.Tensor:
        if use_torch or not self.affine:
            # fallback to PyTorch for non-affine or debug mode
            return torch.nn.functional.group_norm(
                x,
                self.num_groups,
                weight=self.weight if self.affine else None,
                bias=self.bias if self.affine else None,
                eps=self.eps,
            )
        else:
            return _groupnorm_run(x, self.num_groups, self.weight, self.bias, self.eps)
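A minimal usage sketch (not part of the commit), assuming the JIT extension builds on first call on a ROCm device; the (N, C, H, W) float32 input layout and the comparison tolerance are assumptions, not taken from the kernel sources:

# Illustrative only: shapes, dtype, device, and tolerance are assumed.
import torch
from aiter.ops.groupnorm import GroupNorm

x = torch.randn(2, 32, 64, 64, device="cuda", dtype=torch.float32)  # assumed (N, C, H, W) layout
gn = GroupNorm(num_groups=8, num_channels=32, device="cuda", dtype=torch.float32)

y_hip = gn(x)                  # dispatches to the JIT-compiled HIP kernel via _groupnorm_run
y_ref = gn(x, use_torch=True)  # PyTorch fallback path in the same module
print(torch.allclose(y_hip, y_ref, atol=1e-4))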

aiter/ops/triton/_triton_kernels/rope.py

Lines changed: 81 additions & 0 deletions
@@ -1934,3 +1934,84 @@ def _rope_fwd_2d_kernel_neox(
 
     # store output
     tl.store(out_ptr + offs_x, out)
+
+@triton.jit
+def _rope_fwd_3d_kernel(
+    x_ptr, freqs_real_ptr, freqs_imag_ptr, grid_sizes_ptr, out_ptr,
+    stride_x_b, stride_x_l, stride_x_n, stride_x_c,
+    stride_freqs_s, stride_freqs_c,
+    stride_grid_b, stride_grid_d,
+    stride_out_b, stride_out_l, stride_out_n, stride_out_c,
+    L: tl.constexpr, N_HEADS: tl.constexpr, C: tl.constexpr, c_total: tl.constexpr,
+    sp_size: tl.constexpr, sp_rank: tl.constexpr,
+    max_freq_seq_len: tl.constexpr, s_per_rank: tl.constexpr,
+    pad_freq_val_r: tl.constexpr, pad_freq_val_i: tl.constexpr,
+    BLOCK_L: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_C: tl.constexpr,
+    C1: tl.constexpr, C2: tl.constexpr,
+):
+    pid_b = tl.program_id(0)
+    pid_n = tl.program_id(1)
+    pid_l = tl.program_id(2)
+
+    l_start = pid_l * BLOCK_L
+    l_off = l_start + tl.arange(0, BLOCK_L)
+    s_mask = l_off < L
+
+    c_off = tl.arange(0, BLOCK_C)
+    c_mask = c_off < c_total
+
+    # head mask
+    n_mask = pid_n < N_HEADS
+
+    # broadcast to (BLOCK_L, BLOCK_C)
+    l_b = tl.broadcast_to(l_off[:, None], (BLOCK_L, BLOCK_C))
+    c_b = tl.broadcast_to(c_off[None, :], (BLOCK_L, BLOCK_C))
+
+    # read grid_sizes (frames, height, width) for this batch element
+    f_grid = tl.load(grid_sizes_ptr + pid_b * stride_grid_b + 0 * stride_grid_d,
+                     mask=n_mask, other=0)
+    h_grid = tl.load(grid_sizes_ptr + pid_b * stride_grid_b + 1 * stride_grid_d,
+                     mask=n_mask, other=0)
+    w_grid = tl.load(grid_sizes_ptr + pid_b * stride_grid_b + 2 * stride_grid_d,
+                     mask=n_mask, other=0)
+    h_w = h_grid * w_grid
+
+    global_tid = sp_rank * s_per_rank + l_b
+    valid_global_tid = global_tid < f_grid * h_w
+
+    # calculate the f, h, w indices from the flattened token position
+    f_idx = tl.where(valid_global_tid, global_tid // h_w, 0)
+    rem = tl.where(valid_global_tid, global_tid % h_w, 0)
+    h_idx = tl.where(valid_global_tid, rem // w_grid, 0)
+    w_idx = tl.where(valid_global_tid, rem % w_grid, 0)
+
+    freq_row = tl.where(c_b < C1, f_idx,
+                        tl.where(c_b < C1 + C2, h_idx, w_idx))
+    freq_row = tl.where(freq_row >= max_freq_seq_len, max_freq_seq_len - 1, freq_row)
+
+    mask_rope = s_mask[:, None] & c_mask[None, :] & n_mask & valid_global_tid[:, :]
+
+    # load freqs_real and freqs_imag
+    off_freq = freq_row * stride_freqs_s + c_b * stride_freqs_c
+    freq_r = tl.load(freqs_real_ptr + off_freq, mask=mask_rope, other=pad_freq_val_r)
+    freq_i = tl.load(freqs_imag_ptr + off_freq, mask=mask_rope, other=pad_freq_val_i)
+
+    off_x_base = pid_b * stride_x_b + pid_n * stride_x_n
+    off_x_r = off_x_base + l_b * stride_x_l + (2 * c_b) * stride_x_c
+    off_x_i = off_x_base + l_b * stride_x_l + (2 * c_b + 1) * stride_x_c
+
+    x_r = tl.load(x_ptr + off_x_r, mask=mask_rope, other=0.0)
+    x_i = tl.load(x_ptr + off_x_i, mask=mask_rope, other=0.0)
+
+    # complex number multiplication
+    out_r = x_r * freq_r - x_i * freq_i
+    out_i = x_r * freq_i + x_i * freq_r
+
+    # write result
+    off_out_base = pid_b * stride_out_b + pid_n * stride_out_n
+    off_out_r = off_out_base + l_b * stride_out_l + (2 * c_b) * stride_out_c
+    off_out_i = off_out_base + l_b * stride_out_l + (2 * c_b + 1) * stride_out_c
+
+    tl.store(out_ptr + off_out_r, out_r, mask=mask_rope)
+    tl.store(out_ptr + off_out_i, out_i, mask=mask_rope)
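The out_r / out_i lines above are ordinary complex multiplication of each interleaved (real, imaginary) channel pair by its precomputed frequency factor,

(x_r + i x_i)(f_r + i f_i) = (x_r f_r - x_i f_i) + i (x_r f_i + x_i f_r),

which is the same rotation the reference path applies as x_i * freqs_i_rank in rope3d.py below.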

aiter/ops/triton/rope3d.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@ (new file)
import torch
import triton
import triton.language as tl
from aiter.ops.triton._triton_kernels.rope import _rope_fwd_3d_kernel


def rope_params(max_seq_len, dim, theta=10000):
    assert dim % 2 == 0
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta,
                        torch.arange(0, dim, 2).to(torch.float32).div(dim))
    )
    freqs = torch.polar(torch.ones_like(freqs), freqs)  # complex
    return freqs


def pad_freqs(original_tensor, target_len):
    seq_len, s1, s2 = original_tensor.shape
    pad_size = target_len - seq_len
    padding_tensor = torch.ones(
        pad_size, s1, s2, dtype=original_tensor.dtype, device=original_tensor.device)
    padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
    return padded_tensor


def rope_apply_triton(x, grid_sizes: tl.constexpr, freqs: tl.constexpr, sp_size: tl.constexpr, sp_rank: tl.constexpr):
    B, s, n_heads, C = x.shape
    c_total = C // 2  # 64
    c1 = c_total - 2 * (c_total // 3)  # 22
    c2 = c_total // 3  # 21
    c3 = c_total // 3  # 21
    device = x.device

    grid_sizes = grid_sizes.to(device=device, dtype=torch.int32).contiguous()

    freqs_real = freqs.real.to(dtype=torch.float32, device=device).contiguous()
    freqs_imag = freqs.imag.to(dtype=torch.float32, device=device).contiguous()
    out = torch.empty_like(x, dtype=torch.float32, device=device)

    BLOCK_L, BLOCK_N, BLOCK_C = 32, 4, 64

    grid = (
        B,
        n_heads,
        triton.cdiv(s, BLOCK_L)
    )

    num_warps = 4
    waves_per_eu = 1

    _rope_fwd_3d_kernel[grid](
        x, freqs_real, freqs_imag, grid_sizes, out,
        *x.stride(),
        freqs_real.stride(0), freqs_real.stride(1),
        *grid_sizes.stride(),
        *out.stride(),
        s, n_heads, C, c_total,
        sp_size, sp_rank,
        freqs.shape[0], s,
        1.0, 0.0,
        BLOCK_L=BLOCK_L, BLOCK_N=BLOCK_N, BLOCK_C=BLOCK_C,
        C1=c1, C2=c2,
        num_warps=num_warps,
        waves_per_eu=waves_per_eu,
    )

    return out


def rope_apply_original(x, grid_sizes, freqs, sp_size, sp_rank):
    B = x.size(0)
    s = x.size(1)
    n = x.size(2)
    c = x.size(3) // 2

    c1 = c - 2 * (c // 3)
    c2 = (c // 3)
    c3 = (c // 3)
    freqs = freqs.split([c1, c2, c3], dim=1)

    output = []
    for i, (f, h, w) in enumerate(grid_sizes.tolist()):
        seq_len = f * h * w

        x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(s, n, -1, 2))

        freqs_i = torch.cat([
            freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
            freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
            freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
        ], dim=-1).reshape(seq_len, 1, -1)
        merged_real_sum = freqs_i.real.sum()
        freqs_i = pad_freqs(freqs_i, s * sp_size)
        s_per_rank = s
        freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) * s_per_rank), :, :]

        x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
        x_i = torch.cat([x_i, x[i, s:]])
        output.append(x_i)

    out = torch.stack(output).float()
    return out


def test_rope_consistency():
    B, s, n, C = 1, 9450, 40, 128
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sp_size = 8
    max_seq_len = 1024

    x = torch.arange(B * s * n * C, dtype=torch.float32, device=device).reshape(B, s, n, C)
    x = x / (B * s * n * C)

    grid_sizes = torch.tensor([[21, 45, 80]], dtype=torch.int32, device=device)

    d_total = 128
    d1 = d_total - 4 * (d_total // 6)
    d2 = 2 * (d_total // 6)
    d3 = 2 * (d_total // 6)

    freqs_f = rope_params(max_seq_len, d1)
    freqs_h = rope_params(max_seq_len, d2)
    freqs_w = rope_params(max_seq_len, d3)
    freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=1).to(device)

    sp_rank = 0
    out_orig = rope_apply_original(x.clone(), grid_sizes.clone(), freqs.clone(), sp_size, sp_rank)
    out_triton = rope_apply_triton(x.clone(), grid_sizes.clone(), freqs.clone(), sp_size, sp_rank)

    print(f"result comparison: sp_rank={sp_rank}")
    print("=" * 50)
    shape_ok = (out_orig.shape == out_triton.shape)
    sum_orig = out_orig.sum().item()
    sum_triton = out_triton.sum().item()
    sum_diff = abs(sum_orig - sum_triton) / abs(sum_orig)
    sum_ok = sum_diff < 1e-2
    feat_orig = out_orig[0, 0, 0, :4]
    feat_triton = out_triton[0, 0, 0, :4]
    feat_diff = torch.abs(feat_orig - feat_triton).max().item()
    feat_ok = feat_diff < 1e-3

    print(f"shapes match: {'yes' if shape_ok else 'no'}")
    print(f"sum diff < 1%: {'yes' if sum_ok else 'no'}")
    print(f" - Original sum: {sum_orig:.6f}")
    print(f" - Triton sum: {sum_triton:.6f}")
    print(f" - relative sum diff: {sum_diff * 100:.2f}%")
    print(f"first 4 values match: {'yes' if feat_ok else 'no'}")
    print(f" - Original: {feat_orig.cpu().numpy()}")
    print(f" - Triton: {feat_triton.cpu().numpy()}")
    print(f" - max diff: {feat_diff:.6f}")

    if shape_ok and sum_ok and feat_ok:
        print(f"\nsp_rank={sp_rank} test passed")
    else:
        print(f"\nsp_rank={sp_rank} test failed")
    print("=" * 60)


if __name__ == "__main__":
    test_rope_consistency()
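A quick way to exercise the consistency check above (a sketch; it assumes aiter and Triton are installed and that PyTorch sees a GPU, since the Triton path needs one):

# Assumes the aiter package is importable and a GPU device is available to PyTorch.
from aiter.ops.triton.rope3d import test_rope_consistency

test_rope_consistency()  # prints the shape / sum / first-4-values comparison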

csrc/include/common.hpp

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@ (new file)
#include <hip/hip_runtime.h>

#include <cstdint>
#include <iostream>
#include <exception>

#define CHECK_COND(x)                                   \
    do {                                                \
        if (!(x)) {                                     \
            std::cerr << "check failed, file="          \
                      << __FILE__ << ", line="          \
                      << __LINE__ << std::endl;         \
            std::terminate();                           \
        }                                               \
    } while(false)

#define CHECK_HIP(x)                                    \
    do {                                                \
        hipError_t __err_code = (x);                    \
        if( __err_code != hipSuccess ) {                \
            std::cerr << "call hip api failed, file="   \
                      << __FILE__ << ", line="          \
                      << __LINE__ << ", name="          \
                      << hipGetErrorName(__err_code)    \
                      << std::endl;                     \
            std::terminate();                           \
        }                                               \
    } while(false)

csrc/include/groupnorm.hpp

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@ (new file)
#include <torch/extension.h>

#include "common.hpp"

#include <optional>

namespace rocm_torch_x {

class __attribute__ ((visibility("hidden"))) GroupNorm final
{
public:
    explicit GroupNorm() = default;
    ~GroupNorm() = default;
public:
    // return empty if not supported
    std::optional<torch::Tensor> Run(
        torch::Tensor x,
        int num_groups,
        torch::Tensor weights,
        torch::Tensor bias,
        float epsilon);
private:
    template<typename T>
    torch::Tensor launchGroupNormKernel(uint32_t num_groups, float epsilon,
        const torch::Tensor x, const torch::Tensor weights, const torch::Tensor bias, hipStream_t stream);

    void reserveMeanAccumulator(uint32_t nums_to_reserve, torch::Device device);
private:
    torch::Tensor mean_accumulator_;
};

} // namespace rocm_torch_x
