
Commit f05f12c

yuzhongw-nvidia authored and cyanguwa committed
Fix MLA CP Bugs (#1896)
* fix: (1) UT ignores MLA; (2) bshd format runtime error. Ban fp8 mla attn + cp due to correctness problem
  Signed-off-by: Yuzhong Wang <[email protected]>
* only disable FP8 CP for MLA
  Signed-off-by: Charlene Yang <[email protected]>

---------

Signed-off-by: Yuzhong Wang <[email protected]>
Signed-off-by: Charlene Yang <[email protected]>
Co-authored-by: Charlene Yang <[email protected]>
1 parent 8382eed commit f05f12c
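
For context before the diffs: "MLA" here means the query/key head dimension differs from the value head dimension, which TransformerEngine's DotProductAttention expresses by passing a (head_dim_qk, head_dim_v) tuple instead of a single int, exactly as the test change below does. A minimal, self-contained sketch of that configuration; the sizes and the bshd layout are illustrative assumptions, not values taken from this commit:

    import torch
    from transformer_engine.pytorch import DotProductAttention

    # Illustrative sizes; head_dim_qk != head_dim_v is what makes this an MLA-style config.
    batch, seqlen, num_heads = 2, 128, 16
    head_dim_qk, head_dim_v = 192, 128

    attn = DotProductAttention(
        num_heads,
        (head_dim_qk, head_dim_v),  # tuple form: separate QK and V head dims
        qkv_format="bshd",
    )

    q = torch.randn(batch, seqlen, num_heads, head_dim_qk, dtype=torch.bfloat16, device="cuda")
    k = torch.randn(batch, seqlen, num_heads, head_dim_qk, dtype=torch.bfloat16, device="cuda")
    v = torch.randn(batch, seqlen, num_heads, head_dim_v, dtype=torch.bfloat16, device="cuda")

    out = attn(q, k, v)  # last dim is num_heads * head_dim_v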

4 files changed (+38 −13 lines)


tests/pytorch/fused_attn/run_fused_attn_with_cp.py

Lines changed: 26 additions & 9 deletions
@@ -89,7 +89,7 @@ def run_dpa_with_cp(
     # instantiate core attn module
     core_attn = DotProductAttention(
         config.num_heads,
-        config.head_dim_qk,
+        (config.head_dim_qk, config.head_dim_v),
         num_gqa_groups=config.num_gqa_groups,
         attention_dropout=config.dropout_p,
         qkv_format=qkv_format,
@@ -106,16 +106,22 @@ def run_dpa_with_cp(
             config.num_heads,
             config.head_dim_qk,
         )
-        kv_input_shape = (
+        k_input_shape = (
             config.batch_size,
             config.max_seqlen_kv,
             config.num_gqa_groups,
             config.head_dim_qk,
         )
+        v_input_shape = (
+            config.batch_size,
+            config.max_seqlen_kv,
+            config.num_gqa_groups,
+            config.head_dim_v,
+        )
         attn_output_shape = (
             config.batch_size,
             config.max_seqlen_q,
-            config.num_heads * config.head_dim_qk,
+            config.num_heads * config.head_dim_v,
         )
         cu_seqlens_q = None
         cu_seqlens_kv = None
@@ -128,16 +134,22 @@ def run_dpa_with_cp(
             config.num_heads,
             config.head_dim_qk,
         )
-        kv_input_shape = (
+        k_input_shape = (
             config.max_seqlen_kv,
             config.batch_size,
             config.num_gqa_groups,
             config.head_dim_qk,
         )
+        v_input_shape = (
+            config.max_seqlen_kv,
+            config.batch_size,
+            config.num_gqa_groups,
+            config.head_dim_v,
+        )
         attn_output_shape = (
             config.max_seqlen_q,
             config.batch_size,
-            config.num_heads * config.head_dim_qk,
+            config.num_heads * config.head_dim_v,
         )
         cu_seqlens_q = None
         cu_seqlens_kv = None
@@ -149,14 +161,19 @@ def run_dpa_with_cp(
             config.num_heads,
             config.head_dim_qk,
         )
-        kv_input_shape = (
+        k_input_shape = (
             config.batch_size * config.max_seqlen_q,
             config.num_gqa_groups,
             config.head_dim_qk,
         )
+        v_input_shape = (
+            config.batch_size * config.max_seqlen_q,
+            config.num_gqa_groups,
+            config.head_dim_v,
+        )
         attn_output_shape = (
             config.batch_size * config.max_seqlen_q,
-            config.num_heads * config.head_dim_qk,
+            config.num_heads * config.head_dim_v,
         )
         seqlens_q = torch.randint(0, config.max_seqlen_q + 1, [config.batch_size]).to(torch.int32)
         seqlens_q_padded = (seqlens_q + 2 * world_size - 1) // (world_size * 2) * (world_size * 2)
@@ -177,8 +194,8 @@ def run_dpa_with_cp(
         assert False, f"{qkv_format} is an unsupported qkv_format!"

     q = torch.randn(q_input_shape, dtype=dtypes[dtype]).cuda()
-    k = torch.randn(kv_input_shape, dtype=dtypes[dtype]).cuda()
-    v = torch.randn(kv_input_shape, dtype=dtypes[dtype]).cuda()
+    k = torch.randn(k_input_shape, dtype=dtypes[dtype]).cuda()
+    v = torch.randn(v_input_shape, dtype=dtypes[dtype]).cuda()
     dout = torch.randn(attn_output_shape, dtype=dtypes[dtype]).cuda()
     dout_quantizer = Float8Quantizer(
         fp8_dtype=tex.DType.kFloat8E5M2,
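
The switch from num_heads * config.head_dim_qk to num_heads * config.head_dim_v in attn_output_shape follows from attention returning a weighted sum of value vectors, so each head contributes head_dim_v channels. A quick shape check with plain PyTorch SDPA (not the fused kernel the test exercises; the sizes are illustrative) shows the rule:

    import torch
    import torch.nn.functional as F

    # Illustrative shapes only: q/k share head_dim_qk, v has its own head_dim_v.
    b, h, sq, skv, d_qk, d_v = 2, 16, 128, 128, 192, 128
    q = torch.randn(b, h, sq, d_qk)
    k = torch.randn(b, h, skv, d_qk)
    v = torch.randn(b, h, skv, d_v)

    out = F.scaled_dot_product_attention(q, k, v)
    print(out.shape)  # torch.Size([2, 16, 128, 128]) -- the last dim is d_v, not d_qk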

tests/pytorch/fused_attn/test_fused_attn_with_cp.py

Lines changed: 2 additions & 0 deletions
@@ -173,6 +173,8 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type, fp8_mha
         pytest.skip("Only fp8 works with fp8_mha=True!")
     if "p2p" not in cp_comm_type and config.head_dim_qk != config.head_dim_v:
         pytest.skip("MLA CP currently only support KV P2P!")
+    if dtype == "fp8" and config.head_dim_qk != config.head_dim_v:
+        pytest.skip("MLA CP currently does not support FP8 attention!")

     subprocess.run(
         get_bash_arguments(

transformer_engine/pytorch/attention/dot_product_attention/context_parallel.py

Lines changed: 4 additions & 4 deletions
@@ -2559,8 +2559,8 @@ def backward(ctx, dout):

         if ctx.enable_mla:
             # [cp, b, 2, sk//2, np, hn] or [cp, 2, sk//2, b, np, hn]
-            dk_fp8 = dkv_fp8[: ctx.k_numel].view(cp_size, *ctx.k_shape)
-            dv_fp8 = dkv_fp8[ctx.k_numel :].view(cp_size, *ctx.v_shape)
+            dk_fp8 = dkv_fp8[:, : ctx.k_numel].view(cp_size, *ctx.k_shape)
+            dv_fp8 = dkv_fp8[:, ctx.k_numel :].view(cp_size, *ctx.v_shape)
             dk = ctx.dQKV_CP_quantizer.create_tensor_from_data(
                 dk_fp8, fake_dtype=torch.float32, internal=True
             )
@@ -2586,8 +2586,8 @@ def backward(ctx, dout):
         dq = dq.view(dq.shape[0], -1, *dq.shape[-2:])
         if ctx.enable_mla:
             # [b, 2, sk//2, np, hn] -> [b, sk, np, hn]
-            dk = dk.view(*dk.shape[0], -1, *dk.shape[-2:])
-            dv = dv.view(*dv.shape[0], -1, *dv.shape[-2:])
+            dk = dk.view(dk.shape[0], -1, *dk.shape[-2:])
+            dv = dv.view(dv.shape[0], -1, *dv.shape[-2:])
         else:
             # [2, b, 2, sk//2, np, hn] -> [2, b, sk, np, hn]
             dkv = dkv.view(*dkv.shape[0:2], -1, *dkv.shape[-2:])
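
Both hunks are indexing/reshape fixes in the MLA backward path. The gathered dKV buffer carries a leading context-parallel dimension, so K and V must be split along dim 1 rather than dim 0, and the later view used *dk.shape[0], which tries to unpack an int and raises a TypeError. A shape-only sketch under assumed, hypothetical sizes (the names mirror the diff, the shapes do not come from TE):

    import torch

    # Hypothetical sizes for illustration.
    cp_size, b, sk, np_, hn_k, hn_v = 2, 1, 8, 4, 192, 128
    k_numel = b * 2 * (sk // 2) * np_ * hn_k
    v_numel = b * 2 * (sk // 2) * np_ * hn_v

    # Gathered FP8 dKV buffer: one row per CP rank, K and V payloads flattened side by side.
    dkv_fp8 = torch.empty(cp_size, k_numel + v_numel)

    # The fix: slice along dim 1 (the per-rank payload), not dim 0 (the CP ranks).
    dk_fp8 = dkv_fp8[:, :k_numel].view(cp_size, b, 2, sk // 2, np_, hn_k)
    dv_fp8 = dkv_fp8[:, k_numel:].view(cp_size, b, 2, sk // 2, np_, hn_v)

    # Second hunk: dk.shape[0] is an int, so "*dk.shape[0]" is a TypeError.
    # The intended fold of the two half-sequence chunks is:
    dk = torch.empty(b, 2, sk // 2, np_, hn_k)       # [b, 2, sk//2, np, hn]
    dk = dk.view(dk.shape[0], -1, *dk.shape[-2:])    # [b, sk, np, hn]
    print(dk.shape)  # torch.Size([1, 8, 4, 192])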

transformer_engine/pytorch/attention/dot_product_attention/utils.py

Lines changed: 6 additions & 0 deletions
@@ -608,6 +608,12 @@ def get_attention_backend(
                 " bias for THD format"
             )
             use_fused_attention = False
+        elif fp8 and head_dim_qk != head_dim_v:
+            logger.debug(
+                "Disabling FusedAttention as it does not support context parallelism with FP8"
+                " MLA attention"
+            )
+            use_fused_attention = False

     # Filter: Attention mask
     # attn_mask_type | attention_mask | supported backends
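
The new branch only affects backend selection: when FP8 execution is requested together with context parallelism and MLA head dims, FusedAttention is taken out of the running instead of producing incorrect results. A standalone sketch of the predicate, not TE's actual control flow (which lives inside get_attention_backend's CP filters):

    def fused_attention_allowed_with_cp(fp8: bool, head_dim_qk: int, head_dim_v: int) -> bool:
        # Mirrors the new elif: FP8 + CP + MLA (different QK/V head dims) is disabled
        # until the correctness problem mentioned in the commit message is resolved.
        if fp8 and head_dim_qk != head_dim_v:
            return False
        return True

    assert fused_attention_allowed_with_cp(fp8=False, head_dim_qk=192, head_dim_v=128)      # BF16 MLA + CP: allowed
    assert not fused_attention_allowed_with_cp(fp8=True, head_dim_qk=192, head_dim_v=128)   # FP8 MLA + CP: filtered out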
