[PyTorch] Add support for cuDNN FusedAttention + THD + CP #885

Merged: 30 commits, merged Jun 10, 2024

Commits (changes from all commits)
d9ec133  add seq_offsets_qkvo for cudnn thd (xrennvidia, May 31, 2024)
9211a7c  add seq_offsets_qkvo to AttnFuncWithCP (xrennvidia, May 31, 2024)
541268b  fix seq_offsets calculation of cudnn thd (xrennvidia, May 31, 2024)
eb3f66d  remove a thd assert (xrennvidia, May 31, 2024)
9ac207a  fix bias for thd test (xrennvidia, May 31, 2024)
4239df9  add thd test for cudnn FA with CP (xrennvidia, May 31, 2024)
9b309ab  skip GQA/MQA test for cuDNN THD (xrennvidia, May 31, 2024)
4891fac  make sure seq_offsets are computed with qkv_group of hd_hd_hd while CP>1 (xrennvidia, May 31, 2024)
e886136  fix seq_offsets inputs (xrennvidia, Jun 1, 2024)
6ae7384  remove two comments (xrennvidia, Jun 1, 2024)
4a38798  fix attn mask type for cudnn thd with cp (xrennvidia, Jun 1, 2024)
a8807e6  fix attn_mask_type check (xrennvidia, Jun 1, 2024)
5a6a0a5  fix attn_mask_type for cudnn fa with thd (xrennvidia, Jun 1, 2024)
d657950  fix a typo (xrennvidia, Jun 2, 2024)
3b8780c  fix out dout in bwd (xrennvidia, Jun 3, 2024)
b3997ce  assert cudnn+thd does not support attn bias (xrennvidia, Jun 3, 2024)
e7b9ea7  check if attn_mask_type has padding (xrennvidia, Jun 3, 2024)
df2e164  Merge branch 'main' into xren/cp_thd (xrennvidia, Jun 3, 2024)
12cd072  minor change (xrennvidia, Jun 3, 2024)
cc3eb4f  change cp test batch size to 2 (xrennvidia, Jun 4, 2024)
442f347  fix code format (xrennvidia, Jun 4, 2024)
70042d2  fix two assert info (xrennvidia, Jun 5, 2024)
d37b399  fix assert comment (xrennvidia, Jun 5, 2024)
5302419  Merge branch 'main' into xren/cp_thd (cyanguwa, Jun 5, 2024)
a6c620d  fix assert comments (xrennvidia, Jun 5, 2024)
a98abd4  Merge branch 'xren/cp_thd' of github.com:xrennvidia/TransformerEngine… (xrennvidia, Jun 5, 2024)
dfe2f03  minor fix (xrennvidia, Jun 5, 2024)
a8cebd7  fix assert comments (xrennvidia, Jun 6, 2024)
f22bc19  Merge branch 'main' into xren/cp_thd (cyanguwa, Jun 6, 2024)
b13e3aa  Merge branch 'main' into xren/cp_thd (cyanguwa, Jun 7, 2024)
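
For context when reading the diffs below: THD is the packed, variable-length layout for fused attention, in which the tokens of all sequences in a batch are concatenated along a single dimension and delimited by cumulative sequence lengths (cu_seqlens), while CP (context parallelism) shards the sequence dimension across ranks. The sketch below is illustrative only; the concrete lengths and shapes are assumptions, not code from this PR.

import torch

# Illustrative THD packing: two sequences of lengths 3 and 5, 12 heads,
# head_dim 128, packed along one token axis of length t = 3 + 5 = 8.
seqlens = torch.tensor([3, 5], dtype=torch.int32)
cu_seqlens = torch.zeros(len(seqlens) + 1, dtype=torch.int32)
cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)   # tensor([0, 3, 8], dtype=torch.int32)
total_tokens = int(cu_seqlens[-1])

q = torch.randn(total_tokens, 12, 128)          # [t, h, d] -- the "thd" layout
# Sequence i occupies rows cu_seqlens[i]:cu_seqlens[i+1] of q, k, v and the output.
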
19 changes: 14 additions & 5 deletions tests/pytorch/fused_attn/run_fused_attn_with_cp.py
@@ -22,6 +22,8 @@ def run_dpa_with_cp(dtype='bf16', model=None, qkv_format='bshd', kernel_backend=
if kernel_backend == "FusedAttention":
os.environ["NVTE_FUSED_ATTN"] = "1"
config = model_configs_fused_attn[model]
if qkv_format == 'thd' and (config.num_heads != config.num_gqa_groups or config.attn_bias_type == "post_scale_bias"):
return

rank = int(os.getenv('RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
@@ -45,6 +47,12 @@ def run_dpa_with_cp(dtype='bf16', model=None, qkv_format='bshd', kernel_backend=

assert config.attn_mask_type in ['causal', 'no_mask'], f"{config.attn_mask_type} is an unsupported attention mask type!"

if kernel_backend == 'FusedAttention' and qkv_format == 'thd':
if 'causal' in config.attn_mask_type:
config.attn_mask_type = 'padding_causal'
else:
config.attn_mask_type = 'padding'

# instantiate core attn module
core_attn = DotProductAttention(config.num_heads,
config.head_dim,
@@ -112,24 +120,22 @@ def run_dpa_with_cp(dtype='bf16', model=None, qkv_format='bshd', kernel_backend=
out.backward(dout)

# run core_attn wit CP
q_, k_, v_, dout_, *rest = [x.clone().detach() for x in [q, k, v, dout] + ([] if bias is None else [bias])]
bias_ = rest[0] if len(rest) else None
if qkv_format == "bshd" or qkv_format == "sbhd":
q_, k_, v_, dout_, *rest = [x.clone().detach() for x in [q, k, v, dout] + ([] if bias is None else [bias])]
bias_ = rest[0] if len(rest) else None
seq_dim = qkv_format.index('s')
q_, k_, v_, dout_ = [x.view(*x.shape[:seq_dim], 2*world_size, x.shape[seq_dim]//(2*world_size), *x.shape[(seq_dim+1):]) \
for x in [q_, k_, v_, dout_]]
seq_idx = torch.tensor([rank, 2*world_size-rank-1], device=q_.device)
q_, k_, v_, dout_ = [x.index_select(seq_dim, seq_idx) for x in [q_, k_, v_, dout_]]
q_, k_, v_, dout_ = [x.view(*x.shape[:seq_dim], -1, *x.shape[(seq_dim+2):]) for x in [q_, k_, v_, dout_]]
elif qkv_format == "thd":
q_, k_, v_, dout_ = [x.clone().detach() for x in [q, k, v, dout]]
seq_idx_q = tex.thd_get_partitioned_indices(cu_seqlens_q, q_.size(0), world_size, rank)
seq_idx_kv = tex.thd_get_partitioned_indices(cu_seqlens_kv, k_.size(0), world_size, rank)
q_, dout_ = [x.index_select(0, seq_idx_q) for x in [q_, dout_]]
k_, v_ = [x.index_select(0, seq_idx_kv) for x in [k_, v_]]
cu_seqlens_q = cu_seqlens_q // world_size
cu_seqlens_kv = cu_seqlens_kv // world_size
bias_ = None
else:
assert False, f"{qkv_format} is an unsupported qkv_format!"
q_, k_, v_ = [x.requires_grad_() for x in [q_, k_, v_]]
@@ -158,7 +164,10 @@ def run_dpa_with_cp(dtype='bf16', model=None, qkv_format='bshd', kernel_backend=
# compare results with and without CP
tols = dict(atol=5e-3, rtol=5e-3)
if dtype == 'bf16':
tols = dict(atol=2.5e-2, rtol=2.5e-2)
if config.num_heads == config.num_gqa_groups:
tols = dict(atol=2.5e-2, rtol=2.5e-2)
else:
tols = dict(atol=3.5e-2, rtol=3.5e-2)

if qkv_format == "bshd" or qkv_format == "sbhd":
dq, dk, dv, out = [x.view(*x.shape[:seq_dim], 2*world_size, x.shape[seq_dim]//(2*world_size), *x.shape[(seq_dim+1):]) \
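
The bshd/sbhd branch above implements the load-balanced context-parallel split for causal attention: the sequence axis is cut into 2*world_size chunks and rank r keeps chunks r and 2*world_size-1-r, so early and late chunks are paired and every rank does comparable work under a causal mask. The new thd branch applies the same idea per packed sequence via tex.thd_get_partitioned_indices and then divides cu_seqlens by world_size. Below is a minimal standalone sketch of the bshd/sbhd selection; the function name is mine, not the test's.

import torch

# Load-balanced CP split of the sequence axis (mirrors the view/index_select/view
# sequence in the diff above): cut into 2*world_size chunks, keep chunks
# [rank, 2*world_size-1-rank], then flatten the two kept chunks back together.
def split_for_cp(x: torch.Tensor, seq_dim: int, world_size: int, rank: int) -> torch.Tensor:
    chunks = 2 * world_size
    x = x.view(*x.shape[:seq_dim], chunks, x.shape[seq_dim] // chunks, *x.shape[seq_dim + 1:])
    seq_idx = torch.tensor([rank, chunks - rank - 1], device=x.device)
    x = x.index_select(seq_dim, seq_idx)
    return x.view(*x.shape[:seq_dim], -1, *x.shape[seq_dim + 2:])

# Example: for a bshd tensor q of shape [2, 4096, 12, 128] with world_size=2,
# split_for_cp(q, seq_dim=1, world_size=2, rank=0) keeps chunks 0 and 3 of the
# sequence axis, giving a [2, 2048, 12, 128] per-rank shard.
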
26 changes: 13 additions & 13 deletions tests/pytorch/fused_attn/test_fused_attn_with_cp.py
@@ -14,10 +14,10 @@

model_configs_flash_attn = {
# test: b, h, hg, d, sq, skv, p, mask, bias
"cp_1_0": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias"), # MHA
"cp_1_1": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # MHA
"cp_2_0": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "causal", "no_bias"), # GQA
"cp_2_1": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # GQA
"cp_1_0": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias"), # MHA
"cp_1_1": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # MHA
"cp_2_0": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "causal", "no_bias"), # GQA
"cp_2_1": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # GQA
}

def get_bash_arguments(**kwargs):
@@ -47,21 +47,21 @@ def test_cp_with_flash_attention(dtype, model, qkv_format):

model_configs_fused_attn = {
# test: b, h, hg, d, sq, skv, p, mask, bias
"cp_1_0": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias"), # MHA
"cp_1_1": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # MHA
"cp_1_2": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "causal", "post_scale_bias"), # MHA
"cp_1_3": ModelConfig(1, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "post_scale_bias"), # MHA
"cp_2_0": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "causal", "no_bias"), # GQA
"cp_2_1": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # GQA
"cp_2_2": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "causal", "post_scale_bias"), # GQA
"cp_2_3": ModelConfig(1, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "post_scale_bias"), # GQA
"cp_1_0": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias"), # MHA
"cp_1_1": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # MHA
"cp_1_2": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "causal", "post_scale_bias"), # MHA
"cp_1_3": ModelConfig(2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "post_scale_bias"), # MHA
"cp_2_0": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "causal", "no_bias"), # GQA
"cp_2_1": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "no_bias"), # GQA
"cp_2_2": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "causal", "post_scale_bias"), # GQA
"cp_2_3": ModelConfig(2, 12, 1, 128, 4096, 4096, 0.0, "no_mask", "post_scale_bias"), # GQA
}

@pytest.mark.skipif(_cudnn_version() < (8,9,7), reason="cuDNN 8.9.7+ is required.")
@pytest.mark.skipif(get_device_compute_capability() < (8, 0), reason="CP tests require sm80+.")
@pytest.mark.parametrize("dtype", ['bf16', 'fp16'])
@pytest.mark.parametrize("model", model_configs_fused_attn.keys())
@pytest.mark.parametrize("qkv_format", ['bshd', 'sbhd'])
@pytest.mark.parametrize("qkv_format", ['bshd', 'sbhd', 'thd'])
def test_cp_with_fused_attention(dtype, model, qkv_format):
subprocess.run(
get_bash_arguments(
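
tex.thd_get_partitioned_indices performs the analogous token selection directly on the packed THD layout. The pure-PyTorch sketch below only illustrates the assumed semantics, inferred from the bshd/sbhd path (two chunks per rank, applied to each sequence independently); it is not the extension's implementation, and the helper name is hypothetical. Because each rank keeps 2 of the 2*world_size chunks of every sequence, the per-rank cumulative lengths are simply cu_seqlens // world_size, which is what the thd branch of the test uses.

import torch

# Hypothetical, conceptual equivalent of the partitioned-indices selection for a
# packed THD tensor: for every sequence [start, end), keep the tokens of chunks
# `rank` and `2*world_size-1-rank`. Assumes each sequence length is divisible by
# 2*world_size, as the CP tests require.
def thd_partition_indices(cu_seqlens: torch.Tensor, world_size: int, rank: int) -> torch.Tensor:
    indices = []
    chunks = 2 * world_size
    for start, end in zip(cu_seqlens[:-1].tolist(), cu_seqlens[1:].tolist()):
        chunk_len = (end - start) // chunks
        for c in (rank, chunks - rank - 1):
            lo = start + c * chunk_len
            indices.append(torch.arange(lo, lo + chunk_len))
    return torch.cat(indices)

# q_local = q.index_select(0, thd_partition_indices(cu_seqlens, world_size, rank))
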