diff --git a/transformer_engine/pytorch/attention.py b/transformer_engine/pytorch/attention.py
index 5c2618b559..9268b9636e 100644
--- a/transformer_engine/pytorch/attention.py
+++ b/transformer_engine/pytorch/attention.py
@@ -604,7 +604,8 @@ def get_attention_backend(
                 use_fused_attention = False
             elif cudnn_version >= (9, 6, 0) and qkv_format == "thd":
                 logger.debug(
-                    "Disabling FusedAttention as it does not support context parallelism with THD for cuDNN 9.6+"
+                    "Disabling FusedAttention as it does not support context parallelism with THD for"
+                    " cuDNN 9.6+"
                 )
                 use_fused_attention = False