Commit

Merge pull request #114 from gianlucamacri/patch-1
turning exception into warning for flash attention inference
yukang2017 authored Nov 2, 2023
2 parents bb06477 + b5c6809 commit 39866af
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion llama_attn_replace.py
@@ -34,7 +34,7 @@ def forward_flashattn(
     attention_mask: [bsz, q_len]
     """
     if not self.training:
-        raise ValueError("This function is only for training. For inference, please use forward_flashattn_inference.")
+        warnings.warn("This function should be used just for training as it may exhibit reduced inference performance. For inference, please use forward_flashattn_inference.")
 
     if output_attentions:
         warnings.warn(
2 changes: 1 addition & 1 deletion llama_attn_replace_sft.py
@@ -36,7 +36,7 @@ def forward_flashattn(
     attention_mask: [bsz, q_len]
     """
     if not self.training:
-        raise ValueError("This function is only for training. For inference, please use forward_flashattn_inference.")
+        warnings.warn("This function should be used just for training as it may exhibit reduced inference performance. For inference, please use forward_flashattn_inference.")
 
     if output_attentions:
         warnings.warn(
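
In effect, calling the training-oriented forward_flashattn while self.training is False now emits a warning and keeps running instead of raising a ValueError. A minimal sketch of the new behavior follows; the FlashAttnModule class is a hypothetical stand-in for illustration, not the repository's actual attention module.

# Minimal sketch of the behavioral change; class and method names other than
# forward_flashattn's guard logic are illustrative, not the repository's API.
import warnings


class FlashAttnModule:
    def __init__(self, training: bool = True):
        # Mimics the `self.training` flag of a torch.nn.Module.
        self.training = training

    def forward_flashattn(self):
        if not self.training:
            # Before this commit: raise ValueError(...) aborted inference here.
            # After this commit: a warning is emitted and execution continues.
            warnings.warn(
                "This function should be used just for training as it may "
                "exhibit reduced inference performance. For inference, "
                "please use forward_flashattn_inference."
            )
        return "attention output"


module = FlashAttnModule(training=False)  # e.g. after model.eval()
print(module.forward_flashattn())         # warns instead of raising

The practical upshot is that inference through the patched training forward no longer hard-fails, while the warning still points users to forward_flashattn_inference.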
