Allow compiling cuda without mmq and flash attention #11190
base: master
Conversation
3a1d670 to 8dfe3d8
The FLASH_ATTN_AVAILABLE macro is not being applied correctly to all ggml FlashAttention kernels, but that is not the fault of this PR; I'll fix it myself (unless you want to do it).
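For context, a minimal sketch of the guard pattern being referred to; the kernel name and body are hypothetical and not code from ggml or this PR:

// Hypothetical illustration of how a FlashAttention kernel is expected to honor
// FLASH_ATTN_AVAILABLE: when the macro is absent, the body is compiled out so the
// translation unit still builds and links (ggml's real kernels typically use
// NO_DEVICE_CODE on the compiled-out path).
__global__ void fattn_example_kernel(float * dst, const int n) {
#ifdef FLASH_ATTN_AVAILABLE
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = 0.0f; // stands in for the real FlashAttention math
    }
#else
    (void) dst; (void) n; // compiled-out path: the kernel is a no-op
#endif // FLASH_ATTN_AVAILABLE
}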
@@ -149,6 +149,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
    "ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA "ggml: compile with FlashAttention" ON)
Suggested change:
-option(GGML_CUDA_FA "ggml: compile with FlashAttention" ON)
+option(GGML_CUDA_FA "ggml: compile ggml FlashAttention kernels" ON)
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
list(FILTER GGML_SOURCES_CUDA EXCLUDE REGEX ".*fattn.*")
list(FILTER GGML_HEADERS_CUDA EXCLUDE REGEX ".*fattn.*")
# message(FATAL_ERROR ${GGML_SOURCES_CUDA})
Forgot to remove?
@@ -151,6 +151,10 @@ typedef float2 dfloat2;
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

#if !defined(GGML_CUDA_FA)
#undef FLASH_ATTN_AVAILABLE
#endif
Suggested change:
-#endif
+#endif // !defined(GGML_CUDA_FA)
#include "ggml-cuda/fattn.cuh" | ||
#endif |
Suggested change:
-#endif
+#endif // FLASH_ATTN_AVAILABLE
            ggml_cuda_flash_attn_ext(ctx, dst);
            break;
#else
            return false;
#endif
Suggested change:
-#endif
+#endif // FLASH_ATTN_AVAILABLE
#ifdef GGML_CUDA_FORCE_CUBLAS
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context &,
    const ggml_tensor *, const ggml_tensor *, ggml_tensor *, const char *, const float *,
    const char *, float *, const int64_t, const int64_t, const int64_t,
    const int64_t, cudaStream_t) {}
#else
Add GGML_ABORT("CUDA was compiled without MMQ support") to the function instead.
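A sketch of what that suggestion amounts to (not the final code; the signature is the one from the hunk above, and GGML_ABORT is ggml's existing fatal-error macro):

// Stub with the same signature as the real ggml_cuda_op_mul_mat_q, but failing
// loudly if a cuBLAS-only build ever dispatches to MMQ, instead of silently doing nothing.
void ggml_cuda_op_mul_mat_q(
    ggml_backend_cuda_context &,
    const ggml_tensor *, const ggml_tensor *, ggml_tensor *, const char *, const float *,
    const char *, float *, const int64_t, const int64_t, const int64_t,
    const int64_t, cudaStream_t) {
    GGML_ABORT("CUDA was compiled without MMQ support");
}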
@@ -2924,6 +2925,7 @@ extern DECL_MMQ_CASE(GGML_TYPE_IQ3_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ1_S);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_NL);
extern DECL_MMQ_CASE(GGML_TYPE_IQ4_XS);
#endif
Suggested change:
-#endif
+#endif // !defined(GGML_CUDA_FORCE_CUBLAS)
I have integrated the ProstT5 protein language model into Foldseek. Thanks a lot for the great library! I am upstreaming a few fixes for issues I found in ggml during the integration. I hope that it's okay to push the changes here and that they get synced to the main ggml repo at some point.
This is the last patch in my patch series. Feel free to reject this one, since it might be too specific.
I want to reduce CI compile times and binary sizes for the CUDA builds. My model doesn't benefit much from FlashAttention and I only use f16 weights, so I added options to disable the kernels that take the longest to compile and contribute the most to binary size. For MMQ I reuse the GGML_CUDA_FORCE_CUBLAS option; for FlashAttention I added a new GGML_CUDA_FA option.
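For reference, a minimal configuration sketch of the slimmer build this enables. It assumes the options land under the names used in this PR (GGML_CUDA_FA is new here; GGML_CUDA and GGML_CUDA_FORCE_CUBLAS already exist) and a consumer project that vendors ggml as a subdirectory:

# Request a slimmer CUDA build: no FlashAttention kernels, no MMQ kernels.
set(GGML_CUDA              ON  CACHE BOOL "enable the CUDA backend"               FORCE)
set(GGML_CUDA_FA           OFF CACHE BOOL "skip the FlashAttention kernels"       FORCE)
set(GGML_CUDA_FORCE_CUBLAS ON  CACHE BOOL "skip MMQ, always multiply via cuBLAS"  FORCE)
add_subdirectory(ggml)

The same switches can equally be passed on the cmake command line when configuring the build directly.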