diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..a8e0d13 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,63 @@ +{ + "files.associations": { + "*.yml": "yaml", + "*.ke": "Kestrel", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "deque": "cpp", + "unordered_map": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "ratio": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "initializer_list": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "mutex": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "ranges": "cpp", + "stdexcept": "cpp", + "stop_token": "cpp", + "streambuf": "cpp", + "thread": "cpp", + "typeinfo": "cpp", + "__nullptr": "cpp", + "__bit_reference": "cpp", + "__functional_base": "cpp", + "__memory": "cpp" + } +} \ No newline at end of file diff --git a/csrc/cache.h b/csrc/cache.h index da49d91..d779cba 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -19,10 +19,3 @@ void reshape_and_cache( torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping); - -void gather_cached_kv( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping); diff --git a/csrc/ops.h b/csrc/ops.h index cfb18fb..8e26548 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -29,18 +29,6 @@ void paged_attention_v2( int 
max_context_len, const c10::optional<torch::Tensor>& alibi_slopes); -void rms_norm( - torch::Tensor& out, - torch::Tensor& input, - torch::Tensor& weight, - float epsilon); - -void fused_add_rms_norm( - torch::Tensor& input, - torch::Tensor& residual, - torch::Tensor& weight, - float epsilon); - void rotary_embedding( torch::Tensor& positions, torch::Tensor& query, @@ -48,28 +36,3 @@ void rotary_embedding( int head_size, torch::Tensor& cos_sin_cache, bool is_neox); - -void silu_and_mul( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_new( - torch::Tensor& out, - torch::Tensor& input); - -void gelu_fast( - torch::Tensor& out, - torch::Tensor& input); - -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters); - -void squeezellm_gemm( - torch::Tensor vec, - torch::Tensor mat, - torch::Tensor mul, - torch::Tensor lookup_table); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp deleted file mode 100644 index 9e31429..0000000 --- a/csrc/pybind.cpp +++ /dev/null @@ -1,80 +0,0 @@ -#include "cache.h" -#include "cuda_utils.h" -#include "ops.h" -#include <torch/extension.h> - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - // vLLM custom ops - pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); - - // Attention ops - ops.def( - "paged_attention_v1", - &paged_attention_v1, - "Compute the attention between an input query and the cached keys/values using PagedAttention."); - ops.def( - "paged_attention_v2", - &paged_attention_v2, - "PagedAttention V2."); - - // Activation ops - ops.def( - "silu_and_mul", - &silu_and_mul, - "Activation function used in SwiGLU."); - ops.def( - "gelu_new", - &gelu_new, - "GELU implementation used in GPT-2."); - ops.def( - "gelu_fast", - &gelu_fast, - "Approximate GELU implementation."); - - // Layernorm - ops.def( - "rms_norm", - &rms_norm, - "Apply Root Mean Square (RMS) Normalization to the input tensor."); - - ops.def( - "fused_add_rms_norm", - 
&fused_add_rms_norm, - "In-place fused Add and RMS Normalization"); - - // Rotary embedding - ops.def( - "rotary_embedding", - &rotary_embedding, - "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); - - // Quantization ops - ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); - ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); - - // Cache ops - pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); - cache_ops.def( - "swap_blocks", - &swap_blocks, - "Swap in (out) the cache blocks from src to dst"); - cache_ops.def( - "copy_blocks", - &copy_blocks, - "Copy the cache blocks from src to dst"); - cache_ops.def( - "reshape_and_cache", - &reshape_and_cache, - "Reshape the key and value tensors and cache them"); - cache_ops.def( - "gather_cached_kv", - &gather_cached_kv, - "Gather key and value from the cache into contiguous QKV tensors"); - - // Cuda utils - pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); - cuda_utils.def( - "get_device_attribute", - &get_device_attribute, - "Gets the specified device attribute."); -} diff --git a/csrc/rustbind.cpp b/csrc/rustbind.cpp new file mode 100644 index 0000000..489074b --- /dev/null +++ b/csrc/rustbind.cpp @@ -0,0 +1,3 @@ +#include "cache.h" +#include "cuda_utils.h" +#include "ops.h" \ No newline at end of file diff --git a/setup.py b/setup.py index d937c8c..c5192ba 100644 --- a/setup.py +++ b/setup.py @@ -148,13 +148,16 @@ def get_torch_arch_list() -> Set[str]: "csrc/cache_kernels.cu", "csrc/attention/attention_kernels.cu", "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/awq/gemm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/ops.h", - "csrc/cache.h", + #"csrc/activation_kernels.cu", + #"csrc/layernorm_kernels.cu", + #"csrc/quantization/awq/gemm_kernels.cu", + 
#"csrc/quantization/squeezellm/quant_cuda_kernel.cu", + #"csrc/cuda_utils_kernels.cu", + + #"csrc/ops.h", + #"csrc/cache.h", + + "csrc/rustbind.cpp" ], extra_compile_args={ "cxx": CXX_FLAGS,