diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..a8e0d13
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,63 @@
+{
+    "files.associations": {
+        "*.yml": "yaml",
+        "*.ke": "Kestrel",
+        "array": "cpp",
+        "atomic": "cpp",
+        "bit": "cpp",
+        "*.tcc": "cpp",
+        "bitset": "cpp",
+        "cctype": "cpp",
+        "chrono": "cpp",
+        "clocale": "cpp",
+        "cmath": "cpp",
+        "compare": "cpp",
+        "concepts": "cpp",
+        "cstdarg": "cpp",
+        "cstddef": "cpp",
+        "cstdint": "cpp",
+        "cstdio": "cpp",
+        "cstdlib": "cpp",
+        "cstring": "cpp",
+        "ctime": "cpp",
+        "cwchar": "cpp",
+        "cwctype": "cpp",
+        "deque": "cpp",
+        "unordered_map": "cpp",
+        "vector": "cpp",
+        "exception": "cpp",
+        "algorithm": "cpp",
+        "functional": "cpp",
+        "iterator": "cpp",
+        "memory": "cpp",
+        "memory_resource": "cpp",
+        "numeric": "cpp",
+        "optional": "cpp",
+        "random": "cpp",
+        "ratio": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "system_error": "cpp",
+        "tuple": "cpp",
+        "type_traits": "cpp",
+        "utility": "cpp",
+        "initializer_list": "cpp",
+        "iosfwd": "cpp",
+        "istream": "cpp",
+        "limits": "cpp",
+        "mutex": "cpp",
+        "new": "cpp",
+        "numbers": "cpp",
+        "ostream": "cpp",
+        "ranges": "cpp",
+        "stdexcept": "cpp",
+        "stop_token": "cpp",
+        "streambuf": "cpp",
+        "thread": "cpp",
+        "typeinfo": "cpp",
+        "__nullptr": "cpp",
+        "__bit_reference": "cpp",
+        "__functional_base": "cpp",
+        "__memory": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/csrc/cache.h b/csrc/cache.h
index da49d91..d779cba 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -19,10 +19,3 @@ void reshape_and_cache(
   torch::Tensor& key_cache,
   torch::Tensor& value_cache,
   torch::Tensor& slot_mapping);
-
-void gather_cached_kv(
-  torch::Tensor& key,
-  torch::Tensor& value,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& slot_mapping);
diff --git a/csrc/ops.h b/csrc/ops.h
index cfb18fb..8e26548 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -29,18 +29,6 @@ void paged_attention_v2(
   int max_context_len,
   const c10::optional<torch::Tensor>& alibi_slopes);
 
-void rms_norm(
-  torch::Tensor& out,
-  torch::Tensor& input,
-  torch::Tensor& weight,
-  float epsilon);
-
-void fused_add_rms_norm(
-  torch::Tensor& input,
-  torch::Tensor& residual,
-  torch::Tensor& weight,
-  float epsilon);
-
 void rotary_embedding(
   torch::Tensor& positions,
   torch::Tensor& query,
@@ -48,28 +36,3 @@ void rotary_embedding(
   int head_size,
   torch::Tensor& cos_sin_cache,
   bool is_neox);
-
-void silu_and_mul(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_new(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_fast(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-torch::Tensor awq_gemm(
-  torch::Tensor _in_feats,
-  torch::Tensor _kernel,
-  torch::Tensor _scaling_factors,
-  torch::Tensor _zeros,
-  int split_k_iters);
-
-void squeezellm_gemm(
-  torch::Tensor vec,
-  torch::Tensor mat,
-  torch::Tensor mul,
-  torch::Tensor lookup_table);
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
deleted file mode 100644
index 9e31429..0000000
--- a/csrc/pybind.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-#include "cache.h"
-#include "cuda_utils.h"
-#include "ops.h"
-#include <torch/extension.h>
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  // vLLM custom ops
-  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
-
-  // Attention ops
-  ops.def(
-    "paged_attention_v1",
-    &paged_attention_v1,
-    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
-  ops.def(
-    "paged_attention_v2",
-    &paged_attention_v2,
-    "PagedAttention V2.");
-
-  // Activation ops
-  ops.def(
-    "silu_and_mul",
-    &silu_and_mul,
-    "Activation function used in SwiGLU.");
-  ops.def(
-    "gelu_new",
-    &gelu_new,
-    "GELU implementation used in GPT-2.");
-  ops.def(
-    "gelu_fast",
-    &gelu_fast,
-    "Approximate GELU implementation.");
-
-  // Layernorm
-  ops.def(
-    "rms_norm",
-    &rms_norm,
-    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-
-  ops.def(
-    "fused_add_rms_norm",
-    &fused_add_rms_norm,
-    "In-place fused Add and RMS Normalization");
-
-  // Rotary embedding
-  ops.def(
-    "rotary_embedding",
-    &rotary_embedding,
-    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-
-  // Quantization ops
-  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
-  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
-
-  // Cache ops
-  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
-  cache_ops.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  cache_ops.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
-  cache_ops.def(
-    "gather_cached_kv",
-    &gather_cached_kv,
-    "Gather key and value from the cache into contiguous QKV tensors");
-
-  // Cuda utils
-  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
-  cuda_utils.def(
-    "get_device_attribute",
-    &get_device_attribute,
-    "Gets the specified device attribute.");
-}
diff --git a/csrc/rustbind.cpp b/csrc/rustbind.cpp
new file mode 100644
index 0000000..489074b
--- /dev/null
+++ b/csrc/rustbind.cpp
@@ -0,0 +1,3 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index d937c8c..c5192ba 100644
--- a/setup.py
+++ b/setup.py
@@ -148,13 +148,16 @@ def get_torch_arch_list() -> Set[str]:
         "csrc/cache_kernels.cu",
         "csrc/attention/attention_kernels.cu",
         "csrc/pos_encoding_kernels.cu",
-        "csrc/activation_kernels.cu",
-        "csrc/layernorm_kernels.cu",
-        "csrc/quantization/awq/gemm_kernels.cu",
-        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
-        "csrc/cuda_utils_kernels.cu",
-        "csrc/ops.h",
-        "csrc/cache.h",
+        #"csrc/activation_kernels.cu",
+        #"csrc/layernorm_kernels.cu",
+        #"csrc/quantization/awq/gemm_kernels.cu",
+        #"csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+        #"csrc/cuda_utils_kernels.cu",
+        
+        #"csrc/ops.h",
+        #"csrc/cache.h",
+
+        "csrc/rustbind.cpp"
     ],
     extra_compile_args={
         "cxx": CXX_FLAGS,