diff --git a/Cargo.lock b/Cargo.lock index 295b25822..5942afc2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2287,7 +2287,7 @@ dependencies = [ [[package]] name = "mistralrs" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "candle-core", @@ -2301,7 +2301,7 @@ dependencies = [ [[package]] name = "mistralrs-bench" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "candle-core", @@ -2317,7 +2317,7 @@ dependencies = [ [[package]] name = "mistralrs-core" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "akin", @@ -2381,7 +2381,7 @@ dependencies = [ [[package]] name = "mistralrs-paged-attn" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "bindgen_cuda 0.1.6", @@ -2391,7 +2391,7 @@ dependencies = [ [[package]] name = "mistralrs-pyo3" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "base64 0.22.1", @@ -2412,7 +2412,7 @@ dependencies = [ [[package]] name = "mistralrs-server" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "anyhow", @@ -2440,7 +2440,7 @@ dependencies = [ [[package]] name = "mistralrs-vision" -version = "0.1.26" +version = "0.2.0" dependencies = [ "candle-core", "image", diff --git a/Cargo.toml b/Cargo.toml index 157af9b87..358936ee4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ exclude = [ resolver = "2" [workspace.package] -version = "0.1.26" +version = "0.2.0" edition = "2021" description = "Fast and easy LLM serving." homepage = "https://github.com/EricLBuehler/mistral.rs" diff --git a/README.md b/README.md index 8965aa6ae..27303994c 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Mistal.rs supports several model categories: ## Description **Fast**: - Quantized model support: 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit for faster inference and optimized memory usage. -- Continuous batching and Paged Attention support. +- Continuous batching and PagedAttention support. - Prefix caching. - [Device mapping](docs/DEVICE_MAPPING.md): load and run some layers on the device and the rest on the CPU. @@ -90,7 +90,7 @@ Mistal.rs supports several model categories: - AnyMoE: Build a memory-efficient MoE model from anything, in seconds - [Paper](https://arxiv.org/abs/2405.19076) - [Docs](docs/ANYMOE.md) -- Paged Attention: [docs](docs/PAGED_ATTENTION.md) +- PagedAttention: [docs](docs/PAGED_ATTENTION.md) This is a demo of interactive mode with streaming running Phi 3 128k mini with quantization via ISQ to Q4K. @@ -189,7 +189,7 @@ Please submit more benchmarks via raising an issue! > Note: You can use our [Docker containers here](https://github.com/EricLBuehler/mistral.rs/pkgs/container/mistral.rs). > Learn more about running Docker containers: https://docs.docker.com/engine/reference/run/ -> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.1.26) +> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.2.0) - Install the [Python package here](mistralrs-pyo3/README.md). 
diff --git a/docs/ISQ.md b/docs/ISQ.md index 36e005454..e6b5de5c7 100644 --- a/docs/ISQ.md +++ b/docs/ISQ.md @@ -46,7 +46,7 @@ let pipeline = loader.load_model_from_hf( false, DeviceMapMetadata::dummy(), Some(GgmlDType::Q4K), - None, // No Paged Attention yet + None, // No PagedAttention yet )?; ``` diff --git a/docs/PAGED_ATTENTION.md b/docs/PAGED_ATTENTION.md index 835e82052..8ee028e4c 100644 --- a/docs/PAGED_ATTENTION.md +++ b/docs/PAGED_ATTENTION.md @@ -1,12 +1,12 @@ -# Paged Attention in mistral.rs +# PagedAttention in mistral.rs -Mistral.rs supports Paged Attention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA. +Mistral.rs supports PagedAttention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA devices on Unix-like platforms such as WSL, Linux, or Mac. -Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, Paged Attention is actiated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools. +Our PagedAttention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, PagedAttention is activated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools. > Note: The default block size if not specified is 32. -> Note: if OOM happens (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it happens because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache. +> Note: if OOM occurs (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it is likely because the PagedAttention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache. > Note: Paged Attention is not enabled on Windows platforms, only Unix-based platforms. @@ -21,7 +21,7 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and b - GGUF models - Vision models -> Note: the prefix cacher will be disabled when using Paged Attention regardless of settings. This functionality will be added soon! +> Note: the prefix cacher will be disabled when using PagedAttention regardless of settings. This functionality will be added soon!
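The PAGED_ATTENTION.md text above describes the two knobs (GPU KV cache memory size and block size) that the `calculate_cache_config` hunks later in this diff turn into a block count and an "available context length" log line. Below is a minimal sketch of that arithmetic under an assumed model shape; the per-block byte count here is an illustrative stand-in, since the real value depends on the model's layer count, KV-head count, head dimension, and KV dtype.

```rust
// Illustrative sketch only: mirrors the shape of the `calculate_cache_config` /
// `mb_to_blocks!` hunks in this diff. The per-block byte count is a made-up
// stand-in for values that really come from the model config.
const SIZE_IN_MB: usize = 1024 * 1024;

/// Bytes needed to hold one KV-cache block (both K and V) across every layer,
/// for a hypothetical model: 32 layers, 8 KV heads, head_dim 128, f16 (2 bytes).
fn block_bytes(block_size: usize) -> usize {
    let (layers, kv_heads, head_dim, dtype_size): (usize, usize, usize, usize) = (32, 8, 128, 2);
    2 * layers * kv_heads * head_dim * dtype_size * block_size
}

fn main() {
    let mem_gpu_mb: usize = 4096; // e.g. `--pa-gpu-mem 4096`
    let block_size: usize = 32;   // default when `--pa-blk-size` is not given
    let num_gpu_blocks = (mem_gpu_mb * SIZE_IN_MB) / block_bytes(block_size);
    // This is the "available context length is N tokens" figure logged at startup.
    println!(
        "{num_gpu_blocks} GPU blocks -> ~{} tokens of KV cache",
        num_gpu_blocks * block_size
    );
}
```

With these assumed numbers, 4 GB of KV cache at the default block size of 32 works out to 1024 blocks, i.e. roughly a 32k-token budget shared across all running sequences.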
## Using the CLI diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index e494d8248..19ecd6d5f 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -18,7 +18,7 @@ candle-core.workspace = true serde.workspace = true serde_json.workspace = true clap.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } tracing.workspace = true either.workspace = true tokio.workspace = true diff --git a/mistralrs-bench/src/main.rs b/mistralrs-bench/src/main.rs index a3e4225e7..308f96e9c 100644 --- a/mistralrs-bench/src/main.rs +++ b/mistralrs-bench/src/main.rs @@ -279,23 +279,23 @@ struct Args { #[arg(short, long, value_parser, value_delimiter = ';')] num_device_layers: Option>, - /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to - /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to + /// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-gpu-mem")] paged_attn_gpu_mem: Option, - /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1. - /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. + /// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. /// This is always used over `pa-gpu-mem` if both are specified. #[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, - /// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32. - /// Paged Attention is only supported on CUDA and is always automatically activated. + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. + /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] paged_attn_block_size: Option, - /// Disable Paged Attention on CUDA. + /// Disable PagedAttention on CUDA. 
#[arg(long = "no_paged_attn", default_value_t = false)] no_paged_attn: bool, } diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml index 24cee4c3e..7ba0d5f13 100644 --- a/mistralrs-core/Cargo.toml +++ b/mistralrs-core/Cargo.toml @@ -64,13 +64,13 @@ tracing-subscriber.workspace = true derive-new = "0.6.0" itertools = "0.13.0" sysinfo = "0.30.12" -mistralrs-vision = { version = "0.1.13", path = "../mistralrs-vision" } +mistralrs-vision = { version = "0.2.0", path = "../mistralrs-vision" } csv = "1.3.0" reqwest.workspace = true base64.workspace = true bytemuck_derive = "1.7.0" plotly = { version = "0.9.0", features = ["kaleido"], optional = true } -mistralrs-paged-attn = { version = "0.1.13", path = "../mistralrs-paged-attn", optional = true } +mistralrs-paged-attn = { version = "0.2.0", path = "../mistralrs-paged-attn", optional = true } [features] default = ["dep:plotly"] diff --git a/mistralrs-core/src/dummy_paged_attention/mod.rs b/mistralrs-core/src/dummy_paged_attention/mod.rs index 9f67e8207..a3438cb20 100644 --- a/mistralrs-core/src/dummy_paged_attention/mod.rs +++ b/mistralrs-core/src/dummy_paged_attention/mod.rs @@ -88,14 +88,14 @@ pub fn calculate_cache_config( let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32; let used = total - free; let size = (total * f - used) as usize; - info!("Allocating {size} MB for Paged Attention KV cache"); + info!("Allocating {size} MB for PagedAttention KV cache"); size } }; let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); - info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); + info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); Ok(CacheConfig { block_size, num_gpu_blocks, diff --git a/mistralrs-core/src/models/gemma2.rs b/mistralrs-core/src/models/gemma2.rs index 7fb16273f..3ed16af87 100644 --- a/mistralrs-core/src/models/gemma2.rs +++ b/mistralrs-core/src/models/gemma2.rs @@ -483,7 +483,7 @@ impl Model { let vb_l = vb_m.pp("layers"); if matches!(attention_mechanism, AttentionImplementation::PagedAttention) { // TODO softcapping in paged attn - candle_core::bail!("Gemma 2 does not support Paged Attention."); + candle_core::bail!("Gemma 2 does not support PagedAttention."); } for layer_idx in NiceProgressBar::<_, 'b'>(0..cfg.num_hidden_layers, "Loading repeating layers") diff --git a/mistralrs-core/src/paged_attention/mod.rs b/mistralrs-core/src/paged_attention/mod.rs index 59f6bf8cd..abf44a34a 100644 --- a/mistralrs-core/src/paged_attention/mod.rs +++ b/mistralrs-core/src/paged_attention/mod.rs @@ -92,14 +92,14 @@ pub fn calculate_cache_config( let total = MemoryUsage.get_total_memory(device)? 
as f32 / SIZE_IN_MB as f32; let used = total - free; let size = (total * f - used) as usize; - info!("Allocating {size} MB for Paged Attention KV cache"); + info!("Allocating {size} MB for PagedAttention KV cache"); size } }; let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); - info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); + info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); Ok(CacheConfig { block_size, num_gpu_blocks, diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 87a9d302e..9ad3fc357 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -444,7 +444,7 @@ pub struct GeneralMetadata { pub is_xlora: bool, pub activation_dtype: DType, pub sliding_window: Option, - // Paged Attention stuff + // PagedAttention stuff pub cache_config: Option, pub cache_engine: Option, } diff --git a/mistralrs-core/src/scheduler/mod.rs b/mistralrs-core/src/scheduler/mod.rs index 25aff8a07..5f4b4fb5f 100644 --- a/mistralrs-core/src/scheduler/mod.rs +++ b/mistralrs-core/src/scheduler/mod.rs @@ -52,7 +52,7 @@ pub trait Scheduler { /// This may do nothing. It depends on the implementation fn free_finished_sequence_groups(&mut self); - // Paged Attention metadata + // PagedAttention metadata fn block_tables(&self) -> Option<&BlockTables>; fn block_size(&self) -> Option; fn block_engine(&mut self) -> Option<&mut BlockEngine>; diff --git a/mistralrs-paged-attn/src/backend/paged_attention.rs b/mistralrs-paged-attn/src/backend/paged_attention.rs index 0b264999e..d3a3ed812 100644 --- a/mistralrs-paged-attn/src/backend/paged_attention.rs +++ b/mistralrs-paged-attn/src/backend/paged_attention.rs @@ -250,7 +250,7 @@ impl candle::CustomOp1 for PagedAttention { } } -/// Paged Attention layer. +/// PagedAttention layer. /// /// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`. 
/// Multi-query and grouped-query attention are supported by using tensors key_cache and value_cache diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml index 004b9d374..2f8613309 100644 --- a/mistralrs-pyo3/Cargo.toml +++ b/mistralrs-pyo3/Cargo.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features = ["pyo3_macros"] } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features = ["pyo3_macros"] } serde.workspace = true serde_json.workspace = true candle-core.workspace = true diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml index 130b5b123..ae85ece7d 100644 --- a/mistralrs-pyo3/Cargo_template.toml +++ b/mistralrs-pyo3/Cargo_template.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } serde.workspace = true serde_json.workspace = true candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.6.0", rev = "c967be9", features=["$feature_name"] } diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index 82f51b1b6..738ab8866 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -211,11 +211,11 @@ class Runner: the corresponding number of layers. - `in_situ_quant` sets the optional in-situ quantization for models that are not quantized (not GGUF or GGML). - `anymoe_config` specifies the AnyMoE config. If this is set, then the model will be loaded as an AnyMoE model. - - `pa_gpu_mem` sets GPU memory to allocate for KV cache with Paged Attention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is - CUDA, it will default to using 90% of the total memory after allocation of the KV cache. Paged Attention is only supported on CUDA and is always automatically activated. - - `pa_blk_size` sets the block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, - it will default to 32. Paged Attention is only supported on CUDA and is always automatically activated. - - `no_paged_attn` disables Paged Attention on CUDA + - `pa_gpu_mem` sets GPU memory to allocate for KV cache with PagedAttention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is + CUDA, it will default to using 90% of the total memory after allocation of the KV cache. PagedAttention is only supported on CUDA and is always automatically activated. + - `pa_blk_size` sets the block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, + it will default to 32. PagedAttention is only supported on CUDA and is always automatically activated. + - `no_paged_attn` disables PagedAttention on CUDA """ ... 
diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml index 15da54830..e4e813d36 100644 --- a/mistralrs-pyo3/pyproject.toml +++ b/mistralrs-pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "mistralrs" -version = "0.1.26" +version = "0.2.0" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml index dda899334..67403d2aa 100644 --- a/mistralrs-pyo3/pyproject_template.toml +++ b/mistralrs-pyo3/pyproject_template.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "$name" -version = "0.1.26" +version = "0.2.0" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml index 6909c0b9c..c73daa0c8 100644 --- a/mistralrs-server/Cargo.toml +++ b/mistralrs-server/Cargo.toml @@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] } tower-http = { version = "0.5.1", features = ["cors"]} utoipa = { version = "4.2", features = ["axum_extras"] } utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } indexmap.workspace = true accelerate-src = { workspace = true, optional = true } intel-mkl-src = { workspace = true, optional = true } diff --git a/mistralrs-server/src/main.rs b/mistralrs-server/src/main.rs index 1a01b6301..6e4ddfeba 100644 --- a/mistralrs-server/src/main.rs +++ b/mistralrs-server/src/main.rs @@ -123,23 +123,23 @@ struct Args { #[arg(long = "isq", value_parser = parse_isq)] in_situ_quant: Option, - /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to - /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to + /// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-gpu-mem")] paged_attn_gpu_mem: Option, - /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1. - /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. + /// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. /// This is always used over `pa-gpu-mem` if both are specified. #[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, - /// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32. - /// Paged Attention is only supported on CUDA and is always automatically activated. + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. + /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] paged_attn_block_size: Option, - /// Disable Paged Attention on CUDA. + /// Disable PagedAttention on CUDA. 
#[arg(long = "no-paged-attn", default_value_t = false)] no_paged_attn: bool, } diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index df2682a95..a37f3e769 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -12,7 +12,7 @@ license.workspace = true homepage.workspace = true [dependencies] -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } anyhow.workspace = true tokio.workspace = true candle-core.workspace = true
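The scheduler hunk earlier in this diff only renames a comment to "PagedAttention metadata", but the `block_tables`/`block_size` accessors it annotates are the core of the technique from the paper linked in docs/PAGED_ATTENTION.md. Here is a toy, self-contained illustration of the idea; the types below are hypothetical stand-ins, not mistral.rs's actual `BlockTables`/`BlockEngine`.

```rust
// Toy illustration of the block-table indirection behind PagedAttention.
// Each sequence owns a small table mapping logical token positions onto
// non-contiguous physical KV-cache blocks.
struct SequenceBlockTable {
    block_size: usize,
    /// Physical KV-cache block numbers, in logical order for one sequence.
    blocks: Vec<usize>,
}

impl SequenceBlockTable {
    /// Map a logical token position to its physical (block, offset) slot.
    fn slot(&self, token_pos: usize) -> (usize, usize) {
        let block = self.blocks[token_pos / self.block_size];
        (block, token_pos % self.block_size)
    }
}

fn main() {
    // A 70-token sequence spread over three non-contiguous 32-token blocks.
    let table = SequenceBlockTable { block_size: 32, blocks: vec![7, 3, 11] };
    assert_eq!(table.slot(0), (7, 0));
    assert_eq!(table.slot(40), (3, 8));
    assert_eq!(table.slot(69), (11, 5));
    let (block, offset) = table.slot(40);
    println!("token 40 lives in block {block} at offset {offset}");
}
```

Because blocks are handed out on demand and need not be contiguous, the KV cache budgeted up front can be shared across sequences without per-sequence preallocation, which is what makes the "GPU blocks" figure above meaningful.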