Bump version to 0.2.0 and update docs (#582)
EricLBuehler authored Jul 19, 2024
1 parent 1af54bc commit fecd84a
Showing 22 changed files with 53 additions and 53 deletions.
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -13,7 +13,7 @@ exclude = [
resolver = "2"

[workspace.package]
version = "0.1.26"
version = "0.2.0"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
6 changes: 3 additions & 3 deletions README.md
@@ -65,7 +65,7 @@ Mistral.rs supports several model categories:
## Description
**Fast**:
- Quantized model support: 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit for faster inference and optimized memory usage.
- Continuous batching and Paged Attention support.
- Continuous batching and PagedAttention support.
- Prefix caching.
- [Device mapping](docs/DEVICE_MAPPING.md): load and run some layers on the device and the rest on the CPU.

@@ -90,7 +90,7 @@ Mistral.rs supports several model categories:
- AnyMoE: Build a memory-efficient MoE model from anything, in seconds
- [Paper](https://arxiv.org/abs/2405.19076)
- [Docs](docs/ANYMOE.md)
- Paged Attention: [docs](docs/PAGED_ATTENTION.md)
- PagedAttention: [docs](docs/PAGED_ATTENTION.md)


This is a demo of interactive mode with streaming, running Phi 3 128k mini with quantization via ISQ to Q4K.
@@ -189,7 +189,7 @@ Please submit more benchmarks via raising an issue!
> Note: You can use our [Docker containers here](https://github.com/EricLBuehler/mistral.rs/pkgs/container/mistral.rs).
> Learn more about running Docker containers: https://docs.docker.com/engine/reference/run/
> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.1.26)
> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.2.0)
- Install the [Python package here](mistralrs-pyo3/README.md).

2 changes: 1 addition & 1 deletion docs/ISQ.md
@@ -46,7 +46,7 @@ let pipeline = loader.load_model_from_hf(
false,
DeviceMapMetadata::dummy(),
Some(GgmlDType::Q4K),
None, // No Paged Attention yet
None, // No PagedAttention yet
)?;
```
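
For a rough sense of what ISQ to Q4K buys in memory terms, here is a minimal back-of-the-envelope sketch (not from the repository docs; the ~4.5 bits-per-weight figure for Q4_K and the 7B parameter count are assumptions for illustration):

```rust
// Back-of-the-envelope weight-memory estimate for ISQ (illustrative only).
// Assumption: Q4_K averages ~4.5 bits per weight; f16 uses 16 bits per weight.
fn approx_weight_gib(n_params: f64, bits_per_weight: f64) -> f64 {
    n_params * bits_per_weight / 8.0 / 1024f64.powi(3)
}

fn main() {
    let n_params = 7.0e9; // e.g. a 7B-parameter model
    println!("f16 : {:.1} GiB", approx_weight_gib(n_params, 16.0)); // ~13.0 GiB
    println!("Q4K : {:.1} GiB", approx_weight_gib(n_params, 4.5)); // ~3.7 GiB
}
```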

10 changes: 5 additions & 5 deletions docs/PAGED_ATTENTION.md
@@ -1,12 +1,12 @@
# Paged Attention in mistral.rs
# PagedAttention in mistral.rs

Mistral.rs supports Paged Attention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA.
Mistral.rs supports PagedAttention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA devices on Unix-like platforms such as WSL, Linux, or Mac.

Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, Paged Attention is activated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools.
Our PagedAttention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, PagedAttention is activated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools.

> Note: The default block size, if not specified, is 32.
> Note: if OOM happens (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it happens because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache.
> Note: if OOM occurs (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it is likely because the PagedAttention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache.
> Note: Paged Attention is not enabled on Windows platforms, only Unix-based platforms.
@@ -21,7 +21,7 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size
- GGUF models
- Vision models

> Note: the prefix cacher will be disabled when using Paged Attention regardless of settings. This functionality will be added soon!
> Note: the prefix cacher will be disabled when using PagedAttention regardless of settings. This functionality will be added soon!
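
To make the relationship between KV cache memory, block size, and available context length concrete, here is a minimal sketch of the arithmetic. The per-token cache-size formula and all names below are assumptions for illustration; the crate's actual `mb_to_blocks!` macro (visible in the `paged_attention/mod.rs` hunk later in this diff) may differ in detail.

```rust
// Illustrative sketch: mapping a KV cache budget to PagedAttention blocks
// and usable context length. Assumed per-token KV cache size:
// 2 tensors (K and V) * layers * KV heads * head dim * bytes per element.
struct ModelShape {
    num_layers: usize,
    num_kv_heads: usize,
    head_dim: usize,
}

fn available_context(
    mem_gpu_mb: usize,
    block_size: usize,
    dtype_size: usize,
    m: &ModelShape,
) -> (usize, usize) {
    let bytes_per_token = 2 * m.num_layers * m.num_kv_heads * m.head_dim * dtype_size;
    let bytes_per_block = bytes_per_token * block_size;
    let num_gpu_blocks = (mem_gpu_mb * 1024 * 1024) / bytes_per_block;
    (num_gpu_blocks, num_gpu_blocks * block_size) // (blocks, tokens of context)
}

fn main() {
    // Assumed 7B-class shape: 32 layers, 8 KV heads, head dim 128, f16 cache (2 bytes/element).
    let shape = ModelShape { num_layers: 32, num_kv_heads: 8, head_dim: 128 };
    let (blocks, tokens) = available_context(4096, 32, 2, &shape);
    println!("{blocks} GPU blocks -> {tokens} tokens of available context");
}
```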
## Using the CLI

2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -18,7 +18,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" }
mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" }
tracing.workspace = true
either.workspace = true
tokio.workspace = true
14 changes: 7 additions & 7 deletions mistralrs-bench/src/main.rs
@@ -279,23 +279,23 @@ struct Args {
#[arg(short, long, value_parser, value_delimiter = ';')]
num_device_layers: Option<Vec<String>>,

/// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to
/// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
/// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to
/// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-gpu-mem")]
paged_attn_gpu_mem: Option<usize>,

/// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1.
/// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
/// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1.
/// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated.
/// This is always used over `pa-gpu-mem` if both are specified.
#[arg(long = "pa-gpu-mem-usage")]
paged_attn_gpu_mem_usage: Option<f32>,

/// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32.
/// Paged Attention is only supported on CUDA and is always automatically activated.
/// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32.
/// PagedAttention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-blk-size")]
paged_attn_block_size: Option<usize>,

/// Disable Paged Attention on CUDA.
/// Disable PagedAttention on CUDA.
#[arg(long = "no_paged_attn", default_value_t = false)]
no_paged_attn: bool,
}
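
The precedence between `pa-gpu-mem` and `pa-gpu-mem-usage` documented above can be summarized as follows. This is a sketch of the documented behavior only; the `KvCacheBudget` enum and `resolve_budget` function are invented for illustration and are not types in this crate.

```rust
// Sketch of how the documented PagedAttention memory flags resolve (illustrative only).
enum KvCacheBudget {
    /// Fraction of total GPU memory to use, 0.0..=1.0 (maps to `pa-gpu-mem-usage`).
    Utilization(f32),
    /// Absolute KV cache size in MB (maps to `pa-gpu-mem`).
    MegaBytes(usize),
}

fn resolve_budget(pa_gpu_mem: Option<usize>, pa_gpu_mem_usage: Option<f32>) -> KvCacheBudget {
    match (pa_gpu_mem_usage, pa_gpu_mem) {
        // `pa-gpu-mem-usage` always wins when both flags are given.
        (Some(frac), _) => KvCacheBudget::Utilization(frac),
        (None, Some(mb)) => KvCacheBudget::MegaBytes(mb),
        // Neither flag set on CUDA: default to 90% utilization.
        (None, None) => KvCacheBudget::Utilization(0.9),
    }
}

fn main() {
    assert!(matches!(resolve_budget(None, None), KvCacheBudget::Utilization(f) if f == 0.9));
    assert!(matches!(resolve_budget(Some(4096), Some(0.5)), KvCacheBudget::Utilization(_)));
}
```

`pa-blk-size` is independent of this choice and defaults to 32 when unset, and `no_paged_attn` disables the feature entirely.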
4 changes: 2 additions & 2 deletions mistralrs-core/Cargo.toml
@@ -64,13 +64,13 @@ tracing-subscriber.workspace = true
derive-new = "0.6.0"
itertools = "0.13.0"
sysinfo = "0.30.12"
mistralrs-vision = { version = "0.1.13", path = "../mistralrs-vision" }
mistralrs-vision = { version = "0.2.0", path = "../mistralrs-vision" }
csv = "1.3.0"
reqwest.workspace = true
base64.workspace = true
bytemuck_derive = "1.7.0"
plotly = { version = "0.9.0", features = ["kaleido"], optional = true }
mistralrs-paged-attn = { version = "0.1.13", path = "../mistralrs-paged-attn", optional = true }
mistralrs-paged-attn = { version = "0.2.0", path = "../mistralrs-paged-attn", optional = true }

[features]
default = ["dep:plotly"]
4 changes: 2 additions & 2 deletions mistralrs-core/src/dummy_paged_attention/mod.rs
@@ -88,14 +88,14 @@ pub fn calculate_cache_config(
let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
let used = total - free;
let size = (total * f - used) as usize;
info!("Allocating {size} MB for Paged Attention KV cache");
info!("Allocating {size} MB for PagedAttention KV cache");
size
}
};

let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
Ok(CacheConfig {
block_size,
num_gpu_blocks,
2 changes: 1 addition & 1 deletion mistralrs-core/src/models/gemma2.rs
@@ -483,7 +483,7 @@ impl Model {
let vb_l = vb_m.pp("layers");
if matches!(attention_mechanism, AttentionImplementation::PagedAttention) {
// TODO softcapping in paged attn
candle_core::bail!("Gemma 2 does not support Paged Attention.");
candle_core::bail!("Gemma 2 does not support PagedAttention.");
}
for layer_idx in
NiceProgressBar::<_, 'b'>(0..cfg.num_hidden_layers, "Loading repeating layers")
4 changes: 2 additions & 2 deletions mistralrs-core/src/paged_attention/mod.rs
@@ -92,14 +92,14 @@ pub fn calculate_cache_config(
let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32;
let used = total - free;
let size = (total * f - used) as usize;
info!("Allocating {size} MB for Paged Attention KV cache");
info!("Allocating {size} MB for PagedAttention KV cache");
size
}
};

let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config);
let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config);
info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size);
Ok(CacheConfig {
block_size,
num_gpu_blocks,
2 changes: 1 addition & 1 deletion mistralrs-core/src/pipeline/mod.rs
@@ -444,7 +444,7 @@ pub struct GeneralMetadata {
pub is_xlora: bool,
pub activation_dtype: DType,
pub sliding_window: Option<usize>,
// Paged Attention stuff
// PagedAttention stuff
pub cache_config: Option<CacheConfig>,
pub cache_engine: Option<CacheEngine>,
}
2 changes: 1 addition & 1 deletion mistralrs-core/src/scheduler/mod.rs
@@ -52,7 +52,7 @@ pub trait Scheduler {
/// This may do nothing. It depends on the implementation
fn free_finished_sequence_groups(&mut self);

// Paged Attention metadata
// PagedAttention metadata
fn block_tables(&self) -> Option<&BlockTables>;
fn block_size(&self) -> Option<usize>;
fn block_engine(&mut self) -> Option<&mut BlockEngine>;
2 changes: 1 addition & 1 deletion mistralrs-paged-attn/src/backend/paged_attention.rs
@@ -250,7 +250,7 @@ impl candle::CustomOp1 for PagedAttention {
}
}

/// Paged Attention layer.
/// PagedAttention layer.
///
/// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`.
/// Multi-query and grouped-query attention are supported by using tensors key_cache and value_cache
2 changes: 1 addition & 1 deletion mistralrs-pyo3/Cargo.toml
@@ -17,7 +17,7 @@ doc = false

[dependencies]
pyo3.workspace = true
mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features = ["pyo3_macros"] }
mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features = ["pyo3_macros"] }
serde.workspace = true
serde_json.workspace = true
candle-core.workspace = true
2 changes: 1 addition & 1 deletion mistralrs-pyo3/Cargo_template.toml
@@ -17,7 +17,7 @@ doc = false

[dependencies]
pyo3.workspace = true
mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] }
mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] }
serde.workspace = true
serde_json.workspace = true
candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.6.0", rev = "c967be9", features=["$feature_name"] }
10 changes: 5 additions & 5 deletions mistralrs-pyo3/mistralrs.pyi
@@ -211,11 +211,11 @@ class Runner:
the corresponding number of layers.
- `in_situ_quant` sets the optional in-situ quantization for models that are not quantized (not GGUF or GGML).
- `anymoe_config` specifies the AnyMoE config. If this is set, then the model will be loaded as an AnyMoE model.
- `pa_gpu_mem` sets GPU memory to allocate for KV cache with Paged Attention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is
CUDA, it will default to using 90% of the total memory after allocation of the KV cache. Paged Attention is only supported on CUDA and is always automatically activated.
- `pa_blk_size` sets the block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA,
it will default to 32. Paged Attention is only supported on CUDA and is always automatically activated.
- `no_paged_attn` disables Paged Attention on CUDA
- `pa_gpu_mem` sets GPU memory to allocate for KV cache with PagedAttention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is
CUDA, it will default to using 90% of the total memory after allocation of the KV cache. PagedAttention is only supported on CUDA and is always automatically activated.
- `pa_blk_size` sets the block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA,
it will default to 32. PagedAttention is only supported on CUDA and is always automatically activated.
- `no_paged_attn` disables PagedAttention on CUDA
"""
...

2 changes: 1 addition & 1 deletion mistralrs-pyo3/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "mistralrs"
version = "0.1.26"
version = "0.2.0"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Rust",
2 changes: 1 addition & 1 deletion mistralrs-pyo3/pyproject_template.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "$name"
version = "0.1.26"
version = "0.2.0"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Rust",
2 changes: 1 addition & 1 deletion mistralrs-server/Cargo.toml
@@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] }
tower-http = { version = "0.5.1", features = ["cors"]}
utoipa = { version = "4.2", features = ["axum_extras"] }
utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]}
mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" }
mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" }
indexmap.workspace = true
accelerate-src = { workspace = true, optional = true }
intel-mkl-src = { workspace = true, optional = true }
14 changes: 7 additions & 7 deletions mistralrs-server/src/main.rs
@@ -123,23 +123,23 @@ struct Args {
#[arg(long = "isq", value_parser = parse_isq)]
in_situ_quant: Option<GgmlDType>,

/// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to
/// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
/// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to
/// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-gpu-mem")]
paged_attn_gpu_mem: Option<usize>,

/// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1.
/// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated.
/// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1.
/// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated.
/// This is always used over `pa-gpu-mem` if both are specified.
#[arg(long = "pa-gpu-mem-usage")]
paged_attn_gpu_mem_usage: Option<f32>,

/// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32.
/// Paged Attention is only supported on CUDA and is always automatically activated.
/// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32.
/// PagedAttention is only supported on CUDA and is always automatically activated.
#[arg(long = "pa-blk-size")]
paged_attn_block_size: Option<usize>,

/// Disable Paged Attention on CUDA.
/// Disable PagedAttention on CUDA.
#[arg(long = "no-paged-attn", default_value_t = false)]
no_paged_attn: bool,
}
2 changes: 1 addition & 1 deletion mistralrs/Cargo.toml
@@ -12,7 +12,7 @@ license.workspace = true
homepage.workspace = true

[dependencies]
mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" }
mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" }
anyhow.workspace = true
tokio.workspace = true
candle-core.workspace = true
