diff --git a/Cargo.lock b/Cargo.lock index 295b25822..5942afc2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2287,7 +2287,7 @@ dependencies = [ [[package]] name = "mistralrs" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "candle-core", @@ -2301,7 +2301,7 @@ dependencies = [ [[package]] name = "mistralrs-bench" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "candle-core", @@ -2317,7 +2317,7 @@ dependencies = [ [[package]] name = "mistralrs-core" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "akin", @@ -2381,7 +2381,7 @@ dependencies = [ [[package]] name = "mistralrs-paged-attn" -version = "0.1.26" +version = "0.2.0" dependencies = [ "anyhow", "bindgen_cuda 0.1.6", @@ -2391,7 +2391,7 @@ dependencies = [ [[package]] name = "mistralrs-pyo3" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "base64 0.22.1", @@ -2412,7 +2412,7 @@ dependencies = [ [[package]] name = "mistralrs-server" -version = "0.1.26" +version = "0.2.0" dependencies = [ "accelerate-src", "anyhow", @@ -2440,7 +2440,7 @@ dependencies = [ [[package]] name = "mistralrs-vision" -version = "0.1.26" +version = "0.2.0" dependencies = [ "candle-core", "image", diff --git a/Cargo.toml b/Cargo.toml index 157af9b87..358936ee4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ exclude = [ resolver = "2" [workspace.package] -version = "0.1.26" +version = "0.2.0" edition = "2021" description = "Fast and easy LLM serving." homepage = "https://github.com/EricLBuehler/mistral.rs" diff --git a/README.md b/README.md index 8965aa6ae..27303994c 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Mistal.rs supports several model categories: ## Description **Fast**: - Quantized model support: 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit for faster inference and optimized memory usage. -- Continuous batching and Paged Attention support. +- Continuous batching and PagedAttention support. - Prefix caching. - [Device mapping](docs/DEVICE_MAPPING.md): load and run some layers on the device and the rest on the CPU. @@ -90,7 +90,7 @@ Mistal.rs supports several model categories: - AnyMoE: Build a memory-efficient MoE model from anything, in seconds - [Paper](https://arxiv.org/abs/2405.19076) - [Docs](docs/ANYMOE.md) -- Paged Attention: [docs](docs/PAGED_ATTENTION.md) +- PagedAttention: [docs](docs/PAGED_ATTENTION.md) This is a demo of interactive mode with streaming running Phi 3 128k mini with quantization via ISQ to Q4K. @@ -189,7 +189,7 @@ Please submit more benchmarks via raising an issue! > Note: You can use our [Docker containers here](https://github.com/EricLBuehler/mistral.rs/pkgs/container/mistral.rs). > Learn more about running Docker containers: https://docs.docker.com/engine/reference/run/ -> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.1.26) +> Note: You can use pre-built `mistralrs-server` binaries [here](https://github.com/EricLBuehler/mistral.rs/releases/tag/v0.2.0) - Install the [Python package here](mistralrs-pyo3/README.md). 
diff --git a/docs/ISQ.md b/docs/ISQ.md index 36e005454..e6b5de5c7 100644 --- a/docs/ISQ.md +++ b/docs/ISQ.md @@ -46,7 +46,7 @@ let pipeline = loader.load_model_from_hf( false, DeviceMapMetadata::dummy(), Some(GgmlDType::Q4K), - None, // No Paged Attention yet + None, // No PagedAttention yet )?; ``` diff --git a/docs/PAGED_ATTENTION.md b/docs/PAGED_ATTENTION.md index 835e82052..8ee028e4c 100644 --- a/docs/PAGED_ATTENTION.md +++ b/docs/PAGED_ATTENTION.md @@ -1,12 +1,12 @@ -# Paged Attention in mistral.rs +# PagedAttention in mistral.rs -Mistral.rs supports Paged Attention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA. +Mistral.rs supports PagedAttention ([paper here](https://arxiv.org/abs/2309.06180)) to accelerate both normal inference and batched inference on CUDA devices on Unix-like platforms such as WSL, Linux, or Mac. -Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, Paged Attention is actiated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools. +Our PagedAttention implementation has 2 inputs: GPU KV cache memory size, and block size. This enables you to have fine-tuned control over the available context length, by configuring the available memory for KV cache. When using a CUDA device, PagedAttention is activated by default but can be disabled with `no_paged_attn` for Python or `no-paged-attn` for the CLI tools. > Note: The default block size if not specified is 32. -> Note: if OOM happens (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it happens because the Paged Attention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache. +> Note: if OOM occurs (this can be caused by a variety of factors including adapter activation, re-ISQ, and others), it is likely because the PagedAttention KV cache has already been allocated. To counter this, either set the KV cache memory to a lower amount or usage percentage (recommended) or disable paged attention entirely for a dynamically allocated cache. > Note: Paged Attention is not enabled on Windows platforms, only Unix-based platforms. @@ -21,7 +21,7 @@ Our Paged Attention implementation has 2 inputs: GPU KV cache memory size, and b - GGUF models - Vision models -> Note: the prefix cacher will be disabled when using Paged Attention regardless of settings. This functionality will be added soon! +> Note: the prefix cacher will be disabled when using PagedAttention regardless of settings. This functionality will be added soon!
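The PAGED_ATTENTION.md text above describes the two knobs (GPU KV cache memory size and block size) that the `calculate_cache_config` hunks later in this diff turn into a block count and an "available context length" log line. Below is a minimal sketch of that arithmetic under an assumed model shape; the per-block byte count here is an illustrative stand-in, since the real value depends on the model's layer count, KV-head count, head dimension, and KV dtype.

```rust
// Illustrative sketch only: mirrors the shape of the `calculate_cache_config` /
// `mb_to_blocks!` hunks in this diff. The per-block byte count is a made-up
// stand-in for values that really come from the model config.
const SIZE_IN_MB: usize = 1024 * 1024;

/// Bytes needed to hold one KV-cache block (both K and V) across every layer,
/// for a hypothetical model: 32 layers, 8 KV heads, head_dim 128, f16 (2 bytes).
fn block_bytes(block_size: usize) -> usize {
    let (layers, kv_heads, head_dim, dtype_size): (usize, usize, usize, usize) = (32, 8, 128, 2);
    2 * layers * kv_heads * head_dim * dtype_size * block_size
}

fn main() {
    let mem_gpu_mb: usize = 4096; // e.g. `--pa-gpu-mem 4096`
    let block_size: usize = 32;   // default when `--pa-blk-size` is not given
    let num_gpu_blocks = (mem_gpu_mb * SIZE_IN_MB) / block_bytes(block_size);
    // This is the "available context length is N tokens" figure logged at startup.
    println!(
        "{num_gpu_blocks} GPU blocks -> ~{} tokens of KV cache",
        num_gpu_blocks * block_size
    );
}
```

With these assumed numbers, 4 GB of KV cache at the default block size of 32 works out to 1024 blocks, i.e. roughly a 32k-token budget shared across all running sequences.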
## Using the CLI diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index e494d8248..19ecd6d5f 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -18,7 +18,7 @@ candle-core.workspace = true serde.workspace = true serde_json.workspace = true clap.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } tracing.workspace = true either.workspace = true tokio.workspace = true diff --git a/mistralrs-bench/src/main.rs b/mistralrs-bench/src/main.rs index a3e4225e7..308f96e9c 100644 --- a/mistralrs-bench/src/main.rs +++ b/mistralrs-bench/src/main.rs @@ -279,23 +279,23 @@ struct Args { #[arg(short, long, value_parser, value_delimiter = ';')] num_device_layers: Option>, - /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to - /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to + /// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-gpu-mem")] paged_attn_gpu_mem: Option, - /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1. - /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. + /// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. /// This is always used over `pa-gpu-mem` if both are specified. #[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, - /// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32. - /// Paged Attention is only supported on CUDA and is always automatically activated. + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. + /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] paged_attn_block_size: Option, - /// Disable Paged Attention on CUDA. + /// Disable PagedAttention on CUDA. 
#[arg(long = "no_paged_attn", default_value_t = false)] no_paged_attn: bool, } diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml index 24cee4c3e..7ba0d5f13 100644 --- a/mistralrs-core/Cargo.toml +++ b/mistralrs-core/Cargo.toml @@ -64,13 +64,13 @@ tracing-subscriber.workspace = true derive-new = "0.6.0" itertools = "0.13.0" sysinfo = "0.30.12" -mistralrs-vision = { version = "0.1.13", path = "../mistralrs-vision" } +mistralrs-vision = { version = "0.2.0", path = "../mistralrs-vision" } csv = "1.3.0" reqwest.workspace = true base64.workspace = true bytemuck_derive = "1.7.0" plotly = { version = "0.9.0", features = ["kaleido"], optional = true } -mistralrs-paged-attn = { version = "0.1.13", path = "../mistralrs-paged-attn", optional = true } +mistralrs-paged-attn = { version = "0.2.0", path = "../mistralrs-paged-attn", optional = true } [features] default = ["dep:plotly"] diff --git a/mistralrs-core/src/dummy_paged_attention/mod.rs b/mistralrs-core/src/dummy_paged_attention/mod.rs index 9f67e8207..a3438cb20 100644 --- a/mistralrs-core/src/dummy_paged_attention/mod.rs +++ b/mistralrs-core/src/dummy_paged_attention/mod.rs @@ -88,14 +88,14 @@ pub fn calculate_cache_config( let total = MemoryUsage.get_total_memory(device)? as f32 / SIZE_IN_MB as f32; let used = total - free; let size = (total * f - used) as usize; - info!("Allocating {size} MB for Paged Attention KV cache"); + info!("Allocating {size} MB for PagedAttention KV cache"); size } }; let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); - info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); + info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); Ok(CacheConfig { block_size, num_gpu_blocks, diff --git a/mistralrs-core/src/models/gemma2.rs b/mistralrs-core/src/models/gemma2.rs index 7fb16273f..3ed16af87 100644 --- a/mistralrs-core/src/models/gemma2.rs +++ b/mistralrs-core/src/models/gemma2.rs @@ -483,7 +483,7 @@ impl Model { let vb_l = vb_m.pp("layers"); if matches!(attention_mechanism, AttentionImplementation::PagedAttention) { // TODO softcapping in paged attn - candle_core::bail!("Gemma 2 does not support Paged Attention."); + candle_core::bail!("Gemma 2 does not support PagedAttention."); } for layer_idx in NiceProgressBar::<_, 'b'>(0..cfg.num_hidden_layers, "Loading repeating layers") diff --git a/mistralrs-core/src/paged_attention/mod.rs b/mistralrs-core/src/paged_attention/mod.rs index 59f6bf8cd..abf44a34a 100644 --- a/mistralrs-core/src/paged_attention/mod.rs +++ b/mistralrs-core/src/paged_attention/mod.rs @@ -92,14 +92,14 @@ pub fn calculate_cache_config( let total = MemoryUsage.get_total_memory(device)? 
as f32 / SIZE_IN_MB as f32; let used = total - free; let size = (total * f - used) as usize; - info!("Allocating {size} MB for Paged Attention KV cache"); + info!("Allocating {size} MB for PagedAttention KV cache"); size } }; let num_gpu_blocks = mb_to_blocks!(mem_gpu * SIZE_IN_MB, dtype_size, block_size, config); let num_cpu_blocks = mb_to_blocks!(mem_cpu * SIZE_IN_MB, dtype_size, block_size, config); - info!("Using Paged Attention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); + info!("Using PagedAttention with block size {block_size} and {num_gpu_blocks} GPU blocks: available context length is {} tokens", num_gpu_blocks*block_size); Ok(CacheConfig { block_size, num_gpu_blocks, diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 87a9d302e..9ad3fc357 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -444,7 +444,7 @@ pub struct GeneralMetadata { pub is_xlora: bool, pub activation_dtype: DType, pub sliding_window: Option, - // Paged Attention stuff + // PagedAttention stuff pub cache_config: Option, pub cache_engine: Option, } diff --git a/mistralrs-core/src/scheduler/mod.rs b/mistralrs-core/src/scheduler/mod.rs index 25aff8a07..5f4b4fb5f 100644 --- a/mistralrs-core/src/scheduler/mod.rs +++ b/mistralrs-core/src/scheduler/mod.rs @@ -52,7 +52,7 @@ pub trait Scheduler { /// This may do nothing. It depends on the implementation fn free_finished_sequence_groups(&mut self); - // Paged Attention metadata + // PagedAttention metadata fn block_tables(&self) -> Option<&BlockTables>; fn block_size(&self) -> Option; fn block_engine(&mut self) -> Option<&mut BlockEngine>; diff --git a/mistralrs-paged-attn/src/backend/paged_attention.rs b/mistralrs-paged-attn/src/backend/paged_attention.rs index 0b264999e..d3a3ed812 100644 --- a/mistralrs-paged-attn/src/backend/paged_attention.rs +++ b/mistralrs-paged-attn/src/backend/paged_attention.rs @@ -250,7 +250,7 @@ impl candle::CustomOp1 for PagedAttention { } } -/// Paged Attention layer. +/// PagedAttention layer. /// /// This implements scaled dot-product attention, `softmax(Q @ K^T . softmax_scale) @ V`. 
/// Multi-query and grouped-query attention are supported by using tensors key_cache and value_cache diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml index 004b9d374..2f8613309 100644 --- a/mistralrs-pyo3/Cargo.toml +++ b/mistralrs-pyo3/Cargo.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features = ["pyo3_macros"] } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features = ["pyo3_macros"] } serde.workspace = true serde_json.workspace = true candle-core.workspace = true diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml index 130b5b123..ae85ece7d 100644 --- a/mistralrs-pyo3/Cargo_template.toml +++ b/mistralrs-pyo3/Cargo_template.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } serde.workspace = true serde_json.workspace = true candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.6.0", rev = "c967be9", features=["$feature_name"] } diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index 82f51b1b6..738ab8866 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -211,11 +211,11 @@ class Runner: the corresponding number of layers. - `in_situ_quant` sets the optional in-situ quantization for models that are not quantized (not GGUF or GGML). - `anymoe_config` specifies the AnyMoE config. If this is set, then the model will be loaded as an AnyMoE model. - - `pa_gpu_mem` sets GPU memory to allocate for KV cache with Paged Attention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is - CUDA, it will default to using 90% of the total memory after allocation of the KV cache. Paged Attention is only supported on CUDA and is always automatically activated. - - `pa_blk_size` sets the block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, - it will default to 32. Paged Attention is only supported on CUDA and is always automatically activated. - - `no_paged_attn` disables Paged Attention on CUDA + - `pa_gpu_mem` sets GPU memory to allocate for KV cache with PagedAttention in MBs *OR* the percentage utilization, from 0 to 1. If this is not set and the device is + CUDA, it will default to using 90% of the total memory after allocation of the KV cache. PagedAttention is only supported on CUDA and is always automatically activated. + - `pa_blk_size` sets the block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, + it will default to 32. PagedAttention is only supported on CUDA and is always automatically activated. + - `no_paged_attn` disables PagedAttention on CUDA """ ... 
diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml index 15da54830..e4e813d36 100644 --- a/mistralrs-pyo3/pyproject.toml +++ b/mistralrs-pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "mistralrs" -version = "0.1.26" +version = "0.2.0" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml index dda899334..67403d2aa 100644 --- a/mistralrs-pyo3/pyproject_template.toml +++ b/mistralrs-pyo3/pyproject_template.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "$name" -version = "0.1.26" +version = "0.2.0" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml index 6909c0b9c..c73daa0c8 100644 --- a/mistralrs-server/Cargo.toml +++ b/mistralrs-server/Cargo.toml @@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] } tower-http = { version = "0.5.1", features = ["cors"]} utoipa = { version = "4.2", features = ["axum_extras"] } utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } indexmap.workspace = true accelerate-src = { workspace = true, optional = true } intel-mkl-src = { workspace = true, optional = true } diff --git a/mistralrs-server/src/main.rs b/mistralrs-server/src/main.rs index 1a01b6301..6e4ddfeba 100644 --- a/mistralrs-server/src/main.rs +++ b/mistralrs-server/src/main.rs @@ -123,23 +123,23 @@ struct Args { #[arg(long = "isq", value_parser = parse_isq)] in_situ_quant: Option, - /// GPU memory to allocate for KV cache with Paged Attention in MBs. If this is not set and the device is CUDA, it will default to - /// using `pa-gpu-mem-usage` set to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// GPU memory to allocate for KV cache with PagedAttention in MBs. If this is not set and the device is CUDA, it will default to + /// using `pa-gpu-mem-usage` set to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-gpu-mem")] paged_attn_gpu_mem: Option, - /// Percentage of GPU memory to utilize after allocation of KV cache with Paged Attention, from 0 to 1. - /// If this is not set and the device is CUDA, it will default to `0.9`. Paged Attention is only supported on CUDA and is always automatically activated. + /// Percentage of GPU memory to utilize after allocation of KV cache with PagedAttention, from 0 to 1. + /// If this is not set and the device is CUDA, it will default to `0.9`. PagedAttention is only supported on CUDA and is always automatically activated. /// This is always used over `pa-gpu-mem` if both are specified. #[arg(long = "pa-gpu-mem-usage")] paged_attn_gpu_mem_usage: Option, - /// Block size (number of tokens per block) for Paged Attention. If this is not set and the device is CUDA, it will default to 32. - /// Paged Attention is only supported on CUDA and is always automatically activated. + /// Block size (number of tokens per block) for PagedAttention. If this is not set and the device is CUDA, it will default to 32. + /// PagedAttention is only supported on CUDA and is always automatically activated. #[arg(long = "pa-blk-size")] paged_attn_block_size: Option, - /// Disable Paged Attention on CUDA. + /// Disable PagedAttention on CUDA. 
#[arg(long = "no-paged-attn", default_value_t = false)] no_paged_attn: bool, } diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index df2682a95..a37f3e769 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -12,7 +12,7 @@ license.workspace = true homepage.workspace = true [dependencies] -mistralrs-core = { version = "0.1.26", path = "../mistralrs-core" } +mistralrs-core = { version = "0.2.0", path = "../mistralrs-core" } anyhow.workspace = true tokio.workspace = true candle-core.workspace = true
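The scheduler hunk earlier in this diff only renames a comment to "PagedAttention metadata", but the `block_tables`/`block_size` accessors it annotates are the core of the technique from the paper linked in docs/PAGED_ATTENTION.md. Here is a toy, self-contained illustration of the idea; the types below are hypothetical stand-ins, not mistral.rs's actual `BlockTables`/`BlockEngine`.

```rust
// Toy illustration of the block-table indirection behind PagedAttention.
// Each sequence owns a small table mapping logical token positions onto
// non-contiguous physical KV-cache blocks.
struct SequenceBlockTable {
    block_size: usize,
    /// Physical KV-cache block numbers, in logical order for one sequence.
    blocks: Vec<usize>,
}

impl SequenceBlockTable {
    /// Map a logical token position to its physical (block, offset) slot.
    fn slot(&self, token_pos: usize) -> (usize, usize) {
        let block = self.blocks[token_pos / self.block_size];
        (block, token_pos % self.block_size)
    }
}

fn main() {
    // A 70-token sequence spread over three non-contiguous 32-token blocks.
    let table = SequenceBlockTable { block_size: 32, blocks: vec![7, 3, 11] };
    assert_eq!(table.slot(0), (7, 0));
    assert_eq!(table.slot(40), (3, 8));
    assert_eq!(table.slot(69), (11, 5));
    let (block, offset) = table.slot(40);
    println!("token 40 lives in block {block} at offset {offset}");
}
```

Because blocks are handed out on demand and need not be contiguous, the KV cache budgeted up front can be shared across sequences without per-sequence preallocation, which is what makes the "GPU blocks" figure above meaningful.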