Device map topology (#717)
* Add device support to topology

* Wip

* Device topology works now

* Update topology

* Update docs

* Update readme
EricLBuehler authored Aug 27, 2024
1 parent 8ddf258 commit 91a423e
Showing 33 changed files with 426 additions and 115 deletions.
1 change: 1 addition & 0 deletions Cargo.lock


2 changes: 1 addition & 1 deletion README.md
@@ -85,12 +85,12 @@ Mistral.rs supports several model categories:
- [ISQ](docs/ISQ.md) (In situ quantization): run `.safetensors` models directly from Hugging Face Hub by quantizing them after loading instead of creating a GGUF file.
- This loads the ISQ-able weights on CPU before quantizing with ISQ and then moving to the device to avoid memory spikes.
- Extremely fast due to working in parallel
- Use a [model topology](docs/TOPOLOGY.md) to configure ISQ types *per layer* with a single [YAML file](topologies/isq.yml)

**Easy**:
- Lightweight OpenAI API compatible HTTP server.
- Python API.
- Grammar support with Regex and Yacc.
- Use a simple [model topology](docs/TOPOLOGY.md) to configure ISQ and device mapping *per layer* with a single [YAML file](topologies/isq_and_device.yml) (examples [here](topologies))

**Powerful**:
- Fast LoRA support with weight merging.
20 changes: 13 additions & 7 deletions docs/TOPOLOGY.md
@@ -1,37 +1,43 @@
# Model topology configuration

<h3>Quantization and device mapping in one file.</h3>

To support a per-layer mix of ISQ and device mapping, Mistral.rs supports loading a model topology YAML file, formatted as follows:

1) Top-level keys are either:
- A range of layers (`start-end`) where `start < end`. `start` is inclusive and `end` is inclusive
- A range of layers (`start-end`) where `start < end`. `start` is inclusive and `end` is exclusive
- A single layer number
2) The topology for the range or layer:
- A single key (`isq`) which maps to a single value, which can be any [ISQ type](ISQ.md#isq-quantization-types)
- An optional key (`isq`) which maps to a single value, which can be any [ISQ type](ISQ.md#isq-quantization-types). If not specified, no ISQ is applied to this range of layers.
- An optional key (`device`) which maps to a single value, which is one of the below. If not specified, the default loading device will be used.
- `cpu`
- `cuda[ORDINAL]`
- `metal[ORDINAL]`

Note that:
- The topology for the range is expanded to fill the range
- If ranges overlap, the range with the higher end layer takes precedence and overwrites the overlapping layers (see the sketch below)
- Any layers which are not covered will have no topology mapping. They will inherit any other ISQ (e.g. with `--isq`/`in_situ_quant`) set.
- If a layer is covered by the topology, its topology value overrides any other ISQ setting (e.g. `--isq`/`in_situ_quant`).
- The topology device mapping will override any other device mapping.


```yml
0-8:
isq: Q3K
device: cuda[0]
8-16:
isq: Q4K
device: cpu
16-24:
isq: Q6K
# Skip 24-28
28-32:
isq: Q8_0
device: cuda[0]
```
Model topologies may be applied to the following model types:
- `plain`/`Plain`
- `xlora`/`XLora`
- `lora`/`Lora`
- `vision-plain`/`VisionPlain`
Model topologies may be applied to all model types.
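
To make the expansion and precedence rules concrete, here is a minimal, hedged Rust sketch of how ranges like those above might expand into a per-layer table; `LayerTopology` and `expand` are illustrative names, not the crate's internals:

```rust
// Hedged sketch: how entries like `0-8` might expand into a per-layer
// table. `LayerTopology` here is illustrative, not the crate's type.
#[derive(Clone, Debug)]
struct LayerTopology {
    isq: Option<String>,    // e.g. "Q3K"
    device: Option<String>, // e.g. "cuda[0]"
}

fn expand(
    ranges: &[(usize, usize, LayerTopology)], // (start inclusive, end exclusive, value)
    n_layers: usize,
) -> Vec<Option<LayerTopology>> {
    let mut table = vec![None; n_layers];
    // Apply ranges in order of ascending end layer, so the range with
    // the higher end layer is written last and overwrites any overlap,
    // matching the precedence rule above.
    let mut sorted = ranges.to_vec();
    sorted.sort_by_key(|(_, end, _)| *end);
    for (start, end, topo) in sorted {
        for slot in table.iter_mut().take(end.min(n_layers)).skip(start) {
            *slot = Some(topo.clone());
        }
    }
    table
}

fn main() {
    let ranges = vec![
        (0, 8, LayerTopology { isq: Some("Q3K".into()), device: Some("cuda[0]".into()) }),
        (8, 16, LayerTopology { isq: Some("Q4K".into()), device: Some("cpu".into()) }),
    ];
    for (i, t) in expand(&ranges, 18).iter().enumerate() {
        println!("layer {i}: {t:?}"); // layers 16 and 17 print None (not covered)
    }
}
```

Uncovered layers stay `None` and fall back to any global ISQ setting (`--isq`/`in_situ_quant`) and the default loading device.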
## CLI example
1 change: 1 addition & 0 deletions mistralrs-core/Cargo.toml
@@ -75,6 +75,7 @@ mistralrs-quant = { version = "0.2.0", path = "../mistralrs-quant" }
uuid = { version = "1.10.0", features = ["v4"] }
schemars = "0.8.21"
serde_yaml = "0.9.34"
regex = "1.10.6"

[features]
default = ["plotly"]
32 changes: 31 additions & 1 deletion mistralrs-core/src/device_map.rs
@@ -1,6 +1,6 @@
use std::fmt::Debug;

use crate::{utils::debug::DeviceRepr, TryIntoDType};
use crate::{utils::debug::DeviceRepr, Topology, TryIntoDType};
use candle_core::{DType, Device, Result, Tensor};
use candle_nn::VarBuilder;
use serde::Deserialize;
@@ -40,7 +40,37 @@ impl DeviceMapMetadata {
&self,
model_layers: usize,
device: &Device,
topology: Option<&Topology>,
) -> Result<Box<dyn DeviceMapper + Send + Sync>> {
if let Some(topology) = topology {
if topology.0.iter().all(|x| x.is_none()) {
return Ok(Box::new(DummyDeviceMapper {
nm_device: device.clone(),
}));
} else {
let layers = topology
.0
.iter()
.map(|layer| {
layer
.as_ref()
.map(|x| x.device.clone().unwrap_or(device.clone()))
.unwrap_or(device.clone())
})
.collect::<Vec<_>>();

info!("Loading model according to the following repeating layer mappings based on topology:");
for (i, dev) in layers.iter().enumerate() {
info!("Layer {i}: {}", dev.device_pretty_repr());
}

return Ok(Box::new(LayerDeviceMapper {
mappings: layers,
nm_device: device.clone(),
}));
}
}

// How many device layers
// Clamp to max of model layers
let n_device_layers = if let Some(layers) = &self.device_layers {
1 change: 1 addition & 0 deletions mistralrs-core/src/layers.rs
@@ -112,6 +112,7 @@ pub enum PhiRopeScalingConfig {
Classic {
short_factor: Vec<f64>,
long_factor: Vec<f64>,
#[serde(rename = "type")]
scaling_type: ScaledRopeType,
},
Scaled {
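
The new `#[serde(rename = "type")]` is needed because the upstream config JSON spells this field `type`, which is a reserved word in Rust. A standalone sketch of the mechanism (toy types, not the crate's; the sample JSON is illustrative):

```rust
use serde::Deserialize;

// Toy struct mirroring the pattern above: the JSON key `type` cannot be
// a Rust field name, so serde maps it onto `scaling_type`.
#[derive(Debug, Deserialize)]
struct RopeScaling {
    short_factor: Vec<f64>,
    long_factor: Vec<f64>,
    #[serde(rename = "type")]
    scaling_type: String,
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{ "short_factor": [1.0], "long_factor": [2.0], "type": "su" }"#;
    let parsed: RopeScaling = serde_json::from_str(raw)?;
    println!("{parsed:?}"); // scaling_type is populated from the `type` key
    Ok(())
}
```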
13 changes: 7 additions & 6 deletions mistralrs-core/src/lib.rs
@@ -67,12 +67,13 @@ pub use mistralrs_quant::IsqType;
pub use paged_attention::{MemoryGpuConfig, PagedAttentionConfig};
pub use pipeline::{
chat_template::ChatTemplate, parse_isq_value, AnyMoeLoader, AnyMoePipeline, GGMLLoader,
GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoader, GGUFLoaderBuilder, GemmaLoader,
Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader, LocalModelPaths,
MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader, NormalLoaderBuilder,
NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader, Phi3VLoader, Qwen2Loader,
SpeculativeConfig, SpeculativeLoader, SpeculativePipeline, Starcoder2Loader, TokenSource,
VisionLoader, VisionLoaderBuilder, VisionLoaderType, VisionSpecificConfig,
GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoader, GGUFLoaderBuilder, GGUFSpecificConfig,
GemmaLoader, Idefics2Loader, LLaVALoader, LLaVANextLoader, LlamaLoader, Loader,
LocalModelPaths, MistralLoader, MixtralLoader, ModelKind, ModelPaths, NormalLoader,
NormalLoaderBuilder, NormalLoaderType, NormalSpecificConfig, Phi2Loader, Phi3Loader,
Phi3VLoader, Qwen2Loader, SpeculativeConfig, SpeculativeLoader, SpeculativePipeline,
Starcoder2Loader, TokenSource, VisionLoader, VisionLoaderBuilder, VisionLoaderType,
VisionSpecificConfig,
};
pub use request::{Constraint, MessageContent, NormalRequest, Request, RequestMessage};
pub use response::Response;
28 changes: 23 additions & 5 deletions mistralrs-core/src/model_loader.rs
@@ -6,8 +6,8 @@ use std::{
use crate::{
get_toml_selected_model_dtype,
pipeline::{GGMLLoaderBuilder, GGMLSpecificConfig, GGUFLoaderBuilder, NormalSpecificConfig},
Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs, TomlSelector, Topology,
VisionLoaderBuilder, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER,
GGUFSpecificConfig, Loader, ModelDType, ModelSelected, NormalLoaderBuilder, TomlLoaderArgs,
TomlSelector, Topology, VisionLoaderBuilder, VisionSpecificConfig, GGUF_MULTI_FILE_DELIMITER,
};

/// A builder for a loader using the selected model.
@@ -191,6 +191,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
tok_model_id,
quantized_model_id,
quantized_filename,
topology,
} => GGUFLoaderBuilder::new(
args.chat_template,
tok_model_id,
@@ -199,7 +200,10 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.split(GGUF_MULTI_FILE_DELIMITER)
.map(ToOwned::to_owned)
.collect::<Vec<_>>(),
args.prompt_batchsize,
GGUFSpecificConfig {
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
)
.build(),
ModelSelected::XLoraGGUF {
@@ -209,6 +213,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
xlora_model_id,
order,
tgt_non_granular_index,
topology,
} => GGUFLoaderBuilder::new(
args.chat_template,
tok_model_id,
@@ -217,7 +222,10 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.split(GGUF_MULTI_FILE_DELIMITER)
.map(ToOwned::to_owned)
.collect::<Vec<_>>(),
args.prompt_batchsize,
GGUFSpecificConfig {
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
)
.with_xlora(
xlora_model_id,
@@ -235,6 +243,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
quantized_filename,
adapters_model_id,
order,
topology,
} => GGUFLoaderBuilder::new(
args.chat_template,
tok_model_id,
@@ -243,7 +252,10 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.split(GGUF_MULTI_FILE_DELIMITER)
.map(ToOwned::to_owned)
.collect::<Vec<_>>(),
args.prompt_batchsize,
GGUFSpecificConfig {
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
)
.with_lora(
adapters_model_id,
@@ -259,10 +271,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
quantized_model_id,
quantized_filename,
gqa,
topology,
} => GGMLLoaderBuilder::new(
GGMLSpecificConfig {
gqa,
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
args.chat_template,
tokenizer_json,
@@ -280,10 +294,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
order,
tgt_non_granular_index,
gqa,
topology,
} => GGMLLoaderBuilder::new(
GGMLSpecificConfig {
gqa,
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
args.chat_template,
tokenizer_json,
@@ -309,10 +325,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
adapters_model_id,
order,
gqa,
topology,
} => GGMLLoaderBuilder::new(
GGMLSpecificConfig {
gqa,
prompt_batchsize: args.prompt_batchsize,
topology: Topology::from_option_path(topology)?,
},
args.chat_template,
tokenizer_json,
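
The pattern repeated above — `prompt_batchsize` folded into a `GGUFSpecificConfig`/`GGMLSpecificConfig` alongside the new `topology` field — changes the builder call shape for API consumers. A hedged sketch (field types and the `from_option_path` signature are inferred from this diff, not checked against the crate):

```rust
use mistralrs_core::{GGUFSpecificConfig, Topology};

// Hedged sketch: the config struct now carries both the prompt batch
// size and the parsed topology that the GGUF builders consume.
fn gguf_config(topology_path: Option<String>) -> anyhow::Result<GGUFSpecificConfig> {
    Ok(GGUFSpecificConfig {
        prompt_batchsize: None,
        // `from_option_path` is used exactly this way in the call sites above;
        // it returns `None` when no topology file is given.
        topology: Topology::from_option_path(topology_path)?,
    })
}
```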
24 changes: 24 additions & 0 deletions mistralrs-core/src/model_selected.rs
@@ -133,6 +133,10 @@ pub enum ModelSelected {
/// May be a single filename, or use a delimiter of " " (a single space) for multiple files.
#[arg(short = 'f', long)]
quantized_filename: String,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a GGUF model with X-LoRA.
@@ -165,6 +169,10 @@ pub enum ModelSelected {
/// This makes the maximum running sequences 1.
#[arg(long)]
tgt_non_granular_index: Option<usize>,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a GGUF model with LoRA.
@@ -192,6 +200,10 @@ pub enum ModelSelected {
/// Ordering JSON file
#[arg(short, long)]
order: String,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a GGML model.
@@ -216,6 +228,10 @@ pub enum ModelSelected {
/// GQA value
#[arg(short, long, default_value_t = 1)]
gqa: usize,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a GGML model with X-LoRA.
@@ -253,6 +269,10 @@ pub enum ModelSelected {
/// GQA value
#[arg(short, long, default_value_t = 1)]
gqa: usize,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a GGML model with LoRA.
@@ -285,6 +305,10 @@ pub enum ModelSelected {
/// GQA value
#[arg(short, long, default_value_t = 1)]
gqa: usize,

/// Path to a topology YAML file.
#[arg(long)]
topology: Option<String>,
},

/// Select a vision plain model, without quantization or adapters
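
Each selector variant above gains the same optional `--topology` flag. As a standalone illustration of what `#[arg(long)]` derives (a toy clap parser, not the crate's actual CLI surface):

```rust
use clap::Parser;

/// Toy parser showing the flag shape; the real definitions hang off
/// the `ModelSelected` subcommand variants above.
#[derive(Parser, Debug)]
struct Args {
    /// Path to a topology YAML file.
    #[arg(long)]
    topology: Option<String>,
}

fn main() {
    // e.g. `my-binary --topology topologies/isq_and_device.yml`
    let args = Args::parse();
    println!("topology = {:?}", args.topology);
}
```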
4 changes: 3 additions & 1 deletion mistralrs-core/src/models/quantized_llama.rs
@@ -20,6 +20,7 @@ use crate::utils::gguf_metadata::ContentMetadata;
use crate::utils::model_config as ModelConfig;
use crate::utils::progress::NiceProgressBar;
use crate::DeviceMapMetadata;
use crate::Topology;
const MAX_SEQ_LEN: u32 = 4096;

struct Mlp {
@@ -395,6 +396,7 @@ impl ModelConfig::FromGGUF for ModelWeights {
mut ct: Content<'_, R>,
device: &Device,
mapper: DeviceMapMetadata,
topology: Option<&'_ Topology>,
attention_mechanism: AttentionImplementation,
) -> Result<Self> {
// Parameter extraction from metadata.
@@ -423,7 +425,7 @@ impl ModelConfig::FromGGUF for ModelWeights {
let output = ct.tensor("output.weight", device)?;
let mut layers = Vec::with_capacity(block_count);

let mapper = mapper.into_mapper(block_count, device)?;
let mapper = mapper.into_mapper(block_count, device, topology)?;

let head_dim = key_length;
if key_length != value_length {
4 changes: 3 additions & 1 deletion mistralrs-core/src/models/quantized_phi2.rs
@@ -24,6 +24,7 @@ use crate::utils::gguf_metadata::ContentMetadata;
use crate::utils::model_config as ModelConfig;
use crate::utils::progress::NiceProgressBar;
use crate::DeviceMapMetadata;
use crate::Topology;

pub const MAX_SEQ_LEN: usize = 4096;

@@ -217,6 +218,7 @@ impl ModelConfig::FromGGUF for ModelWeights {
mut ct: Content<'_, R>,
device: &Device,
mapper: DeviceMapMetadata,
topology: Option<&'_ Topology>,
attention_mechanism: AttentionImplementation,
) -> Result<Self> {
// Parameter extraction from metadata.
@@ -247,7 +249,7 @@ impl ModelConfig::FromGGUF for ModelWeights {
let mut layers = Vec::with_capacity(block_count);
let head_dim = embedding_length / head_count;

let mapper = mapper.into_mapper(block_count, device)?;
let mapper = mapper.into_mapper(block_count, device, topology)?;

for layer_idx in NiceProgressBar::<_, 'b'>(0..block_count, "Loading repeating layers") {
let prefix = format!("blk.{layer_idx}");