Example quantized with custom GGUF model error: cannot find llama.attention.head_count in metadata #2450

Open
evgenyigumnov opened this issue Aug 27, 2024 · 0 comments

evgenyigumnov commented Aug 27, 2024

C:\Users\igumn\candle\candle-examples\examples\quantized>cargo run --features=cuda --example quantized  --release -- --model=gemma-2-2b-it.q4_k_m.gguf --prompt "def fibonacci(n): "
    Finished `release` profile [optimized] target(s) in 0.48s
     Running `C:\Users\igumn\candle\target\release\examples\quantized.exe --model=gemma-2-2b-it.q4_k_m.gguf --prompt "def fibonacci(n): "`
avx: true, neon: false, simd128: false, f16c: true
temp: 0.80 repeat-penalty: 1.10 repeat-last-n: 64
loaded 288 tensors (1.70GB) in 1.70s
Error: cannot find llama.attention.head_count in metadata
error: process didn't exit successfully: `C:\Users\igumn\candle\target\release\examples\quantized.exe --model=gemma-2-2b-it.q4_k_m.gguf --prompt "def fibonacci(n): "` (exit code: 1)

https://huggingface.co/unsloth/gemma-2-it-GGUF/blob/main/gemma-2-2b-it.q4_k_m.gguf
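
The error above comes from the quantized example reading llama.* metadata keys (llama.attention.head_count), while a gemma-2 GGUF presumably stores its hyper-parameters under a gemma2.* prefix instead. Here is a minimal sketch to dump the metadata and confirm which keys the file actually contains; it is not part of the example and only assumes candle's gguf_file reader, the anyhow crate, and that the .gguf file sits in the working directory:

use candle::quantized::gguf_file;

fn main() -> anyhow::Result<()> {
    // Open the same file that the quantized example fails on.
    let mut file = std::fs::File::open("gemma-2-2b-it.q4_k_m.gguf")?;
    let content = gguf_file::Content::read(&mut file)?;
    // Per-model keys are prefixed with the value of `general.architecture`,
    // e.g. `gemma2.attention.head_count` rather than `llama.attention.head_count`.
    if let Some(gguf_file::Value::String(arch)) = content.metadata.get("general.architecture") {
        println!("general.architecture = {arch}");
    }
    // Print every metadata key so the prefix mismatch is visible at a glance.
    let mut keys: Vec<_> = content.metadata.keys().collect();
    keys.sort();
    for key in keys {
        println!("{key}");
    }
    Ok(())
}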

Could you please add quantized GGUF support to the "gemma" example so that the gemma-2-2b-it.q4_k_m.gguf model can be used? I also tried adapting the "recurrent-gemma" example, but that fails with "Error: missing field conv1d_width at line 36 column 1", presumably because the recurrent-gemma Config is deserialized from config.json and gemma-2's config.json does not contain the recurrent-gemma-specific fields.

My modified "recurrent-gemma" example code:

#[cfg(feature = "mkl")]
extern crate intel_mkl_src;

#[cfg(feature = "accelerate")]
extern crate accelerate_src;

use anyhow::{Error as E, Result};
use clap::Parser;

use candle_transformers::models::quantized_recurrent_gemma::Model as QModel;
use candle_transformers::models::recurrent_gemma::{Config, Model as BModel};

use candle::{DType, Device, Tensor};
use candle_examples::token_output_stream::TokenOutputStream;
use candle_nn::VarBuilder;
use candle_transformers::generation::LogitsProcessor;
use hf_hub::{api::sync::Api, Repo, RepoType};
use tokenizers::Tokenizer;

enum Model {
    B(BModel),
    Q(QModel),
}

impl Model {
    fn forward(&mut self, xs: &Tensor, pos: usize) -> candle::Result<Tensor> {
        match self {
            Self::B(m) => m.forward(xs, pos),
            Self::Q(m) => m.forward(xs, pos),
        }
    }
}

#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)]
enum Which {
    #[value(name = "2b")]
    Base2B,
    #[value(name = "2b-it")]
    Instruct2B,
}

struct TextGeneration {
    model: Model,
    device: Device,
    tokenizer: TokenOutputStream,
    logits_processor: LogitsProcessor,
    repeat_penalty: f32,
    repeat_last_n: usize,
}

impl TextGeneration {
    #[allow(clippy::too_many_arguments)]
    fn new(
        model: Model,
        tokenizer: Tokenizer,
        seed: u64,
        temp: Option<f64>,
        top_p: Option<f64>,
        top_k: usize,
        repeat_penalty: f32,
        repeat_last_n: usize,
        device: &Device,
    ) -> Self {
        let sampling = match temp {
            None => candle_transformers::generation::Sampling::ArgMax,
            Some(temperature) => match top_p {
                None => candle_transformers::generation::Sampling::TopK {
                    temperature,
                    k: top_k,
                },
                Some(top_p) => candle_transformers::generation::Sampling::TopKThenTopP {
                    temperature,
                    k: top_k,
                    p: top_p,
                },
            },
        };
        let logits_processor = LogitsProcessor::from_sampling(seed, sampling);
        Self {
            model,
            tokenizer: TokenOutputStream::new(tokenizer),
            logits_processor,
            repeat_penalty,
            repeat_last_n,
            device: device.clone(),
        }
    }

    fn run(&mut self, prompt: &str, sample_len: usize) -> Result<()> {
        use std::io::Write;
        self.tokenizer.clear();
        let mut tokens = self
            .tokenizer
            .tokenizer()
            .encode(prompt, true)
            .map_err(E::msg)?
            .get_ids()
            .to_vec();
        for &t in tokens.iter() {
            if let Some(t) = self.tokenizer.next_token(t)? {
                print!("{t}")
            }
        }
        std::io::stdout().flush()?;

        let mut generated_tokens = 0usize;
        let eos_token = match self.tokenizer.get_token("<eos>") {
            Some(token) => token,
            None => anyhow::bail!("cannot find the <eos> token"),
        };
        let start_gen = std::time::Instant::now();
        for index in 0..sample_len {
            let context_size = if index > 0 { 1 } else { tokens.len() };
            let start_pos = tokens.len().saturating_sub(context_size);
            let ctxt = &tokens[start_pos..];
            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
            let logits = self.model.forward(&input, start_pos)?;
            let logits = logits.squeeze(0)?.squeeze(0)?.to_dtype(DType::F32)?;
            let logits = if self.repeat_penalty == 1. {
                logits
            } else {
                let start_at = tokens.len().saturating_sub(self.repeat_last_n);
                candle_transformers::utils::apply_repeat_penalty(
                    &logits,
                    self.repeat_penalty,
                    &tokens[start_at..],
                )?
            };

            let next_token = self.logits_processor.sample(&logits)?;
            tokens.push(next_token);
            generated_tokens += 1;
            if next_token == eos_token {
                break;
            }
            if let Some(t) = self.tokenizer.next_token(next_token)? {
                print!("{t}");
                std::io::stdout().flush()?;
            }
        }
        let dt = start_gen.elapsed();
        if let Some(rest) = self.tokenizer.decode_rest().map_err(E::msg)? {
            print!("{rest}");
        }
        std::io::stdout().flush()?;
        println!(
            "\n{generated_tokens} tokens generated ({:.2} token/s)",
            generated_tokens as f64 / dt.as_secs_f64(),
        );
        Ok(())
    }
}

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Run on CPU rather than on GPU.
    #[arg(long)]
    cpu: bool,

    /// Enable tracing (generates a trace-timestamp.json file).
    #[arg(long)]
    tracing: bool,

    #[arg(long)]
    prompt: String,

    /// The temperature used to generate samples.
    #[arg(long)]
    temperature: Option<f64>,

    /// Nucleus sampling probability cutoff.
    #[arg(long)]
    top_p: Option<f64>,

    #[arg(long, default_value_t = 250)]
    top_k: usize,

    /// The seed to use when generating random samples.
    #[arg(long, default_value_t = 299792458)]
    seed: u64,

    /// The length of the sample to generate (in tokens).
    #[arg(long, short = 'n', default_value_t = 8000)]
    sample_len: usize,

    #[arg(long)]
    model_id: Option<String>,

    #[arg(long, default_value = "main")]
    revision: String,

    #[arg(long)]
    tokenizer_file: Option<String>,

    #[arg(long)]
    config_file: Option<String>,

    #[arg(long)]
    weight_files: Option<String>,

    /// Penalty to be applied for repeating tokens, 1. means no penalty.
    #[arg(long, default_value_t = 1.1)]
    repeat_penalty: f32,

    /// The context size to consider for the repeat penalty.
    #[arg(long, default_value_t = 64)]
    repeat_last_n: usize,

    /// The model to use.
    #[arg(long, default_value = "2b")]
    which: Which,

    #[arg(long)]
    quantized: bool,
}

fn main() -> Result<()> {
    use tracing_chrome::ChromeLayerBuilder;
    use tracing_subscriber::prelude::*;

    let args = Args::parse();
    let _guard = if args.tracing {
        let (chrome_layer, guard) = ChromeLayerBuilder::new().build();
        tracing_subscriber::registry().with(chrome_layer).init();
        Some(guard)
    } else {
        None
    };
    println!(
        "avx: {}, neon: {}, simd128: {}, f16c: {}",
        candle::utils::with_avx(),
        candle::utils::with_neon(),
        candle::utils::with_simd128(),
        candle::utils::with_f16c()
    );
    println!(
        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
        args.temperature.unwrap_or(0.),
        args.repeat_penalty,
        args.repeat_last_n
    );

    let start = std::time::Instant::now();
    let api = Api::new()?;
    let model_id = match &args.model_id {
        Some(model_id) => model_id.to_string(),
        None => match args.which {
            Which::Base2B => "google/recurrentgemma-2b".to_string(),
            Which::Instruct2B => "google/recurrentgemma-2b-it".to_string(),
        },
    };
    let repo = api.repo(Repo::with_revision(
        model_id,
        RepoType::Model,
        args.revision,
    ));
    // let tokenizer_filename = match args.tokenizer_file {
    //     Some(file) => std::path::PathBuf::from(file),
    //     None => repo.get("tokenizer.json")?,
    // };
    let tokenizer_filename = std::path::PathBuf::from("tokenizer.json");
    // let config_filename = match args.config_file {
    //     Some(file) => std::path::PathBuf::from(file),
    //     None => repo.get("config.json")?,
    // };
    let config_filename = std::path::PathBuf::from("config.json");
    // let filenames = match args.weight_files {
    //     Some(files) => files
    //         .split(',')
    //         .map(std::path::PathBuf::from)
    //         .collect::<Vec<_>>(),
    //     None => {
    //         if args.quantized {
    //             let filename = match args.which {
    //                 Which::Base2B => "recurrent-gemma-2b-q4k.gguf",
    //                 Which::Instruct2B => "recurrent-gemma-7b-q4k.gguf",
    //             };
    //             let filename = api.model("lmz/candle-gemma".to_string()).get(filename)?;
    //             vec![filename]
    //         } else {
    //             candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json")?
    //         }
    //     }
    // };
    // Local gguf file downloaded from the HF link above (no trailing space in the path).
    let filenames = vec![std::path::PathBuf::from("gemma-2-2b-it.q4_k_m.gguf")];
    println!("retrieved the files in {:?}", start.elapsed());
    let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?;
    let config: Config = serde_json::from_reader(std::fs::File::open(config_filename)?)?;

    let start = std::time::Instant::now();
    let device = candle_examples::device(args.cpu)?;
    let dtype = if device.is_cuda() {
        DType::BF16
    } else {
        DType::F32
    };
    let model = if args.quantized {
        let vb = candle_transformers::quantized_var_builder::VarBuilder::from_gguf(
            &filenames[0],
            &device,
        )?;
        Model::Q(QModel::new(&config, vb.pp("model"))?)
    } else {
        let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device)? };
        Model::B(BModel::new(&config, vb.pp("model"))?)
    };

    println!("loaded the model in {:?}", start.elapsed());

    let mut pipeline = TextGeneration::new(
        model,
        tokenizer,
        args.seed,
        args.temperature,
        args.top_p,
        args.top_k,
        args.repeat_penalty,
        args.repeat_last_n,
        &device,
    );
    pipeline.run(&args.prompt, args.sample_len)?;
    Ok(())
}

cargo run --features cuda -r --example recurrent-gemma -- --prompt "Write me a poem about Machine Learning." --quantized
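
As a side note, a small guard like the one below would turn both failures into a clearer message, since the model family a .gguf was converted for is recorded in its general.architecture metadata entry. check_gguf_architecture is a hypothetical helper sketched on top of candle's gguf_file reader, not an existing API; it could be called right after filenames is set up, with whatever architecture string the chosen loader expects:

use candle::quantized::gguf_file;

// Hypothetical helper: bail out early when the gguf was converted for a different
// model family than this example expects (e.g. `gemma2` vs a recurrent-gemma export).
fn check_gguf_architecture(path: &std::path::Path, expected: &str) -> anyhow::Result<()> {
    let mut file = std::fs::File::open(path)?;
    let content = gguf_file::Content::read(&mut file)?;
    match content.metadata.get("general.architecture") {
        Some(gguf_file::Value::String(arch)) if arch.as_str() == expected => Ok(()),
        Some(gguf_file::Value::String(arch)) => anyhow::bail!(
            "{} was converted for `{arch}`, but this example expects `{expected}`",
            path.display()
        ),
        _ => anyhow::bail!(
            "no readable general.architecture entry in {}",
            path.display()
        ),
    }
}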
