tch 0.14.0 update (#435)
* Updated tch version

* Added casting operation for CPU compatibility

* Fixed ONNX resource path

* Fixed GPT-J bias bool loading

* Updated changelog

* Fixed Clippy warnings

* Updated README
guillaume-be authored Nov 26, 2023
1 parent dc99a30 commit 9f2cd17
Showing 46 changed files with 231 additions and 109 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,12 +5,17 @@ All notable changes to this project will be documented in this file. The format
 ## Added
 - Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel`, allowing custom tokenizers to be passed to sentence embeddings pipelines.
 - Support for [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing loading `tokenizer.json` and `special_token_map.json` tokenizer files.
+- (BREAKING) Most model configurations can now take an optional `kind` parameter to specify the model weight precision. If not provided, weights default to full precision on CPU, or to the serialized weights' precision otherwise.
 
 ## Fixed
 - (BREAKING) Fixed the keyword extraction pipeline for n-gram sizes > 2. Added a new configuration option `tokenizer_forbidden_ngram_chars` to specify characters that should be excluded from n-grams (allows filtering n-grams spanning multiple sentences).
 - Improved MPS device compatibility by setting the `sparse_grad` flag to false for `gather` operations
 - Updated ONNX runtime backend version to 1.15.x
+- Fixed incorrect results for QA models whose tokenizer does not use segment ids
+- Fixed GPT-J incorrectly tracking gradients for the attention bias
+
+## Changed
+- (BREAKING) Upgraded to `torch` 2.1 (via `tch` 0.14.0).
 
 ## [0.21.0] - 2023-06-03
 ## Added
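The new `kind` option surfaces on the pipeline configurations updated throughout this commit. A minimal sketch of requesting half-precision weights (illustrative only; assumes `TextGenerationConfig` implements `Default` and that `anyhow` is available, as in the repository's benchmarks):

use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use tch::{Device, Kind};

fn main() -> anyhow::Result<()> {
    let config = TextGenerationConfig {
        device: Device::cuda_if_available(),
        // New in this release: None keeps full precision on CPU (or the
        // serialized precision otherwise); Some(Kind::Half) forces fp16.
        kind: Some(Kind::Half),
        ..Default::default()
    };
    let _model = TextGenerationModel::new(config)?;
    Ok(())
}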
4 changes: 2 additions & 2 deletions Cargo.toml
@@ -76,7 +76,7 @@ features = ["doc-only"]
 
 [dependencies]
 rust_tokenizers = "8.1.1"
-tch = "0.13.0"
+tch = "0.14.0"
 serde_json = "1"
 serde = { version = "1", features = ["derive"] }
 ordered-float = "3"
@@ -97,7 +97,7 @@ anyhow = "1"
 csv = "1"
 criterion = "0.4"
 tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] }
-torch-sys = "0.13.0"
+torch-sys = "0.14.0"
 tempfile = "3"
 itertools = "0.10"
 tracing-subscriber = { version = "0.3", default-features = false, features = [ "env-filter", "fmt" ] }
4 changes: 2 additions & 2 deletions README.md
@@ -80,8 +80,8 @@ This cache location defaults to `~/.cache/.rustbert`, but can be changed by sett
 
 ### Manual installation (recommended)
 
-1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v2.0.0`: if this version is no longer available on the "get started" page,
-the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcu118.zip` for a Linux version with CUDA 11. **NOTE:** When using `rust-bert` as a dependency from [crates.io](https://crates.io), please check the required `LIBTORCH` on the published package [readme](https://crates.io/crates/rust-bert) as it may differ from the version documented here (which applies to the current repository version).
+1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v2.1`: if this version is no longer available on the "get started" page,
+the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu118.zip` for a Linux version with CUDA 11. **NOTE:** When using `rust-bert` as a dependency from [crates.io](https://crates.io), please check the required `LIBTORCH` on the published package [readme](https://crates.io/crates/rust-bert) as it may differ from the version documented here (which applies to the current repository version).
 2. Extract the library to a location of your choice
 3. Set the following environment variables
 ##### Linux:
1 change: 1 addition & 0 deletions benches/generation_benchmark.rs
@@ -37,6 +37,7 @@ fn create_text_generation_model() -> TextGenerationModel {
         diversity_penalty: None,
         num_return_sequences: 5,
         device: Device::cuda_if_available(),
+        kind: None,
     };
     TextGenerationModel::new(config).unwrap()
 }
2 changes: 1 addition & 1 deletion examples/natural_language_inference_deberta.rs
@@ -38,7 +38,7 @@ fn main() -> anyhow::Result<()> {
     )?;
     let config = DebertaConfig::from_file(config_path);
     let model = DebertaForSequenceClassification::new(vs.root(), &config)?;
-    load_weights(&model_resource, &mut vs)?;
+    load_weights(&model_resource, &mut vs, None, device)?;
 
     // Define input
     let input = [("I love you.", "I like you.")];
15 changes: 9 additions & 6 deletions src/common/resources/mod.rs
@@ -30,6 +30,7 @@ use std::ops::DerefMut;
 use std::path::PathBuf;
 use std::sync::RwLockWriteGuard;
 use tch::nn::VarStore;
+use tch::{Device, Kind};
 
 pub enum Resource<'a> {
     PathBuf(PathBuf),
@@ -84,17 +85,19 @@ impl<T: ResourceProvider + ?Sized> ResourceProvider for Box<T> {
 pub fn load_weights(
     rp: &(impl ResourceProvider + ?Sized),
     vs: &mut VarStore,
+    kind: Option<Kind>,
+    device: Device,
 ) -> Result<(), RustBertError> {
     match rp.get_resource()? {
-        Resource::Buffer(mut data) => {
-            vs.load_from_stream(std::io::Cursor::new(data.deref_mut()))?;
-            Ok(())
-        }
-        Resource::PathBuf(path) => Ok(vs.load(path)?),
-    }
+        Resource::Buffer(mut data) => vs.load_from_stream(std::io::Cursor::new(data.deref_mut())),
+        Resource::PathBuf(path) => vs.load(path),
+    }?;
+    cast_var_store(vs, kind, device);
+    Ok(())
 }
 
 #[cfg(feature = "remote")]
 mod remote;
+use crate::pipelines::common::cast_var_store;
 #[cfg(feature = "remote")]
 pub use remote::RemoteResource;
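Call sites now pass the target precision and device explicitly, and `load_weights` casts the store via `cast_var_store` after loading. A hedged usage sketch of the new signature (the local path and fp16 choice are illustrative, not from this diff):

use rust_bert::resources::{load_weights, LocalResource};
use rust_bert::RustBertError;
use std::path::PathBuf;
use tch::{nn, Device, Kind};

fn load_as_fp16(weights_path: PathBuf) -> Result<(), RustBertError> {
    let device = Device::cuda_if_available();
    let mut vs = nn::VarStore::new(device);
    // ... build the model against vs.root() first (as in the DeBERTa example
    // above) so the store contains the variables to fill ...
    let resource = LocalResource {
        local_path: weights_path,
    };
    // New parameters: `kind` selects the target precision (None keeps the
    // default behaviour), `device` drives the post-load cast.
    load_weights(&resource, &mut vs, Some(Kind::Half), device)?;
    Ok(())
}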
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -90,8 +90,8 @@
 //!
 //! ### Manual installation (recommended)
 //!
-//! 1. Download `libtorch` from <https://pytorch.org/get-started/locally/>. This package requires `v2.0`: if this version is no longer available on the "get started" page,
-//! the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcu118.zip` for a Linux version with CUDA 11.
+//! 1. Download `libtorch` from <https://pytorch.org/get-started/locally/>. This package requires `v2.1`: if this version is no longer available on the "get started" page,
+//! the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu118.zip` for a Linux version with CUDA 11.
 //! 2. Extract the library to a location of your choice
 //! 3. Set the following environment variables
 //! ##### Linux:
7 changes: 6 additions & 1 deletion src/models/bart/bart_model.rs
@@ -1004,7 +1004,12 @@ impl BartGenerator {
         let mut var_store = nn::VarStore::new(device);
         let config = BartConfig::from_file(config_path);
         let model = BartForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
         let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/gpt2/gpt2_model.rs
@@ -652,7 +652,12 @@ impl GPT2Generator {
 
         let config = Gpt2Config::from_file(config_path);
         let model = GPT2LMHeadModel::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = tokenizer.get_bos_id();
         let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
21 changes: 7 additions & 14 deletions src/models/gpt_j/attention.rs
@@ -68,11 +68,16 @@ impl GptJAttention {
         let p = p.borrow();
 
         let max_positions = config.n_positions;
-        let bias = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device()))
+        let bias_value = Tensor::ones([max_positions, max_positions], (Kind::Uint8, p.device()))
             .tril(0)
             .view([1, 1, max_positions, max_positions])
             .requires_grad_(false);
-        let bias = p.var_copy("bias", &bias);
+        let mut bias = p
+            .f_ones_no_train("bias", &[1, 1, max_positions, max_positions])
+            .unwrap()
+            .to_kind(Kind::Uint8)
+            .to_device(p.device());
+        bias.copy_(&bias_value);
 
         let attn_pdrop = config.attn_pdrop.unwrap_or(0.1);
         let resid_pdrop = config.resid_pdrop.unwrap_or(0.1);
@@ -95,21 +100,9 @@
             ..Default::default()
         };
         let k_proj = nn::linear(p / "k_proj", config.n_embd, config.n_embd, linear_config);
-        if config.use_float16 {
-            (p / "k_proj").half();
-        }
         let v_proj = nn::linear(p / "v_proj", config.n_embd, config.n_embd, linear_config);
-        if config.use_float16 {
-            (p / "v_proj").half();
-        }
         let q_proj = nn::linear(p / "q_proj", config.n_embd, config.n_embd, linear_config);
-        if config.use_float16 {
-            (p / "q_proj").half();
-        }
         let out_proj = nn::linear(p / "out_proj", config.n_embd, config.n_embd, linear_config);
-        if config.use_float16 {
-            (p / "out_proj").half();
-        }
 
         GptJAttention {
             bias,
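The fix above matters because `var_copy` registers a trainable variable in the `VarStore`, while the `*_no_train` constructors register a frozen one. A minimal sketch of the difference, assuming the tch 0.14 API:

use tch::{nn, Device, Kind, Tensor};

fn main() {
    let vs = nn::VarStore::new(Device::Cpu);
    let root = vs.root();

    // `var_copy` creates a trainable variable: gradients are tracked.
    let trainable = root.var_copy("w", &Tensor::ones([2, 2], (Kind::Float, Device::Cpu)));
    // `ones_no_train` registers the tensor without gradient tracking,
    // which is what a constant attention bias needs.
    let frozen = root.ones_no_train("bias", &[2, 2]);

    assert!(trainable.requires_grad());
    assert!(!frozen.requires_grad());
}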
23 changes: 6 additions & 17 deletions src/models/gpt_j/gpt_j_model.rs
@@ -131,8 +131,6 @@ pub struct GptJConfig {
     pub rotary_dim: Option<i64>,
     pub vocab_size: i64,
     pub scale_attn_weights: Option<bool>,
-    #[serde(default = "default_use_float16")]
-    pub use_float16: bool,
     #[serde(default = "default_preload_on_cpu")]
     pub preload_on_cpu: bool,
     pub decoder_start_token_id: Option<i64>,
@@ -164,7 +162,6 @@ impl Default for GptJConfig {
             rotary_dim: Some(64),
             vocab_size: 50400,
             scale_attn_weights: Some(true),
-            use_float16: default_use_float16(),
             preload_on_cpu: default_preload_on_cpu(),
             decoder_start_token_id: None,
             forced_bos_token_id: None,
@@ -173,10 +170,6 @@
     }
 }
 
-fn default_use_float16() -> bool {
-    true
-}
-
 fn default_preload_on_cpu() -> bool {
     true
 }
@@ -233,9 +226,6 @@ impl GptJModel {
             config.n_embd,
             Default::default(),
         );
-        if config.use_float16 {
-            (&(&p / "wte") / "weight").half()
-        };
 
         let embd_pdrop = config.embd_pdrop.unwrap_or(0.1);
         let drop = Dropout::new(embd_pdrop);
@@ -245,9 +235,6 @@
             ..Default::default()
         };
         let ln_f = nn::layer_norm(&p / "ln_f", vec![config.n_embd], layer_norm_config);
-        if config.use_float16 {
-            (&p / "ln_f").half()
-        };
 
         let mut h: Vec<GptJBlock> = vec![];
         let h_path = &p / "h";
@@ -475,9 +462,6 @@ impl GptJLMHeadModel {
             config.vocab_size,
             Default::default(),
         );
-        if config.use_float16 {
-            (p / "lm_head").half();
-        }
 
         GptJLMHeadModel {
             transformer,
@@ -625,7 +609,12 @@ impl GptJGenerator {
         if config.preload_on_cpu && device != Device::Cpu {
             var_store.set_device(Device::Cpu);
         }
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
         if device != Device::Cpu {
             var_store.set_device(device);
         }
9 changes: 0 additions & 9 deletions src/models/gpt_j/transformer.rs
@@ -43,18 +43,12 @@ impl GptJMLP {
             intermediate_size,
             Default::default(),
         );
-        if config.use_float16 {
-            (p / "fc_in").half()
-        };
         let fc_out = nn::linear(
             p / "fc_out",
             intermediate_size,
             config.n_embd,
             Default::default(),
         );
-        if config.use_float16 {
-            (p / "fc_out").half()
-        };
 
         let activation = match &config.afn {
             Some(activation_enum) => match activation_enum {
@@ -100,9 +94,6 @@ impl GptJBlock {
             ..Default::default()
         };
         let ln_1 = nn::layer_norm(p / "ln_1", vec![config.n_embd], layer_norm_config);
-        if config.use_float16 {
-            (p / "ln_1").half()
-        };
         let attn = GptJAttention::new(p / "attn", config);
         let mlp = GptJMLP::new(p / "mlp", config);
 
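With the per-module `use_float16` casts removed, precision is now handled once at load time through the `kind` parameter. A hedged sketch of the store-wide equivalent, assuming tch's `VarStore::half`:

use tch::nn::VarStore;

// Instead of calling `.half()` path-by-path at construction time (the removed
// `use_float16` blocks above), the loaded store can be cast in one step.
fn to_half_precision(vs: &mut VarStore) {
    vs.half(); // casts every variable in the store to Kind::Half
}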
7 changes: 6 additions & 1 deletion src/models/gpt_neo/gpt_neo_model.rs
@@ -672,7 +672,12 @@ impl GptNeoGenerator {
         let mut var_store = nn::VarStore::new(device);
         let config = GptNeoConfig::from_file(config_path);
         let model = GptNeoForCausalLM::new(var_store.root(), &config)?;
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = tokenizer.get_bos_id();
         let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
4 changes: 2 additions & 2 deletions src/models/longt5/encoder.rs
@@ -288,8 +288,8 @@ impl LongT5Stack {
 
         let (batch_size, sequence_length) = (input_shape[0], input_shape[1]);
 
-        let mask_seq_length = if old_layer_states.is_some() {
-            if old_layer_states.as_ref().unwrap()[0].0.is_some() {
+        let mask_seq_length = if let Some(old_layer_states_value) = &old_layer_states {
+            if old_layer_states_value[0].0.is_some() {
                 old_layer_states.as_ref().unwrap()[0]
                     .0
                     .as_ref()
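This Clippy-driven rewrite replaces an `is_some()`/`unwrap()` chain with an `if let` binding. A minimal illustration of the pattern (not the model code):

// Binding the inner value borrows once and removes the panic path that
// `unwrap()` would otherwise introduce on the checked branch.
fn past_length(cache: &Option<Vec<(Option<i64>, i64)>>, input_len: i64) -> i64 {
    if let Some(cache) = cache {
        if let Some(past) = cache[0].0 {
            past + input_len
        } else {
            input_len
        }
    } else {
        input_len
    }
}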
7 changes: 6 additions & 1 deletion src/models/longt5/longt5_model.rs
@@ -595,7 +595,12 @@ impl LongT5Generator {
 
         let config = LongT5Config::from_file(config_path);
         let model = LongT5ForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = config.bos_token_id;
         let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/m2m_100/m2m_100_model.rs
@@ -544,7 +544,12 @@ impl M2M100Generator {
 
         let config = M2M100Config::from_file(config_path);
         let model = M2M100ForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
         let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/marian/marian_model.rs
@@ -761,7 +761,12 @@ impl MarianGenerator {
 
         let config = BartConfig::from_file(config_path);
         let model = MarianForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
         let eos_token_ids = Some(match config.eos_token_id {
9 changes: 7 additions & 2 deletions src/models/mbart/mbart_model.rs
@@ -650,7 +650,7 @@ impl MBartForSequenceClassification {
 /// # let device = Device::Cpu;
 /// # let vs = nn::VarStore::new(device);
 /// # let config = MBartConfig::from_file(config_path);
-/// # let mbart_model: MBartForSequenceClassification = MBartForSequenceClassification::new(&vs.root(), &config).unwrap();;
+/// # let mbart_model: MBartForSequenceClassification = MBartForSequenceClassification::new(&vs.root(), &config).unwrap();
 /// let (batch_size, source_sequence_length, target_sequence_length) = (64, 128, 56);
 /// let input_tensor = Tensor::rand(&[batch_size, source_sequence_length], (Int64, device));
 /// let target_tensor = Tensor::rand(&[batch_size, target_sequence_length], (Int64, device));
@@ -800,7 +800,12 @@ impl MBartGenerator {
 
         let config = MBartConfig::from_file(config_path);
         let model = MBartForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
         let eos_token_ids = Some(match config.eos_token_id {
7 changes: 6 additions & 1 deletion src/models/openai_gpt/openai_gpt_model.rs
@@ -498,7 +498,12 @@ impl OpenAIGenerator {
         let mut var_store = nn::VarStore::new(device);
         let config = Gpt2Config::from_file(config_path);
         let model = OpenAIGPTLMHeadModel::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = tokenizer.get_bos_id();
         let eos_token_ids = tokenizer.get_eos_id().map(|id| vec![id]);
7 changes: 6 additions & 1 deletion src/models/pegasus/pegasus_model.rs
@@ -505,7 +505,12 @@ impl PegasusConditionalGenerator {
         let mut var_store = nn::VarStore::new(device);
         let config = PegasusConfig::from_file(config_path);
         let model = PegasusForConditionalGeneration::new(var_store.root(), &config);
-        crate::resources::load_weights(&generate_config.model_resource, &mut var_store)?;
+        crate::resources::load_weights(
+            &generate_config.model_resource,
+            &mut var_store,
+            generate_config.kind,
+            device,
+        )?;
 
         let bos_token_id = Some(config.bos_token_id.unwrap_or(0));
         let eos_token_ids = config