From 2d939e6852c6c0770bed573bb4c2f83b9e1425ac Mon Sep 17 00:00:00 2001 From: Steven Weiss Date: Wed, 5 Jul 2023 12:32:11 -0700 Subject: [PATCH 1/5] Add verify_state to ggml/sys/build.rs Helps newbies like me set up the repo correctly. --- crates/ggml/sys/build.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs index 37afb56a..bba8d95d 100644 --- a/crates/ggml/sys/build.rs +++ b/crates/ggml/sys/build.rs @@ -5,6 +5,8 @@ use std::path::{Path, PathBuf}; // the host and target are the same. If they are not, it will turn off auto-feature-detection, // and you will need to manually specify target features through target-features. fn main() { + verify_state(); + println!("cargo:rerun-if-changed=llama-cpp"); let mut builder = cc::Build::new(); @@ -104,6 +106,14 @@ fn main() { } } +/// Verify the state of the repo to catch common newbie mistakes. +fn verify_state() { + assert!( + Path::new("llama-cpp/ggml.c").exists(), + "Could not find llama-cpp/ggml.c. Try running `git submodule update --init`" + ); +} + fn cfg_cublas() -> bool { !cfg!(target_os = "macos") && cfg!(feature = "cublas") } From 2646aba7f8c151fbdc3eb7b6d72f326a7727c894 Mon Sep 17 00:00:00 2001 From: Steven Weiss Date: Wed, 5 Jul 2023 12:47:18 -0700 Subject: [PATCH 2/5] Refactor tests to put each type in it's own file. This'll help when I add my tests, because otherwise this file will explode in size --- binaries/llm-test/src/common.rs | 30 ++++++ binaries/llm-test/src/inference.rs | 114 +++++++++++++++++++++ binaries/llm-test/src/main.rs | 155 ++--------------------------- 3 files changed, 152 insertions(+), 147 deletions(-) create mode 100644 binaries/llm-test/src/common.rs create mode 100644 binaries/llm-test/src/inference.rs diff --git a/binaries/llm-test/src/common.rs b/binaries/llm-test/src/common.rs new file mode 100644 index 00000000..4c858820 --- /dev/null +++ b/binaries/llm-test/src/common.rs @@ -0,0 +1,30 @@ +//! Tests that are run on every model, regardless of config. + +pub(super) fn can_send(model: M) -> anyhow::Result { + let model = std::thread::spawn(move || model) + .join() + .map_err(|e| anyhow::anyhow!("Failed to join thread: {e:?}")); + + log::info!("`can_send` test passed!"); + + model +} + +pub(super) fn can_roundtrip_hyperparameters( + model: &M, +) -> anyhow::Result<()> { + fn test_hyperparameters(hyperparameters: &M) -> anyhow::Result<()> { + let mut data = vec![]; + hyperparameters.write_ggml(&mut data)?; + let new_hyperparameters = + ::read_ggml(&mut std::io::Cursor::new(data))?; + + assert_eq!(hyperparameters, &new_hyperparameters); + + log::info!("`can_roundtrip_hyperparameters` test passed!"); + + Ok(()) + } + + test_hyperparameters(model.hyperparameters()) +} diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs new file mode 100644 index 00000000..42d8f5be --- /dev/null +++ b/binaries/llm-test/src/inference.rs @@ -0,0 +1,114 @@ +//! Test cases for [crate::TestCase::Inference] tests. 
+ +use std::{convert::Infallible, sync::Arc}; + +use llm::InferenceStats; + +use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta}; + +pub(super) fn can_infer( + model: &dyn llm::Model, + model_config: &ModelConfig, + input: &str, + expected_output: Option<&str>, + maximum_token_count: usize, +) -> anyhow::Result { + let mut session = model.start_session(Default::default()); + let (actual_output, res) = run_inference( + model, + model_config, + &mut session, + input, + maximum_token_count, + ); + + // Process the results + Ok(TestCaseReport { + meta: match &res { + Ok(_) => match expected_output { + Some(expected_output) => { + if expected_output == actual_output { + log::info!("`can_infer` test passed!"); + TestCaseReportMeta::Success + } else { + TestCaseReportMeta::Error { + error: "The output did not match the expected output.".to_string(), + } + } + } + None => { + log::info!("`can_infer` test passed (no expected output)!"); + TestCaseReportMeta::Success + } + }, + Err(err) => TestCaseReportMeta::Error { + error: err.to_string(), + }, + }, + report: TestCaseReportInner::Inference { + input: input.into(), + expect_output: expected_output.map(|s| s.to_string()), + actual_output, + inference_stats: res.ok(), + }, + }) +} + +fn run_inference( + model: &dyn llm::Model, + model_config: &ModelConfig, + session: &mut llm::InferenceSession, + input: &str, + maximum_token_count: usize, +) -> (String, Result) { + let mut actual_output: String = String::new(); + let res = session.infer::( + model, + &mut rand::rngs::mock::StepRng::new(0, 1), + &llm::InferenceRequest { + prompt: input.into(), + parameters: &llm::InferenceParameters { + n_threads: model_config.threads, + n_batch: 1, + sampler: Arc::new(DeterministicSampler), + }, + play_back_previous_tokens: false, + maximum_token_count: Some(maximum_token_count), + }, + &mut Default::default(), + |r| match r { + llm::InferenceResponse::PromptToken(t) | llm::InferenceResponse::InferredToken(t) => { + actual_output += &t; + Ok(llm::InferenceFeedback::Continue) + } + _ => Ok(llm::InferenceFeedback::Continue), + }, + ); + + (actual_output, res) +} + +#[derive(Debug)] +struct DeterministicSampler; +impl llm::Sampler for DeterministicSampler { + fn sample( + &self, + previous_tokens: &[llm::TokenId], + logits: &[f32], + _rng: &mut dyn rand::RngCore, + ) -> llm::TokenId { + // Takes the most likely element from the logits, except if they've appeared in `previous_tokens` + // at all + let mut logits = logits.to_vec(); + for &token in previous_tokens { + logits[token as usize] = f32::NEG_INFINITY; + } + + logits + .iter() + .enumerate() + .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) + .unwrap() + .0 as llm::TokenId + } +} diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index aa259779..60df0153 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -1,3 +1,8 @@ +//! Test runner for all LLMs. 
+ +mod common; +mod inference; + use anyhow::Context; use clap::Parser; use indicatif::{ProgressBar, ProgressStyle}; @@ -7,13 +12,11 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, collections::HashMap, - convert::Infallible, env, fs::{self, File}, io::Write, path::{Path, PathBuf}, str::FromStr, - sync::Arc, time::Instant, }; @@ -240,10 +243,10 @@ async fn test_model( // // Confirm that the model can be sent to a thread, then sent back - let model = tests::can_send(model)?; + let model = common::can_send(model)?; // Confirm that the hyperparameters can be roundtripped - tests::can_roundtrip_hyperparameters(&model)?; + common::can_roundtrip_hyperparameters(&model)?; // @@ -259,7 +262,7 @@ async fn test_model( input, output, maximum_token_count, - } => test_case_reports.push(tests::can_infer( + } => test_case_reports.push(inference::can_infer( &model, model_config, input, @@ -320,148 +323,6 @@ fn write_report( Ok(()) } -mod tests { - use super::*; - - pub(super) fn can_send(model: M) -> anyhow::Result { - let model = std::thread::spawn(move || model) - .join() - .map_err(|e| anyhow::anyhow!("Failed to join thread: {e:?}")); - - log::info!("`can_send` test passed!"); - - model - } - - pub(super) fn can_roundtrip_hyperparameters( - model: &M, - ) -> anyhow::Result<()> { - fn test_hyperparameters( - hyperparameters: &M, - ) -> anyhow::Result<()> { - let mut data = vec![]; - hyperparameters.write_ggml(&mut data)?; - let new_hyperparameters = - ::read_ggml(&mut std::io::Cursor::new(data))?; - - assert_eq!(hyperparameters, &new_hyperparameters); - - log::info!("`can_roundtrip_hyperparameters` test passed!"); - - Ok(()) - } - - test_hyperparameters(model.hyperparameters()) - } - - pub(super) fn can_infer( - model: &dyn llm::Model, - model_config: &ModelConfig, - input: &str, - expected_output: Option<&str>, - maximum_token_count: usize, - ) -> anyhow::Result { - let mut session = model.start_session(Default::default()); - let (actual_output, res) = run_inference( - model, - model_config, - &mut session, - input, - maximum_token_count, - ); - - // Process the results - Ok(TestCaseReport { - meta: match &res { - Ok(_) => match expected_output { - Some(expected_output) => { - if expected_output == actual_output { - log::info!("`can_infer` test passed!"); - TestCaseReportMeta::Success - } else { - TestCaseReportMeta::Error { - error: "The output did not match the expected output.".to_string(), - } - } - } - None => { - log::info!("`can_infer` test passed (no expected output)!"); - TestCaseReportMeta::Success - } - }, - Err(err) => TestCaseReportMeta::Error { - error: err.to_string(), - }, - }, - report: TestCaseReportInner::Inference { - input: input.into(), - expect_output: expected_output.map(|s| s.to_string()), - actual_output, - inference_stats: res.ok(), - }, - }) - } -} - -fn run_inference( - model: &dyn llm::Model, - model_config: &ModelConfig, - session: &mut llm::InferenceSession, - input: &str, - maximum_token_count: usize, -) -> (String, Result) { - let mut actual_output: String = String::new(); - let res = session.infer::( - model, - &mut rand::rngs::mock::StepRng::new(0, 1), - &llm::InferenceRequest { - prompt: input.into(), - parameters: &llm::InferenceParameters { - n_threads: model_config.threads, - n_batch: 1, - sampler: Arc::new(DeterministicSampler), - }, - play_back_previous_tokens: false, - maximum_token_count: Some(maximum_token_count), - }, - &mut Default::default(), - |r| match r { - llm::InferenceResponse::PromptToken(t) | llm::InferenceResponse::InferredToken(t) 
=> { - actual_output += &t; - Ok(llm::InferenceFeedback::Continue) - } - _ => Ok(llm::InferenceFeedback::Continue), - }, - ); - - (actual_output, res) -} - -#[derive(Debug)] -struct DeterministicSampler; -impl llm::Sampler for DeterministicSampler { - fn sample( - &self, - previous_tokens: &[llm::TokenId], - logits: &[f32], - _rng: &mut dyn rand::RngCore, - ) -> llm::TokenId { - // Takes the most likely element from the logits, except if they've appeared in `previous_tokens` - // at all - let mut logits = logits.to_vec(); - for &token in previous_tokens { - logits[token as usize] = f32::NEG_INFINITY; - } - - logits - .iter() - .enumerate() - .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) - .unwrap() - .0 as llm::TokenId - } -} - async fn download_file(url: &str, local_path: &Path) -> anyhow::Result<()> { if local_path.exists() { return Ok(()); From b2238c21bfb63cd2c01cd83a78ca7fc446aecb2f Mon Sep 17 00:00:00 2001 From: Steven Weiss Date: Wed, 5 Jul 2023 14:21:30 -0700 Subject: [PATCH 3/5] Add Tokens test to test the feed_prompt method --- binaries/llm-test/configs/bloom.json | 8 ++- binaries/llm-test/configs/gptj.json | 8 ++- binaries/llm-test/configs/gptneox.json | 8 ++- binaries/llm-test/configs/llama.json | 8 ++- binaries/llm-test/configs/mpt.json | 8 ++- binaries/llm-test/src/inference.rs | 6 +- binaries/llm-test/src/main.rs | 11 ++- binaries/llm-test/src/tokens.rs | 93 ++++++++++++++++++++++++++ 8 files changed, 142 insertions(+), 8 deletions(-) create mode 100644 binaries/llm-test/src/tokens.rs diff --git a/binaries/llm-test/configs/bloom.json b/binaries/llm-test/configs/bloom.json index f8e0ad18..b8c4bc2a 100644 --- a/binaries/llm-test/configs/bloom.json +++ b/binaries/llm-test/configs/bloom.json @@ -9,6 +9,12 @@ "output_disabled": "When a llama rides a crab, ,.-\n\n/? ', , ; A;A = (b),d e orm\n“t” + “p。n unus et les el duetant alle that are by no ... ”\n( ? ) – ‘?\n!!\n«…..’,\nS.\n\n‘l」之 attergoir à dit-on pas .. 。。 ..\n– La leçon se confond quelquefois con ce qui es vée par occident .\n( 2 ) .\nLa protestation del paysan mécontent regardait pendre eussent mœurs faillite forteresse rivières lieues forteressemelés inquiétudes crackdown brawl slaughter massacresokea .\n» » … « …\n. . . \" \" ….", "maximum_token_count": 128 } + }, + { + "Tokens": { + "input": "Rustformers is", + "output": 15 + } } ] -} \ No newline at end of file +} diff --git a/binaries/llm-test/configs/gptj.json b/binaries/llm-test/configs/gptj.json index 8eb832c3..85493b5c 100644 --- a/binaries/llm-test/configs/gptj.json +++ b/binaries/llm-test/configs/gptj.json @@ -9,6 +9,12 @@ "output_disabled": "\"When a llama rides a crab, \nit's not the same as when an elephant does it.\" - John Steinbeck, East of Eden.\n\n \"The best way to predict your future is by looking at history.\"- Robert Kiyosaki (author). Rich Dad Poor dad : what 10 rules for success really mean and how you can apply them in life! The rich dads guidebook on personal finance: How To Become A Millionaire In Less Than 5 years! http://www..richdadpoordaddyguidebooksalexanderkimballblogcom/the_bestwaytopredictyourfutureislookingathistory/. 
You will learn about money management", "maximum_token_count": 128 } + }, + { + "Tokens": { + "input": "Rustformers is", + "output": 257 + } } ] -} \ No newline at end of file +} diff --git a/binaries/llm-test/configs/gptneox.json b/binaries/llm-test/configs/gptneox.json index 3aab6a99..3e6b84cb 100644 --- a/binaries/llm-test/configs/gptneox.json +++ b/binaries/llm-test/configs/gptneox.json @@ -9,6 +9,12 @@ "output_disabled": "<|padding|>When a llama rides a crab, \n“The Greatest Show on Earth” is the title of an 1875 book by Phineas Taylor Barnum, who founded and operated The circus. He was born in Bethel Connecticut to Meshack (Meshake) Bowman Jr., from New York City; his mother’s name has not been recorded but she may have had some Native American ancestry as well.[2] His father died when he[3][4], at age three,[5]: 9–10 (p1), 11-12​—was left with relatives until they could find him work or send for them back home where there", "maximum_token_count": 128 } + }, + { + "Tokens": { + "input": "Rustformers is", + "output": 247 + } } ] -} \ No newline at end of file +} diff --git a/binaries/llm-test/configs/llama.json b/binaries/llm-test/configs/llama.json index c7e485e3..1e2f23f4 100644 --- a/binaries/llm-test/configs/llama.json +++ b/binaries/llm-test/configs/llama.json @@ -9,6 +9,12 @@ "output": "When a llama rides a crab, 10-year olds are the ones who get to eat.\nTheir parents have been told that they will be eating for another year or two before their children can enjoy it again – and then only if there is enough food left over from Christmas dinner!", "maximum_token_count": 128 } + }, + { + "Tokens": { + "input": "Rustformers is", + "output": 260 + } } ] -} \ No newline at end of file +} diff --git a/binaries/llm-test/configs/mpt.json b/binaries/llm-test/configs/mpt.json index 6dd316ee..7142c143 100644 --- a/binaries/llm-test/configs/mpt.json +++ b/binaries/llm-test/configs/mpt.json @@ -9,6 +9,12 @@ "output": "When a llama rides a crab,  the llama is called the \"crab rider\".\nThe crabs are very popular in South America, especially Brazil. They have been used as transportation for many years and they can carry up to five people at once!", "maximum_token_count": 128 } + }, + { + "Tokens": { + "input": "Rustformers is", + "output": 247 + } } ] -} \ No newline at end of file +} diff --git a/binaries/llm-test/src/inference.rs b/binaries/llm-test/src/inference.rs index 42d8f5be..ec803cdd 100644 --- a/binaries/llm-test/src/inference.rs +++ b/binaries/llm-test/src/inference.rs @@ -1,4 +1,6 @@ -//! Test cases for [crate::TestCase::Inference] tests. +//! Tests the model's inference APIs. +//! +//! See [crate::TestCase::Inference]. 
use std::{convert::Infallible, sync::Arc}; @@ -6,7 +8,7 @@ use llm::InferenceStats; use crate::{ModelConfig, TestCaseReport, TestCaseReportInner, TestCaseReportMeta}; -pub(super) fn can_infer( +pub(crate) fn can_infer( model: &dyn llm::Model, model_config: &ModelConfig, input: &str, diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index 60df0153..b14ed78a 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -2,6 +2,7 @@ mod common; mod inference; +mod tokens; use anyhow::Context; use clap::Parser; @@ -123,6 +124,10 @@ enum TestCase { output: Option, maximum_token_count: usize, }, + Tokens { + input: String, + output: usize, + }, } #[derive(Serialize)] @@ -145,13 +150,14 @@ enum TestCaseReportMeta { } #[derive(Serialize)] -enum TestCaseReportInner { +pub enum TestCaseReportInner { Inference { input: String, expect_output: Option, actual_output: String, inference_stats: Option, }, + Tokens(tokens::TokensReport), } async fn test_model( @@ -269,6 +275,9 @@ async fn test_model( output.as_deref(), *maximum_token_count, )?), + TestCase::Tokens { input, output } => { + test_case_reports.push(tokens::can_feed(&model, input, *output)); + } } } let first_error: Option = diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs new file mode 100644 index 00000000..52f37019 --- /dev/null +++ b/binaries/llm-test/src/tokens.rs @@ -0,0 +1,93 @@ +//! Tests the model's token manipulation APIs: +//! +//! * [llm::InferenceSession::feed_prompt()] +//! +//! See [crate::TestCase::Tokens]. + +use std::convert::Infallible; + +use llm::{InferenceFeedback, Model, OutputRequest}; +use serde::Serialize; + +use crate::{TestCaseReport, TestCaseReportMeta}; + +/// Tests that the model performs as expected when feeding tokens +pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) -> TestCaseReport { + let mut report = TokensReport::default(); + + let mut session = model.start_session(Default::default()); + let mut output = OutputRequest { + all_logits: Some(vec![]), + ..Default::default() + }; + + let feed_prompt = &mut |prompt: &str| { + session.feed_prompt(model, &Default::default(), prompt, &mut output, |x| { + always_continue(x) + }) + }; + + if let Err(err) = feed_prompt(input) { + return report.failure(&err.to_string()); + }; + + let top_token; + match output.all_logits { + Some(logits) => { + let start = logits.len() - model.tokenizer().len(); + let mut iter = logits[start..].iter().enumerate(); + let Some((mut max_idx, mut max)) = iter.next() else { + return report.failure("Could not find any logits for last token."); + }; + for (idx, score) in iter { + if score > max { + max = score; + max_idx = idx; + } + } + top_token = max_idx; + } + None => return report.failure("Model did not output any logits."), + } + + report.output = top_token; + + if top_token != expected_output { + let tokenizer = model.tokenizer(); + let top_token_str = String::from_utf8_lossy(&tokenizer.token(top_token)).to_string(); + let expected_str = String::from_utf8_lossy(&tokenizer.token(expected_output)).to_string(); + return report.failure(&format!( + "Expected top token to be {expected_output} ({expected_str}), \ + but was {top_token} ({top_token_str})" + )); + } + + report.success() +} + +fn always_continue(_: &[u8]) -> Result { + Ok(InferenceFeedback::Continue) +} + +#[derive(Serialize, Default)] +pub struct TokensReport { + output: usize, +} + +impl TokensReport { + fn failure(self, msg: &str) -> TestCaseReport { + TestCaseReport { + 
meta: TestCaseReportMeta::Error { + error: msg.to_owned(), + }, + report: crate::TestCaseReportInner::Tokens(self), + } + } + + fn success(self) -> TestCaseReport { + TestCaseReport { + meta: TestCaseReportMeta::Success, + report: crate::TestCaseReportInner::Tokens(self), + } + } +} From 2e35b46f44968dc98ccd9854716ff92090be7b3e Mon Sep 17 00:00:00 2001 From: Steven Weiss Date: Wed, 5 Jul 2023 19:06:39 -0700 Subject: [PATCH 4/5] Add delete tokens test and impl. Note that llama.json fails the tests, so it's likely it doesn't support it. I may investigate further, though. --- binaries/llm-test/configs/bloom.json | 3 + binaries/llm-test/configs/gptj.json | 3 + binaries/llm-test/configs/gptneox.json | 3 + binaries/llm-test/configs/llama.json | 3 + binaries/llm-test/configs/mpt.json | 3 + binaries/llm-test/src/delete.rs | 100 +++++++++++++++++++++++ binaries/llm-test/src/main.rs | 6 ++ binaries/llm-test/src/tokens.rs | 23 ++++-- crates/llm-base/src/inference_session.rs | 43 ++++++++++ crates/llm-base/src/lib.rs | 6 +- crates/llm-base/src/model/mod.rs | 13 +++ crates/llm/src/lib.rs | 14 ++-- crates/models/bloom/src/lib.rs | 4 + crates/models/gptj/src/lib.rs | 4 + crates/models/gptneox/src/lib.rs | 4 + crates/models/llama/src/lib.rs | 4 + crates/models/mpt/src/lib.rs | 4 + 17 files changed, 221 insertions(+), 19 deletions(-) create mode 100644 binaries/llm-test/src/delete.rs diff --git a/binaries/llm-test/configs/bloom.json b/binaries/llm-test/configs/bloom.json index b8c4bc2a..5383386d 100644 --- a/binaries/llm-test/configs/bloom.json +++ b/binaries/llm-test/configs/bloom.json @@ -15,6 +15,9 @@ "input": "Rustformers is", "output": 15 } + }, + { + "Delete": {} } ] } diff --git a/binaries/llm-test/configs/gptj.json b/binaries/llm-test/configs/gptj.json index 85493b5c..50966748 100644 --- a/binaries/llm-test/configs/gptj.json +++ b/binaries/llm-test/configs/gptj.json @@ -15,6 +15,9 @@ "input": "Rustformers is", "output": 257 } + }, + { + "Delete": {} } ] } diff --git a/binaries/llm-test/configs/gptneox.json b/binaries/llm-test/configs/gptneox.json index 3e6b84cb..c8cce4d9 100644 --- a/binaries/llm-test/configs/gptneox.json +++ b/binaries/llm-test/configs/gptneox.json @@ -15,6 +15,9 @@ "input": "Rustformers is", "output": 247 } + }, + { + "Delete": {} } ] } diff --git a/binaries/llm-test/configs/llama.json b/binaries/llm-test/configs/llama.json index 1e2f23f4..9bd6094a 100644 --- a/binaries/llm-test/configs/llama.json +++ b/binaries/llm-test/configs/llama.json @@ -15,6 +15,9 @@ "input": "Rustformers is", "output": 260 } + }, + { + "Delete": {} } ] } diff --git a/binaries/llm-test/configs/mpt.json b/binaries/llm-test/configs/mpt.json index 7142c143..57a8bc89 100644 --- a/binaries/llm-test/configs/mpt.json +++ b/binaries/llm-test/configs/mpt.json @@ -15,6 +15,9 @@ "input": "Rustformers is", "output": 247 } + }, + { + "Delete": {} } ] } diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs new file mode 100644 index 00000000..2c609cb5 --- /dev/null +++ b/binaries/llm-test/src/delete.rs @@ -0,0 +1,100 @@ +//! Tests the model's token manipulation APIs: +//! +//! * [llm::InferenceSession::feed_prompt()] +//! +//! See [crate::TestCase::Tokens]. + +use std::convert::Infallible; + +use llm::{InferenceFeedback, InferenceSession, Model, OutputRequest}; +use serde::Serialize; + +use crate::{TestCaseReport, TestCaseReportMeta}; + +/// Error tolerance for the float comparisons. +const TOLERANCE: f32 = 1e-7; + +/// Tests that models can delete tokens without changing the model's behavior. 
+pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { + let report = DeleteReport::default(); + let mut session = model.start_session(Default::default()); + let mut output = OutputRequest { + all_logits: Some(vec![]), + ..Default::default() + }; + + // Feed some tokens + if let Err(err) = feed_prompt("The llama lived on the", &mut session, model, &mut output) { + return report.failure(&err.to_string()); + } + + // Add token and get the logits + if let Err(err) = feed_prompt(" ", &mut session, model, &mut output) { + return report.failure(&err.to_string()); + } + let Some(original_logits) = output.all_logits.clone() else { + return report.failure("Model did not return logits."); + }; + + // Delete, then re-add. Verify logits are the same. + if let Err(err) = session.delete_tokens(model, 1) { + return report.failure(&err.to_string()); + } + if let Err(err) = feed_prompt(" ", &mut session, model, &mut output) { + return report.failure(&err.to_string()); + } + let Some(redone_logits) = output.all_logits.clone() else { + return report.failure("Second run of model did not return logits."); + }; + + // Compare the logits + for (idx, (&original, redone)) in original_logits.iter().zip(redone_logits).enumerate() { + if original > redone + TOLERANCE || original < redone - TOLERANCE { + return report.failure(&format!( + "Expected logits to be the same after delete, but differed at {idx}, \ + expected {original}, but was {redone}." + )); + } + } + + log::info!("`can_delete` test passed (no expected output)!"); + report.success() +} + +fn feed_prompt( + prompt: &str, + session: &mut InferenceSession, + model: &impl Model, + output: &mut OutputRequest, +) -> Result<(), llm::InferenceError> { + session.feed_prompt(model, &Default::default(), prompt, output, |x| { + always_continue(x) + }) +} + +fn always_continue(_: &[u8]) -> Result { + Ok(InferenceFeedback::Continue) +} + +#[derive(Serialize, Default)] +pub struct DeleteReport { + output: usize, +} + +impl DeleteReport { + fn failure(self, msg: &str) -> TestCaseReport { + TestCaseReport { + meta: TestCaseReportMeta::Error { + error: msg.to_owned(), + }, + report: crate::TestCaseReportInner::Delete(self), + } + } + + fn success(self) -> TestCaseReport { + TestCaseReport { + meta: TestCaseReportMeta::Success, + report: crate::TestCaseReportInner::Delete(self), + } + } +} diff --git a/binaries/llm-test/src/main.rs b/binaries/llm-test/src/main.rs index b14ed78a..b1bc9b07 100644 --- a/binaries/llm-test/src/main.rs +++ b/binaries/llm-test/src/main.rs @@ -1,6 +1,7 @@ //! Test runner for all LLMs. 
mod common; +mod delete; mod inference; mod tokens; @@ -128,6 +129,7 @@ enum TestCase { input: String, output: usize, }, + Delete {}, } #[derive(Serialize)] @@ -158,6 +160,7 @@ pub enum TestCaseReportInner { inference_stats: Option, }, Tokens(tokens::TokensReport), + Delete(delete::DeleteReport), } async fn test_model( @@ -278,6 +281,9 @@ async fn test_model( TestCase::Tokens { input, output } => { test_case_reports.push(tokens::can_feed(&model, input, *output)); } + TestCase::Delete {} => { + test_case_reports.push(delete::can_delete(&model)); + } } } let first_error: Option = diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs index 52f37019..a2a490b6 100644 --- a/binaries/llm-test/src/tokens.rs +++ b/binaries/llm-test/src/tokens.rs @@ -6,7 +6,7 @@ use std::convert::Infallible; -use llm::{InferenceFeedback, Model, OutputRequest}; +use llm::{InferenceFeedback, InferenceSession, Model, OutputRequest}; use serde::Serialize; use crate::{TestCaseReport, TestCaseReportMeta}; @@ -14,20 +14,13 @@ use crate::{TestCaseReport, TestCaseReportMeta}; /// Tests that the model performs as expected when feeding tokens pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) -> TestCaseReport { let mut report = TokensReport::default(); - let mut session = model.start_session(Default::default()); let mut output = OutputRequest { all_logits: Some(vec![]), ..Default::default() }; - let feed_prompt = &mut |prompt: &str| { - session.feed_prompt(model, &Default::default(), prompt, &mut output, |x| { - always_continue(x) - }) - }; - - if let Err(err) = feed_prompt(input) { + if let Err(err) = feed_prompt(input, &mut session, model, &mut output) { return report.failure(&err.to_string()); }; @@ -62,9 +55,21 @@ pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) )); } + log::info!("`can_feed` test passed (no expected output)!"); report.success() } +fn feed_prompt( + prompt: &str, + session: &mut InferenceSession, + model: &impl Model, + output: &mut OutputRequest, +) -> Result<(), llm::InferenceError> { + session.feed_prompt(model, &Default::default(), prompt, output, |x| { + always_continue(x) + }) +} + fn always_continue(_: &[u8]) -> Result { Ok(InferenceFeedback::Continue) } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 37861174..27428d73 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -333,6 +333,37 @@ impl InferenceSession { Ok(()) } + /// Removes `num` tokens from the end of the buffer. Roughly the inverse of `feed_prompt`. + pub fn delete_tokens( + &mut self, + model: &dyn Model, + num: usize, + ) -> Result, DeleteError> { + if !model.supports_delete() { + return Err(DeleteError::UnsupportedArchitecture); + } + + if num >= self.n_past { + return Err(DeleteError::NotEnoughTokens); + } + + // Remove the tokens from self.tokens. + let token_start = self.n_past - num; + let deleted_tokens: Vec<_> = self.tokens.drain(token_start..).collect(); + + // Remove the corresponding chars from decoded + let mut decoded_start = self.decoded_tokens.len(); + for id in &deleted_tokens { + decoded_start -= model.tokenizer().token(*id as usize).len(); + } + self.decoded_tokens.truncate(decoded_start); + + // Decrement the n_past tokens counter. + self.n_past -= num; + + Ok(deleted_tokens) + } + /// Infer the next token for this session. 
pub fn infer_next_token( &mut self, @@ -637,6 +668,18 @@ pub enum InferenceError { UserCallback(Box), } +#[derive(Error, Debug)] +/// Errors encountered during the snapshot process. +pub enum DeleteError { + /// Tried deleting more tokens than were available + #[error("tried deleting more tokens than were available")] + NotEnoughTokens, + + /// Model architecture does not support delete + #[error("model architecture does not support deletes")] + UnsupportedArchitecture, +} + #[derive(Error, Debug)] /// Errors encountered during the snapshot process. pub enum SnapshotError { diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index d40a9077..479aba2d 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -23,9 +23,9 @@ pub use ggml; pub use ggml::Type as ElementType; pub use inference_session::{ - feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, - InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, + feed_prompt_callback, DeleteError, GraphOutputs, InferenceError, InferenceFeedback, + InferenceRequest, InferenceResponse, InferenceSession, InferenceSessionConfig, + InferenceSnapshot, InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, }; pub use loader::{ load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, FormatMagic, diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 2de02828..90c86d58 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -86,6 +86,12 @@ pub trait KnownModel: Send + Sync { /// Get the list of regexes to use to determine if a tensor in this model should not be quantized. fn skip_quantize_tensors() -> Vec; + + /// Returns whether the model supports deleting tokens. + fn supports_delete(&self) -> bool { + // Assume we can't delete unless otherwise specified + false + } } /// A type-erased model to allow for interacting with a model without knowing @@ -118,6 +124,9 @@ pub trait Model: Send + Sync { /// Get the end of text/end of string token ID. This value is defined by model implementers. fn eot_token_id(&self) -> TokenId; + + /// Returns whether the model supports deleting tokens. + fn supports_delete(&self) -> bool; } impl> Model for M { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { @@ -149,6 +158,10 @@ impl> Model for M { fn eot_token_id(&self) -> TokenId { KnownModel::eot_token_id(self) } + + fn supports_delete(&self) -> bool { + KnownModel::supports_delete(self) + } } /// Implemented by model hyperparameters for interacting with hyperparameters diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 2be90739..2cc22609 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -78,13 +78,13 @@ use std::{ // This is the "user-facing" API, and GGML may not always be our backend. 
pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, - quantize, samplers, ElementType, FileType, FileTypeFormat, FormatMagic, Hyperparameters, - InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, - InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, - InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, - ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, - Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, - TokenizerSource, + quantize, samplers, DeleteError, ElementType, FileType, FileTypeFormat, FormatMagic, + Hyperparameters, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, + InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, + Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, + QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, + TokenizationError, Tokenizer, TokenizerSource, }; use serde::Serialize; diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index 4e9aa192..5ef63b79 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -396,6 +396,10 @@ impl KnownModel for Bloom { fn skip_quantize_tensors() -> Vec { vec![] } + + fn supports_delete(&self) -> bool { + true + } } /// BLOOM [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 195f876a..3d6cbcd2 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -318,6 +318,10 @@ impl KnownModel for GptJ { fn skip_quantize_tensors() -> Vec { vec![] } + + fn supports_delete(&self) -> bool { + true + } } /// GPT-J [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 5339b901..641cfbcb 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -364,6 +364,10 @@ impl KnownModel for GptNeoX { fn skip_quantize_tensors() -> Vec { vec![] } + + fn supports_delete(&self) -> bool { + true + } } /// GPT-NeoX [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 6e7d4b11..0ea6661e 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -348,6 +348,10 @@ impl KnownModel for Llama { fn skip_quantize_tensors() -> Vec { vec![] } + + fn supports_delete(&self) -> bool { + true + } } /// LLaMA [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 18991adf..757a372f 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -298,6 +298,10 @@ impl KnownModel for Mpt { fn skip_quantize_tensors() -> Vec { vec![] } + + fn supports_delete(&self) -> bool { + true + } } /// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) From 2badcd96418859996b325af47717beca359cb110 Mon Sep 17 00:00:00 2001 From: Steven Weiss Date: Sun, 9 Jul 2023 10:40:11 -0700 Subject: [PATCH 5/5] Address PR comments 
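Rename `delete_tokens` to `rewind` (along with `DeleteError` becoming `RewindError`
and `supports_delete` becoming `supports_rewind`), compare logits against
`f32::EPSILON` instead of a hand-rolled tolerance, pass `always_continue` by name
instead of wrapping it in a closure, and tidy the log messages.

For callers the change is only the rename; an illustrative sketch (not taken from
the diff below):

    // before
    let removed = session.delete_tokens(&model, 1)?;
    // after
    let removed = session.rewind(&model, 1)?;
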
--- binaries/llm-test/src/delete.rs | 15 +++++---------- binaries/llm-test/src/tokens.rs | 2 +- crates/llm-base/src/inference_session.rs | 14 +++++--------- crates/llm-base/src/lib.rs | 6 +++--- crates/llm-base/src/model/mod.rs | 8 ++++---- crates/llm/src/lib.rs | 14 +++++++------- crates/models/bloom/src/lib.rs | 2 +- crates/models/gptj/src/lib.rs | 2 +- crates/models/gptneox/src/lib.rs | 2 +- crates/models/llama/src/lib.rs | 2 +- crates/models/mpt/src/lib.rs | 2 +- 11 files changed, 30 insertions(+), 39 deletions(-) diff --git a/binaries/llm-test/src/delete.rs b/binaries/llm-test/src/delete.rs index 2c609cb5..9ddbe7a8 100644 --- a/binaries/llm-test/src/delete.rs +++ b/binaries/llm-test/src/delete.rs @@ -11,9 +11,6 @@ use serde::Serialize; use crate::{TestCaseReport, TestCaseReportMeta}; -/// Error tolerance for the float comparisons. -const TOLERANCE: f32 = 1e-7; - /// Tests that models can delete tokens without changing the model's behavior. pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { let report = DeleteReport::default(); @@ -36,8 +33,8 @@ pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { return report.failure("Model did not return logits."); }; - // Delete, then re-add. Verify logits are the same. - if let Err(err) = session.delete_tokens(model, 1) { + // Rewind, then re-add. Verify logits are the same. + if let Err(err) = session.rewind(model, 1) { return report.failure(&err.to_string()); } if let Err(err) = feed_prompt(" ", &mut session, model, &mut output) { @@ -49,7 +46,7 @@ pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { // Compare the logits for (idx, (&original, redone)) in original_logits.iter().zip(redone_logits).enumerate() { - if original > redone + TOLERANCE || original < redone - TOLERANCE { + if original > redone + f32::EPSILON || original < redone - f32::EPSILON { return report.failure(&format!( "Expected logits to be the same after delete, but differed at {idx}, \ expected {original}, but was {redone}." @@ -57,7 +54,7 @@ pub(crate) fn can_delete(model: &impl Model) -> TestCaseReport { } } - log::info!("`can_delete` test passed (no expected output)!"); + log::info!("`can_delete` test passed!"); report.success() } @@ -67,9 +64,7 @@ fn feed_prompt( model: &impl Model, output: &mut OutputRequest, ) -> Result<(), llm::InferenceError> { - session.feed_prompt(model, &Default::default(), prompt, output, |x| { - always_continue(x) - }) + session.feed_prompt(model, &Default::default(), prompt, output, always_continue) } fn always_continue(_: &[u8]) -> Result { diff --git a/binaries/llm-test/src/tokens.rs b/binaries/llm-test/src/tokens.rs index a2a490b6..260546b8 100644 --- a/binaries/llm-test/src/tokens.rs +++ b/binaries/llm-test/src/tokens.rs @@ -55,7 +55,7 @@ pub(crate) fn can_feed(model: &impl Model, input: &str, expected_output: usize) )); } - log::info!("`can_feed` test passed (no expected output)!"); + log::info!("`can_feed` test passed!"); report.success() } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 27428d73..2c4fcf6e 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -334,17 +334,13 @@ impl InferenceSession { } /// Removes `num` tokens from the end of the buffer. Roughly the inverse of `feed_prompt`. 
- pub fn delete_tokens( - &mut self, - model: &dyn Model, - num: usize, - ) -> Result, DeleteError> { - if !model.supports_delete() { - return Err(DeleteError::UnsupportedArchitecture); + pub fn rewind(&mut self, model: &dyn Model, num: usize) -> Result, RewindError> { + if !model.supports_rewind() { + return Err(RewindError::UnsupportedArchitecture); } if num >= self.n_past { - return Err(DeleteError::NotEnoughTokens); + return Err(RewindError::NotEnoughTokens); } // Remove the tokens from self.tokens. @@ -670,7 +666,7 @@ pub enum InferenceError { #[derive(Error, Debug)] /// Errors encountered during the snapshot process. -pub enum DeleteError { +pub enum RewindError { /// Tried deleting more tokens than were available #[error("tried deleting more tokens than were available")] NotEnoughTokens, diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 479aba2d..1ec18d1c 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -23,9 +23,9 @@ pub use ggml; pub use ggml::Type as ElementType; pub use inference_session::{ - feed_prompt_callback, DeleteError, GraphOutputs, InferenceError, InferenceFeedback, - InferenceRequest, InferenceResponse, InferenceSession, InferenceSessionConfig, - InferenceSnapshot, InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, + feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, + InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, RewindError, SnapshotError, }; pub use loader::{ load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, FormatMagic, diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 90c86d58..b2b49a0a 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -88,7 +88,7 @@ pub trait KnownModel: Send + Sync { fn skip_quantize_tensors() -> Vec; /// Returns whether the model supports deleting tokens. - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { // Assume we can't delete unless otherwise specified false } @@ -126,7 +126,7 @@ pub trait Model: Send + Sync { fn eot_token_id(&self) -> TokenId; /// Returns whether the model supports deleting tokens. - fn supports_delete(&self) -> bool; + fn supports_rewind(&self) -> bool; } impl> Model for M { fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession { @@ -159,8 +159,8 @@ impl> Model for M { KnownModel::eot_token_id(self) } - fn supports_delete(&self) -> bool { - KnownModel::supports_delete(self) + fn supports_rewind(&self) -> bool { + KnownModel::supports_rewind(self) } } diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 2cc22609..35692951 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -78,13 +78,13 @@ use std::{ // This is the "user-facing" API, and GGML may not always be our backend. 
pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, - quantize, samplers, DeleteError, ElementType, FileType, FileTypeFormat, FormatMagic, - Hyperparameters, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, - InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, - Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, - QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, - TokenizationError, Tokenizer, TokenizerSource, + quantize, samplers, ElementType, FileType, FileTypeFormat, FormatMagic, Hyperparameters, + InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, + InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, + InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, + ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, + RewindError, Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, + Tokenizer, TokenizerSource, }; use serde::Serialize; diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index 5ef63b79..0897a210 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -397,7 +397,7 @@ impl KnownModel for Bloom { vec![] } - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { true } } diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 3d6cbcd2..42a039c6 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -319,7 +319,7 @@ impl KnownModel for GptJ { vec![] } - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { true } } diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 641cfbcb..e033bdbe 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -365,7 +365,7 @@ impl KnownModel for GptNeoX { vec![] } - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { true } } diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index 0ea6661e..94585218 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -349,7 +349,7 @@ impl KnownModel for Llama { vec![] } - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { true } } diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 757a372f..203d779d 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -299,7 +299,7 @@ impl KnownModel for Mpt { vec![] } - fn supports_delete(&self) -> bool { + fn supports_rewind(&self) -> bool { true } }
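
For reference, a minimal sketch of driving the rewind API added by this series.
It assumes `model` is any loaded model whose architecture reports
`supports_rewind() == true`; the helper name is illustrative and everything else
mirrors the test helpers in `tokens.rs`/`delete.rs` above.

    use std::convert::Infallible;

    use llm::{InferenceFeedback, Model, OutputRequest};

    /// Illustrative helper (not part of the patches): feeds a short prompt,
    /// then backs the session up by one token.
    fn rewind_last_token(model: &impl Model) {
        assert!(model.supports_rewind());

        let mut session = model.start_session(Default::default());
        let mut output = OutputRequest::default();

        // Same call shape as the `can_feed`/`can_delete` tests.
        session
            .feed_prompt(
                model,
                &Default::default(),
                "The llama lived on the",
                &mut output,
                |_: &[u8]| Ok::<_, Infallible>(InferenceFeedback::Continue),
            )
            .expect("feeding the prompt failed");

        // Roughly the inverse of `feed_prompt`: the removed token ids are returned.
        let removed = session.rewind(model, 1).expect("model supports rewind");
        assert_eq!(removed.len(), 1);
    }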