diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 615ecd5..5f9d70e 100644 Binary files a/.github/workflows/CI.yml and b/.github/workflows/CI.yml differ diff --git a/.github/workflows/MacOS-CI.yml b/.github/workflows/MacOS-CI.yml new file mode 100644 index 0000000..af0f7f3 --- /dev/null +++ b/.github/workflows/MacOS-CI.yml @@ -0,0 +1,39 @@ +# This file is autogenerated by maturin v0.15.2 +# To update, run +# +# maturin generate-ci --zig github +# +name: MacOS-CI + +on: + workflow_dispatch: + +permissions: + contents: read + +jobs: + macos: + runs-on: macos-latest + strategy: + fail-fast: false + matrix: + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist --find-interpreter --zig + sccache: 'true' + env: + RUSTFLAGS: "-C link-arg=-undefined -C link-arg=dynamic_lookup" + + - name: Upload wheels + uses: actions/upload-artifact@v3 + with: + name: wheels + path: dist diff --git a/Cargo.toml b/Cargo.toml index d6a4e44..d60b38c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "llm-rs" -version = "0.2.8" +version = "0.2.9" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -9,10 +9,10 @@ name = "llm_rs" crate-type = ["cdylib"] [dependencies] -pyo3 = "0.18.3" +pyo3 = {version="0.19.0", features=["extension-module", "generate-import-lib"]} rand = "0.8.5" rand_chacha = "0.3.1" log = "0.4.17" -llm = { git = "https://github.com/rustformers/llm.git", rev="ccdd2ab" } -llm-base = { git = "https://github.com/rustformers/llm.git",rev="ccdd2ab" } -ggml = { git = "https://github.com/rustformers/llm.git",rev="ccdd2ab" } +llm = { git = "https://github.com/rustformers/llm.git", rev="e52a102" } +llm-base = { git = "https://github.com/rustformers/llm.git",rev="e52a102" } +ggml = { git = "https://github.com/rustformers/llm.git",rev="e52a102" } diff --git a/llm_rs/auto.py b/llm_rs/auto.py index d345a0c..268c9d3 100644 --- a/llm_rs/auto.py +++ b/llm_rs/auto.py @@ -4,7 +4,7 @@ from .models import Mpt,GptNeoX,GptJ,Gpt2,Bloom,Llama from .base_model import Model import logging -from typing import Optional, List, Union,Type,Dict +from typing import Optional, List, Union,Type,Dict, Callable import os from enum import Enum, auto from dataclasses import dataclass @@ -209,11 +209,20 @@ def _infer_model_type(cls,model_file:Union[str,os.PathLike],known_model:Optional def from_file(cls, path:Union[str,os.PathLike], model_type: Optional[KnownModels] = None, session_config:SessionConfig=SessionConfig(), + tokenizer_path_or_repo_id: Optional[Union[str,os.PathLike]]=None, lora_paths:Optional[List[Union[str,os.PathLike]]]=None, - verbose:bool=False)->Model: + verbose:bool=False, + use_hf_tokenizer:bool=True)->Model: + tokenizer = tokenizer_path_or_repo_id + if use_hf_tokenizer and tokenizer is None: + metadata = cls.load_metadata(path) + tokenizer = metadata.base_model + if tokenizer is None or tokenizer == "": + raise ValueError(f"Model file '{path}' does not have a base_model specified in its metadata file but wants to use a huggingface-tokenizer! 
Please specify a base_model or explicitly specify a tokenizer via `tokenizer_path_or_repo_id`.")
+
         model = cls._infer_model_type(path,model_type)
-        return model(path,session_config,lora_paths,verbose)
+        return model(path,session_config,tokenizer,lora_paths,verbose)
 
     @classmethod
     def from_pretrained(cls,
@@ -221,8 +230,10 @@ def from_pretrained(cls,
         model_file: Optional[str] = None,
         model_type: Optional[KnownModels] = None,
         session_config:SessionConfig=SessionConfig(),
+        tokenizer_path_or_repo_id: Optional[Union[str,os.PathLike]]=None,
         lora_paths:Optional[List[Union[str,os.PathLike]]]=None,
         verbose:bool=False,
+        use_hf_tokenizer:bool=True,
         default_quantization:QuantizationType=QuantizationType.Q4_0,
         default_container:ContainerType=ContainerType.GGJT)->Model:
 
@@ -231,7 +242,7 @@ def from_pretrained(cls,
         if path_type == PathType.UNKNOWN:
             raise ValueError(f"Unknown path type for '{model_path_or_repo_id}'")
         elif path_type == PathType.FILE:
-            return cls.from_file(model_path_or_repo_id,model_type,session_config,lora_paths,verbose)
+            return cls.from_file(model_path_or_repo_id,model_type,session_config,tokenizer_path_or_repo_id,lora_paths,verbose,use_hf_tokenizer)
         else:
 
             if path_type == PathType.REPO:
@@ -246,14 +257,14 @@ def from_pretrained(cls,
 
                 if config.repo_type != "GGML":
                     logging.warning("Found normal HuggingFace model, starting conversion...")
-                    return cls.from_transformer(model_path_or_repo_id, session_config, lora_paths, verbose, default_quantization, default_container)
+                    return cls.from_transformer(model_path_or_repo_id, session_config, tokenizer_path_or_repo_id, lora_paths, verbose, use_hf_tokenizer, default_quantization, default_container)
 
                 resolved_path = cls._find_model_path_from_repo(str(model_path_or_repo_id),model_file)
-                return cls.from_file(resolved_path,model_type,session_config,lora_paths,verbose)
+                return cls.from_file(resolved_path,model_type,session_config,tokenizer_path_or_repo_id,lora_paths,verbose,use_hf_tokenizer)
 
             elif path_type == PathType.DIR:
                 resolved_path = cls._find_model_path_from_dir(str(model_path_or_repo_id),model_file)
-                return cls.from_file(resolved_path,model_type,session_config,lora_paths,verbose)
+                return cls.from_file(resolved_path,model_type,session_config,tokenizer_path_or_repo_id,lora_paths,verbose,use_hf_tokenizer)
             else:
                 raise ValueError(f"Unknown path type '{path_type}'")
 
@@ -322,8 +333,10 @@ def _find_model_path_from_repo(
     def from_transformer(cls,
         model_path_or_repo_id: Union[str,os.PathLike],
         session_config:SessionConfig=SessionConfig(),
+        tokenizer_path_or_repo_id: Optional[Union[str,os.PathLike]]=None,
         lora_paths:Optional[List[Union[str,os.PathLike]]]=None,
         verbose:bool=False,
+        use_hf_tokenizer:bool=True,
         default_quantization:QuantizationType=QuantizationType.Q4_0,
         default_container:ContainerType=ContainerType.GGJT):
 
@@ -341,7 +354,7 @@ def from_transformer(cls,
             converted_model = AutoConverter.convert(model_path_or_repo_id,export_path)
             if default_quantization != QuantizationType.F16:
                 converted_model = AutoQuantizer.quantize(converted_model,quantization=default_quantization,container=default_container)
-        return cls.from_file(converted_model,None,session_config,lora_paths,verbose)
+        return cls.from_file(converted_model,None,session_config,tokenizer_path_or_repo_id,lora_paths,verbose,use_hf_tokenizer)
 
 # Hack to make the quantization type enum hashable
 _APPENDIX_MAP = {
@@ -357,7 +370,13 @@ class AutoQuantizer():
     Utility to quantize models, without having to specify the model type.
""" @staticmethod - def quantize(model_file:Union[str,os.PathLike],target_path:Optional[Union[str,os.PathLike]]=None,quantization:QuantizationType=QuantizationType.Q4_0,container:ContainerType=ContainerType.GGJT)->Union[str,os.PathLike]: + def quantize( + model_file:Union[str,os.PathLike], + target_path:Optional[Union[str,os.PathLike]]=None, + quantization:QuantizationType=QuantizationType.Q4_0, + container:ContainerType=ContainerType.GGJT, + callback:Optional[Callable[[str],None]]=None + )->Union[str,os.PathLike]: metadata=AutoModel.load_metadata(model_file) if metadata.quantization != QuantizationType.F16: raise ValueError(f"Model '{model_file}' is already quantized to '{metadata.quantization}'") @@ -391,7 +410,7 @@ def build_target_name()->str: return target_file logging.info(f"Quantizing model '{model_file}' to '{target_file}'") - model_type.quantize(str(model_file),target_file,quantization,container) + model_type.quantize(str(model_file),target_file,quantization,container,callback=callback) metadata_file = pathlib.Path(target_file).with_suffix(".meta") quantized_metadata = ModelMetadata(model=metadata.model,quantization=quantization,container=container,quantization_version=CURRENT_QUANTIZATION_VERSION,base_model=metadata.base_model) diff --git a/llm_rs/base_model.py b/llm_rs/base_model.py index c68e915..6a86b01 100644 --- a/llm_rs/base_model.py +++ b/llm_rs/base_model.py @@ -26,12 +26,13 @@ def lora_paths(self)->Optional[List[str]]: ... def __init__(self, path:Union[str,os.PathLike], session_config:SessionConfig=SessionConfig(), + tokenizer_name_or_path:Optional[Union[str,os.PathLike]]=None, lora_paths:Optional[List[Union[str,os.PathLike]]]=None, verbose:bool=False) -> None: ... def generate(self,prompt:str, generation_config:Optional[GenerationConfig]=None, - callback:Callable[[str],Optional[bool]]=None) -> GenerationResult: + callback:Optional[Callable[[str],Optional[bool]]]=None) -> GenerationResult: """ Generates text from a prompt. """ @@ -58,7 +59,7 @@ def decode(self,tokens:List[int]) -> str: ... @staticmethod - def quantize(source:str,destination:str,quantization:QuantizationType=QuantizationType.Q4_0,container:ContainerType=ContainerType.GGJT)->None: + def quantize(source:str,destination:str,quantization:QuantizationType=QuantizationType.Q4_0,container:ContainerType=ContainerType.GGJT,callback:Optional[Callable[[str],None]]=None)->None: """ Quantizes the model. 
""" diff --git a/src/configs.rs b/src/configs.rs index eba24d8..fff3ce2 100644 --- a/src/configs.rs +++ b/src/configs.rs @@ -82,14 +82,16 @@ impl GenerationConfig { impl GenerationConfig { pub fn to_llm_params(&self, n_threads: usize, n_batch: usize) -> InferenceParameters { InferenceParameters { - top_k: self.top_k, - top_p: self.top_p, - temperature: self.temperature, - repeat_penalty: self.repetition_penalty, - repetition_penalty_last_n: self.repetition_penalty_last_n, - bias_tokens: TokenBias::default(), n_threads, n_batch, + sampler: std::sync::Arc::new(llm::samplers::TopPTopK { + top_k: self.top_k, + top_p: self.top_p, + temperature: self.temperature, + repeat_penalty: self.repetition_penalty, + repetition_penalty_last_n: self.repetition_penalty_last_n, + bias_tokens: TokenBias::default(), + }), } } } diff --git a/src/model_base.rs b/src/model_base.rs index a0eb074..82bd928 100644 --- a/src/model_base.rs +++ b/src/model_base.rs @@ -89,7 +89,7 @@ impl GenerationStreamer { } } -pub fn _tokenize(model: &dyn llm::Model, text: &str) -> Result, InferenceError> { +pub fn _tokenize(model: &dyn llm::Model, text: &str) -> Result, InferenceError> { Ok(model .vocabulary() .tokenize(text, false)? @@ -98,12 +98,9 @@ pub fn _tokenize(model: &dyn llm::Model, text: &str) -> Result, Inferen .collect()) } -pub fn _decode(model: &dyn llm::Model, tokens: Vec) -> Result { +pub fn _decode(model: &dyn llm::Model, tokens: Vec) -> Result { let vocab = model.vocabulary(); - let characters: Vec = tokens - .into_iter() - .flat_map(|token| vocab.id_to_token[token as usize].to_owned()) - .collect(); + let characters: Vec = vocab.decode(tokens, false); match std::str::from_utf8(&characters) { Ok(text) => Ok(text.to_string()), @@ -163,7 +160,7 @@ pub fn _infer_next_token( } //Buffer until a valid utf8 sequence is found - if let Some(s) = utf8_buf.push(token) { + if let Some(s) = utf8_buf.push(&token) { return Ok(Some(s)); } } @@ -303,6 +300,7 @@ macro_rules! wrap_model { fn new( path: String, session_config: Option, + tokenizer_name_or_path: Option, lora_paths: Option>, verbose: Option, ) -> Self { @@ -320,8 +318,27 @@ macro_rules! wrap_model { prefer_mmap: config_to_use.prefer_mmap, lora_adapters: lora_paths.clone(), }; + + let vocabulary_source: llm_base::VocabularySource; + + if let Some(name_or_path) = tokenizer_name_or_path { + let tokenizer_path = std::path::Path::new(&name_or_path); + if tokenizer_path.is_file() && tokenizer_path.exists() { + // Load tokenizer from file + vocabulary_source = llm_base::VocabularySource::HuggingFaceTokenizerFile( + tokenizer_path.to_owned(), + ); + } else { + // Load tokenizer from HuggingFace + vocabulary_source = + llm_base::VocabularySource::HuggingFaceRemote(name_or_path); + } + } else { + vocabulary_source = llm_base::VocabularySource::Model; + } + let llm_model: $llm_model = - llm_base::load(&path, model_params, None, |load_progress| { + llm_base::load(&path, vocabulary_source, model_params, |load_progress| { if should_log { llm_base::load_progress_callback_stdout(load_progress) } @@ -399,14 +416,14 @@ macro_rules! 
wrap_model { }) } - fn tokenize(&self, text: String) -> PyResult> { + fn tokenize(&self, text: String) -> PyResult> { match crate::model_base::_tokenize(self.llm_model.as_ref(), &text) { Ok(tokens) => Ok(tokens), Err(e) => Err(pyo3::exceptions::PyException::new_err(e.to_string())), } } - fn decode(&self, tokens: Vec) -> PyResult { + fn decode(&self, tokens: Vec) -> PyResult { match crate::model_base::_decode(self.llm_model.as_ref(), tokens) { Ok(tokens) => Ok(tokens), Err(e) => Err(pyo3::exceptions::PyException::new_err(e.to_string())), @@ -415,16 +432,34 @@ macro_rules! wrap_model { #[staticmethod] fn quantize( + _py: Python, source: String, destination: String, quantization: Option, container: Option, + callback: Option, ) -> PyResult<()> { + let mut callback_function: Option<&PyAny> = None; + let pytohn_object: Py; + + if let Some(unwrapped) = callback { + pytohn_object = unwrapped; + let python_function = pytohn_object.as_ref(_py); + callback_function = Some(python_function); + assert!(python_function.is_callable(), "Callback is not callable!"); + } + crate::quantize::_quantize::<$llm_model>( source.into(), destination.into(), container.unwrap_or(crate::quantize::ContainerType::GGJT), quantization.unwrap_or(crate::quantize::QuantizationType::Q4_0), + |message| { + if let Some(callback) = callback_function { + let args = pyo3::types::PyTuple::new(_py, &[message]); + callback.call1(args).unwrap(); + } + }, ) .map_err(|e| pyo3::exceptions::PyException::new_err(e.to_string())) } diff --git a/src/quantize.rs b/src/quantize.rs index 107ec06..346edd8 100644 --- a/src/quantize.rs +++ b/src/quantize.rs @@ -32,6 +32,7 @@ pub fn _quantize( destination: PathBuf, container: ContainerType, quantization: QuantizationType, + progress_callback: impl Fn(String), ) -> Result<(), QuantizeError> { let container = match container { ContainerType::GGML => ggml::format::SaveContainerType::Ggml, @@ -49,43 +50,49 @@ pub fn _quantize( }), }?; - let mut source = BufReader::new(std::fs::File::open(source)?); - let mut destination = BufWriter::new(std::fs::File::create(destination)?); + let mut source_reader = BufReader::new(std::fs::File::open(&source)?); + let mut destination_reader = BufWriter::new(std::fs::File::create(destination)?); + let vocabulary = llm::VocabularySource::Model.retrieve(&source).unwrap(); quantize::( - &mut source, - &mut destination, + &mut source_reader, + &mut destination_reader, + vocabulary, container, quantization, |progress| match progress { - QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), + QuantizeProgress::HyperparametersLoaded => { + progress_callback("Loaded hyperparameters".to_string()) + } QuantizeProgress::TensorLoading { name, dims, element_type, n_elements, - } => println!( + } => progress_callback(format!( "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" - ), - QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), + )), + QuantizeProgress::TensorQuantizing { name } => { + progress_callback(format!("Quantizing tensor `{name}`")) + } QuantizeProgress::TensorQuantized { name, original_size, reduced_size, history, - } => println!( + } => progress_callback(format!( "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" - ), + )), QuantizeProgress::TensorSkipped { name, size } => { - println!("Skipped tensor `{name}` ({size} bytes)") + progress_callback(format!("Skipped tensor `{name}` ({size} bytes)")) } QuantizeProgress::Finished { original_size, 
                 reduced_size,
                 history,
-            } => println!(
+            } => progress_callback(format!(
                 "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})"
-            ),
+            )),
         },
     )
 }
diff --git a/src/stopwords.rs b/src/stopwords.rs
index 291e7d6..2837c36 100644
--- a/src/stopwords.rs
+++ b/src/stopwords.rs
@@ -38,8 +38,7 @@ impl StopWordHandler {
                     .tokenize(word, false)
                     .unwrap()
                     .iter()
-                    .flat_map(|(encoding, _)| *encoding)
-                    .copied()
+                    .flat_map(|(encoding, _)| encoding.to_owned())
                     .collect::<Vec<u8>>()
             })
             .collect();
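
A minimal usage sketch of the Python API surface this diff adds: the `tokenizer_path_or_repo_id` and `use_hf_tokenizer` arguments on `AutoModel.from_pretrained`/`from_file`, and the `callback` parameter on `AutoQuantizer.quantize`. The file path and repo id below are placeholders, and the import locations are assumed from the module layout shown above.

from llm_rs.auto import AutoModel, AutoQuantizer

# Quantize an f16 GGML model, streaming progress messages through the new
# callback parameter instead of logging them on the Rust side.
quantized_path = AutoQuantizer.quantize(
    "models/example-f16.bin",                    # placeholder path
    callback=lambda message: print(message),
)

# Load the quantized model. With use_hf_tokenizer=True (the default), the
# base_model recorded in the model's .meta file is used as the Hugging Face
# tokenizer unless tokenizer_path_or_repo_id overrides it.
model = AutoModel.from_pretrained(
    quantized_path,
    tokenizer_path_or_repo_id="org/base-model",  # placeholder repo id
    use_hf_tokenizer=True,
)
result = model.generate("Hello, world!")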