Merge pull request #26 from LLukas22/feat/stable-gpu-support
Stabilize GPU support
LLukas22 authored Jul 17, 2023
2 parents 3eef549 + 6c4494f commit b5eaae5
Showing 11 changed files with 148 additions and 77 deletions.
23 changes: 15 additions & 8 deletions .github/workflows/CI-CuBLAS.yml
@@ -36,12 +36,15 @@ jobs:
- name: Install libssl-dev
run: sudo apt-get install libssl-dev

- uses: Jimver/[email protected]
id: cuda-toolkit
- uses: Jimver/[email protected]
name: Install CUDA toolkit on Linux
id: cuda-toolkit-linux
with:
cuda: '12.1.0'
method: 'local'
linux-local-args: '["--toolkit"]'
cuda: "12.2.0"
method: "network"
#See e.g. https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/
non-cuda-sub-packages: '["libcublas","libcublas-dev"]'
sub-packages: '["nvcc","compiler","libraries","libraries-dev","cudart","cudart-dev"]'


- name: Build wheels
@@ -76,10 +79,14 @@ jobs:
- name: Set package name
run: python ./build_scripts/pyproject_patcher.py

- uses: Jimver/[email protected]
id: cuda-toolkit
- uses: Jimver/[email protected]
name: Install CUDA toolkit on Windows
id: cuda-toolkit-windows
with:
cuda: '12.1.0'
cuda: "12.2.0"
#See https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#install-the-cuda-software
method: "local"

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
85 changes: 58 additions & 27 deletions .github/workflows/CI-OpenCL.yml
@@ -51,33 +51,64 @@ jobs:
name: wheels
path: dist

# windows:
# runs-on: windows-latest
# strategy:
# fail-fast: false
# matrix:
# target: [x64]
# steps:
# - uses: actions/checkout@v3
# - uses: actions/setup-python@v4
# with:
# python-version: '3.10'
# architecture: ${{ matrix.target }}
# - name: Install build dependencies
# run: pip install -r ./build_scripts/requirements.txt
# - name: Set package name
# run: python ./build_scripts/pyproject_patcher.py
# - name: Build wheels
# uses: PyO3/maturin-action@v1
# with:
# target: ${{ matrix.target }}
# args: --release --out dist --find-interpreter --features clblast
# sccache: 'true'
# - name: Upload wheels
# uses: actions/upload-artifact@v3
# with:
# name: wheels
# path: dist
windows:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
target: [x64]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
architecture: ${{ matrix.target }}
- name: Install build dependencies
run: pip install -r ./build_scripts/requirements.txt
- name: Set package name
run: python ./build_scripts/pyproject_patcher.py

- name: Install vcpkg on windows
run: |
git clone https://github.com/microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh
ls -la
shell: bash

- name: Install OpenCL on windows
run: |
${{ github.workspace }}\vcpkg\vcpkg.exe install opencl:x64-windows
shell: pwsh

- name: Install CLBlast on windows
run: |
${{ github.workspace }}\vcpkg\vcpkg.exe install clblast:x64-windows
shell: pwsh

- name: Set Windows Environment Variables
run: |
echo "CLBLAST_PATH=${{ github.workspace }}/vcpkg/packages/clblast_x64-windows" >> $GITHUB_ENV
echo "OPENCL_PATH=${{ github.workspace }}/vcpkg/packages/opencl_x64-windows" >> $GITHUB_ENV
echo "${{ github.workspace }}/vcpkg/packages/clblast_x64-windows/bin" >> $GITHUB_PATH
echo "${{ github.workspace }}/vcpkg/packages/opencl_x64-windows/bin" >> $GITHUB_PATH
shell: bash

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter --features clblast
sccache: 'true'

- name: Include OpenCL DLLs
run: python ./build_scripts/repair_windows_wheels.py "./dist"

- name: Upload wheels
uses: actions/upload-artifact@v3
with:
name: wheels
path: wheelhouse

sdist:
runs-on: ubuntu-latest
8 changes: 4 additions & 4 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "llm-rs"
version = "0.2.12"
version = "0.2.13"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -15,11 +15,11 @@ log = "0.4.17"
serde = "1.0.163"
serde_json = "1.0"

llm = { git = "https://github.com/LLukas22/llm.git", branch = "feat/cuda-opencl-acceleration" }
llm-base = { git = "https://github.com/LLukas22/llm.git", branch = "feat/cuda-opencl-acceleration" }
llm = { git = "https://github.com/rustformers/llm.git", rev = "5d09eed"}
llm-base = { git = "https://github.com/rustformers/llm.git", rev = "5d09eed"}

[dependencies.pyo3]
version = "0.19.0"
version = "0.19.1"
# "abi3-py37" tells pyo3 (and maturin) to build using the stable ABI with
# Python 3.7 or later.
features = ["abi3-py37","extension-module", "generate-import-lib"]
16 changes: 8 additions & 8 deletions README.md
@@ -6,7 +6,7 @@

Welcome to `llm-rs`, an unofficial Python interface for the Rust-based [llm](https://github.com/rustformers/llm) library, made possible through [PyO3](https://github.com/PyO3/pyo3). Our package combines the convenience of Python with the performance of Rust to offer an efficient tool for your machine learning projects. 🐍❤️🦀

With `llm-rs`, you can operate a variety of Large Language Models (LLMs) including LLama and GPT-NeoX directly on your CPU.
With `llm-rs`, you can operate a variety of Large Language Models (LLMs) including LLama and GPT-NeoX directly on your CPU or GPU.

For a detailed overview of all the supported architectures, visit the [llm](https://github.com/rustformers/llm) project page.

@@ -18,34 +18,34 @@ For a detailed overview of all the supported architectures, visit the [llm](http

Simply install it via pip: `pip install llm-rs`

### Installation with GPU Acceleration Support
> ⚠️ Please note that GPU Acceleration support is currently in its experimental phase.
<details>
<summary>Installation with GPU Acceleration Support</summary>
<br>

`llm-rs` incorporates support for various GPU-accelerated backends to facilitate enhanced inference times. To enable GPU-acceleration the `use_gpu` parameter of your `SessionConfig` must be set to `True`. We distribute prebuilt binaries for the following operating systems and graphics APIs:

#### MacOS (Using Metal)
### MacOS (Using Metal)
For MacOS users, the Metal-supported version of `llm-rs` can be easily installed via pip:

`
pip install llm-rs-metal
`

#### Windows/Linux (Using CUDA for Nvidia GPUs)
### Windows/Linux (Using CUDA for Nvidia GPUs)
Due to the significant file size, CUDA-supported packages cannot be directly uploaded to `pip`. To install them, download the appropriate `*.whl` file from the latest [Release](https://github.com/LLukas22/llm-rs-python/releases/latest) and install it using pip as follows:

`
pip install [wheelname].whl
`

#### Windows/Linux (Using OpenCL for All GPUs)
> ⚠️ OpenCL support is highly experimental and may not provide stable results.
### Windows/Linux (Using OpenCL for All GPUs)

For universal GPU support on Windows and Linux, we offer an OpenCL-supported version. It can be installed via pip:

`
pip install llm-rs-opencl
`

</details>


## Usage
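Note: the README change above only states that the `use_gpu` parameter of `SessionConfig` must be set to `True`. The following is a minimal usage sketch of what that could look like; the model path, the `session_config` keyword, and the `generate` call are illustrative assumptions, not taken from this diff.

```python
from llm_rs import Llama, SessionConfig

# Enable GPU acceleration as described in the README: use_gpu=True on the SessionConfig.
# The model path and the remaining keyword/method names are assumptions for illustration.
config = SessionConfig(use_gpu=True)
model = Llama("path/to/ggml-model-q4_0.bin", session_config=config)

result = model.generate("Explain GPU offloading in one sentence.")
print(result)
```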
26 changes: 25 additions & 1 deletion llm_rs/__init__.py
@@ -1,4 +1,28 @@
from .llm_rs import *
try:
from .llm_rs import *
except ImportError as e:
print("DLLs were not boundled with this package. Trying to locate them...")
import os
import platform

#Try to locate CUDA_PATH environment variable
cuda_path = os.environ.get("CUDA_PATH",None)
if cuda_path:
print(f"Found CUDA_PATH environment variable: {cuda_path}")
if platform.system() == "Windows":
cuda_path = os.path.join(cuda_path,"bin")
else:
cuda_path = os.path.join(cuda_path,"lib64")

print(f"Adding {cuda_path} to DLL search path...")
os.add_dll_directory(cuda_path)

try:
from .llm_rs import *
except ImportError as inner_e:
raise ImportError("Could not locate DLLs. Please check the documentation for more information.")



from .config import GenerationConfig, SessionConfig, Precision, ContainerType, QuantizationType
from .models import Llama, GptJ, Gpt2, Bloom, GptNeoX, Mpt
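Note: the fallback added to `llm_rs/__init__.py` above retries the native import after adding the directory pointed to by `CUDA_PATH` to the DLL search path. A minimal sketch of preparing that variable before the first import; the install path below is only an assumption about a default CUDA 12.x location on Windows.

```python
import os

# Hypothetical: ensure CUDA_PATH points at the local CUDA toolkit before importing
# llm_rs, so the fallback above can register its bin/ (Windows) or lib64/ (Linux)
# directory on the DLL search path. The path below is only an example.
os.environ.setdefault(
    "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2"
)

import llm_rs  # the import is retried after the CUDA directory has been registered
```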
7 changes: 7 additions & 0 deletions llm_rs/llm_rs.pyi
@@ -0,0 +1,7 @@


def get_accelerator() -> str:
"""
Returns the accelerator the package was compiled with. This is useful to check if the package can use the GPU.
"""
...
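Note: the new `get_accelerator()` helper makes it possible to check at runtime which backend a wheel was built against; per `src/lib.rs` below it returns `"cuda"`, `"opencl"`, `"metal"`, or `"cpu"`. A short usage sketch:

```python
import llm_rs

backend = llm_rs.get_accelerator()  # "cuda", "opencl", "metal", or "cpu"
if backend == "cpu":
    print("This build has no GPU backend; inference will run on the CPU.")
else:
    print(f"GPU backend compiled in: {backend}")
```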
5 changes: 2 additions & 3 deletions src/configs.rs
@@ -115,9 +115,8 @@ impl GenerationConfig {
}

impl GenerationConfig {
pub fn to_llm_params(&self, n_threads: usize) -> InferenceParameters {
pub fn to_llm_params(&self) -> InferenceParameters {
InferenceParameters {
n_threads,
sampler: std::sync::Arc::new(llm::samplers::TopPTopK {
top_k: self.top_k,
top_p: self.top_p,
@@ -268,7 +267,7 @@ impl SessionConfig {
memory_k_type: self.keys_memory_type.to_llama_rs_memory_type(),
memory_v_type: self.values_memory_type.to_llama_rs_memory_type(),
n_batch: self.batch_size,
use_gpu: self.use_gpu,
n_threads: self.threads,
}
}
}
13 changes: 13 additions & 0 deletions src/lib.rs
@@ -7,8 +7,21 @@ mod quantize;
mod results;
mod stopwords;

#[pyfunction]
fn get_accelerator() -> String {
match llm_base::ggml::accelerator::get_accelerator() {
llm_base::ggml::accelerator::Accelerator::CuBLAS => "cuda".to_owned(),
llm_base::ggml::accelerator::Accelerator::CLBlast => "opencl".to_owned(),
llm_base::ggml::accelerator::Accelerator::Metal => "metal".to_owned(),
_ => "cpu".to_owned(),
}
}

#[pymodule]
fn llm_rs(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(get_accelerator, m)?)
.unwrap();

let config_module = PyModule::new(_py, "config")?;
config_module.add_class::<configs::GenerationConfig>()?;
config_module.add_class::<configs::Precision>()?;
38 changes: 14 additions & 24 deletions src/model_base.rs
@@ -91,15 +91,15 @@ impl GenerationStreamer {

pub fn _tokenize(model: &dyn llm::Model, text: &str) -> Result<Vec<u32>, InferenceError> {
Ok(model
.vocabulary()
.tokenizer()
.tokenize(text, false)?
.iter()
.map(|(_, token)| *token)
.collect())
}

pub fn _decode(model: &dyn llm::Model, tokens: Vec<u32>) -> Result<String, std::str::Utf8Error> {
let vocab = model.vocabulary();
let vocab = model.tokenizer();
let characters: Vec<u8> = vocab.decode(tokens, false);

match std::str::from_utf8(&characters) {
@@ -124,7 +124,7 @@ pub fn _start_session<'a>(
//Build the correct generation parameters
let mut config_to_use = generation_config.unwrap_or(configs::GenerationConfig::default());

let generation_params = config_to_use.to_llm_params(session_config.threads);
let generation_params = config_to_use.to_llm_params();

let rng = ChaCha8Rng::seed_from_u64(config_to_use.seed);
let prompt = Prompt::from(prompt);
@@ -193,13 +193,9 @@ pub fn _generate(
let mut output_request_feeding = OutputRequest::default();
_py.allow_threads(|| {
session
.feed_prompt::<Infallible, _>(
model,
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(InferenceFeedback::Continue),
)
.feed_prompt::<Infallible, _>(model, prompt, &mut output_request_feeding, |_| {
Ok(InferenceFeedback::Continue)
})
.unwrap()
});
let feed_prompt_duration = feed_start_at.elapsed().unwrap();
@@ -283,8 +279,7 @@ pub fn _embed(
session_config: &configs::SessionConfig,
prompt: String,
) -> Result<Vec<f32>, PyErr> {
let (_, inference_params, _, prompt, mut session) =
_start_session(model, session_config, &prompt, None);
let (_, _, _, prompt, mut session) = _start_session(model, session_config, &prompt, None);

//Feed the prompt
let mut output_request_feeding = OutputRequest {
Expand All @@ -293,13 +288,9 @@ pub fn _embed(
};
_py.allow_threads(|| {
session
.feed_prompt::<Infallible, _>(
model,
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(InferenceFeedback::Continue),
)
.feed_prompt::<Infallible, _>(model, prompt, &mut output_request_feeding, |_| {
Ok(InferenceFeedback::Continue)
})
.unwrap()
});
Ok(output_request_feeding.embeddings.unwrap())
@@ -348,22 +339,22 @@ macro_rules! wrap_model {
gpu_layers: config_to_use.gpu_layers,
};

let vocabulary_source: llm_base::VocabularySource;
let vocabulary_source: llm_base::TokenizerSource;

if let Some(name_or_path) = tokenizer_name_or_path {
let tokenizer_path = std::path::Path::new(&name_or_path);
if tokenizer_path.is_file() && tokenizer_path.exists() {
// Load tokenizer from file
vocabulary_source = llm_base::VocabularySource::HuggingFaceTokenizerFile(
vocabulary_source = llm_base::TokenizerSource::HuggingFaceTokenizerFile(
tokenizer_path.to_owned(),
);
} else {
// Load tokenizer from HuggingFace
vocabulary_source =
llm_base::VocabularySource::HuggingFaceRemote(name_or_path);
llm_base::TokenizerSource::HuggingFaceRemote(name_or_path);
}
} else {
vocabulary_source = llm_base::VocabularySource::Model;
vocabulary_source = llm_base::TokenizerSource::Embedded;
}

let llm_model: $llm_model =
@@ -434,7 +425,6 @@ macro_rules! wrap_model {
session
.feed_prompt::<std::convert::Infallible, _>(
self.llm_model.as_ref(),
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(llm_base::InferenceFeedback::Continue),
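Note: the `wrap_model!` changes above migrate from `VocabularySource` to `TokenizerSource` while keeping the three resolution paths: a tokenizer file on disk, a HuggingFace repo name, or the tokenizer embedded in the model file. A hedged Python-side sketch of how that selection might surface, assuming the models expose a `tokenizer_name_or_path` argument as the macro suggests; the paths and repo name below are placeholders.

```python
from llm_rs import Llama

# 1) Existing file on disk -> TokenizerSource::HuggingFaceTokenizerFile
model = Llama("ggml-model.bin", tokenizer_name_or_path="./tokenizer.json")

# 2) Any other name -> TokenizerSource::HuggingFaceRemote (fetched from HuggingFace)
model = Llama("ggml-model.bin", tokenizer_name_or_path="some-org/some-model")

# 3) No tokenizer argument -> TokenizerSource::Embedded (read from the model file)
model = Llama("ggml-model.bin")
```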