Merge pull request #26 from LLukas22/feat/stable-gpu-support
Stabilize GPU support
LLukas22 authored Jul 17, 2023
2 parents 3eef549 + 6c4494f commit b5eaae5
Showing 11 changed files with 148 additions and 77 deletions.
23 changes: 15 additions & 8 deletions .github/workflows/CI-CuBLAS.yml
@@ -36,12 +36,15 @@ jobs:
- name: Install libssl-dev
run: sudo apt-get install libssl-dev

- uses: Jimver/[email protected]
id: cuda-toolkit
- uses: Jimver/[email protected]
name: Install CUDA toolkit on Linux
id: cuda-toolkit-linux
with:
cuda: '12.1.0'
method: 'local'
linux-local-args: '["--toolkit"]'
cuda: "12.2.0"
method: "network"
#See e.g. https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/
non-cuda-sub-packages: '["libcublas","libcublas-dev"]'
sub-packages: '["nvcc","compiler","libraries","libraries-dev","cudart","cudart-dev"]'


- name: Build wheels
@@ -76,10 +79,14 @@ jobs:
- name: Set package name
run: python ./build_scripts/pyproject_patcher.py

- uses: Jimver/[email protected]
id: cuda-toolkit
- uses: Jimver/[email protected]
name: Install CUDA toolkit on Windows
id: cuda-toolkit-windows
with:
cuda: '12.1.0'
cuda: "12.2.0"
#See https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#install-the-cuda-software
method: "local"

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
85 changes: 58 additions & 27 deletions .github/workflows/CI-OpenCL.yml
@@ -51,33 +51,64 @@ jobs:
name: wheels
path: dist

# windows:
# runs-on: windows-latest
# strategy:
# fail-fast: false
# matrix:
# target: [x64]
# steps:
# - uses: actions/checkout@v3
# - uses: actions/setup-python@v4
# with:
# python-version: '3.10'
# architecture: ${{ matrix.target }}
# - name: Install build dependencies
# run: pip install -r ./build_scripts/requirements.txt
# - name: Set package name
# run: python ./build_scripts/pyproject_patcher.py
# - name: Build wheels
# uses: PyO3/maturin-action@v1
# with:
# target: ${{ matrix.target }}
# args: --release --out dist --find-interpreter --features clblast
# sccache: 'true'
# - name: Upload wheels
# uses: actions/upload-artifact@v3
# with:
# name: wheels
# path: dist
windows:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
target: [x64]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
architecture: ${{ matrix.target }}
- name: Install build dependencies
run: pip install -r ./build_scripts/requirements.txt
- name: Set package name
run: python ./build_scripts/pyproject_patcher.py

- name: Install vcpkg on windows
run: |
git clone https://github.com/microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh
ls -la
shell: bash

- name: Install OpenCL on windows
run: |
${{ github.workspace }}\vcpkg\vcpkg.exe install opencl:x64-windows
shell: pwsh

- name: Install CLBlast on windows
run: |
${{ github.workspace }}\vcpkg\vcpkg.exe install clblast:x64-windows
shell: pwsh

- name: Set Windows Environment Variables
run: |
echo "CLBLAST_PATH=${{ github.workspace }}/vcpkg/packages/clblast_x64-windows" >> $GITHUB_ENV
echo "OPENCL_PATH=${{ github.workspace }}/vcpkg/packages/opencl_x64-windows" >> $GITHUB_ENV
echo "${{ github.workspace }}/vcpkg/packages/clblast_x64-windows/bin" >> $GITHUB_PATH
echo "${{ github.workspace }}/vcpkg/packages/opencl_x64-windows/bin" >> $GITHUB_PATH
shell: bash

- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.target }}
args: --release --out dist --find-interpreter --features clblast
sccache: 'true'

- name: Include OpenCL DLLs
run: python ./build_scripts/repair_windows_wheels.py "./dist"

- name: Upload wheels
uses: actions/upload-artifact@v3
with:
name: wheels
path: wheelhouse

sdist:
runs-on: ubuntu-latest
8 changes: 4 additions & 4 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "llm-rs"
version = "0.2.12"
version = "0.2.13"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -15,11 +15,11 @@ log = "0.4.17"
serde = "1.0.163"
serde_json = "1.0"

llm = { git = "https://github.com/LLukas22/llm.git", branch = "feat/cuda-opencl-acceleration" }
llm-base = { git = "https://github.com/LLukas22/llm.git", branch = "feat/cuda-opencl-acceleration" }
llm = { git = "https://github.com/rustformers/llm.git", rev = "5d09eed"}
llm-base = { git = "https://github.com/rustformers/llm.git", rev = "5d09eed"}

[dependencies.pyo3]
version = "0.19.0"
version = "0.19.1"
# "abi3-py37" tells pyo3 (and maturin) to build using the stable ABI with
# Python 3.7 or later.
features = ["abi3-py37","extension-module", "generate-import-lib"]
16 changes: 8 additions & 8 deletions README.md
@@ -6,7 +6,7 @@

Welcome to `llm-rs`, an unofficial Python interface for the Rust-based [llm](https://github.com/rustformers/llm) library, made possible through [PyO3](https://github.com/PyO3/pyo3). Our package combines the convenience of Python with the performance of Rust to offer an efficient tool for your machine learning projects. 🐍❤️🦀

With `llm-rs`, you can operate a variety of Large Language Models (LLMs) including LLama and GPT-NeoX directly on your CPU.
With `llm-rs`, you can operate a variety of Large Language Models (LLMs) including LLama and GPT-NeoX directly on your CPU or GPU.

For a detailed overview of all the supported architectures, visit the [llm](https://github.com/rustformers/llm) project page.

@@ -18,34 +18,34 @@ For a detailed overview of all the supported architectures, visit the [llm](http

Simply install it via pip: `pip install llm-rs`

### Installation with GPU Acceleration Support
> ⚠️ Please note that GPU Acceleration support is currently in its experimental phase.
<details>
<summary>Installation with GPU Acceleration Support</summary>
<br>

`llm-rs` incorporates support for various GPU-accelerated backends to facilitate enhanced inference times. To enable GPU-acceleration the `use_gpu` parameter of your `SessionConfig` must be set to `True`. We distribute prebuilt binaries for the following operating systems and graphics APIs:

#### MacOS (Using Metal)
### MacOS (Using Metal)
For MacOS users, the Metal-supported version of `llm-rs` can be easily installed via pip:

`
pip install llm-rs-metal
`

#### Windows/Linux (Using CUDA for Nvidia GPUs)
### Windows/Linux (Using CUDA for Nvidia GPUs)
Due to the significant file size, CUDA-supported packages cannot be directly uploaded to `pip`. To install them, download the appropriate `*.whl` file from the latest [Release](https://github.com/LLukas22/llm-rs-python/releases/latest) and install it using pip as follows:

`
pip install [wheelname].whl
`

#### Windows/Linux (Using OpenCL for All GPUs)
> ⚠️ OpenCL support is highly experimental and may not provide stable results.
### Windows/Linux (Using OpenCL for All GPUs)

For universal GPU support on Windows and Linux, we offer an OpenCL-supported version. It can be installed via pip:

`
pip install llm-rs-opencl
`

</details>


## Usage
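Note: the README change above only states that the `use_gpu` parameter of `SessionConfig` must be set to `True`. The following is a minimal usage sketch of what that could look like; the model path, the `session_config` keyword, and the `generate` call are illustrative assumptions, not taken from this diff.

```python
from llm_rs import Llama, SessionConfig

# Enable GPU acceleration as described in the README: use_gpu=True on the SessionConfig.
# The model path and the remaining keyword/method names are assumptions for illustration.
config = SessionConfig(use_gpu=True)
model = Llama("path/to/ggml-model-q4_0.bin", session_config=config)

result = model.generate("Explain GPU offloading in one sentence.")
print(result)
```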
26 changes: 25 additions & 1 deletion llm_rs/__init__.py
@@ -1,4 +1,28 @@
from .llm_rs import *
try:
from .llm_rs import *
except ImportError as e:
print("DLLs were not boundled with this package. Trying to locate them...")
import os
import platform

#Try to locate CUDA_PATH environment variable
cuda_path = os.environ.get("CUDA_PATH",None)
if cuda_path:
print(f"Found CUDA_PATH environment variable: {cuda_path}")
if platform.system() == "Windows":
cuda_path = os.path.join(cuda_path,"bin")
else:
cuda_path = os.path.join(cuda_path,"lib64")

print(f"Adding {cuda_path} to DLL search path...")
os.add_dll_directory(cuda_path)

try:
from .llm_rs import *
except ImportError as inner_e:
raise ImportError("Could not locate DLLs. Please check the documentation for more information.")



from .config import GenerationConfig, SessionConfig, Precision, ContainerType, QuantizationType
from .models import Llama, GptJ, Gpt2, Bloom, GptNeoX, Mpt
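Note: the fallback added to `llm_rs/__init__.py` above retries the native import after adding the directory pointed to by `CUDA_PATH` to the DLL search path. A minimal sketch of preparing that variable before the first import; the install path below is only an assumption about a default CUDA 12.x location on Windows.

```python
import os

# Hypothetical: ensure CUDA_PATH points at the local CUDA toolkit before importing
# llm_rs, so the fallback above can register its bin/ (Windows) or lib64/ (Linux)
# directory on the DLL search path. The path below is only an example.
os.environ.setdefault(
    "CUDA_PATH", r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2"
)

import llm_rs  # the import is retried after the CUDA directory has been registered
```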
7 changes: 7 additions & 0 deletions llm_rs/llm_rs.pyi
@@ -0,0 +1,7 @@


def get_accelerator() -> str:
"""
Returns the accelerator the package was compiled with. This is useful to check if the package can use the GPU.
"""
...
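Note: the new `get_accelerator()` helper makes it possible to check at runtime which backend a wheel was built against; per `src/lib.rs` below it returns `"cuda"`, `"opencl"`, `"metal"`, or `"cpu"`. A short usage sketch:

```python
import llm_rs

backend = llm_rs.get_accelerator()  # "cuda", "opencl", "metal", or "cpu"
if backend == "cpu":
    print("This build has no GPU backend; inference will run on the CPU.")
else:
    print(f"GPU backend compiled in: {backend}")
```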
5 changes: 2 additions & 3 deletions src/configs.rs
@@ -115,9 +115,8 @@ impl GenerationConfig {
}

impl GenerationConfig {
pub fn to_llm_params(&self, n_threads: usize) -> InferenceParameters {
pub fn to_llm_params(&self) -> InferenceParameters {
InferenceParameters {
n_threads,
sampler: std::sync::Arc::new(llm::samplers::TopPTopK {
top_k: self.top_k,
top_p: self.top_p,
@@ -268,7 +267,7 @@ impl SessionConfig {
memory_k_type: self.keys_memory_type.to_llama_rs_memory_type(),
memory_v_type: self.values_memory_type.to_llama_rs_memory_type(),
n_batch: self.batch_size,
use_gpu: self.use_gpu,
n_threads: self.threads,
}
}
}
13 changes: 13 additions & 0 deletions src/lib.rs
@@ -7,8 +7,21 @@ mod quantize;
mod results;
mod stopwords;

#[pyfunction]
fn get_accelerator() -> String {
match llm_base::ggml::accelerator::get_accelerator() {
llm_base::ggml::accelerator::Accelerator::CuBLAS => "cuda".to_owned(),
llm_base::ggml::accelerator::Accelerator::CLBlast => "opencl".to_owned(),
llm_base::ggml::accelerator::Accelerator::Metal => "metal".to_owned(),
_ => "cpu".to_owned(),
}
}

#[pymodule]
fn llm_rs(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_function(wrap_pyfunction!(get_accelerator, m)?)
.unwrap();

let config_module = PyModule::new(_py, "config")?;
config_module.add_class::<configs::GenerationConfig>()?;
config_module.add_class::<configs::Precision>()?;
38 changes: 14 additions & 24 deletions src/model_base.rs
@@ -91,15 +91,15 @@ impl GenerationStreamer {

pub fn _tokenize(model: &dyn llm::Model, text: &str) -> Result<Vec<u32>, InferenceError> {
Ok(model
.vocabulary()
.tokenizer()
.tokenize(text, false)?
.iter()
.map(|(_, token)| *token)
.collect())
}

pub fn _decode(model: &dyn llm::Model, tokens: Vec<u32>) -> Result<String, std::str::Utf8Error> {
let vocab = model.vocabulary();
let vocab = model.tokenizer();
let characters: Vec<u8> = vocab.decode(tokens, false);

match std::str::from_utf8(&characters) {
@@ -124,7 +124,7 @@ pub fn _start_session<'a>(
//Build the correct generation parameters
let mut config_to_use = generation_config.unwrap_or(configs::GenerationConfig::default());

let generation_params = config_to_use.to_llm_params(session_config.threads);
let generation_params = config_to_use.to_llm_params();

let rng = ChaCha8Rng::seed_from_u64(config_to_use.seed);
let prompt = Prompt::from(prompt);
@@ -193,13 +193,9 @@ pub fn _generate(
let mut output_request_feeding = OutputRequest::default();
_py.allow_threads(|| {
session
.feed_prompt::<Infallible, _>(
model,
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(InferenceFeedback::Continue),
)
.feed_prompt::<Infallible, _>(model, prompt, &mut output_request_feeding, |_| {
Ok(InferenceFeedback::Continue)
})
.unwrap()
});
let feed_prompt_duration = feed_start_at.elapsed().unwrap();
@@ -283,8 +279,7 @@ pub fn _embed(
session_config: &configs::SessionConfig,
prompt: String,
) -> Result<Vec<f32>, PyErr> {
let (_, inference_params, _, prompt, mut session) =
_start_session(model, session_config, &prompt, None);
let (_, _, _, prompt, mut session) = _start_session(model, session_config, &prompt, None);

//Feed the prompt
let mut output_request_feeding = OutputRequest {
Expand All @@ -293,13 +288,9 @@ pub fn _embed(
};
_py.allow_threads(|| {
session
.feed_prompt::<Infallible, _>(
model,
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(InferenceFeedback::Continue),
)
.feed_prompt::<Infallible, _>(model, prompt, &mut output_request_feeding, |_| {
Ok(InferenceFeedback::Continue)
})
.unwrap()
});
Ok(output_request_feeding.embeddings.unwrap())
@@ -348,22 +339,22 @@ macro_rules! wrap_model {
gpu_layers: config_to_use.gpu_layers,
};

let vocabulary_source: llm_base::VocabularySource;
let vocabulary_source: llm_base::TokenizerSource;

if let Some(name_or_path) = tokenizer_name_or_path {
let tokenizer_path = std::path::Path::new(&name_or_path);
if tokenizer_path.is_file() && tokenizer_path.exists() {
// Load tokenizer from file
vocabulary_source = llm_base::VocabularySource::HuggingFaceTokenizerFile(
vocabulary_source = llm_base::TokenizerSource::HuggingFaceTokenizerFile(
tokenizer_path.to_owned(),
);
} else {
// Load tokenizer from HuggingFace
vocabulary_source =
llm_base::VocabularySource::HuggingFaceRemote(name_or_path);
llm_base::TokenizerSource::HuggingFaceRemote(name_or_path);
}
} else {
vocabulary_source = llm_base::VocabularySource::Model;
vocabulary_source = llm_base::TokenizerSource::Embedded;
}

let llm_model: $llm_model =
@@ -434,7 +425,6 @@ macro_rules! wrap_model {
session
.feed_prompt::<std::convert::Infallible, _>(
self.llm_model.as_ref(),
&inference_params,
prompt,
&mut output_request_feeding,
|_| Ok(llm_base::InferenceFeedback::Continue),
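Note: the `wrap_model!` changes above migrate from `VocabularySource` to `TokenizerSource` while keeping the three resolution paths: a tokenizer file on disk, a HuggingFace repo name, or the tokenizer embedded in the model file. A hedged Python-side sketch of how that selection might surface, assuming the models expose a `tokenizer_name_or_path` argument as the macro suggests; the paths and repo name below are placeholders.

```python
from llm_rs import Llama

# 1) Existing file on disk -> TokenizerSource::HuggingFaceTokenizerFile
model = Llama("ggml-model.bin", tokenizer_name_or_path="./tokenizer.json")

# 2) Any other name -> TokenizerSource::HuggingFaceRemote (fetched from HuggingFace)
model = Llama("ggml-model.bin", tokenizer_name_or_path="some-org/some-model")

# 3) No tokenizer argument -> TokenizerSource::Embedded (read from the model file)
model = Llama("ggml-model.bin")
```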