# # Fast inference with vLLM (CohereForAI/aya-23-8B)
#
# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
# to take advantage of PagedAttention, which speeds up inference by managing the key-value cache efficiently.

import os
import subprocess

from modal import App, Image, Secret, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "CohereForAI/aya-23-8B"

# ## Define a container image


# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
# advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected, and the download step will not re-run.
def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()


# ### Image definition
# We'll start from a recommended Docker Hub image and install `vLLM`.
# Then we'll use `run_function` to run the function defined above to ensure the weights of
# the model are saved within the container image.
image = (
    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .pip_install(
        "vllm==0.5.2",
        "wheel==0.43.0",
        "packaging==24.1",
        "huggingface_hub==0.24.0",
        "hf-transfer==0.1.6",
        "torch==2.3.1",
        "autoawq==0.2.5",
    )
    .apt_install("git")
    .run_commands(
        # flash-attn needs torch available at build time, so install it without build isolation.
        "pip install flash-attn==2.6.1 --no-build-isolation",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)

app = App("vllm-aya-8b", image=image)
GPU_CONFIG = gpu.A100(size="40GB", count=1)


# Run vLLM's OpenAI-compatible API server on port 8000 behind a Modal web endpoint.
@app.function(
    allow_concurrent_inputs=100,
    container_idle_timeout=15,
    gpu=GPU_CONFIG,
    secrets=[
        Secret.from_name("huggingface"),
        Secret.from_dotenv(),
    ],
)
@web_server(8000, startup_timeout=300)
def openai_compatible_server():
    target = BASE_MODEL
    cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000"
    # Launch the vLLM server as a background process; the @web_server decorator waits
    # (up to startup_timeout) for port 8000 to start accepting connections.
    subprocess.Popen(cmd, shell=True)
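

# ## Try it out
#
# Once the app is deployed, Modal assigns the web endpoint a URL, and the function above serves
# the standard OpenAI API. Below is a minimal client sketch, assuming the `openai` Python package
# is installed locally and the endpoint URL has been exported as the hypothetical environment
# variable `VLLM_ENDPOINT_URL`; adjust both to match your setup.
if __name__ == "__main__":
    from openai import OpenAI

    # Point the stock OpenAI client at the deployed vLLM server. vLLM does not check the
    # API key unless the server was started with --api-key, so a placeholder works here.
    client = OpenAI(
        base_url=os.environ["VLLM_ENDPOINT_URL"] + "/v1",  # hypothetical env var holding your Modal URL
        api_key="not-needed",
    )
    response = client.chat.completions.create(
        model=BASE_MODEL,
        messages=[{"role": "user", "content": "Write a haiku about fast inference."}],
        max_tokens=64,
    )
    print(response.choices[0].message.content)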