
Commit 06b405c
feat: add aya 8b and update versions
1 parent 52bd5b0

13 files changed, +155 -74 lines

infinity_mxbai_embed_large_v1.py (6 additions, 7 deletions)

@@ -42,13 +42,12 @@ def download_model_to_folder():
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
         "wheel==0.43.0",
-        "huggingface_hub==0.23.3",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
-        "poetry==1.8.3",
-        "transformers==4.41.2",
-        "sentence-transformers==3.0.0",
-        "infinity_emb[all]==0.0.39"
+        "torch==2.3.1",
+        "transformers==4.42.4",
+        "sentence-transformers==3.0.1",
+        "infinity_emb[all]==0.0.51"
     )
     .apt_install("git")
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
@@ -67,7 +66,7 @@ def download_model_to_folder():
 # Run a web server on port 7997 and expose the Infinity embedding server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),
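
All three Infinity apps in this commit serve the same OpenAI-compatible REST API, so a deployment can be exercised the same way regardless of model. Below is a minimal sketch of calling the /embeddings endpoint with `requests`; the base URL is a placeholder for your own deployment's address, and the model name must match the one the server was started with.

# Minimal sketch: query an Infinity deployment's OpenAI-compatible
# /embeddings endpoint. YOUR-DEPLOYMENT-URL is a placeholder.
import requests

resp = requests.post(
    "https://YOUR-DEPLOYMENT-URL/embeddings",  # placeholder address
    json={
        "model": "mixedbread-ai/mxbai-embed-large-v1",  # must match the served model
        "input": ["The quick brown fox jumps over the lazy dog."],
    },
    timeout=30,
)
resp.raise_for_status()
vector = resp.json()["data"][0]["embedding"]
print(len(vector))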

infinity_mxbai_rerank_large_v1.py (6 additions, 7 deletions)

@@ -42,13 +42,12 @@ def download_model_to_folder():
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
         "wheel==0.43.0",
-        "huggingface_hub==0.23.3",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
-        "poetry==1.8.3",
-        "transformers==4.41.2",
-        "sentence-transformers==3.0.0",
-        "infinity_emb[all]==0.0.39"
+        "torch==2.3.1",
+        "transformers==4.42.4",
+        "sentence-transformers==3.0.1",
+        "infinity_emb[all]==0.0.51"
     )
     .apt_install("git")
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
@@ -67,7 +66,7 @@ def download_model_to_folder():
 # Run a web server on port 7997 and expose the Infinity embedding server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

infinity_snowflake_arctic_embed_l_335m.py (6 additions, 7 deletions)

@@ -42,13 +42,12 @@ def download_model_to_folder():
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
         "wheel==0.43.0",
-        "huggingface_hub==0.23.3",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
-        "poetry==1.8.3",
-        "transformers==4.41.2",
-        "sentence-transformers==3.0.0",
-        "infinity_emb[all]==0.0.39"
+        "torch==2.3.1",
+        "transformers==4.42.4",
+        "sentence-transformers==3.0.1",
+        "infinity_emb[all]==0.0.51"
     )
     .apt_install("git")
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
@@ -67,7 +66,7 @@ def download_model_to_folder():
 # Run a web server on port 8000 and expose vLLM OpenAI compatible server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

outlines_llama3_8b.py (1 addition, 1 deletion)

@@ -74,7 +74,7 @@ def download_model_to_folder():
 # Run a web server on port 7997 and expose the Infinity embedding server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

vllm_arctic_480b.py (7 additions, 8 deletions)

@@ -45,19 +45,18 @@ def download_model_to_folder():
 image = (
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
-        "vllm==0.4.3",
+        "vllm==0.5.2",
         "wheel==0.43.0",
-        "packaging==24.0",
-        "huggingface_hub==0.23.3",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
+        "torch==2.3.1",
         "autoawq==0.2.5",
     )
     .apt_install("git")
     .run_commands(
-        "pip install flash-attn==2.5.8 --no-build-isolation",
-    )
-    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
+        "pip install flash-attn==2.6.1 --no-build-isolation",
+    )  # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(
         download_model_to_folder,
@@ -73,7 +72,7 @@ def download_model_to_folder():
 # Run a web server on port 7997 and expose the Infinity embedding server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

vllm_aya_8b.py (86 additions, 0 deletions — new file; the decorator comment originally said "port 7997 … Infinity embedding server", corrected below to match the vLLM server the code actually starts)

@@ -0,0 +1,86 @@
+# # Fast inference with vLLM (CohereForAI/aya-23-8B)
+#
+# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
+# to take advantage of PagedAttention, which speeds up sequential inferences with optimized key-value caching.
+
+import os
+import subprocess
+import secrets
+
+
+from modal import Image, Secret, App, enter, gpu, method, web_server
+
+MODEL_DIR = "/model"
+BASE_MODEL = "CohereForAI/aya-23-8B"
+
+# ## Define a container image
+
+
+# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
+# is that the container no longer has to re-download the model from Huggingface - instead, it will take
+# advantage of Modal's internal filesystem for faster cold starts.
+#
+# ### Download the weights
+# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
+#
+# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
+def download_model_to_folder():
+    from huggingface_hub import snapshot_download
+    from transformers.utils import move_cache
+
+    os.makedirs(MODEL_DIR, exist_ok=True)
+
+    snapshot_download(
+        BASE_MODEL,
+        local_dir=MODEL_DIR,
+        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
+    )
+    move_cache()
+
+
+# ### Image definition
+# We'll start from a recommended Docker Hub image and install `vLLM`.
+# Then we'll use `run_function` to run the function defined above to ensure the weights of
+# the model are saved within the container image.
+image = (
+    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
+    .pip_install(
+        "vllm==0.5.2",
+        "wheel==0.43.0",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
+        "hf-transfer==0.1.6",
+        "torch==2.3.1",
+        "autoawq==0.2.5",
+    )
+    .apt_install("git")
+    .run_commands(
+        "pip install flash-attn==2.6.1 --no-build-isolation",
+    )  # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
+    .run_function(
+        download_model_to_folder,
+        secrets=[Secret.from_name("huggingface")],
+        timeout=60 * 20,
+    )
+)
+
+app = App("vllm-aya-8b", image=image)
+GPU_CONFIG = gpu.A100(size="40GB", count=1)
+
+
+# Run a web server on port 8000 and expose the vLLM OpenAI-compatible server
+@app.function(
+    allow_concurrent_inputs=100,
+    container_idle_timeout=15,
+    gpu=GPU_CONFIG,
+    secrets=[
+        Secret.from_name("huggingface"),
+        Secret.from_dotenv(),
+    ],
+)
+@web_server(8000, startup_timeout=300)
+def openai_compatible_server():
+    target = BASE_MODEL
+    cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000"
+    subprocess.Popen(cmd, shell=True)
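
Since the new app exposes vLLM's OpenAI-compatible server on port 8000, it can be queried with the standard `openai` client. A minimal sketch, assuming a placeholder deployment URL (vLLM does not check the API key unless one was configured):

# Minimal sketch: chat completion against the vLLM OpenAI-compatible
# server started above. YOUR-DEPLOYMENT-URL is a placeholder.
from openai import OpenAI

client = OpenAI(
    base_url="https://YOUR-DEPLOYMENT-URL/v1",  # placeholder address
    api_key="unused",  # vLLM ignores the key by default
)
completion = client.chat.completions.create(
    model="CohereForAI/aya-23-8B",  # must match the served model
    messages=[{"role": "user", "content": "Say hello in Turkish."}],
    max_tokens=64,
)
print(completion.choices[0].message.content)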

vllm_codeqwen_110b_v1_5.py (7 additions, 8 deletions)

@@ -45,19 +45,18 @@ def download_model_to_folder():
 image = (
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
-        "vllm==0.4.3",
+        "vllm==0.5.2",
         "wheel==0.43.0",
-        "packaging==24.0",
-        "huggingface_hub==0.23.3",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
+        "torch==2.3.1",
         "autoawq==0.2.5",
     )
     .apt_install("git")
     .run_commands(
-        "pip install flash-attn==2.5.8 --no-build-isolation",
-    )
-    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
+        "pip install flash-attn==2.6.1 --no-build-isolation",
+    )  # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
     .run_function(
         download_model_to_folder,
@@ -73,7 +72,7 @@ def download_model_to_folder():
 # Run a web server on port 8000 and expose vLLM OpenAI compatible server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

vllm_deepseek_coder_33b.py (6 additions, 6 deletions)

@@ -45,17 +45,17 @@ def download_model_to_folder():
 image = (
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
-        "vllm==0.4.3",
+        "vllm==0.5.2",
         "wheel==0.43.0",
-        "packaging==24.0",
-        "huggingface_hub==0.23.3",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
+        "torch==2.3.1",
         "autoawq==0.2.5",
     )
     .apt_install("git")
     .run_commands(
-        "pip install flash-attn==2.5.8 --no-build-isolation",
+        "pip install flash-attn==2.6.1 --no-build-isolation",
     )
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@@ -73,7 +73,7 @@ def download_model_to_folder():
 # Run a web server on port 8000 and expose vLLM OpenAI compatible server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

vllm_duckdb_nsql_7b.py (6 additions, 6 deletions)

@@ -45,17 +45,17 @@ def download_model_to_folder():
 image = (
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
-        "vllm==0.4.3",
+        "vllm==0.5.2",
         "wheel==0.43.0",
-        "packaging==24.0",
-        "huggingface_hub==0.23.3",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
+        "torch==2.3.1",
         "autoawq==0.2.5",
     )
     .apt_install("git")
     .run_commands(
-        "pip install flash-attn==2.5.8 --no-build-isolation",
+        "pip install flash-attn==2.6.1 --no-build-isolation",
     )
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@@ -73,7 +73,7 @@ def download_model_to_folder():
 # Run a web server on port 8000 and expose vLLM OpenAI compatible server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),

vllm_llama3_70b.py (6 additions, 6 deletions)

@@ -45,17 +45,17 @@ def download_model_to_folder():
 image = (
     Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
     .pip_install(
-        "vllm==0.4.3",
+        "vllm==0.5.2",
         "wheel==0.43.0",
-        "packaging==24.0",
-        "huggingface_hub==0.23.3",
+        "packaging==24.1",
+        "huggingface_hub==0.24.0",
         "hf-transfer==0.1.6",
-        "torch==2.3.0",
+        "torch==2.3.1",
         "autoawq==0.2.5",
     )
     .apt_install("git")
     .run_commands(
-        "pip install flash-attn==2.5.8 --no-build-isolation",
+        "pip install flash-attn==2.6.1 --no-build-isolation",
     )
     # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
@@ -73,7 +73,7 @@ def download_model_to_folder():
 # Run a web server on port 8000 and expose vLLM OpenAI compatible server
 @app.function(
     allow_concurrent_inputs=100,
-    container_idle_timeout=60,
+    container_idle_timeout=15,
     gpu=GPU_CONFIG,
     secrets=[
         Secret.from_name("huggingface"),
