# # Fast inference with vLLM (CohereForAI/aya-23-8B)
#
# In this example, we show how to run basic inference, using [`vLLM`](https://github.com/vllm-project/vllm)
# to take advantage of PagedAttention, which speeds up inference by managing the key-value cache efficiently.

import os
import subprocess

from modal import App, Image, Secret, gpu, web_server

MODEL_DIR = "/model"
BASE_MODEL = "CohereForAI/aya-23-8B"

# ## Define a container image


# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
# is that the container no longer has to re-download the model from Hugging Face - instead, it will take
# advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
# We can download the model to a particular directory using the Hugging Face utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected, and the download step will not re-run.
def download_model_to_folder():
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    os.makedirs(MODEL_DIR, exist_ok=True)

    snapshot_download(
        BASE_MODEL,
        local_dir=MODEL_DIR,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    move_cache()


# ### Image definition
# We'll start from a recommended Docker Hub image and install `vLLM`.
# Then we'll use `run_function` to run the function defined above to ensure the weights of
# the model are saved within the container image.
image = (
    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .pip_install(
        "vllm==0.5.2",
        "wheel==0.43.0",
        "packaging==24.1",
        "huggingface_hub==0.24.0",
        "hf-transfer==0.1.6",
        "torch==2.3.1",
        "autoawq==0.2.5",
    )
    .apt_install("git")
    .run_commands(
        # flash-attn needs torch available at build time, so install it without build isolation.
        "pip install flash-attn==2.6.1 --no-build-isolation",
    )
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,
    )
)

app = App("vllm-aya-8b", image=image)
GPU_CONFIG = gpu.A100(size="40GB", count=1)


# Run vLLM's OpenAI-compatible API server on port 8000 behind a Modal web endpoint.
@app.function(
    allow_concurrent_inputs=100,
    container_idle_timeout=15,
    gpu=GPU_CONFIG,
    secrets=[
        Secret.from_name("huggingface"),
        Secret.from_dotenv(),
    ],
)
@web_server(8000, startup_timeout=300)
def openai_compatible_server():
    target = BASE_MODEL
    cmd = f"python -m vllm.entrypoints.openai.api_server --model {target} --port 8000"
    # Launch the vLLM server as a background process; the @web_server decorator waits
    # (up to startup_timeout) for port 8000 to start accepting connections.
    subprocess.Popen(cmd, shell=True)
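

# ## Try it out
#
# Once the app is deployed, Modal assigns the web endpoint a URL, and the function above serves
# the standard OpenAI API. Below is a minimal client sketch, assuming the `openai` Python package
# is installed locally and the endpoint URL has been exported as the hypothetical environment
# variable `VLLM_ENDPOINT_URL`; adjust both to match your setup.
if __name__ == "__main__":
    from openai import OpenAI

    # Point the stock OpenAI client at the deployed vLLM server. vLLM does not check the
    # API key unless the server was started with --api-key, so a placeholder works here.
    client = OpenAI(
        base_url=os.environ["VLLM_ENDPOINT_URL"] + "/v1",  # hypothetical env var holding your Modal URL
        api_key="not-needed",
    )
    response = client.chat.completions.create(
        model=BASE_MODEL,
        messages=[{"role": "user", "content": "Write a haiku about fast inference."}],
        max_tokens=64,
    )
    print(response.choices[0].message.content)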