From ab4f9dda2231e4b980f2ebd9edd01f14ad35eeb3 Mon Sep 17 00:00:00 2001 From: Michael Katsoulis Date: Thu, 7 Dec 2023 10:10:12 +0200 Subject: [PATCH] Initial toml docs --- available-hardware.mdx | 2 +- cerebrium/environments/custom-images.mdx | 4 +- cerebrium/environments/initial-setup.mdx | 101 +++++++++++++++-------- cerebrium/environments/warm-models.mdx | 4 +- cerebrium/getting-started/quickstart.mdx | 6 +- examples/langchain.mdx | 87 ++++++++++++------- examples/logo-controlnet.mdx | 68 ++++++++++----- examples/mistral-vllm.mdx | 71 +++++++++++----- examples/sdxl.mdx | 69 +++++++++++----- examples/streaming-falcon-7B.mdx | 71 +++++++++++----- examples/transcribe-whisper.mdx | 57 +++++++++---- 11 files changed, 363 insertions(+), 177 deletions(-) diff --git a/available-hardware.mdx b/available-hardware.mdx index cf42d301..30901857 100644 --- a/available-hardware.mdx +++ b/available-hardware.mdx @@ -26,7 +26,7 @@ We have the following graphics cards available on the platform: _NOTE: The maximum model sizes are calculated as a guideline, assuming that the model is the only thing loaded into VRAM. Longer inputs will result in a smaller maximum model size. Your mileage may vary._ -These GPUs can be selected using the `--hardware` flag when deploying your model on Cortex or can be specified in your config.yaml. +These GPUs can be selected using the `--gpu` flag when deploying your model on Cortex or can be specified in your `cerebrium.toml`. For more help with deciding which GPU you require, see this section [here](#choosing-a-gpu). _Due to the global shortage of GPUs at the moment, we may not always have the Enterprise edition of your GPU available. In this case, we will deploy to the Workstation edition of the GPU._ diff --git a/cerebrium/environments/custom-images.mdx b/cerebrium/environments/custom-images.mdx index 6532b15c..b3f22968 100644 --- a/cerebrium/environments/custom-images.mdx +++ b/cerebrium/environments/custom-images.mdx @@ -3,7 +3,7 @@ title: Custom Images description: Specify your versions, dependencies and packages to use --- -By default, Cerebrium models are executed in Python 3.9 unless the Python version specified by you in your **config.yaml** is different. However, Cerebrium only supports version 3.9 and above. +By default, Cerebrium models are executed in Python 3.9 unless the Python version specified by you in your **cerebrium.toml** is different. However, Cerebrium only supports version 3.9 and above. Traditionally, when working with Python, you will need access to Apt packages, Pip packages and Conda packages, and so we replicate this functionality as if you were developing locally. When creating your Cortex project, you can contain the following files @@ -15,4 +15,4 @@ When creating your Cortex project, you can contain the following files Each package must be represented on a new line just as you would locally. All the files above are optional, however, have to contain these file names specifically. 
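For a concrete picture of the expected format, a minimal `requirements.txt` and `pkglist.txt` pair could look something like this (both files are shown in one block for brevity, and the package names and versions are purely illustrative):

```
# requirements.txt - one pip package per line, ideally pinned
transformers==4.35.0
accelerate==0.24.1

# pkglist.txt - one apt package per line
ffmpeg
```
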
Typically, specifying versions for packages leads to faster builds however, if you ever find you would like to change version numbers or find your library versions aren't -updating, please add the following flag to your deploy command: `cerebrium deploy model-name --force-rebuild` +updating, please add the following flag to your deploy command: `cerebrium deploy --name model-name --force-rebuild` diff --git a/cerebrium/environments/initial-setup.mdx b/cerebrium/environments/initial-setup.mdx index 75ba567b..9e4b2177 100644 --- a/cerebrium/environments/initial-setup.mdx +++ b/cerebrium/environments/initial-setup.mdx @@ -14,13 +14,10 @@ This will create a Cortex project in the specified directory with the following ``` project_name/ ├── main.py -├── requirements.txt -├── pkglist.txt -├── conda_pkglist.txt -└── config.yaml +└── cerebrium.toml ``` -Cortex supports the use of config YAML files to configure various aspects of your project such as hardware requirements, memory and much more. +Cortex supports the use of `toml` config files to configure various aspects of your project such as hardware requirements, scaling parameters and much more. Using config files makes it easier to keep track of your Cerebrium deployments, share them and use git versioning to show changes over time. To deploy your model with a specific config file, you can use the `cerebrium deploy` command with the `--config-file` flag to specify the path to your config file. Otherwise `cerebrium deploy` will use the only yaml in the file directory. @@ -35,36 +32,72 @@ Your config file can be named anything you want and can be placed anywhere on yo The parameters for your config file are the same as those which you would use as flags for a normal `cerebrium deploy` command. They're tabulated below for your convenience: -| Parameter | Description | Type | Default | -| ------------------- | ----------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------ | -| `name` | Name of the deployment | string | | -| `api_key` | API key for the deployment | string | not included for safety | -| `hardware` | Hardware to use for the deployment | string | GPU | -| `gpu_count` | The number of GPUs to specify | int | 2 | -| `cpu` | The number of CPU cores to use | int | 2 | -| `memory` | The amount of Memory to use in GB | int | 14.5 | -| `log_level` | Log level for the deployment | string | INFO | -| `include` | Local files to include in the deployment | string | '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' | -| `exclude` | Local Files to exclude from the deployment | string | '[./.*, ./__*]' | -| `disable_animation` | Whether to disable the animation in the logs. | boolean | false | -| `python_version` | The Python version you would like to run | float | 3.9 | -| `min_replicas` | The minimum number of replicas to run. | int | 0 | -| `max_replicas` | The maximum number of replicas to scale to. | int | \*plan limit | -| `cooldown` | The number of seconds to keep your model warm after each request. It resets after every request | int | 60 | +| Section | Parameter | Description | Type | Default | +| --- | --- | --- | --- | --- | +| `cerebrium.build` | A section for all the parameters governing your cortex builds | | | | +| | `predict_data` | The data to use to test your predict function on build. 
This is the same as the payload in an inference call | string | '{"prompt": "Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build."}' |
+| | `force_rebuild` | Whether to force a rebuild of your deployment | boolean | false |
+| | `disable_animation` | Whether to disable the animation in the logs. | boolean | false |
+| | `log_level` | Log level for the deployment | string | INFO |
+| | `disable_deployment_confirmation` | Whether to disable the pre-deployment confirmation prompt | boolean | false |
+| `cerebrium.deployment` | All the parameters related to the lifetime of your deployment live here. | | | |
+| | `python_version` | The Python version you would like to run | float | 3.9 |
+| | `include` | Local files to include in the deployment | string | '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' |
+| | `exclude` | Local files to exclude from the deployment | string | '[./.*, ./__*]' |
+| `cerebrium.hardware` | Select the specifics for the machine you would like to run here. | | | |
+| | `gpu` | The GPU you would like to use. | string | AMPERE_A5000 |
+| | `cpu` | The number of CPU cores to use | int | 2 |
+| | `memory` | The amount of memory to use in GB | float | 14.5 |
+| | `gpu_count` | The number of GPUs to specify | int | 2 |
+| `cerebrium.scaling` | All the parameters related to the auto-scaling of your deployment when live are placed here. | | | |
+| | `min_replicas` | The minimum number of replicas to run. | int | 0 |
+| | `max_replicas` | The maximum number of replicas to scale to. | int | \*plan limit |
+| | `cooldown` | The number of seconds to keep your model warm after each request. It resets after every request ends. | int | 60 |
+| `cerebrium.requirements` | All the parameters related to the packages you would like to install on your deployment are placed here. | | | |
+| | `pip` | The pip packages you would like to install. In the format 'module' = 'version_constraints' | dict (toml) | |
+| | `conda` | The conda packages you would like to install. In the format 'module' = 'version_constraints' | dict (toml) | |
+| | `apt` | The apt packages you would like to install. | list (toml) | |
+
+
+
 ## Config File Example

-```yaml
-%YAML 1.2
----
-name: an-optional-name
-api_key: an-optional-api-key
-hardware: GPU
-exclude: "[./.*, ./__*]"
-include: "[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]"
-log_level: INFO
-disable_animation: false
-python_version: 3.9
-min_replicas: 0
-max_replicas: 30
+```toml
+# This file was automatically generated by Cerebrium as a starting point for your project.
+# You can edit it as you wish.
+
+# If you would like to learn more about your Cerebrium config, please visit https://docs.cerebrium.ai/cerebrium/environments/initial-setup#config-file-example
+
+[cerebrium.build]
+predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}"
+force_rebuild = false
+disable_animation = false
+log_level = "INFO"
+disable_deployment_confirmation = false
+
+[cerebrium.deployment]
+python_version = "3.10"
+include = "[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]"
+exclude = "[./.*, ./__*]"
+
+[cerebrium.hardware]
+gpu = "AMPERE_A5000"
+cpu = 2
+memory = 16.0
+gpu_count = 1
+
+[cerebrium.scaling]
+min_replicas = 0
+cooldown = 60
+
+[cerebrium.requirements.pip]
+torch = ">=2.0.0"
+
+[cerebrium.requirements.conda]
+cuda = ">=11.7"
+cudatoolkit = "==11.7"
+
+[cerebrium.requirements]
+apt = [ "libgl1-mesa-glx", "libglib2.0-0"]
+
 ```
diff --git a/cerebrium/environments/warm-models.mdx b/cerebrium/environments/warm-models.mdx
index de69a49b..223bc3fa 100644
--- a/cerebrium/environments/warm-models.mdx
+++ b/cerebrium/environments/warm-models.mdx
@@ -8,11 +8,11 @@ There are two ways to do this based on your use case:

 1. Set min replicas to 1 or more.

-This is set through the **min_replicas** option in your `config.yaml` file. This is typically the best option if you would like to sustain a base load or would like
+This is set through the **min_replicas** option in your `cerebrium.toml` file. This is typically the best option if you would like to sustain a base load or would like
 to meet minimum SLA's with customers. Please note that you are charged for 24/7 usage of the instances

 2. Set your cooldown period

-You set this using the **cooldown** parameter in your `config.yaml` and is by default set to 60 seconds. This is the number of seconds of inactivity from when your last
+You set this using the **cooldown** parameter in your `cerebrium.toml` and is by default set to 60 seconds. This is the number of seconds of inactivity from when your last
 request finishes that a container must experience before terminating. Every time you get a new request, this time is reset. It is important to note that you are
 charged for the cooldown time since your container is constantly running.
diff --git a/cerebrium/getting-started/quickstart.mdx b/cerebrium/getting-started/quickstart.mdx
index 172e88af..cfa4c611 100644
--- a/cerebrium/getting-started/quickstart.mdx
+++ b/cerebrium/getting-started/quickstart.mdx
@@ -12,10 +12,8 @@ cerebrium init first-project

-Currently, our implementation has five components:
+Currently, our implementation has two components:

 - **main.py** - This is where your Python code lives. This is mandatory to include.
-- **requirements.txt** - This is where you define your Python packages where each package should be on a new line. Deployment will be quicker if you specify specific versions. This is optional to include.
-- **pkglist.txt** - This is where you can define Linux packages where each package should be on a new line. We run the apt-install command for items here. This is optional to include.
-- **conda_pkglist.txt** - This is where you can define Conda packages where each package should be on a new line. if you prefer using it for some libraries over pip. You can use both conda and pip in conjunction. This is optional to include.
-- **config.yaml** - This is where you define all the configurations around your model such as the hardware you use, memory required, min replicas etc.
Check [here](../environments/initial-setup) for a full list + +- **cerebrium.toml** - This is where you define all the configurations around your model such as the hardware you use, scaling parameters, deployment config, build parameters, etc. Check [here](../environments/initial-setup) for a full list Every main.py you deploy needs the following mandatory layout: diff --git a/examples/langchain.mdx b/examples/langchain.mdx index 6a4cd201..ff4fa821 100644 --- a/examples/langchain.mdx +++ b/examples/langchain.mdx @@ -18,25 +18,24 @@ First we create our project: cerebrium init langchain-QA ``` -We need certain Python packages to implement this project. Let's add those to our **_requirements.txt_** file: - -``` -pytube # For audio downloading -langchain -faiss-gpu -ffmpeg -openai-whisper -transformers -sentence_transformers -cerebrium +We need certain Python packages to implement this project. Let's add those to our **[cerebrium.requirements.pip]** section of our `cerebrium.toml` file: + +```toml +[cerebrium.requirements.pip] +pytube = "" # For audio downloading +langchain = "" +faiss-gpu = "" +ffmpeg = "" +openai-whisper = "" +transformers = ">=4.35.0" +sentence_transformers = ">=2.2.0" ``` -To use Whisper, we also have to install ffmpeg and a few other packages as a Linux package and therefore have to define these in **pkglist.txt** - this is to install all Linux-based packages. +To use Whisper, we also have to install ffmpeg and a few other packages as a Linux package and therefore have to define these in **[cerebrium.requirements]** - this is to install all Linux-based packages. -``` -ffmpeg -libopenblas-base -libomp-dev +```toml +[cerebrium.requirements] +apt = [ "ffmpeg", "libopenblas-base", "libomp-dev"] ``` Our **main.py** file will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. @@ -147,26 +146,54 @@ We then integrate Langchain with a Cerebrium deployed endpoint to answer questio ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. You config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. 
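If you later find that Whisper and the embedding model need more headroom than the defaults, the `[cerebrium.hardware]` section is where you would adjust it. A minimal sketch, with purely illustrative values:

```toml
[cerebrium.hardware]
gpu = "AMPERE_A5000"
cpu = 2
memory = 20.0  # raise this if the models no longer fit in RAM
gpu_count = 1
```
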
You cerebrium.toml file should look like: + + +```toml + +[cerebrium.build] +predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" +force_rebuild = false +disable_animation = false +log_level = "INFO" +disable_deployment_confirmation = false + +[cerebrium.deployment] +name = "langchain-qa" +python_version = "3.10" +include = "[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]" +exclude = "[./.*, ./__*]" + +[cerebrium.hardware] +gpu = "AMPERE_A5000" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +cooldown = 60 + +[cerebrium.requirements] +apt = [ "ffmpeg", "libopenblas-base", "libomp-dev"] + +[cerebrium.requirements.pip] +pytube = "" # For audio downloading +langchain = "" +faiss-gpu = "" +ffmpeg = "" +openai-whisper = "" +transformers = ">=4.35.0" +sentence_transformers = ">=2.2.0" + +[cerebrium.requirements.conda] -``` -%YAML 1.2 ---- -hardware: AMPERE_A5000 -memory: 14 -cpu: 2 -min_replicas: 0 -log_level: INFO -include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' -exclude: '[./.*, ./__*]' -cooldown: 60 -disable_animation: false ``` To deploy the model use the following command: ```bash -cerebrium deploy langchain-QA +cerebrium deploy ``` Once deployed, we can make the following request: diff --git a/examples/logo-controlnet.mdx b/examples/logo-controlnet.mdx index b2725690..1c02f949 100644 --- a/examples/logo-controlnet.mdx +++ b/examples/logo-controlnet.mdx @@ -22,14 +22,15 @@ cerebrium init controlnet-logo It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Let us create our **_requirements.txt_** file and add the following packages: - -``` -accelerate -transformers -safetensors -opencv-python -diffusers +Let us add the following packages to the **[cerebrium.requirements.pip]** section of our `cerebrium.toml` file: + +```toml +[cerebrium.requirements.pip] +accelerate = "" +transformers = ">=4.35.0" +safetensors = "" +opencv-python = "" +diffusers = "" ``` To start, we need to create a **main.py** file which will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. @@ -120,20 +121,45 @@ def predict(item, run_id, logger): ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. 
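One note on the `[cerebrium.requirements.pip]` entries you will see below: an empty string means no version constraint. As the custom images docs mention, pinning versions tends to make builds faster, and the constraint strings follow normal pip syntax. For example (these particular pins are illustrative assumptions, not tested versions):

```toml
[cerebrium.requirements.pip]
diffusers = "==0.23.0"     # exact pin (illustrative)
opencv-python = ">=4.8.0"  # lower bound (illustrative)
```
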
You cerebrium.toml file should look like: + +```toml + +[cerebrium.build] +predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" +force_rebuild = false +disable_animation = false +log_level = "INFO" +disable_deployment_confirmation = false + +[cerebrium.deployment] +name = "controlnet-logo" +python_version = "3.10" +include = "[./*, main.py]" +exclude = "[./.*, ./__*]" + +[cerebrium.hardware] +gpu = "AMPERE_A5000" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +cooldown = 60 + +[cerebrium.requirements] +apt = ["ffmpeg"] + +[cerebrium.requirements.pip] +accelerate = "" +transformers = ">=4.35.0" +safetensors = "" +opencv-python = "" +diffusers = "" + +[cerebrium.requirements.conda] -``` -%YAML 1.2 ---- -hardware: AMPERE_A5000 -memory: 14 -cpu: 2 -min_replicas: 0 -log_level: INFO -include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' -exclude: '[./.*, ./__*]' -cooldown: 60 -disable_animation: false ``` To deploy the model, use the following command: diff --git a/examples/mistral-vllm.mdx b/examples/mistral-vllm.mdx index 4b7e3d7b..262df77a 100644 --- a/examples/mistral-vllm.mdx +++ b/examples/mistral-vllm.mdx @@ -18,15 +18,16 @@ First we create our project: cerebrium init mistral-vllm ``` -We need certain Python packages to implement this project. Lets add those to our **_requirements.txt_** file: +We need certain Python packages to implement this project. Lets add those to our **[cerebrium.requirements.pip]** in our `cerebrium.toml` file: ``` -sentencepiece -torch -vllm -transformers -accelerate -xformers +[cerebrium.requirements.pip] +sentencepiece = "" +torch = ">=2.0.0" +vllm = "" +transformers = ">=4.35.0" +accelerate = "" +xformers = "" ``` Our **main.py** file will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. @@ -59,7 +60,7 @@ llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1", dtype="bfloat16") def predict(item, run_id, logger): item = Item(**item) - # Now jusst setup your sampling parameters for inference: + # Now just setup your sampling parameters for inference: sampling_params = SamplingParams(temperature=item.temperature, top_p=item.top_p, top_k=item.top_k, max_tokens=item.max_tokens, frequency_penalty=item.frequency_penalty) # And feed your prompt and sampling params into your LLM pipeline as follows. @@ -82,26 +83,52 @@ The implementation in our **predict** function is pretty straight forward in tha ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is an AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. Your config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is an AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. 
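Since `predict_data` is the same JSON payload your `predict` function receives at inference time, you could tailor it to the fields this example's `Item` accepts (a prompt plus the sampling parameters used above). A hedged sketch with illustrative values:

```toml
[cerebrium.build]
predict_data = "{\"prompt\": \"What is the capital city of France?\", \"temperature\": 0.8, \"top_p\": 0.75, \"top_k\": 40, \"max_tokens\": 256, \"frequency_penalty\": 1}"
```
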
Your cerebrium.toml file should look like: + +```toml + +[cerebrium.build] +predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" +force_rebuild = false +disable_animation = false +log_level = "INFO" +disable_deployment_confirmation = false + +[cerebrium.deployment] +name = "mistral-vllm" +python_version = "3.10" +include = "[./*, main.py]" +exclude = "[./.*, ./__*]" + +[cerebrium.hardware] +gpu = "AMPERE_A5000" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +cooldown = 60 + +[cerebrium.requirements] +apt = ["ffmpeg"] + +[cerebrium.requirements.pip] +sentencepiece = "" +torch = ">=2.0.0" +vllm = "" +transformers = ">=4.35.0" +accelerate = "" +xformers = "" + +[cerebrium.requirements.conda] -``` -%YAML 1.2 ---- -hardware: AMPERE_A5000 -memory: 14 -cpu: 2 -min_replicas: 0 -log_level: INFO -include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' -exclude: '[./.*, ./__*]' -cooldown: 60 -disable_animation: false ``` To deploy the model use the following command: ```bash -cerebrium deploy mistral-vllm +cerebrium deploy ``` Once deployed, we can make the following request: diff --git a/examples/sdxl.mdx b/examples/sdxl.mdx index 65dc9b03..65a91f80 100644 --- a/examples/sdxl.mdx +++ b/examples/sdxl.mdx @@ -20,14 +20,15 @@ cerebrium init sdxl-refiner It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Let us create our **_requirements.txt_** file and add the following packages: - -``` -invisible_watermark -transformers -accelerate -safetensors -diffusers +Let us add the following packages to the **[cerebrium.requirements.pip]** section of our `cerebrium.toml` file: + +```toml +[cerebrium.requirements.pip] +invisible_watermark = "" +transformers = ">=4.35.0" +accelerate = "" +safetensors = "" +diffusers = "" ``` To start, we need to create a **main.py** file which will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. @@ -101,20 +102,46 @@ def predict(item, run_id, logger): ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. 
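The `[cerebrium.requirements.conda]` table in the config below is left empty. If you prefer conda-managed CUDA libraries, you can pin them there in the same way the initial-setup example does, for instance:

```toml
[cerebrium.requirements.conda]
cuda = ">=11.7"
cudatoolkit = "==11.7"
```
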
You cerebrium.toml file should look like: + + +```toml + +[cerebrium.build] +predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" +force_rebuild = false +disable_animation = false +log_level = "INFO" +disable_deployment_confirmation = false + +[cerebrium.deployment] +name = "sdxl" +python_version = "3.10" +include = "[./*, main.py]" +exclude = "[./.*, ./__*]" + +[cerebrium.hardware] +gpu = "AMPERE_A5000" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +cooldown = 60 + +[cerebrium.requirements] +apt = ["ffmpeg"] + +[cerebrium.requirements.pip] +accelerate = "" +transformers = ">=4.35.0" +safetensors = "" +opencv-python = "" +diffusers = "" + +[cerebrium.requirements.conda] -``` -%YAML 1.2 ---- -hardware: AMPERE_A5000 -memory: 14 -cpu: 2 -min_replicas: 0 -log_level: INFO -include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' -exclude: '[./.*, ./__*]' -cooldown: 60 -disable_animation: false ``` To deploy the model use the following command: diff --git a/examples/streaming-falcon-7B.mdx b/examples/streaming-falcon-7B.mdx index 42868cff..7147d6e9 100644 --- a/examples/streaming-falcon-7B.mdx +++ b/examples/streaming-falcon-7B.mdx @@ -20,15 +20,16 @@ cerebrium init streaming-falcon It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Let us create our **_requirements.txt_** file and add the following packages: - -``` -git+https://github.com/huggingface/peft.git -git+https://github.com/huggingface/transformers.git -git+https://github.com/huggingface/accelerate.git -bitsandbytes -sentencepiece -torch +Let us add the following packages to the **[cerebrium.requirements.pip]** section of our `cerebrium.toml` file: + +```toml +[cerebrium.requirements.pip] +peft = "git+https://github.com/huggingface/peft.git" +transformers = "git+https://github.com/huggingface/transformers.git" +accelerate = "git+https://github.com/huggingface/accelerate.git" +bitsandbytes = "" +sentencepiece = "" +torch = "" ``` Our **main.py** file will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. @@ -116,20 +117,46 @@ importantly, we use the **yield** keyword to return output from our model as its ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. 
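Cold starts are especially noticeable on a streaming endpoint like this one. As described in the warm models docs, you can trade cost for latency through `[cerebrium.scaling]`. An illustrative sketch (remember that a warm replica is billed for the whole time it is up):

```toml
[cerebrium.scaling]
min_replicas = 1  # keep one replica warm at all times
cooldown = 120    # seconds of inactivity before a container terminates
```
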
You cerebrium.toml file should look like: + +```toml + +[cerebrium.build] +predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" +force_rebuild = false +disable_animation = false +log_level = "INFO" +disable_deployment_confirmation = false + +[cerebrium.deployment] +name = "streaming-falcon" +python_version = "3.10" +include = "[./*, main.py]" +exclude = "[./.*, ./__*]" + +[cerebrium.hardware] +gpu = "AMPERE_A5000" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +cooldown = 60 + +[cerebrium.requirements] +apt = ["ffmpeg"] + +[cerebrium.requirements.pip] +peft = "git+https://github.com/huggingface/peft.git" +transformers = "git+https://github.com/huggingface/transformers.git" +accelerate = "git+https://github.com/huggingface/accelerate.git" +bitsandbytes = "" +sentencepiece = "" +torch = "" + +[cerebrium.requirements.conda] -``` -%YAML 1.2 ---- -hardware: AMPERE_A5000 -memory: 14 -cpu: 2 -min_replicas: 0 -log_level: INFO -include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' -exclude: '[./.*, ./__*]' -cooldown: 60 -disable_animation: false ``` To deploy the model use the following command: diff --git a/examples/transcribe-whisper.mdx b/examples/transcribe-whisper.mdx index dac52473..94331d5b 100644 --- a/examples/transcribe-whisper.mdx +++ b/examples/transcribe-whisper.mdx @@ -21,11 +21,12 @@ cerebrium init distil-whisper It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Let us create our **_requirements.txt_** file and add the following packages: +Let us add the following packages to the **[cerebrium.requirements.pip]** section of our `cerebrium.toml` file: -``` -transformers -accelerate +```toml +[cerebrium.requirements.pip] +accelerate = "" +transformers = ">=4.35.0" openai-whisper ``` @@ -120,22 +121,42 @@ In our predict function, which only runs on inference requests, we simply create ## Deploy -Your config.yaml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You config.yaml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the hardware you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. 
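If your project folder contains large local files (for example audio clips you used for testing), the `include` and `exclude` patterns under `[cerebrium.deployment]` control what gets uploaded with your deployment. The `./*.mp3` pattern below is purely an illustration:

```toml
[cerebrium.deployment]
include = "[./*, main.py]"
exclude = "[./.*, ./__*, ./*.mp3]"
```
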
Your cerebrium.toml file should look like:

-```
-%YAML 1.2
----
-hardware: AMPERE_A5000
-memory: 10
-cpu: 2
-min_replicas: 0
-log_level: INFO
-include: '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]'
-exclude: '[./.*, ./__*]'
-cooldown: 60
-disable_animation: false
-```
+```toml
+
+[cerebrium.build]
+predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}"
+force_rebuild = false
+disable_animation = false
+log_level = "INFO"
+disable_deployment_confirmation = false
+
+[cerebrium.deployment]
+name = "distil-whisper"
+python_version = "3.10"
+include = "[./*, main.py]"
+exclude = "[./.*, ./__*]"
+
+[cerebrium.hardware]
+gpu = "AMPERE_A5000"
+cpu = 2
+memory = 10.0
+gpu_count = 1
+
+[cerebrium.scaling]
+min_replicas = 0
+cooldown = 60
+
+[cerebrium.requirements.pip]
+accelerate = ""
+transformers = ">=4.35.0"
+openai-whisper = ""
+
+[cerebrium.requirements.conda]
+
+```

 To deploy the model use the following command:

 ```bash