diff --git a/README.md b/README.md index 00985438..ed2588de 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Welcome to Cerebrium's documentation hub currently available at [docs.cerebrium.ai](https://docs.cerebrium.ai) -Cerebrium is an AWS Sagemaker alternative providing all the features you need to quickly build an ML product. +Cerebrium is an AWS SageMaker alternative providing all the features you need to quickly build an ML product. ### πŸš€ Setup @@ -26,7 +26,7 @@ yarn installed already run `npm install --global yarn` in your terminal. ### 😎 Publishing Changes -Changes will be deployed to production automatically after pushing to the default (`master`) branch. +Changes are deployed to production automatically after pushing to the default (`master`) branch. You can also preview changes using PRs, which generates a preview link of the docs. diff --git a/available-hardware.mdx b/available-hardware.mdx index 32e14506..a99d3666 100644 --- a/available-hardware.mdx +++ b/available-hardware.mdx @@ -10,28 +10,26 @@ This page lists the hardware that is currently available on the platform. 
If you # Hardware -## GPU's +## GPUs We have the following graphics cards available on the platform: -| Name | Cerebrium Name | VRAM | Minimum Plan | Max fp32 Model Params | Max fp16 Model Params -| --------------------------------------------------------------------------------------------------- | :------: |------ | :-------------------: | :-------------------: | :------------------: | -| [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) | Special Request | 80GB | Enterprise | 18B | 36B -| [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) | Special Request | 80GB | Standard | 18B | 36B -| [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) | AMPERE_A100 | 40GB | Standard | 9B | 18B -| [NVIDIA RTX A6000](https://www.nvidia.com/en-us/design-visualization/rtx-a6000/) | AMPERE_A6000 | 48GB | Hobby | 10B | 21B -| [NVIDIA RTX A5000](https://www.nvidia.com/en-us/design-visualization/rtx-a5000/) | AMPERE_A5000 | 24GB | Hobby | 5B | 10B -| [NVIDIA RTX A4000](https://www.nvidia.com/en-us/design-visualization/rtx-a4000/) | AMPERE_A4000 | 16GB | Hobby | 3B | 7B -| [NVIDIA Quadro RTX 5000](https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-5000-data-sheet-us-nvidia-704120-r4-web.pdf) | TURING_5000 | 16GB | Hobby | 3B | 7B -| [NVIDIA Quadro RTX 4000](https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/quadro-product-literature/quadro-rtx-4000-datasheet-us-nvidia-1060942-r2-web.pdf) | TURING_4000 | 8GB | Hobby | 1B | 3B - -_NOTE: The maximum model sizes are calculated as a guideline, assuming that the model is the only thing loaded into VRAM. Longer inputs will result in a smaller maximum model size. Your mileage may vary._ - -These GPUs can be selected using the `--gpu` flag when deploying your model on Cortex or can be specified in your `cerebrium.toml`. 
+| Name | Cerebrium Name | VRAM | Minimum Plan | Provider +| --------------------------------------------------------------------------------------------------- | :------: |------ | :-------------------: | :-------------------: | +| [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) | Special Request | 80GB | Enterprise | [AWS] +| [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) | Special Request | 80GB | Enterprise | [AWS] +| [NVIDIA A100_80GB](https://www.nvidia.com/en-us/data-center/a100/) | AMPERE_A100 | 80GB | Enterprise | [AWS] +| [NVIDIA A100_40GB](https://www.nvidia.com/en-us/data-center/a100/) | AMPERE_A100 | 40GB | Enterprise | [AWS] +| [NVIDIA A10](https://www.nvidia.com/en-us/data-center/products/a10-gpu/) | AMPERE_A10 | 24GB | Hobby | [AWS] +| [NVIDIA L4](https://www.nvidia.com/en-us/data-center/l4/) | ADA_L4 | 24GB | Hobby | [AWS] +| [NVIDIA L40s](https://www.nvidia.com/en-us/data-center/l40s/) | ADA_L40 | 48GB | Hobby | [AWS] +| [NVIDIA T4](https://www.nvidia.com/en-us/data-center/tesla-t4/) | TURING_T4 | 16GB | Hobby | [AWS] +| [AWS INFERENTIA](https://aws.amazon.com/machine-learning/inferentia/) | INF2 | 32GB | Hobby | [AWS] +| [AWS TRAINIUM](https://aws.amazon.com/machine-learning/trainium/) | TRN1 | 32GB | Hobby | [AWS] + + +These GPUs can be selected using the `--gpu` flag when deploying your app on Cortex or can be specified in your `cerebrium.toml`. For more help with deciding which GPU you require, see this section [here](#choosing-a-gpu). -_Due to the global shortage of GPUs at the moment, we may not always have the Enterprise edition of your GPU available. In this case, we will deploy to the Workstation edition of the GPU._ -_These are the same GPUs, and it will not affect the performance of your model in any way._ - ## CPUs We select the CPU based on your choice of hardware, choosing the best available options so you can get the performance you need.
@@ -42,13 +40,13 @@ You can choose the number of CPU cores you require for your deployment. If you d We let you select the amount of memory you require for your deployment. All the memory you request is dedicated to your deployment and is not shared with any other deployments, ensuring that you get the performance you need. -This is the amount of memory that is available to your code when it is running and you should choose an adequate amount for your model to be loaded into VRAM if you are deploying onto a GPU. +This is the amount of memory that is available to your code when it is running, and you should choose an adequate amount for your model to be loaded into VRAM if you are deploying onto a GPU. Once again, you only pay for what you need! ## Storage We provide you with a persistent storage volume attached to your deployment. -You can use this storage volume to store any data that you need to persist between deployments. Accessing your persistent storage is covered in depth for [cortex here](./cerebrium/data-sharing-storage/persistent-storage). +You can use this storage volume to store any data that you need to persist between deployments. Accessing your persistent storage is covered in depth for [cortex here](/cerebrium/data-sharing-storage/persistent-storage). The storage volume is backed by high-performance SSDs so that you can get the best performance possible Pricing for storage is based on the amount of storage you use and is charged per GB per month. @@ -60,10 +58,10 @@ On one hand, you want the best performance possible, but on the other hand, you ## Choosing a GPU -Choosing a GPU can be a complicated task of calculating VRAM usage based on the number of parameters you have as well as the length of your inputs. Additionally, some variables are dependent on your inputs to your model which will affect the VRAM usage substantially. 
For example, with LLMs and transformer-based architectures, you need to factor in attention processes as well as any memory-heavy positional encoding that may be happening which can increase VRAM usage exponentially for some methods. Similarly, for CNNs, you need to look at the number of filters you are using as well as the size of your inputs. +Choosing a GPU can be a complicated task of calculating VRAM usage based on the number of parameters you have as well as the length of your inputs. Additionally, some variables are dependent on your inputs to your app which will affect the VRAM usage substantially. For example, with LLMs and transformer-based architectures, you need to factor in attention processes as well as any memory-heavy positional encoding that may be happening which can increase VRAM usage exponentially for some methods. Similarly, for CNNs, you need to look at the number of filters you are using as well as the size of your inputs. As a rule of thumb, the easiest way is to choose the GPU that has at least 1.5x the minimum amount of VRAM that your model requires. -This approach is conservative and will ensure that your model will fit on the GPU you choose even if you have longer inputs than you expect. However, it is just a rule of thumb and you should test the VRAM usage of your model to ensure that it will fit on the GPU you choose. +This approach is conservative and will ensure that your model will fit on the GPU you choose even if you have longer inputs than you expect. However, it's just a rule of thumb, and you should test the VRAM usage of your model to ensure that it will fit on the GPU you choose. You can calculate the VRAM usage of your model by using the following formula: diff --git a/calculating-cost.mdx b/calculating-cost.mdx index 97b74611..ccf6b742 100644 --- a/calculating-cost.mdx +++ b/calculating-cost.mdx @@ -9,19 +9,19 @@ view the pricing of various compute on our [pricing page](https://www.cerebrium. 
When you deploy a model, there are two processes we charge you for: -1. We charge you for the build process where we set up your model environment. In this step, we set up a Python environment according to your parameters before downloading and installing the required apt packages, Conda and Python packages as well as any model files you require. +1. We charge you for the build process where we set up your app environment. In this step, we set up a Python environment according to your parameters before downloading and installing the required apt packages, Conda and Python packages as well as any model files you require. You are only charged for a build if we need to rebuild your environment, ie: you have run a `build` or `deploy` command and have changed your requirements, parameters or code. Note that we cache each of the steps in a build so subsequent builds will cost substantially less than the first. -2. The model runtime. This is the amount of time it takes your code to run from start to finish on each request. There are 3 costs to consider here: +2. The app runtime. This is the amount of time it takes your code to run from start to finish on each request. There are 3 costs to consider here: - Cold start: This is the amount of time it takes to spin up a server(s), load your environment, connect storage etc. This is part of the Cerebrium service and something we are working on every day to get as low as possible. We do not charge you for this! - Model initialization: This part of your code is outside of the predict - function and only runs when your model incurs a cold start. You are charged - for the amount of time it takes for this code to run. Typically this is - loading a model into GPU RAM. + function and only runs when your app incurs a cold start. You are charged for + the amount of time it takes for this code to run. Typically this is loading a + model into GPU RAM. 
- Predict runtime: This is the code stored in your predict function and runs every time a request hits your endpoint @@ -34,7 +34,7 @@ The model you wish to deploy requires: - 20GB Memory: 20 \* $0.00000659 per second - 10 GB persistent storage: 10 \* $0.3 per month -In our situation, your model works on the first deployment and so you incur only one build process of 2 minutes. Additionally, let's say that the model has 10 cold starts a day with an average initialization of 2 seconds and lastly and average runtime (predict) of 2 seconds. Let us calculate your +In our situation, your app works on the first deployment and so you incur only one build process of 2 minutes. Additionally, let's say that the app has 10 cold starts a day with an average initialization of 2 seconds and lastly an average runtime (predict) of 2 seconds. Let us calculate your expected cost at month end with you expecting to do 100 000 model inferences. ```python diff --git a/cerebrium/data-sharing-storage/persistent-storage.mdx b/cerebrium/data-sharing-storage/persistent-storage.mdx index 408b9cf1..da4af03a 100644 --- a/cerebrium/data-sharing-storage/persistent-storage.mdx +++ b/cerebrium/data-sharing-storage/persistent-storage.mdx @@ -1,53 +1,99 @@ --- -title: "Persistent Storage" +title: "Persistent Volumes" --- -Cerebrium gives to access to persistent storage to store model weights, files and much more. This storage volume persists across your project, meaning that if -you refer to model weights or a file created in a different deployment, you will be able to access it! +Cerebrium gives you access to persistent volumes to store model weights and files. +This volume persists across your project, meaning that if +you refer to model weights or files created in a different app (but in the same project), you're able to access them. -This allows you to load in model weights more efficiently as well as reduce the size of your deployment container images.
Currently, -the volume can be accessed through `/persistent-storage` in your container instance, should you wish to access it directly and store other artifacts. +This allows model weights to be loaded in more efficiently, as well as reduce the size of your App container image. -While you have full access to this drive, we recommend that you only store files in directories other than `/persistent-storage/cache`, as this and its subdirectories -are used by Cerebrium to store your models. As a simple example, suppose you have an external SAM model that you want to use in your custom deployment. You can download it to the cache -as such: +### How it works -```python -import os -import torch +Every Cerebrium Project comes with a 50GB volume by default. This volume is mounted on all apps as `/persistent-storage`. -file_path = "/persistent-storage/segment-anything/model.pt" -# Check if the file already exists, if not download it -if not os.path.exists("/persistent-storage/segment-anything/"): - response = requests.get("https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth") - with open(file_path, "wb") as f: - f.write(response.content) +### Uploading files -# Load the model -model = torch.jit.load(file_path) -... # Continue with your initialization -``` +To upload files to your persistent volume, you can use the `cerebrium cp local_path dest_path` command. This command copies files from your local machine to the specified destination path in the volume. The dest_path is optional; if not provided, the files will be uploaded to the root of the persistent volume. -Now, in subsequent deployments, the model will load from the cache rather than download it again. +```bash +Usage: cerebrium cp [OPTIONS] LOCAL_PATH REMOTE_PATH (Optional) -## Increasing your Persistent Storage Size + Copy contents to persistent volume. -Once increased, your persistent storage size cannot be decreased. +Options: + -h, --help Show this message and exit. 
-By default, your account is given 50GB of persistent storage to start with. However, if you find you need more (for example, you get an error saying `disk quote exceeded`) then you can increase your allocation using the following steps: +Examples: + # Copy a single file + cerebrium cp src_file_name.txt # copies to /src_file_name.txt -1. Check your current persistent storage allocation by running: + cerebrium cp src_file_name.txt dest_file_name.txt # copies to /dest_file_name.txt + + # Copy a directory + cerebrium cp dir_name # copies to the root directory + cerebrium cp dir_name sub_folder/ # copies to sub_folder/ +``` + +### Listing files + +To list the files on your persistent volume, you can use the cerebrium ls [remote_path] command. This command lists all files and directories within the specified remote_path. If no remote_path is provided, it lists the contents of the root directory of the persistent volume. ```bash -cerebrium storage --get-capacity +Usage: cerebrium ls [OPTIONS] REMOTE_PATH (Optional) + + List contents of persistent volume. + +Options: + -h, --help Show this message and exit. + +Examples: + # List all files in the root directory + cerebrium ls + + # List all files in a specific folder + cerebrium ls sub_folder/ ``` -This will return your current persistent storage allocation in GB. +### Deleting files -2. To increase your persistent storage allocation run: +To delete files or directories from your persistent volume, use the `cerebrium rm remote_path` command. This command removes the specified file or directory from the persistent volume. Be careful, as this operation is irreversible. ```bash -cerebrium storage --increase-in-gb +Usage: cerebrium rm [OPTIONS] REMOTE_PATH + + Remove a file or directory from persistent volume. + +Options: + -h, --help Show this message and exit. 
+ +Examples: + # Remove a specific file + cerebrium rm /file_name.txt + + # Remove a directory and all its contents + cerebrium rm /sub_folder/ +``` + +### Real world example + +```bash +wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth +cerebrium cp sam_vit_h_4b8939.pth segment-anything/sam_vit_h_4b8939.pth +``` + +As a simple example, suppose you have an external SAM model that you want to use in your custom deployment. You can download it to a cache directory on your persistent volume. +as such: + +```python +import os +import torch + +file_path = "/persistent-storage/segment-anything/sam_vit_h_4b8939.pth" + +# Load the model +model = torch.jit.load(file_path) +... # Continue with your initialization ``` -This will return a confirmation message and your new persistent storage allocation in GB if successful. +Now, in later inference requests, the model loads from the persistent volume instead of downloading again. diff --git a/cerebrium/deployments/async-functions.mdx b/cerebrium/deployments/async-functions.mdx deleted file mode 100644 index 32152fb4..00000000 --- a/cerebrium/deployments/async-functions.mdx +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: "Async Functionality" ---- - -Unfortunately Cerebrium doesn't "properly" support async functionality however please see below how you can implement something similar using Cortex. Please -let our team know you would like the ability to use async functionality and your use case so we can add it to our roadmap. - -The main reason Cortex, doesn't support async functionality is because our **predict** function is executed synchronously. This means that you can use async -functionality throughout your code however, when it gets to the predict functionality, it needs to be executed synchronously. 
- -For example you can implement the following below: - -```python -from asyncio import ( - new_event_loop, - set_event_loop, - create_task, - gather, - run, -) - -def predict(item, run_id, logger): - loop = new_event_loop() - set_event_loop(loop) - first_model = loop.create_task(predict_first_model()) - second_model = loop.create_task(predict_second_model()) - tasks = gather(first_model, second_model) - results = loop.run_until_complete(tasks) - - return results -``` - -Essentially what we are doing above is creating an event loop which is responsible for executing coroutines and scheduling callbacks. -We then run two separate async functions on the same loop since we would like both these tasks to finish. If this is not the case, you can create multiple different -loops. We then use the 'run_until_complete' function to wait until both functions have returned. Lastly we return the results from the two predict functions. - -The above code converts asynchronous code to run synchronously. diff --git a/cerebrium/deployments/ci-cd.mdx b/cerebrium/deployments/ci-cd.mdx index 132db955..af6f4434 100644 --- a/cerebrium/deployments/ci-cd.mdx +++ b/cerebrium/deployments/ci-cd.mdx @@ -2,3 +2,95 @@ title: "CI/CD Pipelines" description: "Integrate Cerebrium into your CI/CD workflow for automated deployments" --- + +If you would like to automatically deploy a new version of your app to production/development once you have merged into the +respective branch, you can do so using the commands below. We will be using a GitHub Actions workflow. + +### 1. Get your Cerebrium OAuth credentials + +Cerebrium stores your credentials in the file `~/.cerebrium/config.yaml`. If you run the command `cat ~/.cerebrium/config.yaml` then +you should see an output of 3 variables, namely: + +- accessToken +- refreshToken +- projectId + +We will need these in the next step. + +### 2.
Define secrets in your GitHub environment + +Go to your GitHub repository -> Settings -> Environments + +![Github](/images/githubActions.png) + +You should create a new environment with a title of your choice (prod and dev). Within these environments, you have two types +of variables you can work with: + +- Environment secrets: These are encrypted and do not show in your workflow logs. +- Environment variables: These are not encrypted and so will show in the workflow logs. + +In our use case, we will store all our variables (accessToken, refreshToken, projectId) as secrets to keep the values hidden. + +### 3. GitHub Actions Workflow + +In this flow, we do the following: + +1. Install a python version of our choice (3.8 to 3.11) +2. Pip install the cerebrium package +3. Login to Cerebrium using our credentials +4. Deploy our app +5. Notify a Slack channel of successful deployment + +``` +name: Cerebrium Deployment +on: + push: + branches: + - master + workflow_dispatch: + inputs: + environment: + description: "Environment" + type: choice + options: + - "dev" + - "prod" + required: true + default: "dev" + pull_request: + branches: + - master + - development + +jobs: + deployment: + runs-on: ubuntu-latest + environment: ${{ github.event.inputs.environment || (github.ref == 'refs/heads/master' && 'prod') || 'dev' }} + env: + ENV: ${{ github.event.inputs.environment || (github.ref == 'refs/heads/master' && 'prod') || 'dev' }} + PROJECT_ID: ${{ secrets.PROJECT_ID }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN}} + REFRESH_TOKEN: ${{ secrets.REFRESH_TOKEN}} + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install Cerebrium + run: pip install cerebrium + + - name: Add Auth keys + run: cerebrium save-auth-config "$ACCESS_TOKEN" "$REFRESH_TOKEN" "$PROJECT_ID" + + - name: Deploy App + run: cerebrium deploy + + - name: Notify Slack + uses: someimportantcompany/github-actions-slack-message@v1 + with: + webhook-url:
${{ secrets.SLACK_WEBHOOK_URL }} + text: "Cerebrium app deployed to production! :tada:" + if: github.ref == 'refs/heads/master' + +``` diff --git a/cerebrium/deployments/long-running-tasks.mdx b/cerebrium/deployments/long-running-tasks.mdx index d7b16e9f..f760877a 100644 --- a/cerebrium/deployments/long-running-tasks.mdx +++ b/cerebrium/deployments/long-running-tasks.mdx @@ -2,6 +2,8 @@ title: "Long Running Tasks" --- +This feature is currently *unavailable* in the v4 API. + There are cases where the model pipelines you are running are longer than what your clients would be willing to wait for or are longer than the 3-minute limit Cerebrium allows for on endpoints. Therefore you might want tasks to execute in the background and be alerted of them when they are completed. Cerebrium automatically adds the following name parameter to every request object you send in - named **webhook_endpoint**. This means you can provide an endpoint for us to send with your model results. If we detect the parameter in your request, we will give you a response immediately with the run_id and status code 200. The results we send to your webhook_endpoint later will contain the same run_id so that you can make the link on your side. We will always alert your endpoint regardless of whether the function executes successfully or not. If the function fails, we will send you the error message.
@@ -12,7 +14,7 @@ We send the following request to our endpoint on Cerebrium that runs a LLama 2 7 ```bash curl --location --request POST 'https://run.cerebrium.ai/v3/xxxxxx/predict' \ ---header 'Authorization: ' \ +--header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data '{"prompt": "Give me a detailed plan of how I can drill to the center of the Earth.", "webhook_endpoint": "https://webhook.site/0dc4773b-5e5b-4ef1-8a72-87065852a80e"}' ``` diff --git a/cerebrium/development/serve.mdx b/cerebrium/development/serve.mdx new file mode 100644 index 00000000..9572a83c --- /dev/null +++ b/cerebrium/development/serve.mdx @@ -0,0 +1,92 @@ +--- +title: Code hot-reloading +description: Use the `cerebrium serve` command to rapidly iterate on your code +--- + +This feature is currently *unavailable* in the v4 API. + +When you are developing a `cortex` deployment on **Cerebrium**, waiting for a build to complete can be time-consuming. To speed up your development process, you can use the `cerebrium serve` command to rapidly iterate on your deployment. + +This allows you to run your deployment on a dedicated server, and see the results of your changes in a few seconds. + + + This feature is currently in beta and is available to all users. As such, you + may encounter bugs or limitations. We are actively working on improving the + experience and adding more features. If you have any feedback or suggestions, + we'd love to hear from you on [Discord](https://discord.gg/ATj6USmeE2) or + [Slack](https://join.slack.com/t/cerebriumworkspace/shared_invite/zt-1qojg3eac-q4xyu5O~MeniNIg2jNeadg)! + + +**Limitations:** + +- Build process (packages, apt, etc.) 
changes require a full restart of the served instance +- Serve sessions are not cached, meaning a full build process is done even when environments have not changed between sessions +- Responses are not returned via the Local API server + +## Usage + +To start a served instance, first navigate to the root folder of your cortex deployment. + +Then, simply run the following command in your terminal: + +```bash +cerebrium serve start +``` + +After running the `cerebrium serve start` command, it will start up an instance and create your environment. ie: install all your requested packages and dependencies. + +Once completed, it will output a URL that you can use to query your instance locally: mimicking a production endpoint. + +```bash +Info: πŸ—οΈ Starting served session... +πŸ†” Serve ID: p-abcd1234-8-cpu-only-a296 +πŸ”„ Syncing files +⬆️ Uploading to Cerebrium... +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2.89k/2.89k [00:00<00:00, 68.2kB/s] +βœ… Resources uploaded successfully. +Info: Session p-abcd1234-8-cpu-only-a296 started successfully + +Info: Local API server started successfully on port 7900 +Send a POST request to: http://localhost:7900/predict +``` + +### Local API Server + +Since we are using a local API server, you don't need to worry about providing an API key in the Authorization header. +Your served instance will be running on port 7900 by default, you can change this by using the `--port` flag when you start a served instance + +You can make a request to your local endpoint using: + +```bash +curl -X POST http://localhost:7900/predict -H "Content-Type: application/json" --data '{"prompt": "this is my input data"}' +``` + +For the Beta, we don't return the response back through the local API server but rather send the data to your instance. 
Please use the logs +as a source of reference on output. + +### File Changes + +As you save changes to your **main.py** or add/delete files in your directory, the instance will automatically update in a few seconds (unless some files you add are in the GB's) +and new changes will be live that you can inference. + +If you would like to make changes to the environment, ie: hardware, pip/apt packages, etc then you will need to restart the serve instance which you can do by pressing `Ctrl+C` and running the `start` command again. + +![Example changing code with serve](/images/serve/ModifyingServeCode.gif) + + + Please note that you are charged for your compute as long as serve is running. + ie: If you are running serve for 8 minutes, you will be charged for 8 minutes + of compute based on the hardware requirements you specified. It is very + important to end your session when done. We will automatically end the session + after 10 minutes of inactivity. + + +## How it works + +When you run `cerebrium serve start`, the following happens: + +1. The `cerebrium` CLI uploads your deployment to a dedicated instance(s). +2. The server builds your deployment in the same way as `cerebrium deploy`. +3. The server starts your deployment and waits for you to send in requests. +4. If you make changes to your main.py or other code in your deployment, the server reloads your deployment and applies your changes without rebuilding the entire deployment. +5. When you're done, you can stop the server by pressing `Ctrl+C` in the same terminal where you started the server. 
diff --git a/cerebrium/endpoints/openai-compatible-endpoints.mdx b/cerebrium/endpoints/openai-compatible-endpoints.mdx new file mode 100644 index 00000000..29720654 --- /dev/null +++ b/cerebrium/endpoints/openai-compatible-endpoints.mdx @@ -0,0 +1,84 @@ +--- +title: "OpenAI Compatible Endpoints" +description: "" +--- + +By default, all functions deployed on Cerebrium are exposed as a REST API that is accessible through an authenticated POST request. We have made all these endpoints OpenAI compatible whether it +be /chat/completions or /embeddings. Below we show you a very basic implementation of a streaming OpenAI compatible endpoint. + +We recommend you check out a full example of how to deploy an OpenAI compatible endpoint using vLLM [here](https://github.com/CerebriumAI/examples/tree/master/29-openai-compatible-endpoint) + +To create a streaming compatible endpoint, we need to make sure our Cerebrium function: + +- Specifies all the parameters that OpenAI sends in the function signature +- Uses `yield data`, where `yield` signifies that we are streaming and `data` is the JSON-serializable + object which we are returning to our user. + +Here's a small snippet from the example listed above: + +```python + +async def run(messages: List[Message], model: str,...): + ##existing code + + async for output in results_generator: + prompt = output.outputs + new_text = prompt[0].text[len(previous_text):] + previous_text = prompt[0].text + full_text += new_text + + response = ChatCompletionResponse( + id=run_id, + object="chat.completion", + created=int(time.time()), + model=model, + choices=[{ + "text": new_text, + "index": 0, + "logprobs": None, + "finish_reason": prompt[0].finish_reason or "stop" + }] + ) + yield json.dumps(response.model_dump()) +``` + +Once deployed, we can set the base URL to the desired function we wish to call and use our Cerebrium JWT (accessible on your dashboard) as the API key.
+ +Our client code will then look something like this: + +```python +import os +from openai import OpenAI + +client = OpenAI( + # This is the default and can be omitted + base_url="https://api.cortex.cerebrium.ai/v4/dev-p-xxxxx/openai-compatible-endpoint/run", ##This is the name of the function you are calling + api_key="", +) + +chat_completion = client.chat.completions.create( + messages=[ + {"role": "user", "content": "What is a mistral?"}, + {"role": "assistant", "content": "A mistral is a type of cold, dry wind that blows across the southern slopes of the Alps from the Valais region of Switzerland into the Ligurian Sea near Genoa. It is known for its strong and steady gusts, sometimes reaching up to 60 miles per hour."}, + {"role": "user", "content": "How does the mistral wind form?"} + ], + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + stream=True +) +print("Starting to receive chunks...") +for chunk in chat_completion: + print(chunk) +print("Finished receiving chunks.") +``` + +The output then looks like this + +``` +Starting to receive chunks... 
+ChatCompletionChunk(id='412f0e25-61c4-93b8-a00f-09a5076cd9fa', choices=[Choice(delta=None, finish_reason='stop', index=0, logprobs=None, text=' The')], created=1724166657, model='gpt-3.5-turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=None) +ChatCompletionChunk(id='412f0e25-61c4-93b8-a00f-09a5076cd9fa', choices=[Choice(delta=None, finish_reason='stop', index=0, logprobs=None, text=' formation')], created=1724166657, model='gpt-3.5-turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=None) +ChatCompletionChunk(id='412f0e25-61c4-93b8-a00f-09a5076cd9fa', choices=[Choice(delta=None, finish_reason='stop', index=0, logprobs=None, text=' of')], created=1724166657, model='gpt-3.5-turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=None) +ChatCompletionChunk(id='412f0e25-61c4-93b8-a00f-09a5076cd9fa', choices=[Choice(delta=None, finish_reason='stop', index=0, logprobs=None, text=' the')], created=1724166657, model='gpt-3.5-turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=None) +ChatCompletionChunk(id='412f0e25-61c4-93b8-a00f-09a5076cd9fa', choices=[Choice(delta=None, finish_reason='stop', index=0, logprobs=None, text=' mist')], created=1724166657, model='gpt-3.5-turbo', object='chat.completion', service_tier=None, system_fingerprint=None, usage=None) +... +``` diff --git a/cerebrium/endpoints/rest-api.mdx b/cerebrium/endpoints/rest-api.mdx index a7d08c46..e79b1815 100644 --- a/cerebrium/endpoints/rest-api.mdx +++ b/cerebrium/endpoints/rest-api.mdx @@ -6,14 +6,14 @@ description: "" By default, all deployments on Cerebrium are a REST API that are accessible through an authenticated POST request. Authentication is done using your JWT token from your API Keys section on your dashboard. 
-Typically, a POST requests take the form: +Typically, a POST request takes the form below, where the final word is the name of the function you would like to call. For example, if the function you are calling in your main.py is predict(), the URL ends in /predict. ```bash -curl --location --request POST 'https://run.cerebrium.ai/v3/p-xxxxx/test-model/predict' \ ---header 'Authorization: ' \ +curl --location --request POST 'https://api.cortex.cerebrium.ai/v4/p-xxxxx/{app-name}/{function}' \ +--header 'Authorization: Bearer <YOUR_JWT_TOKEN>' \ --header 'Content-Type: application/json' \ --data '{ - "prompt": "Hello world!" + "function_param": "data" }' ``` @@ -22,8 +22,8 @@ Responses then take the form: ```bash { "run_id": "52eda406-b81b-43f5-8deb-fcf80dfsb74b", - "message": "Your input said: Hello World!", - "runtime: 326.34 + "run_time_ms": 326.34, + "result": {} } ``` diff --git a/cerebrium/endpoints/streaming.mdx b/cerebrium/endpoints/streaming.mdx index c9bf3c3f..2d854e63 100644 --- a/cerebrium/endpoints/streaming.mdx +++ b/cerebrium/endpoints/streaming.mdx @@ -2,24 +2,63 @@ title: "Streaming Endpoints" --- -Streaming allows users to stream live output from their models using server-sent event (SSE) streams. This works for Python objects which implement the -iterator protocol which is anything that essentially uses the 'yield' command in Python. You can return any content types as long as it is returned as a string +Streaming allows users to stream live output from their models using server-sent event (SSE) streams. +This works for Python objects which use the iterator or generator protocol. - - This feature is currently in beta and so if you would like to stream output, - please replace '**predict**' in your endpoint url with '**stream**' - +Currently, your generator/iterator is required to yield data, as it will be sent downstream via the `text/event-stream` Content-Type. +You may still send data in JSON format and then decode it appropriately on the receiving side. 
Let us see how we can implement a simple example below: ```python -def predict(item, run_id, logger): - for i in range(10): +import time + +def run(upper_range: int): + for i in range(upper_range): yield f"Number {i} " time.sleep(1) ``` -Once you deploy this code snippet and hit the stream endpoint, you will see the SSE events progressively appear. The latest Postman has great functionality to show this. +Once you deploy this code snippet and hit the stream endpoint, you will see the SSE events progressively appear every second. + +You can do this as follows: + +```bash +curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxx/stream-example/run \ + -H 'Content-Type: application/json' \ + -H 'Accept: text/event-stream' \ + -H 'Authorization: Bearer <YOUR_JWT_TOKEN>' \ + --data '{"upper_range": 3}' +``` + +This should output: + +```bash +HTTP/1.1 200 OK +cache-control: no-cache +content-encoding: gzip +content-type: text/event-stream; charset=utf-8 +date: Tue, 28 May 2024 21:12:46 GMT +server: envoy +transfer-encoding: chunked +vary: Accept-Encoding +x-envoy-upstream-service-time: 198995 +x-request-id: e6b55132-32af-96d7-a064-8915c4a42452 + +data: Number 0 +... +``` + +Progressively, you will see the rest of the data stream in every second: + +``` +... +data: Number 1 + +data: Number 2 +``` + +The latest Postman also has great functionality to show this. ![Streaming](/images/cortex/streaming-postman.png) diff --git a/cerebrium/environments/config-files.mdx b/cerebrium/environments/config-files.mdx new file mode 100644 index 00000000..35e1faaa --- /dev/null +++ b/cerebrium/environments/config-files.mdx @@ -0,0 +1,164 @@ +--- +title: Using Config Files +description: Using config files to configure Cortex deployments easily and quickly +--- + +After you've created your Cortex project, you may find that you need more control over your deployment than the default `cerebrium deploy` command provides. 
+For example, you may want to specify the number of GPUs to use, the amount of memory to use or even the version of Python for your environment. +These settings and more are all configurable using config files. + +Your config file is a TOML file that you can use to specify the parameters of your Cortex deployment. This file is used to specify the deployment parameters, build parameters, hardware parameters, scaling parameters and dependencies for your deployment. + +## Creating a config file + +The fastest and simplest way to create a config file is to run the `cerebrium init` command and specify the directory in which you would like to create the config file. +This command will create a `cerebrium.toml` file in your project root, which you can then edit to suit your needs. +You can specify any field you wish to be prepopulated with a specific value. + +```bash +cerebrium init my-project-dir --name=<app-name> --gpu=ADA_L4 +``` + +## Deployment Parameters + +Deployment parameters govern the persistent environment in which your app is deployed. +These parameters are specified under the `cerebrium.deployment` section of your config file. + +The available deployment parameters are: +| parameter | description | type | default | +| --- | --- | --- | --- | +| `name` | The name of your app | string | my-app | +| `python_version` | The Python version available for your runtime | string | {interpreter_version}| +| `include` | Local files to include in the deployment. | list\[string] | \["./\*", "main.py"] | +| `exclude` | Local Files to exclude from the deployment. | list\[string] | \["./.\*"] | +| `docker_base_image_url` | The docker base image you would like to run | string | 'debian:bookworm-slim' | +| `shell_commands` | A list of commands to run an app entrypoint script | list\[string] | [] | + +## Hardware Parameters + +The hardware parameters section is where you can define the specifications of the machine you would like to use for your deployment. 
This allows you to tailor your deployment to your specific needs, optimizing for cost or performance as you see fit. +These parameters are specified under the `cerebrium.hardware` section of your config file. + +The available hardware parameters in your config are: +| parameter | description | type | default | +| --- | --- | --- | --- | +| `cpu` | The number of CPU cores to use. | int | 2 | +| `memory` | The amount of Memory to use in GB. | float | 14 | +| `compute` | The GPU you would like to use. | string | CPU | +| `gpu_count` | The number of GPUs to specify. | int | 1 | +| `provider` | The provider you would like your deployment to be on. v4 only supports `aws` | string | aws | +| `region` | The region you would like your deployment to be on. v4 only supports `us-east-1` | string | us-east-1 | + +### Available Hardware + +The following is the hardware available on Cerebrium + +| Name | Provider | API Compatibility | +| ------------------ | -------- | ----------------- | +| `CPU` | [aws] | [v4] | +| `AMPERE_A10` | [aws] | [v4] | +| `ADA_L4` | [aws] | [v4] | +| `ADA_L40` | [aws] | [v4] | +| `TURING_T4` | [aws] | [v4] | +| `AMPERE_A100` | [aws] | [v4] | +| `AMPERE_A100_40GB` | [aws] | [v4] | +| `HOPPER_H100` | [aws] | [v4] | +| `INF2` | [aws] | [v4] | +| `TRN1` | [aws] | [v4] | + +## Scaling Parameters + +This section lets you configure how you would like your deployment to scale. You can use these parameters to control the minimum and maximum number of replicas to run, as well as the cooldown period between requests. For example, you could increase your cooldown time or even set a minimum number of replicas to run, increasing availability and avoiding cold starts. + +These parameters are specified under the `cerebrium.scaling` section of your config file. 
+ +| parameter | description | type | default | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---- | ---------- | +| `min_replicas` | The minimum number of replicas to run at all times. | int | 0 | +| `max_replicas` | The maximum number of replicas to scale to. | int | plan limit | +| `cooldown` | The number of seconds to keep your app warm after each request. It resets after every request ends. | int | 60 | +| `replica_concurrency` | The maximum number of requests an instance of your app can handle at a time. You should ensure your deployment can handle the concurrency before setting this above 1. | int | 1 | + +## Adding Dependencies + +The dependencies section of your config file is where you can specify any dependencies you would like to install in your deployment. We support **pip**, **conda** and **apt** dependencies and you can specify each of these in their relevant subsection of the dependencies section. + +For each dependency type, you can specify the name of the package you would like to install and the version constraints. If you do not want to specify any version constraints, you can use the `latest` keyword to install the latest version of the package. + +If you have an existing **requirements.txt**, **pkglist.txt** or **conda_pkglist.txt**, we'll prompt you to automatically integrate these into your config file when you run `cerebrium deploy`. + +### pip + +Your pip dependencies are specified under the `cerebrium.dependencies.pip` section of your config file. +An example of a pip dependency is shown below: + +```toml +[cerebrium.dependencies.pip] +torch = ">=2.0.0" +numpy = "latest" +``` + +### conda + +Similarly, your conda dependencies are specified under the `cerebrium.dependencies.conda` section of your config file. 
+An example of a conda dependency is shown below: + +```toml +[cerebrium.dependencies.conda] +cuda = ">=11.7" +cudatoolkit = "11.7" +``` + +### apt + +Finally, your apt dependencies are specified under the `cerebrium.dependencies.apt` section of your config file. +These are any package that you would install using `apt-get install` on a Linux machine. +An example of an apt dependency is shown below: + +```toml +[cerebrium.dependencies.apt] +"libgl1-mesa-glx" = "latest" +"libglib2.0-0" = "latest" +``` + +## Config File Example + +That was a lot of information! +Let's see an example of a config file in action. + +Below is an example of a config file that takes advantage of all the features we've discussed so far. + +```toml +[cerebrium.deployment] +name = "my-app" +python_version = "3.10" +include = ["./*", "main.py"] +exclude = ["./.*", "./__*"] +docker_base_image_url = "debian:bookworm-slim" +shell_commands = [] + +[cerebrium.hardware] +compute = "AMPERE_A10" +cpu = 2 +memory = 16.0 +gpu_count = 1 +provider = "aws" +region = "us-east-1" + +[cerebrium.scaling] +min_replicas = 0 +max_replicas = 2 +cooldown = 60 +replica_concurrency = 1 + +[cerebrium.dependencies.pip] +torch = ">=2.0.0" + +[cerebrium.dependencies.conda] +cuda = ">=11.7" +cudatoolkit = "11.7" + +[cerebrium.dependencies.apt] +"libgl1-mesa-glx" = "latest" +"libglib2.0-0" = "latest" +``` diff --git a/cerebrium/environments/custom-images.mdx b/cerebrium/environments/custom-images.mdx index e212dbf9..5db472fd 100644 --- a/cerebrium/environments/custom-images.mdx +++ b/cerebrium/environments/custom-images.mdx @@ -1,9 +1,33 @@ --- -title: Custom Images +title: Customizing the Cortex Runtime description: Specify your versions, dependencies and packages to use --- -By default, Cerebrium models are executed in Python 3.9 unless the Python version you specified in your **cerebrium.toml** is different (see [here](./initial-setup#config-file-format-and-parameters)). 
Note, Cerebrium only supports python from version 3.9 and above. +## Python Version + +By default, Cerebrium configs are [initialized](./initial-setup#config-file-format-and-parameters) to the Python version of your interpreter. +You can edit the Python version you wish to use for your app in the **[cerebrium.deployment]** section of your config. +Note, Cerebrium only supports Python 3.10 through 3.12. Support for 3.13 will follow shortly after its release. + +```toml + +[cerebrium.deployment] +name = "cerebrium-app" +... +python_version = "3.12" +``` + +## Docker Base Image + +Cerebrium supports the ability to define the base docker image you would like to use. At the moment, this feature is currently in **beta** and so we only have three options available: + +- debian:bookworm-slim (default) +- nvidia/cuda:12.1.1-runtime-ubuntu22.04 +- nvidia/cuda:11.8.0-runtime-ubuntu22.04 + +We will add support for bringing your own in the future. + +## Dependencies Traditionally, when working with Python, you will need access to Apt packages, Pip packages and Conda packages. For a deployment to cerebrium, you can specify all of these in your **cerebrium.toml** file in the following locations: @@ -34,3 +58,31 @@ All the sections above are optional, however, have to contain these file names s Typically, specifying versions for packages leads to faster builds however, if you ever find you would like to change version numbers or find your library versions aren't updating, please add the following flag to your deploy command: `cerebrium deploy --name <> --force-rebuild` + +## Shell Commands + +Cerebrium gives you the ability to run shell commands - this is for more complicated use cases of cloning repositories, running install scripts etc. +You can also define environment variables, which will populate every time your app starts up. + + + Please note that shell commands are run last in the build process which means + it happens after dependency installation. 
If you would like to install a pip + package after some shell commands, then run 'pip install transformers' in your + shell commands. + + +To run shell commands, you define a list of strings in the [cerebrium.deployment] section of your **cerebrium.toml**: + +```toml + +[cerebrium.deployment] +name = "cerebrium-app" +... +shell_commands = [ + "echo 'Hello, World!'", + "curl -LsSf https://astral.sh/uv/install.sh | sh", + "uv venv", + "uv pip install transformers", + "export SOME_ENV_VAR=value" +] +``` diff --git a/cerebrium/environments/custom-runtime.mdx b/cerebrium/environments/custom-runtime.mdx new file mode 100644 index 00000000..96e035a1 --- /dev/null +++ b/cerebrium/environments/custom-runtime.mdx @@ -0,0 +1,108 @@ +--- +title: Using Custom Runtimes (Preview) +description: Configure custom ASGI or WSGI runtimes +--- + + + This is a new feature! As such, the API is still currently subject to changes. + +Most applications are expected to work with the current implementation. +However, should you encounter an issue deploying a Custom Runtime please +reach out to us on Discord! + +Still on the way: Websocket support, Healthcheck grace period + + + +The default Cortex runtime can be great for getting up and running and simple use cases. However, you may already have an application built, or need +more complex functionality built into your app such as custom authentication, dynamic batching, public endpoints or websockets. +The Cerebrium platform allows you to deploy a custom Python-based runtime to achieve this. 
To illustrate how this works, let's +take a straightforward example ASGI webserver written with FastAPI called `main.py`: + +```python +from fastapi import FastAPI + +server = FastAPI() + +# This function would map to a request to api.cortex.cerebrium.ai/project-id/app-name/hello +@server.get("/hello") +async def hello(): + return {"message": "Hello Cerebrium!"} + +# You can define an endpoint that can relay to Cerebrium that the app is ready to receive requests +@server.get("/health") +async def health(): + return "Ok" +``` + +To enable us to deploy this application, we modify our `cerebrium.toml` with a 'cerebrium.runtime.custom' section. +There are 3 parameters in this section: + +| parameter | description | type | default | +| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ---------------------------------------------------------------------- | +| `entrypoint` | The command used to enter your application as either a list of strings or a single string. This is run from the `/cortex` directory | list\[str] | \["uvicorn", "app.main:server", "--host", "0.0.0.0", "--port", "8000"] | +| `port` | The port your application runs on. You must ensure this port is the same your app exposes and expects to receive traffic on | int | 8000 | +| `healthcheck_endpoint` | The endpoint the application uses to relay that it is ready to receive requests. If set, a _200_ response is required from the endpoint before the app can receive requests. If this is an empty string, we will check replica health with TCP to your specified port | string | "" | + +An example of a config section for a custom runtime for our main file may look something like this: + +```toml +[cerebrium.deployment] +name = "my-app" +python_version = "3.10" +... 
+ +[cerebrium.runtime.custom] +entrypoint = ["uvicorn", "app.main:server", "--host", "0.0.0.0", "--port", "8080"] +port = 8080 +healthcheck_endpoint = "" # An empty string here means health will be checked with TCP on your specified `port` + +... +``` + +An important note about entrypoints. Since your entrypoint is run from `/cortex` and your source code is in `/cortex/app`, your entrypoint must reference the `app` directory +(e.g. if you want to run `main.py`, the entrypoint would be: `python app/main.py`). Furthermore, notice that any port used in the entrypoint +must match the specified `port`. + +Depending on whether you deploy an ASGI application or an app with a self-contained webserver, you may need to install an ASGI runtime +to run your app just as you would usually. In this case, we are using an ASGI framework (FastAPI), so we will need to install an ASGI server such as `uvicorn`. +Specify this in your dependencies: + +```toml +... + +[cerebrium.dependencies.pip] +fastapi = "latest" +uvicorn = "latest" + +... +``` + +Conversely, it is possible to run WSGI apps or apps with self-contained servers. For example, you could deploy +a vLLM app using only the 'cerebrium.runtime.custom' and 'cerebrium.dependencies.pip' sections and **no** +Python code! + +```toml +... +# Note you can specify the entrypoint as a single string! +[cerebrium.runtime.custom] +entrypoint = "vllm serve meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 8000 --device cuda" +port = 8000 +healthcheck_endpoint = "/health" # Replica health will be checked with a GET request to /health + +[cerebrium.dependencies.pip] +torch = "latest" +vllm = "latest" + +... +``` + +Once you have made the necessary changes to your configuration, you are ready to deploy! You can deploy as normal +and our system will detect you are running a custom runtime automatically. + +```bash +cerebrium deploy -y +``` + +Your call signature is exactly the same as when you deploy a Cortex application. 
Every endpoint your custom server exposes will be available on +`api.cortex.cerebrium.ai/v4/{project-id}/{app-name}/an/example/endpoint` diff --git a/cerebrium/environments/initial-setup.mdx b/cerebrium/environments/initial-setup.mdx index b836ec07..02469524 100644 --- a/cerebrium/environments/initial-setup.mdx +++ b/cerebrium/environments/initial-setup.mdx @@ -6,7 +6,7 @@ description: Using config files to configure Cortex deployments easily and quick You can quickly set up a Cortex project by running the following command: ```bash -cerebrium init <> +cerebrium init my-project-dir --name=<app-name> ``` This will create a Cortex project in the specified directory with the following structure: @@ -20,89 +20,10 @@ project_name/ Cortex supports the use of `toml` config files to configure various aspects of your project such as hardware requirements, scaling parameters and much more. Using config files makes it easier to keep track of your Cerebrium deployments, share them and use git versioning to show changes over time. -To deploy your model with a specific config file, you can use the `cerebrium deploy` command with the `--config-file` flag to specify the path to your config file. This can be any .toml file or a .yaml (deprecated) file. Otherwise `cerebrium deploy` will use the `cerebrium.toml` file in your directory. - - - If you are still on the legacy YAML config file format, you can convert your - config file to a `toml` config file by answering yes to the prompt when you - run `cerebrium deploy` as normal. This will create a `cerebrium.toml` file in - your project directory which you can then edit as you wish. Additionally, all - the dependencies in your `pkglist.txt` and `conda_pkglist.txt` files will be - added to the `cerebrium.toml` file automatically - +To deploy your model with a specific config file, you can use the `cerebrium deploy` command with the `--config-file` flag to specify the path to your config file. This can be any .toml file or a .yaml (deprecated) file. 
Otherwise `cerebrium deploy` will use the `cerebrium.toml` file in your directory. If you would like more information on how to use config files, please see the [Using Config Files](/cerebrium/environments/config-files) page. ```bash cerebrium deploy ``` Your config file can be named anything you want and can be placed anywhere on your local machine. However, remember to use the `cerebrium deploy` command in the same directory as your main.py as you would normally. - -### Config File Format and Parameters - -The parameters for your config file are the same as those which you would use as flags for a normal `cerebrium deploy` command. They're tabulated below for your convenience: - -| Parameter | Description | Type | Default | -| -------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------- | -| | -| `cerebrium.build.predict_data` | The data to use to test your predict function on build. This is the same as the payload in a inference call | string | '\{"prompt": "Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build."\}' | -| `cerebrium.build.force_rebuild` | Whether to force a rebuild of your deployment | boolean | false | -| `cerebrium.build.disable_animation` | Whether to disable the animation in the logs. 
| boolean | false | -| `cerebrium.build.log_level` | Log level for the deployment | string | INFO | -| `cerebrium.build.disable_confirmation` | Whether to disable the pre-deployment confirmation prompt | boolean | false | -| `cerebrium.deployment.named ` | The name of your deployment | string | my-model | -| `cerebrium.deployment.python_version` | The Python version you would like to run | float | 3.9 | -| `cerebrium.deployment.include` | Local files to include in the deployment | string | '[./\*, main.py]' | -| `cerebrium.deployment.exclude` | Local Files to exclude from the deployment | string | '[./.\*, ./__\*]' | -| `cerebrium.hardware.gpu` | The GPU you would like to use. | string | AMPERE_A5000 | -| `cerebrium.hardware.cpu` | The number of CPU cores to use | int | 2 | -| `cerebrium.hardware.memory` | The amount of Memory to use in GB | float | 14.5 | -| `cerebrium.hardware.gpu_count` | The number of GPUs to specify | int | 2 | -| `cerebrium.scaling.min_replicas` | The minimum number of replicas to run. | int | 0 | -| `cerebrium.scaling.max_replicas` | The maximum number of replicas to scale to. | int | plan limit | -| `cerebrium.scaling.cooldown` | The number of seconds to keep your model warm after each request. It resets after every request ends. | int | 60 | -| `cerebrium.dependencies.pip` | The pip packages you would like to install. In the format 'module' = 'version_constraints' | dict (toml) | | -| `cerebrium.dependencies.conda` | The conda packages you would like to install. In the format 'module' = 'version_constraints' | dict (toml) | | -| `cerebrium.dependencies.apt` | The apt packages you would like to install. | list (toml) | | - -## Config File Example - -```toml -# This file was automatically generated by Cerebrium as a starting point for your project. -# You can edit it as you wish. 
-# If you would like to learn more about your Cerebrium config, please visit https://docs.cerebrium.ai/cerebrium/environments/initial-setup#config-file-example - -[cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false -disable_animation = false -log_level = "INFO" -disable_confirmation = false - -[cerebrium.deployment] -name = "my-model" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" - -[cerebrium.hardware] -gpu = "AMPERE_A5000" -cpu = 2 -memory = 16.0 -gpu_count = 1 - -[cerebrium.scaling] -min_replicas = 0 -cooldown = 60 - -[cerebrium.dependencies.pip] -torch = ">=2.0.0" - -[cerebrium.dependencies.conda] -cuda = ">=11.7" -cudatoolkit = "==11.7" - -[cerebrium.dependencies.apt] -"libgl1-mesa-glx" = "latest" -"libglib2.0-0" = "latest" - -``` diff --git a/cerebrium/environments/legacy-yaml-config.mdx b/cerebrium/environments/legacy-yaml-config.mdx deleted file mode 100644 index c8f797e8..00000000 --- a/cerebrium/environments/legacy-yaml-config.mdx +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: Legacy YAML Config -description: Using yaml config files to configure Cortex deployments easily and quickly ---- - -## Introduction - -This document is for users who have been using Cerebrium for a while and are familiar with the legacy YAML config file format. If you are new to Cerebrium, please refer to the [initial setup](./initial-setup) guide. - - - Your legacy YAML config file can be converted to a `toml` config file by - answering yes to the prompt when you run `cerebrium deploy` as normal. This - will create a `cerebrium.toml` file in your project directory which you can - then edit as you wish. 
Additionally, all the dependencies in your - `pkglist.txt` and `conda_pkglist.txt` files will be added to the - `cerebrium.toml` file automatically - - -## Legacy Folder Structure - -Cortex supports the use of config YAML files to configure various aspects of your project such as hardware requirements, memory and much more. -Using config files makes it easier to keep track of your Cerebrium deployments, share them and use git versioning to show changes over time. - -``` -project_name/ -β”œβ”€β”€ main.py -β”œβ”€β”€ requirements.txt -β”œβ”€β”€ pkglist.txt -β”œβ”€β”€ conda_pkglist.txt -└── config.yaml -``` - -To deploy your model with a specific config file, you can use the `cerebrium deploy` command with the `--config-file` flag to specify the path to your config file. Otherwise `cerebrium deploy` will use the config.yaml (if there is no `cerebrium.toml`) in the file directory. -Your config file can be named anything you want and can be placed anywhere on your local machine. However, remember to use the `cerebrium deploy` command in the same directory as your main.py as you would normally. - -Deploying your model with a config file is as simple as: - -```bash -cerebrium deploy -``` - -### Config File Format and Parameters - -The parameters for your config file are the same as those which you would use as flags for a normal `cerebrium deploy` command. 
They're tabulated below for your convenience: - -| Parameter | Description | Type | Default | -| ------------------- | ----------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------ | -| `name` | Name of the deployment | string | | -| `api_key` | API key for the deployment | string | not included for safety | -| `hardware` | Hardware to use for the deployment | string | GPU | -| `gpu_count` | The number of GPUs to specify | int | 2 | -| `cpu` | The number of CPU cores to use | int | 2 | -| `memory` | The amount of Memory to use in GB | int | 14.5 | -| `log_level` | Log level for the deployment | string | INFO | -| `include` | Local files to include in the deployment | string | '[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]' | -| `exclude` | Local Files to exclude from the deployment | string | '[./.*, ./__*]' | -| `disable_animation` | Whether to disable the animation in the logs. | boolean | false | -| `python_version` | The Python version you would like to run | float | 3.9 | -| `min_replicas` | The minimum number of replicas to run. | int | 0 | -| `max_replicas` | The maximum number of replicas to scale to. | int | \*plan limit | -| `cooldown` | The number of seconds to keep your model warm after each request. 
It resets after every request | int | 60 | - -## Config File Example - -```yaml -%YAML 1.2 ---- -name: an-optional-name -api_key: an-optional-api-key -hardware: GPU -exclude: "[./.*, ./__*]" -include: "[./*, main.py, requirements.txt, pkglist.txt, conda_pkglist.txt]" -log_level: INFO -disable_animation: false -python_version: 3.9 -min_replicas: 0 -max_replicas: 30 -``` diff --git a/cerebrium/environments/model-scaling.mdx b/cerebrium/environments/model-scaling.mdx index 021586e5..c815fc54 100644 --- a/cerebrium/environments/model-scaling.mdx +++ b/cerebrium/environments/model-scaling.mdx @@ -8,8 +8,8 @@ description: "Control the way your deployment scales up and down" Deployments on Cerebrium are scaled up and down automatically based on the number of requests your deployment is receiving. This is done to ensure that you are not paying for idle resources and that your deployment is always available to serve requests. There are three parameters that regulate the scaling of your deployment: -- minReplicas: The minimum number of replicas you would like to allow for your deployment. Set to 0 if you would like serverless deployments. Otherwise, latency sensitive applications, you can set this to a higher number to skip scale-up time and keep servers waiting however you will be charged for runtime. Defaults to 0. -- maxReplicas: The maximum number of replicas you would like to allow for your deployment. Useful for cost-sensitive applications when you need to limit the number of replicas that can be created. By default this is not set and we scale your model to fit the your volume of requests. +- minReplicas: The minimum number of replicas you would like to allow for your deployment. Set to 0 if you would like serverless deployments. Otherwise, latency sensitive apps, you can set this to a higher number to skip scale-up time and keep servers waiting however you will be charged for runtime. Defaults to 0. 
+- maxReplicas: The maximum number of replicas you would like to allow for your deployment. Useful for cost-sensitive apps when you need to limit the number of replicas that can be created. By default this is not set and we scale your model to fit your volume of requests. - cooldown: Cooldown period in seconds is the period of inactivity before the number of replicas for your deployment is scaled down by 1. Defaults to 60s. While these parameters are set at deployment time, you can change them at any time using the Cerebrium CLI. @@ -30,7 +30,7 @@ You may want to update your model using a rest endpoint. To do so, you can use t ```bash curl --location 'https://rest-api.cerebrium.ai/update-model-scaling' \ ---header 'Authorization: YOUR-PRIVATE-API-KEY' \ +--header 'Authorization: Bearer <YOUR_JWT_TOKEN>' \ --header 'Content-Type: application/json' \ --data '{ "name": "your-models-unique-name-here", diff --git a/cerebrium/environments/multi-gpu-inferencing.mdx b/cerebrium/environments/multi-gpu-inferencing.mdx index 9dd97f73..f71e9f2f 100644 --- a/cerebrium/environments/multi-gpu-inferencing.mdx +++ b/cerebrium/environments/multi-gpu-inferencing.mdx @@ -3,6 +3,10 @@ title: Multi-GPU Inferencing description: Tips and tricks for multi-GPU inferencing. --- + + This feature is currently only available in the v4 API using INF2 or TRN1. + + When deploying your model on Cerebrium, setting up a multi-GPU inference is as simple as specifying the `gpu_count` parameter. If using Huggingface transformers, you then just need to set the `device_map` to **auto** and the model will be automatically distributed across all available GPUs. 
diff --git a/cerebrium/environments/using-secrets.mdx b/cerebrium/environments/using-secrets.mdx index 9c85181d..496a4a39 100644 --- a/cerebrium/environments/using-secrets.mdx +++ b/cerebrium/environments/using-secrets.mdx @@ -3,31 +3,32 @@ title: "Using Secrets" description: "Access third-party platforms using secure credentials encrypted on Cerebrium" --- -You may want to use API keys, passwords or other sensitive information in your application, -but you don't want it stored in your code. -If this is the case, it would be best to make use of our Secrets functionality. -Secrets are stored encrypted on our servers (256-bit Advanced Encryption Standard (AES)) and are only decrypted when your model is run. +You may want to use API keys, passwords or other sensitive information in your app, +but you don't want them stored in your code. +If this is the case, it's best to make use of Secrets. +Secrets are stored encrypted (256-bit Advanced Encryption Standard (AES)) and are only decrypted when your app is run. -Secrets are shared across all models in your project and are not available in a model that is already running - they are loaded in on startup. +Secrets are shared across all apps in your project. -```python -from cerebrium import get_secret +Adding a Secret will make the value available to your app as an environment variable. + +Secrets are loaded on container startup. If you update a Secret, it will not take effect until your app container restarts. 
-def predict(item, run_id, logger): - item = Item(**item) +```python +def predict(run_id): - logger.info(f"Run ID: {run_id}") + print(f"Run ID: {run_id}") - my_secret = get_secret('my-secret') - logger.info("my_secret: " + my_secret) + HF_TOKEN = os.environ.get("HF_TOKEN") + print("HF_TOKEN: " + HF_TOKEN) - return {"result": f"Your secret is {my_secret}"} + return {"result": f"Your HF_TOKEN is {HF_TOKEN}"} ``` Secrets are stored as a string so if your secret is a JSON payload or similar please remember to convert it to the correct format using something such as - json.loads(get_secret('my-secret')). + json.loads(os.environ.get("MY_JSON_SECRET")). ### Managing Secrets @@ -36,27 +37,21 @@ Secrets are created, updated and deleted in your dashboard. ![Secrets](/images/secrets_dashboard.png) -## Local Development + + Secrets are loaded on model start; you will need to wait for your app + container to restart, or deploy your app before the new secret is available. + -When running your model locally, you can use still make use of Secrets. -Store them in a file called `secrets.json` or `secrets.yaml` in the root of your project and add them to your .gitignore. -These files will not be uploaded to Cerebrium. +### Local Development -### secret.yaml +When developing locally, you can use an `.env` file to store your secrets. +Later the secrets from that file can be added to your project from the dashboard. -```yaml -"my-yaml-secret": "this value comes from yaml" -``` +```python +import os +from dotenv import load_dotenv -### secret.json +load_dotenv() -```json -{ - "my-json-secret": "this value comes from json" -} +HF_TOKEN = os.environ.get("HF_TOKEN") ``` - - - Secrets are loaded on model start, you will need to redeploy your model for - changes to take effect. 
- diff --git a/cerebrium/faqs-and-help/fast-deployments-dos-and-donts.mdx b/cerebrium/faqs-and-help/fast-deployments-dos-and-donts.mdx index 328aa810..0a23dc44 100644 --- a/cerebrium/faqs-and-help/fast-deployments-dos-and-donts.mdx +++ b/cerebrium/faqs-and-help/fast-deployments-dos-and-donts.mdx @@ -32,7 +32,7 @@ In this brief guide, we delve into the strategies, best practices, and pitfalls ### Downloading files and setting up models -- **Do** make sure you utilise [persistent storage](./cerebrium/data-sharing-storage/persistent-storage). +- **Do** make sure you utilise [persistent storage](/cerebrium/data-sharing-storage/persistent-storage). - **Do** set the Huggingface cache dir to `/persistent-storage` for your models, tokenizers and datasets. - **Don't** re-download models, tokenizers, etc. if possible. diff --git a/cerebrium/faqs-and-help/fitting-large-models-on-small-gpus.mdx b/cerebrium/faqs-and-help/fitting-large-models-on-small-gpus.mdx index 58f2702e..02f18a8b 100644 --- a/cerebrium/faqs-and-help/fitting-large-models-on-small-gpus.mdx +++ b/cerebrium/faqs-and-help/fitting-large-models-on-small-gpus.mdx @@ -1,7 +1,2 @@ - +I think we can help users get more from open source models by helping them fit larger models +In Huggingface, these are things such as: - load-in-8bit - offload-dir - low-cpu-mem-usage (or whatever the flag is) diff --git a/cerebrium/getting-started/installation.mdx b/cerebrium/getting-started/installation.mdx index 38dac12e..c28c3614 100644 --- a/cerebrium/getting-started/installation.mdx +++ b/cerebrium/getting-started/installation.mdx @@ -21,30 +21,31 @@ If you have trouble installing the package, please [contact](mailto:support@cere ## Login to your Cerebrium account -1. If you haven't already, sign up for - [Cerebrium](https://dashboard.cerebrium.ai) using your email address or - Google login credentials. -2. From your dashboard you will be able to view all deployed models, API calls, - and invite teammates! 
+Run the following command: -The first thing you will need is your private API key for the project that you intend to deploy your model into. You can find the API key by navigating to the API keys section in [your dashboard](https://dashboard.cerebrium.ai/projects/) on Cerebrium. - - - Note that each project has a unique API key so double check you are using the - key that corresponds to the right project. - +```bash +cerebrium login +``` -To use the login feature, navigate to your terminal window which you would like to use to deploy your models and run the following command: +This should open up your web browser to the Cerebrium dashboard. If you have not created an account, please sign up; otherwise, if you are already logged in, it should authenticate automatically +and you should be able to return to your terminal. You should get the following success message. -```bash -cerebrium login +``` +✅ Logged in successfully +Current project context set to p-12345 ``` -This will store your API key locally, ensuring that you are logged in whenever you open a terminal and use `cerebrium deploy` without you having to enter your API key again. +You will see it prints out your current project - think of this as your working environment. In the top left of your Cerebrium dashboard you can create multiple projects, such as a dev and prod +environment for your app with different apps, secrets etc. You can see your projects using: -Some important details to note: +``` +cerebrium project list +``` + +and switch your working environment using -- You will remain logged in between different terminals provided you are the same user. -- You can still parse in an API key when deploying your models without being logged out. The parsed API key takes preference over the local one. This is particularly useful if you are deploying to a different project. +``` +cerebrium project set p-xxxx +``` Awesome! 
Let's start creating ML-based applications diff --git a/cerebrium/getting-started/introduction.mdx b/cerebrium/getting-started/introduction.mdx index 653d4d8e..29006abf 100644 --- a/cerebrium/getting-started/introduction.mdx +++ b/cerebrium/getting-started/introduction.mdx @@ -19,11 +19,11 @@ You can send us feedback requests at [support@cerebrium.ai](mailto:support@cereb - Research - We try to implement the latest research towards your model as best we can in order - for you to deliver the best experience to your users. Besides giving you to the - option to select the best chip for your workload, we look to see how we can take - maximum advantage of the GPU to get your model to run faster and cheaper without - sacrificing performance. + We try to implement the latest research towards your model as best we can in + order for you to deliver the best experience to your users. Besides giving you + the option to select the best chip for your workload, we look to see how we + can take maximum advantage of the GPU to get your model to run faster and + cheaper without sacrificing performance. ## Our users favorite features diff --git a/cerebrium/getting-started/quickstart.mdx b/cerebrium/getting-started/quickstart.mdx index 960c9c2a..ab9d0275 100644 --- a/cerebrium/getting-started/quickstart.mdx +++ b/cerebrium/getting-started/quickstart.mdx @@ -14,34 +14,18 @@ Currently, our implementation has five components: - **main.py** - This is where your Python code lives. This is mandatory to include. - **cerebrium.toml** - This is where you define all the configurations around your model such as the hardware you use, scaling parameters, deployment config, build parameters, etc. Check [here](../environments/initial-setup) for a full list -Every main.py you deploy needs the following mandatory layout: +Your **main.py** can follow a layout similar to below but it's really up to you! 
```bash -from pydantic import BaseModel - - -class Item(BaseModel): - parameter: value - - -def predict(item, run_id, logger): - item = Item(**item) - - # Do something with parameters from item - - return {"key": "value} +def run(param: str, run_id): + return {"message": f"Running {param} remotely on Cerebrium!"} ``` -The Item class is where you define the parameters your model receives as well as their type. Item needs to inherit from BaseModel which uses Pydantic to validate request schemas. -You need to define a function with the name **predict** which receives 3 params: item, run_id and logger. +You need to define a function and the name of the parameters you are sending in. The names in the function signature and in +your JSON request should match exactly. Note, your URL endpoint will end with the name of your function. In this case /run. -- **item**: This is the expected request object containing the parameters you defined above. -- **run_id**: This is a unique identifier for the user request if you want to use it to track predictions through another system -- **logger**: Cerebrium supports logging via the logger (we also support "print()" statements) however, using the logger will format your logs nicer. It contains - the 3 states across most loggers: -- logger.info -- logger.debug -- logger.error +You can also have the optional parameter called **run_id** in your function signature. This is the unique identifier for your request and will match that which you will see in your dashboard. +If you send in a JSON payload, with a parameter called run_id, it will override it. As long as your **main.py** contains the above you can write any other Python code. Import classes, add other functions etc. 
@@ -50,7 +34,7 @@ As long as your **main.py** contains the above you can write any other Python co Then navigate to where your model code (specifically your `main.py`) is located and run the following command: ```bash -cerebrium deploy my-first-model +cerebrium deploy ``` Voila! Your app should start building and you should see logs of the deployment process. It shouldn't take longer than a minute - easy peasy! @@ -74,5 +58,4 @@ Below are some links outlining some of the more advanced functionality that Cort - [Custom Images](../environments/custom-images): How to create your custom environments to run your ML Models. - [Secrets](../environments/using-secrets): Use secrets to authenticate with third-party platforms. - [Persistent Storage](../data-sharing-storage/persistent-storage): Store model weights and files locally for faster access. -- [Long Running Tasks](../deployments/long-running-tasks): Execute long running tasks in the background. - [Streaming](../endpoints/streaming): Stream output live back to your endpoint diff --git a/cerebrium/integrations/vercel.mdx b/cerebrium/integrations/vercel.mdx new file mode 100644 index 00000000..b51b7ad1 --- /dev/null +++ b/cerebrium/integrations/vercel.mdx @@ -0,0 +1,69 @@ +--- +title: Vercel Integration +description: Integrate Cerebrium with Vercel to build AI applications +--- + +You can use the Cerebrium + Vercel integration to access apps deployed on Cerebrium via REST Endpoints from Vercel projects. You’ll find the Cerebrium integration available to install in the Vercel AI marketplace. + +### What this integration does + +This integration allows you to: + +1. Easily synchronize your Cerebrium API keys to one or more Vercel projects +2. 
Call Cerebrium Endpoints over HTTP in connected Vercel projects + +### Authentication + +The integration will set the following environment variables against the user’s selected Vercel projects: + +- CEREBRIUM_JWT + +The environment variables will be set in the “preview” and “production” project targets. You can read more about environment variables within Vercel in the [documentation](https://vercel.com/docs/concepts/projects/environment-variables#environments). + +### Installing the integration + +1. Click “Add integration” on the Vercel integrations page +2. Select the Vercel account you want to connect with +3. (If logged out) Sign into an existing Cerebrium project, or create a new Cerebrium project +4. Select the Vercel projects that you wish to connect to your Cerebrium project +5. Click “Continue” +6. Back in your Vercel dashboard, confirm the environment variables were added by going to your Vercel project > "Settings" > "Environment variables" + +### Uninstalling the integration + +The Cerebrium Vercel integration is managed under the user’s Vercel dashboard under the “Integrations” tab. From there they can remove the specific integration installation from their Vercel account. + +Important: removing an integration will delete the corresponding API token set by Cerebrium in your Vercel project(s). + +## Example + +You can view our example [here](/v4/examples/mistral-vllm) on how to deploy Mistral 7B with vLLM to an auto-scaling endpoint. + +Once you have followed the example and deployed the app, you should have an output of the endpoint your app is deployed at. 
You can then deploy this within your Vercel project as: + +```javascript +fetch( + "https://api.cortex.cerebrium.ai/v4/p-/mistral-vllm/predict", + { + method: "POST", + headers: { + Authorization: `Bearer ${process.env.CEREBRIUM_JWT}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + prompt: "What is the capital city of France?", + }), + }, +) + .then((response) => response.json()) + .then((data) => console.log(data)) + .catch((error) => console.error("Error:", error)); +``` + +In this example, we built our app to take in a prompt as input and to return the output of the model. + +## Pricing + +Requests to apps use usage-based pricing, billed at 1ms granularity. The exact cost per millisecond is based on the underlying hardware you specify. + +See our [pricing page](https://www.cerebrium.ai/pricing) for current GPU prices. diff --git a/cerebrium/misc/faster-model-loading.mdx b/cerebrium/misc/faster-model-loading.mdx index cd227492..62e95717 100644 --- a/cerebrium/misc/faster-model-loading.mdx +++ b/cerebrium/misc/faster-model-loading.mdx @@ -91,4 +91,4 @@ def deserialise_saved_model(model_path, model_id, plaid=True): Note that your model does not need to be a transformers or even a huggingface model. If you have a diffusers, scikit-learn or even a custom pytorch model, you can still use **Tensorizer** to load your model from storage into GPU memory in a single step. -The only requirement to obtain the speedup from deserialisation is that you can initialise an empty model. The Deserialiser object will then restore the weights into the empty model. +The only requirement to obtain the speedup from deserialization is that you can initialize an empty model. The Deserialiser object will then restore the weights into the empty model. 
diff --git a/cerebrium/prebuilt-models/introduction.mdx b/cerebrium/prebuilt-models/introduction.mdx index 1d4ceb84..ab820ef9 100644 --- a/cerebrium/prebuilt-models/introduction.mdx +++ b/cerebrium/prebuilt-models/introduction.mdx @@ -5,11 +5,11 @@ description: "Cerebrium provides prebuilt models that you can deploy to an API s Cerebrium and its community keep a library of popular pre-built models that you can deploy using one click. If you would like any pre-built models added you can: -- [Submit PR on Github](https://github.com/CerebriumAI/cerebrium-prebuilts) All our prebuilt models live here so submit a PR or one you would like to contribute to the community. Instructions in the README :) +- [Submit PR on GitHub](https://github.com/CerebriumAI/cerebrium-prebuilts) All our prebuilt models live here so submit a PR or one you would like to contribute to the community. Instructions in the README :) - [Contact](mailto:support@cerebrium.ai) the Cerebrium team and we will see what we can do -You can deploy prebuilt models via Cerebrium by using a simple one-click deploy from your dashboard by navigating to the Prebuilt tab. Otherwise, if -you would like to read through the source code, you can navigate to the [Cerebrium Prebuilts Github](https://github.com/CerebriumAI/cerebrium-prebuilts) where you can find the source code for each of the models. +You can navigate to the [Cerebrium Prebuilts GitHub](https://github.com/CerebriumAI/cerebrium-prebuilts) where you can find the source code for each of the models. You can then clone these +repositories as a starting point. Each model's folder is a cortex deployment that can be deployed using the `cerebrium deploy` command. Navigate to the folder of the model you would like to deploy and run the command. 
diff --git a/data/training-dataset-example.json b/data/training-dataset-example.json index ec211982..e191a53c 100644 --- a/data/training-dataset-example.json +++ b/data/training-dataset-example.json @@ -1,7 +1,7 @@ [ { "prompt": "What is Cerebrium?", - "completion": "Cerebrium is an AWS Sagemaker alternative providing all the features you need to quickly build an ML product.", + "completion": "Cerebrium is an AWS SageMaker alternative providing all the features you need to quickly build an ML product.", "source": "https://docs.cerebrium.ai/introduction" }, { diff --git a/examples/logo-controlnet.mdx b/examples/logo-controlnet.mdx deleted file mode 100644 index ae09aa49..00000000 --- a/examples/logo-controlnet.mdx +++ /dev/null @@ -1,202 +0,0 @@ ---- -title: "ControlNet Generated Logo" -description: "Generate a custom Logo using ControlNet" ---- - -In this tutorial, we will be using ControlNet Canny and SDXL, to alter the images of the HuggingFace logo to make it more appealing to the end user. SDXL is the -Stable Diffusion model released by Stability AI for high-resolution image generation. ControlNet allows you to provide a image and to replace parts of the image keeping the -original outlines of the image. - -To see the final implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/9-logo-controlnet) - -## Basic Setup - -It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Please make sure you have the Cerebrium package installed and have logged in. 
If not, please take a look at our docs [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) - -First, we create our project: - -``` -cerebrium init controlnet-logo -``` - -It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! - -Let us add the following packages to the **[cerebrium.dependencies.pip]** section of our `cerebrium.toml` file: - -```toml -[cerebrium.dependencies.pip] -accelerate = "latest" -transformers = ">=4.35.0" -safetensors = "latest" -opencv-python = "latest" -diffusers = "latest" -``` - -To start, we need to create a **main.py** file which will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. -So let us define our request object. - -```python -from typing import Optional -from pydantic import BaseModel -from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL -from PIL import Image -import torch -import numpy as np -import cv2 -import io -import base64 - -class Item(BaseModel): - prompt: str - image: str - negative_prompt: Optional[str] = None - conditioning_scale: Optional[float] = 0.5 - height: Optional[int] = 512 - width: Optional[int] = 512 - num_inference_steps: Optional[int] = 20 - guidance_scale: Optional[float] = 7.5 - num_images_per_prompt: Optional[int] = 1 -``` - -Above, we import all the various Python libraries we require as well as use Pydantic as our data validation library. Due to the way that we have defined the Base Model, "prompt" and "image" are required parameters and so if they are not present in the request, the user will automatically receive an error message. Everything else is optional. 
- -## Instantiate model - -Below, we load in our ControlNet and SDXL models. This will be downloaded during your deployment, however, in subsequent deploys or inference requests it will be automatically cached in your persistent storage for subsequent use. You can read more about persistent storage [here]() -We do this outside our **predict** function since we only want this code to run on a cold start (ie: on startup). If the container is already warm, we just want it to do inference and it will execute just the **predict** function. - -```python -controlnet = ControlNetModel.from_pretrained( - "diffusers/controlnet-canny-sdxl-1.0", - torch_dtype=torch.float16 -) -vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) -pipe = StableDiffusionXLControlNetPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - controlnet=controlnet, - vae=vae, - torch_dtype=torch.float16, -) -pipe.enable_model_cpu_offload() -pipe = pipe.to("cuda") -``` - -## Predict Function - -Below we simply get the parameters from our request and pass it to the ControlNet model to generate the image(s). You will notice we convert the images to base64, this is so we can return it directly instead of writing the files to an S3 bucket - the return of the predict function needs to be JSON serializable. 
- -```python -def predict(item, run_id, logger): - item = Item(**item) - - init_image = load_image(item.image_url) - image = np.array(init_image) - image = cv2.Canny(image, 100, 200) - image = image[:, :, None] - image = np.concatenate([image, image, image], axis=2) - image = Image.fromarray(image) - - images = pipe( - item.prompt, - negative_prompt=item.negative_prompt, - image=image, - controlnet_conditioning_scale=item.conditioning_scale, - height=item.height, - width=item.width, - num_inference_steps=item.num_inference_steps, - guidance_scale=item.guidance_scale, - num_images_per_prompt=item.num_images_per_prompt - ).images - - finished_images = [] - for image in images: - buffered = io.BytesIO() - image.save(buffered, format="PNG") - finished_images.append(base64.b64encode(buffered.getvalue()).decode("utf-8")) - - return {"images": finished_images} - -``` - -## Deploy - -Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the GPU you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. 
You cerebrium.toml file should look like: - -```toml - -[cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false -disable_animation = false -log_level = "INFO" -disable_confirmation = false - -[cerebrium.deployment] -name = "controlnet-logo" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" - -[cerebrium.hardware] -gpu = "AMPERE_A5000" -cpu = 2 -memory = 16.0 -gpu_count = 1 - -[cerebrium.scaling] -min_replicas = 0 -cooldown = 60 - -[cerebrium.dependencies.apt] -ffmpeg = "latest" - -[cerebrium.dependencies.pip] -accelerate = "latest" -transformers = ">=4.35.0" -safetensors = "latest" -opencv-python = "latest" -diffusers = "latest" - -[cerebrium.dependencies.conda] - -``` - -To deploy the model, use the following command: - -```bash -cerebrium deploy controlnet-logo -``` - -Once deployed, we can make the following request: - -```curl -curl --location --request POST 'https://run.cerebrium.ai/v3/p-xxxxxx/controlnet-logo/predict' \ ---header 'Authorization: public-XXXXXXXXXXXX' \ ---header 'Content-Type: application/json' \ ---data-raw '{ - "prompt":"aerial view, a futuristic research complex in a bright foggy jungle, hard lighting", - "negative_prompt": "low quality, bad quality, sketches", - "image_url": "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" -}' -``` - -We then get the following results: - -```json -{ - "run_id": "xMc1UBBmjZOBCn5iR58idyy4pX59kk34og2THcoxzmQp9HSLLbknhw==", - "message": "Finished inference request with run_id: `xMc1UBBmjZOBCn5iR58idyy4pX59kk34og2THcoxzmQp9HSLLbknhw==`", - "result": { - "images": [ - - ] - }, - "status_code": 200, - "run_time_ms": 9617.486715316772 -} -``` - -Our image then looks like this: - -![ControlNet Logo](/images/examples/controlnet.png) diff --git a/images/examples/comfyui1.png 
b/images/examples/comfyui1.png new file mode 100644 index 00000000..18f1e2b4 Binary files /dev/null and b/images/examples/comfyui1.png differ diff --git a/images/examples/comfyui2.png b/images/examples/comfyui2.png new file mode 100644 index 00000000..53db61e4 Binary files /dev/null and b/images/examples/comfyui2.png differ diff --git a/images/examples/comfyui3.png b/images/examples/comfyui3.png new file mode 100644 index 00000000..775a37ab Binary files /dev/null and b/images/examples/comfyui3.png differ diff --git a/images/examples/comfyui4.png b/images/examples/comfyui4.png new file mode 100644 index 00000000..334d8569 Binary files /dev/null and b/images/examples/comfyui4.png differ diff --git a/images/examples/langchain_langsmith/cal_api_keys.png b/images/examples/langchain_langsmith/cal_api_keys.png new file mode 100644 index 00000000..750b2b25 Binary files /dev/null and b/images/examples/langchain_langsmith/cal_api_keys.png differ diff --git a/images/examples/langchain_langsmith/cerebrium_deploy.png b/images/examples/langchain_langsmith/cerebrium_deploy.png new file mode 100644 index 00000000..22ef8f5b Binary files /dev/null and b/images/examples/langchain_langsmith/cerebrium_deploy.png differ diff --git a/images/examples/langchain_langsmith/cerebrium_secrets.png b/images/examples/langchain_langsmith/cerebrium_secrets.png new file mode 100644 index 00000000..aa7272b8 Binary files /dev/null and b/images/examples/langchain_langsmith/cerebrium_secrets.png differ diff --git a/images/examples/langchain_langsmith/langchain_agent.png b/images/examples/langchain_langsmith/langchain_agent.png new file mode 100644 index 00000000..8782c500 Binary files /dev/null and b/images/examples/langchain_langsmith/langchain_agent.png differ diff --git a/images/examples/langchain_langsmith/langsmith_performance.png b/images/examples/langchain_langsmith/langsmith_performance.png new file mode 100644 index 00000000..5e2043f2 Binary files /dev/null and 
b/images/examples/langchain_langsmith/langsmith_performance.png differ diff --git a/images/examples/langchain_langsmith/langsmith_runs.png b/images/examples/langchain_langsmith/langsmith_runs.png new file mode 100644 index 00000000..80facb8c Binary files /dev/null and b/images/examples/langchain_langsmith/langsmith_runs.png differ diff --git a/images/examples/langchain_langsmith/langsmith_threads.png b/images/examples/langchain_langsmith/langsmith_threads.png new file mode 100644 index 00000000..bf189c3a Binary files /dev/null and b/images/examples/langchain_langsmith/langsmith_threads.png differ diff --git a/images/examples/voice_agent/daily-api-key.png b/images/examples/voice_agent/daily-api-key.png new file mode 100644 index 00000000..e0ad119d Binary files /dev/null and b/images/examples/voice_agent/daily-api-key.png differ diff --git a/images/examples/voice_agent/deployment.png b/images/examples/voice_agent/deployment.png new file mode 100644 index 00000000..a46bbff7 Binary files /dev/null and b/images/examples/voice_agent/deployment.png differ diff --git a/images/githubActions.png b/images/githubActions.png new file mode 100644 index 00000000..5723c216 Binary files /dev/null and b/images/githubActions.png differ diff --git a/images/migrations/replicate-curl.png b/images/migrations/replicate-curl.png new file mode 100644 index 00000000..ce5ae892 Binary files /dev/null and b/images/migrations/replicate-curl.png differ diff --git a/images/migrations/replicate-folder-structure.png b/images/migrations/replicate-folder-structure.png new file mode 100644 index 00000000..28733577 Binary files /dev/null and b/images/migrations/replicate-folder-structure.png differ diff --git a/images/serve/ModifyingServeCode.gif b/images/serve/ModifyingServeCode.gif new file mode 100644 index 00000000..8cc0ec96 Binary files /dev/null and b/images/serve/ModifyingServeCode.gif differ diff --git a/images/serve/ServeCurl.gif b/images/serve/ServeCurl.gif new file mode 100644 index 
00000000..0708111e Binary files /dev/null and b/images/serve/ServeCurl.gif differ diff --git a/inkeep.js b/inkeep.js new file mode 100644 index 00000000..3a05f244 --- /dev/null +++ b/inkeep.js @@ -0,0 +1,113 @@ +// customize +const inkeepSettings = { + baseSettings: { + apiKey: "9ddec4493a80e40a51b3f23cf02c2caca5ada0b4aed2e007", + integrationId: "clzr542ms00041subip8qtf6y", + organizationId: "org_Qtt1DKDCsrdG2UqL", + primaryBrandColor: "#EB3A6F", + }, + aiChatSettings: { + chatSubjectName: "Cerebrium", + botAvatarSrcUrl: + "https://framerusercontent.com/images/iIYnR41hLhNJq7vtreIPiv8K6Eo.png", + getHelpCallToActions: [ + { + name: "Contact Us", + url: "mailto:support@cerebrium.ai", + icon: { + builtIn: "IoChatbubblesOutline", + }, + }, + ], + quickQuestions: [ + "How do I specify which files to include in my deployment?", + "What types of dependencies does Cerebrium support?", + "Where can I store models and files for faster loading?", + "How to migrate from Replicate?", + ], + }, +}; + +// The Mintlify search triggers, which we'll reuse to trigger the Inkeep modal +const searchButtonContainerIds = [ + "search-bar-entry", + "search-bar-entry-mobile", +]; + +// Clone and replace, needed to remove existing event listeners +const clonedSearchButtonContainers = searchButtonContainerIds.map((id) => { + const originalElement = document.getElementById(id); + const clonedElement = originalElement.cloneNode(true); + originalElement.parentNode.replaceChild(clonedElement, originalElement); + + return clonedElement; +}); + +// Load the Inkeep component library +const inkeepScript = document.createElement("script"); +inkeepScript.type = "module"; +inkeepScript.src = "https://unpkg.com/@inkeep/uikit-js@latest/dist/embed.js"; +document.body.appendChild(inkeepScript); + +// Once the Inkeep library is loaded, instantiate the UI components +inkeepScript.addEventListener("load", function () { + // Customization settings + + // for syncing with dark mode + const colorModeSettings = { 
+ observedElement: document.documentElement, + isDarkModeCallback: (el) => { + return el.classList.contains("dark"); + }, + colorModeAttribute: "class", + }; + + // Instantiate the 'Ask AI' pill chat button. Optional. + Inkeep().embed({ + componentType: "ChatButton", + colorModeSync: colorModeSettings, + properties: inkeepSettings, + }); + + // Instantiate the search bar modal + const inkeepSearchModal = Inkeep({ + ...inkeepSettings.baseSettings, + }).embed({ + componentType: "CustomTrigger", + colorModeSync: colorModeSettings, + properties: { + ...inkeepSettings, + isOpen: false, + onClose: () => { + inkeepSearchModal.render({ + isOpen: false, + }); + }, + }, + }); + + // When the Mintlify search bar elements are clicked, open the Inkeep search modal + clonedSearchButtonContainers.forEach((trigger) => { + trigger.addEventListener("click", function () { + inkeepSearchModal.render({ + isOpen: true, + }); + }); + }); + + // Open the Inkeep Modal with cmd+k + window.addEventListener( + "keydown", + (event) => { + if ( + (event.metaKey || event.ctrlKey) && + (event.key === "k" || event.key === "K") + ) { + event.stopPropagation(); + inkeepSearchModal.render({ isOpen: true }); + return false; + } + }, + true, + ); +}); diff --git a/migrations/hugging-face.mdx b/migrations/hugging-face.mdx new file mode 100644 index 00000000..552b2d45 --- /dev/null +++ b/migrations/hugging-face.mdx @@ -0,0 +1,230 @@ +--- +title: "Migrating from Hugging Face" +description: "Deploy a Model from Hugging Face on Cerebrium" +--- + +## Introduction + +This guide provides a detailed walkthrough for migrating from Hugging Face inference endpoints to Cerebrium's serverless infrastructure platform. We'll cover the key differences between the two services, the benefits of migration, and provide step-by-step instructions for setting up and deploying a Llama 3.1 8B model on Cerebrium. 
+ +## Comparing Hugging Face and Cerebrium + +Before diving into the migration process, let's compare the key features and performance metrics of Hugging Face inference endpoints and Cerebrium's serverless infrastructure platform. + +| **Feature** | **Hugging Face** | **Cerebrium** | +| --------------------------------- | ------------------------------------------------------------------- | ---------------------------------------------------------------------- | +| **Pricing** | $0.000278 per second | $0.0004676 per second | +| **Minimum cooldown period** | 15m | 1s | +| **First build time** | 9m25s | 49s | +| **Subsequent build times** | 1m50s - 2m15s | 58s - 1m5s | +| **Response time (From cold)** | 1m45s - 1m48s | 8s - 17s | +| **Response time (From warm)** | 6s | 2s | +| **Co-locating your models** | Requires a separate repository for each inference endpoint and model | Co-locate multiple models from various sources in a single application | +| **Response handling (From cold)** | Throws an error | Waits for infrastructure to become available and returns a response | + +## Benefits of Migrating to Cerebrium + +1. **Faster build times**: Cerebrium significantly reduces build times by up to 95%, especially for subsequent builds (an additional 56% reduction). This can greatly improve iteration speed and reduce the cost of running experiments with complex ML applications. +2. **Flexible cooldown period**: With a minimum cooldown period of just 1 second (compared to Hugging Face's 15 minutes), Cerebrium allows for more efficient resource utilization and cost management. +3. **Improved cold start handling**: When encountering a cold start, Cerebrium waits for the infrastructure to become available instead of throwing an error. This results in a better user experience and fewer failed requests. +4. **Model colocation flexibility**: Cerebrium doesn't require a separate repository for each inference endpoint, simplifying the management of models.
Each function in your application becomes an endpoint automatically, which means that you can run multiple models from the same application to save costs. +5. **Pay-per-use model**: Cerebrium's pricing model ensures you only pay for the compute resources you actually use. This can lead to cost savings, especially for sporadic or low-volume inference needs. +6. **Competitive performance**: Cerebrium only adds up to 50ms of latency to your inference requests. This is why we're able to outperform our competitors in response times from a warm start. In addition, our caching mechanisms and highly optimized orchestration pipelines help your applications start from a cold state in an average of 2-5 seconds. +7. **Customizable infrastructure**: Cerebrium allows for fine-grained control over the infrastructure specifications, enabling you to optimize for your specific use case. + +## Migration process + +Let's walk through the process of migrating a Llama 3.1 8B model from Hugging Face to Cerebrium. We'll cover the entire process from setting up the configuration to deploying and using the model. + +### 1. Cerebrium setup and configuration + +To migrate to Cerebrium, we'll need to set up a few files and configure our environment. Let's go through this step-by-step. + +#### 1.1 Install Cerebrium CLI + +First, install the Cerebrium CLI: + +```bash +pip install cerebrium --upgrade +``` + +#### 1.2 Update your requirements file + +Scaffold your application by running `cerebrium init [PROJECT_NAME]`. During the initialisation, a `cerebrium.toml` file is created. This file configures the deployment, hardware, scaling, and dependencies for your Cerebrium project.
Update your `cerebrium.toml` file to reflect the following: + +```toml +[cerebrium.deployment] +name = "llama-8b-vllm" +python_version = "3.11" +docker_base_image_url = "debian:bookworm-slim" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = [".*"] + +[cerebrium.hardware] +cpu = 2 +memory = 12.0 +compute = "AMPERE_A10" + +[cerebrium.scaling] +min_replicas = 0 +max_replicas = 5 +cooldown = 30 + +[cerebrium.dependencies.pip] +sentencepiece = "latest" +torch = "latest" +transformers = "latest" +accelerate = "latest" +xformers = "latest" +pydantic = "latest" +bitsandbytes = "latest" + +``` + +Let's break down this configuration: + +- `cerebrium.deployment`: Specifies the project name, Python version, base Docker image, and which files to include/exclude as project files. +- `cerebrium.hardware`: Defines the CPU, memory, and GPU requirements for your deployment. +- `cerebrium.scaling`: Configures auto-scaling behavior, including minimum and maximum replicas, and cooldown period. +- `cerebrium.dependencies.pip`: Lists the Python packages required for your project. + +#### 1.3 Update your code + +Next, update your `main.py` file. This is where you'll define your model loading and inference logic.
+ +```python +import torch +from cerebrium import get_secret +from huggingface_hub import login +from pydantic import BaseModel +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig + +# Log into Hugging Face Hub +login(token=get_secret("HF_AUTH_TOKEN")) + +model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct" +cache_directory = "/persistent-storage" + +# Set up tokenizer and model (weights are loaded in 8-bit and cached under cache_directory) +tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_directory) +tokenizer.pad_token_id = 0 +model = AutoModelForCausalLM.from_pretrained( + model_path, + load_in_8bit=True, + torch_dtype=torch.float16, + device_map="auto", + cache_dir=cache_directory, +) + +class Item(BaseModel): + prompt: str + temperature: float + top_p: float + top_k: int + max_tokens: int + frequency_penalty: float + +def run( + prompt, temperature=0.6, top_p=0.9, top_k=0, max_tokens=512, frequency_penalty=1 +): + item = Item( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + max_tokens=max_tokens, + frequency_penalty=frequency_penalty, + ) + + # Tokenize the prompt (inputs longer than 512 tokens are truncated) + inputs = tokenizer( + item.prompt, return_tensors="pt", max_length=512, truncation=True, padding=True + ) + input_ids = inputs["input_ids"].to("cuda") + + # Set up generation config from the validated request; max_new_tokens caps the generated length + generation_config = GenerationConfig( + temperature=item.temperature, + top_p=item.top_p, + top_k=item.top_k, + max_new_tokens=item.max_tokens, + ) + with torch.no_grad(): + outputs = model.generate( + input_ids=input_ids, + generation_config=generation_config, + return_dict_in_generate=True, + output_scores=True, + ) + result = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True) + + return {"result": result} +``` + +This script does the following: + +1. Authenticates with Hugging Face using a secret token. 
Don't forget to add this secret on your Cerebrium dashboard. +2. Loads the Llama 3.1 8B Instruct model with Hugging Face Transformers, quantized to 8-bit. +3. 
Defines an `Item` class to structure and validate (using Pydantic) the input parameters. +4. Implements a `run` function that generates text based on the provided prompt and parameters. + +### 2. Deployment + +To deploy your model to Cerebrium, use the following CLI command in your project directory: + +```bash +cerebrium deploy +``` + +This command will use the configuration in `cerebrium.toml` to set up and deploy your model. + +### 3. Using the Deployed Model + +Once deployed, you can use your model as follows: + +```python +import requests +import json + +url = "https://api.cortex.cerebrium.ai/v4/[PROJECT_NAME]/llama-8b-vllm/run" + +payload = json.dumps({"prompt": "tell me about yourself"}) + +headers = { + 'Authorization': 'Bearer [CEREBRIUM_API_KEY]', + 'Content-Type': 'application/json' +} + +response = requests.request("POST", url, headers=headers, data=payload) + +print(response.text) +``` + +Make sure to replace `[PROJECT_NAME]` in the URL with the value for your own project, and `[CEREBRIUM_API_KEY]` with your Cerebrium API key, which can be found in your dashboard under API keys. This code sends a POST request to your deployed model's endpoint with a prompt, and prints the model's response. + +## Additional Considerations + +When migrating from Hugging Face to Cerebrium, keep the following points in mind: + +1. **API structure**: The Cerebrium implementation uses a different API structure compared to Hugging Face. Make sure to update your client-side code accordingly. +2. **Authentication**: Ensure you have set up the `HF_AUTH_TOKEN` secret in Cerebrium for authenticating with Hugging Face. You can do this through the Cerebrium dashboard. +3. **Model permissions**: The example uses the Llama 3.1 8B Instruct model. Ensure you have the necessary permissions to use this model, as it may require special access. +4. **Hardware optimization**: The `cerebrium.toml` file specifies the hardware requirements. 
You may need to adjust these based on your specific model and performance needs. +5. 
**Dependency management**: Regularly review and update the dependencies listed in `cerebrium.toml` to ensure you're using the latest compatible versions. +6. **Scaling configuration**: The example sets up auto-scaling with 0 to 5 replicas and a 30-second cooldown. Monitor your usage patterns and adjust these parameters as needed to balance performance and cost. +7. **Cold starts**: While Cerebrium handles cold starts more gracefully than Hugging Face, be aware that the first request after a period of inactivity may still take longer to process. Set your cooldown period accordingly, to strike a balance between cost and performance. +8. **Monitoring and logging**: Familiarize yourself with Cerebrium's monitoring and logging capabilities to track your model's performance and usage effectively. +9. **Cost management**: Although Cerebrium's pay-per-use model can be more cost-effective, set up proper monitoring and alerts to avoid unexpected costs, especially if you're running large models or handling high volumes of requests. +10. **Testing**: Thoroughly test your migrated models to ensure they perform as expected on the new platform. Pay special attention to response times, output quality, and error handling. + +## Conclusion + +Migrating from Hugging Face inference endpoints to Cerebrium's serverless infrastructure platform offers numerous benefits, including faster build times, more flexible resource management, and lower costs. While the migration process requires some setup and code changes, the resulting deployment can provide improved performance and scalability for your machine learning models. + +**Remember:** Continuously monitor and optimize your deployment as you use it in production, and don't hesitate to reach out to support or join our Slack and Discord communities if you encounter any issues or have questions during the migration process.
+ +You can read further about some of the functionality Cerebrium has to offer here: + +- [Secrets](../cerebrium/environments/using-secrets) +- [Model scaling](../cerebrium/environments/model-scaling) +- [Keeping models warm](../cerebrium/environments/warm-models) diff --git a/migrations/replicate.mdx b/migrations/replicate.mdx new file mode 100644 index 00000000..9d3b56cf --- /dev/null +++ b/migrations/replicate.mdx @@ -0,0 +1,295 @@ +--- +title: "Migrating from Replicate" +description: "Deploy a Model from Replicate on Cerebrium" +--- + +### Introduction + +In this tutorial, I will show you how you can migrate your workloads from Replicate to Cerebrium in less than 5 minutes! + +As an example, we will be migrating the model SDXL-Lightning-4step from ByteDance. You can find the link to it on Replicate [here](https://replicate.com/bytedance/sdxl-lightning-4step). + +It is best to look at the code in the [GitHub repo](https://github.com/lucataco/cog-sdxl-lightning-4step) and follow along as we migrate it. + +To start, let us create our Cerebrium project: + +```bash +cerebrium init cog-migration-sdxl +``` + +Cerebrium and Replicate are similar in that both are configured through a setup file: **cog.yaml** for Replicate and **cerebrium.toml** for Cerebrium.
+ +Looking at the cog.yaml, we need to add/change the following in our cerebrium.toml: + +```toml +[cerebrium.deployment] +name = "cog-migration-sdxl" +python_version = "3.11" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./example_exclude"] +docker_base_image_url = "nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04" +shell_commands = [ + "curl -o /usr/local/bin/pget -L 'https://github.com/replicate/pget/releases/download/v0.6.2/pget_linux_x86_64' && chmod +x /usr/local/bin/pget" +] + +[cerebrium.hardware] +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" +cpu = 2 +memory = 12.0 +gpu_count = 1 + +[cerebrium.dependencies.pip] +"accelerate" = "latest" +"diffusers" = "latest" +"torch" = "==2.0.1" +"torchvision" = "==0.15.2" +"transformers" = "latest" + +[cerebrium.dependencies.apt] +"curl" = "latest" +``` + +From the above we do the following: + +- Since we need a GPU, we need to use one of the base images from NVIDIA that has the CUDA libraries installed. We use the CUDA 12 image. You can see other images [here](https://docs.cerebrium.ai/cerebrium/environments/custom-images). +- Depending on the type of CPU/GPU you need, you can update the hardware settings to run your app. You can see the full list available [here](https://docs.cerebrium.ai/available-hardware). +- We copy across the pip packages we need to install. +- Replicate uses pget to download model weights - we therefore need to download it to use it. We do this by installing curl and then adding the shell commands in our cerebrium.toml. + +Great, now our setup is the same in terms of our hardware and environment. + +Now the cog.yaml usually indicates the file that the endpoint calls - in this case, **predict.py** - so let us inspect that file. + +Cerebrium has a similar notion in that the main file that is called on our side is **main.py**. + +To start, I copy across all import statements and constant variables that have nothing to do with Replicate/Cog.
In this case: + +```python +import os +import time +import torch +import subprocess +import numpy as np +from typing import List +from transformers import CLIPImageProcessor +from diffusers import ( + StableDiffusionXLPipeline, + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + PNDMScheduler, + KDPM2AncestralDiscreteScheduler, +) +from diffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) + +UNET = "sdxl_lightning_4step_unet.pth" +MODEL_BASE = "stabilityai/stable-diffusion-xl-base-1.0" +UNET_CACHE = "unet-cache" +BASE_CACHE = "checkpoints" +SAFETY_CACHE = "safety-cache" +FEATURE_EXTRACTOR = "feature-extractor" +MODEL_URL = "https://weights.replicate.delivery/default/sdxl-lightning/sdxl-1.0-base-lightning.tar" +SAFETY_URL = "https://weights.replicate.delivery/default/sdxl/safety-1.0.tar" +UNET_URL = "https://weights.replicate.delivery/default/comfy-ui/unet/sdxl_lightning_4step_unet.pth.tar" + +class KarrasDPM: + def from_config(config, **kwargs):  # extra options (e.g. timestep_spacing) are forwarded to the scheduler + return DPMSolverMultistepScheduler.from_config(config, use_karras_sigmas=True, **kwargs) + + +SCHEDULERS = { + "DDIM": DDIMScheduler, + "DPMSolverMultistep": DPMSolverMultistepScheduler, + "HeunDiscrete": HeunDiscreteScheduler, + "KarrasDPM": KarrasDPM, + "K_EULER_ANCESTRAL": EulerAncestralDiscreteScheduler, + "K_EULER": EulerDiscreteScheduler, + "PNDM": PNDMScheduler, + "DPM++2MSDE": KDPM2AncestralDiscreteScheduler, +} +``` + +Replicate makes use of classes for their syntax which we shy away from - we run whatever Python code you give us and make each function an endpoint. Therefore, when you see a reference to **self.** remove it throughout the code. + +There is a folder in the repo called "feature-extractor" which we need to have in our repository.
We could git clone the repo; however, it's quite small, so I would just copy the contents of the folder and put it in your Cerebrium project, i.e.: + +![Folder Structure](/images/migrations/replicate-folder-structure.png) + +The setup function on Replicate runs on each cold start (i.e., each new instantiation of the app) and so we just define it as normal code that gets run at the top of our file. I put it right below my import statements above. + +```python +def download_weights(url, dest): + # Download and extract (pget -x) the archive at `url` into `dest`, printing how long it took + start = time.time() + print("downloading url: ", url) + print("downloading to: ", dest) + subprocess.check_call(["pget", "-x", url, dest], close_fds=False) + print("downloading took: ", time.time() - start) + +# Load the model into memory to make running multiple predictions efficient +start = time.time() +print("Loading safety checker...") +if not os.path.exists(SAFETY_CACHE): + download_weights(SAFETY_URL, SAFETY_CACHE) +print("Loading model") +if not os.path.exists(BASE_CACHE): + download_weights(MODEL_URL, BASE_CACHE) +print("Loading Unet") +if not os.path.exists(UNET_CACHE): + download_weights(UNET_URL, UNET_CACHE) +safety_checker = StableDiffusionSafetyChecker.from_pretrained( + SAFETY_CACHE, torch_dtype=torch.float16 +).to("cuda") +feature_extractor = CLIPImageProcessor.from_pretrained(FEATURE_EXTRACTOR) +print("Loading txt2img pipeline...") +pipe = StableDiffusionXLPipeline.from_pretrained( + MODEL_BASE, + torch_dtype=torch.float16, + variant="fp16", + cache_dir=BASE_CACHE, + local_files_only=True, +).to("cuda") +unet_path = os.path.join(UNET_CACHE, UNET) +pipe.unet.load_state_dict(torch.load(unet_path, map_location="cuda")) +print("setup took: ", time.time() - start) +``` + +The code above downloads the model weights if they don't exist and then instantiates the models. To persist files/data on Cerebrium, you need to store it on the path **/persistent-storage**.
So we can update the following paths above: + +```python +UNET_CACHE = "/persistent-storage/unet-cache" +BASE_CACHE = "/persistent-storage/checkpoints" +SAFETY_CACHE = "/persistent-storage/safety-cache" +``` + +We can then copy the two other functions, run_safety_checker() and predict(). In Cerebrium, the parameters of a function are the JSON data it expects when you make a request to it. We can then define them as follows: + +```python +def run_safety_checker(image): + safety_checker_input = feature_extractor(image, return_tensors="pt").to( + "cuda" + ) + np_image = [np.array(val) for val in image] + image, has_nsfw_concept = safety_checker( + images=np_image, + clip_input=safety_checker_input.pixel_values.to(torch.float16), + ) + return image, has_nsfw_concept + +def predict( + prompt: str = "A superhero smiling", + negative_prompt: str = "worst quality, low quality", + width: int = 1024, + height: int = 1024, + num_outputs: int = 1, + scheduler: str = "K_EULER", + num_inference_steps: int = 4, + guidance_scale: float = 0, + seed: int = None, + disable_safety_checker: bool = False, +): + """Run a single prediction on the model""" + global pipe + if seed is None: + seed = int.from_bytes(os.urandom(4), "big") + print(f"Using seed: {seed}") + generator = torch.Generator("cuda").manual_seed(seed) + + # OOMs can leave vae in bad state + if pipe.vae.dtype == torch.float32: + pipe.vae.to(dtype=torch.float16) + + sdxl_kwargs = {} + print(f"Prompt: {prompt}") + sdxl_kwargs["width"] = width + sdxl_kwargs["height"] = height + + pipe.scheduler = SCHEDULERS[scheduler].from_config( + pipe.scheduler.config, timestep_spacing="trailing" + ) + + common_args = { + "prompt": [prompt] * num_outputs, + "negative_prompt": [negative_prompt] * num_outputs, + "guidance_scale": guidance_scale, + "generator": generator, + "num_inference_steps": num_inference_steps, + } + + output = pipe(**common_args, **sdxl_kwargs) + + if not disable_safety_checker: + _, has_nsfw_content = 
run_safety_checker(output.images) + + + output_paths = [] + for i, image in enumerate(output.images): + if not disable_safety_checker: + if has_nsfw_content[i]: + print(f"NSFW content detected in image {i}") + continue + output_path = f"/tmp/out-{i}.png" + image.save(output_path) + output_paths.append(output_path) + + if len(output_paths) == 0: + raise Exception( + "NSFW content detected. Try running it again, or try a different prompt." + ) + + return output_paths +``` + +The above returns the paths of the generated images, but we would like to return them as base64-encoded images so that users can render them instantly. You are welcome to upload the images to a storage bucket to reference directly - it's up to you. + +```python +from io import BytesIO +import base64 + +encoded_images = [] + for i, image in enumerate(output.images): + if not disable_safety_checker: + if has_nsfw_content[i]: + print(f"NSFW content detected in image {i}") + continue + buffered = BytesIO() + image.save(buffered, format="PNG") + img_b64 = base64.b64encode(buffered.getvalue()).decode("utf-8") + encoded_images.append(img_b64) + + if len(encoded_images) == 0: + raise Exception( + "NSFW content detected. Try running it again, or try a different prompt." + ) + + return encoded_images +``` + +Now we can run `cerebrium deploy`. You should see your app build in under 90 seconds. + +It should output the curl statement to run your app: + +![Curl Request](/images/migrations/replicate-curl.png) + +Make sure to replace the end of the URL with **/predict** (since that is the function we are calling) and send it the required JSON data. This is our result: + +```json +{ + "run_id": "c6797f2e-333a-9e89-bafa-4dd0f4fbe22a", + "result": ["iVBORw0KGgoAAAANSUhEUgAABAAAAAQACAIAAADwf7zUAA...."], + "run_time_ms": 43623.4176158905 +} +``` + +You should be all ready to go!
+ +You can read further about some of the functionality Cerebrium has to offer + +- [Secrets](../cerebrium/environments/using-secrets) +- [Model scaling](../cerebrium/environments/model-scaling) +- [Keeping models warm](../cerebrium/environments/warm-models) diff --git a/mint.json b/mint.json index cef579a8..8b47534b 100644 --- a/mint.json +++ b/mint.json @@ -4,6 +4,7 @@ "light": "/logo/light.svg", "dark": "/logo/dark.svg" }, + "versions": ["v4"], "favicon": "/favicon.png", "colors": { "primary": "#EB3A6F", @@ -29,7 +30,12 @@ "tabs": [ { "name": "Examples", - "url": "examples" + "url": "v4/examples", + "version": "v4" + }, + { + "name": "Migrations", + "url": "migrations" } ], "anchors": [ @@ -72,10 +78,12 @@ "group": "Environment Setup", "pages": [ "cerebrium/environments/initial-setup", + "cerebrium/environments/config-files", "cerebrium/environments/custom-images", "cerebrium/environments/using-secrets", "cerebrium/environments/multi-gpu-inferencing", - "cerebrium/environments/warm-models" + "cerebrium/environments/warm-models", + "cerebrium/environments/custom-runtime" ] }, { @@ -84,14 +92,19 @@ }, { "group": "Deployments", + "pages": ["cerebrium/deployments/ci-cd"] + }, + { + "group": "Endpoints", "pages": [ - "cerebrium/deployments/long-running-tasks", - "cerebrium/deployments/async-functions" + "cerebrium/endpoints/rest-api", + "cerebrium/endpoints/streaming", + "cerebrium/endpoints/openai-compatible-endpoints" ] }, { - "group": "Endpoints", - "pages": ["cerebrium/endpoints/rest-api", "cerebrium/endpoints/streaming"] + "group": "Integrations", + "pages": ["cerebrium/integrations/vercel"] }, { "group": "Misc", @@ -101,10 +114,6 @@ "group": "Prebuilt Models", "pages": ["cerebrium/prebuilt-models/introduction"] }, - { - "group": "Legacy Docs", - "pages": ["cerebrium/environments/legacy-yaml-config"] - }, { "group": "FAQs and Tips", "pages": [ @@ -117,14 +126,22 @@ }, { "group": "Examples", + "version": "v4", "pages": [ - "examples/sdxl", - 
"examples/logo-controlnet", - "examples/mistral-vllm", - "examples/streaming-falcon-7B", - "examples/langchain", - "examples/transcribe-whisper" + "v4/examples/tensorRT", + "v4/examples/comfyUI", + "v4/examples/langchain-langsmith", + "v4/examples/sdxl", + "v4/examples/mistral-vllm", + "v4/examples/streaming-falcon-7B", + "v4/examples/transcribe-whisper", + "v4/examples/realtime-voice-agents", + "v4/examples/openai-compatible-endpoint-vllm" ] + }, + { + "group": "Migrations", + "pages": ["migrations/replicate", "migrations/hugging-face"] } ], "analytics": { diff --git a/package-lock.json b/package-lock.json index 68bc6c18..74392f8a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,335 +1,6 @@ { + "name": "documentation", + "lockfileVersion": 3, "requires": true, - "lockfileVersion": 1, - "dependencies": { - "base64-js": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", - "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==" - }, - "bl": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", - "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", - "requires": { - "buffer": "^5.5.0", - "inherits": "^2.0.4", - "readable-stream": "^3.4.0" - } - }, - "buffer": { - "version": "5.7.1", - "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", - "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", - "requires": { - "base64-js": "^1.3.1", - "ieee754": "^1.1.13" - } - }, - "chownr": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", - "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==" - }, - "color": { - "version": "4.2.3", - "resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz", - 
"integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==", - "requires": { - "color-convert": "^2.0.1", - "color-string": "^1.9.0" - } - }, - "color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "requires": { - "color-name": "~1.1.4" - } - }, - "color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==" - }, - "color-string": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", - "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", - "requires": { - "color-name": "^1.0.0", - "simple-swizzle": "^0.2.2" - } - }, - "decompress-response": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", - "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==", - "requires": { - "mimic-response": "^3.1.0" - } - }, - "deep-extend": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", - "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==" - }, - "detect-libc": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.1.tgz", - "integrity": "sha512-463v3ZeIrcWtdgIg6vI6XUncguvr2TnGl4SzDXinkt9mSLpBJKXT3mW6xT3VQdDN11+WVs29pgvivTc4Lp8v+w==" - }, - "end-of-stream": { - "version": "1.4.4", - "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", - "integrity": 
"sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", - "requires": { - "once": "^1.4.0" - } - }, - "expand-template": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", - "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==" - }, - "fs-constants": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", - "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==" - }, - "github-from-package": { - "version": "0.0.0", - "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", - "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==" - }, - "ieee754": { - "version": "1.2.1", - "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", - "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==" - }, - "inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" - }, - "ini": { - "version": "1.3.8", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", - "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==" - }, - "is-arrayish": { - "version": "0.3.2", - "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", - "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==" - }, - "lru-cache": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", - "integrity": 
"sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", - "requires": { - "yallist": "^4.0.0" - } - }, - "mimic-response": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", - "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==" - }, - "minimist": { - "version": "1.2.7", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.7.tgz", - "integrity": "sha512-bzfL1YUZsP41gmu/qjrEk0Q6i2ix/cVeAhbCbqH9u3zYutS1cLg00qhrD0M2MVdCcx4Sc0UpP2eBWo9rotpq6g==" - }, - "mkdirp-classic": { - "version": "0.5.3", - "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", - "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==" - }, - "napi-build-utils": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz", - "integrity": "sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==" - }, - "node-abi": { - "version": "3.28.0", - "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.28.0.tgz", - "integrity": "sha512-fRlDb4I0eLcQeUvGq7IY3xHrSb0c9ummdvDSYWfT9+LKP+3jCKw/tKoqaM7r1BAoiAC6GtwyjaGnOz6B3OtF+A==", - "requires": { - "semver": "^7.3.5" - } - }, - "node-addon-api": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-5.0.0.tgz", - "integrity": "sha512-CvkDw2OEnme7ybCykJpVcKH+uAOLV2qLqiyla128dN9TkEWfrYmxG6C2boDe5KcNQqZF3orkqzGgOMvZ/JNekA==" - }, - "once": { - "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", - "requires": { - "wrappy": "1" - } - }, - "prebuild-install": { - "version": "7.1.1", - "resolved": 
"https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.1.tgz", - "integrity": "sha512-jAXscXWMcCK8GgCoHOfIr0ODh5ai8mj63L2nWrjuAgXE6tDyYGnx4/8o/rCgU+B4JSyZBKbeZqzhtwtC3ovxjw==", - "requires": { - "detect-libc": "^2.0.0", - "expand-template": "^2.0.3", - "github-from-package": "0.0.0", - "minimist": "^1.2.3", - "mkdirp-classic": "^0.5.3", - "napi-build-utils": "^1.0.1", - "node-abi": "^3.3.0", - "pump": "^3.0.0", - "rc": "^1.2.7", - "simple-get": "^4.0.0", - "tar-fs": "^2.0.0", - "tunnel-agent": "^0.6.0" - } - }, - "pump": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", - "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", - "requires": { - "end-of-stream": "^1.1.0", - "once": "^1.3.1" - } - }, - "rc": { - "version": "1.2.8", - "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", - "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", - "requires": { - "deep-extend": "^0.6.0", - "ini": "~1.3.0", - "minimist": "^1.2.0", - "strip-json-comments": "~2.0.1" - } - }, - "readable-stream": { - "version": "3.6.0", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.0.tgz", - "integrity": "sha512-BViHy7LKeTz4oNnkcLJ+lVSL6vpiFeX6/d3oSH8zCW7UxP2onchk+vTGB143xuFjHS3deTgkKoXXymXqymiIdA==", - "requires": { - "inherits": "^2.0.3", - "string_decoder": "^1.1.1", - "util-deprecate": "^1.0.1" - } - }, - "safe-buffer": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", - "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==" - }, - "semver": { - "version": "7.3.8", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.8.tgz", - "integrity": "sha512-NB1ctGL5rlHrPJtFDVIVzTyQylMLu9N9VICA6HSFJo8MCGVTMW6gfpicwKmmK/dAjTOrqu5l63JJOpDSrAis3A==", - "requires": { - "lru-cache": 
"^6.0.0" - } - }, - "sharp": { - "version": "0.31.2", - "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.31.2.tgz", - "integrity": "sha512-DUdNVEXgS5A97cTagSLIIp8dUZ/lZtk78iNVZgHdHbx1qnQR7JAHY0BnXnwwH39Iw+VKhO08CTYhIg0p98vQ5Q==", - "requires": { - "color": "^4.2.3", - "detect-libc": "^2.0.1", - "node-addon-api": "^5.0.0", - "prebuild-install": "^7.1.1", - "semver": "^7.3.8", - "simple-get": "^4.0.1", - "tar-fs": "^2.1.1", - "tunnel-agent": "^0.6.0" - } - }, - "simple-concat": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", - "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==" - }, - "simple-get": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz", - "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==", - "requires": { - "decompress-response": "^6.0.0", - "once": "^1.3.1", - "simple-concat": "^1.0.0" - } - }, - "simple-swizzle": { - "version": "0.2.2", - "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", - "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", - "requires": { - "is-arrayish": "^0.3.1" - } - }, - "string_decoder": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", - "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "requires": { - "safe-buffer": "~5.2.0" - } - }, - "strip-json-comments": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", - "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==" - }, - "tar-fs": { - "version": "2.1.1", - "resolved": 
"https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz", - "integrity": "sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==", - "requires": { - "chownr": "^1.1.1", - "mkdirp-classic": "^0.5.2", - "pump": "^3.0.0", - "tar-stream": "^2.1.4" - } - }, - "tar-stream": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", - "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", - "requires": { - "bl": "^4.0.3", - "end-of-stream": "^1.4.1", - "fs-constants": "^1.0.0", - "inherits": "^2.0.3", - "readable-stream": "^3.1.1" - } - }, - "tunnel-agent": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", - "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", - "requires": { - "safe-buffer": "^5.0.1" - } - }, - "util-deprecate": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==" - }, - "wrappy": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" - }, - "yallist": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", - "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==" - } - } + "packages": {} } diff --git a/package.json b/package.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/package.json @@ -0,0 +1 @@ +{} diff --git a/security.mdx b/security.mdx index 099cf449..ef3fa7ab 100644 --- a/security.mdx +++ b/security.mdx @@ -1,13 +1,13 @@ --- -title: "Security" +title: "Security & 
Data Privacy" description: "Cerebrium follows security best practices" --- -Cerebrium is GDPR and and SOC 2 Type I compliant which means we enforce certain security standards and protocols. Our compliance is continually monitored through Vanta. Please reach out to security@cerebrium.ai if you would like more information regarding our security compliance and implementations. +Cerebrium is SOC 2 Type I and HIPAA compliant. That means we enforce certain security standards and protocols. Our compliance is continually monitored through Vanta and a dedicated team. Please reach out to security@cerebrium.ai if you would like more information regarding our security compliance and implementations. ## Infrastructure Security -- Cerebrium frequently performs vulnerabilities scans and these vulnerabilities are remediated based on the time frame set out in our incident response plan. +- Cerebrium frequently performs vulnerability scans, and these vulnerabilities are remediated based on the time frame set out in our incident response plan. - Cerebrium conducts annual business continuity and security incident exercises. This is a requirement to remain SOC 2 compliant. - Cerebrium has daily database backups enabled. - Employee Computers are frequently monitored via the Vanta agent. @@ -23,20 +23,80 @@ Cerebrium is GDPR and and SOC 2 Type I compliant which means we enforce certain ## Product Security -- Cerebrium frequently performs vulnerabilities scans and these vulnerabilities are remediated based on the time frame set out in our incident response plan. +- Cerebrium frequently performs vulnerability scans, and these vulnerabilities are remediated based on the time frame set out in our incident response plan. - Cerebrium conducts annual business continuity and security incident exercises. This is a requirement to remain SOC 2 compliant. - Cerebrium enforces HTTPS for all services using TLS (SSL), including our Cerebrium Dashboard and our Cerebrium Python package. 
- Cerebrium maintains access logs across all its infrastructure services. -- Software dependencies are audited by Github’s Dependabot. +- Software dependencies are audited by GitHub’s Dependabot. - User data is encrypted at rest. ## Internal Security Procedures -- Cerebrium frequently performs vulnerabilities scans and these vulnerabilities are remediated based on the time frame set out in our incident response plan. +- Cerebrium frequently performs vulnerability scans, and these vulnerabilities are remediated based on the time frame set out in our incident response plan. - Cerebrium regularly audits employee access to internal systems. -- Cerebrium conduct annual business continuity and security incident exercises. This is a requirement to remain SOC 2 compliant. +- Cerebrium conducts annual business continuity and security incident exercises. This is a requirement to remain SOC 2 compliant. ## Data and Privacy -- Cerebrium deletes customer data upon request -- User data is encrypted at rest. +- Cerebrium does not use any customer data to train machine learning models or anything of a similar nature. +- For customers on our Hobby and Standard plan, we automatically delete request/log data after 7 and 30 days respectively. +- Cerebrium deletes customer data upon request and we have a purge request endpoint where you can request us to delete data sooner. +- All user data is encrypted at rest. + +## HIPAA Compliance + +Cerebrium is committed to supporting our customers' HIPAA compliance needs + +As a business associate to covered entities in the healthcare sector, Cerebrium has implemented robust measures to support HIPAA compliance: + +### Business Associate Agreements (BAA) + +- Cerebrium offers a standardized BAA to all customers who require HIPAA compliance. +- Our BAA clearly outlines the responsibilities and obligations of both parties in protecting Protected Health Information (PHI). 
+- Customers can initiate the BAA process by contacting compliance@cerebrium.ai. + +### PHI Handling and Storage + +- Cerebrium's infrastructure is designed to handle PHI securely, with encryption at rest and in transit. +- We do not access, use, or disclose PHI unless explicitly required for providing our services. +- Customers are responsible for de-identifying PHI before transmission to Cerebrium's systems, if de-identification is required for their use case. + +### Access Controls + +- Strict access controls are in place to ensure that only authorized personnel can access systems that may contain PHI. +- Role-based access controls are used to limit access to PHI based on job responsibilities and the principle of least privilege. + +### Audit Logging + +- Comprehensive audit logs are maintained for all activities that could potentially involve PHI. +- These logs are available to support customers' accounting of disclosures requirements. + +### Breach Notification + +- Cerebrium has a robust incident response plan that includes HIPAA-compliant breach notification procedures. +- Any potential breaches involving PHI are promptly investigated and reported to affected customers within required timeframes. + +### Employee Training + +- All Cerebrium employees undergo HIPAA awareness training as part of their onboarding process. +- Regular refresher training is conducted to ensure ongoing HIPAA compliance. + +### Risk Assessments + +- Cerebrium conducts regular risk assessments to identify and address potential vulnerabilities in our handling of PHI. +- These assessments are part of our ongoing commitment to maintaining a secure environment for our customers' sensitive data. + +### Subcontractors + +- Any subcontractors who may have access to PHI are required to sign a BAA and comply with the same HIPAA requirements as Cerebrium. + +### Data Retention and Destruction + +- Cerebrium adheres to HIPAA-compliant data retention policies. 
+- Secure data destruction processes are in place for when PHI needs to be deleted or when a customer relationship ends. + +### Compliance Monitoring + +- Our HIPAA compliance measures are continuously monitored and updated to align with any changes in regulations or best practices. + +For more detailed information about our HIPAA compliance measures or to discuss specific compliance needs, please contact our compliance team at compliance@cerebrium.ai. diff --git a/v4/examples/aiVoiceAgents.mdx b/v4/examples/aiVoiceAgents.mdx new file mode 100644 index 00000000..708a9d83 --- /dev/null +++ b/v4/examples/aiVoiceAgents.mdx @@ -0,0 +1,360 @@ +--- +title: "Real-time Voice Agent" +description: "Deploy a real-time AI voice agent" +--- + +In this tutorial, I'm going to create a real-time voice agent that can respond to any query via speech, in speech, in ~500ms. This is an extremely flexible implementation where you can swap in any LLM or Text-to-speech (TTS) model of your liking. This is extremely useful for use cases involving voice such as customer support bots and receptionists. + +To create this app, we use the [PipeCat](https://www.pipecat.ai/) framework that takes care of stringing together all the components and it handles some of the functionality we +might need such as user interruptions, dealing with audio data etc. We will show this functionality by joining a meeting room with our voice agent using [Daily](https://daily.co) (the creators of Pipecat) and will deploy this app on Cerebrium to show how it handles deploying and scaling our app seamlessly. 
+ +You can find the final version of the code [here](TODO) + +### Cerebrium setup + +If you don’t have a Cerebrium account, you can create one by signing up [here](https://dashboard.cerebrium.ai/register) and following the documentation [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) to get set up. + +In your IDE, run the following command to create our Cerebrium starter project: `cerebrium init agent-tool-calling`. This creates two files: + +- **main.py** - Our entrypoint file where our code lives +- **cerebrium.toml** - A configuration file that contains all our build and environment settings + + + Add the following pip packages near the bottom of your cerebrium.toml. These will be used in creating our deployment environment. + +``` +[cerebrium.deployment] +docker_base_image_url = "registry.cerebrium.ai/daily:latest" + +[cerebrium.dependencies.pip] +torch = ">=2.0.0" +"pipecat-ai[silero, daily, openai, deepgram]" = "latest" +aiohttp = "latest" +torchaudio = "latest" +vllm = "latest" +huggingface_hub = "latest" +``` + +You will also see we specify a Docker base image above. The reason for this is Daily has supplied a Docker image that contains local [Deepgram](https://deepgram.com/) Speech-to-Text (STT) and Text-to-Speech (TTS) models. +This helps us achieve our low latency since everything is running locally and not going over the network. + + + Docker files are not supported yet, but support is in the works and will be released + soon. This is just a very early preview of how it would work. + + +In this example, we will be using Llama 3 8B as our LLM and serving it via vLLM. To use Llama 3, we need to be authenticated via Hugging Face. + +To authenticate ourselves, we need to go to Hugging Face and accept the model permissions for [Llama 3 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) if we haven’t already. It takes about 30 minutes or less for them to accept your request. 
+ +In your Cerebrium dashboard, you can add your Hugging Face token as a secret by navigating to “Secrets” in the sidebar. For the sake of this tutorial, I called mine “HF_TOKEN”. We can now access these values in our code at runtime without exposing them in our code. + +You can then add the following code to your main.py: + +```python +from huggingface_hub import login +import subprocess + +login(token=get_secret('HF_TOKEN')) +# Run vllM Server in background process +def start_server(): + while True: + process = subprocess.Popen(f"python -m vllm.entrypoints.openai.api_server --port 5000 --model meta-llama/Meta-Llama-3-8B-Instruct --dtype bfloat16 --api-key {get_secret('HF_TOKEN')} --download-dir /persistent-storage/", shell=True) + process.wait() # Wait for the process to complete + logger.error("Server process ended unexpectedly. Restarting in 5 seconds...") + time.sleep(5) # Wait before restarting + +# Start the server in a separate process +server_process = Process(target=start_server) +server_process.start() +``` + +Pipecat doesn't currently support locally instantiated models and requires them to follow the OpenAI compatible format. Therefore, we run the vLLM server locally on our instance in a background process. +We monitor the background process to make sure it launched successfully since there seems to be a bug with rapidly starting multiple vLLM instances. If it doesn't launch correctly, we wait 5 seconds before trying again. +Note that we are running the vLLM server on port 5000 (8000 is automatically used by Cerebrium) and we set the download directory of the model so that subsequent cold starts can be much quicker. + +Now we implement the Pipecat framework by instantiating the various components. 
Create a function called "main" with the following code: + +```python +import aiohttp +import os +import sys +import subprocess +import time +import requests +import asyncio +from multiprocessing import Process +from loguru import logger + +from pipecat.vad.vad_analyzer import VADParams +from pipecat.vad.silero import SileroVADAnalyzer +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.services.openai import OpenAILLMService +from pipecat.services.deepgram import DeepgramSTTService +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.pipeline import Pipeline +from pipecat.frames.frames import LLMMessagesFrame, EndFrame + +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, LLMUserResponseAggregator +) + +from helpers import ( + ClearableDeepgramTTSService, + AudioVolumeTimer, + TranscriptionTimingLogger +) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + +deepgram_voice: str = "aura-asteria-en" + +async def main(room_url: str, token: str = None): + + async with aiohttp.ClientSession() as session: + transport = DailyTransport( + room_url, + token if token else get_secret("DAILY_TOKEN"), + "Respond bots", + DailyParams( + audio_out_enabled=True, + transcription_enabled=False, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams( + stop_secs=0.2 + )), + vad_audio_passthrough=True + ) + ) + + stt = DeepgramSTTService( + name="STT", + api_key=None, + url='ws://127.0.0.1:8082/v1/listen' + ) + + tts = ClearableDeepgramTTSService( + name="Voice", + aiohttp_session=session, + api_key=None, + voice=deepgram_voice, + base_url="http://127.0.0.1:8082/v1/speak" + ) + + llm = OpenAILLMService( + name="LLM", + api_key=get_secret("HF_TOKEN"), + model="casperhansen/llama-3-8b-instruct-awq", + base_url="http://0.0.0.0:5000/v1" + ) + + messages = [ + { + "role": "system", + "content": "You are a 
helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + avt = AudioVolumeTimer() + tl = TranscriptionTimingLogger(avt) + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline([ + transport.input(), # Transport user input + avt, # Audio volume timer + stt, # Speech-to-text + tl, # Transcription timing logger + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ]) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + report_only_initial_ttfb=True + )) +``` + +First, in our main function, we initialize the Daily transport layer to receive/send the audio/video data from the Daily room we will connect to. You can see we pass the room_url +we would like to join as well as a token to authenticate us programmatically joining. We also set our VAD stop seconds, which is the amount of time we wait for a pause before our bot will respond - in this +example, we set it to 200 milliseconds. + +Next, we connect to our locally running Deepgram models that come as part of the Docker base image we specified in our cerebrium.toml - these are running on port 8082. This is where the Pipecat framework helps convert audio data to text +and vice versa. We then follow the same pattern to connect our locally running LLM model from our vLLM server. Please make sure to set the Deepgram API key to **None** to work with the locally running instance, otherwise it will +not work. + +Lastly, we then put this all together as a PipelineTask which is what Pipecat runs all together. The makeup of a task is completely customisable and has support for Image and Vision use cases. 
You can read more [here](https://docs.pipecat.ai/docs/category/services). +Pipeline tasks come with a structure and parameters that make it easy to handle interruptions out the box, and we are able to swap models to our preference only changing a few lines of code. + +The Daily Python SDK comes with a lot of event webhooks where you can trigger functionality based on events occurring. So let us handle how our bot handles certain events such as a user leaving/joining a call. +Continue to add the following code to the main() function. + +```python +# When the first participant joins, the bot should introduce itself. +@transport.event_handler("on_first_participant_joined") +async def on_first_participant_joined(transport, participant): + # Kick off the conversation. + messages.append( + {"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frame(LLMMessagesFrame(messages)) + +# When a participant joins, start transcription for that participant so the +# bot can "hear" and respond to them. +@transport.event_handler("on_participant_joined") +async def on_participant_joined(transport, participant): + transport.capture_participant_transcription(participant["id"]) + +# When the participant leaves, we exit the bot. +@transport.event_handler("on_participant_left") +async def on_participant_left(transport, participant, reason): + await task.queue_frame(EndFrame()) + +# If the call is ended make sure we quit as well. +@transport.event_handler("on_call_state_updated") +async def on_call_state_updated(transport, state): + if state == "left": + await task.queue_frame(EndFrame()) + +runner = PipelineRunner() + +await runner.run(task) +await session.close() +``` + +Above, we handle the following events: + +- When the first participant joins, we get the bot to introduce itself to the user. We do this by adding a message to the conversation. +- We add support for multiple participants to join and listen/respond to the bot. 
+- When a participant leaves or the call is ended, we get the bot to terminate itself. + +From the code above, you will see the events are attached to Transport, which is the method of communication - in this case the meeting room. We then pass in our defined Pipeline task +to our pipeline runner, which executes indefinitely until we signal it to exit, which in this case happens when a call ends. If you want to read further about the Pipecat infrastructure +you can read more [here](https://docs.pipecat.ai/docs/understanding-bots/dailyai-architecture) + +The above code needs to be run in a separate execution environment, so that Pipecat does not instantiate multiple instances. To do this, we need to run the above code as a +background process. This will be the entry point of our REST API endpoint to start the Pipecat bot. Once the Pipecat bot has returned (i.e. the call has ended) then we will return a response to our API endpoint. +We therefore create the following function: + + + Currently, Cerebrium does not support workloads running longer than 5 minutes; + however, it is currently being worked on internally and will be released soon. + This means that conversations are limited to a 5 minute window. If this is an + issue and you have an urgent use case, please reach out to + [support](mailto:support@cerebrium.ai). + + +```python +def start_bot(room_url: str, token: str = None): + + def target(): + asyncio.run(main(room_url, token)) + + check_model_status() + process = Process(target=target) + process.start() + process.join() # Wait for the process to complete + return {"message": "session finished"} +``` + +That's it! You now have a fully functioning AI bot that can interact with a user through speech in ~500ms. Imagine the possibilities! + +Let us now create a user facing UI for you to interface with this bot. + +## Creating Meeting Room + +Cerebrium doesn't only have to be used to run AI heavy workloads, it can run any Python code. 
Therefore, we define two functions for our demo that will create a room to join programmatically +and a temporary token, both of which will only be usable for 5 minutes. To implement this, we use the Daily REST API. + +We need to get our Daily developer token from our profile. If, you don't have an account you can sign up for one [here](https://dashboard.daily.co/u/signup) (they have a generous free tier). +You can then go to the "developers" tab to fetch your API key - add this to your Cerebrium Secrets. + +![Daily API Key](/images/examples/voice_agent/daily-api-key.png) + +Below we create a room that only lasts 5 minutes and a temporary token to access it + +```python +def create_room(): + url = "https://api.daily.co/v1/rooms/" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {get_secret('DAILY_TOKEN')}" + } + data = { + "properties": { + "exp": int(time.time()) + 60*5 ##5 mins + } + } + + response = requests.post(url, headers=headers, json=data) + if response.status_code == 200: + room_info = response.json() + token = create_token(room_info['name']) + if token and 'token' in token: + room_info['token'] = token['token'] + else: + logger.error("Failed to create token") + return {"message": 'There was an error creating your room', "status_code": 500} + return room_info + else: + logger.error(f"Failed to create room: {response.status_code}") + return {"message": 'There was an error creating your room', "status_code": 500} + +def create_token(room_name: str): + url = "https://api.daily.co/v1/meeting-tokens" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {get_secret('DAILY_TOKEN')}" + } + data = { + "properties": { + "room_name": room_name + } + } + + response = requests.post(url, headers=headers, json=data) + if response.status_code == 200: + token_info = response.json() + return token_info + else: + logger.error(f"Failed to create token: {response.status_code}") + return None +``` + +### Deploy to 
Cerebrium + +To deploy this app to Cerebrium you can simply run the command `cerebrium deploy` in your terminal. + +If it deployed successfully, you should see something like this: + +![Cerebrium Deployment](/images/examples/langchain_langsmith/cerebrium_deploy.png) + +We will add these endpoints to our frontend interface. + +## Connect frontend + +We created a public fork of the Pipecat frontend to show you a nice demo of this application. You can clone the repo [here](https://github.com/CerebriumAI/web-client-ui). + +Follow the instructions in the README.md and then populate the following variables in your .env.development.local: + +``` +VITE_SERVER_URL=https://api.cortex.cerebrium.ai/v4/p-xxxxx/ #This is the base url. Do not include the function names +VITE_SERVER_AUTH= #This is the JWT token you can get from the API Keys section of your Cerebrium Dashboard. +``` + +You can now run `yarn dev` and go to the URL: http://localhost:5173/ to test your application! + +### Conclusion + +Hopefully, this tutorial acts as a good starting point for you to implement voice into your application as well as extend it into image and vision capabilities. Pipecat is an +extensible and open-source framework that makes it easy to build applications like this, and Cerebrium makes the process seamless to deploy and autoscale while only paying for the +compute you need. + +Tag us as **@cerebriumai** so we can see what you build and please feel free to ask questions/send feedback to us in our [Slack](https://join.slack.com/t/cerebriumworkspace/shared_invite/zt-1qojg3eac-q4xyu5O~MeniNIg2jNeadg) or [Discord](https://discord.gg/ATj6USmeE2) communities. diff --git a/v4/examples/comfyUI.mdx b/v4/examples/comfyUI.mdx new file mode 100644 index 00000000..a8caa4f4 --- /dev/null +++ b/v4/examples/comfyUI.mdx @@ -0,0 +1,216 @@ +--- +title: "ComfyUI application at Scale" +description: "Deploy a ComfyUI application" +--- + + + This example is only compatible with CLI v1.20 and later. 
Should you be making + use of an older version of the CLI, please run `pip install --upgrade + cerebrium` to upgrade it to the latest version. + + +### Introduction + +ComfyUI is a popular no-code interface for building complex stable diffusion workflows. Due to its ease of use, modular setup as well as its intuitive flowchart interface, the community of ComfyUI users has built a pretty phenomenal collection of workflows! There are even websites dedicated to the sharing of workflows built to help get you started: + +- https://comfyworkflows.com/ +- https://openart.ai/workflows/home + +While it is currently easy to experiment with ComfyUI workflows, there isn’t a lot of guidance or tutorials on how to productionize these workflows at scale. In this tutorial, I am going to show you how you can use Cerebrium to deploy your pipelines to an API endpoint so they can autoscale based on demand and you only pay for the compute you use. You can find the full example code [here](https://github.com/CerebriumAI/examples/tree/master/12-comfyui). + +### Creating your ComfyUI workflow locally + +We first need to create our workflow which you can do locally on your machine or by renting a GPU from [Lambda Labs](https://lambdalabs.com/). + +Before we get started, make sure ComfyUI is [installed properly on your local environment](https://github.com/comfyanonymous/ComfyUI#installing). + +For our use case, we are going to use Stable Diffusion XL and ControlNet to create a cool QR code. If you have an existing workflow set up, you can simply skip to the step “Export ComfyUI Workflow”. + +1. Let us first create our Cerebrium project with: `cerebrium init comfyUI` +2. Inside your project, let's clone the ComfyUI GitHub project: `git clone https://github.com/comfyanonymous/ComfyUI` +3. Download the following models and install them in the appropriate folders within the ComfyUI folder: + - SDXL base in models/checkpoints. + - ControlNet in models/controlnet. +4. 
To run ComfyUI locally, run the command: `python main.py --force-fp16` on macOS. Make sure you run this inside the ComfyUI folder you just cloned. +5. A server should be loaded locally at http://127.0.0.1:8188/ + + +In this view, you should be able to see the default user interface for a ComfyUI workflow. You can use this locally running instance to create your image generation pipeline. + +### Export ComfyUI Workflow + +In our example GitHub repository, we have a workflow.json file. You can click the “Load” button on the right to load in our workflow. You then should see the workflow populated. + +![ComfyUI Workflow](/images/examples/comfyui1.png) + +Don’t worry about the pre-filled values and prompts, we will edit these values on inference when we run our workflow. + +To export this workflow to work with Cerebrium, we need to export this to an API format. In the hovering box at the top right, click the gear icon (settings). You must then make sure that “Enable dev mode” is selected and then you can close the popup. + +![Save ComfyUI API format](/images/examples/comfyui2.png) + +You should then see that a button appears on the right hover box that reads “Save (API format)”. Use that to save your workflow with the name “workflow_api.json”. + +### ComfyUI Application + +Our main application code lives in main.py so we now use the exported ComfyUI API workflow file above to create an API endpoint for our workflow. In the code below we initialize the ComfyUI server and load in our workflow API template. In our predict function, we then send in the various values we would like to alter in our workflow (i.e. prompt, image) and run the workflow. Our function then uses the inputs to alter the ComfyUI workflow values and then generates the output which in this case is base64 encoded images. + +In Cerebrium, the code outside the predict function runs only on initialization (i.e. startup) whereas subsequent requests will only run the predict function. 
+ +We need to alter our workflow_api.json file to have placeholders so that we can replace user values on inference. You can alter the file as follows. + +- Replace line 4, the seed input, with: "\{\{seed\}\}" +- Replace line 45, the input text of node 6 with: "\{\{positive_prompt\}\}" +- Replace line 58, the input text of node 7 with: "\{\{negative_prompt\}\}" +- Replace line 108, the image of node 11 with: "\{\{controlnet_image\}\}" + +We created a file that contains utility functions to make it easier to work with ComfyUI. You can find the code here. Create a file named helpers.py and copy the code into there. + +```python +from typing import Optional +from pydantic import BaseModel + +import copy +import json +import os +import time +import uuid +from multiprocessing import Process +from typing import Dict + +import websocket +from helpers import ( + convert_outputs_to_base64, + convert_request_file_url_to_path, + fill_template, + get_images, + setup_comfyui, +) + +server_address = "127.0.0.1:8188" +client_id = str(uuid.uuid4()) +original_working_directory = os.getcwd() +global json_workflow +json_workflow = None + +global side_process +side_process = None +if side_process is None: + side_process = Process( + target=setup_comfyui, + kwargs=dict( + original_working_directory=original_working_directory, + data_dir="", + ), + ) + side_process.start() + +# Load the workflow file as a python dictionary +with open( + os.path.join("./", "workflow_api.json"), "r" +) as json_file: + json_workflow = json.load(json_file) + +# Connect to the ComfyUI server via websockets +socket_connected = False +while not socket_connected: + try: + ws = websocket.WebSocket() + ws.connect( + "ws://{}/ws?clientId={}".format(server_address, client_id) + ) + socket_connected = True + except Exception as e: + print("Could not connect to comfyUI server. 
Trying again...") + time.sleep(5) + +print("Successfully connected to the ComfyUI server!") + +class Item(BaseModel): + workflow_values: Optional[Dict] + +def predict(workflow_values=None, run_id, logger): + item = Item(workflow_values=workflow_values) + + template_values = item.workflow_values + + template_values, tempfiles = convert_request_file_url_to_path(template_values) + json_workflow_copy = copy.deepcopy(json_workflow) + json_workflow_copy = fill_template(json_workflow_copy, template_values) + outputs = {} # Initialize outputs to an empty dictionary + + try: + outputs = get_images( + ws, json_workflow_copy, client_id, server_address + ) + + except Exception as e: + print('did it get here') + print("Error occurred while running Comfy workflow: ", e) + + for file in tempfiles: + file.close() + + result = [] + for node_id in outputs: + for unit in outputs[node_id]: + file_name = unit.get("filename") + file_data = unit.get("data") + output = convert_outputs_to_base64( + node_id=node_id, file_name=file_name, file_data=file_data + ) + result.append(output) + + return {"result": result} +``` + +### Deploy ComfyUI Application + +If we run cerebrium deploy it will upload the ComfyUI directory and our ~10GB of model weights so if you have a slow internet connection this can be a pain. However, in our repo we have a helper file that will download the models weights for us and put it in the appropriate folder. 
Create a file called model.json with the following contents: + +``` +[ + { + "url": "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors", + "path": "models/checkpoints/sd_xl_base_1.0.safetensors" + }, + { + "url": "https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/diffusion_pytorch_model.fp16.safetensors", + "path": "models/controlnet/diffusers_xl_canny_full.safetensors" + } +] +``` + +This file is basically telling our helper function the URL to download the model from, and the directory to save the file in. It will only download the file on first deploy. On subsequent deploys, we wrote the logic to skip downloading the model if the file already exists. + +This is what your file folder structure should look like: + +![ComfyUI folder structure](/images/examples/comfyui3.png) + +You can now deploy your application by running: cerebrium deploy + +Once your ComfyUI application has been deployed successfully, you should be able to make a request to the endpoint using the following JSON payload: + +```curl +curl --location 'https://api.cortex.cerebrium.ai/v4/p-xxxx/comfyui/predict' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer ' \ +--data '{"workflow_values": { + "positive_prompt": "A top down view of a mountain with large trees and green plants", + "negative_prompt": "blurry, text, low quality", + "controlnet_image": "https://cerebrium-assets.s3.eu-west-1.amazonaws.com/qr-code.png", + "seed": 1000 +} +}' +``` + +You will get two responses from the output: + +- The base64 encoded image of the outline of your original ControlNet image. This is what it uses as a input in your flow. +- A base64 encoded image of the final result. + +![ComfyUI generated QR code](/images/examples/comfyui4.png) + +### Conclusion + +With Cerebrium, companies can implement productionized instances of their ComfyUI workflows to create unique user experiences. 
Users can have peace of mind that their workloads will autoscale with demand and that they will only be charged for the compute used. We are excited to see what you build and please tag @cerebriumai so we can share your work. diff --git a/v4/examples/langchain-langsmith.mdx b/v4/examples/langchain-langsmith.mdx new file mode 100644 index 00000000..1ef58881 --- /dev/null +++ b/v4/examples/langchain-langsmith.mdx @@ -0,0 +1,433 @@ +--- +title: "Langchain and Langsmith" +description: "Deploy an executive assistant using Langsmith and Langchain" +--- + +In this tutorial, I am going to create an executive assistant, Cal-vin, to manage my calendar (Cal.com) with employees, customers, partners and friends. I will use the LangChain SDK to create my agent, +the LangSmith platform to monitor how it is scheduling my time throughout the day and monitor situations in which it fails to do a correct job. Lastly, we will deploy this application on Cerebrium to +show how it handles deploying and scaling our application seamlessly. + +You can find the final version of the code [here](https://github.com/CerebriumAI/examples/tree/master/13-tool-calling-langsmith) + +### Concepts + +To create an application like this, we will need to interact with my calendar based on instructions from a user. This is a perfect use case for an agent with function (tool) calling ability. LangChain is a framework with a lot of functionality supporting agents; its team also created LangSmith, so an integration should be relatively easy. + +When we refer to a tool, we are referring to any framework, utility, or system that has defined functionality around a use case. For example, we might have a tool to search Google, a tool to pull our credit card transactions, etc. + +LangChain also has three concepts/functions that we need to understand: + +`ChatModel.bind_tools()`: This is a method for attaching tool definitions to model calls.
Each model provider has a different way they expect tools to be defined however; LangChain has created a standard interface so you can switch between providers and it is versatile. You can pass in a tool definition (a dict), as well as other objects from which a tool definition can be derived: namely Pydantic classes, LangChain tools, and arbitrary functions etc. The tool definition tells the LLM what this tool does and how to interact with it. + +```python +@tool +def exponentiate(x: float, y: float) -> float: + """Raise 'x' to the 'y'.""" + return x**y +``` + +`AIMessage.tool_calls`: This is an attribute on the AIMessage type returned from the model for easily accessing the tool calls the model decided to make. It will specify any tool invocations in the format specified from the bind_tools call: + +```python +# -> AIMessage( +# content=..., +# additional_kwargs={...}, +# tool_calls=[{'name': 'exponentiate', 'args': {'y': 2.743, 'x': 5.0}, 'id': '54c166b2-f81a-481a-9289-eea68fc84e4f'}] +# response_metadata={...}, +# id='...' +# ) +``` + +`create_tool_calling_agent()`: The tool_calling_agent is just a standard way to bring the above concepts all together to work across providers that have different formats so you can easily switch out models. + +```python +agent = create_tool_calling_agent(llm, tools, prompt) +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + +agent_executor.invoke({"input": "what's 3 plus 5 raised to the 2.743. also what's 17.24 - 918.1241", }) +``` + +### Setup Cal.com + +I am a big fan of [Cal.com](https://cal.com) and believe the team is going to keep shipping incredible features and so I wanted to build a demo using them. If you do not have an account you can create one [here](https://app.cal.com/signup). +Cal will be our source of truth, so if you update time zones, or working hours in Cal, our assistant will reflect that. 
+ +Once your account is created, click on β€œAPI keys” in the left sidebar and create an API key with no expiration date. + +![Cal.comAPI Keys](/images/examples/langchain_langsmith/cal_api_keys.png) + +To test that it’s working, you can do a simple CURL request. Just replace the following variables below: + +- Username +- API key +- Update the dateFrom and to dateTo variables + ‍ + +```curl +curl --location 'https://api.cal.com/v1/availability?apiKey=cal_live_xxxxxxxxxxxxxx&dateFrom=2024-04-15T00%3A00%3A00.000Z&dateTo=2024-04-22T00%3A00%3A00.000Z&username=michael-louis-xxxx' +``` + +You should get a response similar to the following: + +``` +{ + "busy": [ + { + "start": "2024-04-15T13:00:00.000Z", + "end": "2024-04-15T13:30:00.000Z" + }, + { + "start": "2024-04-22T13:00:00.000Z", + "end": "2024-04-22T13:30:00.000Z" + }, + { + "start": "2024-04-29T13:00:00.000Z", + "end": "2024-04-29T13:30:00.000Z" + }, + .... + ], + "timeZone": "America/New_York", + "dateRanges": [ + { + "start": "2024-04-15T13:45:00.000Z", + "end": "2024-04-15T16:00:00.000Z" + }, + { + "start": "2024-04-15T16:45:00.000Z", + "end": "2024-04-15T19:45:00.000Z" + }, + .... + { + "start": "2024-04-19T18:45:00.000Z", + "end": "2024-04-19T21:00:00.000Z" + } + ], + "oooExcludedDateRanges": [ + + ], + "workingHours": [ + { + "days": [ + 1, + 2, + 3, + 4, + 5 + ], + "startTime": 780, + "endTime": 1260, + "userId": xxxx + } + ], + "dateOverrides": [], + "currentSeats": null, + "datesOutOfOffice": {} +} +``` + +Great! Now we know that our API key is working and pulling information from our calendar. 
The API calls we will be using later in this tutorial are: + +- **/availability**: Get your availability +- **/bookings**: Book a slot + +### Cerebrium setup + +If you don’t have a Cerebrium account, you can create one by signing up [here](https://dashboard.cerebrium.ai/register) and following the documentation [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) to get setup + +In your IDE, run the following command to create our Cerebrium starter project: `cerebrium init agent-tool-calling`. This creates two files: + +- Main.py - Our entrypoint file where our code lives +- cerebrium.toml - A configuration file that contains all our build and environment settings + ‍ + +Add the following pip packages near the bottom of your cerebrium.toml. This will be used in creating our deployment environment. + +``` +[cerebrium.dependencies.pip] +pydantic = "latest" +langchain = "latest" +pytz = "latest" ##this is used for timezones +openai = "latest" +langchain_openai = "latest" +``` + +We will be using OpenAI GPT3.5 for our use cases and so we need an API key from them. If you don’t have an account, you can sign up [here](https://openai.com/). You can then create an API key [here](https://platform.openai.com/api-keys). The API key should be in the format: β€œsk_xxxxx”. + +In your Cerebrium dashboard you can then add your Cal.com and OpenAI API keys as secrets by navigating to β€œSecrets” in the sidebar. For the sake of this tutorial I called mine β€œCAL_API_KEY” and β€œOPENAI_API_KEY”. We can now access these values in our code at runtime without exposing them in our code. + +![Cerebrium Secrets Dashboard](/images/examples/langchain_langsmith/cerebrium_secrets.png) + +### Agent Setup + +To start we need to write two tool functions (in our `main.py` file) that the agent will use to check availability on our calendar as well as book a slot. 
+ +- Get availability tool +- You would have seen from the test API request we did above to Cal.com that the API returns your availability in the following way: +- The time slots that you are already busy +- Your working hours on each day + ‍ + +Below is the code to achieve this: + +```python + +from langchain_core.tools import tool +from cerebrium import get_secret +import requests +from cal import find_available_slots + +@tool +def get_availability(fromDate: str, toDate: str) -> float: + """Get my calendar availability using the 'fromDate' and 'toDate' variables in the date format '%Y-%m-%dT%H:%M:%S.%fZ'""" + + url = "https://api.cal.com/v1/availability" + params = { + "apiKey": get_secret("CAL_API_KEY"), + "username": "xxxxx", + "dateFrom": fromDate, + "dateTo": toDate + } + response = requests.get(url, params=params) + if response.status_code == 200: + availability_data = response.json() + available_slots = find_available_slots(availability_data, fromDate, toDate) + return available_slots + else: + return {} +``` + +In the above snippet we are doing a few things: + +- We give our function the @tool decorator so that LangChain can tell the LLM this is a tool. +- We write a docstring that explains to the LLM what this function does and what input it expects.The LLM will make sure it asks the user enough questions to collect this input data. +- We wrote a helper function, find_available_slots, to take the information returned from the Cal.com API and format it so its more readable. It will show the user the time slots available on each day. Make sure its in your directory! + ‍ + +We then follow a similar practice to write our book_slot tool. This will book a slot in my calendar based on the selected time/day. You can get the eventTypeId from your dashboard, select an event and grab the ID in the URL. 
+ +```python +@tool +def book_slot(datetime: str, name: str, email: str, title: str, description: str) -> float: + """Book a meeting on my calendar at the requested date and time using the 'datetime' variable. Get a description about what the meeting is about and make a title for it""" + url = "https://api.cal.com/v1/bookings" + params = { + "apiKey": get_secret("CAL_API_KEY"), + "username": "xxxx", + "eventTypeId": "xxx", + "start": datetime, + "responses": { + "name": name, + "email": email, + "guests": [], + "metadata": {}, + "location": { + "value": "inPerson", + "optionValue": "" + } + }, + "timeZone": "America/New York", + "language": "en", + "status": "PENDING", + "title": title, + "description": description, + } + response = requests.post(url, params=params) + if response.status_code == 200: + booking_data = response.json() + return booking_data + else: + print('error') + print(response) + return {} +``` + +Now that we have created our two tools let us create our agent in our `main.py` file too: + +```python +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_core.tools import tool +from langchain.agents import create_tool_calling_agent, AgentExecutor +from langchain_openai import ChatOpenAI + +prompt = ChatPromptTemplate.from_messages([ + ("system", "you're a helpful assistant managing the calendar of Michael Louis. You need to book appointments for a user based on available capacity and their preference. You need to find out if the user is: From Michaels team, a customer of Cerebrium or a friend or entrepreneur. If the person is from his team, book a morning slot. If its a potential customer for Cerebrium, book an afternoon slot. If its a friend or entrepreneur needing help or advice, book a night time slot. If none of these are available, book the earliest slot. Do not book a slot without asking the user what their preferred time is. 
Find out from the user, their name and email address."), + MessagesPlaceholder(variable_name="chat_history"), + ("human", "{input}"), + MessagesPlaceholder(variable_name="agent_scratchpad"), +]) + +tools = [get_availability, book_slot] + + +llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0, api_key=get_secret("OPENAI_API_KEY")) +agent = create_tool_calling_agent(llm, tools, prompt) +agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True) +``` + +The above snippet is used to create our agent executor which consists of: + +- Our prompt template: + - This is where we can give instructions to our agent on what role it is taking on, its goal and how it should perform in certain situations etc. The more precise and concise this is, the better. + - Chat History is where we will inject all previous messages so that the agent has context on what was said previously. + - Input is new input from the end user. +- We then instantiate our GPT3.5 model that will be the LLM we will be using. You can swap this out for Anthropic or any other provider just by replacing this one line - LangChain makes this seamless. +- Lastly, we join this all together with our tools to create an agent executor. + +### Setup Chatbot + +The above code is static in that it will only reply to our first question, but we might need to have a conversation to find a time that suits both the user and my schedule. We therefore need to create a chatbot with tool calling capabilities and the ability to remember past messages. LangChain supports this with RunnableWithMessageHistory(). + +It essentially allows us to store the previous replies of our conversation in a chat_history variable (mentioned above in our prompt template) and tie this all to a session identifier so your API can remember information pertaining to a specific user/session.
Below is our code to implement this: + +```python +from langchain.memory import ChatMessageHistory +from langchain_core.runnables.history import RunnableWithMessageHistory + +demo_ephemeral_chat_history_for_chain = ChatMessageHistory() +conversational_agent_executor = RunnableWithMessageHistory( + agent_executor, + lambda session_id: demo_ephemeral_chat_history_for_chain, + input_messages_key="input", + output_messages_key="output", + history_messages_key="chat_history", +) +``` + +Let us run a simple local test to make sure everything is working as expected. + +```python +class Item(BaseModel): + prompt: str + session_id: str + +def predict(item, run_id, logger): + item = Item(**item) + + output = conversational_agent_executor.invoke( + { + "input": item.prompt, + }, + {"configurable": {"session_id": item.session_id}}, + ) + + return {"result": output} # return your results + +if __name__ == "__main__": + while True: + user_input = input("Enter the input (or type 'exit' to stop): ") + if user_input.lower() == 'exit': + break + result = predict({"prompt": user_input, "session_id": "12345"}, "test", logger=None) + print(result) +``` + +The above code does the following: + +- We define a Pydantic object which specifies the parameters our API expects - the user prompt and a session id to tie the conversation to. +- The predict function in Cerebrium is the entry point for our API so we just pass the prompt and session id to our agent and return the results. + + +To run this, simply install the pip dependencies manually by typing the following into your terminal `pip install pydantic langchain pytz openai langchain_openai langchain-community` and then run `python main.py` to execute your main python file. You will need to replace your secrets with the actual values when running locally.
You should then see output similar to the following: + +![Langchain Agent](/images/examples/langchain_langsmith/langchain_agent.png) + +If you keep talking and answering, you will see it will eventually book a slot. + +### Integrate Langsmith + +When releasing an application to production, it's vital to know how it is performing, how users are interacting with it, where it is going wrong, etc. This is especially true for agent applications since they have non-deterministic workflows based on how a user interacts with the application, and so we want to make sure we handle any and all edge cases. LangSmith is a logging, debugging and monitoring tool from LangChain that we will use. You can read more about LangSmith [here](https://docs.smith.langchain.com/monitoring). + +Let's set up LangSmith to monitor and debug our application. First, add `langsmith` as a pip dependency to our cerebrium.toml file. + +Next, we need to create an account on LangSmith and generate an API key - it's free 🙂. You can sign up for an account [here](https://smith.langchain.com/) and can generate an API key by clicking the settings (gear icon) in the bottom left. + +Next, we need to set the following environment variables. You can add the following code at the top of your main.py. You can add the API key to your secrets in Cerebrium. + +```python +import os +os.environ['LANGCHAIN_TRACING_V2']="true" +os.environ['LANGCHAIN_API_KEY']=get_secret("LANGCHAIN_API_KEY") +``` + +To integrate tracing into your application, it is as easy as adding the @traceable decorator to your function(s). LangSmith automatically traverses our functions and subsequent calls, so we only need to put it above the predict function and we will see all the tool invocations and OpenAI responses automatically. If there is a function that predict doesn’t call, for example, but that you instantiate another way, then make sure to decorate it with traceable.
Edit main.py to have the following: + +``` +from langsmith import traceable + +@traceable +def predict(item, run_id, logger): +``` + +Easy! Now LangSmith is set up. Run python main.py to run your file and test booking an appointment with yourself. + +After you have completed a successful test run you should see data populating in LangSmith. You should see the following: + +![LangSmith Runs Dashboard](/images/examples/langchain_langsmith/langsmith_runs.png) + +In the Runs tab, you can see all your runs (ie: invocations/API requests). + +In 1 above, it takes the name of our function, input is set to the Cerebrium RunID which in this case we set to β€œtest”. Lastly, you can see the input as well as the total latency of your run. + +LangSmith wants you to create various automations based on your data. These can be: + +- Sending data to annotation queues that your team needs to label for positive and negative use cases +- Sending to datasets that you can eventually train a model on +- Online evaluation is a new feature that allows you to use a LLM to evaluate data for rudeness, topic etc. +- Triggering webhook endpoints +- and much more… + +You can set these automations by clicking the β€œAdd rule” button above (2) and specifying under what conditions you would like the above to occur. The options to create a rule on are a filter, a sampling rate, and an action. + +Lastly, in 3 you can see overall metrics about your project such as number of runs, error rate, latency etc. + +Since our interface is conversational, there are many use cases where you would like to follow the conversation between your agent and a user without all the bloat. Threads in LangSmith does exactly this. I can see how a conversation evolved over time and if something seems out of the ordinary, I can open the trace to dive deeper. Note that threads are associated with the session id we gave to it. 
+ +![LangSmith Threads](/images/examples/langchain_langsmith/langsmith_threads.png) + +Lastly, you can monitor performance metrics regarding your agent in the Monitor tab. It shows metrics such as trace count, LLM call success rate, time to first token and much more. + +![LangSmith Performance Monitoring](/images/examples/langchain_langsmith/langsmith_performance.png) + +LangSmith is a great choice of tool for those building agents and one that’s extremely simple to integrate. There is so much more functionality that we didn’t explore, but it covers a lot of the application feedback loop of collecting/annotating data → monitoring, and then repeating. + +### Deploy to Cerebrium + +To deploy this application to Cerebrium you can simply run the command `cerebrium deploy` in your terminal. Just make sure to delete the `if __name__ == "__main__":` block since that was just to run locally. + +If it deployed successfully, you should see something like this: + +![Cerebrium Deployment](/images/examples/langchain_langsmith/cerebrium_deploy.png) + +You can now call this via an API endpoint and our agent will remember the conversation as long as the session id is the same. Cerebrium will automatically scale up your application based on demand, and you only pay for the compute you use. + +``` +{ + "run_id": "UHCJ_GkTKh451R_nKUd3bDxp8UJrcNoPWfEZ3AYiqdY85UQkZ6S1vg==", + "status_code": 200, + "result": { + "result": { + "input": "Hi! I would like to book a time with Michael the 18th of April 2024.", + "chat_history": [], + "output": "Michael is available on the 18th of April 2024 at the following times:\n1. 13:00 - 13:30\n2. 14:45 - 17:00\n3. 17:45 - 19:00\n\nPlease let me know your preferred time slot. Are you from Michael's team, a potential customer of Cerebrium, or a friend/entrepreneur seeking advice?"
+ } + }, + "run_time_ms": 6728.828907012939, + "process_time_ms": 6730.178117752075 +} +``` + +You can find the final version of the code [here](https://github.com/CerebriumAI/examples/tree/master/13-tool-calling-langsmith). + +### Further improvements + +In this tutorial I didn’t get to the following but I think it would be interesting to implement: + +- You can stream back the responses to the user to make the experience more seamless. LangChain makes it easy to do this. +- Integrate with my email so that if I tag Cal-vin in a thread, it can go through the conversation and get context to schedule the meeting. +- Add voice capabilities so that someone can phone me and book a time and Cal-vin can respond. + +### Conclusion + +The integration of LangChain, LangSmith and Cerebrium makes it extremely easy to deploy agents at scale! LangChain is a great framework for the orchestration of LLMs, tooling and memory, while LangSmith is great for monitoring this in production and using it to identify and iterate on edge cases. Cerebrium makes this agent scalable across 100s or 1000s of CPUs/GPUs while only allowing you to pay for compute as it's used. + +Tag us as **@cerebriumai** in extensions you make to the code repository so we can share it with our community. diff --git a/examples/langchain.mdx b/v4/examples/langchain.mdx similarity index 94% rename from examples/langchain.mdx rename to v4/examples/langchain.mdx index c9a1ada2..12067139 100644 --- a/examples/langchain.mdx +++ b/v4/examples/langchain.mdx @@ -82,8 +82,8 @@ def store_segments(segments): return texts, start_times -def predict(item, run_id, logger): - item = Item(**item) +def predict(url, question, run_id): + item = Item(url=url, question=question) video = pytube.YouTube(item.url) video.streams.get_highest_resolution().filesize @@ -146,7 +146,7 @@ We then integrate Langchain with a Cerebrium deployed endpoint to answer questio ## Deploy -Your cerebrium.toml file is where you can set your compute/environment.
Please make sure that the GPU you specify is a AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. You cerebrium.toml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. Your cerebrium.toml file should look like: ```toml @@ -160,8 +160,8 @@ disable_confirmation = false [cerebrium.deployment] name = "langchain-qa" python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" +include = ["./*", "main.py"] +exclude = ["./.*", "./__*"] [cerebrium.hardware] gpu = "AMPERE_A5000" @@ -200,8 +200,8 @@ cerebrium deploy Once deployed, we can make the following request: ```curl -curl --location --request POST 'https://run.cerebrium.ai/v3/p-xxxxxx/langchain/predict' \ ---header 'Authorization: ' \ +curl --location --request POST 'https://api.cortex.cerebrium.ai/v4/p-xxxxxx/langchain/predict' \ +--header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ "url": "https://www.youtube.com/watch?v=UF8uR6Z6KLc&ab_channel=Stanford", diff --git a/examples/mistral-vllm.mdx b/v4/examples/mistral-vllm.mdx similarity index 69% rename from examples/mistral-vllm.mdx rename to v4/examples/mistral-vllm.mdx index 36e7da2c..056f7935 100644 --- a/examples/mistral-vllm.mdx +++ b/v4/examples/mistral-vllm.mdx @@ -3,6 +3,12 @@ title: "Mistral 7B with vLLM" description: "Deploy Mistral 7B with vLLM" --- + + This example is only compatible with CLI v1.20 and later. Should you be making + use of an older version of the CLI, please run `pip install --upgrade + cerebrium` to upgrade it to the latest version. + + In this tutorial, we will show you how to deploy Mistral 7B using the popular vLLM inference framework.
To see the final implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/4-faster-inference-with-vllm) @@ -38,11 +44,11 @@ from pydantic import BaseModel class Item(BaseModel): prompt: str - temperature: Optional[float] = 0.8 - top_p: Optional[float] = 0.75 - top_k: Optional[float] = 40 - max_tokens: Optional[int] = 256 - frequency_penalty: Optional[float] = 1 + temperature: float + top_p: float + top_k: float + max_tokens: int + frequency_penalty: float ``` Above, we use Pydantic as our data validation library. We specify the parameters that are required as well as the parameters that are not (ie: using the Optional keyword) as well as assign defaults to some values. Prompt is the only required parameter so if it is not present in the request, the user will automatically receive an error message. @@ -52,26 +58,41 @@ Above, we use Pydantic as our data validation library. We specify the parameters Below, we will use the Whisper model from OpenAI to convert the video audio to text. We will then split the text into its phrase segments with its respective timings, so we know the exact source of where our model got the answer from. 
```python -import torch from vllm import LLM, SamplingParams +from huggingface_hub import login +from cerebrium import get_secret + +# Your huggingface token (HF_AUTH_TOKEN) should be stored in your project secrets on your dashboard +login(token=get_secret("HF_AUTH_TOKEN")) + +# Initialize the model +llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1", dtype="bfloat16", max_model_len=20000, gpu_memory_utilization=0.9) + + +def predict(prompt, temperature=0.8, top_p=0.75, top_k=40, max_tokens=256, frequency_penalty=1): + item = Item( + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + max_tokens=max_tokens, + frequency_penalty=frequency_penalty + ) + + sampling_params = SamplingParams( + temperature=item.temperature, + top_p=item.top_p, + top_k=item.top_k, + max_tokens=item.max_tokens, + frequency_penalty=item.frequency_penalty + ) -llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1", dtype="bfloat16") - -def predict(item, run_id, logger): - item = Item(**item) - - # Now just setup your sampling parameters for inference: - sampling_params = SamplingParams(temperature=item.temperature, top_p=item.top_p, top_k=item.top_k, max_tokens=item.max_tokens, frequency_penalty=item.frequency_penalty) - - # And feed your prompt and sampling params into your LLM pipeline as follows. outputs = llm.generate([item.prompt], sampling_params) - # Extract your text outputs: generated_text = [] for output in outputs: generated_text.append(output.outputs[0].text) - # And return the result return {"result": generated_text} ``` @@ -83,37 +104,42 @@ The implementation in our **predict** function is pretty straight forward in tha ## Deploy -Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the GPU you specify is an AMPERE_A5000, and that you have enough memory (RAM) on your instance to run the models. Your cerebrium.toml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. 
Your cerebrium.toml file should look like: ```toml [cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false +predict_data = "{\"prompt\": \"Here is some example predict data for your config.yaml which will be used to test your predict function on build.\"}" +hide_public_endpoint = false disable_animation = false +disable_build_logs = false +disable_syntax_check = false +disable_predict = false log_level = "INFO" disable_confirmation = false [cerebrium.deployment] name = "mistral-vllm" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" +python_version = "3.11" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./example_exclude"] +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" [cerebrium.hardware] -gpu = "AMPERE_A5000" +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" cpu = 2 memory = 16.0 gpu_count = 1 [cerebrium.scaling] min_replicas = 0 +max_replicas = 5 cooldown = 60 -[cerebrium.dependencies.apt] -ffmpeg = "latest" - [cerebrium.dependencies.pip] +huggingface-hub = "latest" sentencepiece = "latest" torch = ">=2.0.0" vllm = "latest" @@ -123,6 +149,9 @@ xformers = "latest" [cerebrium.dependencies.conda] +[cerebrium.dependencies.apt] +ffmpeg = "latest" + ``` To deploy the model use the following command: @@ -134,8 +163,8 @@ cerebrium deploy Once deployed, we can make the following request: ```curl -curl --location --request POST 'https://run.cerebrium.ai/v3/p-xxxxxx/mistral-vllm/predict' \ ---header 'Authorization: ' \ +curl --location --request POST 'https://api.cortex.cerebrium.ai/v4/p-/mistral-vllm/predict' \ +--header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ "prompt: "What is the capital city of France?" 
diff --git a/v4/examples/openai-compatible-endpoint-vllm.mdx b/v4/examples/openai-compatible-endpoint-vllm.mdx new file mode 100644 index 00000000..69219dd1 --- /dev/null +++ b/v4/examples/openai-compatible-endpoint-vllm.mdx @@ -0,0 +1,151 @@ +--- +title: "OpenAI compatible vLLM endpoint" +description: "Create an OpenAI compatible endpoint using the vLLM framework" +--- + +In this tutorial, we will create an OpenAI compatible endpoint that can be used with any open-source model. This allows you to use the same code as your OpenAI commands but swap +in Cerebrium serverless functions with a two-line code change. + +To see the final code implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/29-openai-compatible-endpoint) + +### Cerebrium setup + +If you don’t have a Cerebrium account, you can create one by signing up [here](https://dashboard.cerebrium.ai/register) and following the documentation [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) to get set up + +In your IDE, run the following command to create our Cerebrium starter project: `cerebrium init openai-compatible-endpoint`. This creates two files: + +- **Main.py** - Our entrypoint file where our code lives +- **cerebrium.toml** - A configuration file that contains all our build and environment settings + ‍ + Add the following pip packages and hardware requirements near the bottom of your cerebrium.toml. This will be used in creating our deployment environment. + +``` +[cerebrium.hardware] +cpu = 2 +memory = 12.0 +compute = "AMPERE_A10" + +[cerebrium.dependencies.pip] +vllm = "latest" +pydantic = "latest" +``` + +To start, let us define our imports and initialize our model. In this tutorial, we will use the Llama 3.1 model by Meta which requires authorization on Hugging Face. Add your HF token +to your secrets section in the Cerebrium dashboard. Add the following to your main.py + +```python +from vllm import SamplingParams, 
AsyncLLMEngine +from vllm.engine.arg_utils import AsyncEngineArgs +from pydantic import BaseModel +from typing import List, Dict, Any +import time +import json +from huggingface_hub import login +from cerebrium import get_secret + +# Your huggingface token (HF_AUTH_TOKEN) should be stored in your project secrets on your dashboard +login(token=get_secret("HF_AUTH_TOKEN")) + +engine_args = AsyncEngineArgs( + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + gpu_memory_utilization=0.9, # Increase GPU memory utilization + max_model_len=8192 # Decrease max model length +) +engine = AsyncLLMEngine.from_engine_args(engine_args) +``` + +We now define the require output format OpenAI endpoints expect using Pydantic and specify our endpoint + +```python +class Message(BaseModel): + role: str + content: str + +class ChatCompletionResponse(BaseModel): + id: str + object: str + created: int + model: str + choices: List[Dict[str, Any]] + +async def run(messages: List[Message], model: str, run_id: str, stream: bool = True, temperature: float = 0.8, top_p: float = 0.95): + prompt = " ".join([f"{msg['role']}: {msg['content']}" for msg in messages]) + sampling_params = SamplingParams(temperature=temperature, top_p=top_p) + results_generator = engine.generate(prompt, sampling_params, run_id) + previous_text = "" + full_text = "" # Collect all generated text here + + async for output in results_generator: + prompt = output.outputs + new_text = prompt[0].text[len(previous_text):] + previous_text = prompt[0].text + full_text += new_text # Append new text to full_text + + response = ChatCompletionResponse( + id=run_id, + object="chat.completion", + created=int(time.time()), + model=model, + choices=[{ + "text": new_text, + "index": 0, + "logprobs": None, + "finish_reason": prompt[0].finish_reason or "stop" + }] + ) + yield json.dumps(response.model_dump()) +``` + +Above the following is happening: + +- We specify all the parameters we send in our function signature. 
You can set optional or default values. We automatically add the run_id parameter to your function with a unique identifier for every request. +- We put the entire prompt through the model and loop through the generated results. +- If stream=True, we yield a result. Since we are using an async function and yield, this is how we achieve streaming functionality on Cerebrium; otherwise, we return the entire result at the end. + +## Deploy & Inference + +To deploy the model, use the following command: + +```bash +cerebrium deploy +``` + +Once deployed, you will see we generate a curl command for this application that looks something like: + +```curl +curl --location 'https://api.cortex.cerebrium.ai/v4/p-/openai-compatible-endpoint/{function}' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer ' \ +--data '{"..."}' +``` + +In Cerebrium, every function name is now an endpoint, so to call this endpoint we would end the URL with /run. However, OpenAI compatible endpoints need to end with /chat/completions. +We have made all endpoints OpenAI compatible, so to call the endpoint you can do the following in another file: + +```python +import os +from openai import OpenAI + +client = OpenAI( + base_url="https://api.cortex.cerebrium.ai/v4/dev-p-xxxxxxx/openai-compatible-endpoint/run", + api_key="", +) + +chat_completion = client.chat.completions.create( + messages=[ + {"role": "user", "content": "What is a mistral?"}, + {"role": "assistant", "content": "A mistral is a type of cold, dry wind that blows across the southern slopes of the Alps from the Valais region of Switzerland into the Ligurian Sea near Genoa. 
It is known for its strong and steady gusts, sometimes reaching up to 60 miles per hour."}, + {"role": "user", "content": "How does the mistral wind form?"} + ], + model="meta-llama/Meta-Llama-3.1-8B-Instruct", + stream=True +) +for chunk in chat_completion: + print(chunk) +print("Finished receiving chunks.") +``` + +Above, we set our base URL to the one returned by our deploy command - it ends in /run since that's the function we are calling. Lastly, we use our JWT token, which is returned in the +cURL command when you deploy or can be found in your Cerebrium dashboard under the section API Keys. + +Voilà! You now have an OpenAI compatible endpoint that you can customize to your liking! diff --git a/v4/examples/realtime-voice-agents.mdx b/v4/examples/realtime-voice-agents.mdx new file mode 100644 index 00000000..be46343e --- /dev/null +++ b/v4/examples/realtime-voice-agents.mdx @@ -0,0 +1,392 @@ +--- +title: "Real-time Voice AI Agent" +description: "Deploy a real-time voice AI agent" +--- + +In this tutorial, I am going to create a real-time voice AI agent that can respond to any query via speech, in speech, in ~500ms. This is an extremely flexible implementation where you can swap in any Large Language model, Text-to-speech (TTS) model and Speech-to-text (STT) model of your liking. This is extremely useful for use cases involving voice such as customer service bots, receptionists and many more. + +To create this application, we use [Pipecat](https://www.pipecat.ai/), an open source framework for voice and multimodal conversational AI that handles some of the functionality we might need such as handling user interruptions, dealing with audio data etc. We will speak with our voice AI agent via a WebRTC transport, using [Daily](https://daily.co) (the creators of Pipecat) and will deploy this application on Cerebrium to show how it handles deploying and scaling our application seamlessly. 
+ +You can find the final version of the code [here](https://github.com/CerebriumAI/examples/tree/master/18-realtime-voice-agent) + +### Cerebrium setup + +If you don’t have a Cerebrium account, you can create one by signing up [here](https://dashboard.cerebrium.ai/register) and following the documentation [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) to get setup + +In your IDE, run the following command to create our Cerebrium starter project: `cerebrium init voice-agent`. This creates two files: + +- **Main.py** - Our entrypoint file where our code lives +- **cerebrium.toml** - A configuration file that contains all our build and environment settings + ‍ + Add the following pip packages and hardware requirements near the bottom of your cerebrium.toml. This will be used in creating our deployment environment. + +``` +[cerebrium.deployment] +# existing values... +docker_base_image_url = "registry.cerebrium.ai/daily:latest" + +[cerebrium.hardware] +region = "us-east-1" +provider = "aws" +gpu = "AMPERE_A10" +cpu = 4 +memory = 18.0 +gpu_count = 1 + +[cerebrium.dependencies.pip] +torch = ">=2.0.0" +"pipecat-ai[silero, daily, openai, deepgram]" = "latest" +aiohttp = "latest" +torchaudio = "latest" +vllm = "latest" +huggingface_hub = "latest" +``` + +You will also see we specify a Docker base image above. The reason for this is Daily has supplied a Docker image that contains local [Deepgram](https://deepgram.com/) Speech-to-Text (STT) and Text-to-Speech (TTS) models. +This helps us achieve our low latency since everything is running locally and not going over the network. + + + Custom Docker files are not support yet but are rather in the works to be + released soon. This is just a very early preview of how it would work. + + +### Pipecat setup + +In this example, we will be using Llama 3 8B as our LLM and serving it via [vLLM](https://docs.vllm.ai/en/latest/getting_started/installation.html). 
To use Llama 3, we need to be authenticated via Hugging Face. + +To authenticate ourselves, we need to go to HuggingFace and accept the model permissions for [Llama 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) if we haven’t already. It takes about 30 minutes or less for them to accept your request. + +In your Cerebrium dashboard, you can add your HuggingFace token as a secret by navigating to “Secrets” in the sidebar. For the sake of this tutorial, I called mine “HF_TOKEN”. We can now access these values in our code at runtime without exposing them in our code. + +You can then add the following code to your main.py: + +```python +from huggingface_hub import login +import subprocess + +os.environ['OUTLINES_CACHE_DIR'] = '/tmp/.outlines' + +login(token=get_secret('HF_TOKEN')) +# Run vLLM server in background process +def start_server(): + while True: + process = subprocess.Popen( + f"python -m vllm.entrypoints.openai.api_server --port 5000 --model NousResearch/Meta-Llama-3-8B-Instruct --dtype bfloat16 --api-key {get_secret('HF_TOKEN')} --download-dir /persistent-storage/", + shell=True + ) + process.wait() # Wait for the process to complete + logger.error("Server process ended unexpectedly. Restarting in 5 seconds...") + time.sleep(5) # Wait before restarting + +# Start the server in a separate process +server_process = Process(target=start_server, daemon=True) +server_process.start() +``` + +Pipecat currently doesn't support locally instantiated models and requires them to follow the OpenAI compatible format. Therefore we run the vLLM server locally on our instance in a background process. +We monitor the background process to make sure it launched successfully since there seems to be a bug with rapidly starting multiple vLLM instances. If it doesn't launch correctly, we wait 5 seconds before trying again. 
+We set the environment variable for OUTLINES_CACHE_DIR, this has to do with a disk I/O bug in outlines that vLLM uses. GitHub issue is [here](https://github.com/vllm-project/vllm/issues/4193) + +Note, we are running the vLLM server on port **5000** (8000 is automatically used by Cerebrium) and we set the download directory of the model so that subsequent cold starts can be much quicker. + +Now we implement the Pipecat framework by instantiating the various components. Create a function call **main** with the following code: + +```python +import aiohttp +import os +import sys +import subprocess +import time +import requests +import asyncio +from multiprocessing import Process +from loguru import logger + +from pipecat.vad.vad_analyzer import VADParams +from pipecat.vad.silero import SileroVADAnalyzer +from pipecat.transports.services.daily import DailyParams, DailyTransport +from pipecat.services.openai import OpenAILLMService +from pipecat.services.deepgram import DeepgramSTTService +from pipecat.pipeline.task import PipelineParams, PipelineTask +from pipecat.pipeline.runner import PipelineRunner +from pipecat.pipeline.pipeline import Pipeline +from pipecat.frames.frames import LLMMessagesFrame, EndFrame + +from pipecat.processors.aggregators.llm_response import ( + LLMAssistantResponseAggregator, LLMUserResponseAggregator +) + +from helpers import ( + ClearableDeepgramTTSService, + AudioVolumeTimer, + TranscriptionTimingLogger +) + +logger.remove(0) +logger.add(sys.stderr, level="DEBUG") + +deepgram_voice: str = "aura-asteria-en" + +async def main(room_url: str, token: str = None): + + async with aiohttp.ClientSession() as session: + transport = DailyTransport( + room_url, + token if token else get_secret("DAILY_TOKEN"), + "Respond bots", + DailyParams( + audio_out_enabled=True, + transcription_enabled=False, + vad_enabled=True, + vad_analyzer=SileroVADAnalyzer(params=VADParams( + stop_secs=0.2 + )), + vad_audio_passthrough=True + ) + ) + + stt = 
DeepgramSTTService( + name="STT", + api_key=None, + url='ws://127.0.0.1:8082/v1/listen' + ) + + tts = ClearableDeepgramTTSService( + name="Voice", + aiohttp_session=session, + api_key=None, + voice=deepgram_voice, + base_url="http://127.0.0.1:8082/v1/speak" + ) + + llm = OpenAILLMService( + name="LLM", + api_key=get_secret("HF_TOKEN"), + model="NousResearch/Meta-Llama-3-8B-Instruct", + base_url="http://0.0.0.0:5000/v1" + ) + + messages = [ + { + "role": "system", + "content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be converted to audio so don't include special characters in your answers. Respond to what the user said in a creative and helpful way.", + }, + ] + + avt = AudioVolumeTimer() + tl = TranscriptionTimingLogger(avt) + + tma_in = LLMUserResponseAggregator(messages) + tma_out = LLMAssistantResponseAggregator(messages) + + pipeline = Pipeline([ + transport.input(), # Transport user input + avt, # Audio volume timer + stt, # Speech-to-text + tl, # Transcription timing logger + tma_in, # User responses + llm, # LLM + tts, # TTS + transport.output(), # Transport bot output + tma_out, # Assistant spoken responses + ]) + + task = PipelineTask( + pipeline, + PipelineParams( + allow_interruptions=True, + enable_metrics=True, + report_only_initial_ttfb=True + )) +``` + +First, in our main function, we initialize the daily transport layer to receive/send the audio/video data from the Daily room we will connect to. You can see we pass the room_url +we would like to join as well as a token to authenticate us programmatically joining. We also set our VAD stop seconds which is the amount of time we wait for a pause before our bot will respond - in this +example, we set it to 200 milliseconds. + +Next we connect to our locally running Deepgram models that are part of our Docker base image we specified in our cerebrium.toml - these are running on port 8082. 
This is where the Pipecat framework helps convert audio data to text +and vice versa. We then follow the same pattern to connect our locally running LLM model from our vLLM server. + +Lastly, we put this all together as a PipelineTask which is what Pipecat runs all together. The makeup of a task is completely customizable and has support for Image and Vision use cases. You can read more [here](https://docs.pipecat.ai/docs/category/services). +Pipeline tasks come with parameters that make it easy to handle interruptions, swap models to our preference and much more by changing only a few lines of code. + +In the code above, we are importing some helper functions at the top of our file to help with our implementation. You can copy the file from the GitHub repository [here](https://github.com/CerebriumAI/examples/blob/master/18-realtime-voice-agent/helpers.py). +Make sure to name the file **helpers.py**. + +### Daily Event Webhooks + +The Daily Python SDK comes with a lot of event webhooks where you can trigger functionality based on events occurring within your Daily room. We would like to handle events such as a user leaving/joining a call. +Continue to add the following code to the **main()** function. + +```python +# When the first participant joins, the bot should introduce itself. +@transport.event_handler("on_first_participant_joined") +async def on_first_participant_joined(transport, participant): + # Kick off the conversation. + messages.append( + {"role": "system", "content": "Please introduce yourself to the user."}) + await task.queue_frame(LLMMessagesFrame(messages)) + +# When the participant leaves, we exit the bot. +@transport.event_handler("on_participant_left") +async def on_participant_left(transport, participant, reason): + await task.queue_frame(EndFrame()) + +# If the call is ended make sure we quit as well. 
+@transport.event_handler("on_call_state_updated") +async def on_call_state_updated(transport, state): + if state == "left": + await task.queue_frame(EndFrame()) + +runner = PipelineRunner() + +await runner.run(task) +await session.close() +``` + +Above we handle the following events: + +- When the first participant joins, we get the bot to introduce itself to the user. We do this by adding a message to the conversation. +- We add support for multiple participants to join and listen/respond to the bot. +- When a participant leaves or the call is ended, we get the bot to terminate itself. + +From the code above, you will see the events are attached to "Transport", which is the method of communication - in this case the meeting room. We then pass in our defined Pipeline task +to our pipeline runner which executes indefinitely until we signal it to exit which in this case happens when a call ends. If you want to read further about the Pipecat infrastructure +you can read more [here](https://docs.pipecat.ai/docs/understanding-bots/dailyai-architecture) + +### Starting Bot + +We can run our instance with a minimum number of instances by setting "min_replicas" in our cerebrium.toml for the optimal user experience; however, we do also want to handle autoscaling use cases. +We want to make sure the vLLM server is live before the bot joins the meeting and so we make a local GET request to check this. +These models take about 40s to load into VRAM from disk. + +Additionally, we need to run the above code in a separate execution environment so Pipecat does not instantiate multiple instances. To do this, we need to run the above code as a +background process. This will be the entry point of our REST API endpoint to start the Pipecat bot. Once the Pipecat bot has returned (i.e., the call has ended), we will return a response to our API endpoint. 
+We therefore create the following function: + + + Currently, Cerebrium does not support workloads running longer than 5 minutes; + however, it is currently being worked on internally and will be released soon. + This means that conversations are limited to a 5-minute window. If this is an + issue and you have an urgent use case, please reach out to + [support](mailto:support@cerebrium.ai) + + +```python +def check_vllm_model_status(): + url = "http://0.0.0.0:5000/v1/models" + headers = { + "Authorization": f"Bearer {get_secret('HF_TOKEN')}" + } + max_retries = 8 + for _ in range(max_retries): + response = requests.get(url, headers=headers) + if response.status_code == 200: + return True + time.sleep(15) + return False + +def start_bot(room_url: str, token: str = None): + + def target(): + asyncio.run(main(room_url, token)) + + check_vllm_model_status() + process = Process(target=target) + process.start() + process.join() # Wait for the process to complete + return {"message": "session finished"} +``` + +That's it! You now have a fully functioning AI bot that can interact with a user through speech in ~500ms. Imagine the possibilities! + +Let us now create a user-facing UI in order for you to interface with this bot. + +### Creating Meeting Room + +Cerebrium doesn't only have to be used to run AI-heavy workloads; it can run any Python code. Therefore we define two functions for our demo that will create a room to join programmatically +and a temporary token, both of which will only be usable for 5 minutes. To implement this, we use the Daily REST API. + +We need to get our Daily developer token from our profile. If you don't have an account, you can sign up for one [here](https://dashboard.daily.co/u/signup) (they have a generous free tier). +You can then go to the "developers" tab to fetch your API key - add this to your Cerebrium Secrets. 
+ +![Daily API Key](/images/examples/voice_agent/daily-api-key.png) + +Below we create a room that only lasts 5 minutes and a temporary token to access it + +```python +def create_room(): + url = "https://api.daily.co/v1/rooms/" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {get_secret('DAILY_TOKEN')}" + } + data = { + "properties": { + "exp": int(time.time()) + 60*5 ##5 mins + } + } + + response = requests.post(url, headers=headers, json=data) + if response.status_code == 200: + room_info = response.json() + token = create_token(room_info['name']) + if token and 'token' in token: + room_info['token'] = token['token'] + else: + logger.error("Failed to create token") + return {"message": 'There was an error creating your room', "status_code": 500} + return room_info + else: + logger.error(f"Failed to create room: {response.status_code}") + return {"message": 'There was an error creating your room', "status_code": 500} + +def create_token(room_name: str): + url = "https://api.daily.co/v1/meeting-tokens" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {get_secret('DAILY_TOKEN')}" + } + data = { + "properties": { + "room_name": room_name + } + } + + response = requests.post(url, headers=headers, json=data) + if response.status_code == 200: + token_info = response.json() + return token_info + else: + logger.error(f"Failed to create token: {response.status_code}") + return None +``` + +### Deploy to Cerebrium + +To deploy this application to Cerebrium you can simply run the command: cerebrium deploy in your terminal. + +If it deployed successfully, you should see something like this: + +![Cerebrium Deployment](/images/examples/voice_agent/deployment.png) + +We will add these endpoints to our frontend interface. + +### Connect frontend + +We created a public fork of the [PipeCat frontend](https://github.com/pipecat-ai/web-client-ui) to show you a nice demo of this application. 
You can clone the repo [here](https://github.com/CerebriumAI/web-client-ui). + +Follow the instructions in the README.md and then populate the following variables in your .env.development.local + +``` +VITE_SERVER_URL=https://api.cortex.cerebrium.ai/v4/p-xxxxx/ #This is the base url. Do not include the function names +VITE_SERVER_AUTH= #This is the JWT token you can get from the API Keys section of your Cerebrium Dashboard. +``` + +You can now run yarn dev and go to the URL: http://localhost:5173/ to test your application! + +### Conclusion + +This tutorial acts as a good starting point for you to implement voice AI agents into your application as well as extend it into image and vision capabilities. Pipecat is an extensible and open-source framework that makes it easy to build applications using generative AI and Cerebrium makes the process seamless to deploy and auto scale while only paying for the compute you need. + +Tag us as **@cerebriumai** so we can see what you build and please feel free to ask questions/send feedback to us on [Slack](https://join.slack.com/t/cerebriumworkspace/shared_invite/zt-1qojg3eac-q4xyu5O~MeniNIg2jNeadg) or [Discord](https://discord.gg/ATj6USmeE2) communities diff --git a/examples/sdxl.mdx b/v4/examples/sdxl.mdx similarity index 72% rename from examples/sdxl.mdx rename to v4/examples/sdxl.mdx index 72aca31c..6fec9afe 100644 --- a/examples/sdxl.mdx +++ b/v4/examples/sdxl.mdx @@ -3,6 +3,12 @@ title: "Generate Images using SDXL" description: "Generate high quality images using SDXL with refiner" --- + + This example is only compatible with CLI v1.20 and later. Should you be making + use of an older version of the CLI, please run `pip install --upgrade + cerebrium` to upgrade it to the latest version. + + This is a simple tutorial on how to generate a high quality image using the SDXL refiner model located on [Huggingface](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) from Stability AI. 
To see the final implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/10-sdxl-refiner) @@ -20,18 +26,45 @@ cerebrium init sdxl-refiner It is important to think of the way you develop models using Cerebrium should be identical to developing on a virtual machine or Google Colab - so converting this should be very easy! -Let us add the following packages to the **[cerebrium.dependencies.pip]** section of our `cerebrium.toml` file: +To start, your cerebrium.toml file is where you can set your compute/environment. You cerebrium.toml file should look like: ```toml + +[cerebrium.deployment] +name = "sdxl" +python_version = "3.10" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./.*", "./__*"] +docker_base_image_url = "debian:bookworm-slim" + +[cerebrium.hardware] +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" +cpu = 2 +memory = 16.0 +gpu_count = 1 + +[cerebrium.scaling] +min_replicas = 0 +max_replicas = 5 +cooldown = 60 + [cerebrium.dependencies.pip] -invisible_watermark = "latest" -transformers = ">=4.35.0" accelerate = "latest" +transformers = ">=4.35.0" safetensors = "latest" +opencv-python = "latest" diffusers = "latest" + +[cerebrium.dependencies.conda] + +[cerebrium.dependencies.apt] +ffmpeg = "latest" + ``` -To start, we need to create a **main.py** file which will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. +We now need to create a **main.py** file which will contain our main Python code. This is a relatively simple implementation, so we can do everything in 1 file. We would like a user to send in a link to a YouTube video with a question and return to them the answer as well as the time segment of where we got that response. So let us define our request object. 
```python @@ -46,16 +79,16 @@ import base64 class Item(BaseModel): prompt: str url: str - negative_prompt: Optional[str] = None - conditioning_scale: Optional[float] = 0.5 - height: Optional[int] = 512 - width: Optional[int] = 512 - num_inference_steps: Optional[int] = 20 - guidance_scale: Optional[float] = 7.5 - num_images_per_prompt: Optional[int] = 1 + negative_prompt: Optional[str] + conditioning_scale: float + height: int + width: int + num_inference_steps: int + guidance_scale: float + num_images_per_prompt: int ``` -Above, we import all the various Python libraries we require as well as use Pydantic as our data validation library. Due to the way that we have defined the Base Model, "prompt" and "url" are required parameters and so if they are not present in the request, the user will automatically receive an error message. Everything else is optional. +Above, we import all the various Python libraries we require as well as use Pydantic as our data validation library. Due to the way that we have defined the Base Model, "prompt" and "URL" are required parameters and so if they are not present in the request, the user will automatically receive an error message. Everything else is optional. ## Instantiate model @@ -74,8 +107,19 @@ pipe = pipe.to("cuda") Below we simply get the parameters from our request and pass it to the SDXL model to generate the image(s). You will notice we convert the images to base64, this is so we can return it directly instead of writing the files to an S3 bucket - the return of the predict function needs to be JSON serializable. 
```python -def predict(item, run_id, logger): - item = Item(**item) +def predict(prompt, url, negative_prompt=None, conditioning_scale=0.5, height=512, width=512, num_inference_steps=20, + guidance_scale=7.5, num_images_per_prompt=1): + item = Item( + prompt=prompt, + url=url, + negative_prompt=negative_prompt, + conditioning_scale=conditioning_scale, + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt + ) init_image = load_image(item.url).convert("RGB") images = pipe( @@ -102,47 +146,6 @@ def predict(item, run_id, logger): ## Deploy -Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the GPU you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You cerebrium.toml file should look like: - -```toml - -[cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false -disable_animation = false -log_level = "INFO" -disable_confirmation = false - -[cerebrium.deployment] -name = "sdxl" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" - -[cerebrium.hardware] -gpu = "AMPERE_A5000" -cpu = 2 -memory = 16.0 -gpu_count = 1 - -[cerebrium.scaling] -min_replicas = 0 -cooldown = 60 - -[cerebrium.dependencies.apt] -ffmpeg = "latest" - -[cerebrium.dependencies.pip] -accelerate = "latest" -transformers = ">=4.35.0" -safetensors = "latest" -opencv-python = "latest" -diffusers = "latest" - -[cerebrium.dependencies.conda] - -``` - To deploy the model use the following command: ```bash @@ -152,9 +155,9 @@ cerebrium deploy sdxl-refiner Once deployed, we can make the following request: ```curl -curl --location 'https://run.cerebrium.ai/v3/p-xxxxx/sdxl-refiner/predict' \ +curl --location 
'https://api.cortex.cerebrium.ai/v4/p-/sdxl-refiner/predict' \ --header 'Content-Type: application/json' \ ---header 'Authorization: ' \ +--header 'Authorization: Bearer ' \ --data '{ "url": "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png", "prompt": "a photo of an astronaut riding a horse on mars" diff --git a/examples/segment_anything.mdx b/v4/examples/segment_anything.mdx similarity index 96% rename from examples/segment_anything.mdx rename to v4/examples/segment_anything.mdx index ba7b47c1..13b49105 100644 --- a/examples/segment_anything.mdx +++ b/v4/examples/segment_anything.mdx @@ -93,7 +93,7 @@ class Item(BaseModel): Pydantic is a data validation library and BaseModel is where Cerebrium keeps some default parameters like "webhook_url" that allows a user to send in a webhook url. We will call it when the job has finished processing β€” this is useful for long-running tasks. Do not worry about that functionality for this tutorial. The reason -the user is sending in an image or a file url is giving the user a choice to send in a base64 encoded image or a publicly accessible file_url we can download. +the user is sending in an image or a file URL is giving the user a choice to send in a base64 encoded image or a publicly accessible file_url we can download. 
## Identifying and classifying objects @@ -165,8 +165,8 @@ def download_image(url): return Image.open(BytesIO(r.content)) -def predict(item, run_id, logger): - if (not item.image and not item.file_url): return "" +def predict(image, file_url, coordinates, run_id): + if (not image and not file_url): return "" if item.image: image = v2.cvtColor(Image.open(BytesIO(base64.b64decode(item.image))), cv2.COLOR_BGR2RGB) @@ -194,14 +194,14 @@ In the above code we do a few things: We can then deploy our model to an AMPERE_A5000 instance with the following line of code ```bash -cerebrium deploy segment-anything --gpu AMPERE_A5000 --api-key private-XXXXXXXXXXXXX +cerebrium deploy --name segment-anything --gpu AMPERE_A5000 ``` After a few minutes, your model should be deployed and an endpoint should be returned. Let us create a CURL request to see the response ``` -curl --location --request POST 'https://run.cerebrium.ai/v2/p-xxxxxx/segment-anything/predict' \ ---header 'Authorization: public-XXXXXXXXXXXX' \ +curl --location --request POST 'https://api.cortex.cerebrium.ai/v4/p-xxxxxx/segment-anything/predict' \ +--header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ "file_url": "https://cdn-www.thefashionspot.com/assets/uploads/gallery/the-off-duty-male-models-of-milan-mens-fashion-week/milano-m-moc-rf14-8533.jpg", diff --git a/examples/streaming-falcon-7B.mdx b/v4/examples/streaming-falcon-7B.mdx similarity index 71% rename from examples/streaming-falcon-7B.mdx rename to v4/examples/streaming-falcon-7B.mdx index 4d33cec2..d742dbf7 100644 --- a/examples/streaming-falcon-7B.mdx +++ b/v4/examples/streaming-falcon-7B.mdx @@ -1,8 +1,14 @@ --- -title: "Streaming Output - Falcon 7B" +title: "Streaming LLM Output" description: "Stream outputs live from Falcon 7B using SSE" --- + + This example is only compatible with CLI v1.20 and later. 
Should you be making + use of an older version of the CLI, please run `pip install --upgrade + cerebrium` to upgrade it to the latest version. + + In this tutorial, we will show you how to implement streaming to return results to your users as soon as possible with the use of SSE. To see the final implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/7-streaming-endpoint) @@ -29,6 +35,7 @@ transformers = "git+https://github.com/huggingface/transformers.git" accelerate = "git+https://github.com/huggingface/accelerate.git" bitsandbytes = "latest" sentencepiece = "latest" +pydantic = "latest" torch = "2.1.0" ``` @@ -40,11 +47,11 @@ from pydantic import BaseModel class Item(BaseModel): prompt: str - cutoff_len: Optional[int] = 256 - temperature: Optional[float] = 0.8 - top_p: Optional[float] = 0.75 - top_k: Optional[float] = 40 - max_new_tokens: Optional[int] = 250 + cutoff_len: int + temperature: float + top_p: float + top_k: float + max_new_tokens: int ``` Above, we use Pydantic as our data validation library. We specify the parameters that are required as well as the parameters that are not (ie: using the Optional keyword) as well as assign defaults to some values. Prompt is the only required parameter so if it is not present in the request, the user will automatically receive an error message. @@ -83,10 +90,17 @@ onto the GPU with every request but rather only on model startup. Below, we define our predict function, which will be responsible for our logic to stream results back from our endpoint. 
```python -def predict(item, run_id, logger): - item = Item(**item) +def stream(prompt, cutoff_len=256, temperature=0.8, top_p=0.75, top_k=40, max_new_tokens=250): + item = Item( + prompt=prompt, + cutoff_len=cutoff_len, + temperature=temperature, + top_p=top_p, + top_k=top_k, + max_new_tokens=max_new_tokens, + ) inputs = tokenizer( - item.prompt, return_tensors="pt", max_length=512, truncation=True, padding=True + item.prompt, return_tensors="pt", max_length=512, truncation=True, padding=True ) input_ids = inputs["input_ids"].to("cuda") @@ -97,18 +111,18 @@ def predict(item, run_id, logger): top_k=item.top_k, ) with torch.no_grad(): - generation_kwargs = { - "input_ids": input_ids, - "generation_config": generation_config, - "return_dict_in_generate": True, - "output_scores": True, - "pad_token_id": tokenizer.eos_token_id, - "max_new_tokens": item.max_new_tokens, - "streamer": streamer, - } - model.generate(**generation_kwargs) - for text in streamer: - yield text #vital for streaming + generation_kwargs = { + "input_ids": input_ids, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "pad_token_id": tokenizer.eos_token_id, + "max_new_tokens": item.max_new_tokens, + "streamer": streamer, + } + model.generate(**generation_kwargs) + for text in streamer: + yield text # vital for streaming ``` @@ -117,45 +131,52 @@ importantly, we use the **yield** keyword to return output from our model as its ## Deploy -Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the GPU you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You cerebrium.toml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. 
Your cerebrium.toml file should look like: ```toml - [cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false +predict_data = "{\"prompt\": \"Here is some example predict data for your config.yaml which will be used to test your predict function on build.\"}" +hide_public_endpoint = false disable_animation = false +disable_build_logs = false +disable_syntax_check = false +disable_predict = false log_level = "INFO" disable_confirmation = false [cerebrium.deployment] name = "streaming-falcon" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" +python_version = "3.11" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./example_exclude"] +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" [cerebrium.hardware] -gpu = "AMPERE_A5000" +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" cpu = 2 memory = 16.0 gpu_count = 1 [cerebrium.scaling] min_replicas = 0 +max_replicas = 5 cooldown = 60 -[cerebrium.dependencies.apt] - [cerebrium.dependencies.pip] peft = "git+https://github.com/huggingface/peft.git" transformers = "git+https://github.com/huggingface/transformers.git" accelerate = "git+https://github.com/huggingface/accelerate.git" bitsandbytes = "latest" sentencepiece = "latest" +pydantic = "latest" torch = "2.1.0" [cerebrium.dependencies.conda] +[cerebrium.dependencies.apt] + ``` To deploy the model use the following command: @@ -167,16 +188,16 @@ cerebrium deploy streaming-falcon Once deployed, we can make the following request: - Please note the end of the URL is set to **stream** instead of predict + Please note that our function is called **stream** and so this is what the final path + should be in your URL ```curl -curl --location --request POST 'https://run.cerebrium.ai/v3/p-xxxxxx/streaming-falcon/stream' \ ---header 'Authorization: public-XXXXXXXXXXXX' \ +curl --location 
--request POST 'https://api.cortex.cerebrium.ai/v4/p-/streaming-falcon/stream' \ +--header 'Authorization: Bearer ' \ --header 'Content-Type: application/json' \ --data-raw '{ - "url": "https://www.youtube.com/watch?v=UF8uR6Z6KLc&ab_channel=Stanford", - "question": "How old was Steve Jobs when started Apple?" + "prompt": "Tell me a story" }' ``` diff --git a/v4/examples/tensorRT.mdx b/v4/examples/tensorRT.mdx new file mode 100644 index 00000000..a5c162b3 --- /dev/null +++ b/v4/examples/tensorRT.mdx @@ -0,0 +1,332 @@ +--- +title: "Llama 3B on TensorRT-LLM" +description: "Achieve high throughput with the TensorRT-LLM framework" +--- + + + This example is only compatible with CLI v1.20 and later. Should you be making + use of an older version of the CLI, please run `pip install --upgrade + cerebrium` to upgrade it to the latest version. + + +In this tutorial blog post, we will guide you through the process of implementing the TensorRT-LLM framework to serve the Llama 3 8B model on the Cerebrium platform. TensorRT-LLM is a powerful framework that can be used to optimise machine learning models for inference. It can lead to significant improvements in performance, especially in terms of inference speed and throughput. + +We will achieve ~1700 output tokens per second on a single Nvidia A10 instance; however, you can go up to ~4500 output tokens per second on a single Nvidia A100 40GB instance or even ~19,000 tokens on an H100. For further improvements, you can use speculative sampling or FP8 quantisation to reduce latency and increase throughput. You can view the official benchmarks across different GPU types, model sizes and input/output token lengths [here](https://github.com/NVIDIA/TensorRT-LLM/blob/71d8d4d3dc655671f32535d6d2b60cab87f36e87/docs/source/performance.md). + +### Overview + +TensorRT-LLM is a specialised library within NVIDIA's TensorRT, a high-performance deep learning inference platform. It is designed to accelerate large language models (LLMs) using NVIDIA GPUs. 
It can significantly improve the performance of your machine learning models however it comes at the expense of a very complicated setup process. + +You are required to convert and build the model using very specific arguments that replicate your workloads as closely as possible. If you don’t configure these steps properly, you might witness subpar performance and subsequently it will become very complicated to deploy. We will cover these concepts in depth throughout the tutorial. + +### Cerebrium Setup + +If you don’t have a Cerebrium account, you can create one by signing up here and following the documentation [here](https://docs.cerebrium.ai/cerebrium/getting-started/installation) to get set up. + +In your IDE, run the following command to create our Cerebrium starter project: `cerebrium init llama-3b-tensorrt`. This creates two files: + +- main.py - Our entrypoint file where our code lives +- cerebrium.toml - A configuration file that contains all our build and environment settings + +TensorRT-LLM has a demo implementation of Llama on its GitHub repo which you can look at [here](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama). The first thing you will notice is that TensorRT-LLM requires Python 3.10. Subsequently, the code that converts the model weights to the TensorRT format requires a lot of memory and so we need to set this in our configuration file. Please change your cerebrium.toml file to reflect the below: + +``` +[cerebrium.deployment] +name = "llama-3b-tensorrt" +python_version = "3.10" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./example_exclude"] +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" + +[cerebrium.hardware] +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" +cpu = 3 +memory = 40.0 +gpu_count = 1 +``` + +The most important decision to make is to decide what GPU chip you would like to run on. 
Larger models, longer sequence lengths and bigger batches all require more GPU memory and so if throughput is your desired metric, we recommend using an A100/H100. In this example we went with an A10 which gives a good cost/performance trade-off. Also, there are no capacity shortages and so it's more stable for low-latency enterprise workloads. However, if this is a requirement for you - please reach out. + +Let us then install the required pip and apt requirements. You can add the following to your cerebrium.toml + +``` +[cerebrium.dependencies.pip] +transformers = "latest" +torch = ">=2.0.0" +pydantic = "latest" +huggingface-hub = "latest" +flax = "latest" +h5py = "latest" +sentencepiece = "latest" +easydict = "latest" +mpmath = "==1.3.0" + +[cerebrium.dependencies.apt] +software-properties-common = "latest" +gcc = "latest" +"g++" = "latest" +aria2 = "latest" +git = "latest" +git-lfs = "latest" +wget = "latest" +openmpi-bin = "latest" +libopenmpi-dev = "latest" +``` + +We want to install the tensorrt_llm package after the above installs and want to grab it from the Nvidia PyPI index url. To do this, we use shell commands which allow you to run command line arguments during the build process - this happens as the last step of the build process (ie: post pip, apt and conda installs). + +Add the following under [cerebrium.build] in your cerebrium.toml: + +``` +shell_commands = [ "pip install tensorrt_llm -U --pre --extra-index-url https://pypi.nvidia.com"] +``` + +We then need to write some initial code in our main.py that will: + +- Download Llama 3 8B from HuggingFace +- Convert the model checkpoints +- Build the TensorRT-LLM inference engine. + +At the moment, Cerebrium does not have a way to run code only during the build process (work in progress) however, one easy way for us to side step this is to check if the file output from the trtllm-build step already exists meaning it's been converted. 
+ +To start we need to go to HuggingFace and accept the model permissions for [Llama 3 8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) if we haven’t already. It takes about 30 minutes or less for them to accept your request. Since HuggingFace requires you to be authenticated to download the model weights, we need to authenticate ourselves in Cerebrium before downloading the model. + +In your Cerebrium dashboard, you can add your HuggingFace token as a secret by navigating to “Secrets” in the sidebar. For the sake of this tutorial I called mine “HF_AUTH_TOKEN”. We can now access these values in our code at runtime without exposing them in our code. + +You can then add the following code to your main.py to download the model: + +```python +from huggingface_hub import login, snapshot_download +from cerebrium import get_secret + +login(token=get_secret("HF_AUTH_TOKEN")) + +MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" +MODEL_DIR="/persistent-storage/model_input" +ENGINE_DIR="/persistent-storage/model_output" + +if not os.path.exists(ENGINE_DIR): + snapshot_download( + MODEL_ID, + local_dir=MODEL_DIR, + ignore_patterns=["*.pt", "*.bin"], # using safetensors + ) +``` + +In the above code, we log in to HuggingFace using our HF_AUTH_TOKEN and download the Llama 3 8B model. We check if the ENGINE_DIR exists as a way to prevent running this code on cold start but rather only running this if the final TensorRT-LLM engine files don’t exist. + +### Setup TensorRT-LLM + +Next, we need to convert the downloaded model. We can use the script that exists in the [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) repo. To download this script to your Cerebrium instance put the following code in your shell commands. 
+ +``` +["wget https://raw.githubusercontent.com/NVIDIA/TensorRT-LLM/71d8d4d3dc655671f32535d6d2b60cab87f36e87/examples/llama/convert_checkpoint.py -O ./convert_checkpoint.py"] +``` + +This downloads the specific script file to your instance. Shell commands works as an array of strings so you can just add it to the existing shell command already there. + +### Converting and Compiling TensorRT-LLM engine + +Below we convert the model into float16 format, the reason being that it results in marginally higher performance over float32. You can go even further and use the quantised model (FP8) which will give you the lowest latency. + +What allows TensorRT-LLM to achieve its high throughput is that it is compiled in advance to predefined settings which you set based on your expected workloads. This therefore makes concrete choices of the CUDA kernels to execute for each operation which are then optimized for specific types and shapes of tensors for the specific hardware it runs on. + +So we need to specify the maximum input and output lengths as well as the typical batch size. The closer these values are to production, the higher our throughput will be. There are many different options you can pass to the command trtllm-build to tune the engine for your specific workload, we selected just two plugins that accelerate two core components. You can read more about the plugin options [here](https://fetch.ai/blog/advancing-llm-optimization). + +We then need to run the convert_checkpoint script and then run the trtllm-build script to build the TensorTRT-LLM model. You can add the following code to your main.py: + +```python +import tensorrt_llm +import subprocess +import torch + +MAX_INPUT_LEN, MAX_OUTPUT_LEN = 256, 256 +MAX_BATCH_SIZE = ( + 128 +) + +if not os.path.exists(ENGINE_DIR): + #snapshot_download( + MODEL_ID, + ... 
+ ) + + # Build the TRT engine + convert_checkpoint = f""" + python convert_checkpoint.py --model_dir {MODEL_DIR} \\ + --output_dir ./model_ckpt \\ + --dtype float16 + """ + + SIZE_ARGS = f"--max_batch_size={MAX_BATCH_SIZE} --max_input_len={MAX_INPUT_LEN} --max_output_len={MAX_OUTPUT_LEN}" + build_engine = f""" + trtllm-build --checkpoint_dir ./model_ckpt --output_dir {ENGINE_DIR} \\ + --tp_size=1 --workers=1 \\ + --max_batch_size={MAX_BATCH_SIZE} --max_input_len={MAX_INPUT_LEN} --max_output_len={MAX_OUTPUT_LEN} \\ + --gemm_plugin=float16 --gpt_attention_plugin=float16 + """ + + print("Building engine...") + subprocess.run(convert_checkpoint, shell=True, check=True) + subprocess.run(build_engine, shell=True, check=True) + print("\\nEngine built successfully! You can find it at: ", ENGINE_DIR) +else: + print("Engine already exists at: ", ENGINE_DIR) +``` + +You will see we run these command line arguments as a subprocess. The reason I did it like this and not as shell commands is: + +- Currently Cerebrium doesn’t support Secrets in shell commands and I need the model to be downloaded before I can continue with the other model conversion steps. +- It seems much cleaner to reuse variables and use subprocesses than squash everything in the cerebrium.toml file. +### Model Instantiation +Now that our model is converted with our specifications, let us initialise the model and set it up based on our requirements. This code will run on every cold start and takes roughly ~10-15s to load the model into GPU memory. If the container is warm, it will run your predict function immediately which we talk about in the next section. + +Above your predict function, add the following code. 
+ +```python +from tensorrt_llm.runtime import ModelRunner +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) +# LLaMA models do not have a padding token, so we use the EOS token +tokenizer.add_special_tokens( + {"pad_token": tokenizer.eos_token} +) +# and then we add it from the left, to minimize impact on the output +tokenizer.padding_side = "left" +pad_id = tokenizer.pad_token_id +end_id = tokenizer.eos_token_id + +runner_kwargs = dict( + engine_dir=f"{ENGINE_DIR}", + lora_dir=None, + rank=tensorrt_llm.mpi_rank(), +) + +model = ModelRunner.from_dir(**runner_kwargs) +``` + +We need to pass the input that a user sends into our prompt template and cater for special tokens. Add the following function that will handle this for us: + +```python +def parse_input( + tokenizer, + input_text, + prompt_template=None, + add_special_tokens=True, + max_input_length=923 + ): + + # Apply prompt template if provided + if prompt_template is not None: + input_text = prompt_template.format(input_text=input_text) + + # Encode the text to input IDs + input_ids = tokenizer.encode( + input_text, + add_special_tokens=add_special_tokens, + truncation=True, + max_length=max_input_length, + ) + + # Convert to tensor + input_ids_tensor = torch.tensor([input_ids], dtype=torch.int32) # Add batch dimension + + return input_ids_tensor +``` + +Before we get to our predict function that runs at runtime we need to define our Pydantic object that will make sure user requests conform to this standard as well as have default values. 
+ +```python +from typing import Optional +from pydantic import BaseModel + +class Item(BaseModel): + prompt: str + temperature: Optional[float] = 0.95 + top_k: Optional[int] = 100 + top_p: Optional[float] = 1.0 + repetition_penalty: Optional[float] = 1.05 + num_tokens: Optional[int] = 250 + prompt_template: Optional[str] = "user\\n{input_text}\\nmodel\\n" +``` + +### Inference function + +Lastly, let us bring this all together with our predict function + +```python +def predict(prompt, temperature, top_k, top_p, repetition_penalty, num_tokens, prompt_template, run_id, logger): + item = Item( + prompt=prompt, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + num_tokens=num_tokens, + prompt_template=prompt_template + ) + + stop_words_list = None + bad_words_list = None + + batch_input_ids = parse_input( + tokenizer=tokenizer, + input_text=item.prompt, + prompt_template=item.prompt_template + ) + input_length = batch_input_ids[0].size(0) + + time_begin = time.time() + with torch.no_grad(): + outputs = model.generate( + batch_input_ids, + max_new_tokens=item.num_tokens, + max_attention_window_size=None, + sink_token_length=None, + end_id=end_id, + pad_id=pad_id, + temperature=item.temperature, + top_k=item.top_k, + top_p=item.top_p, + num_beams=1, + repetition_penalty=item.repetition_penalty, + stop_words_list=stop_words_list, + bad_words_list=bad_words_list, + output_sequence_lengths=True, + return_dict=True, + ) + torch.cuda.synchronize() + + time_total = time.time() - time_begin + + output_ids = outputs["output_ids"] + sequence_lengths = outputs["sequence_lengths"] + + # Decode the output + output_begin = input_length + output_end = sequence_lengths + output_text = tokenizer.decode(output_ids[0][0][output_begin:output_end].tolist()) + + return { + "response_txt": output_text, + "latency_s": time_total, + } +``` + +Now that our code is deployed, we can deploy the application with the command: cerebrium deploy. 
+ +On initial deploy, it will take about 15-20 minutes since besides installing all your packages and dependencies, it will download the model and convert it to the TensorRT-LLM format. Once completed, it should output a curl which you can copy and paste to test your inference endpoint. + +``` +curl --location 'https://api.cortex.cerebrium.ai/v4/p-xxxxxx/llama-3b-tensorrt/predict' \\ +--header 'Content-Type: application/json' \\ +--header 'Authorization: Bearer ' \\ +--data '{"prompt": "Tell me about yourself?"}' +``` + +TensorRT-LLM is one of the top performing inference frameworks on the market, especially if you know details about your expected future workloads. You should now have a low latency endpoint with high throughput that can auto-scale to tens of thousands of inferences all while only paying for the compute you use. + +To view the final version of the code, you can look [here](https://github.com/CerebriumAI/examples/tree/master/15-tensor-trt). diff --git a/examples/transcribe-whisper.mdx b/v4/examples/transcribe-whisper.mdx similarity index 71% rename from examples/transcribe-whisper.mdx rename to v4/examples/transcribe-whisper.mdx index 3469f6de..457ac954 100644 --- a/examples/transcribe-whisper.mdx +++ b/v4/examples/transcribe-whisper.mdx @@ -3,8 +3,8 @@ title: "Transcribe 1 hour podcast" description: "Using Distill Whisper to transcribe an audio file" --- -In this tutorial, we will transcribe an hour audio file using Distil Whisper - an optimised version of Whisper-large-v2 but 60% faster and within 1% of the error rate. We -will accept either a base64 encode string of the audio file or a url from which we can download the audio file from. +In this tutorial, we will transcribe an hour audio file using Distil Whisper - an optimized version of Whisper-large-v2 but 60% faster and within 1% of the error rate. We +will accept either a base64 encoded string of the audio file or a URL from which we can download the audio file. 
To see the final implementation, you can view it [here](https://github.com/CerebriumAI/examples/tree/master/11-whisper-transcription) @@ -27,10 +27,11 @@ Let us add the following packages to the **[cerebrium.dependencies.pip]** sectio [cerebrium.dependencies.pip] accelerate = "latest" transformers = ">=4.35.0" -openai-whisper +openai-whisper = "latest" +pydantic = "latest" ``` -To start let us create a util.py file for our utility functions - downloading a file from a url or converting a base64 string to a file. Our **util.py** would look something like below: +To start let us create a util.py file for our utility functions - downloading a file from a URL or converting a base64 string to a file. Our **util.py** would look something like below: ```python import base64 @@ -38,14 +39,12 @@ import uuid DOWNLOAD_ROOT = "/tmp/" # Change this to /persistent-storage/ if you want to save files to the persistent storage -def download_file_from_url(logger, url: str, filename: str): - logger.info("Downloading file...") - - import requests +def download_file_from_url(url: str, filename: str): + print("Downloading file...") response = requests.get(url) if response.status_code == 200: - logger.info("Download was successful") + print("Download was successful") with open(filename, "wb") as f: f.write(response.content) @@ -53,13 +52,13 @@ def download_file_from_url(logger, url: str, filename: str): return filename else: - logger.info(response) + print(response) raise Exception("Download failed") # Saves a base64 encoded file string to a local file -def save_base64_string_to_file(logger, audio: str): - logger.info("Converting file...") +def save_base64_string_to_file(audio: str): + print("Converting file...") decoded_data = base64.b64decode(audio) @@ -68,7 +67,7 @@ def save_base64_string_to_file(logger, audio: str): with open(filename, "wb") as file: file.write(decoded_data) - logger.info("Decoding base64 to file was successful") + print("Decoding base64 to file was successful") 
return filename ``` @@ -81,9 +80,9 @@ from typing import Optional from pydantic import BaseModel, HttpUrl class Item(BaseModel): - audio: Optional[str] = None - file_url: Optional[HttpUrl] = None - webhook_endpoint: Optional[HttpUrl] = None + audio: Optional[str] + file_url: Optional[HttpUrl] + webhook_endpoint: Optional[HttpUrl] ``` Above, we use Pydantic as our data validation library. Due to the way that we have defined the Base Model, "audio" and "file_url" are optional parameters but we must do a check to make sure we are given the one or the other. The webhook_endpoint parameter is something Cerebrium automatically includes in every request and can be used for long running requests. @@ -91,7 +90,7 @@ Currently, Cerebrium has a max timeout of 3 minutes for each inference request. ## Setup Model and inference -Below, we import the required packages and load in our Whisper model. This will download during your deployment however in subsequent deploys or inference requests it will be automatically cached in your persistent storage for subsequent use. You can read more about persistent storage [here]() +Below, we import the required packages and load in our Whisper model. This will be downloaded during your deployment, however, in subsequent deploys or inference requests it will be automatically cached in your persistent storage for subsequent use. You can read more about persistent storage [here]() We do this outside our **predict** function since we only want this code to run on a cold start (ie: on startup). If the container is already warm, we just want it to do inference and it will execute just the **predict** function. 
```python @@ -99,78 +98,78 @@ from huggingface_hub import hf_hub_download from whisper import load_model, transcribe from util import download_file_from_url, save_base64_string_to_file -distil_large_v2 = hf_hub_download(repo_id="distil-whisper/distil-large-v2", filename="original-model.bin") +distil_large_v2 = hf_hub_download(repo_id="distil-whisper/distil-large-v3", filename="original-model.bin") model = load_model(distil_large_v2) -def predict(item, run_id, logger): - item = Item(**item) +def predict(run_id, audio=None, file_url=None, webhook_endpoint=None): + item = Item(audio=audio, file_url=file_url, webhook_endpoint=webhook_endpoint) input_filename = f"{run_id}.mp3" - if item.audio is not None: - file = save_base64_string_to_file(logger, item.audio) - elif item.file_url is not None: - file = download_file_from_url(logger, item.file_url, input_filename) - logger.info("Transcribing file...") - - result = transcribe(model, audio=file) - - return result + if audio is None and file_url is None: + raise ValueError('Either audio or file_url must be provided') + else: + if item.audio is not None: + file = save_base64_string_to_file(item.audio) + elif item.file_url is not None: + file = download_file_from_url(item.file_url, input_filename) + print("Transcribing file...") + + result = transcribe(model, audio=file) + return result ``` In our predict function, which only runs on inference requests, we simply create a audio file from the download URL or string given to us via the request. We then transcribe the file and return the output to a user. ## Deploy -Your cerebrium.toml file is where you can set your compute/environment. Please make sure that the GPU you specify is a AMPERE_A5000 and that you have enough memory (RAM) on your instance to run the models. You cerebrium.toml file should look like: +Your cerebrium.toml file is where you can set your compute/environment. 
Your cerebrium.toml file should look like: ```toml - -[cerebrium.build] -predict_data = "{\"prompt\": \"Here is some example predict data for your cerebrium.toml which will be used to test your predict function on build.\"}" -force_rebuild = false -disable_animation = false -log_level = "INFO" -disable_confirmation = false - [cerebrium.deployment] -name = "controlnet-logo" -python_version = "3.10" -include = "[./*, main.py]" -exclude = "[./.*, ./__*]" +name = "distil-whisper" +python_version = "3.11" +include = ["./*", "main.py", "cerebrium.toml"] +exclude = ["./example_exclude"] +docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" [cerebrium.hardware] -gpu = "AMPERE_A5000" -cpu = 2 -memory = 10.0 +region = "us-east-1" +provider = "aws" +compute = "AMPERE_A10" +cpu = 3 +memory = 12.0 gpu_count = 1 [cerebrium.scaling] min_replicas = 0 +max_replicas = 5 cooldown = 60 -[cerebrium.dependencies.apt] - [cerebrium.dependencies.pip] accelerate = "latest" transformers = ">=4.35.0" -openai-whisper +openai-whisper = "latest" +pydantic = "latest" [cerebrium.dependencies.conda] +[cerebrium.dependencies.apt] +"ffmpeg" = "latest" + ``` -To deploy the model use the following command: +Deploy the app using the following command: ```bash cerebrium deploy distill-whisper ``` -Once deployed, we can make the following request: +Once deployed, make the following request: ```curl -curl --location 'https://run.cerebrium.ai/v3/p-xxxxx/distill-whisper/predict' \ +curl --location 'https://api.cortex.cerebrium.ai/v4/p-/distill-whisper/predict' \ --header 'Content-Type: application/json' \ ---header 'Authorization: ' \ +--header 'Authorization: Bearer ' \ --data '{"file_url": "https://your-public-url.com/test.mp3"}'' ```