diff --git a/.circleci/config.yml b/.circleci/config.yml
index 85c49ea90b95..e5bc82a59678 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,8 +1,8 @@
 version: 2.1
 orbs:
   codecov: codecov/codecov@4.0.1
-  node: circleci/node@5.1.0 # Add this line to declare the node orb
-  win: circleci/windows@5.0 # Add Windows orb
+  node: circleci/node@5.1.0 # Add this line to declare the node orb
+  win: circleci/windows@5.0 # Add Windows orb
 commands:
   setup_google_dns:
@@ -24,6 +24,40 @@ commands:
             cd enterprise
             python -m pip install -e .
             cd ..
+  setup_litellm_test_deps:
+    steps:
+      - checkout
+      - setup_google_dns
+      - restore_cache:
+          keys:
+            - v2-litellm-deps-{{ checksum "requirements.txt" }}-{{ checksum ".circleci/config.yml" }}
+            - v2-litellm-deps-
+      - run:
+          name: Install Dependencies
+          command: |
+            python -m pip install --upgrade pip
+            python -m pip install -r requirements.txt
+            pip install "pytest-mock==3.12.0"
+            pip install "pytest==7.3.1"
+            pip install "pytest-retry==1.6.3"
+            pip install "pytest-cov==5.0.0"
+            pip install "pytest-asyncio==0.21.1"
+            pip install "respx==0.22.0"
+            pip install "hypercorn==0.17.3"
+            pip install "pydantic==2.11.0"
+            pip install "mcp==1.25.0"
+            pip install "requests-mock>=1.12.1"
+            pip install "responses==0.25.7"
+            pip install "pytest-xdist==3.6.1"
+            pip install "pytest-timeout==2.2.0"
+            pip install "semantic_router==0.1.10"
+            pip install "fastapi-offline==1.7.3"
+            pip install "a2a"
+      - setup_litellm_enterprise_pip
+      - save_cache:
+          paths:
+            - ~/.cache/pip
+          key: v2-litellm-deps-{{ checksum "requirements.txt" }}-{{ checksum ".circleci/config.yml" }}
 jobs:
   # Add Windows testing job
@@ -50,7 +84,7 @@ jobs:
           name: Run Windows-specific test
           command: |
             python -m pytest tests/windows_tests/test_litellm_on_windows.py -v
-
+
   mypy_linting:
     docker:
       - image: cimg/python:3.12
         auth:
           username: ${DOCKERHUB_USERNAME}
           password: ${DOCKERHUB_PASSWORD}
     working_directory: ~/project
@@ -78,14 +112,142 @@ jobs:
             python -m mypy .
             cd ..
no_output_timeout: 10m - local_testing: + local_testing_part1: docker: - image: cimg/python:3.12 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project + parallelism: 4 + steps: + - checkout + - setup_google_dns + - run: + name: Show git commit hash + command: | + echo "Git commit hash: $CIRCLE_SHA1" + + - restore_cache: + keys: + - v1-dependencies-{{ checksum ".circleci/requirements.txt" }} + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r .circleci/requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-asyncio==0.21.1" + pip install "pytest-cov==5.0.0" + pip install "mypy==1.18.2" + pip install "google-generativeai==0.3.2" + pip install "google-cloud-aiplatform==1.43.0" + pip install pyarrow + pip install "boto3==1.36.0" + pip install "aioboto3==13.4.0" + pip install langchain + pip install lunary==0.2.5 + pip install "azure-identity==1.16.1" + pip install "langfuse==2.59.7" + pip install "logfire==0.29.0" + pip install numpydoc + pip install traceloop-sdk==0.21.1 + pip install opentelemetry-api==1.25.0 + pip install opentelemetry-sdk==1.25.0 + pip install opentelemetry-exporter-otlp==1.25.0 + pip install openai==1.100.1 + pip install prisma==0.11.0 + pip install "detect_secrets==1.5.0" + pip install "httpx==0.24.1" + pip install "respx==0.22.0" + pip install fastapi + pip install "gunicorn==21.2.0" + pip install "anyio==4.2.0" + pip install "aiodynamo==23.10.1" + pip install "asyncio==3.4.3" + pip install "apscheduler==3.10.4" + pip install "PyGithub==1.59.1" + pip install argon2-cffi + pip install "pytest-mock==3.12.0" + pip install python-multipart + pip install google-cloud-aiplatform + pip install prometheus-client==0.20.0 + pip install "pydantic==2.10.2" + pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" + pip install "jsonschema==4.22.0" + pip install "pytest-xdist==3.6.1" + pip install "pytest-timeout==2.2.0" + pip install "websockets==13.1.0" + pip install semantic_router --no-deps + pip install aurelio_sdk --no-deps + pip uninstall posthog -y + - setup_litellm_enterprise_pip + - save_cache: + paths: + - ./venv + key: v1-dependencies-{{ checksum ".circleci/requirements.txt" }} + - run: + name: Run prisma ./docker/entrypoint.sh + command: | + set +e + chmod +x docker/entrypoint.sh + ./docker/entrypoint.sh + set -e + - run: + name: Black Formatting + command: | + cd litellm + python -m pip install black + python -m black . + cd .. + + # Run pytest and generate JUnit XML report + - run: + name: Run tests (Part 1 - A-M) + command: | + mkdir test-results + + # Discover test files (A-M) + TEST_FILES=$(circleci tests glob "tests/local_testing/**/test_[a-mA-M]*.py") + echo "$TEST_FILES" | circleci tests run \ + --split-by=timings \ + --verbose \ + --command="xargs python -m pytest \ + -vv \ + --cov=litellm \ + --cov-report=xml \ + --junitxml=test-results/junit.xml \ + --durations=20 \ + -k \"not test_python_38.py and not test_basic_python_version.py and not router and not assistants and not langfuse and not caching and not cache\" \ + -n 4 \ + --timeout=300 \ + --timeout_method=thread" + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml local_testing_part1_coverage.xml + mv .coverage local_testing_part1_coverage + + # Store test results + - store_test_results: + path: test-results + - persist_to_workspace: + root: . 
+ paths: + - local_testing_part1_coverage.xml + - local_testing_part1_coverage + local_testing_part2: + docker: + - image: cimg/python:3.12 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + parallelism: 4 steps: - checkout - setup_google_dns @@ -144,6 +306,7 @@ jobs: pip install "Pillow==10.3.0" pip install "jsonschema==4.22.0" pip install "pytest-xdist==3.6.1" + pip install "pytest-timeout==2.2.0" pip install "websockets==13.1.0" pip install semantic_router --no-deps pip install aurelio_sdk --no-deps @@ -170,17 +333,32 @@ jobs: # Run pytest and generate JUnit XML report - run: - name: Run tests + name: Run tests (Part 2 - N-Z) command: | - pwd - ls - python -m pytest -vv tests/local_testing --cov=litellm --cov-report=xml --junitxml=test-results/junit.xml --durations=5 -k "not test_python_38.py and not test_basic_python_version.py and not router and not assistants and not langfuse and not caching and not cache" -n 4 + mkdir test-results + + # Discover test files (N-Z) + TEST_FILES=$(circleci tests glob "tests/local_testing/**/test_[n-zN-Z]*.py") + + echo "$TEST_FILES" | circleci tests run \ + --split-by=timings \ + --verbose \ + --command="xargs python -m pytest \ + -vv \ + --cov=litellm \ + --cov-report=xml \ + --junitxml=test-results/junit.xml \ + --durations=20 \ + -k \"not test_python_38.py and not test_basic_python_version.py and not router and not assistants and not langfuse and not caching and not cache\" \ + -n 4 \ + --timeout=300 \ + --timeout_method=thread" no_output_timeout: 120m - run: name: Rename the coverage files command: | - mv coverage.xml local_testing_coverage.xml - mv .coverage local_testing_coverage + mv coverage.xml local_testing_part2_coverage.xml + mv .coverage local_testing_part2_coverage # Store test results - store_test_results: @@ -188,8 +366,8 @@ jobs: - persist_to_workspace: root: . paths: - - local_testing_coverage.xml - - local_testing_coverage + - local_testing_part2_coverage.xml + - local_testing_part2_coverage langfuse_logging_unit_tests: docker: - image: cimg/python:3.11 @@ -461,7 +639,6 @@ jobs: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project - steps: - checkout - setup_google_dns @@ -475,6 +652,7 @@ jobs: pip install "pytest-cov==5.0.0" pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" + pip install "pytest-xdist==3.6.1" pip install semantic_router --no-deps pip install aurelio_sdk --no-deps # Run pytest and generate JUnit XML report @@ -500,7 +678,7 @@ jobs: paths: - litellm_router_coverage.xml - litellm_router_coverage - + litellm_router_unit_testing: # Runs all tests with the "router" keyword docker: - image: cimg/python:3.11 @@ -532,13 +710,13 @@ jobs: command: | pwd ls - python -m pytest -vv tests/router_unit_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/router_unit_tests --cov=litellm --cov-report=xml -x -s --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - run: name: Rename the coverage files command: | - mv coverage.xml litellm_router_coverage.xml - mv .coverage litellm_router_coverage + mv coverage.xml litellm_router_unit_coverage.xml + mv .coverage litellm_router_unit_coverage # Store test results - store_test_results: path: test-results @@ -546,8 +724,8 @@ jobs: - persist_to_workspace: root: . 
paths: - - litellm_router_coverage.xml - - litellm_router_coverage + - litellm_router_unit_coverage.xml + - litellm_router_unit_coverage litellm_security_tests: machine: image: ubuntu-2204:2023.10.1 @@ -563,8 +741,9 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: name: Install Python 3.13 command: | @@ -579,6 +758,12 @@ jobs: - run: name: Install Dependencies command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv + python --version + which python + pip install --upgrade typing-extensions>=4.12.0 pip install "pytest==7.3.1" pip install "pytest-asyncio==0.21.1" pip install aiohttp @@ -642,6 +827,9 @@ jobs: - run: name: Run prisma ./docker/entrypoint.sh command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv set +e chmod +x docker/entrypoint.sh ./docker/entrypoint.sh @@ -650,6 +838,9 @@ jobs: - run: name: Run tests command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv pwd ls python -m pytest tests/proxy_security_tests --cov=litellm --cov-report=xml -vv -x -v --junitxml=test-results/junit.xml --durations=5 @@ -667,13 +858,16 @@ jobs: paths: - litellm_security_tests_coverage.xml - litellm_security_tests_coverage - litellm_proxy_unit_testing: # Runs all tests with the "proxy", "key", "jwt" filenames + # Split proxy unit tests into 3 jobs for faster execution and better debugging + # test_key_generate_prisma runs separately without parallel execution to avoid event loop issues with logging worker + litellm_proxy_unit_testing_key_generation: docker: - image: cimg/python:3.11 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project + resource_class: large steps: - checkout - setup_google_dns @@ -698,6 +892,8 @@ jobs: pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" pip install "pytest-cov==5.0.0" + pip install "pytest-timeout==2.2.0" + pip install "pytest-forked==1.6.0" pip install "mypy==1.18.2" pip install "google-generativeai==0.3.2" pip install "google-cloud-aiplatform==1.43.0" @@ -738,7 +934,6 @@ jobs: pip install "jsonschema==4.22.0" pip install "pytest-postgresql==7.0.1" pip install "fakeredis==2.28.1" - pip install "pytest-xdist==3.6.1" - setup_litellm_enterprise_pip - save_cache: paths: @@ -751,102 +946,316 @@ jobs: chmod +x docker/entrypoint.sh ./docker/entrypoint.sh set -e - # Run pytest and generate JUnit XML report - run: - name: Run tests + name: Run key generation tests (no parallel execution to avoid event loop issues) command: | pwd ls - python -m pytest tests/proxy_unit_tests --cov=litellm --cov-report=xml -vv -x -v --junitxml=test-results/junit.xml --durations=5 -n 4 + # Run without -n flag to avoid pytest-xdist event loop conflicts with logging worker + python -m pytest tests/proxy_unit_tests/test_key_generate_prisma.py --cov=litellm --cov-report=xml --junitxml=test-results/junit-key-generation.xml --durations=10 --timeout=300 -vv --log-cli-level=INFO no_output_timeout: 120m - run: name: Rename the coverage files command: | - mv coverage.xml litellm_proxy_unit_tests_coverage.xml - mv .coverage litellm_proxy_unit_tests_coverage - # Store test results + mv coverage.xml 
litellm_proxy_unit_tests_key_generation_coverage.xml + mv .coverage litellm_proxy_unit_tests_key_generation_coverage - store_test_results: path: test-results - - persist_to_workspace: root: . paths: - - litellm_proxy_unit_tests_coverage.xml - - litellm_proxy_unit_tests_coverage - litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword + - litellm_proxy_unit_tests_key_generation_coverage.xml + - litellm_proxy_unit_tests_key_generation_coverage + litellm_proxy_unit_testing_part1: docker: - - image: cimg/python:3.13.1 + - image: cimg/python:3.11 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project - + resource_class: large steps: - checkout - setup_google_dns + - run: + name: Show git commit hash + command: | + echo "Git commit hash: $CIRCLE_SHA1" + - run: + name: Install PostgreSQL + command: | + sudo apt-get update + sudo apt-get install -y postgresql-14 postgresql-contrib-14 + - restore_cache: + keys: + - v1-dependencies-{{ checksum ".circleci/requirements.txt" }} - run: name: Install Dependencies command: | python -m pip install --upgrade pip - pip install wheel - pip install --upgrade pip wheel setuptools - python -m pip install -r requirements.txt + python -m pip install -r .circleci/requirements.txt pip install "pytest==7.3.1" - pip install "respx==0.22.0" pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" pip install "pytest-cov==5.0.0" - # Run pytest and generate JUnit XML report + pip install "pytest-timeout==2.2.0" + pip install "pytest-forked==1.6.0" + pip install "mypy==1.18.2" + pip install "google-generativeai==0.3.2" + pip install "google-cloud-aiplatform==1.43.0" + pip install "google-genai==1.22.0" + pip install pyarrow + pip install "boto3==1.36.0" + pip install "aioboto3==13.4.0" + pip install langchain + pip install lunary==0.2.5 + pip install "azure-identity==1.16.1" + pip install "langfuse==2.59.7" + pip install "logfire==0.29.0" + pip install numpydoc + pip install traceloop-sdk==0.21.1 + pip install opentelemetry-api==1.25.0 + pip install opentelemetry-sdk==1.25.0 + pip install opentelemetry-exporter-otlp==1.25.0 + pip install openai==1.100.1 + pip install prisma==0.11.0 + pip install "detect_secrets==1.5.0" + pip install "httpx==0.24.1" + pip install "respx==0.22.0" + pip install fastapi + pip install "gunicorn==21.2.0" + pip install "anyio==4.2.0" + pip install "aiodynamo==23.10.1" + pip install "asyncio==3.4.3" + pip install "apscheduler==3.10.4" + pip install "PyGithub==1.59.1" + pip install argon2-cffi + pip install "pytest-mock==3.12.0" + pip install python-multipart + pip install google-cloud-aiplatform + pip install prometheus-client==0.20.0 + pip install "pydantic==2.10.2" + pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" + pip install "jsonschema==4.22.0" + pip install "pytest-postgresql==7.0.1" + pip install "fakeredis==2.28.1" + pip install "pytest-xdist==3.6.1" - setup_litellm_enterprise_pip + - save_cache: + paths: + - ./venv + key: v1-dependencies-{{ checksum ".circleci/requirements.txt" }} - run: - name: Run tests + name: Run prisma ./docker/entrypoint.sh + command: | + set +e + chmod +x docker/entrypoint.sh + ./docker/entrypoint.sh + set -e + - run: + name: Run proxy unit tests (part 1 - auth checks only, key generation in separate job) command: | pwd ls - python -m pytest tests/local_testing/ -vv -k "assistants" --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + # Run auth tests with parallel execution 
(test_key_generate_prisma moved to separate job to avoid event loop issues) + python -m pytest tests/proxy_unit_tests/test_auth_checks.py tests/proxy_unit_tests/test_user_api_key_auth.py --cov=litellm --cov-report=xml --junitxml=test-results/junit-part1.xml --durations=10 -n 8 --timeout=300 -vv --log-cli-level=INFO no_output_timeout: 120m - run: name: Rename the coverage files command: | - mv coverage.xml litellm_assistants_api_coverage.xml - mv .coverage litellm_assistants_api_coverage - # Store test results + mv coverage.xml litellm_proxy_unit_tests_part1_coverage.xml + mv .coverage litellm_proxy_unit_tests_part1_coverage - store_test_results: path: test-results - persist_to_workspace: root: . paths: - - litellm_assistants_api_coverage.xml - - litellm_assistants_api_coverage - llm_translation_testing: + - litellm_proxy_unit_tests_part1_coverage.xml + - litellm_proxy_unit_tests_part1_coverage + litellm_proxy_unit_testing_part2: docker: - image: cimg/python:3.11 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project - + resource_class: large steps: - checkout - setup_google_dns + - run: + name: Show git commit hash + command: | + echo "Git commit hash: $CIRCLE_SHA1" + - run: + name: Install PostgreSQL + command: | + sudo apt-get update + sudo apt-get install -y postgresql-14 postgresql-contrib-14 + - restore_cache: + keys: + - v1-dependencies-{{ checksum ".circleci/requirements.txt" }} - run: name: Install Dependencies command: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt + python -m pip install -r .circleci/requirements.txt pip install "pytest==7.3.1" pip install "pytest-retry==1.6.3" - pip install "pytest-cov==5.0.0" pip install "pytest-asyncio==0.21.1" - pip install "respx==0.22.0" - pip install "pytest-xdist==3.6.1" - # Run pytest and generate JUnit XML report - - run: + pip install "pytest-cov==5.0.0" + pip install "pytest-timeout==2.2.0" + pip install "pytest-forked==1.6.0" + pip install "mypy==1.18.2" + pip install "google-generativeai==0.3.2" + pip install "google-cloud-aiplatform==1.43.0" + pip install "google-genai==1.22.0" + pip install pyarrow + pip install "boto3==1.36.0" + pip install "aioboto3==13.4.0" + pip install langchain + pip install lunary==0.2.5 + pip install "azure-identity==1.16.1" + pip install "langfuse==2.59.7" + pip install "logfire==0.29.0" + pip install numpydoc + pip install traceloop-sdk==0.21.1 + pip install opentelemetry-api==1.25.0 + pip install opentelemetry-sdk==1.25.0 + pip install opentelemetry-exporter-otlp==1.25.0 + pip install openai==1.100.1 + pip install prisma==0.11.0 + pip install "detect_secrets==1.5.0" + pip install "httpx==0.24.1" + pip install "respx==0.22.0" + pip install fastapi + pip install "gunicorn==21.2.0" + pip install "anyio==4.2.0" + pip install "aiodynamo==23.10.1" + pip install "asyncio==3.4.3" + pip install "apscheduler==3.10.4" + pip install "PyGithub==1.59.1" + pip install argon2-cffi + pip install "pytest-mock==3.12.0" + pip install python-multipart + pip install google-cloud-aiplatform + pip install prometheus-client==0.20.0 + pip install "pydantic==2.10.2" + pip install "diskcache==5.6.1" + pip install "Pillow==10.3.0" + pip install "jsonschema==4.22.0" + pip install "pytest-postgresql==7.0.1" + pip install "fakeredis==2.28.1" + pip install "pytest-xdist==3.6.1" + - setup_litellm_enterprise_pip + - save_cache: + paths: + - ./venv + key: v1-dependencies-{{ checksum ".circleci/requirements.txt" }} + - run: + name: Run prisma 
./docker/entrypoint.sh + command: | + set +e + chmod +x docker/entrypoint.sh + ./docker/entrypoint.sh + set -e + - run: + name: Run proxy unit tests (part 2 - remaining tests) + command: | + pwd + ls + python -m pytest tests/proxy_unit_tests --ignore=tests/proxy_unit_tests/test_key_generate_prisma.py --ignore=tests/proxy_unit_tests/test_auth_checks.py --ignore=tests/proxy_unit_tests/test_user_api_key_auth.py --cov=litellm --cov-report=xml --junitxml=test-results/junit-part2.xml --durations=10 -n 8 --timeout=300 -vv --log-cli-level=INFO + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml litellm_proxy_unit_tests_part2_coverage.xml + mv .coverage litellm_proxy_unit_tests_part2_coverage + - store_test_results: + path: test-results + - persist_to_workspace: + root: . + paths: + - litellm_proxy_unit_tests_part2_coverage.xml + - litellm_proxy_unit_tests_part2_coverage + litellm_assistants_api_testing: # Runs all tests with the "assistants" keyword + docker: + - image: cimg/python:3.13.1 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + + steps: + - checkout + - setup_google_dns + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + pip install wheel + pip install --upgrade pip wheel setuptools + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "respx==0.22.0" + pip install "pytest-retry==1.6.3" + pip install "pytest-asyncio==0.21.1" + pip install "pytest-cov==5.0.0" + # Run pytest and generate JUnit XML report + - setup_litellm_enterprise_pip + - run: + name: Run tests + command: | + pwd + ls + python -m pytest tests/local_testing/ -vv -k "assistants" --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml litellm_assistants_api_coverage.xml + mv .coverage litellm_assistants_api_coverage + # Store test results + - store_test_results: + path: test-results + - persist_to_workspace: + root: . 
+ paths: + - litellm_assistants_api_coverage.xml + - litellm_assistants_api_coverage + llm_translation_testing: + docker: + - image: cimg/python:3.11 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + + steps: + - checkout + - setup_google_dns + - run: + name: Install Dependencies + command: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + pip install "pytest==7.3.1" + pip install "pytest-retry==1.6.3" + pip install "pytest-cov==5.0.0" + pip install "pytest-asyncio==0.21.1" + pip install "respx==0.22.0" + pip install "pytest-xdist==3.6.1" + pip install "pytest-timeout==2.2.0" + # Run pytest and generate JUnit XML report + - run: name: Run tests command: | pwd ls - python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -v --junitxml=test-results/junit.xml --durations=5 -n 4 + # Add --timeout to kill hanging tests after 120s (2 min) + # Add --durations=20 to show 20 slowest tests for debugging + python -m pytest -vv tests/llm_translation --cov=litellm --cov-report=xml -v --junitxml=test-results/junit.xml --durations=20 -n 4 --timeout=120 --timeout_method=thread no_output_timeout: 120m - run: name: Rename the coverage files @@ -883,8 +1292,8 @@ jobs: pip install "pytest-cov==5.0.0" pip install "pytest-asyncio==0.21.1" pip install "respx==0.22.0" - pip install "pydantic==2.10.2" - pip install "mcp==1.10.1" + pip install "pydantic==2.11.0" + pip install "mcp==1.25.0" # Run pytest and generate JUnit XML report - run: name: Run tests @@ -1127,59 +1536,143 @@ jobs: paths: - search_coverage.xml - search_coverage - litellm_mapped_tests: + # Split litellm_mapped_tests into 3 parallel jobs for 3x faster execution + litellm_mapped_tests_proxy: docker: - image: cimg/python:3.11 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} working_directory: ~/project - + resource_class: xlarge steps: - - checkout - - setup_google_dns + - setup_litellm_test_deps - run: - name: Install Dependencies + name: Run proxy tests command: | - python -m pip install --upgrade pip - python -m pip install -r requirements.txt - pip install "pytest-mock==3.12.0" - pip install "pytest==7.3.1" - pip install "pytest-retry==1.6.3" - pip install "pytest-cov==5.0.0" - pip install "pytest-asyncio==0.21.1" - pip install "respx==0.22.0" - pip install "hypercorn==0.17.3" - pip install "pydantic==2.10.2" - pip install "mcp==1.10.1" - pip install "requests-mock>=1.12.1" - pip install "responses==0.25.7" - pip install "pytest-xdist==3.6.1" - pip install "semantic_router==0.1.10" - pip install "fastapi-offline==1.7.3" - - setup_litellm_enterprise_pip - # Run pytest and generate JUnit XML report + prisma generate + python -m pytest tests/test_litellm/proxy --cov=litellm --cov-report=xml --junitxml=test-results/junit-proxy.xml --durations=10 -n 16 --maxfail=5 --timeout=300 -vv --log-cli-level=WARNING + no_output_timeout: 120m - run: - name: Run litellm tests + name: Rename the coverage files command: | - pwd - ls - python -m pytest -vv tests/test_litellm --cov=litellm --cov-report=xml -s -v --junitxml=test-results/junit-litellm.xml --durations=10 -n 8 + mv coverage.xml litellm_proxy_tests_coverage.xml + mv .coverage litellm_proxy_tests_coverage + - store_test_results: + path: test-results + - persist_to_workspace: + root: . 
+          paths:
+            - litellm_proxy_tests_coverage.xml
+            - litellm_proxy_tests_coverage
+  litellm_mapped_tests_llms:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    resource_class: xlarge
+    steps:
+      - setup_litellm_test_deps
+      - run:
+          name: Run LLM provider tests
+          command: |
+            python -m pytest tests/test_litellm/llms --cov=litellm --cov-report=xml --junitxml=test-results/junit-llms.xml --durations=10 -n 16 --maxfail=5 --timeout=300 -vv --log-cli-level=WARNING
           no_output_timeout: 120m
       - run:
           name: Rename the coverage files
           command: |
-            mv coverage.xml litellm_mapped_tests_coverage.xml
-            mv .coverage litellm_mapped_tests_coverage
-
-      # Store test results
+            mv coverage.xml litellm_llms_tests_coverage.xml
+            mv .coverage litellm_llms_tests_coverage
       - store_test_results:
           path: test-results
       - persist_to_workspace:
           root: .
           paths:
-            - litellm_mapped_tests_coverage.xml
-            - litellm_mapped_tests_coverage
+            - litellm_llms_tests_coverage.xml
+            - litellm_llms_tests_coverage
+  litellm_mapped_tests_core:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    resource_class: xlarge
+    steps:
+      - setup_litellm_test_deps
+      - run:
+          name: Run core tests
+          command: |
+            python -m pytest tests/test_litellm --ignore=tests/test_litellm/proxy --ignore=tests/test_litellm/llms --ignore=tests/test_litellm/integrations --ignore=tests/test_litellm/litellm_core_utils --cov=litellm --cov-report=xml --junitxml=test-results/junit-core.xml --durations=10 -n 16 --maxfail=5 --timeout=300 -vv --log-cli-level=WARNING
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml litellm_core_tests_coverage.xml
+            mv .coverage litellm_core_tests_coverage
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+          paths:
+            - litellm_core_tests_coverage.xml
+            - litellm_core_tests_coverage
+  litellm_mapped_tests_litellm_core_utils:
+    docker:
+      - image: cimg/python:3.11
+        auth:
+          username: ${DOCKERHUB_USERNAME}
+          password: ${DOCKERHUB_PASSWORD}
+    working_directory: ~/project
+    resource_class: xlarge
+    steps:
+      - setup_litellm_test_deps
+      - run:
+          name: Run litellm_core_utils tests
+          command: |
+            python -m pytest tests/test_litellm/litellm_core_utils --cov=litellm --cov-report=xml --junitxml=test-results/junit-litellm-core-utils.xml --durations=10 -n 16 --maxfail=5 --timeout=300 -vv --log-cli-level=WARNING
+          no_output_timeout: 120m
+      - run:
+          name: Rename the coverage files
+          command: |
+            mv coverage.xml litellm_core_utils_tests_coverage.xml
+            mv .coverage litellm_core_utils_tests_coverage
+      - store_test_results:
+          path: test-results
+      - persist_to_workspace:
+          root: .
+ paths: + - litellm_core_utils_tests_coverage.xml + - litellm_core_utils_tests_coverage + litellm_mapped_tests_integrations: + docker: + - image: cimg/python:3.11 + auth: + username: ${DOCKERHUB_USERNAME} + password: ${DOCKERHUB_PASSWORD} + working_directory: ~/project + resource_class: xlarge + steps: + - setup_litellm_test_deps + - run: + name: Run integrations tests + command: | + python -m pytest tests/test_litellm/integrations --cov=litellm --cov-report=xml --junitxml=test-results/junit-integrations.xml --durations=10 -n 16 --maxfail=5 --timeout=300 -vv --log-cli-level=WARNING + no_output_timeout: 120m + - run: + name: Rename the coverage files + command: | + mv coverage.xml litellm_integrations_tests_coverage.xml + mv .coverage litellm_integrations_tests_coverage + - store_test_results: + path: test-results + - persist_to_workspace: + root: . + paths: + - litellm_integrations_tests_coverage.xml + - litellm_integrations_tests_coverage litellm_mapped_enterprise_tests: docker: - image: cimg/python:3.11 @@ -1203,8 +1696,8 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "respx==0.22.0" pip install "hypercorn==0.17.3" - pip install "pydantic==2.10.2" - pip install "mcp==1.10.1" + pip install "pydantic==2.11.0" + pip install "mcp==1.25.0" pip install "requests-mock>=1.12.1" pip install "responses==0.25.7" pip install "pytest-xdist==3.6.1" @@ -1390,13 +1883,14 @@ jobs: pip install "pytest-cov==5.0.0" pip install "pytest-asyncio==0.21.1" pip install "respx==0.22.0" + pip install "pytest-xdist==3.6.1" # Run pytest and generate JUnit XML report - run: name: Run tests command: | pwd ls - python -m pytest -vv tests/image_gen_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/image_gen_tests -n 4 --cov=litellm --cov-report=xml -x -v --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - run: name: Rename the coverage files @@ -1439,6 +1933,7 @@ jobs: pip install "mlflow==2.17.2" pip install "anthropic==0.52.0" pip install "blockbuster==1.5.24" + pip install "pytest-xdist==3.6.1" # Run pytest and generate JUnit XML report - setup_litellm_enterprise_pip - run: @@ -1446,7 +1941,7 @@ jobs: command: | pwd ls - python -m pytest -vv tests/logging_callback_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/logging_callback_tests --cov=litellm -n 4 --cov-report=xml -s -v --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - run: name: Rename the coverage files @@ -1507,7 +2002,7 @@ jobs: - audio_coverage installing_litellm_on_python: docker: - - image: circleci/python:3.8 + - image: cimg/python:3.11 auth: username: ${DOCKERHUB_USERNAME} password: ${DOCKERHUB_PASSWORD} @@ -1562,7 +2057,7 @@ jobs: pip install "pytest-asyncio==0.21.1" pip install "pytest-cov==5.0.0" pip install "tomli==2.2.1" - pip install "mcp==1.10.1" + pip install "mcp==1.25.0" - run: name: Run tests command: | @@ -1571,7 +2066,7 @@ jobs: python -m pytest -vv tests/local_testing/test_basic_python_version.py helm_chart_testing: machine: - image: ubuntu-2204:2023.10.1 # Use machine executor instead of docker + image: ubuntu-2204:2023.10.1 # Use machine executor instead of docker resource_class: medium working_directory: ~/project @@ -1583,7 +2078,7 @@ jobs: name: Install Helm command: | curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - + # Install kind - run: name: Install Kind @@ -1591,7 +2086,7 @@ jobs: curl -Lo 
./kind https://kind.sigs.k8s.io/dl/v0.20.0/kind-linux-amd64 chmod +x ./kind sudo mv ./kind /usr/local/bin/kind - + # Install kubectl - run: name: Install kubectl @@ -1599,43 +2094,58 @@ jobs: curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" chmod +x kubectl sudo mv kubectl /usr/local/bin/ - + # Create kind cluster - run: name: Create Kind Cluster command: | kind create cluster --name litellm-test - + + - run: + name: Build Docker image for helm tests + command: | + IMAGE_TAG=${CIRCLE_SHA1:-ci} + docker build -t litellm-ci:${IMAGE_TAG} -f docker/Dockerfile.database . + + - run: + name: Load Docker image into Kind + command: | + IMAGE_TAG=${CIRCLE_SHA1:-ci} + kind load docker-image litellm-ci:${IMAGE_TAG} --name litellm-test + # Run helm lint - run: name: Run helm lint command: | helm lint ./deploy/charts/litellm-helm - + # Run helm tests - run: name: Run helm tests command: | - helm install litellm ./deploy/charts/litellm-helm -f ./deploy/charts/litellm-helm/ci/test-values.yaml + IMAGE_TAG=${CIRCLE_SHA1:-ci} + helm install litellm ./deploy/charts/litellm-helm -f ./deploy/charts/litellm-helm/ci/test-values.yaml \ + --set image.repository=litellm-ci \ + --set image.tag=${IMAGE_TAG} \ + --set image.pullPolicy=Never # Wait for pod to be ready echo "Waiting 30 seconds for pod to be ready..." sleep 30 - + # Print pod logs before running tests echo "Printing pod logs..." kubectl logs $(kubectl get pods -l app.kubernetes.io/name=litellm -o jsonpath="{.items[0].metadata.name}") - + # Run the helm tests helm test litellm --logs helm test litellm --logs - + # Cleanup - run: name: Cleanup command: | kind delete cluster --name litellm-test - when: always # This ensures cleanup runs even if previous steps fail - + when: always # This ensures cleanup runs even if previous steps fail check_code_and_doc_quality: docker: @@ -1662,11 +2172,13 @@ jobs: - run: ruff check ./litellm # - run: python ./tests/documentation_tests/test_general_setting_keys.py - run: python ./tests/code_coverage_tests/check_licenses.py + - run: python ./tests/code_coverage_tests/check_provider_folders_documented.py - run: python ./tests/code_coverage_tests/router_code_coverage.py - run: python ./tests/code_coverage_tests/test_chat_completion_imports.py - run: python ./tests/code_coverage_tests/info_log_check.py - run: python ./tests/code_coverage_tests/test_ban_set_verbose.py - run: python ./tests/code_coverage_tests/code_qa_check_tests.py + - run: python ./tests/code_coverage_tests/check_get_model_cost_key_performance.py - run: python ./tests/code_coverage_tests/test_proxy_types_import.py - run: python ./tests/code_coverage_tests/callback_manager_test.py - run: python ./tests/code_coverage_tests/recursive_detector.py @@ -1682,6 +2194,7 @@ jobs: - run: python ./tests/code_coverage_tests/check_unsafe_enterprise_import.py - run: python ./tests/code_coverage_tests/ban_copy_deepcopy_kwargs.py - run: python ./tests/code_coverage_tests/check_fastuuid_usage.py + - run: python ./tests/code_coverage_tests/memory_test.py - run: helm lint ./deploy/charts/litellm-helm db_migration_disable_update_check: @@ -1709,10 +2222,14 @@ jobs: pip install "pytest==7.3.1" pip install "pytest-asyncio==0.21.1" pip install aiohttp + pip install apscheduler + - attach_workspace: + at: ~/project - run: - name: Build Docker image + name: Load Docker Database Image command: | - docker build -t myapp . 
-f ./docker/Dockerfile.database + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container command: | @@ -1725,7 +2242,7 @@ jobs: -v $(pwd)/litellm/proxy/example_config_yaml/bad_schema.prisma:/app/litellm/proxy/schema.prisma \ -v $(pwd)/litellm/proxy/example_config_yaml/disable_schema_update.yaml:/app/config.yaml \ --name my-app \ - myapp:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 - run: @@ -1744,10 +2261,11 @@ jobs: name: Check container logs for expected message command: | echo "=== Printing Full Container Startup Logs ===" - docker logs my-app + LOG_OUTPUT="$(docker logs my-app 2>&1)" + printf '%s\n' "$LOG_OUTPUT" echo "=== End of Full Container Startup Logs ===" - - if docker logs my-app 2>&1 | grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two"; then + + if printf '%s\n' "$LOG_OUTPUT" | grep -q "prisma schema out of sync with db. Consider running these sql_commands to sync the two"; then echo "Expected message found in logs. Test passed." else echo "Expected message not found in logs. Test failed." @@ -1759,7 +2277,6 @@ jobs: python -m pytest -vv tests/basic_proxy_startup_tests -x --junitxml=test-results/junit-2.xml --durations=5 no_output_timeout: 120m - build_and_test: machine: image: ubuntu-2204:2023.10.1 @@ -1771,8 +2288,9 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: name: Install Python 3.9 command: | @@ -1816,6 +2334,8 @@ jobs: pip install "asyncio==3.4.3" pip install "PyGithub==1.59.1" pip install "openai==1.100.1" + pip install "litellm[proxy]" + pip install "pytest-xdist==3.6.1" - run: name: Install dockerize command: | @@ -1892,7 +2412,7 @@ jobs: command: | pwd ls - python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/guardrails_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests + python -m pytest -s -vv tests/*.py -x --junitxml=test-results/junit.xml -n 4 --durations=5 --ignore=tests/otel_tests --ignore=tests/spend_tracking_tests --ignore=tests/pass_through_tests --ignore=tests/proxy_admin_ui_tests --ignore=tests/load_tests --ignore=tests/llm_translation --ignore=tests/llm_responses_api_testing --ignore=tests/mcp_tests --ignore=tests/guardrails_tests --ignore=tests/image_gen_tests --ignore=tests/pass_through_unit_tests no_output_timeout: 120m # Store test results @@ -1909,17 +2429,18 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: - name: Install Python 3.9 + name: Install Python 3.10 command: | curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" conda init bash source ~/.bashrc - conda create -n myenv python=3.9 -y + conda 
create -n myenv python=3.10 -y conda activate myenv python --version - run: @@ -1976,9 +2497,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . + name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container command: | @@ -2013,7 +2538,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/oai_misc_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2051,8 +2576,9 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: name: Install Python 3.9 command: | @@ -2115,9 +2641,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . + name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container # intentionally give bad redis credentials here @@ -2150,7 +2680,7 @@ jobs: --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/otel_test_config.yaml:/app/config.yaml \ -v $(pwd)/litellm/proxy/example_config_yaml/custom_guardrail.py:/app/custom_guardrail.py \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2201,7 +2731,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app-3 \ -v $(pwd)/litellm/proxy/example_config_yaml/enterprise_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug @@ -2235,8 +2765,9 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: name: Install Python 3.9 command: | @@ -2275,9 +2806,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . 
+ name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container # intentionally give bad redis credentials here @@ -2301,7 +2836,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/spend_tracking_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2343,8 +2878,9 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: name: Install Python 3.9 command: | @@ -2387,9 +2923,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . + name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container 1 # intentionally give bad redis credentials here @@ -2409,7 +2949,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2430,7 +2970,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app-2 \ -v $(pwd)/litellm/proxy/example_config_yaml/multi_instance_simple_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4001 \ --detailed_debug @@ -2476,8 +3016,10 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version + sudo systemctl restart docker - run: name: Install Python 3.9 command: | @@ -2521,9 +3063,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . 
+ name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container # intentionally give bad redis credentials here @@ -2538,7 +3084,7 @@ jobs: --add-host host.docker.internal:host-gateway \ --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/store_model_db_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2563,8 +3109,7 @@ jobs: pwd ls python -m pytest -vv tests/store_model_in_db_tests -x --junitxml=test-results/junit.xml --durations=5 - no_output_timeout: - 120m + no_output_timeout: 120m - run: name: Stop and remove containers command: | @@ -2575,7 +3120,7 @@ jobs: when: always - store_test_results: path: test-results - + proxy_build_from_pip_tests: # Change from docker to machine executor machine: @@ -2685,22 +3230,26 @@ jobs: - run: name: Install Docker CLI (In case it's not already installed) command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version - run: - name: Install Python 3.9 + name: Install Python 3.10 command: | curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh bash miniconda.sh -b -p $HOME/miniconda export PATH="$HOME/miniconda/bin:$PATH" conda init bash source ~/.bashrc - conda create -n myenv python=3.9 -y + conda create -n myenv python=3.10 -y conda activate myenv python --version - run: name: Install Dependencies command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv pip install "pytest==7.3.1" pip install "pytest-retry==1.6.3" pip install "pytest-asyncio==0.21.1" @@ -2729,6 +3278,8 @@ jobs: pip install "langchain_mcp_adapters==0.0.5" pip install "langchain_openai==0.2.1" pip install "langgraph==0.3.18" + pip install "fastuuid==0.13.5" + pip install -r requirements.txt - run: name: Install dockerize command: | @@ -2748,10 +3299,13 @@ jobs: - run: name: Wait for PostgreSQL to be ready command: dockerize -wait tcp://localhost:5432 -timeout 1m - # Run pytest and generate JUnit XML report + - attach_workspace: + at: ~/project - run: - name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . 
+ name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database - run: name: Run Docker container command: | @@ -2773,7 +3327,7 @@ jobs: --name my-app \ -v $(pwd)/litellm/proxy/example_config_yaml/pass_through_config.yaml:/app/config.yaml \ -v $(pwd)/litellm/proxy/example_config_yaml/custom_auth_basic.py:/app/custom_auth_basic.py \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug \ @@ -2793,17 +3347,17 @@ jobs: curl -sSL https://rvm.io/mpapis.asc | gpg --import - curl -sSL https://rvm.io/pkuczynski.asc | gpg --import - } - + # Install Ruby version manager (RVM) curl -sSL https://get.rvm.io | bash -s stable - + # Source RVM from the correct location source $HOME/.rvm/scripts/rvm - + # Install Ruby 3.2.2 rvm install 3.2.2 rvm use 3.2.2 --default - + # Install latest Bundler gem install bundler @@ -2841,6 +3395,9 @@ jobs: - run: name: Run tests command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv pwd ls python -m pytest -vv tests/pass_through_tests/ -x --junitxml=test-results/junit.xml --durations=5 @@ -2850,6 +3407,110 @@ jobs: - store_test_results: path: test-results + proxy_e2e_anthropic_messages_tests: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + - setup_google_dns + - run: + name: Install Docker CLI (In case it's not already installed) + command: | + curl -fsSL https://get.docker.com | sh + sudo usermod -aG docker $USER + docker version + - run: + name: Install Python 3.10 + command: | + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + conda init bash + source ~/.bashrc + conda create -n myenv python=3.10 -y + conda activate myenv + python --version + - run: + name: Install Dependencies + command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv + pip install "pytest==7.3.1" + pip install "pytest-asyncio==0.21.1" + pip install "boto3==1.36.0" + pip install "httpx==0.27.0" + pip install "claude-agent-sdk" + pip install -r requirements.txt + - run: + name: Install dockerize + command: | + wget https://github.com/jwilder/dockerize/releases/download/v0.6.1/dockerize-linux-amd64-v0.6.1.tar.gz + sudo tar -C /usr/local/bin -xzvf dockerize-linux-amd64-v0.6.1.tar.gz + rm dockerize-linux-amd64-v0.6.1.tar.gz + - run: + name: Start PostgreSQL Database + command: | + docker run -d \ + --name postgres-db \ + -e POSTGRES_USER=postgres \ + -e POSTGRES_PASSWORD=postgres \ + -e POSTGRES_DB=circle_test \ + -p 5432:5432 \ + postgres:14 + - run: + name: Wait for PostgreSQL to be ready + command: dockerize -wait tcp://localhost:5432 -timeout 1m + - attach_workspace: + at: ~/project + - run: + name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database + - run: + name: Run Docker container with test config + command: | + docker run -d \ + -p 4000:4000 \ + -e DATABASE_URL=postgresql://postgres:postgres@host.docker.internal:5432/circle_test \ + -e LITELLM_MASTER_KEY="sk-1234" \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e AWS_REGION_NAME="us-east-1" \ + --add-host 
host.docker.internal:host-gateway \ + --name my-app \ + -v $(pwd)/tests/proxy_e2e_anthropic_messages_tests/test_config.yaml:/app/config.yaml \ + litellm-docker-database:ci \ + --config /app/config.yaml \ + --port 4000 \ + --detailed_debug + - run: + name: Start outputting logs + command: docker logs -f my-app + background: true + - run: + name: Wait for app to be ready + command: dockerize -wait http://localhost:4000 -timeout 5m + - run: + name: Run Claude Agent SDK E2E Tests + command: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/etc/profile.d/conda.sh + conda activate myenv + export LITELLM_PROXY_URL="http://localhost:4000" + export LITELLM_API_KEY="sk-1234" + pwd + ls + python -m pytest -vv tests/proxy_e2e_anthropic_messages_tests/ -x -s --junitxml=test-results/junit.xml --durations=5 + no_output_timeout: 120m + + # Store test results + - store_test_results: + path: test-results + upload-coverage: docker: - image: cimg/python:3.9 @@ -2871,7 +3532,7 @@ jobs: python -m venv venv . venv/bin/activate pip install coverage - coverage combine llm_translation_coverage llm_responses_api_coverage ocr_coverage search_coverage mcp_coverage logging_coverage audio_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_security_tests_coverage guardrails_coverage + coverage combine llm_translation_coverage llm_responses_api_coverage ocr_coverage search_coverage mcp_coverage logging_coverage audio_coverage litellm_router_coverage litellm_router_unit_coverage local_testing_part1_coverage local_testing_part2_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_part1_coverage litellm_proxy_unit_tests_part2_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_security_tests_coverage guardrails_coverage litellm_mapped_tests_coverage coverage xml - codecov/upload: file: ./coverage.xml @@ -2921,8 +3582,22 @@ jobs: ls dist/ twine upload --verbose dist/* else - echo "Version ${VERSION} of package is already published on PyPI. Skipping PyPI publish." - circleci step halt + echo "Version ${VERSION} of package is already published on PyPI." + + # Check if corresponding Docker nightly image exists + NIGHTLY_TAG="v${VERSION}-nightly" + echo "Checking for Docker nightly image: litellm/litellm:${NIGHTLY_TAG}" + + # Check Docker Hub for the nightly image + if curl -s "https://hub.docker.com/v2/repositories/litellm/litellm/tags/${NIGHTLY_TAG}" | grep -q "name"; then + echo "Docker nightly image ${NIGHTLY_TAG} exists. This release was already completed successfully." + echo "Skipping PyPI publish and continuing to ensure Docker images are up to date." + circleci step halt + else + echo "ERROR: PyPI package ${VERSION} exists but Docker nightly image ${NIGHTLY_TAG} does not exist!" + echo "This indicates an incomplete release. Please investigate." 
+ exit 1 + fi fi - run: name: Trigger Github Action for new Docker Container + Trigger Load Testing @@ -2931,11 +3606,21 @@ jobs: python3 -m pip install toml VERSION=$(python3 -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])") echo "LiteLLM Version ${VERSION}" + + # Determine which branch to use for Docker build + if [[ "$CIRCLE_BRANCH" =~ ^litellm_release_day_.* ]]; then + BUILD_BRANCH="$CIRCLE_BRANCH" + echo "Using release branch: $BUILD_BRANCH" + else + BUILD_BRANCH="main" + echo "Using default branch: $BUILD_BRANCH" + fi + curl -X POST \ -H "Accept: application/vnd.github.v3+json" \ -H "Authorization: Bearer $GITHUB_TOKEN" \ "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \ - -d "{\"ref\":\"main\", \"inputs\":{\"tag\":\"v${VERSION}-nightly\", \"commit_hash\":\"$CIRCLE_SHA1\"}}" + -d "{\"ref\":\"${BUILD_BRANCH}\", \"inputs\":{\"tag\":\"v${VERSION}-nightly\", \"commit_hash\":\"$CIRCLE_SHA1\"}}" echo "triggering load testing server for version ${VERSION} and commit ${CIRCLE_SHA1}" curl -X POST "https://proxyloadtester-production.up.railway.app/start/load/test?version=${VERSION}&commit_hash=${CIRCLE_SHA1}&release_type=nightly" @@ -2957,32 +3642,32 @@ jobs: python -m pip install toml # Get current version from pyproject.toml CURRENT_VERSION=$(python -c "import toml; print(toml.load('pyproject.toml')['tool']['poetry']['version'])") - + # Get last published version from PyPI LAST_VERSION=$(curl -s https://pypi.org/pypi/litellm-proxy-extras/json | python -c "import json, sys; print(json.load(sys.stdin)['info']['version'])") - + echo "Current version: $CURRENT_VERSION" echo "Last published version: $LAST_VERSION" - + # Compare versions using Python's packaging.version VERSION_COMPARE=$(python -c "from packaging import version; print(1 if version.parse('$CURRENT_VERSION') < version.parse('$LAST_VERSION') else 0)") - + echo "Version compare: $VERSION_COMPARE" if [ "$VERSION_COMPARE" = "1" ]; then echo "Error: Current version ($CURRENT_VERSION) is less than last published version ($LAST_VERSION)" exit 1 fi - + # If versions are equal or current is greater, check contents pip download --no-deps litellm-proxy-extras==$LAST_VERSION -d /tmp - + echo "Contents of /tmp directory:" ls -la /tmp - + # Find the downloaded file (could be .whl or .tar.gz) DOWNLOADED_FILE=$(ls /tmp/litellm_proxy_extras-*) echo "Downloaded file: $DOWNLOADED_FILE" - + # Extract based on file extension if [[ "$DOWNLOADED_FILE" == *.whl ]]; then echo "Extracting wheel file..." @@ -2993,10 +3678,10 @@ jobs: tar -xzf "$DOWNLOADED_FILE" -C /tmp EXTRACTED_DIR="/tmp/litellm_proxy_extras-$LAST_VERSION" fi - + echo "Contents of extracted package:" ls -R "$EXTRACTED_DIR" - + # Compare contents if ! 
diff -r "$EXTRACTED_DIR/litellm_proxy_extras" ./litellm_proxy_extras; then if [ "$CURRENT_VERSION" = "$LAST_VERSION" ]; then @@ -3047,7 +3732,7 @@ jobs: python -m build twine upload --verbose dist/* - e2e_ui_testing: + ui_build: machine: image: ubuntu-2204:2023.10.1 resource_class: xlarge @@ -3062,62 +3747,31 @@ jobs: export NVM_DIR="/opt/circleci/.nvm" source "$NVM_DIR/nvm.sh" source "$NVM_DIR/bash_completion" - + # Install and use Node version nvm install v20 nvm use v20 - + cd ui/litellm-dashboard - + # Install dependencies first npm install - + # Now source the build script source ./build_ui.sh - - run: - name: Install Docker CLI (In case it's not already installed) - command: | - sudo apt-get update - sudo apt-get install -y docker-ce docker-ce-cli containerd.io - - run: - name: Install Python 3.9 - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - export PATH="$HOME/miniconda/bin:$PATH" - conda init bash - source ~/.bashrc - conda create -n myenv python=3.9 -y - conda activate myenv - python --version - - run: - name: Install Dependencies - command: | - npm install -D @playwright/test - npm install @google-cloud/vertexai - pip install "pytest==7.3.1" - pip install "pytest-retry==1.6.3" - pip install "pytest-asyncio==0.21.1" - pip install aiohttp - pip install "openai==1.100.1" - python -m pip install --upgrade pip - pip install "pydantic==2.10.2" - pip install "pytest==7.3.1" - pip install "pytest-mock==3.12.0" - pip install "pytest-asyncio==0.21.1" - pip install "mypy==1.18.2" - pip install pyarrow - pip install numpydoc - pip install prisma - pip install fastapi - pip install jsonschema - pip install "httpx==0.24.1" - pip install "anyio==3.7.1" - pip install "asyncio==3.4.3" - - run: - name: Install Playwright Browsers - command: | - npx playwright install + - persist_to_workspace: + root: . + paths: + - litellm/proxy/_experimental/out + + ui_unit_tests: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + - setup_google_dns - run: name: Run UI unit tests (Vitest) command: | @@ -3126,10 +3780,12 @@ jobs: source "$NVM_DIR/nvm.sh" nvm install 20 nvm use 20 - + cd ui/litellm-dashboard - npm ci || npm install - + # Remove node_modules and package-lock to ensure clean install (fixes optional deps issue) + rm -rf node_modules package-lock.json + npm install + # CI run, with both LCOV (Codecov) and HTML (artifact you can click) CI=true npm run test -- --run --coverage \ --coverage.provider=v8 \ @@ -3137,24 +3793,96 @@ jobs: --coverage.reporter=html \ --coverage.reportsDirectory=coverage/html + build_docker_database_image: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + + - run: + name: Upgrade Docker + command: | + curl -fsSL https://get.docker.com | sh + docker version - run: name: Build Docker image - command: docker build -t my-app:latest -f ./docker/Dockerfile.database . + command: | + docker build \ + -t litellm-docker-database:ci \ + -f docker/Dockerfile.database . + + - run: + name: Save Docker image to workspace root + command: | + docker save litellm-docker-database:ci | gzip > litellm-docker-database.tar.gz + + - persist_to_workspace: + root: . 
+ paths: + - litellm-docker-database.tar.gz + + e2e_ui_testing: + machine: + image: ubuntu-2204:2023.10.1 + resource_class: xlarge + working_directory: ~/project + steps: + - checkout + - setup_google_dns + - attach_workspace: + at: ~/project + - run: + name: Load Docker Database Image + command: | + gunzip -c litellm-docker-database.tar.gz | docker load + docker images | grep litellm-docker-database + - run: + name: Install Dependencies + command: | + npm install -D @playwright/test + - run: + name: Install Playwright Browsers + command: | + npx playwright install + - run: + name: Install Neon CLI + command: | + npm i -g neonctl + - run: + name: Create Neon branch + command: | + export EXPIRES_AT=$(date -u -d "+3 hours" +"%Y-%m-%dT%H:%M:%SZ") + echo "Expires at: $EXPIRES_AT" + neon branches create \ + --project-id $NEON_PROJECT_ID \ + --name preview/commit-${CIRCLE_SHA1:0:7} \ + --expires-at $EXPIRES_AT \ + --parent br-fancy-paper-ad1olsb3 \ + --api-key $NEON_API_KEY || true - run: name: Run Docker container command: | + E2E_UI_TEST_DATABASE_URL=$(neon connection-string \ + --project-id $NEON_PROJECT_ID \ + --api-key $NEON_API_KEY \ + --branch preview/commit-${CIRCLE_SHA1:0:7} \ + --database-name yuneng-trial-db \ + --role neondb_owner) + echo $E2E_UI_TEST_DATABASE_URL docker run -d \ -p 4000:4000 \ - -e DATABASE_URL=$SMALL_DATABASE_URL \ + -e DATABASE_URL=$E2E_UI_TEST_DATABASE_URL \ -e LITELLM_MASTER_KEY="sk-1234" \ -e OPENAI_API_KEY=$OPENAI_API_KEY \ -e UI_USERNAME="admin" \ -e UI_PASSWORD="gm" \ -e LITELLM_LICENSE=$LITELLM_LICENSE \ - --name my-app \ + --name litellm-docker-database \ -v $(pwd)/litellm/proxy/example_config_yaml/simple_config.yaml:/app/config.yaml \ - my-app:latest \ + litellm-docker-database:ci \ --config /app/config.yaml \ --port 4000 \ --detailed_debug @@ -3168,7 +3896,7 @@ jobs: sudo rm dockerize-linux-amd64-v0.6.1.tar.gz - run: name: Start outputting logs - command: docker logs -f my-app + command: docker logs -f litellm-docker-database background: true - run: name: Wait for app to be ready @@ -3176,10 +3904,18 @@ jobs: - run: name: Run Playwright Tests command: | - npx playwright test e2e_ui_tests/ --reporter=html --output=test-results + npx playwright test \ + --config ui/litellm-dashboard/e2e_tests/playwright.config.ts \ + --reporter=html \ + --output=test-results no_output_timeout: 120m - - store_test_results: + - store_artifacts: path: test-results + destination: playwright-results + + - store_artifacts: + path: playwright-report + destination: playwright-report test_nonroot_image: machine: @@ -3275,7 +4011,13 @@ workflows: only: - main - /litellm_.*/ - - local_testing: + - local_testing_part1: + filters: + branches: + only: + - main + - /litellm_.*/ + - local_testing_part2: filters: branches: only: @@ -3293,7 +4035,19 @@ workflows: only: - main - /litellm_.*/ - - litellm_proxy_unit_testing: + - litellm_proxy_unit_testing_key_generation: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_proxy_unit_testing_part1: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_proxy_unit_testing_part2: filters: branches: only: @@ -3329,13 +4083,37 @@ workflows: only: - main - /litellm_.*/ + - ui_build: + filters: + branches: + only: + - main + - /litellm_.*/ + - ui_unit_tests: + requires: + - ui_build + filters: + branches: + only: + - main + - /litellm_.*/ - auth_ui_unit_tests: filters: branches: only: - main - /litellm_.*/ + - build_docker_database_image: + filters: + branches: + only: + - main + - /litellm_.*/ - e2e_ui_testing: + 
context: e2e_ui_tests + requires: + - ui_build + - build_docker_database_image filters: branches: only: @@ -3348,30 +4126,40 @@ workflows: - main - /litellm_.*/ - e2e_openai_endpoints: + requires: + - build_docker_database_image filters: branches: only: - main - /litellm_.*/ - proxy_logging_guardrails_model_info_tests: + requires: + - build_docker_database_image filters: branches: only: - main - /litellm_.*/ - proxy_spend_accuracy_tests: + requires: + - build_docker_database_image filters: branches: only: - main - /litellm_.*/ - proxy_multi_instance_tests: + requires: + - build_docker_database_image filters: branches: only: - main - /litellm_.*/ - proxy_store_model_in_db_tests: + requires: + - build_docker_database_image filters: branches: only: @@ -3384,6 +4172,16 @@ workflows: - main - /litellm_.*/ - proxy_pass_through_endpoint_tests: + requires: + - build_docker_database_image + filters: + branches: + only: + - main + - /litellm_.*/ + - proxy_e2e_anthropic_messages_tests: + requires: + - build_docker_database_image filters: branches: only: @@ -3437,7 +4235,31 @@ workflows: only: - main - /litellm_.*/ - - litellm_mapped_tests: + - litellm_mapped_tests_proxy: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_mapped_tests_llms: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_mapped_tests_core: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_mapped_tests_integrations: + filters: + branches: + only: + - main + - /litellm_.*/ + - litellm_mapped_tests_litellm_core_utils: filters: branches: only: @@ -3488,7 +4310,11 @@ workflows: - llm_responses_api_testing - ocr_testing - search_testing - - litellm_mapped_tests + - litellm_mapped_tests_proxy + - litellm_mapped_tests_llms + - litellm_mapped_tests_core + - litellm_mapped_tests_integrations + - litellm_mapped_tests_litellm_core_utils - litellm_mapped_enterprise_tests - batches_testing - litellm_utils_testing @@ -3499,13 +4325,18 @@ workflows: - litellm_router_testing - litellm_router_unit_testing - caching_unit_tests - - litellm_proxy_unit_testing + - litellm_proxy_unit_testing_key_generation + - litellm_proxy_unit_testing_part1 + - litellm_proxy_unit_testing_part2 - litellm_security_tests - langfuse_logging_unit_tests - - local_testing + - local_testing_part1 + - local_testing_part2 - litellm_assistants_api_testing - auth_ui_unit_tests - db_migration_disable_update_check: + requires: + - build_docker_database_image filters: branches: only: @@ -3540,10 +4371,12 @@ workflows: branches: only: - main + - /litellm_release_day_.*/ - publish_to_pypi: requires: - mypy_linting - - local_testing + - local_testing_part1 + - local_testing_part2 - build_and_test - e2e_openai_endpoints - test_bad_database_url @@ -3553,7 +4386,11 @@ workflows: - llm_responses_api_testing - ocr_testing - search_testing - - litellm_mapped_tests + - litellm_mapped_tests_proxy + - litellm_mapped_tests_llms + - litellm_mapped_tests_core + - litellm_mapped_tests_integrations + - litellm_mapped_tests_litellm_core_utils - litellm_mapped_enterprise_tests - batches_testing - litellm_utils_testing @@ -3569,7 +4406,9 @@ workflows: - auth_ui_unit_tests - db_migration_disable_update_check - e2e_ui_testing - - litellm_proxy_unit_testing + - litellm_proxy_unit_testing_key_generation + - litellm_proxy_unit_testing_part1 + - litellm_proxy_unit_testing_part2 - litellm_security_tests - installing_litellm_on_python - installing_litellm_on_python_3_13 @@ -3582,4 +4421,3 @@ workflows: - check_code_and_doc_quality - publish_proxy_extras - 
guardrails_testing - diff --git a/.circleci/requirements.txt b/.circleci/requirements.txt index 8e0f1dfe7e9b..a5ec74424fec 100644 --- a/.circleci/requirements.txt +++ b/.circleci/requirements.txt @@ -8,11 +8,13 @@ redis==5.2.1 redisvl==0.4.1 anthropic orjson==3.10.12 # fast /embedding responses -pydantic==2.10.2 +pydantic==2.11.0 google-cloud-aiplatform==1.43.0 google-cloud-iam==2.19.1 fastapi-sso==0.16.0 uvloop==0.21.0 -mcp==1.10.1 # for MCP server +mcp==1.25.0 # for MCP server semantic_router==0.1.10 # for auto-routing with litellm -fastuuid==0.12.0 \ No newline at end of file +fastuuid==0.12.0 +responses==0.25.7 # for proxy client tests +pytest-retry==1.6.3 # for automatic test retries \ No newline at end of file diff --git a/.dockerignore b/.dockerignore index 89c3c34bd718..76e31546c2f7 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,9 +4,51 @@ cookbook .github tests .git -.github -.circleci .devcontainer *.tgz log.txt docker/Dockerfile.* + +# Claude Flow generated files (must be excluded from Docker build) +.claude/ +.claude-flow/ +.swarm/ +.hive-mind/ +memory/ +coordination/ +claude-flow +.mcp.json +hive-mind-prompt-*.txt + +# Python virtual environments and version managers +.venv/ +venv/ +**/.venv/ +**/venv/ +.python-version +.pyenv/ +__pycache__/ +**/__pycache__/ +*.pyc +.mypy_cache/ +.pytest_cache/ +.ruff_cache/ +**/pyvenv.cfg + +# Common project exclusions +.vscode +*.pyo +*.pyd +.Python +env/ +.pytest_cache +.coverage +htmlcov/ +dist/ +build/ +*.egg-info/ +.DS_Store +node_modules/ +*.log +.env +.env.local diff --git a/.gitguardian.yaml b/.gitguardian.yaml new file mode 100644 index 000000000000..1eeec0677af6 --- /dev/null +++ b/.gitguardian.yaml @@ -0,0 +1,111 @@ +version: 2 + +secret: + # Exclude files and paths by globbing + ignored_paths: + - "**/*.whl" + - "**/*.pyc" + - "**/__pycache__/**" + - "**/node_modules/**" + - "**/dist/**" + - "**/build/**" + - "**/.git/**" + - "**/venv/**" + - "**/.venv/**" + + # Large data/metadata files that don't need scanning + - "**/model_prices_and_context_window*.json" + - "**/*_metadata/*.txt" + - "**/tokenizers/*.json" + - "**/tokenizers/*" + - "miniconda.sh" + + # Build outputs and static assets + - "litellm/proxy/_experimental/out/**" + - "ui/litellm-dashboard/public/**" + - "**/swagger/*.js" + - "**/*.woff" + - "**/*.woff2" + - "**/*.avif" + - "**/*.webp" + + # Test data files + - "**/tests/**/data_map.txt" + - "tests/**/*.txt" + + # Documentation and other non-code files + - "docs/**" + - "**/*.md" + - "**/*.lock" + - "poetry.lock" + - "package-lock.json" + + # Ignore security incidents with the SHA256 of the occurrence (false positives) + ignored_matches: + # === Current detected false positives (SHA-based) === + + # gcs_pub_sub_body - folder name, not a password + - name: GCS pub/sub test folder name + match: 75f377c456eede69e5f6e47399ccee6016a2a93cc5dd11db09cc5b1359ae569a + + # os.environ/APORIA_API_KEY_1 - environment variable reference + - name: Environment variable reference APORIA_API_KEY_1 + match: e2ddeb8b88eca97a402559a2be2117764e11c074d86159ef9ad2375dea188094 + + # os.environ/APORIA_API_KEY_2 - environment variable reference + - name: Environment variable reference APORIA_API_KEY_2 + match: 09aa39a29e050b86603aa55138af1ff08fb86a4582aa965c1bd0672e1575e052 + + # oidc/circleci_v2/ - test authentication path, not a secret + - name: OIDC CircleCI test path + match: feb3475e1f89a65b7b7815ac4ec597e18a9ec1847742ad445c36ca617b536e15 + + # text-davinci-003 - OpenAI model identifier, not a secret + - name: OpenAI model identifier 
text-davinci-003 + match: c489000cf6c7600cee0eefb80ad0965f82921cfb47ece880930eb7e7635cf1f1 + + # Base64 Basic Auth in test_pass_through_endpoints.py - test fixture, not a real secret + - name: Test Base64 Basic Auth header in pass_through_endpoints test + match: 61bac0491f395040617df7ef6d06029eac4d92a4457ac784978db80d97be1ae0 + + # PostgreSQL password "postgres" in CI configs - standard test database password + - name: Test PostgreSQL password in CI configurations + match: 6e0d657eb1f0fbc40cf0b8f3c3873ef627cc9cb7c4108d1c07d979c04bc8a4bb + + # Bearer token in locustfile.py - test/example API key for load testing + - name: Test Bearer token in locustfile load test + match: 2a0abc2b0c3c1760a51ffcdf8d6b1d384cef69af740504b1cfa82dd70cdc7ff9 + + # Inkeep API key in docusaurus.config.js - public documentation site key + - name: Inkeep API key in documentation config + match: c366657791bfb5fc69045ec11d49452f09a0aebbc8648f94e2469b4025e29a75 + + # Langfuse credentials in test_completion.py - test credentials for integration test + - name: Langfuse test credentials in test_completion + match: c39310f68cc3d3e22f7b298bb6353c4f45759adcc37080d8b7f4e535d3cfd7f4 + + # Test password "sk-1234" in e2e test fixtures - test fixture, not a real secret + - name: Test password in e2e test fixtures + match: ce32b547202e209ec1dd50107b64be4cfcf2eb15c3b4f8e9dc611ef747af634f + + # === Preventive patterns for test keys (pattern-based) === + + # Test API keys (124 instances across 45 files) + - name: Test API keys with sk-test prefix + match: sk-test- + + # Mock API keys + - name: Mock API keys with sk-mock prefix + match: sk-mock- + + # Fake API keys + - name: Fake API keys with sk-fake prefix + match: sk-fake- + + # Generic test API key patterns + - name: Test API key patterns + match: test-api-key + + - name: Short fake sk keys (1–9 digits only) + match: \bsk-\d{1,9}\b + diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 8fbf1b3c5b43..bbe4b76775da 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -7,6 +7,16 @@ body: attributes: value: | Thanks for taking the time to fill out this bug report! + + **💡 Tip:** See our [Troubleshooting Guide](https://docs.litellm.ai/docs/troubleshoot) for what information to include. + - type: checkboxes + id: duplicate-check + attributes: + label: Check for existing issues + description: Please search to see if an issue already exists for the bug you encountered. + options: + - label: I have searched the existing issues and checked that my issue is not a duplicate. + required: true - type: textarea id: what-happened attributes: @@ -16,6 +26,21 @@ body: value: "A bug happened!" validations: required: true + - type: textarea + id: steps-to-reproduce + attributes: + label: Steps to Reproduce + description: Please provide detailed steps to reproduce this bug(A curl/python code to reproduce the bug) + placeholder: | + 1. config.yaml file/ .env file/ etc. + 2. Run the following code... + 3. Observe the error... + value: | + 1. + 2. + 3. + validations: + required: true - type: textarea id: logs attributes: @@ -23,13 +48,16 @@ body: description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. render: shell - type: dropdown - id: ml-ops-team + id: component attributes: - label: Are you a ML Ops Team? - description: This helps us prioritize your requests correctly + label: What part of LiteLLM is this about? 
options: - - "No" - - "Yes" + - '' + - "SDK (litellm Python package)" + - "Proxy" + - "UI Dashboard" + - "Docs" + - "Other" validations: required: true - type: input diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index 13a2132ec95b..4cc429018977 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -7,6 +7,14 @@ body: attributes: value: | Thanks for making LiteLLM better! + - type: checkboxes + id: duplicate-check + attributes: + label: Check for existing issues + description: Please search to see if an issue already exists for the feature you are requesting. + options: + - label: I have searched the existing issues and checked that my issue is not a duplicate. + required: true - type: textarea id: the-feature attributes: @@ -22,6 +30,19 @@ body: description: Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., "I'm working on X and would like Y to be possible". If this is related to another GitHub issue, please link here too. validations: required: true + - type: dropdown + id: component + attributes: + label: What part of LiteLLM is this about? + options: + - '' + - "SDK (litellm Python package)" + - "Proxy" + - "UI Dashboard" + - "Docs" + - "Other" + validations: + required: true - type: dropdown id: hiring-interest attributes: diff --git a/.github/carto-features.yml b/.github/carto-features.yml new file mode 100644 index 000000000000..18df3f20a65d --- /dev/null +++ b/.github/carto-features.yml @@ -0,0 +1,69 @@ +# CARTO Features Manifest +# +# Static registry of critical CARTO customizations that must survive upstream syncs. +# The resolver should verify all patterns exist after every sync. If any are missing, +# re-prompt to restore them with the specific PR context. 
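
The grep one-liner in the Usage comment below has lost its angle-bracket placeholders (the pattern, file, and feature name render as empty strings). A minimal shell sketch of that verification pass, with the pattern/file pairs hard-coded from the manifest entries rather than parsed out of the YAML, might look like this (the pair list shown is a small illustrative subset, not the full manifest):

    # Verify that critical CARTO patterns survived the upstream sync (sketch).
    # Each entry is "<grep pattern>|<file>"; extend as the manifest grows.
    checks=(
      "uuid.uuid4().hex|litellm/llms/oci/chat/transformation.py"
      "SnowflakeStreamingHandler|litellm/llms/snowflake/chat/transformation.py"
      "_validate_and_repair_tool_arguments|litellm/litellm_core_utils/streaming_chunk_builder_utils.py"
    )
    missing=0
    for check in "${checks[@]}"; do
      pattern="${check%%|*}"
      file="${check##*|}"
      grep -q "$pattern" "$file" || { echo "MISSING: $pattern in $file"; missing=1; }
    done
    exit "$missing"
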
+# +# Usage: After sync resolution, run verification: +# for each feature.verification entry: +# grep -q "" "" || echo "MISSING: " + +features: + - name: "OCI Gemini Tool Call UUIDs" + source_prs: [68] + files: + - litellm/llms/oci/chat/transformation.py + verification: + - pattern: "uuid.uuid4().hex" + file: litellm/llms/oci/chat/transformation.py + - pattern: "_handle_tool_call_delta" + file: litellm/responses/litellm_completion_transformation/streaming_iterator.py + + - name: "Snowflake Streaming + Tool Calling" + source_prs: [38, 58] + files: + - litellm/llms/snowflake/chat/transformation.py + - litellm/responses/litellm_completion_transformation/transformation.py + verification: + - pattern: "SnowflakeStreamingHandler" + file: litellm/llms/snowflake/chat/transformation.py + - pattern: "_transform_messages" + file: litellm/llms/snowflake/chat/transformation.py + - pattern: "tool_choice_value is None and tools" + file: litellm/responses/litellm_completion_transformation/transformation.py + + - name: "Snowflake Full URL Passthrough" + source_prs: [] + description: "Prevents URL duplication when api_base already contains the full Cortex endpoint path" + files: + - litellm/llms/snowflake/chat/transformation.py + verification: + - pattern: "CARTO: skip path construction if api_base already contains" + file: litellm/llms/snowflake/chat/transformation.py + + - name: "Azure URL Suffix Stripping" + source_prs: [70] + files: + - litellm/llms/azure/common_utils.py + verification: + - pattern: "re.sub.*chat/completions" + file: litellm/llms/azure/common_utils.py + + - name: "JSON Repair for Streaming Tool Calls" + source_prs: [54] + files: + - litellm/litellm_core_utils/streaming_chunk_builder_utils.py + verification: + - pattern: "_validate_and_repair_tool_arguments" + file: litellm/litellm_core_utils/streaming_chunk_builder_utils.py + + - name: "Redis Session Storage" + source_prs: [16] + files: + - litellm/responses/litellm_completion_transformation/streaming_iterator.py + - litellm/responses/litellm_completion_transformation/transformation.py + verification: + - pattern: "_store_session_in_redis" + file: litellm/responses/litellm_completion_transformation/streaming_iterator.py + - pattern: "_patch_store_session_in_redis" + file: litellm/responses/litellm_completion_transformation/transformation.py diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 85f1769b6f3e..b91b16c955c8 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,7 +1,3 @@ -## Title - - - ## Relevant issues @@ -11,10 +7,25 @@ **Please complete all items before asking a LiteLLM maintainer to review your PR** - [ ] I have Added testing in the [`tests/litellm/`](https://github.com/BerriAI/litellm/tree/main/tests/litellm) directory, **Adding at least 1 test is a hard requirement** - [see details](https://docs.litellm.ai/docs/extras/contributing_code) -- [ ] I have added a screenshot of my new test passing locally - [ ] My PR passes all unit tests on [`make test-unit`](https://docs.litellm.ai/docs/extras/contributing_code) - [ ] My PR's scope is as isolated as possible, it only solves 1 specific problem +## CI (LiteLLM team) + +> **CI status guideline:** +> +> - 50-55 passing tests: main is stable with minor issues. +> - 45-49 passing tests: acceptable but needs attention +> - <= 40 passing tests: unstable; be careful with your merges and assess the risk. 
+ +- [ ] **Branch creation CI run** + Link: + +- [ ] **CI run for the last commit** + Link: + +- [ ] **Merge / cherry-pick CI run** + Links: ## Type @@ -29,5 +40,3 @@ ✅ Test ## Changes - - diff --git a/.github/workflows/carto-ghcr-deploy.yaml b/.github/workflows/carto-ghcr-deploy.yaml index 4cbb69976593..4fad68d5c845 100644 --- a/.github/workflows/carto-ghcr-deploy.yaml +++ b/.github/workflows/carto-ghcr-deploy.yaml @@ -17,7 +17,8 @@ on: - "docker/**" - "Dockerfile" - "requirements.txt" - - ".github/workflows/ghcr_carto_deploy.yaml" + - ".github/workflows/carto-ghcr-deploy.yaml" + - ".github/workflows/docker-build-multiarch.yaml" pull_request: branches: - carto/main @@ -27,17 +28,14 @@ on: - "docker/**" - "Dockerfile" - "requirements.txt" - - ".github/workflows/ghcr_carto_deploy.yaml" + - ".github/workflows/carto-ghcr-deploy.yaml" + - ".github/workflows/docker-build-multiarch.yaml" # Cancel previous runs on the same branch/PR concurrency: group: docker-build-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - jobs: print: runs-on: ubuntu-latest @@ -47,25 +45,11 @@ jobs: echo "SHA : ${{ github.sha }}" echo "Event : ${{ github.event_name }}" - build-and-push-image-non_root: + compute-tags: runs-on: ubuntu-latest - permissions: - contents: read - packages: write + outputs: + tags: ${{ steps.tag-config.outputs.tags }} steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - # Checkout the release tag if provided, otherwise use default (branch that triggered the workflow) - ref: ${{ github.event.inputs.release_tag || github.ref }} - - - name: Log in to the Container registry - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: Configure tags id: tag-config run: | @@ -104,29 +88,14 @@ jobs: echo "EOF" >> $GITHUB_OUTPUT - - name: Extract metadata for tags & labels - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root - tags: | - ${{ steps.tag-config.outputs.tags }} - - # Configure multi platform Docker builds - # - name: Set up QEMU - # uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 - - - name: Build and push non_root Docker image - uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 - with: - context: . 
- file: ./docker/Dockerfile.non_root - push: true - cache-from: type=gha - cache-to: type=gha,mode=max - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - platforms: linux/amd64 #linux/arm64 + docker: + needs: [print, compute-tags] + uses: ./.github/workflows/docker-build-multiarch.yaml + with: + checkout_ref: ${{ github.event.inputs.release_tag || github.ref }} + tags: ${{ needs.compute-tags.outputs.tags }} + cache_scope_prefix: build + permissions: + contents: read + packages: write + secrets: inherit diff --git a/.github/workflows/carto-release.yaml b/.github/workflows/carto-release.yaml index 0b9892d63763..63b81490f5f3 100644 --- a/.github/workflows/carto-release.yaml +++ b/.github/workflows/carto-release.yaml @@ -107,6 +107,12 @@ jobs: runs-on: ubuntu-latest needs: security-check if: needs.security-check.outputs.authorized == 'true' + outputs: + release_tag: ${{ steps.calc-version.outputs.release_tag }} + upstream_version: ${{ steps.detect-upstream.outputs.upstream_version }} + carto_version: ${{ steps.calc-version.outputs.carto_version }} + release_type: ${{ steps.calc-version.outputs.release_type }} + previous_upstream: ${{ steps.calc-version.outputs.previous_upstream }} permissions: contents: write packages: write @@ -518,41 +524,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Log in to Container registry - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up QEMU - uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 - - - name: Extract metadata for Docker - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root - tags: | - type=raw,value=${{ steps.calc-version.outputs.release_tag }} - type=raw,value=carto-stable - type=raw,value=carto-v${{ steps.detect-upstream.outputs.upstream_version }}-latest - - - name: Build and push Docker image - uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 - with: - context: . - file: ./docker/Dockerfile.non_root - push: true - cache-from: type=gha - cache-to: type=gha,mode=max - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} - platforms: linux/amd64 - - name: Release Summary run: | echo "## 🎉 Release Created Successfully!" 
>> $GITHUB_STEP_SUMMARY @@ -701,3 +672,18 @@ jobs: fi echo "::endgroup::" + + docker: + needs: create-release + uses: ./.github/workflows/docker-build-multiarch.yaml + with: + checkout_ref: ${{ needs.create-release.outputs.release_tag }} + tags: | + type=raw,value=${{ needs.create-release.outputs.release_tag }} + type=raw,value=carto-stable + type=raw,value=carto-v${{ needs.create-release.outputs.upstream_version }}-latest + cache_scope_prefix: release + permissions: + contents: read + packages: write + secrets: inherit diff --git a/.github/workflows/check_duplicate_issues.yml b/.github/workflows/check_duplicate_issues.yml new file mode 100644 index 000000000000..14d6964fcdb0 --- /dev/null +++ b/.github/workflows/check_duplicate_issues.yml @@ -0,0 +1,29 @@ +name: Check Duplicate Issues + +on: + issues: + types: [opened, edited] + +jobs: + check-duplicate: + runs-on: ubuntu-latest + permissions: + issues: write + contents: read + steps: + - name: Check for potential duplicates + uses: wow-actions/potential-duplicates@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + label: potential-duplicate + threshold: 0.6 + reaction: eyes + comment: | + **⚠️ Potential duplicate detected** + + This issue appears similar to existing issue(s): + {{#issues}} + - [#{{number}}]({{html_url}}) - {{title}} ({{accuracy}}% similar) + {{/issues}} + + Please review the linked issue(s) to see if they address your concern. If this is not a duplicate, please provide additional context to help us understand the difference. diff --git a/.github/workflows/create_daily_staging_branch.yml b/.github/workflows/create_daily_staging_branch.yml new file mode 100644 index 000000000000..9d0093e8b161 --- /dev/null +++ b/.github/workflows/create_daily_staging_branch.yml @@ -0,0 +1,43 @@ +name: Create Daily Staging Branch + +on: + schedule: + - cron: '0 0,12 * * *' # Runs every 12 hours at midnight and noon UTC + workflow_dispatch: # Allow manual trigger + +jobs: + create-staging-branch: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Create daily staging branch + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Configure Git user + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Generate branch name with MM_DD_YYYY format + BRANCH_NAME="litellm_oss_staging_$(date +'%m_%d_%Y')" + echo "Creating branch: $BRANCH_NAME" + + # Fetch all branches + git fetch --all + + # Check if the branch already exists + if git show-ref --verify --quiet refs/remotes/origin/$BRANCH_NAME; then + echo "Branch $BRANCH_NAME already exists. Skipping creation." 
+ else + echo "Creating new branch: $BRANCH_NAME" + # Create the new branch from main + git checkout -b $BRANCH_NAME origin/main + # Push the new branch + git push origin $BRANCH_NAME + echo "Successfully created and pushed branch: $BRANCH_NAME" + fi diff --git a/.github/workflows/docker-build-multiarch.yaml b/.github/workflows/docker-build-multiarch.yaml new file mode 100644 index 000000000000..4f182698f629 --- /dev/null +++ b/.github/workflows/docker-build-multiarch.yaml @@ -0,0 +1,134 @@ +name: Build Multi-Arch Docker Image + +on: + workflow_call: + inputs: + checkout_ref: + description: 'Git ref to checkout (branch, tag, SHA)' + type: string + required: true + tags: + description: 'Newline-separated docker/metadata-action format tags' + type: string + required: true + cache_scope_prefix: + description: 'Cache scope prefix to avoid collisions between build and release' + type: string + required: false + default: 'build' + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + build: + runs-on: ${{ matrix.runner }} + permissions: + contents: read + packages: write + strategy: + fail-fast: true + matrix: + include: + - platform: linux/amd64 + runner: ubuntu-latest + - platform: linux/arm64 + runner: ubuntu-24.04-arm + steps: + - name: Prepare + id: prep + run: | + echo "platform_pair=$(echo '${{ matrix.platform }}' | tr '/' '-')" >> $GITHUB_OUTPUT + echo "image_name=$(echo '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ inputs.checkout_ref }} + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (labels only) + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ steps.prep.outputs.image_name }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Build and push by digest + id: build + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . 
+ file: ./docker/Dockerfile.non_root + platforms: ${{ matrix.platform }} + labels: ${{ steps.meta.outputs.labels }} + outputs: type=image,name=${{ steps.prep.outputs.image_name }},push-by-digest=true,name-canonical=true,push=true + cache-from: type=gha,scope=${{ inputs.cache_scope_prefix }}-${{ steps.prep.outputs.platform_pair }} + cache-to: type=gha,mode=max,scope=${{ inputs.cache_scope_prefix }}-${{ steps.prep.outputs.platform_pair }} + + - name: Export digest + run: | + mkdir -p /tmp/digests + digest="${{ steps.build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: digests-${{ steps.prep.outputs.platform_pair }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + + merge: + runs-on: ubuntu-latest + needs: build + permissions: + contents: read + packages: write + steps: + - name: Prepare + id: prep + run: | + echo "image_name=$(echo '${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root' | tr '[:upper:]' '[:lower:]')" >> $GITHUB_OUTPUT + + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digests-* + merge-multiple: true + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Extract metadata for tags & labels + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ steps.prep.outputs.image_name }} + tags: | + ${{ inputs.tags }} + + - name: Create multi-arch manifest + working-directory: /tmp/digests + run: | + docker buildx imagetools create \ + $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ steps.prep.outputs.image_name }}@sha256:%s ' *) diff --git a/.github/workflows/ghcr_deploy.yml b/.github/workflows/ghcr_deploy.yml new file mode 100644 index 000000000000..f67538a42723 --- /dev/null +++ b/.github/workflows/ghcr_deploy.yml @@ -0,0 +1,434 @@ +# this workflow is triggered by an API call when there is a new PyPI release of LiteLLM +name: Build, Publish LiteLLM Docker Image. New Release +on: + workflow_dispatch: + inputs: + tag: + description: "The tag version you want to build" + required: true + release_type: + description: "The release type you want to build. Can be 'latest', 'stable', 'dev', 'rc'" + type: string + default: "latest" + commit_hash: + description: "Commit hash" + required: true + +# Defines two custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + CHART_NAME: litellm-helm + +# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. 
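
The "API call" referred to in this workflow's header comment is the same `workflow_dispatch` POST that the CircleCI job earlier in this diff sends after a nightly PyPI release. A hedged sketch of triggering it by hand is below; the tag, release_type, and commit hash values are illustrative placeholders, not taken from a real run:

    # Manually dispatch the GHCR release build (values are examples only).
    curl -X POST \
      -H "Accept: application/vnd.github.v3+json" \
      -H "Authorization: Bearer $GITHUB_TOKEN" \
      "https://api.github.com/repos/BerriAI/litellm/actions/workflows/ghcr_deploy.yml/dispatches" \
      -d '{"ref":"main", "inputs":{"tag":"v1.81.0-nightly", "release_type":"latest", "commit_hash":"<commit-sha>"}}'

If `release_type` is omitted it falls back to the workflow's default of `latest`.
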
+jobs: + # print commit hash, tag, and release type + print: + runs-on: ubuntu-latest + steps: + - run: | + echo "Commit hash: ${{ github.event.inputs.commit_hash }}" + echo "Tag: ${{ github.event.inputs.tag }}" + echo "Release type: ${{ github.event.inputs.release_type }}" + docker-hub-deploy: + if: github.repository == 'BerriAI/litellm' + runs-on: ubuntu-latest + steps: + - + name: Checkout + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + - + name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: litellm/litellm:${{ github.event.inputs.tag || 'latest' }} + - + name: Build and push litellm-database image + uses: docker/build-push-action@v5 + with: + context: . + push: true + file: ./docker/Dockerfile.database + tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }} + - + name: Build and push litellm-spend-logs image + uses: docker/build-push-action@v5 + with: + context: . + push: true + file: ./litellm-js/spend-logs/Dockerfile + tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }} + - + name: Build and push litellm-non_root image + uses: docker/build-push-action@v5 + with: + context: . + push: true + file: ./docker/Dockerfile.non_root + tags: litellm/litellm-non_root:${{ github.event.inputs.tag || 'latest' }} + build-and-push-image: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. 
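
To make the conditional `tags:` expression in the build step just below easier to read, here is roughly what it expands to for one concrete dispatch, assuming the metadata step (which declares no explicit `tags:` input) resolves to the default branch tag `ghcr.io/berriai/litellm:main`; the input values are illustrative:

    # Dispatch inputs: tag=v1.81.0, release_type=stable
    # Three distinct image tags are pushed by the base-image job (sketch):
    #   ghcr.io/berriai/litellm:main-v1.81.0   # <meta tag>-<tag input>; also produced by the stable/rc branch of the expression
    #   ghcr.io/berriai/litellm:main-stable    # <meta tag>-<release_type>; also the stable-only main-stable alias
    #   ghcr.io/berriai/litellm:v1.81.0        # bare version tag, emitted only for stable or rc releases
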
For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push Docker image + uses: docker/build-push-action@4976231911ebf5f32aad765192d35f942aa48cb8 + with: + context: . + push: true + tags: | + ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm:main-stable', env.REGISTRY) || '' }}, + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm:{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + labels: ${{ steps.meta.outputs.labels }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + + build-and-push-image-ee: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for EE Dockerfile + id: meta-ee + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-ee + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Build and push EE Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . 
+ file: Dockerfile + push: true + tags: | + ${{ steps.meta-ee.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta-ee.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm-ee:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-ee:main-stable', env.REGISTRY) || '' }} + labels: ${{ steps.meta-ee.outputs.labels }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + + build-and-push-image-database: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for database Dockerfile + id: meta-database + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-database + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Build and push Database Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . + file: ./docker/Dockerfile.database + push: true + tags: | + ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm-database:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-database:main-stable', env.REGISTRY) || '' }} + labels: ${{ steps.meta-database.outputs.labels }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + + build-and-push-image-non_root: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for non_root Dockerfile + id: meta-non_root + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-non_root + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Build and push non_root Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . 
+ file: ./docker/Dockerfile.non_root + push: true + tags: | + ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta-non_root.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm-non_root:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-non_root:main-stable', env.REGISTRY) || '' }} + labels: ${{ steps.meta-non_root.outputs.labels }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + + build-and-push-image-spend-logs: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.commit_hash }} + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for spend-logs Dockerfile + id: meta-spend-logs + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-spend_logs + # Configure multi platform Docker builds + - name: Set up QEMU + uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345 + + - name: Build and push Database Docker image + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . + file: ./litellm-js/spend-logs/Dockerfile + push: true + tags: | + ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, + ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.release_type }} + ${{ (github.event.inputs.release_type == 'stable' || github.event.inputs.release_type == 'rc') && format('{0}/berriai/litellm-spend_logs:main-{1}', env.REGISTRY, github.event.inputs.tag) || '' }}, + ${{ github.event.inputs.release_type == 'stable' && format('{0}/berriai/litellm-spend_logs:main-stable', env.REGISTRY) || '' }} + platforms: local,linux/amd64,linux/arm64,linux/arm64/v8 + + build-and-push-helm-chart: + if: github.event.inputs.release_type != 'dev' + needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database] + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: lowercase github.repository_owner + run: | + echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV} + + # Sync Helm chart version with LiteLLM release version (1-1 versioning) + # This allows users to easily map Helm chart versions to LiteLLM versions + # See: https://codefresh.io/docs/docs/ci-cd-guides/helm-best-practices/ + - name: Calculate chart and app versions + id: chart_version + shell: bash + run: | + INPUT_TAG="${{ github.event.inputs.tag }}" + RELEASE_TYPE="${{ github.event.inputs.release_type }}" + + # Chart version = LiteLLM version without 'v' prefix (Helm semver convention) + # v1.81.0 -> 1.81.0, 
v1.81.0.rc.1 -> 1.81.0.rc.1 + CHART_VERSION="${INPUT_TAG#v}" + + # Add suffix for 'latest' releases (rc already has suffix in tag) + if [ "$RELEASE_TYPE" = "latest" ]; then + CHART_VERSION="${CHART_VERSION}-latest" + fi + + # App version = Docker tag (keeps 'v' prefix to match Docker image tags) + APP_VERSION="${INPUT_TAG}" + + echo "version=${CHART_VERSION}" | tee -a $GITHUB_OUTPUT + echo "app_version=${APP_VERSION}" | tee -a $GITHUB_OUTPUT + + - uses: ./.github/actions/helm-oci-chart-releaser + with: + name: ${{ env.CHART_NAME }} + repository: ${{ env.REPO_OWNER }} + tag: ${{ steps.chart_version.outputs.version }} + app_version: ${{ steps.chart_version.outputs.app_version }} + path: deploy/charts/${{ env.CHART_NAME }} + registry: ${{ env.REGISTRY }} + registry_username: ${{ github.actor }} + registry_password: ${{ secrets.GITHUB_TOKEN }} + update_dependencies: true + + release: + name: "New LiteLLM Release" + needs: [docker-hub-deploy, build-and-push-image, build-and-push-image-database] + + runs-on: "ubuntu-latest" + + steps: + - name: Display version + run: echo "Current version is ${{ github.event.inputs.tag }}" + - name: "Set Release Tag" + run: echo "RELEASE_TAG=${{ github.event.inputs.tag }}" >> $GITHUB_ENV + - name: Display release tag + run: echo "RELEASE_TAG is $RELEASE_TAG" + - name: "Create release" + uses: "actions/github-script@v6" + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + const commitHash = "${{ github.event.inputs.commit_hash}}"; + console.log("Commit Hash:", commitHash); // Add this line for debugging + try { + const response = await github.rest.repos.createRelease({ + draft: false, + generate_release_notes: true, + target_commitish: commitHash, + name: process.env.RELEASE_TAG, + owner: context.repo.owner, + prerelease: false, + repo: context.repo.repo, + tag_name: process.env.RELEASE_TAG, + }); + + core.exportVariable('RELEASE_ID', response.data.id); + core.exportVariable('RELEASE_UPLOAD_URL', response.data.upload_url); + } catch (error) { + core.setFailed(error.message); + } + - name: Fetch Release Notes + id: release-notes + uses: actions/github-script@v6 + with: + github-token: "${{ secrets.GITHUB_TOKEN }}" + script: | + try { + const response = await github.rest.repos.getRelease({ + owner: context.repo.owner, + repo: context.repo.repo, + release_id: process.env.RELEASE_ID, + }); + const formattedBody = JSON.stringify(response.data.body).slice(1, -1); + return formattedBody; + } catch (error) { + core.setFailed(error.message); + } + env: + RELEASE_ID: ${{ env.RELEASE_ID }} + - name: Github Releases To Discord + env: + WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }} + REALEASE_TAG: ${{ env.RELEASE_TAG }} + RELEASE_NOTES: ${{ steps.release-notes.outputs.result }} + run: | + curl -H "Content-Type: application/json" -X POST -d '{ + "content": "New LiteLLM release '"${RELEASE_TAG}"'", + "username": "Release Changelog", + "avatar_url": "https://cdn.discordapp.com/avatars/487431320314576937/bd64361e4ba6313d561d54e78c9e7171.png", + "embeds": [ + { + "title": "Changelog for LiteLLM '"${RELEASE_TAG}"'", + "description": "'"${RELEASE_NOTES}"'", + "color": 2105893 + } + ] + }' $WEBHOOK_URL + diff --git a/.github/workflows/ghcr_helm_deploy.yml.txt b/.github/workflows/ghcr_helm_deploy.yml.txt index 2e4ae69da637..21b2eaafe194 100644 --- a/.github/workflows/ghcr_helm_deploy.yml.txt +++ b/.github/workflows/ghcr_helm_deploy.yml.txt @@ -1,10 +1,12 @@ -# this workflow is triggered by an API call when there is a new PyPI release of LiteLLM +# Standalone workflow to 
publish LiteLLM Helm Chart +# Note: The main ghcr_deploy.yml workflow also publishes the Helm chart as part of a full release name: Build, Publish LiteLLM Helm Chart. New Release -on: {} - # workflow_dispatch: - # inputs: - # chartVersion: - # description: "Update the helm chart's version to this" +on: + workflow_dispatch: + inputs: + tag: + description: "LiteLLM version tag (e.g., v1.81.0)" + required: true # Defines two custom environment variables for the workflow. Used for the Container registry domain, and a name for the Docker image that this workflow builds. env: @@ -31,24 +33,22 @@ jobs: run: | echo "REPO_OWNER=`echo ${{github.repository_owner}} | tr '[:upper:]' '[:lower:]'`" >>${GITHUB_ENV} - - name: Get LiteLLM Latest Tag - id: current_app_tag - uses: WyriHaximus/github-action-get-previous-tag@v1.3.0 - - - name: Get last published chart version - id: current_version + # Sync Helm chart version with LiteLLM release version (1-1 versioning) + - name: Calculate chart and app versions + id: chart_version shell: bash - run: helm show chart oci://${{ env.REGISTRY }}/${{ env.REPO_OWNER }}/litellm-helm | grep '^version:' | awk 'BEGIN{FS=":"}{print "current-version="$2}' | tr -d " " | tee -a $GITHUB_OUTPUT - env: - HELM_EXPERIMENTAL_OCI: '1' + run: | + INPUT_TAG="${{ github.event.inputs.tag }}" - # Automatically update the helm chart version one "patch" level - - name: Bump release version - id: bump_version - uses: christian-draeger/increment-semantic-version@1.1.0 - with: - current-version: ${{ steps.current_version.outputs.current-version || '0.1.0' }} - version-fragment: 'bug' + # Chart version = LiteLLM version without 'v' prefix + # v1.81.0 -> 1.81.0 + CHART_VERSION="${INPUT_TAG#v}" + + # App version = Docker tag (keeps 'v' prefix) + APP_VERSION="${INPUT_TAG}" + + echo "version=${CHART_VERSION}" | tee -a $GITHUB_OUTPUT + echo "app_version=${APP_VERSION}" | tee -a $GITHUB_OUTPUT - name: Lint helm chart run: helm lint deploy/charts/litellm-helm @@ -57,8 +57,8 @@ jobs: with: name: litellm-helm repository: ${{ env.REPO_OWNER }} - tag: ${{ github.event.inputs.chartVersion || steps.bump_version.outputs.next-version || '0.1.0' }} - app_version: ${{ steps.current_app_tag.outputs.tag || 'latest' }} + tag: ${{ steps.chart_version.outputs.version }} + app_version: ${{ steps.chart_version.outputs.app_version }} path: deploy/charts/litellm-helm registry: ${{ env.REGISTRY }} registry_username: ${{ github.actor }} diff --git a/.github/workflows/issue-keyword-labeler.yml b/.github/workflows/issue-keyword-labeler.yml index 60c18e3b9af3..936f90f747fe 100644 --- a/.github/workflows/issue-keyword-labeler.yml +++ b/.github/workflows/issue-keyword-labeler.yml @@ -19,7 +19,7 @@ jobs: id: scan env: PROVIDER_ISSUE_WEBHOOK_URL: ${{ secrets.PROVIDER_ISSUE_WEBHOOK_URL }} - KEYWORDS: azure,openai,bedrock,vertexai,vertex ai,anthropic + KEYWORDS: azure,openai,bedrock,vertexai,vertex ai,anthropic,gemini,cohere,mistral,groq,ollama,deepseek run: python3 .github/scripts/scan_keywords.py - name: Ensure label exists diff --git a/.github/workflows/label-component.yml b/.github/workflows/label-component.yml new file mode 100644 index 000000000000..fd079fce6c10 --- /dev/null +++ b/.github/workflows/label-component.yml @@ -0,0 +1,116 @@ +name: Label Component Issues + +on: + issues: + types: + - opened + +jobs: + add-component-label: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Add component labels + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | 
+ const body = context.payload.issue.body; + if (!body) return; + + // Define component mappings with regex patterns that handle flexible whitespace + const components = [ + { + pattern: /What part of LiteLLM is this about\?\s*SDK \(litellm Python package\)/, + label: 'sdk', + color: '0E7C86', + description: 'Issues related to the litellm Python SDK' + }, + { + pattern: /What part of LiteLLM is this about\?\s*Proxy/, + label: 'proxy', + color: '5319E7', + description: 'Issues related to the LiteLLM Proxy' + }, + { + pattern: /What part of LiteLLM is this about\?\s*UI Dashboard/, + label: 'ui-dashboard', + color: 'D876E3', + description: 'Issues related to the LiteLLM UI Dashboard' + }, + { + pattern: /What part of LiteLLM is this about\?\s*Docs/, + label: 'docs', + color: 'FBCA04', + description: 'Issues related to LiteLLM documentation' + } + ]; + + // Find matching component + for (const component of components) { + if (component.pattern.test(body)) { + // Ensure label exists + try { + await github.rest.issues.getLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: component.label + }); + } catch (error) { + if (error.status === 404) { + await github.rest.issues.createLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: component.label, + color: component.color, + description: component.description + }); + } + } + + // Add label to issue + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: [component.label] + }); + + break; + } + } + + // Check for 'claude code' keyword (can be applied alongside component labels) + if (/claude code/i.test(body)) { + const claudeLabel = { + name: 'claude code', + color: '7c3aed', + description: 'Issues related to Claude Code usage' + }; + + try { + await github.rest.issues.getLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: claudeLabel.name + }); + } catch (error) { + if (error.status === 404) { + await github.rest.issues.createLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + name: claudeLabel.name, + color: claudeLabel.color, + description: claudeLabel.description + }); + } + } + + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: [claudeLabel.name] + }); + } diff --git a/.github/workflows/label-mlops.yml b/.github/workflows/label-mlops.yml deleted file mode 100644 index 37789c1ea76d..000000000000 --- a/.github/workflows/label-mlops.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Label ML Ops Team Issues - -on: - issues: - types: - - opened - -jobs: - add-mlops-label: - runs-on: ubuntu-latest - steps: - - name: Check if ML Ops Team is selected - uses: actions-ecosystem/action-add-labels@v1 - if: contains(github.event.issue.body, '### Are you a ML Ops Team?') && contains(github.event.issue.body, 'Yes') - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - labels: "mlops user request" diff --git a/.github/workflows/publish-migrations.yml b/.github/workflows/publish-migrations.yml index 8e5a67bcf854..a5187cb2f558 100644 --- a/.github/workflows/publish-migrations.yml +++ b/.github/workflows/publish-migrations.yml @@ -13,6 +13,7 @@ on: jobs: publish-migrations: + if: github.repository == 'BerriAI/litellm' runs-on: ubuntu-latest services: postgres: diff --git a/.github/workflows/test-linting.yml b/.github/workflows/test-linting.yml index c9ba0bc747ab..7c5c269f8993 100644 --- a/.github/workflows/test-linting.yml 
+++ b/.github/workflows/test-linting.yml @@ -4,11 +4,6 @@ on: pull_request: branches: [ main ] -# Cancel previous runs on the same PR -concurrency: - group: linting-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - jobs: lint: runs-on: ubuntu-latest @@ -35,6 +30,7 @@ jobs: - name: Install dependencies run: | + poetry lock poetry install --with dev poetry run pip install openai==1.100.1 @@ -77,4 +73,4 @@ jobs: - name: Check import safety run: | - poetry run python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) \ No newline at end of file + poetry run python -c "from litellm import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1) diff --git a/.github/workflows/test-litellm.yml b/.github/workflows/test-litellm.yml index 263682e06067..d9cf2e74a11f 100644 --- a/.github/workflows/test-litellm.yml +++ b/.github/workflows/test-litellm.yml @@ -2,12 +2,7 @@ name: LiteLLM Mock Tests (folder - tests/test_litellm) on: pull_request: - branches: [ main, carto/main ] - -# Cancel previous runs on the same PR -concurrency: - group: litellm-tests-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true + branches: [ main ] jobs: test: @@ -32,17 +27,19 @@ jobs: - name: Install dependencies run: | + poetry lock poetry install --with dev,proxy-dev --extras "proxy semantic-router" poetry run pip install "pytest-retry==1.6.3" poetry run pip install pytest-xdist poetry run pip install "google-genai==1.22.0" poetry run pip install "google-cloud-aiplatform>=1.38" poetry run pip install "fastapi-offline==1.7.3" - poetry run pip install "python-multipart==0.0.18" + poetry run pip install "python-multipart==0.0.22" + poetry run pip install "openapi-core" - name: Setup litellm-enterprise as local package run: | cd enterprise - python -m pip install -e . + poetry run pip install -e . cd .. 
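
For contributors who want to reproduce this mock-test job locally before pushing, a rough equivalent of the setup above is sketched below. The pytest target is inferred from the workflow's name ("folder - tests/test_litellm"), since the actual `Run tests` command sits outside this hunk:

    # Approximate the "LiteLLM Mock Tests" job environment locally (sketch).
    poetry lock
    poetry install --with dev,proxy-dev --extras "proxy semantic-router"
    poetry run pip install "pytest-retry==1.6.3" pytest-xdist
    # plus the pinned extras from the step above (google-genai, google-cloud-aiplatform,
    # fastapi-offline, python-multipart, openapi-core)
    poetry run pip install -e ./enterprise        # same effect as: cd enterprise && poetry run pip install -e .
    poetry run pytest tests/test_litellm -n auto  # target inferred from the workflow name
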
- name: Run tests run: | diff --git a/.github/workflows/test-mcp.yml b/.github/workflows/test-mcp.yml index 2da6980951a3..e19e67c9c4f0 100644 --- a/.github/workflows/test-mcp.yml +++ b/.github/workflows/test-mcp.yml @@ -27,14 +27,15 @@ jobs: - name: Install dependencies run: | + poetry lock poetry install --with dev,proxy-dev --extras "proxy semantic-router" poetry run pip install "pytest==7.3.1" poetry run pip install "pytest-retry==1.6.3" poetry run pip install "pytest-cov==5.0.0" poetry run pip install "pytest-asyncio==0.21.1" poetry run pip install "respx==0.22.0" - poetry run pip install "pydantic==2.10.2" - poetry run pip install "mcp==1.10.1" + poetry run pip install "pydantic==2.11.0" + poetry run pip install "mcp==1.25.0" poetry run pip install pytest-xdist - name: Setup litellm-enterprise as local package diff --git a/.github/workflows/test-model-map.yaml b/.github/workflows/test-model-map.yaml new file mode 100644 index 000000000000..ae5ac402e234 --- /dev/null +++ b/.github/workflows/test-model-map.yaml @@ -0,0 +1,15 @@ +name: Validate model_prices_and_context_window.json + +on: + pull_request: + branches: [ main ] + +jobs: + validate-model-prices-json: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Validate model_prices_and_context_window.json + run: | + jq empty model_prices_and_context_window.json diff --git a/.gitignore b/.gitignore index 8480465915be..ddf5f6279b34 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .python-version .venv +.venv_policy_test .env .newenv newenv/* @@ -8,10 +9,6 @@ litellm_uuid.txt __pycache__/ *.pyc bun.lockb -# Build artifacts -dist/ -build/ -*.egg-info/ **/.DS_Store .aider* litellm_results.jsonl @@ -63,9 +60,6 @@ litellm/proxy/_super_secret_config.yaml litellm/proxy/myenv/bin/activate litellm/proxy/myenv/bin/Activate.ps1 myenv/* -litellm/proxy/_experimental/out/404/index.html -litellm/proxy/_experimental/out/model_hub/index.html -litellm/proxy/_experimental/out/onboarding/index.html litellm/tests/log.txt litellm/tests/langfuse.log litellm/tests/langfuse.log @@ -78,9 +72,6 @@ tests/local_testing/log.txt litellm/proxy/_new_new_secret_config.yaml litellm/proxy/custom_guardrail.py .mypy_cache/* -litellm/proxy/_experimental/out/404.html -litellm/proxy/_experimental/out/404.html -litellm/proxy/_experimental/out/model_hub.html .mypy_cache/* litellm/proxy/application.log tests/llm_translation/vertex_test_account.json @@ -102,5 +93,10 @@ litellm_config.yaml litellm/proxy/to_delete_loadtest_work/* update_model_cost_map.py tests/test_litellm/proxy/_experimental/mcp_server/test_mcp_server_manager.py -litellm/proxy/_experimental/out/guardrails/index.html scripts/test_vertex_ai_search.py +LAZY_LOADING_IMPROVEMENTS.md +STABILIZATION_TODO.md +**/test-results +**/playwright-report +**/*.storageState.json +**/coverage \ No newline at end of file diff --git a/.trivyignore b/.trivyignore new file mode 100644 index 000000000000..0d04ecacdb56 --- /dev/null +++ b/.trivyignore @@ -0,0 +1,12 @@ +# LiteLLM Trivy Ignore File +# CVEs listed here are temporarily allowlisted pending fixes + +# Next.js vulnerabilities in UI dashboard (next@14.2.35) +# Allowlisted: 2026-01-31, 7-day fix timeline +# Fix: Upgrade to Next.js 15.5.10+ or 16.1.5+ + +# HIGH: DoS via request deserialization +GHSA-h25m-26qc-wcjf + +# MEDIUM: Image Optimizer DoS +CVE-2025-59471 diff --git a/AGENTS.md b/AGENTS.md index 8e7b5f2bd2ef..5a48049ef451 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -49,6 +49,29 @@ LiteLLM is a unified interface for 100+ LLMs that: - Test 
provider-specific functionality thoroughly - Consider adding load tests for performance-critical changes +### MAKING CODE CHANGES FOR THE UI (IGNORE FOR BACKEND) + +1. **Tremor is DEPRECATED, do not use Tremor components in new features/changes** + - The only exception is the Tremor Table component and its required Tremor Table sub components. + +2. **Use Common Components as much as possible**: + - These are usually defined in the `common_components` directory + - Use these components as much as possible and avoid building new components unless needed + +3. **Testing**: + - The codebase uses **Vitest** and **React Testing Library** + - **Query Priority Order**: Use query methods in this order: `getByRole`, `getByLabelText`, `getByPlaceholderText`, `getByText`, `getByTestId` + - **Always use `screen`** instead of destructuring from `render()` (e.g., use `screen.getByText()` not `getByText`) + - **Wrap user interactions in `act()`**: Always wrap `fireEvent` calls with `act()` to ensure React state updates are properly handled + - **Use `query` methods for absence checks**: Use `queryBy*` methods (not `getBy*`) when expecting an element to NOT be present + - **Test names must start with "should"**: All test names should follow the pattern `it("should ...")` + - **Mock external dependencies**: Check `setupTests.ts` for global mocks and mock child components/networking calls as needed + - **Structure tests properly**: + - First test should verify the component renders successfully + - Subsequent tests should focus on functionality and user interactions + - Use `waitFor` for async operations that aren't already awaited + - **Avoid using `querySelector`**: Prefer React Testing Library queries over direct DOM manipulation + ### IMPORTANT PATTERNS 1. **Function/Tool Calling**: @@ -94,6 +117,29 @@ LiteLLM supports MCP for agent workflows: - Support for external MCP servers (Zapier, Jira, Linear, etc.) - See `litellm/experimental_mcp_client/` and `litellm/proxy/_experimental/mcp_server/` +## RUNNING SCRIPTS + +Use `poetry run python script.py` to run Python scripts in the project environment (for non-test files). + +## GITHUB TEMPLATES + +When opening issues or pull requests, follow these templates: + +### Bug Reports (`.github/ISSUE_TEMPLATE/bug_report.yml`) +- Describe what happened vs. expected behavior +- Include relevant log output +- Specify LiteLLM version +- Indicate if you're part of an ML Ops team (helps with prioritization) + +### Feature Requests (`.github/ISSUE_TEMPLATE/feature_request.yml`) +- Clearly describe the feature +- Explain motivation and use case with concrete examples + +### Pull Requests (`.github/pull_request_template.md`) +- Add at least 1 test in `tests/litellm/` +- Ensure `make test-unit` passes + + ## TESTING CONSIDERATIONS 1. **Provider Tests**: Test against real provider APIs when possible diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 000000000000..c114a838d6d2 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,398 @@ +# LiteLLM Architecture - LiteLLM SDK + AI Gateway + +This document helps contributors understand where to make changes in LiteLLM. 
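+
+As a quick orientation (a minimal sketch mirroring the README quick starts in this PR; the model name, key, and proxy URL are placeholders), these are the two ways callers reach LiteLLM:
+
+```python
+# 1) LiteLLM SDK (litellm/): call a provider directly from Python.
+from litellm import completion
+
+response = completion(
+    model="openai/gpt-4o",  # "<provider>/<model>", resolved by get_llm_provider()
+    messages=[{"role": "user", "content": "Hello!"}],
+)
+
+# 2) LiteLLM AI Gateway (proxy/): point any OpenAI-compatible client at the proxy.
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
+response = client.chat.completions.create(
+    model="gpt-4o",
+    messages=[{"role": "user", "content": "Hello!"}],
+)
+```
+
+Both paths converge on the same SDK code path described in the sections below.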
+ +--- + +## How It Works + +The LiteLLM AI Gateway (Proxy) uses the LiteLLM SDK internally for all LLM calls: + +``` +OpenAI SDK (client) ──▶ LiteLLM AI Gateway (proxy/) ──▶ LiteLLM SDK (litellm/) ──▶ LLM API +Anthropic SDK (client) ──▶ LiteLLMAI Gateway (proxy/) ──▶ LiteLLM SDK (litellm/) ──▶ LLM API +Any HTTP client ──▶ LiteLLMAI Gateway (proxy/) ──▶ LiteLLM SDK (litellm/) ──▶ LLM API +``` + +The **AI Gateway** adds authentication, rate limiting, budgets, and routing on top of the SDK. +The **SDK** handles the actual LLM provider calls, request/response transformations, and streaming. + +--- + +## 1. AI Gateway (Proxy) Request Flow + +The AI Gateway (`litellm/proxy/`) wraps the SDK with authentication, rate limiting, and management features. + +```mermaid +sequenceDiagram + participant Client + participant ProxyServer as proxy/proxy_server.py + participant Auth as proxy/auth/user_api_key_auth.py + participant Redis as Redis Cache + participant Hooks as proxy/hooks/ + participant Router as router.py + participant Main as main.py + utils.py + participant Handler as llms/custom_httpx/llm_http_handler.py + participant Transform as llms/{provider}/chat/transformation.py + participant Provider as LLM Provider API + participant CostCalc as cost_calculator.py + participant LoggingObj as litellm_logging.py + participant DBWriter as db/db_spend_update_writer.py + participant Postgres as PostgreSQL + + %% Request Flow + Client->>ProxyServer: POST /v1/chat/completions + ProxyServer->>Auth: user_api_key_auth() + Auth->>Redis: Check API key cache + Redis-->>Auth: Key info + spend limits + ProxyServer->>Hooks: max_budget_limiter, parallel_request_limiter + Hooks->>Redis: Check/increment rate limit counters + ProxyServer->>Router: route_request() + Router->>Main: litellm.acompletion() + Main->>Handler: BaseLLMHTTPHandler.completion() + Handler->>Transform: ProviderConfig.transform_request() + Handler->>Provider: HTTP Request + Provider-->>Handler: Response + Handler->>Transform: ProviderConfig.transform_response() + Transform-->>Handler: ModelResponse + Handler-->>Main: ModelResponse + + %% Cost Attribution (in utils.py wrapper) + Main->>LoggingObj: update_response_metadata() + LoggingObj->>CostCalc: _response_cost_calculator() + CostCalc->>CostCalc: completion_cost(tokens × price) + CostCalc-->>LoggingObj: response_cost + LoggingObj-->>Main: Set response._hidden_params["response_cost"] + Main-->>ProxyServer: ModelResponse (with cost in _hidden_params) + + %% Response Headers + Async Logging + ProxyServer->>ProxyServer: Extract cost from hidden_params + ProxyServer->>LoggingObj: async_success_handler() + LoggingObj->>Hooks: async_log_success_event() + Hooks->>DBWriter: update_database(response_cost) + DBWriter->>Redis: Queue spend increment + DBWriter->>Postgres: Batch write spend logs (async) + ProxyServer-->>Client: ModelResponse + x-litellm-response-cost header +``` + +### Proxy Components + +```mermaid +graph TD + subgraph "Incoming Request" + Client["POST /v1/chat/completions"] + end + + subgraph "proxy/proxy_server.py" + Endpoint["chat_completion()"] + end + + subgraph "proxy/auth/" + Auth["user_api_key_auth()"] + end + + subgraph "proxy/" + PreCall["litellm_pre_call_utils.py"] + RouteRequest["route_llm_request.py"] + end + + subgraph "litellm/" + Router["router.py"] + Main["main.py"] + end + + subgraph "Infrastructure" + DualCache["DualCache
(in-memory + Redis)"] + Postgres["PostgreSQL
(keys, teams, spend logs)"] + end + + Client --> Endpoint + Endpoint --> Auth + Auth --> DualCache + DualCache -.->|cache miss| Postgres + Auth --> PreCall + PreCall --> RouteRequest + RouteRequest --> Router + Router --> DualCache + Router --> Main + Main --> Client +``` + +**Key proxy files:** +- `proxy/proxy_server.py` - Main API endpoints +- `proxy/auth/` - Authentication (API keys, JWT, OAuth2) +- `proxy/hooks/` - Proxy-level callbacks +- `router.py` - Load balancing, fallbacks +- `router_strategy/` - Routing algorithms (`lowest_latency.py`, `simple_shuffle.py`, etc.) + +**LLM-specific proxy endpoints:** + +| Endpoint | Directory | Purpose | +|----------|-----------|---------| +| `/v1/messages` | `proxy/anthropic_endpoints/` | Anthropic Messages API | +| `/vertex-ai/*` | `proxy/vertex_ai_endpoints/` | Vertex AI passthrough | +| `/gemini/*` | `proxy/google_endpoints/` | Google AI Studio passthrough | +| `/v1/images/*` | `proxy/image_endpoints/` | Image generation | +| `/v1/batches` | `proxy/batches_endpoints/` | Batch processing | +| `/v1/files` | `proxy/openai_files_endpoints/` | File uploads | +| `/v1/fine_tuning` | `proxy/fine_tuning_endpoints/` | Fine-tuning jobs | +| `/v1/rerank` | `proxy/rerank_endpoints/` | Reranking | +| `/v1/responses` | `proxy/response_api_endpoints/` | OpenAI Responses API | +| `/v1/vector_stores` | `proxy/vector_store_endpoints/` | Vector stores | +| `/*` (passthrough) | `proxy/pass_through_endpoints/` | Direct provider passthrough | + +**Proxy Hooks** (`proxy/hooks/__init__.py`): + +| Hook | File | Purpose | +|------|------|---------| +| `max_budget_limiter` | `proxy/hooks/max_budget_limiter.py` | Enforce budget limits | +| `parallel_request_limiter` | `proxy/hooks/parallel_request_limiter_v3.py` | Rate limiting per key/user | +| `cache_control_check` | `proxy/hooks/cache_control_check.py` | Cache validation | +| `responses_id_security` | `proxy/hooks/responses_id_security.py` | Response ID validation | +| `litellm_skills` | `proxy/hooks/skills_injection.py` | Skills injection | + +To add a new proxy hook, implement `CustomLogger` and register in `PROXY_HOOKS`. + +### Infrastructure Components + +The AI Gateway uses external infrastructure for persistence and caching: + +```mermaid +graph LR + subgraph "AI Gateway (proxy/)" + Proxy["proxy_server.py"] + Auth["auth/user_api_key_auth.py"] + DBWriter["db/db_spend_update_writer.py
DBSpendUpdateWriter"]
+        InternalCache["utils.py<br/>InternalUsageCache"]
+        CostCallback["hooks/proxy_track_cost_callback.py<br/>_ProxyDBLogger"]
+        Scheduler["APScheduler<br/>ProxyStartupEvent"]
+    end
+
+    subgraph "SDK (litellm/)"
+        Router["router.py<br/>Router.cache (DualCache)"]
+        LLMCache["caching/caching_handler.py<br/>LLMCachingHandler"]
+        CacheClass["caching/caching.py
Cache"] + end + + subgraph "Redis (caching/redis_cache.py)" + RateLimit["Rate Limit Counters"] + SpendQueue["Spend Increment Queue"] + KeyCache["API Key Cache"] + TPM_RPM["TPM/RPM Tracking"] + Cooldowns["Deployment Cooldowns"] + LLMResponseCache["LLM Response Cache"] + end + + subgraph "PostgreSQL (proxy/schema.prisma)" + Keys["LiteLLM_VerificationToken"] + Teams["LiteLLM_TeamTable"] + SpendLogs["LiteLLM_SpendLogs"] + Users["LiteLLM_UserTable"] + end + + Auth --> InternalCache + InternalCache --> KeyCache + InternalCache -.->|cache miss| Keys + InternalCache --> RateLimit + Router --> TPM_RPM + Router --> Cooldowns + LLMCache --> CacheClass + CacheClass --> LLMResponseCache + CostCallback --> DBWriter + DBWriter --> SpendQueue + DBWriter --> SpendLogs + Scheduler --> SpendLogs + Scheduler --> Keys +``` + +| Component | Purpose | Key Files/Classes | +|-----------|---------|-------------------| +| **Redis** | Rate limiting, API key caching, TPM/RPM tracking, cooldowns, LLM response caching, spend queuing | `caching/redis_cache.py` (`RedisCache`), `caching/dual_cache.py` (`DualCache`) | +| **PostgreSQL** | API keys, teams, users, spend logs | `proxy/utils.py` (`PrismaClient`), `proxy/schema.prisma` | +| **InternalUsageCache** | Proxy-level cache for rate limits + API keys (in-memory + Redis) | `proxy/utils.py` (`InternalUsageCache`) | +| **Router.cache** | TPM/RPM tracking, deployment cooldowns, client caching (in-memory + Redis) | `router.py` (`Router.cache: DualCache`) | +| **LLMCachingHandler** | SDK-level LLM response/embedding caching | `caching/caching_handler.py` (`LLMCachingHandler`), `caching/caching.py` (`Cache`) | +| **DBSpendUpdateWriter** | Batches spend updates to reduce DB writes | `proxy/db/db_spend_update_writer.py` (`DBSpendUpdateWriter`) | +| **Cost Tracking** | Calculates and logs response costs | `proxy/hooks/proxy_track_cost_callback.py` (`_ProxyDBLogger`) | + +**Background Jobs** (APScheduler, initialized in `proxy/proxy_server.py` → `ProxyStartupEvent.initialize_scheduled_background_jobs()`): + +| Job | Interval | Purpose | Key Files | +|-----|----------|---------|-----------| +| `update_spend` | 60s | Batch write spend logs to PostgreSQL | `proxy/db/db_spend_update_writer.py` | +| `reset_budget` | 10-12min | Reset budgets for keys/users/teams | `proxy/management_helpers/budget_reset_job.py` | +| `add_deployment` | 10s | Sync new model deployments from DB | `proxy/proxy_server.py` (`ProxyConfig`) | +| `cleanup_old_spend_logs` | cron/interval | Delete old spend logs | `proxy/management_helpers/spend_log_cleanup.py` | +| `check_batch_cost` | 30min | Calculate costs for batch jobs | `proxy/management_helpers/check_batch_cost_job.py` | +| `check_responses_cost` | 30min | Calculate costs for responses API | `proxy/management_helpers/check_responses_cost_job.py` | +| `process_rotations` | 1hr | Auto-rotate API keys | `proxy/management_helpers/key_rotation_manager.py` | +| `_run_background_health_check` | continuous | Health check model deployments | `proxy/proxy_server.py` | +| `send_weekly_spend_report` | weekly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | +| `send_monthly_spend_report` | monthly | Slack spend alerts | `proxy/utils.py` (`SlackAlerting`) | + +**Cost Attribution Flow:** +1. LLM response returns to `utils.py` wrapper after `litellm.acompletion()` completes +2. `update_response_metadata()` (`llm_response_utils/response_metadata.py`) is called +3. 
`logging_obj._response_cost_calculator()` (`litellm_logging.py`) calculates cost via `litellm.completion_cost()` (`cost_calculator.py`) +4. Cost is stored in `response._hidden_params["response_cost"]` +5. `proxy/common_request_processing.py` extracts cost from `hidden_params` and adds to response headers (`x-litellm-response-cost`) +6. `logging_obj.async_success_handler()` triggers callbacks including `_ProxyDBLogger.async_log_success_event()` +7. `DBSpendUpdateWriter.update_database()` queues spend increments to Redis +8. Background job `update_spend` flushes queued spend to PostgreSQL every 60s + +--- + +## 2. SDK Request Flow + +The SDK (`litellm/`) provides the core LLM calling functionality used by both direct SDK users and the AI Gateway. + +```mermaid +graph TD + subgraph "SDK Entry Points" + Completion["litellm.completion()"] + Messages["litellm.messages()"] + end + + subgraph "main.py" + Main["completion()
acompletion()"]
+    end
+
+    subgraph "utils.py"
+        GetProvider["get_llm_provider()"]
+    end
+
+    subgraph "llms/custom_httpx/"
+        Handler["llm_http_handler.py<br/>BaseLLMHTTPHandler"]
+        HTTP["http_handler.py<br/>HTTPHandler / AsyncHTTPHandler"]
+    end
+
+    subgraph "llms/{provider}/chat/"
+        TransformReq["transform_request()"]
+        TransformResp["transform_response()"]
+    end
+
+    subgraph "litellm_core_utils/"
+        Streaming["streaming_handler.py"]
+    end
+
+    subgraph "integrations/ (async, off main thread)"
+        Callbacks["custom_logger.py
Langfuse, Datadog, etc."] + end + + Completion --> Main + Messages --> Main + Main --> GetProvider + GetProvider --> Handler + Handler --> TransformReq + TransformReq --> HTTP + HTTP --> Provider["LLM Provider API"] + Provider --> HTTP + HTTP --> TransformResp + TransformResp --> Streaming + Streaming --> Response["ModelResponse"] + Response -.->|async| Callbacks +``` + +**Key SDK files:** +- `main.py` - Entry points: `completion()`, `acompletion()`, `embedding()` +- `utils.py` - `get_llm_provider()` resolves model → provider +- `llms/custom_httpx/llm_http_handler.py` - Central HTTP orchestrator +- `llms/custom_httpx/http_handler.py` - Low-level HTTP client +- `llms/{provider}/chat/transformation.py` - Provider-specific transformations +- `litellm_core_utils/streaming_handler.py` - Streaming response handling +- `integrations/` - Async callbacks (Langfuse, Datadog, etc.) + +--- + +## 3. Translation Layer + +When a request comes in, it goes through a **translation layer** that converts between API formats. +Each translation is isolated in its own file, making it easy to test and modify independently. + +### Where to find translations + +| Incoming API | Provider | Translation File | +|--------------|----------|------------------| +| `/v1/chat/completions` | Anthropic | `llms/anthropic/chat/transformation.py` | +| `/v1/chat/completions` | Bedrock Converse | `llms/bedrock/chat/converse_transformation.py` | +| `/v1/chat/completions` | Bedrock Invoke | `llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py` | +| `/v1/chat/completions` | Gemini | `llms/gemini/chat/transformation.py` | +| `/v1/chat/completions` | Vertex AI | `llms/vertex_ai/gemini/transformation.py` | +| `/v1/chat/completions` | OpenAI | `llms/openai/chat/gpt_transformation.py` | +| `/v1/messages` (passthrough) | Anthropic | `llms/anthropic/experimental_pass_through/messages/transformation.py` | +| `/v1/messages` (passthrough) | Bedrock | `llms/bedrock/messages/invoke_transformations/anthropic_claude3_transformation.py` | +| `/v1/messages` (passthrough) | Vertex AI | `llms/vertex_ai/vertex_ai_partner_models/anthropic/experimental_pass_through/transformation.py` | +| Passthrough endpoints | All | `proxy/pass_through_endpoints/llm_provider_handlers/` | + +### Example: Debugging prompt caching + +If `/v1/messages` → Bedrock Converse prompt caching isn't working but Bedrock Invoke works: + +1. **Bedrock Converse translation**: `llms/bedrock/chat/converse_transformation.py` +2. **Bedrock Invoke translation**: `llms/bedrock/chat/invoke_transformations/anthropic_claude3_transformation.py` +3. Compare how each handles `cache_control` in `transform_request()` + +### How translations work + +Each provider has a `Config` class that inherits from `BaseConfig` (`llms/base_llm/chat/transformation.py`): + +```python +class ProviderConfig(BaseConfig): + def transform_request(self, model, messages, optional_params, litellm_params, headers): + # Convert OpenAI format → Provider format + return {"messages": transformed_messages, ...} + + def transform_response(self, model, raw_response, model_response, logging_obj, ...): + # Convert Provider format → OpenAI format + return ModelResponse(choices=[...], usage=Usage(...)) +``` + +The `BaseLLMHTTPHandler` (`llms/custom_httpx/llm_http_handler.py`) calls these methods - you never need to modify the handler itself. + +--- + +## 4. Adding/Modifying Providers + +### To add a new provider: + +1. Create `llms/{provider}/chat/transformation.py` +2. 
Implement `Config` class with `transform_request()` and `transform_response()` +3. Add tests in `tests/llm_translation/test_{provider}.py` + +### To add a feature (e.g., prompt caching): + +1. Find the translation file from the table above +2. Modify `transform_request()` to handle the new parameter +3. Add unit tests that verify the transformation + +### Testing checklist + +When adding a feature, verify it works across all paths: + +| Test | File Pattern | +|------|--------------| +| OpenAI passthrough | `tests/llm_translation/test_openai*.py` | +| Anthropic direct | `tests/llm_translation/test_anthropic*.py` | +| Bedrock Invoke | `tests/llm_translation/test_bedrock*.py` | +| Bedrock Converse | `tests/llm_translation/test_bedrock*converse*.py` | +| Vertex AI | `tests/llm_translation/test_vertex*.py` | +| Gemini | `tests/llm_translation/test_gemini*.py` | + +### Unit testing translations + +Translations are designed to be unit testable without making API calls: + +```python +from litellm.llms.bedrock.chat.converse_transformation import BedrockConverseConfig + +def test_prompt_caching_transform(): + config = BedrockConverseConfig() + result = config.transform_request( + model="anthropic.claude-3-opus", + messages=[{"role": "user", "content": "test", "cache_control": {"type": "ephemeral"}}], + optional_params={}, + litellm_params={}, + headers={} + ) + assert "cachePoint" in str(result) # Verify cache_control was translated +``` diff --git a/CLAUDE.md b/CLAUDE.md index 50bed6e43e2e..23a0e97eaeec 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -25,6 +25,25 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - `poetry run pytest tests/path/to/test_file.py -v` - Run specific test file - `poetry run pytest tests/path/to/test_file.py::test_function -v` - Run specific test +### Running Scripts +- `poetry run python script.py` - Run Python scripts (use for non-test files) + +### GitHub Issue & PR Templates +When contributing to the project, use the appropriate templates: + +**Bug Reports** (`.github/ISSUE_TEMPLATE/bug_report.yml`): +- Describe what happened vs. what you expected +- Include relevant log output +- Specify your LiteLLM version + +**Feature Requests** (`.github/ISSUE_TEMPLATE/feature_request.yml`): +- Describe the feature clearly +- Explain the motivation and use case + +**Pull Requests** (`.github/pull_request_template.md`): +- Add at least 1 test in `tests/litellm/` +- Ensure `make test-unit` passes + ## Architecture Overview LiteLLM is a unified interface for 100+ LLM providers with two main components: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ad58a4976d6e..a418c8c57af6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,8 +24,9 @@ Before contributing code to LiteLLM, you must sign our [Contributor License Agre ### 1. 
Setup Your Local Development Environment ```bash -# Clone the repository -git clone https://github.com/BerriAI/litellm.git +# Fork the repository on GitHub (click the Fork button at https://github.com/BerriAI/litellm) +# Then clone your fork locally +git clone https://github.com/YOUR_USERNAME/litellm.git cd litellm # Create a new branch for your feature @@ -258,7 +259,7 @@ docker run \ If you need help: - 💬 [Join our Discord](https://discord.gg/wuPM9dRgDw) -- 💬 [Join our Slack](https://join.slack.com/share/enQtOTE0ODczMzk2Nzk4NC01YjUxNjY2YjBlYTFmNDRiZTM3NDFiYTM3MzVkODFiMDVjOGRjMmNmZTZkZTMzOWQzZGQyZWIwYjQ0MWExYmE3) +- 💬 [Join our Slack](https://www.litellm.ai/support) - 📧 Email us: ishaan@berri.ai / krrish@berri.ai - 🐛 [Create an issue](https://github.com/BerriAI/litellm/issues/new) diff --git a/Dockerfile b/Dockerfile index aa13c0370583..4bfda939110e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,8 @@ # Base image for building -ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/python:latest-dev +ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/wolfi-base # Runtime image -ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/python:latest-dev +ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/wolfi-base # Builder stage FROM $LITELLM_BUILD_IMAGE AS builder @@ -12,17 +12,16 @@ WORKDIR /app USER root # Install build dependencies -RUN apk add --no-cache gcc python3-dev openssl openssl-dev +RUN apk add --no-cache bash gcc py3-pip python3 python3-dev openssl openssl-dev - -RUN pip install --upgrade pip>=24.3.1 && \ - pip install build +RUN python -m pip install build # Copy the current directory contents into the container at /app COPY . . # Build Admin UI -RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/build_admin_ui.sh && chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh # Build the package RUN rm -rf dist/* && python -m build @@ -47,11 +46,9 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime # Ensure runtime stage runs as root USER root -# Install runtime dependencies -RUN apk add --no-cache openssl tzdata - -# Upgrade pip to fix CVE-2025-8869 -RUN pip install --upgrade pip>=24.3.1 +# Install runtime dependencies (libsndfile needed for audio processing on ARM64) +RUN apk add --no-cache bash openssl tzdata nodejs npm python3 py3-pip libsndfile && \ + npm install -g npm@latest tar@latest WORKDIR /app # Copy the current directory contents into the container at /app @@ -65,14 +62,19 @@ COPY --from=builder /wheels/ /wheels/ # Install the built wheel using pip; again using a wildcard if it's the only file RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels -# Install semantic_router and aurelio-sdk using script -RUN chmod +x docker/install_auto_router.sh && ./docker/install_auto_router.sh +# Remove test files and keys from dependencies +RUN find /usr/lib -type f -path "*/tornado/test/*" -delete && \ + find /usr/lib -type d -path "*/tornado/test" -delete -# Generate prisma client with explicit binary target to avoid wolfi warning -ENV PRISMA_CLI_BINARY_TARGETS="debian-openssl-3.0.x" -RUN prisma generate -RUN chmod +x docker/entrypoint.sh -RUN chmod +x docker/prod_entrypoint.sh +# Install semantic_router and aurelio-sdk using script +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/install_auto_router.sh && chmod +x docker/install_auto_router.sh && ./docker/install_auto_router.sh + +# Generate prisma client using the correct 
schema +RUN prisma generate --schema=./litellm/proxy/schema.prisma +# Convert Windows line endings to Unix for entrypoint scripts +RUN sed -i 's/\r$//' docker/entrypoint.sh && chmod +x docker/entrypoint.sh +RUN sed -i 's/\r$//' docker/prod_entrypoint.sh && chmod +x docker/prod_entrypoint.sh EXPOSE 4000/tcp diff --git a/GEMINI.md b/GEMINI.md index efcee04d4c3b..a9d40c910b20 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -25,6 +25,25 @@ This file provides guidance to Gemini when working with code in this repository. - `poetry run pytest tests/path/to/test_file.py -v` - Run specific test file - `poetry run pytest tests/path/to/test_file.py::test_function -v` - Run specific test +### Running Scripts +- `poetry run python script.py` - Run Python scripts (use for non-test files) + +### GitHub Issue & PR Templates +When contributing to the project, use the appropriate templates: + +**Bug Reports** (`.github/ISSUE_TEMPLATE/bug_report.yml`): +- Describe what happened vs. what you expected +- Include relevant log output +- Specify your LiteLLM version + +**Feature Requests** (`.github/ISSUE_TEMPLATE/feature_request.yml`): +- Describe the feature clearly +- Explain the motivation and use case + +**Pull Requests** (`.github/pull_request_template.md`): +- Add at least 1 test in `tests/litellm/` +- Ensure `make test-unit` passes + ## Architecture Overview LiteLLM is a unified interface for 100+ LLM providers with two main components: diff --git a/Makefile b/Makefile index a79a397f9456..0da83c363cd0 100644 --- a/Makefile +++ b/Makefile @@ -34,17 +34,18 @@ install-proxy-dev: # CI-compatible installations (matches GitHub workflows exactly) install-dev-ci: - pip install openai==1.99.5 + pip install openai==2.8.0 poetry install --with dev - pip install openai==1.99.5 + pip install openai==2.8.0 install-proxy-dev-ci: poetry install --with dev,proxy-dev --extras proxy - pip install openai==1.99.5 + pip install openai==2.8.0 install-test-deps: install-proxy-dev poetry run pip install "pytest-retry==1.6.3" poetry run pip install pytest-xdist + poetry run pip install openapi-core cd enterprise && poetry run pip install -e . && cd .. install-helm-unittest: @@ -100,4 +101,4 @@ test-llm-translation-single: install-test-deps @mkdir -p test-results poetry run pytest tests/llm_translation/$(FILE) \ --junitxml=test-results/junit.xml \ - -v --tb=short --maxfail=100 --timeout=300 \ No newline at end of file + -v --tb=short --maxfail=100 --timeout=300 diff --git a/README.md b/README.md index 812b20e69867..77adddf89784 100644 --- a/README.md +++ b/README.md @@ -2,16 +2,16 @@ 🚅 LiteLLM

+

Call 100+ LLMs in OpenAI format. [Bedrock, Azure, OpenAI, VertexAI, Anthropic, Groq, etc.] +

Deploy to Render Deploy on Railway

-

Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.] -

-

LiteLLM Proxy Server (LLM Gateway) | Hosted Proxy (Preview) | Enterprise Tier

+

LiteLLM Proxy Server (AI Gateway) | Hosted Proxy | Enterprise Tier

PyPI Version @@ -30,29 +30,17 @@

-LiteLLM manages: - -- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints -- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']` -- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) -- Set Budgets & Rate limits per project, api key, model [LiteLLM Proxy Server (LLM Gateway)](https://docs.litellm.ai/docs/simple_proxy) +Group 7154 (1) -[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#litellm-proxy-server-llm-gateway---docs)
-[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-providers-docs) -🚨 **Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle) +## Use LiteLLM for -Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+). +
+LLMs - Call 100+ LLMs (Python SDK + AI Gateway) -# Usage ([**Docs**](https://docs.litellm.ai/docs/)) +[**All Supported Endpoints**](https://docs.litellm.ai/docs/supported_endpoints) - `/chat/completions`, `/responses`, `/embeddings`, `/images`, `/audio`, `/batches`, `/rerank`, `/a2a`, `/messages` and more. -> [!IMPORTANT] -> LiteLLM v1.0.0 now requires `openai>=1.0.0`. Migration guide [here](https://docs.litellm.ai/docs/migration) -> LiteLLM v1.40.14+ now requires `pydantic>=2.0.0`. No changes required. - - - Open In Colab - +### Python SDK ```shell pip install litellm @@ -62,292 +50,331 @@ pip install litellm from litellm import completion import os -## set ENV variables os.environ["OPENAI_API_KEY"] = "your-openai-key" os.environ["ANTHROPIC_API_KEY"] = "your-anthropic-key" -messages = [{ "content": "Hello, how are you?","role": "user"}] - -# openai call -response = completion(model="openai/gpt-4o", messages=messages) - -# anthropic call -response = completion(model="anthropic/claude-sonnet-4-20250514", messages=messages) -print(response) -``` - -### Response (OpenAI Format) +# OpenAI +response = completion(model="openai/gpt-4o", messages=[{"role": "user", "content": "Hello!"}]) -```json -{ - "id": "chatcmpl-1214900a-6cdd-4148-b663-b5e2f642b4de", - "created": 1751494488, - "model": "claude-sonnet-4-20250514", - "object": "chat.completion", - "system_fingerprint": null, - "choices": [ - { - "finish_reason": "stop", - "index": 0, - "message": { - "content": "Hello! I'm doing well, thank you for asking. I'm here and ready to help with whatever you'd like to discuss or work on. How are you doing today?", - "role": "assistant", - "tool_calls": null, - "function_call": null - } - } - ], - "usage": { - "completion_tokens": 39, - "prompt_tokens": 13, - "total_tokens": 52, - "completion_tokens_details": null, - "prompt_tokens_details": { - "audio_tokens": null, - "cached_tokens": 0 - }, - "cache_creation_input_tokens": 0, - "cache_read_input_tokens": 0 - } -} +# Anthropic +response = completion(model="anthropic/claude-sonnet-4-20250514", messages=[{"role": "user", "content": "Hello!"}]) ``` -Call any model supported by a provider, with `model=/`. There might be provider-specific details here, so refer to [provider docs for more information](https://docs.litellm.ai/docs/providers) - -## Async ([Docs](https://docs.litellm.ai/docs/completion/stream#async-completion)) - -```python -from litellm import acompletion -import asyncio +### AI Gateway (Proxy Server) -async def test_get_response(): - user_message = "Hello, how are you?" - messages = [{"content": user_message, "role": "user"}] - response = await acompletion(model="openai/gpt-4o", messages=messages) - return response +[**Getting Started - E2E Tutorial**](https://docs.litellm.ai/docs/proxy/docker_quick_start) - Setup virtual keys, make your first request -response = asyncio.run(test_get_response()) -print(response) +```shell +pip install 'litellm[proxy]' +litellm --model gpt-4o ``` -## Streaming ([Docs](https://docs.litellm.ai/docs/completion/stream)) - -liteLLM supports streaming the model response back, pass `stream=True` to get a streaming iterator in response. -Streaming is supported for all models (Bedrock, Huggingface, TogetherAI, Azure, OpenAI, etc.) 
- ```python -from litellm import completion -response = completion(model="openai/gpt-4o", messages=messages, stream=True) -for part in response: - print(part.choices[0].delta.content or "") - -# claude sonnet 4 -response = completion('anthropic/claude-sonnet-4-20250514', messages, stream=True) -for part in response: - print(part) -``` - -### Response chunk (OpenAI Format) +import openai -```json -{ - "id": "chatcmpl-fe575c37-5004-4926-ae5e-bfbc31f356ca", - "created": 1751494808, - "model": "claude-sonnet-4-20250514", - "object": "chat.completion.chunk", - "system_fingerprint": null, - "choices": [ - { - "finish_reason": null, - "index": 0, - "delta": { - "provider_specific_fields": null, - "content": "Hello", - "role": "assistant", - "function_call": null, - "tool_calls": null, - "audio": null - }, - "logprobs": null - } - ], - "provider_specific_fields": null, - "stream_options": null, - "citations": null -} +client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000") +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello!"}] +) ``` -## Logging Observability ([Docs](https://docs.litellm.ai/docs/observability/callbacks)) - -LiteLLM exposes pre defined callbacks to send data to Lunary, MLflow, Langfuse, DynamoDB, s3 Buckets, Helicone, Promptlayer, Traceloop, Athina, Slack +[**Docs: LLM Providers**](https://docs.litellm.ai/docs/providers) -```python -from litellm import completion +
-## set env variables for logging tools (when using MLflow, no API key set up is required) -os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" -os.environ["HELICONE_API_KEY"] = "your-helicone-auth-key" -os.environ["LANGFUSE_PUBLIC_KEY"] = "" -os.environ["LANGFUSE_SECRET_KEY"] = "" -os.environ["ATHINA_API_KEY"] = "your-athina-api-key" +
+Agents - Invoke A2A Agents (Python SDK + AI Gateway) -os.environ["OPENAI_API_KEY"] = "your-openai-key" - -# set callbacks -litellm.success_callback = ["lunary", "mlflow", "langfuse", "athina", "helicone"] # log input/output to lunary, langfuse, supabase, athina, helicone etc - -#openai call -response = completion(model="openai/gpt-4o", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) -``` +[**Supported Providers**](https://docs.litellm.ai/docs/a2a#add-a2a-agents) - LangGraph, Vertex AI Agent Engine, Azure AI Foundry, Bedrock AgentCore, Pydantic AI -# LiteLLM Proxy Server (LLM Gateway) - ([Docs](https://docs.litellm.ai/docs/simple_proxy)) +### Python SDK - A2A Protocol -Track spend + Load Balance across multiple projects - -[Hosted Proxy (Preview)](https://docs.litellm.ai/docs/hosted) - -The proxy provides: - -1. [Hooks for auth](https://docs.litellm.ai/docs/proxy/virtual_keys#custom-auth) -2. [Hooks for logging](https://docs.litellm.ai/docs/proxy/logging#step-1---create-your-custom-litellm-callback-class) -3. [Cost tracking](https://docs.litellm.ai/docs/proxy/virtual_keys#tracking-spend) -4. [Rate Limiting](https://docs.litellm.ai/docs/proxy/users#set-rate-limits) - -## 📖 Proxy Endpoints - [Swagger Docs](https://litellm-api.up.railway.app/) - - -## Quick Start Proxy - CLI - -```shell -pip install 'litellm[proxy]' -``` - -### Step 1: Start litellm proxy - -```shell -$ litellm --model huggingface/bigcode/starcoder - -#INFO: Proxy running on http://0.0.0.0:4000 +```python +from litellm.a2a_protocol import A2AClient +from a2a.types import SendMessageRequest, MessageSendParams +from uuid import uuid4 + +client = A2AClient(base_url="http://localhost:10001") + +request = SendMessageRequest( + id=str(uuid4()), + params=MessageSendParams( + message={ + "role": "user", + "parts": [{"kind": "text", "text": "Hello!"}], + "messageId": uuid4().hex, + } + ) +) +response = await client.send_message(request) ``` -### Step 2: Make ChatCompletions Request to Proxy +### AI Gateway (Proxy Server) +**Step 1.** [Add your Agent to the AI Gateway](https://docs.litellm.ai/docs/a2a#adding-your-agent) -> [!IMPORTANT] -> 💡 [Use LiteLLM Proxy with Langchain (Python, JS), OpenAI SDK (Python, JS) Anthropic SDK, Mistral SDK, LlamaIndex, Instructor, Curl](https://docs.litellm.ai/docs/proxy/user_keys) +**Step 2.** Call Agent via A2A SDK ```python -import openai # openai v1.0.0+ -client = openai.OpenAI(api_key="anything",base_url="http://0.0.0.0:4000") # set proxy to base_url -# request sent to model set on litellm proxy, `litellm --model` -response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [ - { - "role": "user", - "content": "this is a test request, write a short poem" - } -]) - -print(response) +from a2a.client import A2ACardResolver, A2AClient +from a2a.types import MessageSendParams, SendMessageRequest +from uuid import uuid4 +import httpx + +base_url = "http://localhost:4000/a2a/my-agent" # LiteLLM proxy + agent name +headers = {"Authorization": "Bearer sk-1234"} # LiteLLM Virtual Key + +async with httpx.AsyncClient(headers=headers) as httpx_client: + resolver = A2ACardResolver(httpx_client=httpx_client, base_url=base_url) + agent_card = await resolver.get_agent_card() + client = A2AClient(httpx_client=httpx_client, agent_card=agent_card) + + request = SendMessageRequest( + id=str(uuid4()), + params=MessageSendParams( + message={ + "role": "user", + "parts": [{"kind": "text", "text": "Hello!"}], + "messageId": uuid4().hex, + } + ) + ) + response = await client.send_message(request) 
``` -## Proxy Key Management ([Docs](https://docs.litellm.ai/docs/proxy/virtual_keys)) - -Connect the proxy with a Postgres DB to create proxy keys +[**Docs: A2A Agent Gateway**](https://docs.litellm.ai/docs/a2a) -```bash -# Get the code -git clone https://github.com/BerriAI/litellm - -# Go to folder -cd litellm +
-# Add the master key - you can change this after setup -echo 'LITELLM_MASTER_KEY="sk-1234"' > .env +
+MCP Tools - Connect MCP servers to any LLM (Python SDK + AI Gateway) -# Add the litellm salt key - you cannot change this after adding a model -# It is used to encrypt / decrypt your LLM API Key credentials -# We recommend - https://1password.com/password-generator/ -# password generator to get a random hash for litellm salt key -echo 'LITELLM_SALT_KEY="sk-1234"' >> .env +### Python SDK - MCP Bridge -source .env - -# Start -docker compose up +```python +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client +from litellm import experimental_mcp_client +import litellm + +server_params = StdioServerParameters(command="python", args=["mcp_server.py"]) + +async with stdio_client(server_params) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + + # Load MCP tools in OpenAI format + tools = await experimental_mcp_client.load_mcp_tools(session=session, format="openai") + + # Use with any LiteLLM model + response = await litellm.acompletion( + model="gpt-4o", + messages=[{"role": "user", "content": "What's 3 + 5?"}], + tools=tools + ) ``` +### AI Gateway - MCP Gateway -UI on `/ui` on your proxy server -![ui_3](https://github.com/BerriAI/litellm/assets/29436595/47c97d5e-b9be-4839-b28c-43d7f4f10033) +**Step 1.** [Add your MCP Server to the AI Gateway](https://docs.litellm.ai/docs/mcp#adding-your-mcp) -Set budgets and rate limits across multiple projects -`POST /key/generate` +**Step 2.** Call MCP tools via `/chat/completions` -### Request - -```shell -curl 'http://0.0.0.0:4000/key/generate' \ ---header 'Authorization: Bearer sk-1234' \ ---header 'Content-Type: application/json' \ ---data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai", "team": "core-infra"}}' +```bash +curl -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ + -H 'Authorization: Bearer sk-1234' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Summarize the latest open PR"}], + "tools": [{ + "type": "mcp", + "server_url": "litellm_proxy/mcp/github", + "server_label": "github_mcp", + "require_approval": "never" + }] + }' ``` -### Expected Response +### Use with Cursor IDE -```shell +```json { - "key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token - "expires": "2023-11-19T01:38:25.838000+00:00" # datetime object + "mcpServers": { + "LiteLLM": { + "url": "http://localhost:4000/mcp", + "headers": { + "x-litellm-api-key": "Bearer sk-1234" + } + } + } } ``` -## Supported Providers ([Docs](https://docs.litellm.ai/docs/providers)) - -| Provider | [Completion](https://docs.litellm.ai/docs/#basic-usage) | [Streaming](https://docs.litellm.ai/docs/completion/stream#streaming-responses) | [Async Completion](https://docs.litellm.ai/docs/completion/stream#async-completion) | [Async Streaming](https://docs.litellm.ai/docs/completion/stream#async-streaming) | [Async Embedding](https://docs.litellm.ai/docs/embedding/supported_embedding) | [Async Image Generation](https://docs.litellm.ai/docs/image_generation) | 
-|-------------------------------------------------------------------------------------|---------------------------------------------------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------------| -| [openai](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [Meta - Llama API](https://docs.litellm.ai/docs/providers/meta_llama) | ✅ | ✅ | ✅ | ✅ | | | -| [azure](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [AI/ML API](https://docs.litellm.ai/docs/providers/aiml) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [aws - sagemaker](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [aws - bedrock](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [google - vertex_ai](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| [google - palm](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | ✅ | | | -| [google AI Studio - gemini](https://docs.litellm.ai/docs/providers/gemini) | ✅ | ✅ | ✅ | ✅ | | | -| [mistral ai api](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [cloudflare AI Workers](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | ✅ | | | -| [CompactifAI](https://docs.litellm.ai/docs/providers/compactifai) | ✅ | ✅ | ✅ | ✅ | | | -| [cohere](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [anthropic](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | ✅ | | | -| [empower](https://docs.litellm.ai/docs/providers/empower) | ✅ | ✅ | ✅ | ✅ | -| [huggingface](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [replicate](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | ✅ | | | -| [together_ai](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | ✅ | | | -| [openrouter](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | ✅ | | | -| [ai21](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | ✅ | | | -| [baseten](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | ✅ | | | -| [vllm](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | ✅ | | | -| [nlp_cloud](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | ✅ | | | -| [aleph alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | ✅ | | | -| [petals](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | ✅ | | | -| [ollama](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ | | | -| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ | | | -| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ | | | -| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ | | | -| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ | | | -| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ | | -| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ | 
| -| [FriendliAI](https://docs.litellm.ai/docs/providers/friendliai) | ✅ | ✅ | ✅ | ✅ | | | -| [Galadriel](https://docs.litellm.ai/docs/providers/galadriel) | ✅ | ✅ | ✅ | ✅ | | | -| [GradientAI](https://docs.litellm.ai/docs/providers/gradient_ai) | ✅ | ✅ | | | | | -| [Novita AI](https://novita.ai/models/llm?utm_source=github_litellm&utm_medium=github_readme&utm_campaign=github_link) | ✅ | ✅ | ✅ | ✅ | | | -| [Featherless AI](https://docs.litellm.ai/docs/providers/featherless_ai) | ✅ | ✅ | ✅ | ✅ | | | -| [Nebius AI Studio](https://docs.litellm.ai/docs/providers/nebius) | ✅ | ✅ | ✅ | ✅ | ✅ | | -| [Heroku](https://docs.litellm.ai/docs/providers/heroku) | ✅ | ✅ | | | | | -| [OVHCloud AI Endpoints](https://docs.litellm.ai/docs/providers/ovhcloud) | ✅ | ✅ | | | | | -| [CometAPI](https://docs.litellm.ai/docs/providers/cometapi) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +[**Docs: MCP Gateway**](https://docs.litellm.ai/docs/mcp) + +
+
+---
+
+## How to use LiteLLM
+
+You can use LiteLLM through either the Proxy Server or the Python SDK. Both give you a unified interface to 100+ LLMs. Choose the option that best fits your needs:
+
+| | LiteLLM AI Gateway | LiteLLM Python SDK |
+|---|---|---|
+| **Use Case** | Central service (LLM Gateway) to access multiple LLMs | Use LiteLLM directly in your Python code |
+| **Who Uses It?** | Gen AI Enablement / ML Platform Teams | Developers building LLM projects |
+| **Key Features** | Centralized API gateway with authentication and authorization, multi-tenant cost tracking and spend management per project/user, per-project customization (logging, guardrails, caching), virtual keys for secure access control, admin dashboard UI for monitoring and management | Direct Python library integration in your codebase, Router with retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing), application-level load balancing and cost tracking, exception handling with OpenAI-compatible errors, observability callbacks (Lunary, MLflow, Langfuse, etc.) |
+ +LiteLLM Performance: **8ms P95 latency** at 1k RPS (See benchmarks [here](https://docs.litellm.ai/docs/benchmarks)) + +[**Jump to LiteLLM Proxy (LLM Gateway) Docs**](https://docs.litellm.ai/docs/simple_proxy)
+[**Jump to Supported LLM Providers**](https://docs.litellm.ai/docs/providers) + +**Stable Release:** Use docker images with the `-stable` tag. These have undergone 12 hour load tests, before being published. [More information about the release cycle here](https://docs.litellm.ai/docs/proxy/release_cycle) + +Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+). + +## OSS Adopters + + + + + + + + + + +
+Stripe · Google ADK · Greptile · OpenHands · Netflix · OpenAI Agents SDK
+ +## Supported Providers ([Website Supported Models](https://models.litellm.ai/) | [Docs](https://docs.litellm.ai/docs/providers)) + +| Provider | `/chat/completions` | `/messages` | `/responses` | `/embeddings` | `/image/generations` | `/audio/transcriptions` | `/audio/speech` | `/moderations` | `/batches` | `/rerank` | +|-------------------------------------------------------------------------------------|---------------------|-------------|--------------|---------------|----------------------|-------------------------|-----------------|----------------|-----------|-----------| +| [Abliteration (`abliteration`)](https://docs.litellm.ai/docs/providers/abliteration) | ✅ | | | | | | | | | | +| [AI/ML API (`aiml`)](https://docs.litellm.ai/docs/providers/aiml) | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | +| [AI21 (`ai21`)](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | | | | | | | | +| [AI21 Chat (`ai21_chat`)](https://docs.litellm.ai/docs/providers/ai21) | ✅ | ✅ | ✅ | | | | | | | | +| [Aleph Alpha](https://docs.litellm.ai/docs/providers/aleph_alpha) | ✅ | ✅ | ✅ | | | | | | | | +| [Amazon Nova](https://docs.litellm.ai/docs/providers/amazon_nova) | ✅ | ✅ | ✅ | | | | | | | | +| [Anthropic (`anthropic`)](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | | | | | | ✅ | | +| [Anthropic Text (`anthropic_text`)](https://docs.litellm.ai/docs/providers/anthropic) | ✅ | ✅ | ✅ | | | | | | ✅ | | +| [Anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | | | | | | | | +| [AssemblyAI (`assemblyai`)](https://docs.litellm.ai/docs/pass_through/assembly_ai) | ✅ | ✅ | ✅ | | | ✅ | | | | | +| [Auto Router (`auto_router`)](https://docs.litellm.ai/docs/proxy/auto_routing) | ✅ | ✅ | ✅ | | | | | | | | +| [AWS - Bedrock (`bedrock`)](https://docs.litellm.ai/docs/providers/bedrock) | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | +| [AWS - Sagemaker (`sagemaker`)](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [Azure (`azure`)](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [Azure AI (`azure_ai`)](https://docs.litellm.ai/docs/providers/azure_ai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [Azure Text (`azure_text`)](https://docs.litellm.ai/docs/providers/azure) | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | | +| [Baseten (`baseten`)](https://docs.litellm.ai/docs/providers/baseten) | ✅ | ✅ | ✅ | | | | | | | | +| [Bytez (`bytez`)](https://docs.litellm.ai/docs/providers/bytez) | ✅ | ✅ | ✅ | | | | | | | | +| [Cerebras (`cerebras`)](https://docs.litellm.ai/docs/providers/cerebras) | ✅ | ✅ | ✅ | | | | | | | | +| [Clarifai (`clarifai`)](https://docs.litellm.ai/docs/providers/clarifai) | ✅ | ✅ | ✅ | | | | | | | | +| [Cloudflare AI Workers (`cloudflare`)](https://docs.litellm.ai/docs/providers/cloudflare_workers) | ✅ | ✅ | ✅ | | | | | | | | +| [Codestral (`codestral`)](https://docs.litellm.ai/docs/providers/codestral) | ✅ | ✅ | ✅ | | | | | | | | +| [Cohere (`cohere`)](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | +| [Cohere Chat (`cohere_chat`)](https://docs.litellm.ai/docs/providers/cohere) | ✅ | ✅ | ✅ | | | | | | | | +| [CometAPI (`cometapi`)](https://docs.litellm.ai/docs/providers/cometapi) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [CompactifAI (`compactifai`)](https://docs.litellm.ai/docs/providers/compactifai) | ✅ | ✅ | ✅ | | | | | | | | +| [Custom (`custom`)](https://docs.litellm.ai/docs/providers/custom_llm_server) | ✅ | ✅ | ✅ | | | | | | | | +| [Custom OpenAI 
(`custom_openai`)](https://docs.litellm.ai/docs/providers/openai_compatible) | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | | +| [Dashscope (`dashscope`)](https://docs.litellm.ai/docs/providers/dashscope) | ✅ | ✅ | ✅ | | | | | | | | +| [Databricks (`databricks`)](https://docs.litellm.ai/docs/providers/databricks) | ✅ | ✅ | ✅ | | | | | | | | +| [DataRobot (`datarobot`)](https://docs.litellm.ai/docs/providers/datarobot) | ✅ | ✅ | ✅ | | | | | | | | +| [Deepgram (`deepgram`)](https://docs.litellm.ai/docs/providers/deepgram) | ✅ | ✅ | ✅ | | | ✅ | | | | | +| [DeepInfra (`deepinfra`)](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | | | | | | | | +| [Deepseek (`deepseek`)](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | | | | | | | | +| [ElevenLabs (`elevenlabs`)](https://docs.litellm.ai/docs/providers/elevenlabs) | ✅ | ✅ | ✅ | | | | ✅ | | | | +| [Empower (`empower`)](https://docs.litellm.ai/docs/providers/empower) | ✅ | ✅ | ✅ | | | | | | | | +| [Fal AI (`fal_ai`)](https://docs.litellm.ai/docs/providers/fal_ai) | ✅ | ✅ | ✅ | | ✅ | | | | | | +| [Featherless AI (`featherless_ai`)](https://docs.litellm.ai/docs/providers/featherless_ai) | ✅ | ✅ | ✅ | | | | | | | | +| [Fireworks AI (`fireworks_ai`)](https://docs.litellm.ai/docs/providers/fireworks_ai) | ✅ | ✅ | ✅ | | | | | | | | +| [FriendliAI (`friendliai`)](https://docs.litellm.ai/docs/providers/friendliai) | ✅ | ✅ | ✅ | | | | | | | | +| [Galadriel (`galadriel`)](https://docs.litellm.ai/docs/providers/galadriel) | ✅ | ✅ | ✅ | | | | | | | | +| [GitHub Copilot (`github_copilot`)](https://docs.litellm.ai/docs/providers/github_copilot) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [GitHub Models (`github`)](https://docs.litellm.ai/docs/providers/github) | ✅ | ✅ | ✅ | | | | | | | | +| [Google - PaLM](https://docs.litellm.ai/docs/providers/palm) | ✅ | ✅ | ✅ | | | | | | | | +| [Google - Vertex AI (`vertex_ai`)](https://docs.litellm.ai/docs/providers/vertex) | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | +| [Google AI Studio - Gemini (`gemini`)](https://docs.litellm.ai/docs/providers/gemini) | ✅ | ✅ | ✅ | | | | | | | | +| [GradientAI (`gradient_ai`)](https://docs.litellm.ai/docs/providers/gradient_ai) | ✅ | ✅ | ✅ | | | | | | | | +| [Groq AI (`groq`)](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | | | | | | | | +| [Heroku (`heroku`)](https://docs.litellm.ai/docs/providers/heroku) | ✅ | ✅ | ✅ | | | | | | | | +| [Hosted VLLM (`hosted_vllm`)](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | | | | | | | | +| [Huggingface (`huggingface`)](https://docs.litellm.ai/docs/providers/huggingface) | ✅ | ✅ | ✅ | ✅ | | | | | | ✅ | +| [Hyperbolic (`hyperbolic`)](https://docs.litellm.ai/docs/providers/hyperbolic) | ✅ | ✅ | ✅ | | | | | | | | +| [IBM - Watsonx.ai (`watsonx`)](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [Infinity (`infinity`)](https://docs.litellm.ai/docs/providers/infinity) | | | | ✅ | | | | | | | +| [Jina AI (`jina_ai`)](https://docs.litellm.ai/docs/providers/jina_ai) | | | | ✅ | | | | | | | +| [Lambda AI (`lambda_ai`)](https://docs.litellm.ai/docs/providers/lambda_ai) | ✅ | ✅ | ✅ | | | | | | | | +| [Lemonade (`lemonade`)](https://docs.litellm.ai/docs/providers/lemonade) | ✅ | ✅ | ✅ | | | | | | | | +| [LiteLLM Proxy (`litellm_proxy`)](https://docs.litellm.ai/docs/providers/litellm_proxy) | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | +| [Llamafile (`llamafile`)](https://docs.litellm.ai/docs/providers/llamafile) | ✅ | ✅ | ✅ | | | | | | | | +| [LM Studio (`lm_studio`)](https://docs.litellm.ai/docs/providers/lm_studio) | ✅ | ✅ | 
✅ | | | | | | | | +| [Maritalk (`maritalk`)](https://docs.litellm.ai/docs/providers/maritalk) | ✅ | ✅ | ✅ | | | | | | | | +| [Meta - Llama API (`meta_llama`)](https://docs.litellm.ai/docs/providers/meta_llama) | ✅ | ✅ | ✅ | | | | | | | | +| [Mistral AI API (`mistral`)](https://docs.litellm.ai/docs/providers/mistral) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [Moonshot (`moonshot`)](https://docs.litellm.ai/docs/providers/moonshot) | ✅ | ✅ | ✅ | | | | | | | | +| [Morph (`morph`)](https://docs.litellm.ai/docs/providers/morph) | ✅ | ✅ | ✅ | | | | | | | | +| [Nebius AI Studio (`nebius`)](https://docs.litellm.ai/docs/providers/nebius) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [NLP Cloud (`nlp_cloud`)](https://docs.litellm.ai/docs/providers/nlp_cloud) | ✅ | ✅ | ✅ | | | | | | | | +| [Novita AI (`novita`)](https://novita.ai/models/llm?utm_source=github_litellm&utm_medium=github_readme&utm_campaign=github_link) | ✅ | ✅ | ✅ | | | | | | | | +| [Nscale (`nscale`)](https://docs.litellm.ai/docs/providers/nscale) | ✅ | ✅ | ✅ | | | | | | | | +| [Nvidia NIM (`nvidia_nim`)](https://docs.litellm.ai/docs/providers/nvidia_nim) | ✅ | ✅ | ✅ | | | | | | | | +| [OCI (`oci`)](https://docs.litellm.ai/docs/providers/oci) | ✅ | ✅ | ✅ | | | | | | | | +| [Ollama (`ollama`)](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | ✅ | | | | | | | +| [Ollama Chat (`ollama_chat`)](https://docs.litellm.ai/docs/providers/ollama) | ✅ | ✅ | ✅ | | | | | | | | +| [Oobabooga (`oobabooga`)](https://docs.litellm.ai/docs/providers/openai_compatible) | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | | +| [OpenAI (`openai`)](https://docs.litellm.ai/docs/providers/openai) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| [OpenAI-like (`openai_like`)](https://docs.litellm.ai/docs/providers/openai_compatible) | | | | ✅ | | | | | | | +| [OpenRouter (`openrouter`)](https://docs.litellm.ai/docs/providers/openrouter) | ✅ | ✅ | ✅ | | | | | | | | +| [OVHCloud AI Endpoints (`ovhcloud`)](https://docs.litellm.ai/docs/providers/ovhcloud) | ✅ | ✅ | ✅ | | | | | | | | +| [Perplexity AI (`perplexity`)](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | | | | | | | | +| [Petals (`petals`)](https://docs.litellm.ai/docs/providers/petals) | ✅ | ✅ | ✅ | | | | | | | | +| [Predibase (`predibase`)](https://docs.litellm.ai/docs/providers/predibase) | ✅ | ✅ | ✅ | | | | | | | | +| [Recraft (`recraft`)](https://docs.litellm.ai/docs/providers/recraft) | | | | | ✅ | | | | | | +| [Replicate (`replicate`)](https://docs.litellm.ai/docs/providers/replicate) | ✅ | ✅ | ✅ | | | | | | | | +| [Sagemaker Chat (`sagemaker_chat`)](https://docs.litellm.ai/docs/providers/aws_sagemaker) | ✅ | ✅ | ✅ | | | | | | | | +| [Sambanova (`sambanova`)](https://docs.litellm.ai/docs/providers/sambanova) | ✅ | ✅ | ✅ | | | | | | | | +| [Snowflake (`snowflake`)](https://docs.litellm.ai/docs/providers/snowflake) | ✅ | ✅ | ✅ | | | | | | | | +| [Text Completion Codestral (`text-completion-codestral`)](https://docs.litellm.ai/docs/providers/codestral) | ✅ | ✅ | ✅ | | | | | | | | +| [Text Completion OpenAI (`text-completion-openai`)](https://docs.litellm.ai/docs/providers/text_completion_openai) | ✅ | ✅ | ✅ | | | ✅ | ✅ | ✅ | ✅ | | +| [Together AI (`together_ai`)](https://docs.litellm.ai/docs/providers/togetherai) | ✅ | ✅ | ✅ | | | | | | | | +| [Topaz (`topaz`)](https://docs.litellm.ai/docs/providers/topaz) | ✅ | ✅ | ✅ | | | | | | | | +| [Triton (`triton`)](https://docs.litellm.ai/docs/providers/triton-inference-server) | ✅ | ✅ | ✅ | | | | | | | | +| [V0 (`v0`)](https://docs.litellm.ai/docs/providers/v0) | ✅ | ✅ | ✅ | | | | 
| | | | +| [Vercel AI Gateway (`vercel_ai_gateway`)](https://docs.litellm.ai/docs/providers/vercel_ai_gateway) | ✅ | ✅ | ✅ | | | | | | | | +| [VLLM (`vllm`)](https://docs.litellm.ai/docs/providers/vllm) | ✅ | ✅ | ✅ | | | | | | | | +| [Volcengine (`volcengine`)](https://docs.litellm.ai/docs/providers/volcano) | ✅ | ✅ | ✅ | | | | | | | | +| [Voyage AI (`voyage`)](https://docs.litellm.ai/docs/providers/voyage) | | | | ✅ | | | | | | | +| [WandB Inference (`wandb`)](https://docs.litellm.ai/docs/providers/wandb_inference) | ✅ | ✅ | ✅ | | | | | | | | +| [Watsonx Text (`watsonx_text`)](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | | | | | | | | +| [xAI (`xai`)](https://docs.litellm.ai/docs/providers/xai) | ✅ | ✅ | ✅ | | | | | | | | +| [Xinference (`xinference`)](https://docs.litellm.ai/docs/providers/xinference) | | | | ✅ | | | | | | | [**Read the Docs**](https://docs.litellm.ai/docs/) @@ -360,7 +387,9 @@ curl 'http://0.0.0.0:4000/key/generate' \ 1. (In root) create virtual environment `python -m venv .venv` 2. Activate virtual environment `source .venv/bin/activate` 3. Install dependencies `pip install -e ".[all]"` -4. Start proxy backend `python litellm/proxy_cli.py` +4. `pip install prisma` +5. `prisma generate` +6. Start proxy backend `python litellm/proxy/proxy_cli.py` ### Frontend 1. Navigate to `ui/litellm-dashboard` @@ -442,4 +471,3 @@ All these checks must pass before your PR can be merged. - diff --git a/ci_cd/.grype.yaml b/ci_cd/.grype.yaml new file mode 100644 index 000000000000..642e2dd9d03f --- /dev/null +++ b/ci_cd/.grype.yaml @@ -0,0 +1,3 @@ +ignore: + - vulnerability: CVE-2026-22184 + reason: no fixed zlib package is available yet in the Wolfi repositories, so this is ignored temporarily until an upstream release exists diff --git a/ci_cd/TEST_KEY_PATTERNS.md b/ci_cd/TEST_KEY_PATTERNS.md new file mode 100644 index 000000000000..bd59f5828399 --- /dev/null +++ b/ci_cd/TEST_KEY_PATTERNS.md @@ -0,0 +1,40 @@ +# Test Key Patterns Standard + +Standard patterns for test/mock keys and credentials in the LiteLLM codebase to avoid triggering secret detection. + +## How GitGuardian Works + +GitGuardian uses **machine learning and entropy analysis**, not just pattern matching: +- **Low entropy** values (like `sk-1234`, `postgres`) are automatically ignored +- **High entropy** values (realistic-looking secrets) trigger detection +- **Context-aware** detection understands code syntax like `os.environ["KEY"]` + +## Recommended Test Key Patterns + +### Option 1: Low Entropy Values (Simplest) +These won't trigger GitGuardian's ML detector: + +```python +api_key = "sk-1234" +api_key = "sk-12345" +database_password = "postgres" +token = "test123" +``` + +### Option 2: High Entropy with Test Prefixes +If you need realistic-looking test keys with high entropy, use these prefixes: + +```python +api_key = "sk-test-abc123def456ghi789..." # OpenAI-style test key +api_key = "sk-mock-1234567890abcdef1234..." # Mock key +api_key = "sk-fake-xyz789uvw456rst123..." 
# Fake key +token = "test-api-key-with-high-entropy" +``` + +## Configured Ignore Patterns + +These patterns are in `.gitguardian.yaml` for high-entropy test keys: +- `sk-test-*` - OpenAI-style test keys +- `sk-mock-*` - Mock API keys +- `sk-fake-*` - Fake API keys +- `test-api-key` - Generic test tokens diff --git a/ci_cd/security_scans.sh b/ci_cd/security_scans.sh index fbb2ef5c0d92..3a212a56f645 100755 --- a/ci_cd/security_scans.sh +++ b/ci_cd/security_scans.sh @@ -26,15 +26,65 @@ install_grype() { echo "Grype installed successfully" } +# Function to install ggshield +install_ggshield() { + echo "Installing ggshield..." + pip3 install --upgrade pip + pip3 install ggshield + echo "ggshield installed successfully" +} + +# # Function to run secret detection scans +# run_secret_detection() { +# echo "Running secret detection scans..." + +# if ! command -v ggshield &> /dev/null; then +# install_ggshield +# fi + +# # Check if GITGUARDIAN_API_KEY is set (required for CI/CD) +# if [ -z "$GITGUARDIAN_API_KEY" ]; then +# echo "Warning: GITGUARDIAN_API_KEY environment variable is not set." +# echo "ggshield requires a GitGuardian API key to scan for secrets." +# echo "Please set GITGUARDIAN_API_KEY in your CI/CD environment variables." +# exit 1 +# fi + +# echo "Scanning codebase for secrets..." +# echo "Note: Large codebases may take several minutes due to API rate limits (50 requests/minute on free plan)" +# echo "ggshield will automatically handle rate limits and retry as needed." +# echo "Binary files, cache files, and build artifacts are excluded via .gitguardian.yaml" + +# # Use --recursive for directory scanning and auto-confirm if prompted +# # .gitguardian.yaml will automatically exclude binary files, wheel files, etc. +# # GITGUARDIAN_API_KEY environment variable will be used for authentication +# echo y | ggshield secret scan path . --recursive || { +# echo "" +# echo "==========================================" +# echo "ERROR: Secret Detection Failed" +# echo "==========================================" +# echo "ggshield has detected secrets in the codebase." +# echo "Please review discovered secrets above, revoke any actively used secrets" +# echo "from underlying systems and make changes to inject secrets dynamically at runtime." +# echo "" +# echo "For more information, see: https://docs.gitguardian.com/secrets-detection/" +# echo "==========================================" +# echo "" +# exit 1 +# } + +# echo "Secret detection scans completed successfully" +# } + # Function to run Trivy scans run_trivy_scans() { echo "Running Trivy scans..." echo "Scanning LiteLLM Docs..." - trivy fs --scanners vuln --dependency-tree --exit-code 1 --severity HIGH,CRITICAL,MEDIUM ./docs/ + trivy fs --ignorefile .trivyignore --scanners vuln --dependency-tree --exit-code 1 --severity HIGH,CRITICAL,MEDIUM ./docs/ echo "Scanning LiteLLM UI..." - trivy fs --scanners vuln --dependency-tree --exit-code 1 --severity HIGH,CRITICAL,MEDIUM ./ui/ + trivy fs --ignorefile .trivyignore --scanners vuln --dependency-tree --exit-code 1 --severity HIGH,CRITICAL,MEDIUM ./ui/ echo "Trivy scans completed successfully" } @@ -51,12 +101,12 @@ run_grype_scans() { # Build and scan Dockerfile.database echo "Building and scanning Dockerfile.database..." docker build --no-cache -t litellm-database:latest -f ./docker/Dockerfile.database . 
- grype litellm-database:latest --fail-on critical + grype litellm-database:latest --config ci_cd/.grype.yaml --fail-on critical # Build and scan main Dockerfile echo "Building and scanning main Dockerfile..." docker build --no-cache -t litellm:latest . - grype litellm:latest --fail-on critical + grype litellm:latest --config ci_cd/.grype.yaml --fail-on critical # Restore original .dockerignore echo "Restoring original .dockerignore..." @@ -69,10 +119,41 @@ run_grype_scans() { # Allowlist of CVEs to be ignored in failure threshold/reporting # - CVE-2025-8869: Not applicable on Python >=3.13 (PEP 706 implemented); pip fallback unused; no OS-level fix # - GHSA-4xh5-x5gv-qwph: GitHub Security Advisory alias for CVE-2025-8869 + # - GHSA-5j98-mcp5-4vw2: glob CLI command injection via -c/--cmd; glob CLI is not used in the litellm runtime image, + # and the vulnerable versions are pulled in only via OS-level/node tooling outside of our application code ALLOWED_CVES=( "CVE-2025-8869" "GHSA-4xh5-x5gv-qwph" "CVE-2025-8291" # no fix available as of Oct 11, 2025 + "GHSA-5j98-mcp5-4vw2" + "CVE-2025-13836" # Python 3.13 HTTP response reading OOM/DoS - no fix available in base image + "CVE-2025-12084" # Python 3.13 xml.dom.minidom quadratic algorithm - no fix available in base image + "CVE-2025-60876" # BusyBox wget HTTP request splitting - no fix available in Chainguard Wolfi base image + "CVE-2026-0861" # Wolfi glibc still flagged even on 2.42-r5; upstream patched build unavailable yet + "CVE-2010-4756" # glibc glob DoS - awaiting patched Wolfi glibc build + "CVE-2019-1010022" # glibc stack guard bypass - awaiting patched Wolfi glibc build + "CVE-2019-1010023" # glibc ldd remap issue - awaiting patched Wolfi glibc build + "CVE-2019-1010024" # glibc ASLR mitigation bypass - awaiting patched Wolfi glibc build + "CVE-2019-1010025" # glibc pthread heap address leak - awaiting patched Wolfi glibc build + "CVE-2026-22184" # zlib untgz buffer overflow - untgz unused + no fixed Wolfi build yet + "GHSA-58pv-8j8x-9vj2" # jaraco.context path traversal - setuptools vendored only (v5.3.0), not used in application code (using v6.1.0+) + "GHSA-34x7-hfp2-rc4v" # node-tar hardlink path traversal - not applicable, tar CLI not exposed in application code + "GHSA-r6q2-hw4h-h46w" # node-tar not used by application runtime, Linux-only container, not affect by macOS APFS-specific exploit + "GHSA-8rrh-rw8j-w5fx" # wheel is from chainguard and will be handled by then TODO: Remove this after Chainguard updates the wheel + "CVE-2025-59465" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2025-55131" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2025-59466" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2025-55130" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2025-59467" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2026-21637" # We do not use Node in application runtime, only used for building Admin UI + "CVE-2025-15281" # No fix available yet + "CVE-2026-0865" # No fix available yet + "CVE-2025-15282" # No fix available yet + "CVE-2026-0672" # No fix available yet + "CVE-2025-15366" # No fix available yet + "CVE-2025-15367" # No fix available yet + "CVE-2025-12781" # No fix available yet + "CVE-2025-11468" # No fix available yet ) # Build JSON array of allowlisted CVE IDs for jq @@ -153,6 +234,9 @@ main() { install_trivy install_grype + # echo 
"Running secret detection scans..." + # run_secret_detection + echo "Running filesystem vulnerability scans..." run_trivy_scans diff --git a/cookbook/LiteLLM_CometAPI.ipynb b/cookbook/LiteLLM_CometAPI.ipynb index bdd916c5bfe7..0a7ab581ae35 100644 --- a/cookbook/LiteLLM_CometAPI.ipynb +++ b/cookbook/LiteLLM_CometAPI.ipynb @@ -28,7 +28,7 @@ "Requirement already satisfied: importlib-metadata>=6.8.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (8.6.1)\n", "Requirement already satisfied: jinja2<4.0.0,>=3.1.2 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (3.1.6)\n", "Requirement already satisfied: jsonschema<5.0.0,>=4.22.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (4.25.1)\n", - "Requirement already satisfied: openai>=1.99.5 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (1.109.1)\n", + "Requirement already satisfied: openai>=2.8.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (1.109.1)\n", "Requirement already satisfied: pydantic<3.0.0,>=2.5.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (2.11.10)\n", "Requirement already satisfied: python-dotenv>=0.2.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (1.1.1)\n", "Requirement already satisfied: tiktoken>=0.7.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from litellm) (0.12.0)\n", @@ -50,11 +50,11 @@ "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (2025.9.1)\n", "Requirement already satisfied: referencing>=0.28.4 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.36.2)\n", "Requirement already satisfied: rpds-py>=0.7.1 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from jsonschema<5.0.0,>=4.22.0->litellm) (0.27.1)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=1.99.5->litellm) (1.9.0)\n", - "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=1.99.5->litellm) (0.11.0)\n", - "Requirement already satisfied: sniffio in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=1.99.5->litellm) (1.3.1)\n", - "Requirement already satisfied: tqdm>4 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=1.99.5->litellm) (4.67.1)\n", - "Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=1.99.5->litellm) (4.15.0)\n", + "Requirement already satisfied: distro<2,>=1.7.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=2.8.0->litellm) (1.9.0)\n", + "Requirement already satisfied: jiter<1,>=0.4.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=2.8.0->litellm) (0.11.0)\n", + "Requirement already satisfied: sniffio in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=2.8.0->litellm) (1.3.1)\n", + "Requirement already satisfied: tqdm>4 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=2.8.0->litellm) (4.67.1)\n", + "Requirement already satisfied: typing-extensions<5,>=4.11 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from openai>=2.8.0->litellm) (4.15.0)\n", "Requirement already satisfied: annotated-types>=0.6.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from 
pydantic<3.0.0,>=2.5.0->litellm) (0.7.0)\n", "Requirement already satisfied: pydantic-core==2.33.2 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.5.0->litellm) (2.33.2)\n", "Requirement already satisfied: typing-inspection>=0.4.0 in /Users/xmx/.miniforge3/lib/python3.12/site-packages (from pydantic<3.0.0,>=2.5.0->litellm) (0.4.2)\n", diff --git a/cookbook/LiteLLM_HuggingFace.ipynb b/cookbook/LiteLLM_HuggingFace.ipynb index d608c2675a13..bf8482a5f115 100644 --- a/cookbook/LiteLLM_HuggingFace.ipynb +++ b/cookbook/LiteLLM_HuggingFace.ipynb @@ -131,7 +131,7 @@ " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\n", - " \"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg\",\n", + " \"url\": \"https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png\",\n", " },\n", " },\n", " ],\n", diff --git a/cookbook/LiteLLM_PromptLayer.ipynb b/cookbook/LiteLLM_PromptLayer.ipynb index 3552636011aa..8fd54941027e 100644 --- a/cookbook/LiteLLM_PromptLayer.ipynb +++ b/cookbook/LiteLLM_PromptLayer.ipynb @@ -39,7 +39,7 @@ "import os\n", "os.environ['OPENAI_API_KEY'] = \"\"\n", "os.environ['REPLICATE_API_TOKEN'] = \"\"\n", - "os.environ['PROMPTLAYER_API_KEY'] = \"pl_4ea2bb00a4dca1b8a70cebf2e9e11564\"\n", + "os.environ['PROMPTLAYER_API_KEY'] = \"test-promptlayer-key-123\"\n", "\n", "# Set Promptlayer as a success callback\n", "litellm.success_callback =['promptlayer']\n", diff --git a/cookbook/Migrating_to_LiteLLM_Proxy_from_OpenAI_Azure_OpenAI.ipynb b/cookbook/Migrating_to_LiteLLM_Proxy_from_OpenAI_Azure_OpenAI.ipynb index 39677ed2a8a1..740e7c7a4c86 100644 --- a/cookbook/Migrating_to_LiteLLM_Proxy_from_OpenAI_Azure_OpenAI.ipynb +++ b/cookbook/Migrating_to_LiteLLM_Proxy_from_OpenAI_Azure_OpenAI.ipynb @@ -1,21 +1,10 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "kccfk0mHZ4Ad" + }, "source": [ "# Migrating to LiteLLM Proxy from OpenAI/Azure OpenAI\n", "\n", @@ -32,29 +21,26 @@ "To pass provider-specific args, [go here](https://docs.litellm.ai/docs/completion/provider_specific_params#proxy-usage)\n", "\n", "To drop unsupported params (E.g. 
frequency_penalty for bedrock with librechat), [go here](https://docs.litellm.ai/docs/completion/drop_params#openai-proxy-usage)\n" - ], - "metadata": { - "id": "kccfk0mHZ4Ad" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "nmSClzCPaGH6" + }, "source": [ "## /chat/completion\n", "\n" - ], - "metadata": { - "id": "nmSClzCPaGH6" - } + ] }, { "cell_type": "markdown", - "source": [ - "### OpenAI Python SDK" - ], "metadata": { "id": "_vqcjwOVaKpO" - } + }, + "source": [ + "### OpenAI Python SDK" + ] }, { "cell_type": "code", @@ -94,15 +80,20 @@ }, { "cell_type": "markdown", - "source": [ - "## Function Calling" - ], "metadata": { "id": "AqkyKk9Scxgj" - } + }, + "source": [ + "## Function Calling" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wDg10VqLczE1" + }, + "outputs": [], "source": [ "from openai import OpenAI\n", "client = OpenAI(\n", @@ -139,24 +130,24 @@ ")\n", "\n", "print(completion)\n" - ], - "metadata": { - "id": "wDg10VqLczE1" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Azure OpenAI Python SDK" - ], "metadata": { "id": "YYoxLloSaNWW" - } + }, + "source": [ + "### Azure OpenAI Python SDK" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yA1XcgowaSRy" + }, + "outputs": [], "source": [ "import openai\n", "client = openai.AzureOpenAI(\n", @@ -184,24 +175,24 @@ ")\n", "\n", "print(response)" - ], - "metadata": { - "id": "yA1XcgowaSRy" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Langchain Python" - ], "metadata": { "id": "yl9qhDvnaTpL" - } + }, + "source": [ + "### Langchain Python" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5MUZgSquaW5t" + }, + "outputs": [], "source": [ "from langchain.chat_models import ChatOpenAI\n", "from langchain.prompts.chat import (\n", @@ -239,24 +230,22 @@ "response = chat(messages)\n", "\n", "print(response)" - ], - "metadata": { - "id": "5MUZgSquaW5t" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Curl" - ], "metadata": { "id": "B9eMgnULbRaz" - } + }, + "source": [ + "### Curl" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "VWCCk5PFcmhS" + }, "source": [ "\n", "\n", @@ -280,22 +269,24 @@ "}'\n", "```\n", "\n" - ], - "metadata": { - "id": "VWCCk5PFcmhS" - } + ] }, { "cell_type": "markdown", - "source": [ - "### LlamaIndex" - ], "metadata": { "id": "drBAm2e1b6xe" - } + }, + "source": [ + "### LlamaIndex" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d0bZcv8fb9mL" + }, + "outputs": [], "source": [ "import os, dotenv\n", "\n", @@ -326,24 +317,24 @@ "query_engine = index.as_query_engine()\n", "response = query_engine.query(\"What did the author do growing up?\")\n", "print(response)\n" - ], - "metadata": { - "id": "d0bZcv8fb9mL" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Langchain JS" - ], "metadata": { "id": "xypvNdHnb-Yy" - } + }, + "source": [ + "### Langchain JS" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R55mK2vCcBN2" + }, + "outputs": [], "source": [ "import { ChatOpenAI } from \"@langchain/openai\";\n", "\n", @@ -359,24 +350,24 @@ "const message = await model.invoke(\"Hi there!\");\n", "\n", "console.log(message);\n" - ], - "metadata": { - "id": "R55mK2vCcBN2" - }, - "execution_count": null, - "outputs": [] + ] }, { 
"cell_type": "markdown", - "source": [ - "### OpenAI JS" - ], "metadata": { "id": "nC4bLifCcCiW" - } + }, + "source": [ + "### OpenAI JS" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MICH8kIMcFpg" + }, + "outputs": [], "source": [ "const { OpenAI } = require('openai');\n", "\n", @@ -398,24 +389,24 @@ "}\n", "\n", "main();\n" - ], - "metadata": { - "id": "MICH8kIMcFpg" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Anthropic SDK" - ], "metadata": { "id": "D1Q07pEAcGTb" - } + }, + "source": [ + "### Anthropic SDK" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qBjFcAvgcI3t" + }, + "outputs": [], "source": [ "import os\n", "\n", @@ -423,7 +414,7 @@ "\n", "client = Anthropic(\n", " base_url=\"http://localhost:4000\", # proxy endpoint\n", - " api_key=\"sk-s4xN1IiLTCytwtZFJaYQrA\", # litellm proxy virtual key\n", + " api_key=\"sk-test-proxy-key-123\", # litellm proxy virtual key (example)\n", ")\n", "\n", "message = client.messages.create(\n", @@ -437,33 +428,33 @@ " model=\"claude-3-opus-20240229\",\n", ")\n", "print(message.content)" - ], - "metadata": { - "id": "qBjFcAvgcI3t" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "## /embeddings" - ], "metadata": { "id": "dFAR4AJGcONI" - } + }, + "source": [ + "## /embeddings" + ] }, { "cell_type": "markdown", - "source": [ - "### OpenAI Python SDK" - ], "metadata": { "id": "lgNoM281cRzR" - } + }, + "source": [ + "### OpenAI Python SDK" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NY3DJhPfcQhA" + }, + "outputs": [], "source": [ "import openai\n", "from openai import OpenAI\n", @@ -478,24 +469,24 @@ ")\n", "\n", "print(response)\n" - ], - "metadata": { - "id": "NY3DJhPfcQhA" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Langchain Embeddings" - ], "metadata": { "id": "hmbg-DW6cUZs" - } + }, + "source": [ + "### Langchain Embeddings" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lX2S8Nl1cWVP" + }, + "outputs": [], "source": [ "from langchain.embeddings import OpenAIEmbeddings\n", "\n", @@ -526,24 +517,22 @@ "\n", "print(f\"TITAN EMBEDDINGS\")\n", "print(query_result[:5])" - ], - "metadata": { - "id": "lX2S8Nl1cWVP" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Curl Request" - ], "metadata": { "id": "oqGbWBCQcYfd" - } + }, + "source": [ + "### Curl Request" + ] }, { "cell_type": "markdown", + "metadata": { + "id": "7rkIMV9LcdwQ" + }, "source": [ "\n", "\n", @@ -556,10 +545,21 @@ " }'\n", "```\n", "\n" - ], - "metadata": { - "id": "7rkIMV9LcdwQ" - } + ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/cookbook/ai_coding_tool_guides/claude_code_quickstart/guide.md b/cookbook/ai_coding_tool_guides/claude_code_quickstart/guide.md new file mode 100644 index 000000000000..3d6c75498b12 --- /dev/null +++ b/cookbook/ai_coding_tool_guides/claude_code_quickstart/guide.md @@ -0,0 +1,295 @@ +# Claude Code with LiteLLM Quickstart + +This guide shows how to call Claude models (and any LiteLLM-supported model) through LiteLLM proxy from Claude Code. 
+ +> **Note:** This integration is based on [Anthropic's official LiteLLM configuration documentation](https://docs.anthropic.com/en/docs/claude-code/llm-gateway#litellm-configuration). It allows you to use any LiteLLM supported model through Claude Code with centralized authentication, usage tracking, and cost controls. + +## Video Walkthrough + +Watch the full tutorial: https://www.loom.com/embed/3c17d683cdb74d36a3698763cc558f56 + +## Prerequisites + +- [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview) installed +- API keys for your chosen providers + +## Installation + +First, install LiteLLM with proxy support: + +```bash +pip install 'litellm[proxy]' +``` + +## Step 1: Setup config.yaml + +Create a secure configuration using environment variables: + +```yaml +model_list: + # Claude models + - model_name: claude-3-5-sonnet-20241022 + litellm_params: + model: anthropic/claude-3-5-sonnet-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + + - model_name: claude-3-5-haiku-20241022 + litellm_params: + model: anthropic/claude-3-5-haiku-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + + +litellm_settings: + master_key: os.environ/LITELLM_MASTER_KEY +``` + +Set your environment variables: + +```bash +export ANTHROPIC_API_KEY="your-anthropic-api-key" +export LITELLM_MASTER_KEY="sk-1234567890" # Generate a secure key +``` + +## Step 2: Start Proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +## Step 3: Verify Setup + +Test that your proxy is working correctly: + +```bash +curl -X POST http://0.0.0.0:4000/v1/messages \ +-H "Authorization: Bearer $LITELLM_MASTER_KEY" \ +-H "Content-Type: application/json" \ +-d '{ + "model": "claude-3-5-sonnet-20241022", + "max_tokens": 1000, + "messages": [{"role": "user", "content": "What is the capital of France?"}] +}' +``` + +## Step 4: Configure Claude Code + +### Method 1: Unified Endpoint (Recommended) + +Configure Claude Code to use LiteLLM's unified endpoint. Either a virtual key or master key can be used here: + +```bash +export ANTHROPIC_BASE_URL="http://0.0.0.0:4000" +export ANTHROPIC_AUTH_TOKEN="$LITELLM_MASTER_KEY" +``` + +> **Tip:** LITELLM_MASTER_KEY gives Claude access to all proxy models, whereas a virtual key would be limited to the models set in the UI. + +### Method 2: Provider-specific Pass-through Endpoint + +Alternatively, use the Anthropic pass-through endpoint: + +```bash +export ANTHROPIC_BASE_URL="http://0.0.0.0:4000/anthropic" +export ANTHROPIC_AUTH_TOKEN="$LITELLM_MASTER_KEY" +``` + +## Step 5: Use Claude Code + +### Choosing Your Model + +You have two options for specifying which model Claude Code uses: + +#### Option 1: Command Line / Session Model Selection + +Specify the model directly when starting Claude Code or during a session: + +```bash +# Specify model at startup +claude --model claude-3-5-sonnet-20241022 + +# Or change model during a session +/model claude-3-5-haiku-20241022 +``` + +This method uses the exact model you specify. + +#### Option 2: Environment Variables + +Configure default models using environment variables: + +```bash +# Tell Claude Code which models to use by default +export ANTHROPIC_DEFAULT_SONNET_MODEL=claude-3-5-sonnet-20241022 +export ANTHROPIC_DEFAULT_HAIKU_MODEL=claude-3-5-haiku-20241022 +export ANTHROPIC_DEFAULT_OPUS_MODEL=claude-opus-3-5-20240229 + +claude # Will use the models specified above +``` + +**Note:** Claude Code may cache the model from a previous session. 
If environment variables don't take effect, use Option 1 to explicitly set the model. + +**Important:** The `model_name` in your LiteLLM config must match what Claude Code requests (either from env vars or command line). + +### Using 1M Context Window + +Claude Code supports extended context (1 million tokens) using the `[1m]` suffix with Claude 4+ models: + +```bash +# Use Sonnet 4.5 with 1M context (requires quotes for shell) +claude --model 'claude-sonnet-4-5-20250929[1m]' + +# Inside a Claude Code session (no quotes needed) +/model claude-sonnet-4-5-20250929[1m] +``` + +**Important:** When using `--model` with `[1m]` in the shell, you must use quotes to prevent the shell from interpreting the brackets. + +Alternatively, set as default with environment variables: + +```bash +export ANTHROPIC_DEFAULT_SONNET_MODEL='claude-sonnet-4-5-20250929[1m]' +claude +``` + +**How it works:** +- Claude Code strips the `[1m]` suffix before sending to LiteLLM +- Claude Code automatically adds the header `anthropic-beta: context-1m-2025-08-07` +- Your LiteLLM config should **NOT** include `[1m]` in model names + +**Verify 1M context is active:** +```bash +/context +# Should show: 21k/1000k tokens (2%) +``` + +**Pricing:** Models using 1M context have different pricing. Input tokens above 200k are charged at a higher rate. + +## Troubleshooting + +Common issues and solutions: + +**Claude Code not connecting:** +- Verify your proxy is running: `curl http://0.0.0.0:4000/health` +- Check that `ANTHROPIC_BASE_URL` is set correctly +- Ensure your `ANTHROPIC_AUTH_TOKEN` matches your LiteLLM master key + +**Authentication errors:** +- Verify your environment variables are set: `echo $LITELLM_MASTER_KEY` +- Check that your API keys are valid and have sufficient credits +- Ensure the `ANTHROPIC_AUTH_TOKEN` matches your LiteLLM master key + +**Model not found:** +- Check what model Claude Code is requesting in LiteLLM logs +- Ensure your `config.yaml` has a matching `model_name` entry +- If using environment variables, verify they're set: `echo $ANTHROPIC_DEFAULT_SONNET_MODEL` + +**1M context not working (showing 200k instead of 1000k):** +- Verify you're using the `[1m]` suffix: `/model your-model-name[1m]` +- Check LiteLLM logs for the header `context-1m-2025-08-07` in the request +- Ensure your model supports 1M context (only certain Claude models do) +- Your LiteLLM config should **NOT** include `[1m]` in the `model_name` + +## Using Multiple Models and Providers + +You can configure LiteLLM to route to any supported provider. 
Here's an example with multiple providers: + +```yaml +model_list: + # OpenAI models + - model_name: codex-mini + litellm_params: + model: openai/codex-mini + api_key: os.environ/OPENAI_API_KEY + api_base: https://api.openai.com/v1 + + - model_name: o3-pro + litellm_params: + model: openai/o3-pro + api_key: os.environ/OPENAI_API_KEY + api_base: https://api.openai.com/v1 + + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + api_base: https://api.openai.com/v1 + + # Anthropic models + - model_name: claude-3-5-sonnet-20241022 + litellm_params: + model: anthropic/claude-3-5-sonnet-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + + - model_name: claude-3-5-haiku-20241022 + litellm_params: + model: anthropic/claude-3-5-haiku-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + + # AWS Bedrock + - model_name: claude-bedrock + litellm_params: + model: bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-east-1 + +litellm_settings: + master_key: os.environ/LITELLM_MASTER_KEY +``` + +**Note:** The `model_name` can be anything you choose. Claude Code will request whatever model you specify (via env vars or command line), and LiteLLM will route to the `model` configured in `litellm_params`. + +Switch between models seamlessly: + +```bash +# Use environment variables to set defaults +export ANTHROPIC_DEFAULT_SONNET_MODEL=claude-3-5-sonnet-20241022 +export ANTHROPIC_DEFAULT_HAIKU_MODEL=claude-3-5-haiku-20241022 + +# Or specify directly +claude --model claude-3-5-sonnet-20241022 # Complex reasoning +claude --model claude-3-5-haiku-20241022 # Fast responses +claude --model claude-bedrock # Bedrock deployment +``` + +## Default Models Used by Claude Code + +If you **don't** set environment variables, Claude Code uses these default model names: + +| Purpose | Default Model Name (v2.1.14) | +|---------|------------------------------| +| Main model | `claude-sonnet-4-5-20250929` | +| Light tasks (subagents, summaries) | `claude-haiku-4-5-20251001` | +| Planning mode | `claude-opus-4-5-20251101` | + +Your LiteLLM config should include these model names if you want Claude Code to work without setting environment variables: + +```yaml +model_list: + - model_name: claude-sonnet-4-5-20250929 + litellm_params: + # Can be any provider - Anthropic, Bedrock, Vertex AI, etc. + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY + + - model_name: claude-haiku-4-5-20251001 + litellm_params: + model: anthropic/claude-haiku-4-5-20251001 + api_key: os.environ/ANTHROPIC_API_KEY + + - model_name: claude-opus-4-5-20251101 + litellm_params: + model: anthropic/claude-opus-4-5-20251101 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +**Warning:** These default model names may change with new Claude Code versions. Check LiteLLM proxy logs for "model not found" errors to identify what Claude Code is requesting. 
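+
+If you want to confirm this mapping without launching Claude Code, you can query the proxy's `/v1/models` endpoint directly. The sketch below is illustrative only: it assumes the proxy from this guide is running on `http://0.0.0.0:4000` with `LITELLM_MASTER_KEY` exported and `httpx` available, and the expected names are copied from the table above (adjust them for your Claude Code version).
+
+```python
+# Illustrative check: are the default model names Claude Code requests
+# actually exposed by the LiteLLM proxy? Assumes the proxy and master key
+# from this guide; the expected names come from the table above.
+import os
+
+import httpx
+
+PROXY_URL = "http://0.0.0.0:4000"
+EXPECTED = [
+    "claude-sonnet-4-5-20250929",  # main model
+    "claude-haiku-4-5-20251001",   # light tasks
+    "claude-opus-4-5-20251101",    # planning mode
+]
+
+resp = httpx.get(
+    f"{PROXY_URL}/v1/models",
+    headers={"Authorization": f"Bearer {os.environ['LITELLM_MASTER_KEY']}"},
+    timeout=10,
+)
+resp.raise_for_status()
+available = {m["id"] for m in resp.json()["data"]}
+
+for name in EXPECTED:
+    print(name, "->", "configured" if name in available else "missing from config.yaml")
+```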
+ +## Additional Resources + +- [LiteLLM Documentation](https://docs.litellm.ai/) +- [Claude Code Documentation](https://docs.anthropic.com/en/docs/claude-code/overview) +- [Anthropic's LiteLLM Configuration Guide](https://docs.anthropic.com/en/docs/claude-code/llm-gateway#litellm-configuration) + diff --git a/cookbook/ai_coding_tool_guides/index.json b/cookbook/ai_coding_tool_guides/index.json new file mode 100644 index 000000000000..3e71670d6239 --- /dev/null +++ b/cookbook/ai_coding_tool_guides/index.json @@ -0,0 +1,134 @@ +[{ + "title": "Claude Code Quickstart", + "description": "This is a quickstart guide to using Claude Code with LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/claude_responses_api", + "date": "2026-01-15", + "version": "1.0.0", + "tags": [ + "Claude Code", + "LiteLLM" + ] +}, +{ + "title": "Claude Code with MCPs", + "description": "This is a guide to using Claude Code with MCPs via LiteLLM Proxy.", + "url": "https://docs.litellm.ai/docs/tutorials/claude_mcp", + "date": "2026-01-15", + "version": "1.0.0", + "tags": [ + "Claude Code", + "LiteLLM", + "MCP" + ] +}, +{ + "title": "Claude Code with Non-Anthropic Models", + "description": "This is a guide to using Claude Code with non-Anthropic models via LiteLLM Proxy.", + "url": "https://docs.litellm.ai/docs/tutorials/claude_non_anthropic_models", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "Claude Code", + "LiteLLM", + "OpenAI", + "Gemini" + ] +}, +{ + "title": "Cursor Quickstart", + "description": "This is a quickstart guide to using Cursor with LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/cursor_integration", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "Cursor", + "LiteLLM", + "Quickstart" + ] +}, +{ + "title": "Github Copilot Quickstart", + "description": "This is a quickstart guide to using Github Copilot with LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/github_copilot_integration", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "Github Copilot", + "LiteLLM", + "Quickstart" + ] +}, +{ + "title": "LiteLLM Gemini CLI Quickstart", + "description": "This is a quickstart guide to using LiteLLM Gemini CLI.", + "url": "https://docs.litellm.ai/docs/tutorials/litellm_gemini_cli", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "Gemini CLI", + "Gemini", + "LiteLLM", + "Quickstart" + ] +}, +{ + "title": "OpenAI Codex CLI Quickstart", + "description": "This is a quickstart guide to using OpenAI Codex CLI.", + "url": "https://docs.litellm.ai/docs/tutorials/openai_codex", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "OpenAI Codex CLI", + "OpenAI", + "LiteLLM", + "Quickstart" + ] +}, +{ + "title": "OpenWebUI Quickstart", + "description": "This is a quickstart guide to using OpenWebUI with LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/openweb_ui", + "date": "2026-01-16", + "version": "1.0.0", + "tags": [ + "OpenWebUI", + "LiteLLM", + "Quickstart" + ] +}, +{ + "title": "AI Coding Tool Usage Tracking", + "description": "This is a guide to tracking usage for AI coding tools monitor the use of Claude Code , Google Antigravity, OpenAI Codex, Roo Code etc. 
through LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/cost_tracking_coding", + "date": "2026-01-17", + "version": "1.0.0", + "tags": [ + "Claude Code", + "Gemini CLI", + "OpenAI Codex", + "LiteLLM" + ] +}, +{ + "title": "Use Web Search with Claude Code (across Bedrock/OpenAI/Gemini/etc.)", + "description": "This is a guide for using Web Search with Claude Code via LiteLLM.", + "url": "https://docs.litellm.ai/docs/tutorials/claude_code_websearch", + "date": "2026-01-17", + "version": "1.0.0", + "tags": [ + "Claude Code", + "LiteLLM", + "Web Search" + ] +}, +{ + "title": "Track Claude Code Usage per user via Custom Headers", + "description": "This is a guide for tracking claude code user usage by passing a customer ID header.", + "url": "https://docs.litellm.ai/docs/tutorials/claude_code_customer_tracking", + "date": "2026-01-17", + "version": "1.0.0", + "tags": [ + "Claude Code", + "LiteLLM" + ] +}] \ No newline at end of file diff --git a/cookbook/anthropic_agent_sdk/README.md b/cookbook/anthropic_agent_sdk/README.md new file mode 100644 index 000000000000..294d949e24e3 --- /dev/null +++ b/cookbook/anthropic_agent_sdk/README.md @@ -0,0 +1,144 @@ +# Claude Agent SDK with LiteLLM Gateway + +A simple example showing how to use Claude's Agent SDK with LiteLLM as a proxy. This lets you use any LLM provider (OpenAI, Bedrock, Azure, etc.) through the Agent SDK. + +## Quick Start + +### 1. Install dependencies + +```bash +pip install anthropic claude-agent-sdk litellm +``` + +### 2. Start LiteLLM proxy + +```bash +# Simple start with Claude +litellm --model claude-sonnet-4-20250514 + +# Or with a config file +litellm --config config.yaml +``` + +### 3. Run the chat + +**Basic Agent (no MCP):** + +```bash +python main.py +``` + +**Agent with MCP (DeepWiki2 for research):** + +```bash +python agent_with_mcp.py +``` + +If MCP connection fails, you can disable it: + +```bash +USE_MCP=false python agent_with_mcp.py +``` + +That's it! You can now chat with the agent in your terminal. + +### Chat Commands + +While chatting, you can use these commands: +- `models` - List all available models (fetched from your LiteLLM proxy) +- `model` - Switch to a different model +- `clear` - Start a new conversation +- `quit` or `exit` - End the chat + +The chat automatically fetches available models from your LiteLLM proxy's `/models` endpoint, so you'll always see what's currently configured. + +## Configuration + +Set these environment variables if needed: + +```bash +export LITELLM_PROXY_URL="http://localhost:4000" +export LITELLM_API_KEY="sk-1234" +export LITELLM_MODEL="bedrock-claude-sonnet-4.5" +``` + +Or just use the defaults - it'll connect to `http://localhost:4000` by default. 
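+
+If you want to script a single request instead of using the interactive chat, the snippet below is a minimal one-shot sketch (not one of the shipped scripts). It reuses the same environment variables, defaults, and SDK calls as `main.py` and `common.py`.
+
+```python
+# One-shot sketch: send a single prompt through the Agent SDK via LiteLLM.
+# Env var names and defaults mirror the Config class in common.py.
+import asyncio
+import os
+
+from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient
+
+# Point the Agent SDK at the LiteLLM gateway (mirrors setup_litellm_env in common.py)
+os.environ["ANTHROPIC_BASE_URL"] = os.getenv("LITELLM_PROXY_URL", "http://localhost:4000").rstrip("/")
+os.environ["ANTHROPIC_API_KEY"] = os.getenv("LITELLM_API_KEY", "sk-1234")
+
+
+async def ask(prompt: str) -> None:
+    options = ClaudeAgentOptions(
+        system_prompt="You are a helpful AI assistant. Be concise.",
+        model=os.getenv("LITELLM_MODEL", "bedrock-claude-sonnet-4.5"),
+        max_turns=1,
+    )
+    async with ClaudeSDKClient(options=options) as client:
+        await client.query(prompt)
+        async for msg in client.receive_response():
+            for block in getattr(msg, "content", []) or []:
+                if hasattr(block, "text"):
+                    print(block.text, end="", flush=True)
+    print()
+
+
+asyncio.run(ask("Say hello in one sentence."))
+```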
+ +## Files + +- `main.py` - Basic interactive agent without MCP +- `agent_with_mcp.py` - Agent with MCP server integration (DeepWiki2) +- `common.py` - Shared utilities and functions +- `config.example.yaml` - Example LiteLLM configuration +- `requirements.txt` - Python dependencies + +## Example Config File + +If you want to use multiple models, create a `config.yaml` (see `config.example.yaml`): + +```yaml +model_list: + - model_name: bedrock-claude-sonnet-4 + litellm_params: + model: "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0" + aws_region_name: "us-east-1" + + - model_name: bedrock-claude-sonnet-4.5 + litellm_params: + model: "bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0" + aws_region_name: "us-east-1" +``` + +Then start LiteLLM with: `litellm --config config.yaml` + +## How It Works + +The key is pointing the Agent SDK to LiteLLM instead of directly to Anthropic: + +```python +# Point to LiteLLM gateway (not Anthropic) +os.environ["ANTHROPIC_BASE_URL"] = "http://localhost:4000" +os.environ["ANTHROPIC_API_KEY"] = "sk-1234" # Your LiteLLM key + +# Use any model configured in LiteLLM +options = ClaudeAgentOptions( + model="bedrock-claude-sonnet-4", # or gpt-4, or anything else + system_prompt="You are a helpful assistant.", + max_turns=50, +) +``` + +Note: Don't add `/anthropic` to the base URL - LiteLLM handles the routing automatically. + +## Why Use This? + +- **Switch providers easily**: Use the same code with OpenAI, Bedrock, Azure, etc. +- **Cost tracking**: LiteLLM tracks spending across all your agent conversations +- **Rate limiting**: Set budgets and limits on your agent usage +- **Load balancing**: Distribute requests across multiple API keys or regions +- **Fallbacks**: Automatically retry with a different model if one fails + +## Troubleshooting + +**Connection errors?** +- Make sure LiteLLM is running: `litellm --model your-model` +- Check the URL is correct (default: `http://localhost:4000`) + +**Authentication errors?** +- Verify your LiteLLM API key is correct +- Make sure the model is configured in your LiteLLM setup + +**Model not found?** +- Check the model name matches what's in your LiteLLM config +- Run `litellm --model your-model` to test it works + +**Agent with MCP stuck or failing?** +- The MCP server might not be available at `http://localhost:4000/mcp/deepwiki2` +- Try disabling MCP: `USE_MCP=false python agent_with_mcp.py` +- Or use the basic agent: `python main.py` + +## Learn More + +- [LiteLLM Docs](https://docs.litellm.ai/) +- [Claude Agent SDK](https://github.com/anthropics/anthropic-agent-sdk) +- [LiteLLM Proxy Guide](https://docs.litellm.ai/docs/proxy/quick_start) diff --git a/cookbook/anthropic_agent_sdk/agent_with_mcp.py b/cookbook/anthropic_agent_sdk/agent_with_mcp.py new file mode 100644 index 000000000000..ff25feb777fc --- /dev/null +++ b/cookbook/anthropic_agent_sdk/agent_with_mcp.py @@ -0,0 +1,140 @@ +""" +Interactive Claude Agent SDK CLI with MCP Support + +This example demonstrates an interactive CLI chat with the Anthropic Agent SDK using LiteLLM as a proxy, +with MCP (Model Context Protocol) server integration for enhanced capabilities. 
+""" + +import asyncio +import os +from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions +from common import ( + Config, + fetch_available_models, + setup_litellm_env, + print_header, + handle_model_list, + handle_model_switch, + stream_response, +) + + +async def interactive_chat_with_mcp(): + """ + Interactive CLI chat with the agent and MCP server + """ + config = Config() + + # Configure Anthropic SDK to point to LiteLLM gateway + litellm_base_url = setup_litellm_env(config) + + # Fetch available models from proxy + available_models = await fetch_available_models(litellm_base_url, config.LITELLM_API_KEY) + + current_model = config.LITELLM_MODEL + + # MCP server configuration + mcp_server_url = f"{litellm_base_url}/mcp/deepwiki2" + use_mcp = os.getenv("USE_MCP", "true").lower() == "true" + + if not use_mcp: + print("⚠️ MCP disabled via USE_MCP=false") + + print_header(litellm_base_url, current_model, has_mcp=use_mcp) + + while True: + # Configure agent options + if use_mcp: + try: + # Try with MCP server (HTTP transport) + # Using McpHttpServerConfig format from Agent SDK + options = ClaudeAgentOptions( + system_prompt="You are a helpful AI assistant with access to DeepWiki for research. Be concise, accurate, and friendly.", + model=current_model, + max_turns=50, + mcp_servers={ + "deepwiki2": { + "type": "http", + "url": mcp_server_url, + "headers": { + "Authorization": f"Bearer {config.LITELLM_API_KEY}" + } + } + }, + ) + except Exception as e: + print(f"⚠️ Warning: Could not configure MCP server: {e}") + print("Continuing without MCP...\n") + use_mcp = False + options = ClaudeAgentOptions( + system_prompt="You are a helpful AI assistant. Be concise, accurate, and friendly.", + model=current_model, + max_turns=50, + ) + else: + # Without MCP + options = ClaudeAgentOptions( + system_prompt="You are a helpful AI assistant. Be concise, accurate, and friendly.", + model=current_model, + max_turns=50, + ) + + # Create agent client + try: + async with ClaudeSDKClient(options=options) as client: + conversation_active = True + + while conversation_active: + # Get user input + try: + user_input = input("\n👤 You: ").strip() + except (EOFError, KeyboardInterrupt): + print("\n\n👋 Goodbye!") + return + + # Handle commands + if user_input.lower() in ['quit', 'exit']: + print("\n👋 Goodbye!") + return + + if user_input.lower() == 'clear': + print("\n🔄 Starting new conversation...\n") + conversation_active = False + continue + + if user_input.lower() == 'models': + handle_model_list(available_models, current_model) + continue + + if user_input.lower() == 'model': + new_model, should_restart = handle_model_switch(available_models, current_model) + if should_restart: + current_model = new_model + conversation_active = False + continue + + if not user_input: + continue + + # Stream response from agent + await stream_response(client, user_input) + + except Exception as e: + print(f"\n❌ Error creating agent client: {e}") + print("This might be an MCP configuration issue. 
Try running without MCP:") + print(" USE_MCP=false python agent_with_mcp.py") + print("\nOr use the basic agent:") + print(" python main.py") + return + + +def main(): + """Run interactive chat with MCP""" + try: + asyncio.run(interactive_chat_with_mcp()) + except KeyboardInterrupt: + print("\n\n👋 Goodbye!") + + +if __name__ == "__main__": + main() diff --git a/cookbook/anthropic_agent_sdk/common.py b/cookbook/anthropic_agent_sdk/common.py new file mode 100644 index 000000000000..d9ee65cb58d7 --- /dev/null +++ b/cookbook/anthropic_agent_sdk/common.py @@ -0,0 +1,160 @@ +""" +Common utilities for Claude Agent SDK examples +""" + +import os +import httpx + + +class Config: + """Configuration for LiteLLM Gateway connection""" + + # LiteLLM proxy URL (default to local instance) + LITELLM_PROXY_URL = os.getenv("LITELLM_PROXY_URL", "http://localhost:4000") + + # LiteLLM API key (master key or virtual key) + LITELLM_API_KEY = os.getenv("LITELLM_API_KEY", "sk-1234") + + # Model name as configured in LiteLLM (e.g., "bedrock-claude-sonnet-4", "gpt-4", etc.) + LITELLM_MODEL = os.getenv("LITELLM_MODEL", "bedrock-claude-sonnet-4.5") + + +async def fetch_available_models(base_url: str, api_key: str) -> list[str]: + """ + Fetch available models from LiteLLM proxy /models endpoint + """ + try: + async with httpx.AsyncClient() as client: + response = await client.get( + f"{base_url}/models", + headers={"Authorization": f"Bearer {api_key}"}, + timeout=10.0 + ) + response.raise_for_status() + data = response.json() + return [model["id"] for model in data.get("data", [])] + except Exception as e: + print(f"⚠️ Warning: Could not fetch models from proxy: {e}") + print("Using default model list...") + # Fallback to default models + return [ + "bedrock-claude-sonnet-3.5", + "bedrock-claude-sonnet-4", + "bedrock-claude-sonnet-4.5", + "bedrock-claude-opus-4.5", + "bedrock-nova-premier", + ] + + +def setup_litellm_env(config: Config): + """ + Configure environment variables to point Agent SDK to LiteLLM + """ + litellm_base_url = config.LITELLM_PROXY_URL.rstrip('/') + os.environ["ANTHROPIC_BASE_URL"] = litellm_base_url + os.environ["ANTHROPIC_API_KEY"] = config.LITELLM_API_KEY + return litellm_base_url + + +def print_header(base_url: str, current_model: str, has_mcp: bool = False): + """ + Print the chat header + """ + mcp_indicator = " + MCP" if has_mcp else "" + print("=" * 70) + print(f"🤖 Claude Agent SDK with LiteLLM Gateway{mcp_indicator} - Interactive Chat") + print("=" * 70) + print(f"🚀 Connected to: {base_url}") + print(f"📦 Current model: {current_model}") + if has_mcp: + print("🔌 MCP: deepwiki2 enabled") + print("\nType your messages below. Commands:") + print(" - 'quit' or 'exit' to end the conversation") + print(" - 'clear' to start a new conversation") + print(" - 'model' to switch models") + print(" - 'models' to list available models") + print("=" * 70) + print() + + +def handle_model_list(available_models: list[str], current_model: str): + """ + Display available models + """ + print("\n📋 Available models:") + for i, model in enumerate(available_models, 1): + marker = "✓" if model == current_model else " " + print(f" {marker} {i}. {model}") + + +def handle_model_switch(available_models: list[str], current_model: str) -> tuple[str, bool]: + """ + Handle model switching + + Returns: + tuple: (new_model, should_restart_conversation) + """ + print("\n📋 Select a model:") + for i, model in enumerate(available_models, 1): + marker = "✓" if model == current_model else " " + print(f" {marker} {i}. 
{model}") + + try: + choice = input("\nEnter number (or press Enter to cancel): ").strip() + if choice: + idx = int(choice) - 1 + if 0 <= idx < len(available_models): + new_model = available_models[idx] + print(f"\n✅ Switched to: {new_model}") + print("🔄 Starting new conversation with new model...\n") + return new_model, True + else: + print("❌ Invalid choice") + except (ValueError, IndexError): + print("❌ Invalid input") + + return current_model, False + + +async def stream_response(client, user_input: str): + """ + Stream response from the agent + """ + print("\n🤖 Assistant: ", end='', flush=True) + + try: + await client.query(user_input) + + # Show loading indicator + print("⏳ thinking...", end='', flush=True) + + # Stream the response + first_chunk = True + async for msg in client.receive_response(): + # Clear loading indicator on first message + if first_chunk: + print("\r🤖 Assistant: ", end='', flush=True) + first_chunk = False + + # Handle different message types + if hasattr(msg, 'type'): + if msg.type == 'content_block_delta': + # Streaming text delta + if hasattr(msg, 'delta') and hasattr(msg.delta, 'text'): + print(msg.delta.text, end='', flush=True) + elif msg.type == 'content_block_start': + # Start of content block + if hasattr(msg, 'content_block') and hasattr(msg.content_block, 'text'): + print(msg.content_block.text, end='', flush=True) + + # Fallback to original content handling + if hasattr(msg, 'content'): + for content_block in msg.content: + if hasattr(content_block, 'text'): + print(content_block.text, end='', flush=True) + + print() # New line after response + + except Exception as e: + print(f"\r\n❌ Error: {e}") + print("Please check your LiteLLM gateway is running and configured correctly.") diff --git a/cookbook/anthropic_agent_sdk/config.example.yaml b/cookbook/anthropic_agent_sdk/config.example.yaml new file mode 100644 index 000000000000..eb1984fc4eaf --- /dev/null +++ b/cookbook/anthropic_agent_sdk/config.example.yaml @@ -0,0 +1,25 @@ +model_list: + - model_name: bedrock-claude-sonnet-3.5 + litellm_params: + model: "bedrock/us.anthropic.claude-3-5-sonnet-20240620-v1:0" + aws_region_name: "us-east-1" + + - model_name: bedrock-claude-sonnet-4 + litellm_params: + model: "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0" + aws_region_name: "us-east-1" + + - model_name: bedrock-claude-sonnet-4.5 + litellm_params: + model: "bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0" + aws_region_name: "us-east-1" + + - model_name: bedrock-claude-opus-4.5 + litellm_params: + model: "bedrock/us.anthropic.claude-opus-4-5-20251101-v1:0" + aws_region_name: "us-east-1" + + - model_name: bedrock-nova-premier + litellm_params: + model: "bedrock/amazon.nova-premier-v1:0" + aws_region_name: "us-east-1" diff --git a/cookbook/anthropic_agent_sdk/main.py b/cookbook/anthropic_agent_sdk/main.py new file mode 100644 index 000000000000..231b57ca97b9 --- /dev/null +++ b/cookbook/anthropic_agent_sdk/main.py @@ -0,0 +1,95 @@ +""" +Simple Interactive Claude Agent SDK CLI using LiteLLM Gateway + +This example demonstrates an interactive CLI chat with the Anthropic Agent SDK using LiteLLM as a proxy. +LiteLLM acts as a unified interface, allowing you to use any LLM provider (OpenAI, Azure, Bedrock, etc.) +through the Claude Agent SDK by pointing it to the LiteLLM gateway. 
+""" + +import asyncio +from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions +from common import ( + Config, + fetch_available_models, + setup_litellm_env, + print_header, + handle_model_list, + handle_model_switch, + stream_response, +) + + +async def interactive_chat(): + """ + Interactive CLI chat with the agent + """ + config = Config() + + # Configure Anthropic SDK to point to LiteLLM gateway + litellm_base_url = setup_litellm_env(config) + + # Fetch available models from proxy + available_models = await fetch_available_models(litellm_base_url, config.LITELLM_API_KEY) + + current_model = config.LITELLM_MODEL + + print_header(litellm_base_url, current_model) + + while True: + # Configure agent options for each conversation + options = ClaudeAgentOptions( + system_prompt="You are a helpful AI assistant. Be concise, accurate, and friendly.", + model=current_model, + max_turns=50, + ) + + # Create agent client + async with ClaudeSDKClient(options=options) as client: + conversation_active = True + + while conversation_active: + # Get user input + try: + user_input = input("\n👤 You: ").strip() + except (EOFError, KeyboardInterrupt): + print("\n\n👋 Goodbye!") + return + + # Handle commands + if user_input.lower() in ['quit', 'exit']: + print("\n👋 Goodbye!") + return + + if user_input.lower() == 'clear': + print("\n🔄 Starting new conversation...\n") + conversation_active = False + continue + + if user_input.lower() == 'models': + handle_model_list(available_models, current_model) + continue + + if user_input.lower() == 'model': + new_model, should_restart = handle_model_switch(available_models, current_model) + if should_restart: + current_model = new_model + conversation_active = False + continue + + if not user_input: + continue + + # Stream response from agent + await stream_response(client, user_input) + + +def main(): + """Run interactive chat""" + try: + asyncio.run(interactive_chat()) + except KeyboardInterrupt: + print("\n\n👋 Goodbye!") + + +if __name__ == "__main__": + main() diff --git a/cookbook/anthropic_agent_sdk/requirements.txt b/cookbook/anthropic_agent_sdk/requirements.txt new file mode 100644 index 000000000000..1e810bb7d992 --- /dev/null +++ b/cookbook/anthropic_agent_sdk/requirements.txt @@ -0,0 +1,2 @@ +claude-agent-sdk +httpx>=0.27.0 diff --git a/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_README.md b/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_README.md new file mode 100644 index 000000000000..1bf52d922c6d --- /dev/null +++ b/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_README.md @@ -0,0 +1,279 @@ +# Braintrust Prompt Wrapper for LiteLLM + +This directory contains a wrapper server that enables LiteLLM to use prompts from [Braintrust](https://www.braintrust.dev/) through the generic prompt management API. + +## Architecture + +``` +┌─────────────┐ ┌──────────────────────┐ ┌─────────────┐ +│ LiteLLM │ ──────> │ Wrapper Server │ ──────> │ Braintrust │ +│ Client │ │ (This Server) │ │ API │ +└─────────────┘ └──────────────────────┘ └─────────────┘ + Uses generic Transforms Stores actual + prompt manager Braintrust format prompt templates + to LiteLLM format +``` + +## Components + +### 1. Generic Prompt Manager (`litellm/integrations/generic_prompt_management/`) + +A generic client that can work with any API implementing the `/beta/litellm_prompt_management` endpoint. 
+ +**Expected API Response Format:** +```json +{ + "prompt_id": "string", + "prompt_template": [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello {name}"} + ], + "prompt_template_model": "gpt-4", + "prompt_template_optional_params": { + "temperature": 0.7, + "max_tokens": 100 + } +} +``` + +### 2. Braintrust Wrapper Server (`braintrust_prompt_wrapper_server.py`) + +A FastAPI server that: +- Implements the `/beta/litellm_prompt_management` endpoint +- Fetches prompts from Braintrust API +- Transforms Braintrust response format to LiteLLM format + +## Setup + +### Install Dependencies + +```bash +pip install fastapi uvicorn httpx litellm +``` + +### Set Environment Variables + +```bash +export BRAINTRUST_API_KEY="your-braintrust-api-key" +``` + +## Usage + +### Step 1: Start the Wrapper Server + +```bash +python braintrust_prompt_wrapper_server.py +``` + +The server will start on `http://localhost:8080` by default. + +You can customize the port and host: +```bash +export PORT=8000 +export HOST=0.0.0.0 +python braintrust_prompt_wrapper_server.py +``` + +### Step 2: Use with LiteLLM + +```python +import litellm +from litellm.integrations.generic_prompt_management import GenericPromptManager + +# Configure the generic prompt manager to use your wrapper server +generic_config = { + "api_base": "http://localhost:8080", + "api_key": "your-braintrust-api-key", # Will be passed to Braintrust + "timeout": 30, +} + +# Create the prompt manager +prompt_manager = GenericPromptManager(**generic_config) + +# Use with completion +response = litellm.completion( + model="generic_prompt/gpt-4", + prompt_id="your-braintrust-prompt-id", + prompt_variables={"name": "World"}, # Variables to substitute + messages=[{"role": "user", "content": "Additional message"}] +) + +print(response) +``` + +### Step 3: Direct API Testing + +You can also test the wrapper API directly: + +```bash +# Test with curl +curl -H "Authorization: Bearer YOUR_BRAINTRUST_TOKEN" \ + "http://localhost:8080/beta/litellm_prompt_management?prompt_id=YOUR_PROMPT_ID" + +# Health check +curl http://localhost:8080/health + +# Service info +curl http://localhost:8080/ +``` + +## API Documentation + +Once the server is running, visit: +- Swagger UI: `http://localhost:8080/docs` +- ReDoc: `http://localhost:8080/redoc` + +## Braintrust Format Transformation + +The wrapper automatically transforms Braintrust's response format: + +**Braintrust API Response:** +```json +{ + "id": "prompt-123", + "prompt_data": { + "prompt": { + "type": "chat", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant" + } + ] + }, + "options": { + "model": "gpt-4", + "params": { + "temperature": 0.7, + "max_tokens": 100 + } + } + } +} +``` + +**Transformed to LiteLLM Format:** +```json +{ + "prompt_id": "prompt-123", + "prompt_template": [ + { + "role": "system", + "content": "You are a helpful assistant" + } + ], + "prompt_template_model": "gpt-4", + "prompt_template_optional_params": { + "temperature": 0.7, + "max_tokens": 100 + } +} +``` + +## Supported Parameters + +The wrapper automatically maps these Braintrust parameters to LiteLLM: + +- `temperature` +- `max_tokens` / `max_completion_tokens` +- `top_p` +- `frequency_penalty` +- `presence_penalty` +- `n` +- `stop` +- `response_format` +- `tool_choice` +- `function_call` +- `tools` + +## Variable Substitution + +The generic prompt manager supports simple variable substitution: + +```python +# In your Braintrust prompt: +# "Hello {name}, 
welcome to {place}!" + +# In your code: +prompt_variables = { + "name": "Alice", + "place": "Wonderland" +} + +# Result: +# "Hello Alice, welcome to Wonderland!" +``` + +Supports both `{variable}` and `{{variable}}` syntax. + +## Error Handling + +The wrapper provides detailed error messages: + +- **401**: Missing or invalid Braintrust API token +- **404**: Prompt not found in Braintrust +- **502**: Failed to connect to Braintrust API +- **500**: Error transforming response + +## Production Deployment + +For production use: + +1. **Use HTTPS**: Deploy behind a reverse proxy with SSL +2. **Authentication**: Add authentication to the wrapper endpoint if needed +3. **Rate Limiting**: Implement rate limiting to prevent abuse +4. **Caching**: Consider caching prompt responses +5. **Monitoring**: Add logging and monitoring + +Example with Docker: + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +RUN pip install fastapi uvicorn httpx + +COPY braintrust_prompt_wrapper_server.py . + +ENV PORT=8080 +ENV HOST=0.0.0.0 + +EXPOSE 8080 + +CMD ["python", "braintrust_prompt_wrapper_server.py"] +``` + +## Extending to Other Providers + +This pattern can be used with any prompt management provider: + +1. Create a wrapper server that implements `/beta/litellm_prompt_management` +2. Transform the provider's response to LiteLLM format +3. Use the generic prompt manager to connect + +Example providers: +- Langsmith +- PromptLayer +- Humanloop +- Custom internal systems + +## Troubleshooting + +### "No Braintrust API token provided" +- Set `BRAINTRUST_API_KEY` environment variable +- Or pass token in `Authorization: Bearer TOKEN` header + +### "Failed to connect to Braintrust API" +- Check your internet connection +- Verify Braintrust API is accessible +- Check firewall settings + +### "Prompt not found" +- Verify the prompt ID exists in Braintrust +- Check that your API token has access to the prompt + +## License + +This wrapper is part of the LiteLLM project and follows the same license. + diff --git a/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_server.py b/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_server.py new file mode 100644 index 000000000000..6379314c5b6d --- /dev/null +++ b/cookbook/litellm_proxy_server/braintrust_prompt_wrapper_server.py @@ -0,0 +1,274 @@ +""" +Mock server that implements the /beta/litellm_prompt_management endpoint +and acts as a wrapper for calling the Braintrust API. + +This server transforms Braintrust's prompt API response into the format +expected by LiteLLM's generic prompt management client. + +Usage: + python braintrust_prompt_wrapper_server.py + + # Then test with: + curl -H "Authorization: Bearer YOUR_BRAINTRUST_TOKEN" \ + "http://localhost:8080/beta/litellm_prompt_management?prompt_id=YOUR_PROMPT_ID" +""" + +import json +import os +from typing import Any, Dict, List, Optional + +import httpx +from fastapi import FastAPI, HTTPException, Header, Query +from fastapi.responses import JSONResponse +import uvicorn + + +app = FastAPI( + title="Braintrust Prompt Wrapper", + description="Wrapper server for Braintrust prompts to work with LiteLLM", + version="1.0.0", +) + + +def transform_braintrust_message(message: Dict[str, Any]) -> Dict[str, str]: + """ + Transform a Braintrust message to LiteLLM format. + + Braintrust message format: + { + "role": "system", + "content": "...", + "name": "..." (optional) + } + + LiteLLM format: + { + "role": "system", + "content": "..." 
+ } + """ + result = { + "role": message.get("role", "user"), + "content": message.get("content", ""), + } + + # Include name if present + if "name" in message: + result["name"] = message["name"] + + return result + + +def transform_braintrust_response( + braintrust_response: Dict[str, Any], +) -> Dict[str, Any]: + """ + Transform Braintrust API response to LiteLLM prompt management format. + + Braintrust response format: + { + "objects": [{ + "id": "prompt_id", + "prompt_data": { + "prompt": { + "type": "chat", + "messages": [...], + "tools": "..." + }, + "options": { + "model": "gpt-4", + "params": { + "temperature": 0.7, + "max_tokens": 100, + ... + } + } + } + }] + } + + LiteLLM format: + { + "prompt_id": "prompt_id", + "prompt_template": [...], + "prompt_template_model": "gpt-4", + "prompt_template_optional_params": {...} + } + """ + # Extract the first object from the objects array if it exists + if "objects" in braintrust_response and len(braintrust_response["objects"]) > 0: + prompt_object = braintrust_response["objects"][0] + else: + prompt_object = braintrust_response + + prompt_data = prompt_object.get("prompt_data", {}) + prompt_info = prompt_data.get("prompt", {}) + options = prompt_data.get("options", {}) + + # Extract messages + messages = prompt_info.get("messages", []) + transformed_messages = [transform_braintrust_message(msg) for msg in messages] + + # Extract model + model = options.get("model") + + # Extract optional parameters + params = options.get("params", {}) + optional_params: Dict[str, Any] = {} + + # Map common parameters + param_mapping = { + "temperature": "temperature", + "max_tokens": "max_tokens", + "max_completion_tokens": "max_tokens", # Alternative name + "top_p": "top_p", + "frequency_penalty": "frequency_penalty", + "presence_penalty": "presence_penalty", + "n": "n", + "stop": "stop", + } + + for braintrust_param, litellm_param in param_mapping.items(): + if braintrust_param in params: + value = params[braintrust_param] + if value is not None: + optional_params[litellm_param] = value + + # Handle response_format + if "response_format" in params: + optional_params["response_format"] = params["response_format"] + + # Handle tool_choice + if "tool_choice" in params: + optional_params["tool_choice"] = params["tool_choice"] + + # Handle function_call + if "function_call" in params: + optional_params["function_call"] = params["function_call"] + + # Add tools if present + if "tools" in prompt_info and prompt_info["tools"]: + optional_params["tools"] = prompt_info["tools"] + + # Handle tool_functions from prompt_data + if "tool_functions" in prompt_data and prompt_data["tool_functions"]: + optional_params["tool_functions"] = prompt_data["tool_functions"] + + return { + "prompt_id": prompt_object.get("id"), + "prompt_template": transformed_messages, + "prompt_template_model": model, + "prompt_template_optional_params": optional_params if optional_params else None, + } + + +@app.get("/beta/litellm_prompt_management") +async def get_prompt( + prompt_id: str = Query(..., description="The Braintrust prompt ID to fetch"), + authorization: Optional[str] = Header( + None, description="Bearer token for Braintrust API" + ), +) -> JSONResponse: + """ + Fetch a prompt from Braintrust and transform it to LiteLLM format. 
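+    The prompt is looked up at https://api.braintrust.dev/v1/prompt/{prompt_id}
+    using the caller's bearer token (or BRAINTRUST_API_KEY as a fallback).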
+ + Args: + prompt_id: The Braintrust prompt ID + authorization: Bearer token for Braintrust API (from header) + + Returns: + JSONResponse with the transformed prompt data + """ + # Extract token from Authorization header or environment + braintrust_token = None + if authorization and authorization.startswith("Bearer "): + braintrust_token = authorization.replace("Bearer ", "") + else: + braintrust_token = os.getenv("BRAINTRUST_API_KEY") + + if not braintrust_token: + raise HTTPException( + status_code=401, + detail="No Braintrust API token provided. Pass via Authorization header or set BRAINTRUST_API_KEY environment variable.", + ) + + # Call Braintrust API + braintrust_url = f"https://api.braintrust.dev/v1/prompt/{prompt_id}" + headers = { + "Authorization": f"Bearer {braintrust_token}", + "Accept": "application/json", + } + print(f"headers: {headers}") + print(f"braintrust_url: {braintrust_url}") + print(f"braintrust_token: {braintrust_token}") + + try: + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.get(braintrust_url, headers=headers) + response.raise_for_status() + braintrust_data = response.json() + except httpx.HTTPStatusError as e: + raise HTTPException( + status_code=e.response.status_code, + detail=f"Braintrust API error: {e.response.text}", + ) + except httpx.RequestError as e: + raise HTTPException( + status_code=502, + detail=f"Failed to connect to Braintrust API: {str(e)}", + ) + except json.JSONDecodeError as e: + raise HTTPException( + status_code=502, + detail=f"Failed to parse Braintrust API response: {str(e)}", + ) + + print(f"braintrust_data: {braintrust_data}") + # Transform the response + try: + transformed_data = transform_braintrust_response(braintrust_data) + print(f"transformed_data: {transformed_data}") + return JSONResponse(content=transformed_data) + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"Failed to transform Braintrust response: {str(e)}", + ) + + +@app.get("/health") +async def health_check(): + """Health check endpoint.""" + return {"status": "healthy", "service": "braintrust-prompt-wrapper"} + + +@app.get("/") +async def root(): + """Root endpoint with service information.""" + return { + "service": "Braintrust Prompt Wrapper for LiteLLM", + "version": "1.0.0", + "endpoints": { + "prompt_management": "/beta/litellm_prompt_management?prompt_id=", + "health": "/health", + }, + "documentation": "/docs", + } + + +def main(): + """Run the server.""" + port = int(os.getenv("PORT", "8080")) + host = os.getenv("HOST", "0.0.0.0") + + print(f"🚀 Starting Braintrust Prompt Wrapper Server on {host}:{port}") + print(f"📚 API Documentation available at http://{host}:{port}/docs") + print( + f"🔑 Make sure to set BRAINTRUST_API_KEY environment variable or pass token in Authorization header" + ) + + uvicorn.run(app, host=host, port=port) + + +if __name__ == "__main__": + main() diff --git a/cookbook/litellm_proxy_server/secret_manager/custom_secret_manager_config.yaml b/cookbook/litellm_proxy_server/secret_manager/custom_secret_manager_config.yaml new file mode 100644 index 000000000000..3598a9b1b654 --- /dev/null +++ b/cookbook/litellm_proxy_server/secret_manager/custom_secret_manager_config.yaml @@ -0,0 +1,20 @@ +general_settings: + master_key: os.environ/LITELLM_MASTER_KEY + key_management_system: "custom" + key_management_settings: + custom_secret_manager: my_secret_manager.InMemorySecretManager + store_virtual_keys: true + prefix_for_stored_virtual_keys: "litellm/" + access_mode: "read_and_write" + 
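+# Note: `my_secret_manager.InMemorySecretManager` above is a dotted import path to the
+# class defined in my_secret_manager.py (same folder). The module must be importable when
+# the proxy starts, e.g. by launching it from this directory:
+#   litellm --config custom_secret_manager_config.yaml
+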
+model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: os.environ/OPENAI_API_KEY # Read from custom secret manager + + - model_name: claude-3-5-sonnet + litellm_params: + model: anthropic/claude-3-5-sonnet-20241022 + api_key: os.environ/ANTHROPIC_API_KEY # Read from custom secret manager + diff --git a/cookbook/litellm_proxy_server/secret_manager/my_secret_manager.py b/cookbook/litellm_proxy_server/secret_manager/my_secret_manager.py new file mode 100644 index 000000000000..b3c1bf608e2f --- /dev/null +++ b/cookbook/litellm_proxy_server/secret_manager/my_secret_manager.py @@ -0,0 +1,79 @@ +""" +Example custom secret manager for LiteLLM Proxy. + +This is a simple in-memory secret manager for testing purposes. +In production, replace this with your actual secret management system. +""" + +from typing import Optional, Union + +import httpx + +from litellm.integrations.custom_secret_manager import CustomSecretManager + + +class InMemorySecretManager(CustomSecretManager): + def __init__(self): + super().__init__(secret_manager_name="in_memory_secrets") + # Store your secrets in memory + print("INITIALIZING CUSTOM SECRET MANAGER IN MEMORY") + self.secrets = {} + print("CUSTOM SECRET MANAGER IN MEMORY INITIALIZED") + + async def async_read_secret( + self, + secret_name: str, + optional_params: Optional[dict] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + ) -> Optional[str]: + """Read secret asynchronously""" + print("READING SECRET ASYNCHRONOUSLY") + print("SECRET NAME: %s", secret_name) + print("SECRET: %s", self.secrets.get(secret_name)) + return self.secrets.get(secret_name) + + def sync_read_secret( + self, + secret_name: str, + optional_params: Optional[dict] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + ) -> Optional[str]: + """Read secret synchronously""" + from litellm._logging import verbose_proxy_logger + + verbose_proxy_logger.info(f"CUSTOM SECRET MANAGER: LOOKING FOR SECRET: {secret_name}") + value = self.secrets.get(secret_name) + verbose_proxy_logger.info(f"CUSTOM SECRET MANAGER: READ SECRET: {value}") + return value + + async def async_write_secret( + self, + secret_name: str, + secret_value: str, + description: Optional[str] = None, + optional_params: Optional[dict] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + tags: Optional[Union[dict, list]] = None, + ) -> dict: + """Write a secret to the in-memory store""" + self.secrets[secret_name] = secret_value + print("ALL SECRETS=%s", self.secrets) + return { + "status": "success", + "secret_name": secret_name, + "description": description, + } + + async def async_delete_secret( + self, + secret_name: str, + recovery_window_in_days: Optional[int] = 7, + optional_params: Optional[dict] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + ) -> dict: + """Delete a secret from the in-memory store""" + if secret_name in self.secrets: + del self.secrets[secret_name] + return {"status": "deleted", "secret_name": secret_name} + return {"status": "not_found", "secret_name": secret_name} + diff --git a/cookbook/misc/RELEASE_NOTES_GENERATION_INSTRUCTIONS.md b/cookbook/misc/RELEASE_NOTES_GENERATION_INSTRUCTIONS.md index d47de5b08711..ab2cf334459a 100644 --- a/cookbook/misc/RELEASE_NOTES_GENERATION_INSTRUCTIONS.md +++ b/cookbook/misc/RELEASE_NOTES_GENERATION_INSTRUCTIONS.md @@ -43,6 +43,14 @@ hide_table_of_contents: false ## Key Highlights [3-5 bullet points of major features - prioritize MCP OAuth 2.0, scheduled key rotations, and major model 
updates] +## New Providers and Endpoints + +### New Providers +[Table with Provider, Supported Endpoints, Description columns] + +### New LLM API Endpoints +[Optional table for new endpoint additions with Endpoint, Method, Description, Documentation columns] + ## New Models / Updated Models #### New Model Support [Model pricing table] @@ -53,9 +61,6 @@ hide_table_of_contents: false ### Bug Fixes [Provider-specific bug fixes organized by provider] -#### New Provider Support -[New provider integrations] - ## LLM API Endpoints #### Features [API-specific features organized by API type] @@ -70,16 +75,20 @@ hide_table_of_contents: false #### Bugs [Management-related bug fixes] -## Logging / Guardrail / Prompt Management Integrations -#### Features -[Organized by integration provider with proper doc links] +## AI Integrations -#### Guardrails +### Logging +[Logging integrations organized by provider with proper doc links, includes General subsection] + +### Guardrails [Guardrail-specific features and fixes] -#### Prompt Management +### Prompt Management [Prompt management integrations like BitBucket] +### Secret Managers +[Secret manager integrations - AWS, HashiCorp Vault, CyberArk, etc.] + ## Spend Tracking, Budgets and Rate Limiting [Cost tracking, service tier pricing, rate limiting improvements] @@ -149,26 +158,34 @@ hide_table_of_contents: false - Admin settings updates - Management routes and endpoints -**Logging / Guardrail / Prompt Management Integrations:** +**AI Integrations:** - **Structure:** - - `#### Features` - organized by integration provider with proper doc links - - `#### Guardrails` - guardrail-specific features and fixes - - `#### Prompt Management` - prompt management integrations - - `#### New Integration` - major new integrations -- **Integration Categories:** + - `### Logging` - organized by integration provider with proper doc links, includes **General** subsection + - `### Guardrails` - guardrail-specific features and fixes + - `### Prompt Management` - prompt management integrations + - `### Secret Managers` - secret manager integrations +- **Logging Categories:** - **[DataDog](../../docs/proxy/logging#datadog)** - group all DataDog-related changes - **[Langfuse](../../docs/proxy/logging#langfuse)** - Langfuse-specific features - **[Prometheus](../../docs/proxy/logging#prometheus)** - monitoring improvements - **[PostHog](../../docs/observability/posthog)** - observability integration - **[SQS](../../docs/proxy/logging#sqs)** - SQS logging features - **[Opik](../../docs/proxy/logging#opik)** - Opik integration improvements + - **[Arize Phoenix](../../docs/observability/arize_phoenix)** - Arize Phoenix integration + - **General** - miscellaneous logging features like callback controls, sensitive data masking - Other logging providers with proper doc links - **Guardrail Categories:** - - LakeraAI, Presidio, Noma, and other guardrail providers + - LakeraAI, Presidio, Noma, Grayswan, IBM Guardrails, and other guardrail providers - **Prompt Management:** - BitBucket, GitHub, and other prompt management integrations + - Prompt versioning, testing, and UI features +- **Secret Managers:** + - **[AWS Secrets Manager](../../docs/secret_managers)** - AWS secret manager features + - **[HashiCorp Vault](../../docs/secret_managers)** - Vault integrations + - **[CyberArk](../../docs/secret_managers)** - CyberArk integrations + - **General** - cross-secret-manager features - Use bullet points under each provider for multiple features -- Separate logging features from guardrails and 
prompt management clearly +- Separate logging, guardrails, prompt management, and secret managers clearly ### 4. Documentation Linking Strategy @@ -232,6 +249,9 @@ From git diff analysis, create tables like: - **Cost breakdown in logging** → Spend Tracking section - **MCP configuration/OAuth** → MCP Gateway (NOT General Proxy Improvements) - **All documentation PRs** → Documentation Updates section for visibility +- **Callback controls/logging features** → AI Integrations > Logging > General +- **Secret manager features** → AI Integrations > Secret Managers +- **Video generation tag-based routing** → LLM API Endpoints > Video Generation API ### 7. Writing Style Guidelines @@ -370,10 +390,107 @@ This release has a known issue... - **Virtual Keys** - Key rotation and management - **Models + Endpoints** - Provider and endpoint management -**Logging Section Expansion:** -- Rename to "Logging / Guardrail / Prompt Management Integrations" -- Add **Prompt Management** subsection for BitBucket, GitHub integrations -- Keep guardrails separate from logging features +**AI Integrations Section Expansion:** +- Renamed from "Logging / Guardrail / Prompt Management Integrations" to "AI Integrations" +- Structure with four main subsections: + - **Logging** - with **General** subsection for miscellaneous logging features + - **Guardrails** - separate from logging features + - **Prompt Management** - BitBucket, GitHub integrations, versioning features + - **Secret Managers** - AWS, HashiCorp Vault, CyberArk, etc. + +**New Providers and Endpoints Section:** +- Add section after Key Highlights and before New Models / Updated Models +- Include tables for: + - **New Providers** - Provider name, supported endpoints, description + - **New LLM API Endpoints** (optional) - Endpoint, method, description, documentation link +- Only include major new provider integrations, not minor provider updates +- **IMPORTANT**: When adding new providers, also update `provider_endpoints_support.json` in the repository root (see Section 13) + +### 12. Section Header Counts + +**Always include counts in section headers for:** +- **New Providers** - Add count in parentheses: `### New Providers (X new providers)` +- **New LLM API Endpoints** - Add count in parentheses: `### New LLM API Endpoints (X new endpoints)` +- **New Model Support** - Add count in parentheses: `#### New Model Support (X new models)` + +**Format:** +```markdown +### New Providers (4 new providers) + +| Provider | Supported LiteLLM Endpoints | Description | +| -------- | --------------------------- | ----------- | +... + +### New LLM API Endpoints (2 new endpoints) + +| Endpoint | Method | Description | Documentation | +| -------- | ------ | ----------- | ------------- | +... + +#### New Model Support (32 new models) + +| Provider | Model | Context Window | Input ($/1M tokens) | Output ($/1M tokens) | Features | +| -------- | ----- | -------------- | ------------------- | -------------------- | -------- | +... +``` + +**Counting Rules:** +- Count each row in the table (excluding the header row) +- For models, count each model entry in the pricing table +- For providers, count each new provider added +- For endpoints, count each new API endpoint added + +### 13. Update provider_endpoints_support.json + +**When adding new providers or endpoints, you MUST also update `provider_endpoints_support.json` in the repository root.** + +This file tracks which endpoints are supported by each LiteLLM provider and is used to generate documentation. + +**Required Steps:** +1. 
For each new provider added to the release notes, add a corresponding entry to `provider_endpoints_support.json` +2. For each new endpoint type added, update the schema comment and add the endpoint to relevant providers + +**Provider Entry Format:** +```json +"provider_slug": { + "display_name": "Provider Name (`provider_slug`)", + "url": "https://docs.litellm.ai/docs/providers/provider_slug", + "endpoints": { + "chat_completions": true, + "messages": true, + "responses": true, + "embeddings": false, + "image_generations": false, + "audio_transcriptions": false, + "audio_speech": false, + "moderations": false, + "batches": false, + "rerank": false, + "a2a": true + } +} +``` + +**Available Endpoint Types:** +- `chat_completions` - `/chat/completions` endpoint +- `messages` - `/messages` endpoint (Anthropic format) +- `responses` - `/responses` endpoint (OpenAI/Anthropic unified) +- `embeddings` - `/embeddings` endpoint +- `image_generations` - `/image/generations` endpoint +- `audio_transcriptions` - `/audio/transcriptions` endpoint +- `audio_speech` - `/audio/speech` endpoint +- `moderations` - `/moderations` endpoint +- `batches` - `/batches` endpoint +- `rerank` - `/rerank` endpoint +- `ocr` - `/ocr` endpoint +- `search` - `/search` endpoint +- `vector_stores` - `/vector_stores` endpoint +- `a2a` - `/a2a/{agent}/message/send` endpoint (A2A Protocol) + +**Checklist:** +- [ ] All new providers from release notes are added to `provider_endpoints_support.json` +- [ ] Endpoint support flags accurately reflect provider capabilities +- [ ] Documentation URL points to correct provider docs page ## Example Command Workflow diff --git a/cookbook/mock_guardrail_server/mock_bedrock_guardrail_server.py b/cookbook/mock_guardrail_server/mock_bedrock_guardrail_server.py new file mode 100644 index 000000000000..7bf9cc32484a --- /dev/null +++ b/cookbook/mock_guardrail_server/mock_bedrock_guardrail_server.py @@ -0,0 +1,540 @@ +#!/usr/bin/env python3 +""" +Mock Bedrock Guardrail API Server + +This is a FastAPI server that mimics the AWS Bedrock Guardrail API for testing purposes. +It follows the same API spec as the real Bedrock guardrail endpoint. 
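+
+Default GuardrailConfig used by this mock (see GuardrailConfig below):
+    - blocked words: "offensive", "inappropriate", "badword"
+    - blocked topics: "violence", "illegal"
+    - PII patterns: EMAIL, PHONE, SSN, CREDIT_CARD (anonymized rather than blocked by default)
+    - bearer token for the /config endpoints: "mock-bedrock-token-12345"
+      (override with the MOCK_BEDROCK_TOKEN environment variable)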
+ +Usage: + python mock_bedrock_guardrail_server.py + +The server will start on http://localhost:8080 +""" + +import os +import re +from typing import Any, Dict, List, Literal, Optional + +from fastapi import Depends, FastAPI, Header, HTTPException, status +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +# ============================================================================ +# Request/Response Models (matching Bedrock API spec) +# ============================================================================ + + +class BedrockTextContent(BaseModel): + text: str + + +class BedrockContentItem(BaseModel): + text: BedrockTextContent + + +class BedrockRequest(BaseModel): + source: Literal["INPUT", "OUTPUT"] + content: List[BedrockContentItem] = Field(default_factory=list) + + +class BedrockGuardrailOutput(BaseModel): + text: Optional[str] = None + + +class TopicPolicyItem(BaseModel): + name: str + type: str + action: Literal["BLOCKED", "NONE"] + + +class TopicPolicy(BaseModel): + topics: List[TopicPolicyItem] = Field(default_factory=list) + + +class ContentFilterItem(BaseModel): + type: str + confidence: str + action: Literal["BLOCKED", "NONE"] + + +class ContentPolicy(BaseModel): + filters: List[ContentFilterItem] = Field(default_factory=list) + + +class CustomWord(BaseModel): + match: str + action: Literal["BLOCKED", "NONE"] + + +class WordPolicy(BaseModel): + customWords: List[CustomWord] = Field(default_factory=list) + managedWordLists: List[Dict[str, Any]] = Field(default_factory=list) + + +class PiiEntity(BaseModel): + type: str + match: str + action: Literal["BLOCKED", "ANONYMIZED", "NONE"] + + +class RegexMatch(BaseModel): + name: str + match: str + regex: str + action: Literal["BLOCKED", "ANONYMIZED", "NONE"] + + +class SensitiveInformationPolicy(BaseModel): + piiEntities: List[PiiEntity] = Field(default_factory=list) + regexes: List[RegexMatch] = Field(default_factory=list) + + +class ContextualGroundingFilter(BaseModel): + type: str + threshold: float + score: float + action: Literal["BLOCKED", "NONE"] + + +class ContextualGroundingPolicy(BaseModel): + filters: List[ContextualGroundingFilter] = Field(default_factory=list) + + +class Assessment(BaseModel): + topicPolicy: Optional[TopicPolicy] = None + contentPolicy: Optional[ContentPolicy] = None + wordPolicy: Optional[WordPolicy] = None + sensitiveInformationPolicy: Optional[SensitiveInformationPolicy] = None + contextualGroundingPolicy: Optional[ContextualGroundingPolicy] = None + + +class BedrockGuardrailResponse(BaseModel): + usage: Dict[str, int] = Field( + default_factory=lambda: {"topicPolicyUnits": 1, "contentPolicyUnits": 1} + ) + action: Literal["NONE", "GUARDRAIL_INTERVENED"] = "NONE" + outputs: List[BedrockGuardrailOutput] = Field(default_factory=list) + assessments: List[Assessment] = Field(default_factory=list) + + +# ============================================================================ +# Mock Guardrail Configuration +# ============================================================================ + + +class GuardrailConfig(BaseModel): + """Configuration for mock guardrail behavior""" + + blocked_words: List[str] = Field( + default_factory=lambda: ["offensive", "inappropriate", "badword"] + ) + blocked_topics: List[str] = Field(default_factory=lambda: ["violence", "illegal"]) + pii_patterns: Dict[str, str] = Field( + default_factory=lambda: { + "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", + "PHONE": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b", + "SSN": 
r"\b\d{3}-\d{2}-\d{4}\b", + "CREDIT_CARD": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b", + } + ) + anonymize_pii: bool = True # If True, ANONYMIZE PII; if False, BLOCK it + bearer_token: str = "mock-bedrock-token-12345" + + +# Global config +GUARDRAIL_CONFIG = GuardrailConfig() + +# ============================================================================ +# FastAPI App Setup +# ============================================================================ + +app = FastAPI( + title="Mock Bedrock Guardrail API", + description="Mock server mimicking AWS Bedrock Guardrail API", + version="1.0.0", +) + + +# ============================================================================ +# Authentication +# ============================================================================ + + +async def verify_bearer_token(authorization: Optional[str] = Header(None)) -> str: + """ + Verify the Bearer token from the Authorization header. + + Args: + authorization: The Authorization header value + + Returns: + The token if valid + + Raises: + HTTPException: If token is missing or invalid + """ + if authorization is None: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Missing Authorization header", + headers={"WWW-Authenticate": "Bearer"}, + ) + + # Check if it's a Bearer token + parts = authorization.split() + print(f"parts: {parts}") + if len(parts) != 2 or parts[0].lower() != "bearer": + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid Authorization header format. Expected: Bearer ", + headers={"WWW-Authenticate": "Bearer"}, + ) + + token = parts[1] + + # Verify token + if token != GUARDRAIL_CONFIG.bearer_token: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Invalid bearer token", + ) + + return token + + +# ============================================================================ +# Guardrail Logic +# ============================================================================ + + +def check_blocked_words(text: str) -> Optional[WordPolicy]: + """Check if text contains blocked words""" + found_words = [] + text_lower = text.lower() + + for word in GUARDRAIL_CONFIG.blocked_words: + if word.lower() in text_lower: + found_words.append(CustomWord(match=word, action="BLOCKED")) + + if found_words: + return WordPolicy(customWords=found_words) + return None + + +def check_blocked_topics(text: str) -> Optional[TopicPolicy]: + """Check if text contains blocked topics""" + found_topics = [] + text_lower = text.lower() + + for topic in GUARDRAIL_CONFIG.blocked_topics: + if topic.lower() in text_lower: + found_topics.append( + TopicPolicyItem(name=topic, type=topic.upper(), action="BLOCKED") + ) + + if found_topics: + return TopicPolicy(topics=found_topics) + return None + + +def check_pii(text: str) -> tuple[Optional[SensitiveInformationPolicy], str]: + """ + Check for PII in text and return policy + anonymized text + + Returns: + Tuple of (SensitiveInformationPolicy or None, anonymized_text) + """ + pii_entities = [] + anonymized_text = text + action = "ANONYMIZED" if GUARDRAIL_CONFIG.anonymize_pii else "BLOCKED" + + for pii_type, pattern in GUARDRAIL_CONFIG.pii_patterns.items(): + try: + # Compile the regex pattern with a timeout to prevent ReDoS attacks + compiled_pattern = re.compile(pattern) + matches = compiled_pattern.finditer(text) + for match in matches: + matched_text = match.group() + pii_entities.append( + PiiEntity(type=pii_type, match=matched_text, action=action) + ) + + # Anonymize the text if configured 
+ if GUARDRAIL_CONFIG.anonymize_pii: + anonymized_text = anonymized_text.replace( + matched_text, f"[{pii_type}_REDACTED]" + ) + except re.error: + # Invalid regex pattern - skip it and log a warning + print(f"Warning: Invalid regex pattern for PII type {pii_type}: {pattern}") + continue + + if pii_entities: + return SensitiveInformationPolicy(piiEntities=pii_entities), anonymized_text + + return None, text + + +def process_guardrail_request( + request: BedrockRequest, +) -> tuple[BedrockGuardrailResponse, List[str]]: + """ + Process a guardrail request and return the response. + + Returns: + Tuple of (response, list of output texts) + """ + all_text_content = [] + output_texts = [] + + # Extract all text from content items + for content_item in request.content: + if content_item.text and content_item.text.text: + all_text_content.append(content_item.text.text) + + # Combine all text for analysis + combined_text = " ".join(all_text_content) + + # Initialize response + response = BedrockGuardrailResponse() + assessment = Assessment() + has_intervention = False + + # Check for blocked words + word_policy = check_blocked_words(combined_text) + if word_policy: + assessment.wordPolicy = word_policy + has_intervention = True + + # Check for blocked topics + topic_policy = check_blocked_topics(combined_text) + if topic_policy: + assessment.topicPolicy = topic_policy + has_intervention = True + + # Check for PII + for text in all_text_content: + pii_policy, anonymized_text = check_pii(text) + if pii_policy: + assessment.sensitiveInformationPolicy = pii_policy + if GUARDRAIL_CONFIG.anonymize_pii: + # If anonymizing, we don't block, we modify the text + output_texts.append(anonymized_text) + has_intervention = True + else: + # If not anonymizing PII, we block it + output_texts.append(text) + has_intervention = True + else: + output_texts.append(text) + + # Build response + if has_intervention: + response.action = "GUARDRAIL_INTERVENED" + # Only add assessment if there were interventions + response.assessments = [assessment] + + # Add outputs (modified or original text) + response.outputs = [BedrockGuardrailOutput(text=txt) for txt in output_texts] + + return response, output_texts + + +# ============================================================================ +# API Endpoints +# ============================================================================ + + +@app.get("/") +async def root(): + """Health check endpoint""" + return { + "service": "Mock Bedrock Guardrail API", + "status": "running", + "endpoint_format": "/guardrail/{guardrailIdentifier}/version/{guardrailVersion}/apply", + } + + +@app.get("/health") +async def health(): + """Health check endpoint""" + return {"status": "healthy"} + + +""" +LiteLLM exposes a basic guardrail API with the text extracted from the request and sent to the guardrail API, as well as the received request body for any further processing. + +This works across all LiteLLM endpoints (completion, anthropic /v1/messages, responses api, image generation, embedding, etc.) + +This makes it easy to support your own guardrail API without having to make a PR to LiteLLM. + +LiteLLM supports passing any provider specific params from LiteLLM config.yaml to the guardrail API. 
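+
+In this mock, the guardrail API is exposed at POST /beta/litellm_basic_guardrail_api and
+responds with one of three actions: NONE (continue unchanged), BLOCKED (LiteLLM raises an
+error, with blocked_reason set), or GUARDRAIL_INTERVENED (continue with the modified texts
+returned by the guardrail).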
+ +Example: + +```yaml +guardrails: + - guardrail_name: "bedrock-content-guard" + litellm_params: + guardrail: generic_guardrail_api + mode: "pre_call" + api_key: os.environ/GUARDRAIL_API_KEY + api_base: os.environ/GUARDRAIL_API_BASE + additional_provider_specific_params: + api_version: os.environ/GUARDRAIL_API_VERSION # additional provider specific params +``` + +This is a beta API. Please help us improve it. +""" + + +class LitellmBasicGuardrailRequest(BaseModel): + texts: List[str] + images: Optional[List[str]] = None + tools: Optional[List[dict]] = None + tool_calls: Optional[List[dict]] = None + request_data: Dict[str, Any] = Field(default_factory=dict) + additional_provider_specific_params: Dict[str, Any] = Field(default_factory=dict) + input_type: Literal["request", "response"] + litellm_call_id: Optional[str] = None + litellm_trace_id: Optional[str] = None + structured_messages: Optional[List[Dict[str, Any]]] = None + + +class LitellmBasicGuardrailResponse(BaseModel): + action: Literal[ + "BLOCKED", "NONE", "GUARDRAIL_INTERVENED" + ] # BLOCKED = litellm will raise an error, NONE = litellm will continue, GUARDRAIL_INTERVENED = litellm will continue, but the text was modified by the guardrail + blocked_reason: Optional[str] = None # only if action is BLOCKED, otherwise None + texts: Optional[List[str]] = None + images: Optional[List[str]] = None + + +@app.post( + "/beta/litellm_basic_guardrail_api", + response_model=LitellmBasicGuardrailResponse, +) +async def beta_litellm_basic_guardrail_api( + request: LitellmBasicGuardrailRequest, +) -> LitellmBasicGuardrailResponse: + """ + Apply guardrail to input or output content. + + This endpoint mimics the AWS Bedrock ApplyGuardrail API. + + Args: + request: The guardrail request containing content to analyze + token: Bearer token (verified by dependency) + + Returns: + LitellmBasicGuardrailResponse with analysis results + """ + print(f"request: {request}") + if any("ishaan" in text.lower() for text in request.texts): + return LitellmBasicGuardrailResponse( + action="BLOCKED", blocked_reason="Ishaan is not allowed" + ) + elif any("pii_value" in text for text in request.texts): + return LitellmBasicGuardrailResponse( + action="GUARDRAIL_INTERVENED", + texts=[ + text.replace("pii_value", "pii_value_redacted") + for text in request.texts + ], + ) + return LitellmBasicGuardrailResponse(action="NONE") + + +@app.post("/config/update") +async def update_config( + config: GuardrailConfig, token: str = Depends(verify_bearer_token) +): + """ + Update the guardrail configuration. + + This is a testing endpoint to modify the mock guardrail behavior. + + Args: + config: New guardrail configuration + token: Bearer token (verified by dependency) + + Returns: + Updated configuration + """ + global GUARDRAIL_CONFIG + GUARDRAIL_CONFIG = config + return {"status": "updated", "config": GUARDRAIL_CONFIG} + + +@app.get("/config") +async def get_config(token: str = Depends(verify_bearer_token)): + """ + Get the current guardrail configuration. 
+ + Args: + token: Bearer token (verified by dependency) + + Returns: + Current configuration + """ + return GUARDRAIL_CONFIG + + +# ============================================================================ +# Error Handlers +# ============================================================================ + + +@app.exception_handler(HTTPException) +async def http_exception_handler(request, exc: HTTPException): + """Custom error handler for HTTP exceptions""" + return JSONResponse( + status_code=exc.status_code, + content={"error": exc.detail}, + headers=exc.headers, + ) + + +# ============================================================================ +# Main +# ============================================================================ + +if __name__ == "__main__": + import uvicorn + + # Get configuration from environment + host = os.getenv("MOCK_BEDROCK_HOST", "0.0.0.0") + port = int(os.getenv("MOCK_BEDROCK_PORT", "8080")) + bearer_token = os.getenv("MOCK_BEDROCK_TOKEN", "mock-bedrock-token-12345") + + # Update config with environment token + GUARDRAIL_CONFIG.bearer_token = bearer_token + + print("=" * 80) + print("Mock Bedrock Guardrail API Server") + print("=" * 80) + print(f"Server starting on: http://{host}:{port}") + print(f"Bearer Token: {bearer_token}") + print(f"Endpoint: POST /guardrail/{{id}}/version/{{version}}/apply") + print("=" * 80) + print("\nExample curl command:") + print( + f""" +curl -X POST "http://{host}:{port}/guardrail/test-guardrail/version/1/apply" \\ + -H "Authorization: Bearer {bearer_token}" \\ + -H "Content-Type: application/json" \\ + -d '{{ + "source": "INPUT", + "content": [ + {{ + "text": {{ + "text": "Hello, my email is test@example.com" + }} + }} + ] + }}' + """ + ) + print("=" * 80) + + uvicorn.run(app, host=host, port=port) diff --git a/deploy/Dockerfile.ghcr_base b/deploy/Dockerfile.ghcr_base index dbfe0a5a2069..69b08a5893cc 100644 --- a/deploy/Dockerfile.ghcr_base +++ b/deploy/Dockerfile.ghcr_base @@ -8,7 +8,8 @@ WORKDIR /app COPY config.yaml . # Make sure your docker/entrypoint.sh is executable -RUN chmod +x docker/entrypoint.sh +# Convert Windows line endings to Unix +RUN sed -i 's/\r$//' docker/entrypoint.sh && chmod +x docker/entrypoint.sh # Expose the necessary port EXPOSE 4000/tcp diff --git a/deploy/charts/litellm-helm/Chart.yaml b/deploy/charts/litellm-helm/Chart.yaml index aa81e4efeccd..8a08f0b4e29e 100644 --- a/deploy/charts/litellm-helm/Chart.yaml +++ b/deploy/charts/litellm-helm/Chart.yaml @@ -18,13 +18,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.4.7 +version: 1.1.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: v1.50.2 +appVersion: v1.80.12 dependencies: - name: "postgresql" @@ -33,5 +33,5 @@ dependencies: condition: db.deployStandalone - name: redis version: ">=18.0.0" - repository: oci://registry-1.docker.io/bitnamicharts + repository: oci://registry-1.docker.io/bitnamicharts condition: redis.enabled diff --git a/deploy/charts/litellm-helm/README.md b/deploy/charts/litellm-helm/README.md index 352c3e9ddff7..2fa856843f32 100644 --- a/deploy/charts/litellm-helm/README.md +++ b/deploy/charts/litellm-helm/README.md @@ -10,46 +10,48 @@ - Helm 3.8.0+ If `db.deployStandalone` is used: + - PV provisioner support in the underlying infrastructure If `db.useStackgresOperator` is used (not yet implemented): -- The Stackgres Operator must already be installed in the Kubernetes Cluster. This chart will **not** install the operator if it is missing. + +- The Stackgres Operator must already be installed in the Kubernetes Cluster. This chart will **not** install the operator if it is missing. ## Parameters ### LiteLLM Proxy Deployment Settings -| Name | Description | Value | -| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` | -| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A | -| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A | -| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key in the `sk-...` format is generated. | N/A | -| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | -| `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | -| `image.repository` | LiteLLM Proxy image repository | `ghcr.io/berriai/litellm` | -| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` | -| `image.tag` | Overrides the image tag whose default the latest version of LiteLLM at the time this chart was published. | `""` | -| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` | -| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` | -| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` | -| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. | `4000` | -| `service.loadBalancerClass` | Optional LoadBalancer implementation class (only used when `service.type` is `LoadBalancer`) | `""` | -| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A | -| `proxyConfigMap.create` | When `true`, render a ConfigMap from `.Values.proxy_config` and mount it. 
| `true` | -| `proxyConfigMap.name` | When `create=false`, name of the existing ConfigMap to mount. | `""` | -| `proxyConfigMap.key` | Key in the ConfigMap that contains the proxy config file. | `"config.yaml"` | -| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. Rendered into the ConfigMap’s `config.yaml` only when `proxyConfigMap.create=true`. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | `N/A` | -| `extraContainers[]` | An array of additional containers to be deployed as sidecars alongside the LiteLLM Proxy. -| `pdb.enabled` | Enable a PodDisruptionBudget for the LiteLLM proxy Deployment | `false` | -| `pdb.minAvailable` | Minimum number/percentage of pods that must be available during **voluntary** disruptions (choose **one** of minAvailable/maxUnavailable) | `null` | -| `pdb.maxUnavailable` | Maximum number/percentage of pods that can be unavailable during **voluntary** disruptions (choose **one** of minAvailable/maxUnavailable) | `null` | -| `pdb.annotations` | Extra metadata annotations to add to the PDB | `{}` | -| `pdb.labels` | Extra metadata labels to add to the PDB | `{}` | +| Name | Description | Value | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------- | +| `replicaCount` | The number of LiteLLM Proxy pods to be deployed | `1` | +| `masterkeySecretName` | The name of the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use the generated secret name. | N/A | +| `masterkeySecretKey` | The key within the Kubernetes Secret that contains the Master API Key for LiteLLM. If not specified, use `masterkey` as the key. | N/A | +| `masterkey` | The Master API Key for LiteLLM. If not specified, a random key in the `sk-...` format is generated. | N/A | +| `environmentSecrets` | An optional array of Secret object names. The keys and values in these secrets will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | +| `environmentConfigMaps` | An optional array of ConfigMap object names. The keys and values in these configmaps will be presented to the LiteLLM proxy pod as environment variables. See below for an example Secret object. | `[]` | +| `image.repository` | LiteLLM Proxy image repository | `docker.litellm.ai/berriai/litellm` | +| `image.pullPolicy` | LiteLLM Proxy image pull policy | `IfNotPresent` | +| `image.tag` | Overrides the image tag whose default the latest version of LiteLLM at the time this chart was published. | `""` | +| `imagePullSecrets` | Registry credentials for the LiteLLM and initContainer images. | `[]` | +| `serviceAccount.create` | Whether or not to create a Kubernetes Service Account for this deployment. The default is `false` because LiteLLM has no need to access the Kubernetes API. | `false` | +| `service.type` | Kubernetes Service type (e.g. `LoadBalancer`, `ClusterIP`, etc.) | `ClusterIP` | +| `service.port` | TCP port that the Kubernetes Service will listen on. Also the TCP port within the Pod that the proxy will listen on. 
| `4000` | +| `service.loadBalancerClass` | Optional LoadBalancer implementation class (only used when `service.type` is `LoadBalancer`) | `""` | +| `ingress.labels` | Additional labels for the Ingress resource | `{}` | +| `ingress.*` | See [values.yaml](./values.yaml) for example settings | N/A | +| `proxyConfigMap.create` | When `true`, render a ConfigMap from `.Values.proxy_config` and mount it. | `true` | +| `proxyConfigMap.name` | When `create=false`, name of the existing ConfigMap to mount. | `""` | +| `proxyConfigMap.key` | Key in the ConfigMap that contains the proxy config file. | `"config.yaml"` | +| `proxy_config.*` | See [values.yaml](./values.yaml) for default settings. Rendered into the ConfigMap’s `config.yaml` only when `proxyConfigMap.create=true`. See [example_config_yaml](../../../litellm/proxy/example_config_yaml/) for configuration examples. | `N/A` | +| `extraContainers[]` | An array of additional containers to be deployed as sidecars alongside the LiteLLM Proxy. | +| `pdb.enabled` | Enable a PodDisruptionBudget for the LiteLLM proxy Deployment | `false` | +| `pdb.minAvailable` | Minimum number/percentage of pods that must be available during **voluntary** disruptions (choose **one** of minAvailable/maxUnavailable) | `null` | +| `pdb.maxUnavailable` | Maximum number/percentage of pods that can be unavailable during **voluntary** disruptions (choose **one** of minAvailable/maxUnavailable) | `null` | +| `pdb.annotations` | Extra metadata annotations to add to the PDB | `{}` | +| `pdb.labels` | Extra metadata labels to add to the PDB | `{}` | #### Example `proxy_config` ConfigMap from values (default): - ``` proxyConfigMap: create: true @@ -67,7 +69,6 @@ proxy_config: #### Example using existing `proxyConfigMap` instead of creating it: - ``` proxyConfigMap: create: false @@ -77,8 +78,7 @@ proxyConfigMap: # proxy_config is ignored in this mode ``` -#### Example `environmentSecrets` Secret - +#### Example `environmentSecrets` Secret ``` apiVersion: v1 @@ -91,21 +91,23 @@ type: Opaque ``` ### Database Settings -| Name | Description | Value | -| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| `db.useExisting` | Use an existing Postgres database. A Kubernetes Secret object must exist that contains credentials for connecting to the database. An example secret object definition is provided below. | `false` | -| `db.endpoint` | If `db.useExisting` is `true`, this is the IP, Hostname or Service Name of the Postgres server to connect to. | `localhost` | -| `db.database` | If `db.useExisting` is `true`, the name of the existing database to connect to. | `litellm` | -| `db.url` | If `db.useExisting` is `true`, the connection url of the existing database to connect to can be overwritten with this value. | `postgresql://$(DATABASE_USERNAME):$(DATABASE_PASSWORD)@$(DATABASE_HOST)/$(DATABASE_NAME)` | -| `db.secret.name` | If `db.useExisting` is `true`, the name of the Kubernetes Secret that contains credentials. | `postgres` | -| `db.secret.usernameKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the username for authenticating with the Postgres instance. | `username` | -| `db.secret.passwordKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the password associates with the above user. 
| `password` | -| `db.useStackgresOperator` | Not yet implemented. | `false` | -| `db.deployStandalone` | Deploy a standalone, single instance deployment of Postgres, using the Bitnami postgresql chart. This is useful for getting started but doesn't provide HA or (by default) data backups. | `true` | -| `postgresql.*` | If `db.deployStandalone` is `true`, configuration passed to the Bitnami postgresql chart. See the [Bitnami Documentation](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) for full configuration details. See [values.yaml](./values.yaml) for the default configuration. | See [values.yaml](./values.yaml) | -| `postgresql.auth.*` | If `db.deployStandalone` is `true`, care should be taken to ensure the default `password` and `postgres-password` values are **NOT** used. | `NoTaGrEaTpAsSwOrD` | + +| Name | Description | Value | +| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | +| `db.useExisting` | Use an existing Postgres database. A Kubernetes Secret object must exist that contains credentials for connecting to the database. An example secret object definition is provided below. | `false` | +| `db.endpoint` | If `db.useExisting` is `true`, this is the IP, Hostname or Service Name of the Postgres server to connect to. | `localhost` | +| `db.database` | If `db.useExisting` is `true`, the name of the existing database to connect to. | `litellm` | +| `db.url` | If `db.useExisting` is `true`, the connection url of the existing database to connect to can be overwritten with this value. | `postgresql://$(DATABASE_USERNAME):$(DATABASE_PASSWORD)@$(DATABASE_HOST)/$(DATABASE_NAME)` | +| `db.secret.name` | If `db.useExisting` is `true`, the name of the Kubernetes Secret that contains credentials. | `postgres` | +| `db.secret.usernameKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the username for authenticating with the Postgres instance. | `username` | +| `db.secret.passwordKey` | If `db.useExisting` is `true`, the name of the key within the Kubernetes Secret that holds the password associates with the above user. | `password` | +| `db.useStackgresOperator` | Not yet implemented. | `false` | +| `db.deployStandalone` | Deploy a standalone, single instance deployment of Postgres, using the Bitnami postgresql chart. This is useful for getting started but doesn't provide HA or (by default) data backups. | `true` | +| `postgresql.*` | If `db.deployStandalone` is `true`, configuration passed to the Bitnami postgresql chart. See the [Bitnami Documentation](https://github.com/bitnami/charts/tree/main/bitnami/postgresql) for full configuration details. See [values.yaml](./values.yaml) for the default configuration. | See [values.yaml](./values.yaml) | +| `postgresql.auth.*` | If `db.deployStandalone` is `true`, care should be taken to ensure the default `password` and `postgres-password` values are **NOT** used. 
| `NoTaGrEaTpAsSwOrD` | #### Example Postgres `db.useExisting` Secret + ```yaml apiVersion: v1 kind: Secret @@ -143,7 +145,7 @@ metadata: name: litellm-env-secret type: Opaque data: - SOME_PASSWORD: cDZbUGVXeU5e0ZW # base64 encoded + SOME_PASSWORD: cDZbUGVXeU5e0ZW # base64 encoded ANOTHER_PASSWORD: AAZbUGVXeU5e0ZB # base64 encoded ``` @@ -153,23 +155,23 @@ Source: [GitHub Gist from troyharvey](https://gist.github.com/troyharvey/4506472 The migration job supports both ArgoCD and Helm hooks to ensure database migrations run at the appropriate time during deployments. -| Name | Description | Value | -| ---------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | -| `migrationJob.enabled` | Enable or disable the schema migration Job | `true` | -| `migrationJob.backoffLimit` | Backoff limit for Job restarts | `4` | -| `migrationJob.ttlSecondsAfterFinished` | TTL for completed migration jobs | `120` | -| `migrationJob.annotations` | Additional annotations for the migration job pod | `{}` | -| `migrationJob.extraContainers` | Additional containers to run alongside the migration job | `[]` | -| `migrationJob.hooks.argocd.enabled` | Enable ArgoCD hooks for the migration job (uses PreSync hook with BeforeHookCreation delete policy) | `true` | -| `migrationJob.hooks.helm.enabled` | Enable Helm hooks for the migration job (uses pre-install,pre-upgrade hooks with before-hook-creation delete policy) | `false` | -| `migrationJob.hooks.helm.weight` | Helm hook execution order (lower weights executed first). Optional - defaults to "1" if not specified. | N/A | - +| Name | Description | Value | +| -------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | ------- | +| `migrationJob.enabled` | Enable or disable the schema migration Job | `true` | +| `migrationJob.backoffLimit` | Backoff limit for Job restarts | `4` | +| `migrationJob.ttlSecondsAfterFinished` | TTL for completed migration jobs | `120` | +| `migrationJob.annotations` | Additional annotations for the migration job pod | `{}` | +| `migrationJob.extraContainers` | Additional containers to run alongside the migration job | `[]` | +| `migrationJob.hooks.argocd.enabled` | Enable ArgoCD hooks for the migration job (uses PreSync hook with BeforeHookCreation delete policy) | `true` | +| `migrationJob.hooks.helm.enabled` | Enable Helm hooks for the migration job (uses pre-install,pre-upgrade hooks with before-hook-creation delete policy) | `false` | +| `migrationJob.hooks.helm.weight` | Helm hook execution order (lower weights executed first). Optional - defaults to "1" if not specified. | N/A | ## Accessing the Admin UI + When browsing to the URL published per the settings in `ingress.*`, you will -be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal +be prompted for **Admin Configuration**. The **Proxy Endpoint** is the internal (from the `litellm` pod's perspective) URL published by the `-litellm` -Kubernetes Service. If the deployment uses the default settings for this +Kubernetes Service. If the deployment uses the default settings for this service, the **Proxy Endpoint** should be set to `http://-litellm:4000`. 
The **Proxy Key** is the value specified for `masterkey` or, if a `masterkey` @@ -181,7 +183,8 @@ kubectl -n litellm get secret -litellm-masterkey -o jsonpath="{.data.ma ``` ## Admin UI Limitations -At the time of writing, the Admin UI is unable to add models. This is because + +At the time of writing, the Admin UI is unable to add models. This is because it would need to update the `config.yaml` file which is a exposed ConfigMap, and -therefore, read-only. This is a limitation of this helm chart, not the Admin UI +therefore, read-only. This is a limitation of this helm chart, not the Admin UI itself. diff --git a/deploy/charts/litellm-helm/templates/deployment.yaml b/deploy/charts/litellm-helm/templates/deployment.yaml index 6a5a6e875771..4ac5582d0609 100644 --- a/deploy/charts/litellm-helm/templates/deployment.yaml +++ b/deploy/charts/litellm-helm/templates/deployment.yaml @@ -6,8 +6,11 @@ metadata: name: {{ include "litellm.fullname" . }} labels: {{- include "litellm.labels" . | nindent 4 }} + {{- if .Values.deploymentLabels }} + {{- toYaml .Values.deploymentLabels | nindent 4 }} + {{- end }} spec: - {{- if not .Values.autoscaling.enabled }} + {{- if and (not .Values.keda.enabled) (not .Values.autoscaling.enabled) }} replicas: {{ .Values.replicaCount }} {{- end }} selector: @@ -35,6 +38,10 @@ spec: serviceAccountName: {{ include "litellm.serviceAccountName" . }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- with .Values.extraInitContainers }} + initContainers: + {{- toYaml . | nindent 8 }} + {{- end }} containers: - name: {{ include "litellm.name" . }} securityContext: @@ -126,9 +133,20 @@ spec: - configMapRef: name: {{ . }} {{- end }} + {{- if .Values.command }} + command: {{ toYaml .Values.command | nindent 12 }} + {{- end }} + {{- if .Values.args }} + args: {{ toYaml .Values.args | nindent 12 }} + {{- else }} args: - --config - /etc/litellm/config.yaml + {{ if .Values.numWorkers }} + - --num_workers + - {{ .Values.numWorkers | quote }} + {{- end }} + {{- end }} ports: - name: http containerPort: {{ .Values.service.port }} @@ -156,7 +174,8 @@ spec: {{- toYaml .Values.resources | nindent 12 }} volumeMounts: - name: litellm-config - mountPath: /etc/litellm/ + mountPath: /etc/litellm/config.yaml + subPath: config.yaml {{ if .Values.securityContext.readOnlyRootFilesystem }} - name: tmp mountPath: /tmp @@ -168,6 +187,10 @@ spec: {{- with .Values.volumeMounts }} {{- toYaml . | nindent 12 }} {{- end }} + {{- with .Values.lifecycle }} + lifecycle: + {{- toYaml . | nindent 12 }} + {{- end }} {{- with .Values.extraContainers }} {{- toYaml . | nindent 8 }} {{- end }} @@ -208,3 +231,8 @@ spec: tolerations: {{- toYaml . | nindent 8 }} {{- end }} + terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds | default 90 }} + {{- if .Values.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml .Values.topologySpreadConstraints | nindent 8 }} + {{- end }} \ No newline at end of file diff --git a/deploy/charts/litellm-helm/templates/extra-resources.yaml b/deploy/charts/litellm-helm/templates/extra-resources.yaml new file mode 100644 index 000000000000..33190d96fc03 --- /dev/null +++ b/deploy/charts/litellm-helm/templates/extra-resources.yaml @@ -0,0 +1,6 @@ +{{- if .Values.extraResources }} +{{- range .Values.extraResources }} +--- +{{ toYaml . 
| nindent 0 }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/deploy/charts/litellm-helm/templates/ingress.yaml b/deploy/charts/litellm-helm/templates/ingress.yaml index 09e8d715ab81..ea9ffcbb54c2 100644 --- a/deploy/charts/litellm-helm/templates/ingress.yaml +++ b/deploy/charts/litellm-helm/templates/ingress.yaml @@ -18,6 +18,9 @@ metadata: name: {{ $fullName }} labels: {{- include "litellm.labels" . | nindent 4 }} + {{- with .Values.ingress.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} {{- with .Values.ingress.annotations }} annotations: {{- toYaml . | nindent 4 }} diff --git a/deploy/charts/litellm-helm/templates/keda.yaml b/deploy/charts/litellm-helm/templates/keda.yaml new file mode 100644 index 000000000000..fe5190fffc60 --- /dev/null +++ b/deploy/charts/litellm-helm/templates/keda.yaml @@ -0,0 +1,37 @@ +{{- if and .Values.keda.enabled (not .Values.autoscaling.enabled) }} +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: {{ include "litellm.fullname" . }} + labels: + {{- include "litellm.labels" . | nindent 4 }} + {{- if .Values.keda.scaledObject.annotations }} + annotations: {{ toYaml .Values.keda.scaledObject.annotations | nindent 4 }} + {{- end }} +spec: + scaleTargetRef: + name: {{ include "litellm.fullname" . }} + pollingInterval: {{ .Values.keda.pollingInterval }} + cooldownPeriod: {{ .Values.keda.cooldownPeriod }} + minReplicaCount: {{ .Values.keda.minReplicas }} + maxReplicaCount: {{ .Values.keda.maxReplicas }} +{{- with .Values.keda.fallback }} + fallback: + failureThreshold: {{ .failureThreshold | default 3 }} + replicas: {{ .replicas | default $.Values.keda.maxReplicas }} +{{- end }} + triggers: +{{- with .Values.keda.triggers }} + {{- toYaml . | nindent 2 }} +{{- end }} + advanced: + restoreToOriginalReplicaCount: {{ .Values.keda.restoreToOriginalReplicaCount }} +{{- if .Values.keda.behavior }} + horizontalPodAutoscalerConfig: + behavior: +{{- with .Values.keda.behavior }} +{{- toYaml . | nindent 8 }} +{{- end }} + +{{- end }} +{{- end }} diff --git a/deploy/charts/litellm-helm/templates/migrations-job.yaml b/deploy/charts/litellm-helm/templates/migrations-job.yaml index 243a4ba7d481..3459fa12d1c5 100644 --- a/deploy/charts/litellm-helm/templates/migrations-job.yaml +++ b/deploy/charts/litellm-helm/templates/migrations-job.yaml @@ -22,6 +22,9 @@ spec: metadata: labels: {{- include "litellm.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} annotations: {{- with .Values.migrationJob.annotations }} {{- toYaml . | nindent 8 }} @@ -32,6 +35,10 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} serviceAccountName: {{ include "litellm.serviceAccountName" . }} + {{- with .Values.migrationJob.extraInitContainers }} + initContainers: + {{- toYaml . 
| nindent 8 }} + {{- end }} containers: - name: prisma-migrations image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default (printf "main-%s" .Chart.AppVersion) }}" diff --git a/deploy/charts/litellm-helm/templates/servicemonitor.yaml b/deploy/charts/litellm-helm/templates/servicemonitor.yaml new file mode 100644 index 000000000000..743098deb3f0 --- /dev/null +++ b/deploy/charts/litellm-helm/templates/servicemonitor.yaml @@ -0,0 +1,39 @@ +{{- with .Values.serviceMonitor }} +{{- if and (eq .enabled true) }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "litellm.fullname" $ }} + labels: + {{- include "litellm.labels" $ | nindent 4 }} + {{- if .labels }} + {{- toYaml .labels | nindent 4 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "litellm.selectorLabels" $ | nindent 6 }} + namespaceSelector: + matchNames: + # if not set, use the release namespace + {{- if not .namespaceSelector.matchNames }} + - {{ $.Release.Namespace | quote }} + {{- else }} + {{- toYaml .namespaceSelector.matchNames | nindent 4 }} + {{- end }} + endpoints: + - port: http + path: /metrics/ + interval: {{ .interval }} + scrapeTimeout: {{ .scrapeTimeout }} + scheme: http + {{- if .relabelings }} + relabelings: +{{- toYaml .relabelings | nindent 4 }} + {{- end }} +{{- end }} +{{- end }} diff --git a/deploy/charts/litellm-helm/templates/tests/test-servicemonitor.yaml b/deploy/charts/litellm-helm/templates/tests/test-servicemonitor.yaml new file mode 100644 index 000000000000..c2a4f84ec21f --- /dev/null +++ b/deploy/charts/litellm-helm/templates/tests/test-servicemonitor.yaml @@ -0,0 +1,152 @@ +{{- if .Values.serviceMonitor.enabled }} +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "litellm.fullname" . }}-test-servicemonitor" + labels: + {{- include "litellm.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: test + image: bitnami/kubectl:latest + command: ['sh', '-c'] + args: + - | + set -e + echo "🔍 Testing ServiceMonitor configuration..." + + # Check if ServiceMonitor exists + if ! kubectl get servicemonitor {{ include "litellm.fullname" . }} -n {{ .Release.Namespace }} &>/dev/null; then + echo "❌ ServiceMonitor not found" + exit 1 + fi + echo "✅ ServiceMonitor exists" + + # Get ServiceMonitor YAML + SM=$(kubectl get servicemonitor {{ include "litellm.fullname" . }} -n {{ .Release.Namespace }} -o yaml) + + # Test endpoint configuration + ENDPOINT_PORT=$(echo "$SM" | grep -A 5 "endpoints:" | grep "port:" | awk '{print $2}') + if [ "$ENDPOINT_PORT" != "http" ]; then + echo "❌ Endpoint port mismatch. Expected: http, Got: $ENDPOINT_PORT" + exit 1 + fi + echo "✅ Endpoint port is correctly set to: $ENDPOINT_PORT" + + # Test endpoint path + ENDPOINT_PATH=$(echo "$SM" | grep -A 5 "endpoints:" | grep "path:" | awk '{print $2}') + if [ "$ENDPOINT_PATH" != "/metrics/" ]; then + echo "❌ Endpoint path mismatch. Expected: /metrics/, Got: $ENDPOINT_PATH" + exit 1 + fi + echo "✅ Endpoint path is correctly set to: $ENDPOINT_PATH" + + # Test interval + INTERVAL=$(echo "$SM" | grep "interval:" | awk '{print $2}') + if [ "$INTERVAL" != "{{ .Values.serviceMonitor.interval }}" ]; then + echo "❌ Interval mismatch. 
Expected: {{ .Values.serviceMonitor.interval }}, Got: $INTERVAL" + exit 1 + fi + echo "✅ Interval is correctly set to: $INTERVAL" + + # Test scrapeTimeout + TIMEOUT=$(echo "$SM" | grep "scrapeTimeout:" | awk '{print $2}') + if [ "$TIMEOUT" != "{{ .Values.serviceMonitor.scrapeTimeout }}" ]; then + echo "❌ ScrapeTimeout mismatch. Expected: {{ .Values.serviceMonitor.scrapeTimeout }}, Got: $TIMEOUT" + exit 1 + fi + echo "✅ ScrapeTimeout is correctly set to: $TIMEOUT" + + # Test scheme + SCHEME=$(echo "$SM" | grep "scheme:" | awk '{print $2}') + if [ "$SCHEME" != "http" ]; then + echo "❌ Scheme mismatch. Expected: http, Got: $SCHEME" + exit 1 + fi + echo "✅ Scheme is correctly set to: $SCHEME" + + {{- if .Values.serviceMonitor.labels }} + # Test custom labels + echo "🔍 Checking custom labels..." + {{- range $key, $value := .Values.serviceMonitor.labels }} + LABEL_VALUE=$(echo "$SM" | grep -A 20 "metadata:" | grep "{{ $key }}:" | awk '{print $2}') + if [ "$LABEL_VALUE" != "{{ $value }}" ]; then + echo "❌ Label {{ $key }} mismatch. Expected: {{ $value }}, Got: $LABEL_VALUE" + exit 1 + fi + echo "✅ Label {{ $key }} is correctly set to: {{ $value }}" + {{- end }} + {{- end }} + + {{- if .Values.serviceMonitor.annotations }} + # Test annotations + echo "🔍 Checking annotations..." + {{- range $key, $value := .Values.serviceMonitor.annotations }} + ANNOTATION_VALUE=$(echo "$SM" | grep -A 10 "annotations:" | grep "{{ $key }}:" | awk '{print $2}') + if [ "$ANNOTATION_VALUE" != "{{ $value }}" ]; then + echo "❌ Annotation {{ $key }} mismatch. Expected: {{ $value }}, Got: $ANNOTATION_VALUE" + exit 1 + fi + echo "✅ Annotation {{ $key }} is correctly set to: {{ $value }}" + {{- end }} + {{- end }} + + {{- if .Values.serviceMonitor.namespaceSelector.matchNames }} + # Test namespace selector + echo "🔍 Checking namespace selector..." + {{- range .Values.serviceMonitor.namespaceSelector.matchNames }} + if ! echo "$SM" | grep -A 5 "namespaceSelector:" | grep -q "{{ . }}"; then + echo "❌ Namespace {{ . }} not found in namespaceSelector" + exit 1 + fi + echo "✅ Namespace {{ . }} found in namespaceSelector" + {{- end }} + {{- else }} + # Test default namespace selector (should be release namespace) + if ! echo "$SM" | grep -A 5 "namespaceSelector:" | grep -q "{{ .Release.Namespace }}"; then + echo "❌ Release namespace {{ .Release.Namespace }} not found in namespaceSelector" + exit 1 + fi + echo "✅ Default namespace selector set to release namespace: {{ .Release.Namespace }}" + {{- end }} + + {{- if .Values.serviceMonitor.relabelings }} + # Test relabelings + echo "🔍 Checking relabelings configuration..." + if ! echo "$SM" | grep -q "relabelings:"; then + echo "❌ Relabelings section not found" + exit 1 + fi + echo "✅ Relabelings section exists" + {{- range .Values.serviceMonitor.relabelings }} + {{- if .targetLabel }} + if ! echo "$SM" | grep -A 50 "relabelings:" | grep -q "targetLabel: {{ .targetLabel }}"; then + echo "❌ Relabeling targetLabel {{ .targetLabel }} not found" + exit 1 + fi + echo "✅ Relabeling targetLabel {{ .targetLabel }} found" + {{- end }} + {{- if .action }} + if ! echo "$SM" | grep -A 50 "relabelings:" | grep -q "action: {{ .action }}"; then + echo "❌ Relabeling action {{ .action }} not found" + exit 1 + fi + echo "✅ Relabeling action {{ .action }} found" + {{- end }} + {{- end }} + {{- end }} + + # Test selector labels match the service + echo "🔍 Checking selector labels match service..." + SVC_LABELS=$(kubectl get svc {{ include "litellm.fullname" . 
}} -n {{ .Release.Namespace }} -o jsonpath='{.metadata.labels}') + echo "Service labels: $SVC_LABELS" + echo "✅ Selector labels validation passed" + + echo "" + echo "🎉 All ServiceMonitor tests passed successfully!" + serviceAccountName: {{ include "litellm.serviceAccountName" . }} + restartPolicy: Never +{{- end }} + diff --git a/deploy/charts/litellm-helm/tests/deployment_command_args_labels_tests.yaml b/deploy/charts/litellm-helm/tests/deployment_command_args_labels_tests.yaml new file mode 100644 index 000000000000..6b0d45ebf482 --- /dev/null +++ b/deploy/charts/litellm-helm/tests/deployment_command_args_labels_tests.yaml @@ -0,0 +1,68 @@ +suite: test deployment command, args, and deploymentLabels +templates: + - deployment.yaml + - configmap-litellm.yaml +tests: + - it: should override args when custom args specified + template: deployment.yaml + set: + args: + - --custom-arg1 + - value1 + - --custom-arg2 + asserts: + - equal: + path: spec.template.spec.containers[0].args + value: + - --custom-arg1 + - value1 + - --custom-arg2 + - it: should set custom command when specified + template: deployment.yaml + set: + command: + - /bin/sh + - -c + asserts: + - equal: + path: spec.template.spec.containers[0].command + value: + - /bin/sh + - -c + - it: should set custom command and args together + template: deployment.yaml + set: + command: + - python + - -u + args: + - my_script.py + - --verbose + asserts: + - equal: + path: spec.template.spec.containers[0].command + value: + - python + - -u + - equal: + path: spec.template.spec.containers[0].args + value: + - my_script.py + - --verbose + - it: should add deploymentLabels to deployment metadata + template: deployment.yaml + set: + deploymentLabels: + environment: production + team: platform + version: v1.2.3 + asserts: + - equal: + path: metadata.labels.environment + value: production + - equal: + path: metadata.labels.team + value: platform + - equal: + path: metadata.labels.version + value: v1.2.3 diff --git a/deploy/charts/litellm-helm/tests/deployment_tests.yaml b/deploy/charts/litellm-helm/tests/deployment_tests.yaml index f9c83966696a..f1229e102351 100644 --- a/deploy/charts/litellm-helm/tests/deployment_tests.yaml +++ b/deploy/charts/litellm-helm/tests/deployment_tests.yaml @@ -136,4 +136,27 @@ tests: path: spec.template.spec.containers[0].volumeMounts content: name: litellm-config - mountPath: /etc/litellm/ \ No newline at end of file + mountPath: /etc/litellm/config.yaml + subPath: config.yaml + - it: should work with lifecycle hooks + template: deployment.yaml + set: + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - echo "Container stopping" + asserts: + - exists: + path: spec.template.spec.containers[0].lifecycle + - equal: + path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[0] + value: /bin/sh + - equal: + path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[1] + value: -c + - equal: + path: spec.template.spec.containers[0].lifecycle.preStop.exec.command[2] + value: echo "Container stopping" \ No newline at end of file diff --git a/deploy/charts/litellm-helm/tests/ingress_tests.yaml b/deploy/charts/litellm-helm/tests/ingress_tests.yaml new file mode 100644 index 000000000000..aad6ecfcee8a --- /dev/null +++ b/deploy/charts/litellm-helm/tests/ingress_tests.yaml @@ -0,0 +1,45 @@ +suite: Ingress Configuration Tests +templates: + - ingress.yaml +tests: + - it: should not create Ingress by default + asserts: + - hasDocuments: + count: 0 + + - it: should create Ingress when enabled + 
set: + ingress.enabled: true + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Ingress + + - it: should add custom labels + set: + ingress.enabled: true + ingress.labels: + custom-label: "true" + another-label: "value" + asserts: + - isKind: + of: Ingress + - equal: + path: metadata.labels.custom-label + value: "true" + - equal: + path: metadata.labels.another-label + value: "value" + + - it: should add annotations + set: + ingress.enabled: true + ingress.annotations: + kubernetes.io/ingress.class: "nginx" + asserts: + - isKind: + of: Ingress + - equal: + path: metadata.annotations["kubernetes.io/ingress.class"] + value: "nginx" diff --git a/deploy/charts/litellm-helm/values.yaml b/deploy/charts/litellm-helm/values.yaml index c1792497d291..cea25974bb08 100644 --- a/deploy/charts/litellm-helm/values.yaml +++ b/deploy/charts/litellm-helm/values.yaml @@ -3,6 +3,7 @@ # Declare variables to be passed into your templates. replicaCount: 1 +# numWorkers: 2 image: # Use "ghcr.io/berriai/litellm-database" for optimized image with database @@ -29,14 +30,26 @@ serviceAccount: # annotations for litellm deployment deploymentAnnotations: {} +deploymentLabels: {} # annotations for litellm pods podAnnotations: {} podLabels: {} +terminationGracePeriodSeconds: 90 +topologySpreadConstraints: + [] + # - maxSkew: 1 + # topologyKey: kubernetes.io/hostname + # whenUnsatisfiable: DoNotSchedule + # labelSelector: + # matchLabels: + # app: litellm + # At the time of writing, the litellm docker image requires write access to the # filesystem on startup so that prisma can install some dependencies. podSecurityContext: {} -securityContext: {} +securityContext: + {} # capabilities: # drop: # - ALL @@ -47,13 +60,15 @@ securityContext: {} # A list of Kubernetes Secret objects that will be exported to the LiteLLM proxy # pod as environment variables. These secrets can then be referenced in the # configuration file (or "litellm" ConfigMap) with `os.environ/` -environmentSecrets: [] +environmentSecrets: + [] # - litellm-env-secret # A list of Kubernetes ConfigMap objects that will be exported to the LiteLLM proxy # pod as environment variables. The ConfigMap kv-pairs can then be referenced in the # configuration file (or "litellm" ConfigMap) with `os.environ/` -environmentConfigMaps: [] +environmentConfigMaps: + [] # - litellm-env-configmap service: @@ -72,7 +87,9 @@ separateHealthPort: 8081 ingress: enabled: false className: "nginx" - annotations: {} + labels: {} + annotations: + {} # kubernetes.io/ingress.class: nginx # kubernetes.io/tls-acme: "true" hosts: @@ -119,7 +136,8 @@ proxy_config: general_settings: master_key: os.environ/PROXY_MASTER_KEY -resources: {} +resources: + {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little # resources, such as Minikube. 
If you do want to specify resources, uncomment the following @@ -138,6 +156,40 @@ autoscaling: targetCPUUtilizationPercentage: 80 # targetMemoryUtilizationPercentage: 80 +# Autoscaling with keda is mutually exclusive with hpa +keda: + enabled: false + minReplicas: 1 + maxReplicas: 100 + pollingInterval: 30 + cooldownPeriod: 300 + # fallback: + # failureThreshold: 3 + # replicas: 11 + restoreToOriginalReplicaCount: false + scaledObject: + annotations: {} + triggers: [] + # - type: prometheus + # metadata: + # serverAddress: http://:9090 + # metricName: http_requests_total + # threshold: '100' + # query: sum(rate(http_requests_total{deployment="my-deployment"}[2m])) + behavior: {} + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 2 + # periodSeconds: 60 + # Additional volumes on the output Deployment definition. volumes: [] # - name: foo @@ -182,6 +234,14 @@ db: # instance. See the "postgresql" top level key for additional configuration. deployStandalone: true +# Lifecycle hooks for the LiteLLM container +# Example: +# lifecycle: +# preStop: +# exec: +# command: ["/bin/sh", "-c", "sleep 10"] +lifecycle: {} + # Settings for Bitnami postgresql chart (if db.deployStandalone is true, ignored # otherwise) postgresql: @@ -221,7 +281,8 @@ migrationJob: # cpu: 100m # memory: 100Mi extraContainers: [] - + extraInitContainers: [] + # Hook configuration hooks: argocd: @@ -230,21 +291,51 @@ migrationJob: enabled: false # Additional environment variables to be added to the deployment as a map of key-value pairs -envVars: { - # USE_DDTRACE: "true" -} +envVars: {} +# USE_DDTRACE: "true" # Additional environment variables to be added to the deployment as a list of k8s env vars -extraEnvVars: { - # - name: EXTRA_ENV_VAR - # value: EXTRA_ENV_VAR_VALUE -} - +extraEnvVars: {} + +# if you want to override the container command, you can do so here +command: {} +# if you want to override the container args, you can do so here +args: {} + +# - name: EXTRA_ENV_VAR +# value: EXTRA_ENV_VAR_VALUE +# Additional Kubernetes resources to deploy with litellm +extraResources: [] + +# - apiVersion: v1 +# kind: ConfigMap +# metadata: +# name: my-extra-config +# data: +# foo: bar # Pod Disruption Budget pdb: enabled: false # Set exactly one of the following. If both are set, minAvailable takes precedence. - minAvailable: null # e.g. "50%" or 1 - maxUnavailable: null # e.g. 1 or "20%" + minAvailable: null # e.g. "50%" or 1 + maxUnavailable: null # e.g. 1 or "20%" annotations: {} labels: {} + +serviceMonitor: + enabled: false + labels: + {} + # test: test + annotations: + {} + # kubernetes.io/test: test + interval: 15s + scrapeTimeout: 10s + relabelings: [] + # - targetLabel: __meta_kubernetes_pod_node_name + # replacement: $1 + # action: replace + namespaceSelector: + matchNames: [] + # - test-namespace diff --git a/dist/litellm-1.79.1.tar.gz b/dist/litellm-1.79.1.tar.gz new file mode 100644 index 000000000000..5980922c1b59 Binary files /dev/null and b/dist/litellm-1.79.1.tar.gz differ diff --git a/docker-compose.yml b/docker-compose.yml index c268f9ba0ff6..988860a7877a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,7 +4,7 @@ services: context: . 
args: target: runtime - image: ghcr.io/berriai/litellm:main-stable + image: docker.litellm.ai/berriai/litellm:main-stable ######################################### ## Uncomment these lines to start proxy with a config.yaml file ## # volumes: @@ -22,7 +22,9 @@ services: depends_on: - db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first healthcheck: # Defines the health check configuration for the container - test: [ "CMD-SHELL", "wget --no-verbose --tries=1 http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check + test: + - CMD-SHELL + - python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveliness')" # Command to execute for health check interval: 30s # Perform health check every 30 seconds timeout: 10s # Health check command times out after 10 seconds retries: 3 # Retry up to 3 times if health check fails diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine index f036081549ab..ef2bb98db6ed 100644 --- a/docker/Dockerfile.alpine +++ b/docker/Dockerfile.alpine @@ -34,8 +34,8 @@ RUN pip wheel --no-cache-dir --wheel-dir=/wheels/ -r requirements.txt # Runtime stage FROM $LITELLM_RUNTIME_IMAGE AS runtime -# Update dependencies and clean up -RUN apk upgrade --no-cache +# Update dependencies and clean up, install libsndfile for audio processing +RUN apk upgrade --no-cache && apk add --no-cache libsndfile WORKDIR /app @@ -46,8 +46,9 @@ COPY --from=builder /wheels/ /wheels/ # Install the built wheel using pip; again using a wildcard if it's the only file RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels -RUN chmod +x docker/entrypoint.sh -RUN chmod +x docker/prod_entrypoint.sh +# Convert Windows line endings to Unix for entrypoint scripts +RUN sed -i 's/\r$//' docker/entrypoint.sh && chmod +x docker/entrypoint.sh +RUN sed -i 's/\r$//' docker/prod_entrypoint.sh && chmod +x docker/prod_entrypoint.sh EXPOSE 4000/tcp diff --git a/docker/Dockerfile.custom_ui b/docker/Dockerfile.custom_ui index 5a313142112a..57926bcd1707 100644 --- a/docker/Dockerfile.custom_ui +++ b/docker/Dockerfile.custom_ui @@ -5,7 +5,8 @@ FROM ghcr.io/berriai/litellm:litellm_fwd_server_root_path-dev WORKDIR /app # Install Node.js and npm (adjust version as needed) -RUN apt-get update && apt-get install -y nodejs npm +RUN apt-get update && apt-get install -y nodejs npm && \ + npm install -g npm@latest tar@latest # Copy the UI source into the container COPY ./ui/litellm-dashboard /app/ui/litellm-dashboard @@ -32,8 +33,9 @@ RUN rm -rf /app/litellm/proxy/_experimental/out/* && \ WORKDIR /app # Make sure your docker/entrypoint.sh is executable -RUN chmod +x docker/entrypoint.sh -RUN chmod +x docker/prod_entrypoint.sh +# Convert Windows line endings to Unix for entrypoint scripts +RUN sed -i 's/\r$//' docker/entrypoint.sh && chmod +x docker/entrypoint.sh +RUN sed -i 's/\r$//' docker/prod_entrypoint.sh && chmod +x docker/prod_entrypoint.sh # Expose the necessary port EXPOSE 4000/tcp diff --git a/docker/Dockerfile.database b/docker/Dockerfile.database index 351c4f6bc485..24bf706434d7 100644 --- a/docker/Dockerfile.database +++ b/docker/Dockerfile.database @@ -1,8 +1,8 @@ # Base image for building -ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/python:latest-dev +ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/wolfi-base # Runtime image -ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/python:latest-dev +ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/wolfi-base # Builder stage FROM 
$LITELLM_BUILD_IMAGE AS builder @@ -12,17 +12,23 @@ WORKDIR /app USER root # Install build dependencies -RUN apk add --no-cache gcc python3-dev openssl openssl-dev +RUN apk add --no-cache \ + bash \ + gcc \ + py3-pip \ + python3 \ + python3-dev \ + openssl \ + openssl-dev - -RUN pip install --upgrade pip && \ - pip install build +RUN python -m pip install build # Copy the current directory contents into the container at /app COPY . . # Build Admin UI -RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/build_admin_ui.sh && chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh # Build the package RUN rm -rf dist/* && python -m build @@ -43,7 +49,8 @@ FROM $LITELLM_RUNTIME_IMAGE AS runtime USER root # Install runtime dependencies -RUN apk add --no-cache openssl +RUN apk add --no-cache bash openssl tzdata nodejs npm python3 py3-pip libsndfile && \ + npm install -g npm@latest tar@latest WORKDIR /app # Copy the current directory contents into the container at /app @@ -58,20 +65,23 @@ COPY --from=builder /wheels/ /wheels/ RUN pip install *.whl /wheels/* --no-index --find-links=/wheels/ && rm -f *.whl && rm -rf /wheels # Install semantic_router and aurelio-sdk using script -RUN chmod +x docker/install_auto_router.sh && ./docker/install_auto_router.sh +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/install_auto_router.sh && chmod +x docker/install_auto_router.sh && ./docker/install_auto_router.sh # ensure pyjwt is used, not jwt RUN pip uninstall jwt -y RUN pip uninstall PyJWT -y RUN pip install PyJWT==2.9.0 --no-cache-dir -# Build Admin UI -RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh +# Build Admin UI (runtime stage) +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/build_admin_ui.sh && chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh # Generate prisma client RUN prisma generate -RUN chmod +x docker/entrypoint.sh -RUN chmod +x docker/prod_entrypoint.sh +# Convert Windows line endings to Unix for entrypoint scripts +RUN sed -i 's/\r$//' docker/entrypoint.sh && chmod +x docker/entrypoint.sh +RUN sed -i 's/\r$//' docker/prod_entrypoint.sh && chmod +x docker/prod_entrypoint.sh EXPOSE 4000/tcp RUN apk add --no-cache supervisor diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 2e886915203d..ae557d4647fc 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -40,7 +40,8 @@ COPY enterprise/ ./enterprise/ COPY docker/ ./docker/ # Build Admin UI once -RUN chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh +# Convert Windows line endings to Unix and make executable +RUN sed -i 's/\r$//' docker/build_admin_ui.sh && chmod +x docker/build_admin_ui.sh && ./docker/build_admin_ui.sh # Build the package RUN rm -rf dist/* && python -m build @@ -57,7 +58,11 @@ USER root # Install only runtime dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ libssl3 \ - && rm -rf /var/lib/apt/lists/* + libatomic1 \ + nodejs \ + npm \ + && rm -rf /var/lib/apt/lists/* \ + && npm install -g npm@latest tar@latest WORKDIR /app @@ -76,8 +81,12 @@ RUN pip install --no-cache-dir *.whl /wheels/* --no-index --find-links=/wheels/ rm -rf /wheels # Generate prisma client and set permissions +# Convert Windows line endings to Unix for entrypoint scripts RUN prisma generate && \ - chmod +x docker/entrypoint.sh docker/prod_entrypoint.sh + sed -i 
's/\r$//' docker/entrypoint.sh && \ + sed -i 's/\r$//' docker/prod_entrypoint.sh && \ + chmod +x docker/entrypoint.sh && \ + chmod +x docker/prod_entrypoint.sh EXPOSE 4000/tcp diff --git a/docker/Dockerfile.health_check b/docker/Dockerfile.health_check new file mode 100644 index 000000000000..de62e4bd7292 --- /dev/null +++ b/docker/Dockerfile.health_check @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy health check script and requirements +COPY scripts/health_check/health_check_client.py /app/health_check_client.py +COPY scripts/health_check/health_check_requirements.txt /app/requirements.txt + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Make script executable +RUN chmod +x /app/health_check_client.py + +# Set entrypoint +ENTRYPOINT ["python", "/app/health_check_client.py"] diff --git a/docker/Dockerfile.non_root b/docker/Dockerfile.non_root index ebfeeecbbdae..9ff27e07494a 100644 --- a/docker/Dockerfile.non_root +++ b/docker/Dockerfile.non_root @@ -1,7 +1,7 @@ # Base images ARG LITELLM_BUILD_IMAGE=cgr.dev/chainguard/wolfi-base ARG LITELLM_RUNTIME_IMAGE=cgr.dev/chainguard/wolfi-base -ARG PROXY_EXTRAS_SOURCE=local +ARG PROXY_EXTRAS_SOURCE=published # ----------------- # Builder Stage @@ -15,6 +15,7 @@ USER root RUN for i in 1 2 3; do \ apk add --no-cache \ python3 \ + python3-dev \ py3-pip \ clang \ llvm \ @@ -40,7 +41,7 @@ COPY . . ENV LITELLM_NON_ROOT=true # Build Admin UI using the upstream command order while keeping a single RUN layer -RUN mkdir -p /tmp/litellm_ui && \ +RUN mkdir -p /var/lib/litellm/ui && \ npm install -g npm@latest && npm cache clean --force && \ cd /app/ui/litellm-dashboard && \ if [ -f "/app/enterprise/enterprise_ui/enterprise_colors.json" ]; then \ @@ -49,10 +50,10 @@ RUN mkdir -p /tmp/litellm_ui && \ rm -f package-lock.json && \ npm install --legacy-peer-deps && \ npm run build && \ - cp -r /app/ui/litellm-dashboard/out/* /tmp/litellm_ui/ && \ - mkdir -p /tmp/litellm_assets && \ - cp /app/litellm/proxy/logo.jpg /tmp/litellm_assets/logo.jpg && \ - ( cd /tmp/litellm_ui && \ + cp -r /app/ui/litellm-dashboard/out/* /var/lib/litellm/ui/ && \ + mkdir -p /var/lib/litellm/assets && \ + cp /app/litellm/proxy/logo.jpg /var/lib/litellm/assets/logo.jpg && \ + ( cd /var/lib/litellm/ui && \ for html_file in *.html; do \ if [ "$html_file" != "index.html" ] && [ -f "$html_file" ]; then \ folder_name="${html_file%.html}" && \ @@ -79,7 +80,7 @@ ENV PRISMA_BINARY_CACHE_DIR=/app/.cache/prisma-python/binaries \ XDG_CACHE_HOME=/app/.cache \ PATH="/usr/lib/python3.13/site-packages/nodejs/bin:${PATH}" -RUN pip install --no-cache-dir prisma==0.11.0 nodejs-bin==18.4.0a4 \ +RUN pip install --no-cache-dir prisma==0.11.0 nodejs-wheel-binaries==24.12.0 \ && mkdir -p /app/.cache/npm RUN NPM_CONFIG_CACHE=/app/.cache/npm \ @@ -103,16 +104,19 @@ RUN for i in 1 2 3; do \ done \ && for i in 1 2 3; do \ apk add --no-cache python3 py3-pip bash openssl tzdata nodejs npm supervisor && break || sleep 5; \ - done + done \ + && npm install -g npm@latest tar@latest # Copy artifacts from builder COPY --from=builder /app/requirements.txt /app/requirements.txt COPY --from=builder /app/docker/entrypoint.sh /app/docker/prod_entrypoint.sh /app/docker/ COPY --from=builder /app/docker/supervisord.conf /etc/supervisord.conf COPY --from=builder /app/schema.prisma /app/ +# Copy prisma_migration.py for Helm migrations job compatibility +COPY --from=builder /app/litellm/proxy/prisma_migration.py /app/litellm/proxy/prisma_migration.py COPY --from=builder /wheels/ 
/wheels/ -COPY --from=builder /tmp/litellm_ui /tmp/litellm_ui -COPY --from=builder /tmp/litellm_assets /tmp/litellm_assets +COPY --from=builder /var/lib/litellm/ui /var/lib/litellm/ui +COPY --from=builder /var/lib/litellm/assets /var/lib/litellm/assets COPY --from=builder /app/.cache /app/.cache COPY --from=builder /app/litellm-proxy-extras /app/litellm-proxy-extras COPY --from=builder \ @@ -144,32 +148,37 @@ RUN pip install --no-index --find-links=/wheels/ -r requirements.txt && \ fi # Permissions, cleanup, and Prisma prep -RUN chmod +x docker/entrypoint.sh docker/prod_entrypoint.sh && \ - mkdir -p /nonexistent /.npm /tmp/litellm_assets /tmp/litellm_ui && \ - chown -R nobody:nogroup /app /tmp/litellm_ui /tmp/litellm_assets /nonexistent /.npm && \ +# Convert Windows line endings to Unix for entrypoint scripts +RUN sed -i 's/\r$//' docker/entrypoint.sh && \ + sed -i 's/\r$//' docker/prod_entrypoint.sh && \ + chmod +x docker/entrypoint.sh docker/prod_entrypoint.sh && \ + mkdir -p /nonexistent /.npm /var/lib/litellm/assets /var/lib/litellm/ui && \ + chown -R nobody:nogroup /app /var/lib/litellm/ui /var/lib/litellm/assets /nonexistent /.npm && \ pip uninstall jwt -y || true && \ pip uninstall PyJWT -y || true && \ - pip install --no-index --find-links=/wheels/ PyJWT==2.9.0 --no-cache-dir && \ + pip install --no-index --find-links=/wheels/ PyJWT==2.10.1 --no-cache-dir && \ rm -rf /wheels && \ PRISMA_PATH=$(python -c "import os, prisma; print(os.path.dirname(prisma.__file__))") && \ chown -R nobody:nogroup $PRISMA_PATH && \ LITELLM_PKG_MIGRATIONS_PATH="$(python -c 'import os, litellm_proxy_extras; print(os.path.dirname(litellm_proxy_extras.__file__))' 2>/dev/null || echo '')/migrations" && \ [ -n "$LITELLM_PKG_MIGRATIONS_PATH" ] && chown -R nobody:nogroup $LITELLM_PKG_MIGRATIONS_PATH && \ LITELLM_PROXY_EXTRAS_PATH=$(python -c "import os, litellm_proxy_extras; print(os.path.dirname(litellm_proxy_extras.__file__))" 2>/dev/null || echo "") && \ - chgrp -R 0 $PRISMA_PATH /tmp/litellm_ui /tmp/litellm_assets && \ + chgrp -R 0 $PRISMA_PATH /var/lib/litellm/ui /var/lib/litellm/assets && \ [ -n "$LITELLM_PROXY_EXTRAS_PATH" ] && chgrp -R 0 $LITELLM_PROXY_EXTRAS_PATH || true && \ - chmod -R g=u $PRISMA_PATH /tmp/litellm_ui /tmp/litellm_assets && \ + chmod -R g=u $PRISMA_PATH /var/lib/litellm/ui /var/lib/litellm/assets && \ [ -n "$LITELLM_PROXY_EXTRAS_PATH" ] && chmod -R g=u $LITELLM_PROXY_EXTRAS_PATH || true && \ - chmod -R g+w $PRISMA_PATH /tmp/litellm_ui /tmp/litellm_assets && \ + chmod -R g+w $PRISMA_PATH /var/lib/litellm/ui /var/lib/litellm/assets && \ [ -n "$LITELLM_PROXY_EXTRAS_PATH" ] && chmod -R g+w $LITELLM_PROXY_EXTRAS_PATH || true && \ chmod -R g+rX $PRISMA_PATH && \ chmod -R g+rX /app/.cache && \ - mkdir -p /tmp/.npm /nonexistent /.npm && \ - prisma generate + mkdir -p /tmp/.npm /nonexistent /.npm # Switch to non-root user for runtime USER nobody +# Generate Prisma client as nobody user to ensure correct file ownership +RUN prisma generate + # Prisma runtime knobs for offline containers ENV PRISMA_SKIP_POSTINSTALL_GENERATE=1 \ PRISMA_HIDE_UPDATE_MESSAGE=1 \ diff --git a/docker/build_from_pip/Dockerfile.build_from_pip b/docker/build_from_pip/Dockerfile.build_from_pip index aeb19bce21ff..05236008ded9 100644 --- a/docker/build_from_pip/Dockerfile.build_from_pip +++ b/docker/build_from_pip/Dockerfile.build_from_pip @@ -1,14 +1,16 @@ -FROM cgr.dev/chainguard/python:latest-dev +FROM python:3.13-alpine -USER root WORKDIR /app ENV HOME=/home/litellm ENV PATH="${HOME}/venv/bin:$PATH" # Install runtime 
dependencies +# Note: Using Python 3.13 for compatibility with ddtrace and other packages +# rust and cargo are required for building ddtrace from source +# musl-dev and libffi-dev are needed for some Python packages on Alpine RUN apk update && \ - apk add --no-cache gcc python3-dev openssl openssl-dev + apk add --no-cache gcc musl-dev libffi-dev openssl openssl-dev rust cargo RUN python -m venv ${HOME}/venv RUN ${HOME}/venv/bin/pip install --no-cache-dir --upgrade pip diff --git a/docker/prod_entrypoint.sh b/docker/prod_entrypoint.sh index 1fc09d2c8648..28d1bdcc2942 100644 --- a/docker/prod_entrypoint.sh +++ b/docker/prod_entrypoint.sh @@ -2,6 +2,7 @@ if [ "$SEPARATE_HEALTH_APP" = "1" ]; then export LITELLM_ARGS="$@" + export SUPERVISORD_STOPWAITSECS="${SUPERVISORD_STOPWAITSECS:-3600}" exec supervisord -c /etc/supervisord.conf fi diff --git a/docker/supervisord.conf b/docker/supervisord.conf index c6855fe652b9..ba9d99d18a5d 100644 --- a/docker/supervisord.conf +++ b/docker/supervisord.conf @@ -1,6 +1,8 @@ [supervisord] nodaemon=true loglevel=info +logfile=/tmp/supervisord.log +pidfile=/tmp/supervisord.pid [group:litellm] programs=main,health @@ -14,6 +16,7 @@ priority=1 exitcodes=0 stopasgroup=true killasgroup=true +stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s stdout_logfile=/dev/stdout stderr_logfile=/dev/stderr stdout_logfile_maxbytes = 0 @@ -29,6 +32,7 @@ priority=2 exitcodes=0 stopasgroup=true killasgroup=true +stopwaitsecs=%(ENV_SUPERVISORD_STOPWAITSECS)s stdout_logfile=/dev/stdout stderr_logfile=/dev/stderr stdout_logfile_maxbytes = 0 diff --git a/docs/my-website/.trivyignore b/docs/my-website/.trivyignore new file mode 100644 index 000000000000..977504f26700 --- /dev/null +++ b/docs/my-website/.trivyignore @@ -0,0 +1,7 @@ +# js-yaml CVE-2025-64718 +# This vulnerability is not applicable because we've forced js-yaml to version 4.1.1 +# via npm overrides in package.json. Trivy incorrectly reports this based on +# dependency requirements in the lockfile, but the actual installed version is 4.1.1. +# Verified with: npm list js-yaml +CVE-2025-64718 + diff --git a/docs/my-website/blog/anthropic_opus_4_5_and_advanced_features/index.md b/docs/my-website/blog/anthropic_opus_4_5_and_advanced_features/index.md new file mode 100644 index 000000000000..8a54426dfb02 --- /dev/null +++ b/docs/my-website/blog/anthropic_opus_4_5_and_advanced_features/index.md @@ -0,0 +1,1070 @@ +--- +slug: anthropic_advanced_features +title: "Day 0 Support: Claude 4.5 Opus (+Advanced Features)" +date: 2025-11-25T10:00:00 +authors: + - name: Sameer Kankute + title: SWE @ LiteLLM (LLM Translation) + url: https://www.linkedin.com/in/sameer-kankute/ + image_url: https://pbs.twimg.com/profile_images/2001352686994907136/ONgNuSk5_400x400.jpg + - name: Krrish Dholakia + title: "CEO, LiteLLM" + url: https://www.linkedin.com/in/krish-d/ + image_url: https://pbs.twimg.com/profile_images/1298587542745358340/DZv3Oj-h_400x400.jpg + - name: Ishaan Jaff + title: "CTO, LiteLLM" + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg +description: "Guide to Claude Opus 4.5 and advanced features in LiteLLM: Tool Search, Programmatic Tool Calling, and Effort Parameter." 
+tags: [anthropic, claude, tool search, programmatic tool calling, effort, advanced features] +hide_table_of_contents: false +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +This guide covers Anthropic's latest model (Claude Opus 4.5) and its advanced features now available in LiteLLM: Tool Search, Programmatic Tool Calling, Tool Input Examples, and the Effort Parameter. + +--- + +| Feature | Supported Models | +|---------|-----------------| +| Tool Search | Claude Opus 4.5, Sonnet 4.5 | +| Programmatic Tool Calling | Claude Opus 4.5, Sonnet 4.5 | +| Input Examples | Claude Opus 4.5, Sonnet 4.5 | +| Effort Parameter | Claude Opus 4.5 only | + +Supported Providers: [Anthropic](../../docs/providers/anthropic), [Bedrock](../../docs/providers/bedrock), [Vertex AI](../../docs/providers/vertex_partner#vertex-ai---anthropic-claude), [Azure AI](../../docs/providers/azure_ai). + +## Usage + + + + + +```python +import os +from litellm import completion + +# set env - [OPTIONAL] replace with your anthropic key +os.environ["ANTHROPIC_API_KEY"] = "your-api-key" + +messages = [{"role": "user", "content": "Hey! how's it going?"}] + +## OPENAI /chat/completions API format +response = completion(model="claude-opus-4-5-20251101", messages=messages) +print(response) + +``` + + + + +**1. Setup config.yaml** + +```yaml +model_list: + - model_name: claude-4 ### RECEIVED MODEL NAME ### + litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input + model: claude-opus-4-5-20251101 ### MODEL NAME sent to `litellm.completion()` ### + api_key: "os.environ/ANTHROPIC_API_KEY" # does os.getenv("ANTHROPIC_API_KEY") +``` + +**2. Start the proxy** + +```bash +litellm --config /path/to/config.yaml +``` + +**3. Test it!** + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + +```bash +curl --location 'http://0.0.0.0:4000/v1/messages' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + + + + +## Usage - Bedrock + +:::info + +LiteLLM uses the boto3 library to authenticate with Bedrock. + +For more ways to authenticate with Bedrock, see the [Bedrock documentation](../../docs/providers/bedrock#authentication). + +::: + + + + + +```python +import os +from litellm import completion + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "" + +## OPENAI /chat/completions API format +response = completion( + model="bedrock/us.anthropic.claude-opus-4-5-20251101-v1:0", + messages=[{ "content": "Hello, how are you?","role": "user"}] +) +``` + + + + +**1. Setup config.yaml** + +```yaml +model_list: + - model_name: claude-4 ### RECEIVED MODEL NAME ### + litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input + model: bedrock/us.anthropic.claude-opus-4-5-20251101-v1:0 ### MODEL NAME sent to `litellm.completion()` ### + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: os.environ/AWS_REGION_NAME +``` + +**2. 
Start the proxy** + +```bash +litellm --config /path/to/config.yaml +``` + +**3. Test it!** + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + +```bash +curl --location 'http://0.0.0.0:4000/v1/messages' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + +```bash +curl --location 'http://0.0.0.0:4000/bedrock/model/claude-4/invoke' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "max_tokens": 1024, + "messages": [{"role": "user", "content": "Hello, how are you?"}] + }' +``` + + +```bash +curl --location 'http://0.0.0.0:4000/bedrock/model/claude-4/converse' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "messages": [{"role": "user", "content": "Hello, how are you?"}] + }' +``` + + + + + + +## Usage - Vertex AI + + + + + +```python +from litellm import completion +import json + +## GET CREDENTIALS +## RUN ## +# !gcloud auth application-default login - run this to add vertex credentials to your env +## OR ## +file_path = 'path/to/vertex_ai_service_account.json' + +# Load the JSON file +with open(file_path, 'r') as file: + vertex_credentials = json.load(file) + +# Convert to JSON string +vertex_credentials_json = json.dumps(vertex_credentials) + +## COMPLETION CALL +response = completion( + model="vertex_ai/claude-opus-4-5@20251101", + messages=[{ "content": "Hello, how are you?","role": "user"}], + vertex_credentials=vertex_credentials_json, + vertex_project="your-project-id", + vertex_location="us-east5" +) +``` + + + + +**1. Setup config.yaml** + +```yaml +model_list: + - model_name: claude-4 ### RECEIVED MODEL NAME ### + litellm_params: + model: vertex_ai/claude-opus-4-5@20251101 + vertex_credentials: "/path/to/service_account.json" + vertex_project: "your-project-id" + vertex_location: "us-east5" +``` + +**2. Start the proxy** + +```bash +litellm --config /path/to/config.yaml +``` + +**3. Test it!** + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + +```bash +curl --location 'http://0.0.0.0:4000/v1/messages' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + } +' +``` + + + + + +## Usage - Azure Anthropic (Azure Foundry Claude) + +LiteLLM funnels Azure Claude deployments through the `azure_ai/` provider so Claude Opus models on Azure Foundry keep working with Tool Search, Effort, streaming, and the rest of the advanced feature set. Point `AZURE_AI_API_BASE` to `https://.services.ai.azure.com/anthropic` (LiteLLM appends `/v1/messages` automatically) and authenticate with `AZURE_AI_API_KEY` or an Azure AD token. 
+ + + + +```python +import os +from litellm import completion + +# Configure Azure credentials +os.environ["AZURE_AI_API_KEY"] = "your-azure-ai-api-key" +os.environ["AZURE_AI_API_BASE"] = "https://my-resource.services.ai.azure.com/anthropic" + +response = completion( + model="azure_ai/claude-opus-4-1", + messages=[{"role": "user", "content": "Explain how Azure Anthropic hosts Claude Opus differently from the public Anthropic API."}], + max_tokens=1200, + temperature=0.7, + stream=True, +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + +**1. Set environment variables** + +```bash +export AZURE_AI_API_KEY="your-azure-ai-api-key" +export AZURE_AI_API_BASE="https://my-resource.services.ai.azure.com/anthropic" +``` + +**2. Configure the proxy** + +```yaml +model_list: + - model_name: claude-4-azure + litellm_params: + model: azure_ai/claude-opus-4-1 + api_key: os.environ/AZURE_AI_API_KEY + api_base: os.environ/AZURE_AI_API_BASE +``` + +**3. Start LiteLLM** + +```bash +litellm --config /path/to/config.yaml +``` + +**4. Test the Azure Claude route** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer $LITELLM_KEY' \ + --data '{ + "model": "claude-4-azure", + "messages": [ + { + "role": "user", + "content": "How do I use Claude Opus 4 via Azure Anthropic in LiteLLM?" + } + ], + "max_tokens": 1024 + }' +``` + + + + + +## Tool Search {#tool-search} + +This lets Claude work with thousands of tools, by dynamically loading tools on-demand, instead of loading all tools into the context window upfront. + +### Usage Example + + + + +```python +import litellm +import os + +# Configure your API key +os.environ["ANTHROPIC_API_KEY"] = "your-api-key" + +# Define your tools with defer_loading +tools = [ + # Tool search tool (regex variant) + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + # Deferred tools - loaded on-demand + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location. Returns temperature and conditions.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature unit" + } + }, + "required": ["location"] + } + }, + "defer_loading": True # Load on-demand + }, + { + "type": "function", + "function": { + "name": "search_files", + "description": "Search through files in the workspace using keywords", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "file_types": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["query"] + } + }, + "defer_loading": True + }, + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute SQL queries against the database", + "parameters": { + "type": "object", + "properties": { + "sql": {"type": "string"} + }, + "required": ["sql"] + } + }, + "defer_loading": True + } +] + +# Make a request - Claude will search for and use relevant tools +response = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{ + "role": "user", + "content": "What's the weather like in San Francisco?" 
+ }], + tools=tools +) + +print("Claude's response:", response.choices[0].message.content) +print("Tool calls:", response.choices[0].message.tool_calls) + +# Check tool search usage +if hasattr(response.usage, 'server_tool_use'): + print(f"Tool searches performed: {response.usage.server_tool_use.tool_search_requests}") +``` + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-4 + litellm_params: + model: anthropic/claude-opus-4-5-20251101 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [{ + "role": "user", + "content": "What's the weather like in San Francisco?" + }], + "tools": [ + # Tool search tool (regex variant) + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + # Deferred tools - loaded on-demand + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location. Returns temperature and conditions.", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "Temperature unit" + } + }, + "required": ["location"] + } + }, + "defer_loading": True # Load on-demand + }, + { + "type": "function", + "function": { + "name": "search_files", + "description": "Search through files in the workspace using keywords", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "file_types": { + "type": "array", + "items": {"type": "string"} + } + }, + "required": ["query"] + } + }, + "defer_loading": True + }, + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute SQL queries against the database", + "parameters": { + "type": "object", + "properties": { + "sql": {"type": "string"} + }, + "required": ["sql"] + } + }, + "defer_loading": True + } + ] +} +' +``` + + + +### BM25 Variant (Natural Language Search) + +For natural language queries instead of regex patterns: + +```python +tools = [ + { + "type": "tool_search_tool_bm25_20251119", # Natural language variant + "name": "tool_search_tool_bm25" + }, + # ... your deferred tools +] +``` + +--- + +## Programmatic Tool Calling {#programmatic-tool-calling} + +Programmatic tool calling allows Claude to write code that calls your tools programmatically. [Learn more](https://platform.claude.com/docs/en/agents-and-tools/tool-use/programmatic-tool-calling) + + + + +```python +import litellm +import json + +# Define tools that can be called programmatically +tools = [ + # Code execution tool (required for programmatic calling) + { + "type": "code_execution_20250825", + "name": "code_execution" + }, + # Tool that can be called from code + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query against the sales database. 
Returns a list of rows as JSON objects.", + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "SQL query to execute" + } + }, + "required": ["sql"] + } + }, + "allowed_callers": ["code_execution_20250825"] # Enable programmatic calling + } +] + +# First request +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[{ + "role": "user", + "content": "Query sales data for West, East, and Central regions, then tell me which had the highest revenue" + }], + tools=tools +) + +print("Claude's response:", response.choices[0].message) + +# Handle tool calls +messages = [ + {"role": "user", "content": "Query sales data for West, East, and Central regions, then tell me which had the highest revenue"}, + {"role": "assistant", "content": response.choices[0].message.content, "tool_calls": response.choices[0].message.tool_calls} +] + +# Process each tool call +for tool_call in response.choices[0].message.tool_calls: + # Check if it's a programmatic call + if hasattr(tool_call, 'caller') and tool_call.caller: + print(f"Programmatic call to {tool_call.function.name}") + print(f"Called from: {tool_call.caller}") + + # Simulate tool execution + if tool_call.function.name == "query_database": + args = json.loads(tool_call.function.arguments) + # Simulate database query + result = json.dumps([ + {"region": "West", "revenue": 150000}, + {"region": "East", "revenue": 180000}, + {"region": "Central", "revenue": 120000} + ]) + + messages.append({ + "role": "user", + "content": [{ + "type": "tool_result", + "tool_use_id": tool_call.id, + "content": result + }] + }) + +# Get final response +final_response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=messages, + tools=tools +) + +print("\nFinal answer:", final_response.choices[0].message.content) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-4 + litellm_params: + model: anthropic/claude-opus-4-5-20251101 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [{ + "role": "user", + "content": "Query sales data for West, East, and Central regions, then tell me which had the highest revenue" + }], + "tools": [ + # Code execution tool (required for programmatic calling) + { + "type": "code_execution_20250825", + "name": "code_execution" + }, + # Tool that can be called from code + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query against the sales database. Returns a list of rows as JSON objects.", + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "SQL query to execute" + } + }, + "required": ["sql"] + } + }, + "allowed_callers": ["code_execution_20250825"] # Enable programmatic calling + } + ] +} +' +``` + + + +--- + +## Tool Input Examples {#tool-input-examples} + +You can now provide Claude with examples of how to use your tools. 
[Learn more](https://platform.claude.com/docs/en/agents-and-tools/tool-use/tool-input-examples) + + + + + +```python +import litellm + +tools = [ + { + "type": "function", + "function": { + "name": "create_calendar_event", + "description": "Create a new calendar event with attendees and reminders", + "parameters": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "start_time": { + "type": "string", + "description": "ISO 8601 format: YYYY-MM-DDTHH:MM:SS" + }, + "duration_minutes": {"type": "integer"}, + "attendees": { + "type": "array", + "items": { + "type": "object", + "properties": { + "email": {"type": "string"}, + "optional": {"type": "boolean"} + } + } + }, + "reminders": { + "type": "array", + "items": { + "type": "object", + "properties": { + "minutes_before": {"type": "integer"}, + "method": {"type": "string", "enum": ["email", "popup"]} + } + } + } + }, + "required": ["title", "start_time", "duration_minutes"] + } + }, + # Provide concrete examples + "input_examples": [ + { + "title": "Team Standup", + "start_time": "2025-01-15T09:00:00", + "duration_minutes": 30, + "attendees": [ + {"email": "alice@company.com", "optional": False}, + {"email": "bob@company.com", "optional": False} + ], + "reminders": [ + {"minutes_before": 15, "method": "popup"} + ] + }, + { + "title": "Lunch Break", + "start_time": "2025-01-15T12:00:00", + "duration_minutes": 60 + # Demonstrates optional fields can be omitted + } + ] + } +] + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[{ + "role": "user", + "content": "Schedule a team meeting for tomorrow at 2pm for 45 minutes with john@company.com and sarah@company.com" + }], + tools=tools +) + +print("Tool call:", response.choices[0].message.tool_calls[0].function.arguments) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-4 + litellm_params: + model: anthropic/claude-opus-4-5-20251101 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [{ + "role": "user", + "content": "Schedule a team meeting for tomorrow at 2pm for 45 minutes with john@company.com and sarah@company.com" + }], + "tools": [ + { + "type": "function", + "function": { + "name": "create_calendar_event", + "description": "Create a new calendar event with attendees and reminders", + "parameters": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "start_time": { + "type": "string", + "description": "ISO 8601 format: YYYY-MM-DDTHH:MM:SS" + }, + "duration_minutes": {"type": "integer"}, + "attendees": { + "type": "array", + "items": { + "type": "object", + "properties": { + "email": {"type": "string"}, + "optional": {"type": "boolean"} + } + } + }, + "reminders": { + "type": "array", + "items": { + "type": "object", + "properties": { + "minutes_before": {"type": "integer"}, + "method": {"type": "string", "enum": ["email", "popup"]} + } + } + } + }, + "required": ["title", "start_time", "duration_minutes"] + } + }, + # Provide concrete examples + "input_examples": [ + { + "title": "Team Standup", + "start_time": "2025-01-15T09:00:00", + "duration_minutes": 30, + "attendees": [ + {"email": "alice@company.com", "optional": False}, + {"email": "bob@company.com", "optional": False} + ], + "reminders": [ + {"minutes_before": 15, "method": "popup"} + ] + }, + { + "title": "Lunch Break", + "start_time": "2025-01-15T12:00:00", + "duration_minutes": 60 + # Demonstrates optional fields can be omitted + } + ] + } +] +} +' +``` + + + +--- + +## Effort Parameter: Control Token Usage {#effort-parameter} + +Control how much effort Claude puts into its response using the `reasoning_effort` parameter. This allows you to trade off between response thoroughness and token efficiency. + +:::info +LiteLLM automatically maps `reasoning_effort` to Anthropic's `output_config` format and adds the required `effort-2025-11-24` beta header for Claude Opus 4.5. +::: + +Potential values for `reasoning_effort` parameter: `"high"`, `"medium"`, `"low"`. 
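+
+To make the mapping in the note above concrete, here is a rough, illustrative sketch of what the translated Anthropic-side request might look like for `reasoning_effort="medium"`. Only the `output_config` wrapper and the `effort-2025-11-24` beta value are stated above; the nested `effort` key and the `anthropic-beta` header name are assumptions for illustration, not a documented wire format.
+
+```python
+# Illustrative sketch only - not the exact payload LiteLLM produces
+anthropic_payload = {
+    "model": "claude-opus-4-5-20251101",
+    "messages": [{"role": "user", "content": "..."}],
+    "output_config": {"effort": "medium"},  # assumed nesting of the effort value
+}
+extra_headers = {"anthropic-beta": "effort-2025-11-24"}  # assumed header name
+```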
+ +### Usage Example + + + + +```python +import litellm + +message = "Analyze the trade-offs between microservices and monolithic architectures" + +# High effort (default) - Maximum capability +response_high = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{"role": "user", "content": message}], + reasoning_effort="high" +) + +print("High effort response:") +print(response_high.choices[0].message.content) +print(f"Tokens used: {response_high.usage.completion_tokens}\n") + +# Medium effort - Balanced approach +response_medium = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{"role": "user", "content": message}], + reasoning_effort="medium" +) + +print("Medium effort response:") +print(response_medium.choices[0].message.content) +print(f"Tokens used: {response_medium.usage.completion_tokens}\n") + +# Low effort - Maximum efficiency +response_low = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{"role": "user", "content": message}], + reasoning_effort="low" +) + +print("Low effort response:") +print(response_low.choices[0].message.content) +print(f"Tokens used: {response_low.usage.completion_tokens}\n") + +# Compare token usage +print("Token Comparison:") +print(f"High: {response_high.usage.completion_tokens} tokens") +print(f"Medium: {response_medium.usage.completion_tokens} tokens") +print(f"Low: {response_low.usage.completion_tokens} tokens") +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-4 + litellm_params: + model: anthropic/claude-opus-4-5-20251101 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer $LITELLM_KEY' \ +--data ' { + "model": "claude-4", + "messages": [{ + "role": "user", + "content": "Analyze the trade-offs between microservices and monolithic architectures" + }], + "reasoning_effort": "high" + } +' +``` + + diff --git a/docs/my-website/blog/authors.yml b/docs/my-website/blog/authors.yml new file mode 100644 index 000000000000..2a49a7363339 --- /dev/null +++ b/docs/my-website/blog/authors.yml @@ -0,0 +1,24 @@ +litellm: + name: LiteLLM Team + title: LiteLLM Core Team + url: https://github.com/BerriAI/litellm + image_url: https://github.com/BerriAI.png + +krrish: + name: Krrish Dholakia + title: CEO, LiteLLM + url: https://www.linkedin.com/in/krish-d/ + image_url: https://pbs.twimg.com/profile_images/1298587542745358340/DZv3Oj-h_400x400.jpg + +ishaan: + name: Ishaan Jaffer + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg + +# Alias for typo in name +ishaan-alt: + name: Ishaan Jaff + title: CTO, LiteLLM + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg diff --git a/docs/my-website/blog/gemini_3/index.md b/docs/my-website/blog/gemini_3/index.md new file mode 100644 index 000000000000..7263acc12c9c --- /dev/null +++ b/docs/my-website/blog/gemini_3/index.md @@ -0,0 +1,983 @@ +--- +slug: gemini_3 +title: "DAY 0 Support: Gemini 3 on LiteLLM" +date: 2025-11-19T10:00:00 +authors: + - name: Sameer Kankute + title: SWE @ LiteLLM (LLM Translation) + url: https://www.linkedin.com/in/sameer-kankute/ + image_url: 
https://pbs.twimg.com/profile_images/2001352686994907136/ONgNuSk5_400x400.jpg + - name: Krrish Dholakia + title: "CEO, LiteLLM" + url: https://www.linkedin.com/in/krish-d/ + image_url: https://pbs.twimg.com/profile_images/1298587542745358340/DZv3Oj-h_400x400.jpg + - name: Ishaan Jaff + title: "CTO, LiteLLM" + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg +description: "Common questions and best practices for using gemini-3-pro-preview with LiteLLM Proxy and SDK." +tags: [gemini, day 0 support, llms] +hide_table_of_contents: false +--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +:::info + +This guide covers common questions and best practices for using `gemini-3-pro-preview` with LiteLLM Proxy and SDK. + +::: + +## Quick Start + + + + +```python +from litellm import completion +import os + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "Hello!"}], + reasoning_effort="low" +) + +print(response.choices[0].message.content) +``` + + + + +**1. Add to config.yaml:** + +```yaml +model_list: + - model_name: gemini-3-pro-preview + litellm_params: + model: gemini/gemini-3-pro-preview + api_key: os.environ/GEMINI_API_KEY +``` + +**2. Start proxy:** + +```bash +litellm --config /path/to/config.yaml +``` + +**3. Make request:** + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [{"role": "user", "content": "Hello!"}], + "reasoning_effort": "low" + }' +``` + + + + +## Supported Endpoints + +LiteLLM provides **full end-to-end support** for Gemini 3 Pro Preview on: + +- ✅ `/v1/chat/completions` - OpenAI-compatible chat completions endpoint +- ✅ `/v1/responses` - OpenAI Responses API endpoint (streaming and non-streaming) +- ✅ [`/v1/messages`](../../docs/anthropic_unified) - Anthropic-compatible messages endpoint +- ✅ `/v1/generateContent` – [Google Gemini API](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/gemini#rest) compatible endpoint (for code, see: `client.models.generate_content(...)`) + +All endpoints support: +- Streaming and non-streaming responses +- Function calling with thought signatures +- Multi-turn conversations +- All Gemini 3-specific features + +## Thought Signatures + +#### What are Thought Signatures? + +Thought signatures are encrypted representations of the model's internal reasoning process. They're essential for maintaining context across multi-turn conversations, especially with function calling. + +#### How Thought Signatures Work + +1. **Automatic Extraction**: When Gemini 3 returns a function call, LiteLLM automatically extracts the `thought_signature` from the response +2. **Storage**: Thought signatures are stored in `provider_specific_fields.thought_signature` of tool calls +3. 
**Automatic Preservation**: When you include the assistant's message in conversation history, LiteLLM automatically preserves and returns thought signatures to Gemini + +## Example: Multi-Turn Function Calling + +#### Streaming with Thought Signatures + +When using streaming mode with `stream_chunk_builder()`, thought signatures are now automatically preserved: + + + + +```python +import os +import litellm +from litellm import completion + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +MODEL = "gemini/gemini-3-pro-preview" + +messages = [ + {"role": "system", "content": "You are a helpful assistant. Use the calculate tool."}, + {"role": "user", "content": "What is 2+2?"}, +] + +tools = [{ + "type": "function", + "function": { + "name": "calculate", + "description": "Calculate a mathematical expression", + "parameters": { + "type": "object", + "properties": {"expression": {"type": "string"}}, + "required": ["expression"], + }, + }, +}] + +print("Step 1: Sending request with stream=True...") +response = completion( + model=MODEL, + messages=messages, + stream=True, + tools=tools, + reasoning_effort="low" +) + +# Collect all chunks +chunks = [] +for part in response: + chunks.append(part) + +# Reconstruct message using stream_chunk_builder +# Thought signatures are now preserved automatically! +full_response = litellm.stream_chunk_builder(chunks, messages=messages) +print(f"Full response: {full_response}") + +assistant_msg = full_response.choices[0].message + +# ✅ Thought signature is now preserved in provider_specific_fields +if assistant_msg.tool_calls and assistant_msg.tool_calls[0].provider_specific_fields: + thought_sig = assistant_msg.tool_calls[0].provider_specific_fields.get("thought_signature") + print(f"Thought signature preserved: {thought_sig is not None}") + +# Append assistant message (includes thought signatures automatically) +messages.append(assistant_msg) + +# Mock tool execution +messages.append({ + "role": "tool", + "content": "4", + "tool_call_id": assistant_msg.tool_calls[0].id +}) + +print("\nStep 2: Sending tool result back to model...") +response_2 = completion( + model=MODEL, + messages=messages, + stream=True, + tools=tools, + reasoning_effort="low" +) + +for part in response_2: + if part.choices[0].delta.content: + print(part.choices[0].delta.content, end="") +print() # New line +``` + +**Key Points:** +- ✅ `stream_chunk_builder()` now preserves `provider_specific_fields` including thought signatures +- ✅ Thought signatures are automatically included when appending `assistant_msg` to conversation history +- ✅ Multi-turn conversations work seamlessly with streaming + + + + +```python +from openai import OpenAI +import json + +client = OpenAI(api_key="sk-1234", base_url="http://localhost:4000") + +# Define tools +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + } +] + +# Step 1: Initial request +messages = [{"role": "user", "content": "What's the weather in Tokyo?"}] + +response = client.chat.completions.create( + model="gemini-3-pro-preview", + messages=messages, + tools=tools, + reasoning_effort="low" +) + +# Step 2: Append assistant message (thought signatures automatically preserved) +messages.append(response.choices[0].message) + +# Step 3: Execute tool and append result +for tool_call in response.choices[0].message.tool_calls: + if tool_call.function.name == 
"get_weather": + result = {"temperature": 30, "unit": "celsius"} + messages.append({ + "role": "tool", + "content": json.dumps(result), + "tool_call_id": tool_call.id + }) + +# Step 4: Follow-up request (thought signatures automatically included) +response2 = client.chat.completions.create( + model="gemini-3-pro-preview", + messages=messages, + tools=tools, + reasoning_effort="low" +) + +print(response2.choices[0].message.content) +``` + +**Key Points:** +- ✅ Thought signatures are automatically extracted from `response.choices[0].message.tool_calls[].provider_specific_fields.thought_signature` +- ✅ When you append `response.choices[0].message` to your conversation history, thought signatures are automatically preserved +- ✅ You don't need to manually extract or manage thought signatures + + + + +```bash +# Step 1: Initial request +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [ + {"role": "user", "content": "What'\''s the weather in Tokyo?"} + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + } + ], + "reasoning_effort": "low" + }' +``` + +**Response includes thought signature:** + +```json +{ + "choices": [{ + "message": { + "role": "assistant", + "tool_calls": [{ + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Tokyo\"}" + }, + "provider_specific_fields": { + "thought_signature": "CpcHAdHtim9+q4rstcbvQC0ic4x1/vqQlCJWgE+UZ6dTLYGHMMBkF/AxqL5UmP6SY46uYC8t4BTFiXG5zkw6EMJ..." + } + }] + } + }] +} +``` + +```bash +# Step 2: Follow-up request (include assistant message with thought signature) +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [ + {"role": "user", "content": "What'\''s the weather in Tokyo?"}, + { + "role": "assistant", + "content": null, + "tool_calls": [{ + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"Tokyo\"}" + }, + "provider_specific_fields": { + "thought_signature": "CpcHAdHtim9+q4rstcbvQC0ic4x1/vqQlCJWgE+UZ6dTLYGHMMBkF/AxqL5UmP6SY46uYC8t4BTFiXG5zkw6EMJ..." + } + }] + }, + { + "role": "tool", + "content": "{\"temperature\": 30, \"unit\": \"celsius\"}", + "tool_call_id": "call_abc123" + } + ], + "tools": [...], + "reasoning_effort": "low" + }' +``` + + + + +#### Important Notes on Thought Signatures + +1. **Automatic Handling**: LiteLLM automatically extracts and preserves thought signatures. You don't need to manually manage them. + +2. **Parallel Function Calls**: When the model makes parallel function calls, only the **first function call** has a thought signature. + +3. **Sequential Function Calls**: In multi-step function calling, each step's first function call has its own thought signature that must be preserved. + +4. **Required for Context**: Thought signatures are essential for maintaining reasoning context. Without them, the model may lose context of its previous reasoning. + +## Conversation History: Switching from Non-Gemini-3 Models + +#### Common Question: Will switching from a non-Gemini-3 model to Gemini-3 break conversation history? 
+ +**Answer: No!** LiteLLM automatically handles this by adding dummy thought signatures when needed. + +#### How It Works + +When you switch from a model that doesn't use thought signatures (e.g., `gemini-2.5-flash`) to Gemini 3, LiteLLM: + +1. **Detects missing signatures**: Identifies assistant messages with tool calls that lack thought signatures +2. **Adds dummy signature**: Automatically injects a dummy thought signature (`skip_thought_signature_validator`) for compatibility +3. **Maintains conversation flow**: Your conversation history continues to work seamlessly + +#### Example: Switching Models Mid-Conversation + + + + +```python +from openai import OpenAI + +client = OpenAI(api_key="sk-1234", base_url="http://localhost:4000") + +# Step 1: Start with gemini-2.5-flash (no thought signatures) +messages = [{"role": "user", "content": "What's the weather?"}] + +response1 = client.chat.completions.create( + model="gemini-2.5-flash", + messages=messages, + tools=[...], + reasoning_effort="low" +) + +# Append assistant message (no tool call thought signature from gemini-2.5-flash) +messages.append(response1.choices[0].message) + +# Step 2: Switch to gemini-3-pro-preview +# LiteLLM automatically adds dummy thought signature to the previous assistant message +response2 = client.chat.completions.create( + model="gemini-3-pro-preview", # 👈 Switched model + messages=messages, # 👈 Same conversation history + tools=[...], + reasoning_effort="low" +) + +# ✅ Works seamlessly! No errors, no breaking changes +print(response2.choices[0].message.content) +``` + + + + +```bash +# Step 1: Start with gemini-2.5-flash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-2.5-flash", + "messages": [{"role": "user", "content": "What'\''s the weather?"}], + "tools": [...], + "reasoning_effort": "low" + }' + +# Step 2: Switch to gemini-3-pro-preview with same conversation history +# LiteLLM automatically handles the missing thought signature +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", # 👈 Switched model + "messages": [ + {"role": "user", "content": "What'\''s the weather?"}, + { + "role": "assistant", + "tool_calls": [...] # 👈 No thought_signature from gemini-2.5-flash + } + ], + "tools": [...], + "reasoning_effort": "low" + }' +# ✅ Works! LiteLLM adds dummy signature automatically +``` + + + + +#### Dummy Signature Details + +The dummy signature used is: `base64("skip_thought_signature_validator")` + +This is the recommended approach by Google for handling conversation history from models that don't support thought signatures. 
It allows Gemini 3 to: +- Accept the conversation history without validation errors +- Continue the conversation seamlessly +- Maintain context across model switches + +## Thinking Level Parameter + +#### How `reasoning_effort` Maps to `thinking_level` + +For Gemini 3 Pro Preview, LiteLLM automatically maps `reasoning_effort` to the new `thinking_level` parameter: + +| `reasoning_effort` | `thinking_level` | Notes | +|-------------------|------------------|-------| +| `"minimal"` | `"low"` | Maps to low thinking level | +| `"low"` | `"low"` | Default for most use cases | +| `"medium"` | `"high"` | Medium not available yet, maps to high | +| `"high"` | `"high"` | Maximum reasoning depth | +| `"disable"` | `"low"` | Gemini 3 cannot fully disable thinking | +| `"none"` | `"low"` | Gemini 3 cannot fully disable thinking | + +#### Default Behavior + +If you don't specify `reasoning_effort`, LiteLLM automatically sets `thinking_level="low"` for Gemini 3 models, to avoid high costs. + +### Example Usage + + + + +```python +from litellm import completion + +# Low thinking level (faster, lower cost) +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "What's the weather?"}], + reasoning_effort="low" # Maps to thinking_level="low" +) + +# High thinking level (deeper reasoning, higher cost) +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "Solve this complex math problem step by step."}], + reasoning_effort="high" # Maps to thinking_level="high" +) +``` + + + + +```bash +# Low thinking level +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [{"role": "user", "content": "What'\''s the weather?"}], + "reasoning_effort": "low" + }' + +# High thinking level +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [{"role": "user", "content": "Solve this complex problem."}], + "reasoning_effort": "high" + }' +``` + + + + +## Important Notes + +1. **Gemini 3 Cannot Disable Thinking**: Unlike Gemini 2.5 models, Gemini 3 cannot fully disable thinking. Even when you set `reasoning_effort="none"` or `"disable"`, it maps to `thinking_level="low"`. + +2. **Temperature Recommendation**: For Gemini 3 models, LiteLLM defaults `temperature` to `1.0` and strongly recommends keeping it at this default. Setting `temperature < 1.0` can cause: + - Infinite loops + - Degraded reasoning performance + - Failure on complex tasks + +3. **Automatic Defaults**: If you don't specify `reasoning_effort`, LiteLLM automatically sets `thinking_level="low"` for optimal performance. + +## Cost Tracking: Prompt Caching & Context Window + +LiteLLM provides comprehensive cost tracking for Gemini 3 Pro Preview, including support for prompt caching and tiered pricing based on context window size. + +### Prompt Caching Cost Tracking + +Gemini 3 supports prompt caching, which allows you to cache frequently used prompt prefixes to reduce costs. 
LiteLLM automatically tracks and calculates costs for: + +- **Cache Hit Tokens**: Tokens that are read from cache (charged at a lower rate) +- **Cache Creation Tokens**: Tokens that are written to cache (one-time cost) +- **Text Tokens**: Regular prompt tokens that are processed normally + +#### How It Works + +LiteLLM extracts caching information from the `prompt_tokens_details` field in the usage object: + +```python +{ + "usage": { + "prompt_tokens": 50000, + "completion_tokens": 1000, + "total_tokens": 51000, + "prompt_tokens_details": { + "cached_tokens": 30000, # Cache hit tokens + "cache_creation_tokens": 5000, # Tokens written to cache + "text_tokens": 15000 # Regular processed tokens + } + } +} +``` + +### Context Window Tiered Pricing + +Gemini 3 Pro Preview supports up to 1M tokens of context, with tiered pricing that automatically applies when your prompt exceeds 200k tokens. + +#### Automatic Tier Detection + +LiteLLM automatically detects when your prompt exceeds the 200k token threshold and applies the appropriate tiered pricing: + +```python +from litellm import completion_cost + +# Example: Small prompt (< 200k tokens) +response_small = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "Hello!"}] +) +# Uses base pricing: $0.000002/input token, $0.000012/output token + +# Example: Large prompt (> 200k tokens) +response_large = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "..." * 250000}] # 250k tokens +) +# Automatically uses tiered pricing: $0.000004/input token, $0.000018/output token +``` + +#### Cost Breakdown + +The cost calculation includes: + +1. **Text Processing Cost**: Regular tokens processed at base or tiered rate +2. **Cache Read Cost**: Cached tokens read at discounted rate +3. **Cache Creation Cost**: One-time cost for writing tokens to cache (applies tiered rate if above 200k) +4. **Output Cost**: Generated tokens at base or tiered rate + +### Example: Viewing Cost Breakdown + +You can view the detailed cost breakdown using LiteLLM's cost tracking: + +```python +from litellm import completion, completion_cost + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "Explain prompt caching"}], + caching=True # Enable prompt caching +) + +# Get total cost +total_cost = completion_cost(completion_response=response) +print(f"Total cost: ${total_cost:.6f}") + +# Access usage details +usage = response.usage +print(f"Prompt tokens: {usage.prompt_tokens}") +print(f"Completion tokens: {usage.completion_tokens}") + +# Access caching details +if usage.prompt_tokens_details: + print(f"Cache hit tokens: {usage.prompt_tokens_details.cached_tokens}") + print(f"Cache creation tokens: {usage.prompt_tokens_details.cache_creation_tokens}") + print(f"Text tokens: {usage.prompt_tokens_details.text_tokens}") +``` + +### Cost Optimization Tips + +1. **Use Prompt Caching**: For repeated prompt prefixes, enable caching to reduce costs by up to 90% for cached portions +2. **Monitor Context Size**: Be aware that prompts above 200k tokens use tiered pricing (2x for input, 1.5x for output) +3. **Cache Management**: Cache creation tokens are charged once when writing to cache, then subsequent reads are much cheaper +4. 
**Track Usage**: Use LiteLLM's built-in cost tracking to monitor spending across different token types + +### Integration with LiteLLM Proxy + +When using LiteLLM Proxy, all cost tracking is automatically logged and available through: + +- **Usage Logs**: Detailed token and cost breakdowns in proxy logs +- **Budget Management**: Set budgets and alerts based on actual usage +- **Analytics Dashboard**: View cost trends and breakdowns by token type + +```yaml +# config.yaml +model_list: + - model_name: gemini-3-pro-preview + litellm_params: + model: gemini/gemini-3-pro-preview + api_key: os.environ/GEMINI_API_KEY + +litellm_settings: + # Enable detailed cost tracking + success_callback: ["langfuse"] # or your preferred logging service +``` + +## Using with Claude Code CLI + +You can use `gemini-3-pro-preview` with **Claude Code CLI** - Anthropic's command-line interface. This allows you to use Gemini 3 Pro Preview with Claude Code's native syntax and workflows. + +### Setup + +**1. Add Gemini 3 Pro Preview to your `config.yaml`:** + +```yaml +model_list: + - model_name: gemini-3-pro-preview + litellm_params: + model: gemini/gemini-3-pro-preview + api_key: os.environ/GEMINI_API_KEY + +litellm_settings: + master_key: os.environ/LITELLM_MASTER_KEY +``` + +**2. Set environment variables:** + +```bash +export GEMINI_API_KEY="your-gemini-api-key" +export LITELLM_MASTER_KEY="sk-1234567890" # Generate a secure key +``` + +**3. Start LiteLLM Proxy:** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +**4. Configure Claude Code to use LiteLLM Proxy:** + +```bash +export ANTHROPIC_BASE_URL="http://0.0.0.0:4000" +export ANTHROPIC_AUTH_TOKEN="$LITELLM_MASTER_KEY" +``` + +**5. Use Gemini 3 Pro Preview with Claude Code:** + +```bash +# Claude Code will use gemini-3-pro-preview from your LiteLLM proxy +claude --model gemini-3-pro-preview + +``` + +### Example Usage + +Once configured, you can interact with Gemini 3 Pro Preview using Claude Code's native interface: + +```bash +$ claude --model gemini-3-pro-preview +> Explain how thought signatures work in multi-turn conversations. + +# Gemini 3 Pro Preview responds through Claude Code interface +``` + +### Benefits + +- ✅ **Native Claude Code Experience**: Use Gemini 3 Pro Preview with Claude Code's familiar CLI interface +- ✅ **Unified Authentication**: Single API key for all models through LiteLLM proxy +- ✅ **Cost Tracking**: All usage tracked through LiteLLM's centralized logging +- ✅ **Seamless Model Switching**: Easily switch between Claude and Gemini models +- ✅ **Full Feature Support**: All Gemini 3 features (thought signatures, function calling, etc.) work through Claude Code + +### Troubleshooting + +**Claude Code not finding the model:** +- Ensure the model name in Claude Code matches exactly: `gemini-3-pro-preview` +- Verify your proxy is running: `curl http://0.0.0.0:4000/health` +- Check that `ANTHROPIC_BASE_URL` points to your LiteLLM proxy + +**Authentication errors:** +- Verify `ANTHROPIC_AUTH_TOKEN` matches your LiteLLM master key +- Ensure `GEMINI_API_KEY` is set correctly +- Check LiteLLM proxy logs for detailed error messages + +## Responses API Support + +LiteLLM fully supports the OpenAI Responses API for Gemini 3 Pro Preview, including both streaming and non-streaming modes. The Responses API provides a structured way to handle multi-turn conversations with function calling, and LiteLLM automatically preserves thought signatures throughout the conversation. 
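+
+The examples below use the plain OpenAI client. When routing through LiteLLM, point that client at your proxy first (a minimal sketch reusing the placeholder key and URL from the proxy examples earlier in this post):
+
+```python
+from openai import OpenAI
+
+# Route client.responses.create(...) calls through the LiteLLM proxy,
+# so "gemini-3-pro-preview" resolves to the model configured there.
+client = OpenAI(api_key="sk-1234", base_url="http://localhost:4000")
+```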
+ +### Example: Using Responses API with Gemini 3 + + + + +```python +from openai import OpenAI +import json + +client = OpenAI() + +# 1. Define a list of callable tools for the model +tools = [ + { + "type": "function", + "name": "get_horoscope", + "description": "Get today's horoscope for an astrological sign.", + "parameters": { + "type": "object", + "properties": { + "sign": { + "type": "string", + "description": "An astrological sign like Taurus or Aquarius", + }, + }, + "required": ["sign"], + }, + }, +] + +def get_horoscope(sign): + return f"{sign}: Next Tuesday you will befriend a baby otter." + +# Create a running input list we will add to over time +input_list = [ + {"role": "user", "content": "What is my horoscope? I am an Aquarius."} +] + +# 2. Prompt the model with tools defined +response = client.responses.create( + model="gemini-3-pro-preview", + tools=tools, + input=input_list, +) + +# Save function call outputs for subsequent requests +input_list += response.output + +for item in response.output: + if item.type == "function_call": + if item.name == "get_horoscope": + # 3. Execute the function logic for get_horoscope + horoscope = get_horoscope(json.loads(item.arguments)) + + # 4. Provide function call results to the model + input_list.append({ + "type": "function_call_output", + "call_id": item.call_id, + "output": json.dumps({ + "horoscope": horoscope + }) + }) + +print("Final input:") +print(input_list) + +response = client.responses.create( + model="gemini-3-pro-preview", + instructions="Respond only with a horoscope generated by a tool.", + tools=tools, + input=input_list, +) + +# 5. The model should be able to give a response! +print("Final output:") +print(response.model_dump_json(indent=2)) +print("\n" + response.output_text) +``` + +**Key Points:** +- ✅ Thought signatures are automatically preserved in function calls +- ✅ Works seamlessly with multi-turn conversations +- ✅ All Gemini 3-specific features are fully supported + + + + +```python +from openai import OpenAI +import json + +client = OpenAI() + +tools = [ + { + "type": "function", + "name": "get_horoscope", + "description": "Get today's horoscope for an astrological sign.", + "parameters": { + "type": "object", + "properties": { + "sign": { + "type": "string", + "description": "An astrological sign like Taurus or Aquarius", + }, + }, + "required": ["sign"], + }, + }, +] + +def get_horoscope(sign): + return f"{sign}: Next Tuesday you will befriend a baby otter." + +input_list = [ + {"role": "user", "content": "What is my horoscope? 
I am an Aquarius."} +] + +# Streaming mode +response = client.responses.create( + model="gemini-3-pro-preview", + tools=tools, + input=input_list, + stream=True, +) + +# Collect all chunks +chunks = [] +for chunk in response: + chunks.append(chunk) + # Process streaming chunks as they arrive + print(chunk) + +# Thought signatures are automatically preserved in streaming mode +``` + +**Key Points:** +- ✅ Streaming mode fully supported +- ✅ Thought signatures preserved across streaming chunks +- ✅ Real-time processing of function calls and responses + + + + +### Responses API Benefits + +- ✅ **Structured Output**: Responses API provides a clear structure for handling function calls and multi-turn conversations +- ✅ **Thought Signature Preservation**: LiteLLM automatically preserves thought signatures in both streaming and non-streaming modes +- ✅ **Seamless Integration**: Works with existing OpenAI SDK patterns +- ✅ **Full Feature Support**: All Gemini 3 features (thought signatures, function calling, reasoning) are fully supported + + +## Best Practices + +#### 1. Always Include Thought Signatures in Conversation History + +When building multi-turn conversations with function calling: + +✅ **Do:** +```python +# Append the full assistant message (includes thought signatures) +messages.append(response.choices[0].message) +``` + +❌ **Don't:** +```python +# Don't manually construct assistant messages without thought signatures +messages.append({ + "role": "assistant", + "tool_calls": [...] # Missing thought signatures! +}) +``` + +#### 2. Use Appropriate Thinking Levels + +- **`reasoning_effort="low"`**: For simple queries, quick responses, cost optimization +- **`reasoning_effort="high"`**: For complex problems requiring deep reasoning + +#### 3. Keep Temperature at Default + +For Gemini 3 models, always use `temperature=1.0` (default). Lower temperatures can cause issues. + +#### 4. Handle Model Switches Gracefully + +When switching from non-Gemini-3 to Gemini-3: +- ✅ LiteLLM automatically handles missing thought signatures +- ✅ No manual intervention needed +- ✅ Conversation history continues seamlessly + + +## Troubleshooting + +#### Issue: Missing Thought Signatures + +**Symptom**: Error when including assistant messages in conversation history + +**Solution**: Ensure you're appending the full assistant message from the response: +```python +messages.append(response.choices[0].message) # ✅ Includes thought signatures +``` + +#### Issue: Conversation Breaks When Switching Models + +**Symptom**: Errors when switching from gemini-2.5-flash to gemini-3-pro-preview + +**Solution**: This should work automatically! LiteLLM adds dummy signatures. If you see errors, ensure you're using the latest LiteLLM version. 
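+
+A quick way to confirm which version is installed (standard library only; upgrade with `pip install --upgrade litellm` if it is out of date):
+
+```python
+from importlib.metadata import version
+
+# Print the installed LiteLLM version so you can compare it against the latest release
+print(version("litellm"))
+```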
+ +#### Issue: Infinite Loops or Poor Performance + +**Symptom**: Model gets stuck or produces poor results + +**Solution**: +- Ensure `temperature=1.0` (default for Gemini 3) +- Check that `reasoning_effort` is set appropriately +- Verify you're using the correct model name: `gemini/gemini-3-pro-preview` + +## Additional Resources + +- [Gemini Provider Documentation](../gemini.md) +- [Thought Signatures Guide](../gemini.md#thought-signatures) +- [Reasoning Content Documentation](../../reasoning_content.md) +- [Function Calling Guide](../../function_calling.md) + diff --git a/docs/my-website/blog/gemini_3_flash/index.md b/docs/my-website/blog/gemini_3_flash/index.md new file mode 100644 index 000000000000..830c21e5f664 --- /dev/null +++ b/docs/my-website/blog/gemini_3_flash/index.md @@ -0,0 +1,255 @@ +--- +slug: gemini_3_flash +title: "DAY 0 Support: Gemini 3 Flash on LiteLLM" +date: 2025-12-17T10:00:00 +authors: + - name: Sameer Kankute + title: SWE @ LiteLLM (LLM Translation) + url: https://www.linkedin.com/in/sameer-kankute/ + image_url: https://pbs.twimg.com/profile_images/2001352686994907136/ONgNuSk5_400x400.jpg + - name: Krrish Dholakia + title: "CEO, LiteLLM" + url: https://www.linkedin.com/in/krish-d/ + image_url: https://pbs.twimg.com/profile_images/1298587542745358340/DZv3Oj-h_400x400.jpg + - name: Ishaan Jaff + title: "CTO, LiteLLM" + url: https://www.linkedin.com/in/reffajnaahsi/ + image_url: https://pbs.twimg.com/profile_images/1613813310264340481/lz54oEiB_400x400.jpg +description: "Guide to using Gemini 3 Flash on LiteLLM Proxy and SDK with day 0 support." +tags: [gemini, day 0 support, llms] +hide_table_of_contents: false +--- + + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Gemini 3 Flash Day 0 Support + +LiteLLM now supports `gemini-3-flash-preview` and all the new API changes along with it. + +:::note +If you only want cost tracking, you need no change in your current Litellm version. But if you want the support for new features introduced along with it like thinking levels, you will need to use v1.80.8-stable.1 or above. +::: + +## Deploy this version + + + + +``` showLineNumbers title="docker run litellm" +docker run \ +-e STORE_MODEL_IN_DB=True \ +-p 4000:4000 \ +ghcr.io/berriai/litellm:main-v1.80.8-stable.1 +``` + + + + + +``` showLineNumbers title="pip install litellm" +pip install litellm==1.80.8.post1 +``` + + + + +## What's New + +### 1. New Thinking Levels: `thinkingLevel` with MINIMAL & MEDIUM + +Gemini 3 Flash introduces granular thinking control with `thinkingLevel` instead of `thinkingBudget`. +- **MINIMAL**: Ultra-lightweight thinking for fast responses +- **MEDIUM**: Balanced thinking for complex reasoning +- **HIGH**: Maximum reasoning depth + +LiteLLM automatically maps the OpenAI `reasoning_effort` parameter to Gemini's `thinkingLevel`, so you can use familiar `reasoning_effort` values (`minimal`, `low`, `medium`, `high`) without changing your code! + +### 2. Thought Signatures + +Like `gemini-3-pro`, this model also includes thought signatures for tool calls. LiteLLM handles signature extraction and embedding internally. [Learn more about thought signatures](../gemini_3/index.md#thought-signatures). 
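+
+As a quick illustration of where the signature surfaces in the SDK, the sketch below makes a tool call and reads `provider_specific_fields.thought_signature` off each returned tool call. The `get_weather` tool is a made-up example; the field path mirrors the gemini-3-pro post linked above.
+
+```python
+from litellm import completion
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get the current weather",
+        "parameters": {
+            "type": "object",
+            "properties": {"location": {"type": "string"}},
+            "required": ["location"],
+        },
+    },
+}]
+
+response = completion(
+    model="gemini/gemini-3-flash-preview",
+    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
+    tools=tools,
+    reasoning_effort="low",
+)
+
+# Each tool call carries its signature in provider_specific_fields
+for tool_call in response.choices[0].message.tool_calls or []:
+    fields = tool_call.provider_specific_fields or {}
+    print(tool_call.function.name, "->", fields.get("thought_signature"))
+```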
+
+**Edge Case Handling**: If thought signatures are missing in the request, LiteLLM adds a dummy signature so the API call doesn't break.
+
+---
+## Supported Endpoints
+
+LiteLLM provides **full end-to-end support** for Gemini 3 Flash on:
+
+- ✅ `/v1/chat/completions` - OpenAI-compatible chat completions endpoint
+- ✅ `/v1/responses` - OpenAI Responses API endpoint (streaming and non-streaming)
+- ✅ [`/v1/messages`](../../docs/anthropic_unified) - Anthropic-compatible messages endpoint
+- ✅ `/v1/generateContent` – [Google Gemini API](../../docs/generateContent.md) compatible endpoint
+
+All endpoints support:
+- Streaming and non-streaming responses
+- Function calling with thought signatures
+- Multi-turn conversations
+- All Gemini 3-specific features
+- Conversion of provider-specific thinking params to `thinkingLevel`
+
+## Quick Start
+
+
+
+**Basic Usage with MEDIUM thinking (NEW)**
+
+```python
+from litellm import completion
+
+# No need to change your code - LiteLLM maps the OpenAI reasoning param to thinkingLevel
+response = completion(
+    model="gemini/gemini-3-flash-preview",
+    messages=[{"role": "user", "content": "Solve this complex math problem: 25 * 4 + 10"}],
+    reasoning_effort="medium",  # NEW: MEDIUM thinking level
+)
+
+print(response.choices[0].message.content)
+```
+
+
+
+
+**1. Setup config.yaml**
+
+```yaml
+model_list:
+  - model_name: gemini-3-flash
+    litellm_params:
+      model: gemini/gemini-3-flash-preview
+      api_key: os.environ/GEMINI_API_KEY
+```
+
+**2. Start proxy**
+
+```bash
+litellm --config /path/to/config.yaml
+```
+
+**3. Call with MEDIUM thinking**
+
+```bash
+curl -X POST http://localhost:4000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer " \
+  -d '{
+    "model": "gemini-3-flash",
+    "messages": [{"role": "user", "content": "Complex reasoning task"}],
+    "reasoning_effort": "medium"
+  }'
+```
+
+
+
+---
+
+## All `reasoning_effort` Levels
+
+
+
+**Ultra-fast, minimal reasoning**
+
+```python
+from litellm import completion
+
+response = completion(
+    model="gemini/gemini-3-flash-preview",
+    messages=[{"role": "user", "content": "What's 2+2?"}],
+    reasoning_effort="minimal",
+)
+```
+
+
+
+
+**Simple instruction following**
+
+```python
+response = completion(
+    model="gemini/gemini-3-flash-preview",
+    messages=[{"role": "user", "content": "Write a haiku about coding"}],
+    reasoning_effort="low",
+)
+```
+
+
+
+
+**Balanced reasoning for complex tasks** ✨
+
+```python
+response = completion(
+    model="gemini/gemini-3-flash-preview",
+    messages=[{"role": "user", "content": "Analyze this dataset and find patterns"}],
+    reasoning_effort="medium",  # NEW!
+) +``` + + + + + +**Maximum reasoning depth** + +```python +response = completion( + model="gemini/gemini-3-flash-preview", + messages=[{"role": "user", "content": "Prove this mathematical theorem"}], + reasoning_effort="high", +) +``` + + + + +--- + +## Key Features + +✅ **Thinking Levels**: MINIMAL, LOW, MEDIUM, HIGH +✅ **Thought Signatures**: Track reasoning with unique identifiers +✅ **Seamless Integration**: Works with existing OpenAI-compatible client +✅ **Backward Compatible**: Gemini 2.5 models continue using `thinkingBudget` + +--- + +## Installation + +```bash +pip install litellm --upgrade +``` + +```python +import litellm +from litellm import completion + +response = completion( + model="gemini/gemini-3-flash-preview", + messages=[{"role": "user", "content": "Your question here"}], + reasoning_effort="medium", # Use MEDIUM thinking +) +print(response) +``` + +:::note +If using this model via vertex_ai, keep the location as global as this is the only supported location as of now. +::: + + +## `reasoning_effort` Mapping for Gemini 3+ + +| reasoning_effort | thinking_level | +|------------------|----------------| +| `minimal` | `minimal` | +| `low` | `low` | +| `medium` | `medium` | +| `high` | `high` | +| `disable` | `minimal` | +| `none` | `minimal` | + diff --git a/docs/my-website/docs/a2a.md b/docs/my-website/docs/a2a.md new file mode 100644 index 000000000000..a7e8b52d99ab --- /dev/null +++ b/docs/my-website/docs/a2a.md @@ -0,0 +1,371 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import Image from '@theme/IdealImage'; + +# Agent Gateway (A2A Protocol) - Overview + +Add A2A Agents on LiteLLM AI Gateway, Invoke agents in A2A Protocol, track request/response logs in LiteLLM Logs. Manage which Teams, Keys can access which Agents onboarded. + + + +
+
+ +| Feature | Supported | +|---------|-----------| +| Supported Agent Providers | A2A, Vertex AI Agent Engine, LangGraph, Azure AI Foundry, Bedrock AgentCore, Pydantic AI | +| Logging | ✅ | +| Load Balancing | ✅ | +| Streaming | ✅ | + + +:::tip + +LiteLLM follows the [A2A (Agent-to-Agent) Protocol](https://github.com/google/A2A) for invoking agents. + +::: + +## Adding your Agent + +### Add A2A Agents + +You can add A2A-compatible agents through the LiteLLM Admin UI. + +1. Navigate to the **Agents** tab +2. Click **Add Agent** +3. Enter the agent name (e.g., `ij-local`) and the URL of your A2A agent + + + +The URL should be the invocation URL for your A2A agent (e.g., `http://localhost:10001`). + + +### Add Azure AI Foundry Agents + +Follow [this guide, to add your azure ai foundry agent to LiteLLM Agent Gateway](./providers/azure_ai_agents#litellm-a2a-gateway) + +### Add Vertex AI Agent Engine + +Follow [this guide, to add your Vertex AI Agent Engine to LiteLLM Agent Gateway](./providers/vertex_ai_agent_engine) + +### Add Bedrock AgentCore Agents + +Follow [this guide, to add your bedrock agentcore agent to LiteLLM Agent Gateway](./providers/bedrock_agentcore#litellm-a2a-gateway) + +### Add LangGraph Agents + +Follow [this guide, to add your langgraph agent to LiteLLM Agent Gateway](./providers/langgraph#litellm-a2a-gateway) + +### Add Pydantic AI Agents + +Follow [this guide, to add your pydantic ai agent to LiteLLM Agent Gateway](./providers/pydantic_ai_agent#litellm-a2a-gateway) + +## Invoking your Agents + +Use the [A2A Python SDK](https://pypi.org/project/a2a-sdk) to invoke agents through LiteLLM. + +This example shows how to: +1. **List available agents** - Query `/v1/agents` to see which agents your key can access +2. **Select an agent** - Pick an agent from the list +3. 
**Invoke via A2A** - Use the A2A protocol to send messages to the agent + +```python showLineNumbers title="invoke_a2a_agent.py" +from uuid import uuid4 +import httpx +import asyncio +from a2a.client import A2ACardResolver, A2AClient +from a2a.types import MessageSendParams, SendMessageRequest + +# === CONFIGURE THESE === +LITELLM_BASE_URL = "http://localhost:4000" # Your LiteLLM proxy URL +LITELLM_VIRTUAL_KEY = "sk-1234" # Your LiteLLM Virtual Key +# ======================= + +async def main(): + headers = {"Authorization": f"Bearer {LITELLM_VIRTUAL_KEY}"} + + async with httpx.AsyncClient(headers=headers) as client: + # Step 1: List available agents + response = await client.get(f"{LITELLM_BASE_URL}/v1/agents") + agents = response.json() + + print("Available agents:") + for agent in agents: + print(f" - {agent['agent_name']} (ID: {agent['agent_id']})") + + if not agents: + print("No agents available for this key") + return + + # Step 2: Select an agent and invoke it + selected_agent = agents[0] + agent_id = selected_agent["agent_id"] + agent_name = selected_agent["agent_name"] + print(f"\nInvoking: {agent_name}") + + # Step 3: Use A2A protocol to invoke the agent + base_url = f"{LITELLM_BASE_URL}/a2a/{agent_id}" + resolver = A2ACardResolver(httpx_client=client, base_url=base_url) + agent_card = await resolver.get_agent_card() + a2a_client = A2AClient(httpx_client=client, agent_card=agent_card) + + request = SendMessageRequest( + id=str(uuid4()), + params=MessageSendParams( + message={ + "role": "user", + "parts": [{"kind": "text", "text": "Hello, what can you do?"}], + "messageId": uuid4().hex, + } + ), + ) + response = await a2a_client.send_message(request) + print(f"Response: {response.model_dump(mode='json', exclude_none=True, indent=4)}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Streaming Responses + +For streaming responses, use `send_message_streaming`: + +```python showLineNumbers title="invoke_a2a_agent_streaming.py" +from uuid import uuid4 +import httpx +import asyncio +from a2a.client import A2ACardResolver, A2AClient +from a2a.types import MessageSendParams, SendStreamingMessageRequest + +# === CONFIGURE THESE === +LITELLM_BASE_URL = "http://localhost:4000" # Your LiteLLM proxy URL +LITELLM_VIRTUAL_KEY = "sk-1234" # Your LiteLLM Virtual Key +LITELLM_AGENT_NAME = "ij-local" # Agent name registered in LiteLLM +# ======================= + +async def main(): + base_url = f"{LITELLM_BASE_URL}/a2a/{LITELLM_AGENT_NAME}" + headers = {"Authorization": f"Bearer {LITELLM_VIRTUAL_KEY}"} + + async with httpx.AsyncClient(headers=headers) as httpx_client: + # Resolve agent card and create client + resolver = A2ACardResolver(httpx_client=httpx_client, base_url=base_url) + agent_card = await resolver.get_agent_card() + client = A2AClient(httpx_client=httpx_client, agent_card=agent_card) + + # Send a streaming message + request = SendStreamingMessageRequest( + id=str(uuid4()), + params=MessageSendParams( + message={ + "role": "user", + "parts": [{"kind": "text", "text": "Hello, what can you do?"}], + "messageId": uuid4().hex, + } + ), + ) + + # Stream the response + async for chunk in client.send_message_streaming(request): + print(chunk.model_dump(mode="json", exclude_none=True)) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Tracking Agent Logs + +After invoking an agent, you can view the request logs in the LiteLLM **Logs** tab. 
+
+The logs show:
+- **Request/Response content** sent to and received from the agent
+- **User, Key, Team** information for tracking who made the request
+- **Latency and cost** metrics
+
+
+
+
+## Forwarding LiteLLM Context Headers
+
+When LiteLLM invokes your A2A agent, it sends special headers that enable:
+- **Trace Grouping**: All LLM calls from the same agent execution appear under one trace
+- **Agent Spend Tracking**: Costs are attributed to the specific agent
+
+| Header | Purpose |
+|--------|---------|
+| `X-LiteLLM-Trace-Id` | Links all LLM calls to the same execution flow |
+| `X-LiteLLM-Agent-Id` | Attributes spend to the correct agent |
+
+
+To enable these features, your A2A server must **forward these headers** to any LLM calls it makes back to LiteLLM.
+
+### Implementation Steps
+
+**Step 1: Extract headers from incoming A2A request**
+
+```python
+def get_litellm_headers(request) -> dict:
+    """Extract X-LiteLLM-* headers from incoming A2A request."""
+    all_headers = request.call_context.state.get('headers', {})
+    return {
+        k: v for k, v in all_headers.items()
+        if k.lower().startswith('x-litellm-')
+    }
+```
+
+**Step 2: Forward headers to your LLM calls**
+
+Pass the extracted headers when making calls back to LiteLLM:
+
+
+
+```python
+from openai import OpenAI
+
+headers = get_litellm_headers(request)
+
+client = OpenAI(
+    api_key="sk-your-litellm-key",
+    base_url="http://localhost:4000",
+    default_headers=headers,  # Forward headers
+)
+
+response = client.chat.completions.create(
+    model="gpt-4o",
+    messages=[{"role": "user", "content": "Hello"}]
+)
+```
+
+
+
+
+```python
+from langchain_openai import ChatOpenAI
+
+headers = get_litellm_headers(request)
+
+llm = ChatOpenAI(
+    model="gpt-4o",
+    openai_api_key="sk-your-litellm-key",
+    base_url="http://localhost:4000",
+    default_headers=headers,  # Forward headers
+)
+```
+
+
+
+```python
+import litellm
+
+headers = get_litellm_headers(request)
+
+response = litellm.completion(
+    model="gpt-4o",
+    messages=[{"role": "user", "content": "Hello"}],
+    api_base="http://localhost:4000",
+    extra_headers=headers,  # Forward headers
+)
+```
+
+
+
+```python
+import httpx
+
+headers = get_litellm_headers(request)
+headers["Authorization"] = "Bearer sk-your-litellm-key"
+
+response = httpx.post(
+    "http://localhost:4000/v1/chat/completions",
+    headers=headers,
+    json={"model": "gpt-4o", "messages": [{"role": "user", "content": "Hello"}]}
+)
+```
+
+
+
+### Result
+
+With header forwarding enabled, you'll see:
+
+**Trace Grouping in Langfuse:**
+
+
+
+**Agent Spend Attribution:**
+
+
+
+## API Reference
+
+### Endpoint
+
+```
+POST /a2a/{agent_name}/message/send
+```
+
+### Authentication
+
+Include your LiteLLM Virtual Key in the `Authorization` header:
+
+```
+Authorization: Bearer sk-your-litellm-key
+```
+
+### Request Format
+
+LiteLLM follows the [A2A JSON-RPC 2.0 specification](https://github.com/google/A2A):
+
+```json title="Request Body"
+{
+  "jsonrpc": "2.0",
+  "id": "unique-request-id",
+  "method": "message/send",
+  "params": {
+    "message": {
+      "role": "user",
+      "parts": [{"kind": "text", "text": "Your message here"}],
+      "messageId": "unique-message-id"
+    }
+  }
+}
+```
+
+### Response Format
+
+```json title="Response"
+{
+  "jsonrpc": "2.0",
+  "id": "unique-request-id",
+  "result": {
+    "kind": "task",
+    "id": "task-id",
+    "contextId": "context-id",
+    "status": {"state": "completed", "timestamp": "2025-01-01T00:00:00Z"},
+    "artifacts": [
+      {
+        "artifactId": "artifact-id",
+        "name": "response",
+        "parts": [{"kind": "text", 
"text": "Agent response here"}] + } + ] + } +} +``` + +## Agent Registry + +Want to create a central registry so your team can discover what agents are available within your company? + +Use the [AI Hub](./proxy/ai_hub) to make agents public and discoverable across your organization. This allows developers to browse available agents without needing to rebuild them. diff --git a/docs/my-website/docs/a2a_agent_permissions.md b/docs/my-website/docs/a2a_agent_permissions.md new file mode 100644 index 000000000000..93f367f43e7f --- /dev/null +++ b/docs/my-website/docs/a2a_agent_permissions.md @@ -0,0 +1,259 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import Image from '@theme/IdealImage'; + +# Agent Permission Management + +Control which A2A agents can be accessed by specific keys or teams in LiteLLM. + +## Overview + +Agent Permission Management lets you restrict which agents a LiteLLM Virtual Key or Team can access. This is useful for: + +- **Multi-tenant environments**: Give different teams access to different agents +- **Security**: Prevent keys from invoking agents they shouldn't have access to +- **Compliance**: Enforce access policies for sensitive agent workflows + +When permissions are configured: +- `GET /v1/agents` only returns agents the key/team can access +- `POST /a2a/{agent_id}` (Invoking an agent) returns `403 Forbidden` if access is denied + +## Setting Permissions on a Key + +This example shows how to create a key with agent permissions and test access. + +### 1. Get Your Agent ID + + + + +1. Go to **Agents** in the sidebar +2. Click into the agent you want +3. Copy the **Agent ID** + + + + + + +```bash title="List all agents" showLineNumbers +curl "http://localhost:4000/v1/agents" \ + -H "Authorization: Bearer sk-master-key" +``` + +Response: +```json title="Response" showLineNumbers +{ + "agents": [ + {"agent_id": "agent-123", "name": "Support Agent"}, + {"agent_id": "agent-456", "name": "Sales Agent"} + ] +} +``` + + + + +### 2. Create a Key with Agent Permissions + + + + +1. Go to **Keys** → **Create Key** +2. Expand **Agent Settings** +3. Select the agents you want to allow + + + + + + +```bash title="Create key with agent permissions" showLineNumbers +curl -X POST "http://localhost:4000/key/generate" \ + -H "Authorization: Bearer sk-master-key" \ + -H "Content-Type: application/json" \ + -d '{ + "object_permission": { + "agents": ["agent-123"] + } + }' +``` + + + + +### 3. Test Access + +**Allowed agent (succeeds):** +```bash title="Invoke allowed agent" showLineNumbers +curl -X POST "http://localhost:4000/a2a/agent-123" \ + -H "Authorization: Bearer sk-your-new-key" \ + -H "Content-Type: application/json" \ + -d '{"message": {"role": "user", "parts": [{"type": "text", "text": "Hello"}]}}' +``` + +**Blocked agent (fails with 403):** +```bash title="Invoke blocked agent" showLineNumbers +curl -X POST "http://localhost:4000/a2a/agent-456" \ + -H "Authorization: Bearer sk-your-new-key" \ + -H "Content-Type: application/json" \ + -d '{"message": {"role": "user", "parts": [{"type": "text", "text": "Hello"}]}}' +``` + +Response: +```json title="403 Forbidden Response" showLineNumbers +{ + "error": { + "message": "Access denied to agent: agent-456", + "code": 403 + } +} +``` + +## Setting Permissions on a Team + +Restrict all keys belonging to a team to only access specific agents. + +### 1. Create a Team with Agent Permissions + + + + +1. Go to **Teams** → **Create Team** +2. Expand **Agent Settings** +3. 
Select the agents you want to allow for this team + + + + + + +```bash title="Create team with agent permissions" showLineNumbers +curl -X POST "http://localhost:4000/team/new" \ + -H "Authorization: Bearer sk-master-key" \ + -H "Content-Type: application/json" \ + -d '{ + "team_alias": "support-team", + "object_permission": { + "agents": ["agent-123"] + } + }' +``` + +Response: +```json title="Response" showLineNumbers +{ + "team_id": "team-abc-123", + "team_alias": "support-team" +} +``` + + + + +### 2. Create a Key for the Team + + + + +1. Go to **Keys** → **Create Key** +2. Select the **Team** from the dropdown + + + + + + +```bash title="Create key for team" showLineNumbers +curl -X POST "http://localhost:4000/key/generate" \ + -H "Authorization: Bearer sk-master-key" \ + -H "Content-Type: application/json" \ + -d '{ + "team_id": "team-abc-123" + }' +``` + + + + +### 3. Test Access + +The key inherits agent permissions from the team. + +**Allowed agent (succeeds):** +```bash title="Invoke allowed agent" showLineNumbers +curl -X POST "http://localhost:4000/a2a/agent-123" \ + -H "Authorization: Bearer sk-team-key" \ + -H "Content-Type: application/json" \ + -d '{"message": {"role": "user", "parts": [{"type": "text", "text": "Hello"}]}}' +``` + +**Blocked agent (fails with 403):** +```bash title="Invoke blocked agent" showLineNumbers +curl -X POST "http://localhost:4000/a2a/agent-456" \ + -H "Authorization: Bearer sk-team-key" \ + -H "Content-Type: application/json" \ + -d '{"message": {"role": "user", "parts": [{"type": "text", "text": "Hello"}]}}' +``` + +## How It Works + +```mermaid +flowchart TD + A[Request to invoke agent] --> B{LiteLLM Virtual Key has agent restrictions?} + B -->|Yes| C{LiteLLM Team has agent restrictions?} + B -->|No| D{LiteLLM Team has agent restrictions?} + + C -->|Yes| E[Use intersection of key + team permissions] + C -->|No| F[Use key permissions only] + + D -->|Yes| G[Inherit team permissions] + D -->|No| H[Allow ALL agents] + + E --> I{Agent in allowed list?} + F --> I + G --> I + H --> J[Allow request] + + I -->|Yes| J + I -->|No| K[Return 403 Forbidden] +``` + +| Key Permissions | Team Permissions | Result | Notes | +|-----------------|------------------|--------|-------| +| None | None | Key can access **all** agents | Open access by default when no restrictions are set | +| `["agent-1", "agent-2"]` | None | Key can access `agent-1` and `agent-2` | Key uses its own permissions | +| None | `["agent-1", "agent-3"]` | Key can access `agent-1` and `agent-3` | Key inherits team's permissions | +| `["agent-1", "agent-2"]` | `["agent-1", "agent-3"]` | Key can access `agent-1` only | Intersection of both lists (most restrictive wins) | + +## Viewing Permissions + + + + +1. Go to **Keys** or **Teams** +2. Click into the key/team you want to view +3. Agent permissions are displayed in the info view + + + + +```bash title="Get key info" showLineNumbers +curl "http://localhost:4000/key/info?key=sk-your-key" \ + -H "Authorization: Bearer sk-master-key" +``` + + + diff --git a/docs/my-website/docs/a2a_cost_tracking.md b/docs/my-website/docs/a2a_cost_tracking.md new file mode 100644 index 000000000000..94c8b442e7f0 --- /dev/null +++ b/docs/my-website/docs/a2a_cost_tracking.md @@ -0,0 +1,147 @@ +import Image from '@theme/IdealImage'; + +# A2A Agent Cost Tracking + +LiteLLM supports adding custom cost tracking for A2A agents. 
You can configure: + +- **Flat cost per query** - A fixed cost charged for each agent request +- **Cost by input/output tokens** - Variable cost based on token usage + +This allows you to track and attribute costs for agent usage across your organization, making it easy to see how much each team or project is spending on agent calls. + +## Quick Start + +### 1. Navigate to Agents + +From the sidebar, click on "Agents" to open the agent management page. + +![Navigate to Agents](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/f9ac0752-6936-4dda-b7ed-f536fefcc79a/ascreenshot.jpeg?tl_px=208,326&br_px=2409,1557&force_format=jpeg&q=100&width=1120.0) + +### 2. Create a New Agent + +Click "+ Add New Agent" to open the creation form. You'll need to provide a few basic details: + +- **Agent Name** - A unique identifier for your agent (used in API calls) +- **Display Name** - A human-readable name shown in the UI + +![Enter Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/f5bacfeb-67a0-4644-a400-b3d50b6b9ce5/ascreenshot.jpeg?tl_px=0,0&br_px=2617,1463&force_format=jpeg&q=100&width=1120.0) + +![Enter Display Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/6db6422b-fe85-4a8b-aa5c-39319f0d4621/ascreenshot.jpeg?tl_px=0,27&br_px=2617,1490&force_format=jpeg&q=100&width=1120.0) + +### 3. Configure Cost Settings + +Scroll down and click on "Cost Configuration" to expand the cost settings panel. This is where you define how much to charge for agent usage. + +![Click Cost Configuration](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/a3019ae8-629c-431b-b2d8-2743cc517be7/ascreenshot.jpeg?tl_px=0,653&br_px=2201,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=388,416) + +### 4. Set Cost Per Query + +Enter the cost per query amount (in dollars). For example, entering `0.05` means each request to this agent will be charged $0.05. + +![Set Cost Per Query](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/91159f8a-1f66-4555-a166-600e4bdecc68/ascreenshot.jpeg?tl_px=0,653&br_px=2201,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=372,281) + +![Enter Cost Amount](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/2add2f69-fd72-462e-9335-1e228c7150da/ascreenshot.jpeg?tl_px=0,420&br_px=2617,1884&force_format=jpeg&q=100&width=1120.0) + +### 5. Create the Agent + +Once you've configured everything, click "Create Agent" to save. Your agent is now ready to use with cost tracking enabled. + +![Create Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/1876cf29-b8a7-4662-b944-2b86a8b7cd2e/ascreenshot.jpeg?tl_px=416,653&br_px=2618,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=706,523) + +## Testing Cost Tracking + +Let's verify that cost tracking is working by sending a test request through the Playground. + +### 1. Go to Playground + +Click "Playground" in the sidebar to open the interactive testing interface. 
+ +![Go to Playground](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/7d5d8338-6393-49a5-b255-86aef5bf5dfa/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=41,98) + +### 2. Select A2A Endpoint + +By default, the Playground uses the chat completions endpoint. To test your agent, click "Endpoint Type" and select `/v1/a2a/message/send` from the dropdown. + +![Select Endpoint Type](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/4d066510-0878-4e0b-8abf-0b074fe2a560/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=325,238) + +![Select A2A Endpoint](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/fe2f8957-4e8a-4331-b177-d5093480cf60/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=333,261) + +### 3. Select Your Agent + +Now pick the agent you just created from the agent dropdown. You should see it listed by its display name. + +![Select Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/8c7add70-fe72-48cb-ba33-9f53b989fcad/ascreenshot.jpeg?tl_px=0,150&br_px=2201,1381&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=287,277) + +### 4. Send a Test Message + +Type a message and hit send. You can use the suggested prompts or write your own. + +![Send Message](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/2c16acb1-4016-447e-88e9-c4522e408ea2/ascreenshot.jpeg?tl_px=15,653&br_px=2216,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,443) + +Once the agent responds, the request is logged with the cost you configured. + +![Agent Response](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/2dcf7109-0be4-4d03-8333-ef45759c70c9/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=494,273) + +## Viewing Cost in Logs + +Now let's confirm the cost was actually tracked. + +### 1. Navigate to Logs + +Click "Logs" in the sidebar to see all recent requests. + +![Go to Logs](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/c96abf3c-f06a-4401-ada6-04b6e8040453/ascreenshot.jpeg?tl_px=0,118&br_px=2201,1349&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=41,277) + +### 2. View Cost Attribution + +Find your agent request in the list. You'll see the cost column showing the amount you configured. 
This cost is now attributed to the API key that made the request, so you can track spend per team or project. + +![View Cost in Logs](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/1ae167ec-1a43-48a3-9251-43d4cb3e57f5/ascreenshot.jpeg?tl_px=335,11&br_px=2536,1242&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,277) + +## View Spend in Usage Page + +Navigate to the Agent Usage tab in the Admin UI to view agent-level spend analytics: + +### 1. Access Agent Usage + +Go to the Usage page in the Admin UI (`PROXY_BASE_URL/ui/?login=success&page=new_usage`) and click on the **Agent Usage** tab. + + + +### 2. View Agent Analytics + +The Agent Usage dashboard provides: + +- **Total spend per agent**: View aggregated spend across all agents +- **Daily spend trends**: See how agent spend changes over time +- **Model usage breakdown**: Understand which models each agent uses +- **Activity metrics**: Track requests, tokens, and success rates per agent + + + +### 3. Filter by Agent + +Use the agent filter dropdown to view spend for specific agents: + +- Select one or more agent IDs from the dropdown +- View filtered analytics, spend logs, and activity metrics +- Compare spend across different agents + + + +## Cost Configuration Options + +You can mix and match these options depending on your pricing model: + +| Field | Description | +| ----------------------------- | ----------------------------------------- | +| **Cost Per Query ($)** | Fixed cost charged for each agent request | +| **Input Cost Per Token ($)** | Cost per input token processed | +| **Output Cost Per Token ($)** | Cost per output token generated | + +For most use cases, a flat cost per query is simplest. Use token-based pricing if your agent costs vary significantly based on input/output length. + +## Related + +- [A2A Agent Gateway](./a2a.md) +- [Spend Tracking](./proxy/cost_tracking.md) diff --git a/docs/my-website/docs/adding_provider/generic_guardrail_api.md b/docs/my-website/docs/adding_provider/generic_guardrail_api.md new file mode 100644 index 000000000000..482dedaa8a98 --- /dev/null +++ b/docs/my-website/docs/adding_provider/generic_guardrail_api.md @@ -0,0 +1,394 @@ +# [BETA] Generic Guardrail API - Integrate Without a PR + +## The Problem + +As a guardrail provider, integrating with LiteLLM traditionally requires: +- Making a PR to the LiteLLM repository +- Waiting for review and merge +- Maintaining provider-specific code in LiteLLM's codebase +- Updating the integration for changes to your API + +## The Solution + +The **Generic Guardrail API** lets you integrate with LiteLLM **instantly** by implementing a simple API endpoint. No PR required. + +### Key Benefits + +1. **No PR Needed** - Deploy and integrate immediately +2. **Universal Support** - Works across ALL LiteLLM endpoints (chat, embeddings, image generation, etc.) +3. **Simple Contract** - One endpoint, three response types +4. **Multi-Modal Support** - Handle both text and images in requests/responses +5. **Custom Parameters** - Pass provider-specific params via config +6. 
**Full Control** - You own and maintain your guardrail API + +## Supported Endpoints + +The Generic Guardrail API works with the following LiteLLM endpoints: + +- `/v1/chat/completions` - OpenAI Chat Completions +- `/v1/completions` - OpenAI Text Completions +- `/v1/responses` - OpenAI Responses API +- `/v1/images/generations` - OpenAI Image Generation +- `/v1/audio/transcriptions` - OpenAI Audio Transcriptions +- `/v1/audio/speech` - OpenAI Text-to-Speech +- `/v1/messages` - Anthropic Messages +- `/v1/rerank` - Cohere Rerank +- Pass-through endpoints + +## How It Works + +1. LiteLLM extracts text and images from any request (chat messages, embeddings, image prompts, etc.) +2. Sends extracted content + metadata to your API endpoint +3. Your API responds with: `BLOCKED`, `NONE`, or `GUARDRAIL_INTERVENED` +4. LiteLLM enforces the decision and applies any modifications + +## API Contract + +### Endpoint + +Implement `POST /beta/litellm_basic_guardrail_api` + +### Request Format + +```json +{ + "texts": ["extracted text from the request"], // array of text strings + "images": ["base64_encoded_image_data"], // optional array of images + "tools": [ // tool calls sent to the LLM (in the OpenAI Chat Completions spec) + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + } + ], + "tool_calls": [ // tool calls received from the LLM (in the OpenAI Chat Completions spec) + { + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"San Francisco\"}" + } + } + ], + "structured_messages": [ // optional, full messages in OpenAI format (for chat endpoints) + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"} + ], + "request_data": { + "user_api_key_hash": "hash of the litellm virtual key used", + "user_api_key_alias": "alias of the litellm virtual key used", + "user_api_key_user_id": "user id associated with the litellm virtual key used", + "user_api_key_user_email": "user email associated with the litellm virtual key used", + "user_api_key_team_id": "team id associated with the litellm virtual key used", + "user_api_key_team_alias": "team alias associated with the litellm virtual key used", + "user_api_key_end_user_id": "end user id associated with the litellm virtual key used", + "user_api_key_org_id": "org id associated with the litellm virtual key used" + }, + "input_type": "request", // "request" or "response" + "litellm_call_id": "unique_call_id", // the call id of the individual LLM call + "litellm_trace_id": "trace_id", // the trace id of the LLM call - useful if there are multiple LLM calls for the same conversation + "additional_provider_specific_params": { + // your custom params from config + } +} +``` + +### Response Format + +```json +{ + "action": "BLOCKED" | "NONE" | "GUARDRAIL_INTERVENED", + "blocked_reason": "why content was blocked", // required if action=BLOCKED + "texts": ["modified text"], // optional array of modified text strings + "images": ["modified_base64_image"] // optional array of modified images +} +``` + +**Actions:** +- `BLOCKED` - LiteLLM raises error and blocks request +- `NONE` - Request proceeds unchanged +- `GUARDRAIL_INTERVENED` - Request proceeds with modified texts/images (provide `texts` and/or `images` fields) + +## Parameters + +### `tools` Parameter + +The `tools` parameter provides information 
about available function/tool definitions in the request. + +**Format:** OpenAI `ChatCompletionToolParam` format (see [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-tools)) + +**Example:** +```json +{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location"] + } + } +} +``` + +**Availability:** +- **Input only:** Tools are only passed for `input_type="request"` (pre-call guardrails). Output/response guardrails do not currently receive tool definitions. +- **Supported endpoints:** The `tools` parameter is supported on: `/v1/chat/completions`, `/v1/responses`, and `/v1/messages`. Other endpoints do not have tool support. + +**Use cases:** +- Enforce tool permission policies (e.g., only allow certain users/teams to access specific tools) +- Validate tool schemas before sending to LLM +- Log tool usage for audit purposes +- Block sensitive tools based on user context + +### `tool_calls` Parameter + +The `tool_calls` parameter contains actual function/tool invocations being made in the request or response. + +**Format:** OpenAI `ChatCompletionMessageToolCall` format (see [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/object#chat/object-tool_calls)) + +**Example:** +```json +{ + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"location\": \"San Francisco\", \"unit\": \"celsius\"}" + } +} +``` + +**Key Difference from `tools`:** +- **`tools`** = Tool definitions/schemas (what tools are *available*) +- **`tool_calls`** = Tool invocations/executions (what tools are *being called* with what arguments) + +**Availability:** +- **Both input and output:** Tool calls can be present in both `input_type="request"` (assistant messages requesting tool calls) and `input_type="response"` (LLM responses with tool calls). +- **Supported endpoints:** The `tool_calls` parameter is supported on: `/v1/chat/completions`, `/v1/responses`, and `/v1/messages`. + +**Use cases:** +- Validate tool call arguments before execution +- Redact sensitive data from tool call arguments (e.g., PII) +- Log tool invocations for audit/debugging +- Block tool calls with dangerous parameters +- Modify tool call arguments (e.g., enforce constraints, sanitize inputs) +- Monitor tool usage patterns across users/teams + +### `structured_messages` Parameter + +The `structured_messages` parameter provides the full input in OpenAI chat completion spec format, useful for distinguishing between system and user messages. 
+ +**Format:** Array of OpenAI chat completion messages (see [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create#chat-create-messages)) + +**Example:** +```json +[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"} +] +``` + +**Availability:** +- **Supported endpoints:** `/v1/chat/completions`, `/v1/messages`, `/v1/responses` +- **Input only:** Only passed for `input_type="request"` (pre-call guardrails) + +**Use cases:** +- Apply different policies for system vs user messages +- Enforce role-based content restrictions +- Log structured conversation context + +## LiteLLM Configuration + +Add to `config.yaml`: + +```yaml +litellm_settings: + guardrails: + - guardrail_name: "my-guardrail" + litellm_params: + guardrail: generic_guardrail_api + mode: pre_call # or post_call, during_call + api_base: https://your-guardrail-api.com + api_key: os.environ/YOUR_GUARDRAIL_API_KEY # optional + additional_provider_specific_params: + # your custom parameters + threshold: 0.8 + language: "en" +``` + +### Example: Pillar Security + +[Pillar Security](https://pillar.security) uses the Generic Guardrail API to provide comprehensive AI security scanning including prompt injection protection, PII/PCI detection, secret detection, and content moderation. + +```yaml +guardrails: + - guardrail_name: "pillar-security" + litellm_params: + guardrail: generic_guardrail_api + mode: [pre_call, post_call] + api_base: https://api.pillar.security/api/v1/integrations/litellm + api_key: os.environ/PILLAR_API_KEY + default_on: true + additional_provider_specific_params: + plr_mask: true # Enable automatic masking of sensitive data + plr_evidence: true # Include detection evidence in response + plr_scanners: true # Include scanner details in response +``` + +See the [Pillar Security documentation](../proxy/guardrails/pillar_security.md) for full configuration options. + +## Usage + +Users apply your guardrail by name: + +```python +response = client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "hello"}], + guardrails=["my-guardrail"] +) +``` + +Or with dynamic parameters: + +```python +response = client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "hello"}], + guardrails=[{ + "my-guardrail": { + "extra_body": { + "custom_threshold": 0.9 + } + } + }] +) +``` + +## Implementation Example + +See [mock_bedrock_guardrail_server.py](https://github.com/BerriAI/litellm/blob/main/cookbook/mock_guardrail_server/mock_bedrock_guardrail_server.py) for a complete reference implementation. 
+ +**Minimal FastAPI example:** + +```python +from fastapi import FastAPI +from pydantic import BaseModel +from typing import List, Optional, Dict, Any + +app = FastAPI() + +class GuardrailRequest(BaseModel): + texts: List[str] + images: Optional[List[str]] = None + tools: Optional[List[Dict[str, Any]]] = None # OpenAI ChatCompletionToolParam format (tool definitions) + tool_calls: Optional[List[Dict[str, Any]]] = None # OpenAI ChatCompletionMessageToolCall format (tool invocations) + structured_messages: Optional[List[Dict[str, Any]]] = None # OpenAI messages format (for chat endpoints) + request_data: Dict[str, Any] + input_type: str # "request" or "response" + litellm_call_id: Optional[str] = None + litellm_trace_id: Optional[str] = None + additional_provider_specific_params: Dict[str, Any] + +class GuardrailResponse(BaseModel): + action: str # BLOCKED, NONE, or GUARDRAIL_INTERVENED + blocked_reason: Optional[str] = None + texts: Optional[List[str]] = None + images: Optional[List[str]] = None + +@app.post("/beta/litellm_basic_guardrail_api") +async def apply_guardrail(request: GuardrailRequest): + # Your guardrail logic here + + # Example: Check text content + for text in request.texts: + if "badword" in text.lower(): + return GuardrailResponse( + action="BLOCKED", + blocked_reason="Content contains prohibited terms" + ) + + # Example: Check tool definitions (if present in request) + if request.tools: + for tool in request.tools: + if tool.get("type") == "function": + function_name = tool.get("function", {}).get("name", "") + # Block sensitive tool definitions + if function_name in ["delete_data", "access_admin_panel"]: + return GuardrailResponse( + action="BLOCKED", + blocked_reason=f"Tool '{function_name}' is not allowed" + ) + + # Example: Check tool calls (if present in request or response) + if request.tool_calls: + for tool_call in request.tool_calls: + if tool_call.get("type") == "function": + function_name = tool_call.get("function", {}).get("name", "") + arguments_str = tool_call.get("function", {}).get("arguments", "{}") + + # Parse arguments and validate + import json + try: + arguments = json.loads(arguments_str) + # Block dangerous arguments + if "file_path" in arguments and ".." in str(arguments["file_path"]): + return GuardrailResponse( + action="BLOCKED", + blocked_reason="Tool call contains path traversal attempt" + ) + except json.JSONDecodeError: + pass + + # Example: Check structured messages (if present in request) + if request.structured_messages: + for message in request.structured_messages: + if message.get("role") == "system": + # Apply stricter policies to system messages + if "admin" in message.get("content", "").lower(): + return GuardrailResponse( + action="BLOCKED", + blocked_reason="System message contains restricted terms" + ) + + return GuardrailResponse(action="NONE") +``` + +## When to Use This + +✅ **Use Generic Guardrail API when:** +- You want instant integration without waiting for PRs +- You maintain your own guardrail service +- You need full control over updates and features +- You want to support all LiteLLM endpoints automatically + +❌ **Make a PR when:** +- You want deeper integration with LiteLLM internals +- Your guardrail requires complex LiteLLM-specific logic +- You want to be featured as a built-in provider + +## Questions? + +This is a **beta API**. We're actively improving it based on feedback. Open an issue or PR if you need additional capabilities. 
+ diff --git a/docs/my-website/docs/adding_provider/simple_guardrail_tutorial.md b/docs/my-website/docs/adding_provider/simple_guardrail_tutorial.md new file mode 100644 index 000000000000..9c654cd1560d --- /dev/null +++ b/docs/my-website/docs/adding_provider/simple_guardrail_tutorial.md @@ -0,0 +1,134 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Adding a New Guardrail Integration + +You're going to create a class that checks text before it goes to the LLM or after it comes back. If it violates your rules, you block it. + +## How It Works + +Request with guardrail: + +```bash +curl --location 'http://localhost:4000/chat/completions' \ +--header 'Authorization: Bearer sk-1234' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "gpt-4", + "messages": [{"role": "user", "content": "How do I hack a system?"}], + "guardrails": ["my-guardrail"] +}' +``` + +Your guardrail checks input, then output. If something's wrong, raise an exception. + +## Build Your Guardrail + +### Create Your Directory + +```bash +mkdir -p litellm/proxy/guardrails/guardrail_hooks/my_guardrail +cd litellm/proxy/guardrails/guardrail_hooks/my_guardrail +``` + +Two files: `my_guardrail.py` (main class) and `__init__.py` (initialization). + +### Write the Main Class + +`my_guardrail.py`: + +Follow from [Custom Guardrail](../proxy/guardrails/custom_guardrail#custom-guardrail) tutorial. + +### Create the Init File + +`__init__.py`: + +```python +from typing import TYPE_CHECKING + +from litellm.types.guardrails import SupportedGuardrailIntegrations + +from .my_guardrail import MyGuardrail + +if TYPE_CHECKING: + from litellm.types.guardrails import Guardrail, LitellmParams + + +def initialize_guardrail(litellm_params: "LitellmParams", guardrail: "Guardrail"): + import litellm + + _my_guardrail_callback = MyGuardrail( + api_base=litellm_params.api_base, + api_key=litellm_params.api_key, + guardrail_name=guardrail.get("guardrail_name", ""), + event_hook=litellm_params.mode, + default_on=litellm_params.default_on, + ) + + litellm.logging_callback_manager.add_litellm_callback(_my_guardrail_callback) + return _my_guardrail_callback + + +guardrail_initializer_registry = { + SupportedGuardrailIntegrations.MY_GUARDRAIL.value: initialize_guardrail, +} + +guardrail_class_registry = { + SupportedGuardrailIntegrations.MY_GUARDRAIL.value: MyGuardrail, +} +``` + +### Register Your Guardrail Type + +Add to `litellm/types/guardrails.py`: + +```python +class SupportedGuardrailIntegrations(str, Enum): + LAKERA = "lakera_prompt_injection" + APORIA = "aporia" + BEDROCK = "bedrock_guardrails" + PRESIDIO = "presidio" + ZSCALER_AI_GUARD = "zscaler_ai_guard" + MY_GUARDRAIL = "my_guardrail" +``` + +## Usage + +### Config File + +```yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: gpt-4 + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + guardrails: + - guardrail_name: my_guardrail + litellm_params: + guardrail: my_guardrail + mode: during_call + api_key: os.environ/MY_GUARDRAIL_API_KEY + api_base: https://api.myguardrail.com +``` + +### Per-Request + +```bash +curl --location 'http://localhost:4000/chat/completions' \ +--header 'Authorization: Bearer sk-1234' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "gpt-4", + "messages": [{"role": "user", "content": "Test message"}], + "guardrails": ["my_guardrail"] +}' +``` + +## Testing + +Add unit tests inside `test_litellm/` folder. 
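+
+A minimal test sketch (file name and assertion are illustrative), assuming the `MyGuardrail` class and the constructor arguments shown in the `__init__.py` above:
+
+```python
+# test_litellm/test_my_guardrail.py
+from litellm.proxy.guardrails.guardrail_hooks.my_guardrail import MyGuardrail
+
+
+def test_my_guardrail_initialization():
+    # Construct the guardrail with the same params initialize_guardrail() passes in
+    guardrail = MyGuardrail(
+        api_base="https://api.myguardrail.com",
+        api_key="fake-key",
+        guardrail_name="my_guardrail",
+        event_hook="during_call",
+        default_on=True,
+    )
+    assert guardrail.guardrail_name == "my_guardrail"
+```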
+ + + diff --git a/docs/my-website/docs/anthropic_count_tokens.md b/docs/my-website/docs/anthropic_count_tokens.md new file mode 100644 index 000000000000..963172fec4e4 --- /dev/null +++ b/docs/my-website/docs/anthropic_count_tokens.md @@ -0,0 +1,232 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# /v1/messages/count_tokens + +## Overview + +Anthropic-compatible token counting endpoint. Count tokens for messages before sending them to the model. + +| Feature | Supported | Notes | +|---------|-----------|-------| +| Cost Tracking | ❌ | Token counting only, no cost incurred | +| Logging | ✅ | Works across all integrations | +| End-user Tracking | ✅ | | +| Supported Providers | Anthropic, Vertex AI (Claude), Bedrock (Claude), Gemini, Vertex AI | Auto-routes to provider-specific token counting APIs | + +## Quick Start + +### 1. Start LiteLLM Proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +### 2. Count Tokens + + + + +```bash +curl -X POST "http://localhost:4000/v1/messages/count_tokens" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "Hello, how are you?"} + ] + }' +``` + + + + +```python +import httpx + +response = httpx.post( + "http://localhost:4000/v1/messages/count_tokens", + headers={ + "Content-Type": "application/json", + "Authorization": "Bearer sk-1234" + }, + json={ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "Hello, how are you?"} + ] + } +) + +print(response.json()) +# {"input_tokens": 14} +``` + + + + +**Expected Response:** + +```json +{ + "input_tokens": 14 +} +``` + +## LiteLLM Proxy Configuration + +Add models to your `config.yaml`: + +```yaml +model_list: + - model_name: claude-3-5-sonnet + litellm_params: + model: anthropic/claude-3-5-sonnet-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + + - model_name: claude-vertex + litellm_params: + model: vertex_ai/claude-3-5-sonnet-v2@20241022 + vertex_project: my-project + vertex_location: us-east5 + vertex_count_tokens_location: us-east5 # Optional: Override location for token counting (count_tokens not available on global location) + + - model_name: claude-bedrock + litellm_params: + model: bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0 + aws_region_name: us-west-2 +``` + +## Request Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | ✅ | The model to use for token counting | +| `messages` | array | ✅ | Array of messages in Anthropic format | + +### Messages Format + +```json +{ + "messages": [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi there!"}, + {"role": "user", "content": "How are you?"} + ] +} +``` + +## Response Format + +```json +{ + "input_tokens": +} +``` + +| Field | Type | Description | +|-------|------|-------------| +| `input_tokens` | integer | Number of tokens in the input messages | + +## Supported Providers + +The `/v1/messages/count_tokens` endpoint automatically routes to the appropriate provider-specific token counting API: + +| Provider | Token Counting Method | +|----------|----------------------| +| Anthropic | [Anthropic Token Counting API](https://docs.anthropic.com/en/docs/build-with-claude/token-counting) | +| Vertex AI (Claude) | Vertex AI Partner Models Token Counter | +| Bedrock (Claude) | AWS Bedrock CountTokens API | +| Gemini | 
Google AI Studio countTokens API | +| Vertex AI (Gemini) | Vertex AI countTokens API | + +## Examples + +### Count Tokens with System Message + +```bash +curl -X POST "http://localhost:4000/v1/messages/count_tokens" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "You are a helpful assistant. Please help me write a haiku about programming."} + ] + }' +``` + +### Count Tokens for Multi-turn Conversation + +```bash +curl -X POST "http://localhost:4000/v1/messages/count_tokens" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."}, + {"role": "user", "content": "What is its population?"} + ] + }' +``` + +### Using with Vertex AI Claude + +```bash +curl -X POST "http://localhost:4000/v1/messages/count_tokens" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "claude-vertex", + "messages": [ + {"role": "user", "content": "Hello, world!"} + ] + }' +``` + +### Using with Bedrock Claude + +```bash +curl -X POST "http://localhost:4000/v1/messages/count_tokens" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "claude-bedrock", + "messages": [ + {"role": "user", "content": "Hello, world!"} + ] + }' +``` + +## Comparison with Anthropic Passthrough + +LiteLLM provides two ways to count tokens: + +| Endpoint | Description | Use Case | +|----------|-------------|----------| +| `/v1/messages/count_tokens` | LiteLLM's Anthropic-compatible endpoint | Works with all supported providers (Anthropic, Vertex AI, Bedrock, etc.) | +| `/anthropic/v1/messages/count_tokens` | [Pass-through to Anthropic API](./pass_through/anthropic_completion.md#example-2-token-counting-api) | Direct Anthropic API access with native headers | + +### Pass-through Example + +For direct Anthropic API access with full native headers: + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages/count_tokens \ + --header "x-api-key: $LITELLM_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: token-counting-2024-11-01" \ + --header "content-type: application/json" \ + --data '{ + "model": "claude-3-5-sonnet-20241022", + "messages": [ + {"role": "user", "content": "Hello, world"} + ] + }' +``` diff --git a/docs/my-website/docs/anthropic_unified.md b/docs/my-website/docs/anthropic_unified/index.md similarity index 100% rename from docs/my-website/docs/anthropic_unified.md rename to docs/my-website/docs/anthropic_unified/index.md diff --git a/docs/my-website/docs/anthropic_unified/structured_output.md b/docs/my-website/docs/anthropic_unified/structured_output.md new file mode 100644 index 000000000000..2a06cf827854 --- /dev/null +++ b/docs/my-website/docs/anthropic_unified/structured_output.md @@ -0,0 +1,294 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Structured Output /v1/messages + +Use LiteLLM to call Anthropic's structured output feature via the `/v1/messages` endpoint. 
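+
+For example, you can forward `output_format` from a Python client pointed at your LiteLLM proxy. A minimal sketch, not the only way to call it: the proxy URL and virtual key are placeholders, `claude-sonnet` is the model alias from the config below, and `extra_body` is used to pass the field through the Anthropic SDK:
+
+```python
+import anthropic
+
+# Point the Anthropic SDK at the LiteLLM proxy (URL and key are placeholders)
+client = anthropic.Anthropic(
+    base_url="http://localhost:4000",
+    api_key="sk-1234",  # your LiteLLM virtual key
+)
+
+message = client.messages.create(
+    model="claude-sonnet",  # model alias from your LiteLLM config
+    max_tokens=1024,
+    messages=[
+        {"role": "user", "content": "John Smith (john@example.com) wants the Enterprise plan."}
+    ],
+    extra_body={
+        "output_format": {
+            "type": "json_schema",
+            "schema": {
+                "type": "object",
+                "properties": {
+                    "name": {"type": "string"},
+                    "email": {"type": "string"},
+                    "plan_interest": {"type": "string"},
+                },
+                "required": ["name", "email", "plan_interest"],
+                "additionalProperties": False,
+            },
+        }
+    },
+)
+
+print(message.content[0].text)  # JSON string matching the schema
+```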
+ +## Supported Providers + +| Provider | Supported | Notes | +|----------|-----------|-------| +| Anthropic | ✅ | Native support | +| Azure AI (Anthropic models) | ✅ | Claude models on Azure AI | +| Bedrock (Converse Anthropic models) | ✅ | Claude models via Bedrock Converse API | +| Bedrock (Invoke Anthropic models) | ✅ | Claude models via Bedrock Invoke API | + +## Usage + +### LiteLLM Proxy Server + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-sonnet + litellm_params: + model: anthropic/claude-sonnet-4-5-20250514 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://localhost:4000/v1/messages \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -d '{ + "model": "claude-sonnet", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Extract the key information from this email: John Smith (john@example.com) is interested in our Enterprise plan and wants to schedule a demo for next Tuesday at 2pm." + } + ], + "output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "email": {"type": "string"}, + "plan_interest": {"type": "string"}, + "demo_requested": {"type": "boolean"} + }, + "required": ["name", "email", "plan_interest", "demo_requested"], + "additionalProperties": false + } + } + }' +``` + + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: azure-claude-sonnet + litellm_params: + model: azure_ai/claude-sonnet-4-5-20250514 + api_key: os.environ/AZURE_AI_API_KEY + api_base: https://your-endpoint.inference.ai.azure.com +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://localhost:4000/v1/messages \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -d '{ + "model": "azure-claude-sonnet", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Extract the key information from this email: John Smith (john@example.com) is interested in our Enterprise plan and wants to schedule a demo for next Tuesday at 2pm." + } + ], + "output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "email": {"type": "string"}, + "plan_interest": {"type": "string"}, + "demo_requested": {"type": "boolean"} + }, + "required": ["name", "email", "plan_interest", "demo_requested"], + "additionalProperties": false + } + } + }' +``` + + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: bedrock-claude-sonnet + litellm_params: + model: bedrock/global.anthropic.claude-sonnet-4-5-20250929-v1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl http://localhost:4000/v1/messages \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -d '{ + "model": "bedrock-claude-sonnet", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Extract the key information from this email: John Smith (john@example.com) is interested in our Enterprise plan and wants to schedule a demo for next Tuesday at 2pm." + } + ], + "output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "email": {"type": "string"}, + "plan_interest": {"type": "string"}, + "demo_requested": {"type": "boolean"} + }, + "required": ["name", "email", "plan_interest", "demo_requested"], + "additionalProperties": false + } + } + }' +``` + + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: bedrock-claude-invoke + litellm_params: + model: bedrock/invoke/global.anthropic.claude-sonnet-4-5-20250929-v1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://localhost:4000/v1/messages \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "anthropic-version: 2023-06-01" \ + -d '{ + "model": "bedrock-claude-invoke", + "max_tokens": 1024, + "messages": [ + { + "role": "user", + "content": "Extract the key information from this email: John Smith (john@example.com) is interested in our Enterprise plan and wants to schedule a demo for next Tuesday at 2pm." + } + ], + "output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "email": {"type": "string"}, + "plan_interest": {"type": "string"}, + "demo_requested": {"type": "boolean"} + }, + "required": ["name", "email", "plan_interest", "demo_requested"], + "additionalProperties": false + } + } + }' +``` + + + + + +## Example Response + +```json +{ + "id": "msg_01XFDUDYJgAACzvnptvVoYEL", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "text", + "text": "{\"name\":\"John Smith\",\"email\":\"john@example.com\",\"plan_interest\":\"Enterprise\",\"demo_requested\":true}" + } + ], + "model": "claude-sonnet-4-5-20250514", + "stop_reason": "end_turn", + "stop_sequence": null, + "usage": { + "input_tokens": 75, + "output_tokens": 28 + } +} +``` + +## Request Format + +### output_format + +The `output_format` parameter specifies the structured output format. 
+ +```json +{ + "output_format": { + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "field_name": {"type": "string"}, + "another_field": {"type": "integer"} + }, + "required": ["field_name", "another_field"], + "additionalProperties": false + } + } +} +``` + +#### Fields + +- **type** (string): Must be `"json_schema"` +- **schema** (object): A JSON Schema object defining the expected output structure + - **type** (string): The root type, typically `"object"` + - **properties** (object): Defines the fields and their types + - **required** (array): List of required field names + - **additionalProperties** (boolean): Set to `false` to enforce strict schema adherence diff --git a/docs/my-website/docs/assistants.md b/docs/my-website/docs/assistants.md index d262b492a70d..2960d0fded88 100644 --- a/docs/my-website/docs/assistants.md +++ b/docs/my-website/docs/assistants.md @@ -3,6 +3,14 @@ import TabItem from '@theme/TabItem'; # /assistants +:::warning Deprecation Notice + +OpenAI has deprecated the Assistants API. It will shut down on **August 26, 2026**. + +Consider migrating to the [Responses API](/docs/response_api) instead. See [OpenAI's migration guide](https://platform.openai.com/docs/guides/responses-vs-assistants) for details. + +::: + Covers Threads, Messages, Assistants. LiteLLM currently covers: diff --git a/docs/my-website/docs/audio_transcription.md b/docs/my-website/docs/audio_transcription.md index fd55cc66e927..5853b5c1872b 100644 --- a/docs/my-website/docs/audio_transcription.md +++ b/docs/my-website/docs/audio_transcription.md @@ -13,7 +13,7 @@ import TabItem from '@theme/TabItem'; | Fallbacks | ✅ | Works between supported models | | Loadbalancing | ✅ | Works between supported models | | Guardrails | ✅ | Applies to output transcribed text (non-streaming only) | -| Supported Providers | `openai`, `azure`, `vertex_ai`, `gemini`, `deepgram`, `groq`, `fireworks_ai` | | +| Supported Providers | `openai`, `azure`, `vertex_ai`, `gemini`, `deepgram`, `groq`, `fireworks_ai`, `ovhcloud` | | ## Quick Start @@ -126,6 +126,7 @@ transcript = client.audio.transcriptions.create( - [Fireworks AI](./providers/fireworks_ai.md#audio-transcription) - [Groq](./providers/groq.md#speech-to-text---whisper) - [Deepgram](./providers/deepgram.md) +- [OVHcloud AI Endpoints](./providers/ovhcloud.md) --- diff --git a/docs/my-website/docs/batches.md b/docs/my-website/docs/batches.md index 1bd4c700ae7f..9c21d8525f3e 100644 --- a/docs/my-website/docs/batches.md +++ b/docs/my-website/docs/batches.md @@ -7,7 +7,7 @@ Covers Batches, Files | Feature | Supported | Notes | |-------|-------|-------| -| Supported Providers | OpenAI, Azure, Vertex, Bedrock | - | +| Supported Providers | OpenAI, Azure, Vertex, Bedrock, vLLM | - | | ✨ Cost Tracking | ✅ | LiteLLM Enterprise only | | Logging | ✅ | Works across all logging integrations | @@ -174,11 +174,263 @@ print("list_batches_response=", list_batches_response) +## Multi-Account / Model-Based Routing + +Route batch operations to different provider accounts using model-specific credentials from your `config.yaml`. This eliminates the need for environment variables and enables multi-tenant batch processing. + +### How It Works + +**Priority Order:** +1. **Encoded Batch/File ID** (highest) - Model info embedded in the ID +2. **Model Parameter** - Via header (`x-litellm-model`), query param, or request body +3. 
**Custom Provider** (fallback) - Uses environment variables + +### Configuration + +```yaml +model_list: + - model_name: gpt-4o-account-1 + litellm_params: + model: openai/gpt-4o + api_key: sk-account-1-key + api_base: https://api.openai.com/v1 + + - model_name: gpt-4o-account-2 + litellm_params: + model: openai/gpt-4o + api_key: sk-account-2-key + api_base: https://api.openai.com/v1 + + - model_name: azure-batches + litellm_params: + model: azure/gpt-4 + api_key: azure-key-123 + api_base: https://my-resource.openai.azure.com + api_version: "2024-02-01" +``` + +### Usage Examples + +#### Scenario 1: Encoded File ID with Model + +When you upload a file with a model parameter, LiteLLM encodes the model information in the file ID. All subsequent operations automatically use those credentials. + +```bash +# Step 1: Upload file with model +curl http://localhost:4000/v1/files \ + -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-model: gpt-4o-account-1" \ + -F purpose="batch" \ + -F file="@batch.jsonl" + +# Response includes encoded file ID: +# { +# "id": "file-bGl0ZWxsbTpmaWxlLUxkaUwzaVYxNGZRVlpYcU5KVEdkSjk7bW9kZWwsZ3B0LTRvLWFjY291bnQtMQ", +# ... +# } + +# Step 2: Create batch - automatically routes to gpt-4o-account-1 +curl http://localhost:4000/v1/batches \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-bGl0ZWxsbTpmaWxlLUxkaUwzaVYxNGZRVlpYcU5KVEdkSjk7bW9kZWwsZ3B0LTRvLWFjY291bnQtMQ", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' + +# Batch ID is also encoded with model: +# { +# "id": "batch_bGl0ZWxsbTpiYXRjaF82OTIwM2IzNjg0MDQ4MTkwYTA3ODQ5NDY3YTFjMDJkYTttb2RlbCxncHQtNG8tYWNjb3VudC0x", +# "input_file_id": "file-bGl0ZWxsbTpmaWxlLUxkaUwzaVYxNGZRVlpYcU5KVEdkSjk7bW9kZWwsZ3B0LTRvLWFjY291bnQtMQ", +# ... +# } + +# Step 3: Retrieve batch - automatically routes to gpt-4o-account-1 +curl http://localhost:4000/v1/batches/batch_bGl0ZWxsbTpiYXRjaF82OTIwM2IzNjg0MDQ4MTkwYTA3ODQ5NDY3YTFjMDJkYTttb2RlbCxncHQtNG8tYWNjb3VudC0x \ + -H "Authorization: Bearer sk-1234" +``` + +**✅ Benefits:** +- No need to specify model on every request +- File and batch IDs "remember" which account created them +- Automatic routing for retrieve, cancel, and file content operations + +#### Scenario 2: Model via Header/Query Parameter + +Specify the model for each request without encoding it in the ID. + +```bash +# Create batch with model header +curl http://localhost:4000/v1/batches \ + -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-model: gpt-4o-account-2" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-abc123", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' + +# Or use query parameter +curl "http://localhost:4000/v1/batches?model=gpt-4o-account-2" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-abc123", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' + +# List batches for specific model +curl "http://localhost:4000/v1/batches?model=gpt-4o-account-2" \ + -H "Authorization: Bearer sk-1234" +``` + +**✅ Use Case:** +- One-off batch operations +- Different models for different operations +- Explicit control over routing + +#### Scenario 3: Environment Variables (Fallback) + +Traditional approach using environment variables when no model is specified. 
+ +```bash +export OPENAI_API_KEY="sk-env-key" + +curl http://localhost:4000/v1/batches \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-abc123", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' +``` + +**✅ Use Case:** +- Backward compatibility +- Simple single-account setups +- Quick prototyping + +### Complete Multi-Account Example + +```bash +# Upload file to Account 1 +FILE_1=$(curl -s http://localhost:4000/v1/files \ + -H "x-litellm-model: gpt-4o-account-1" \ + -F purpose="batch" \ + -F file="@batch1.jsonl" | jq -r '.id') + +# Upload file to Account 2 +FILE_2=$(curl -s http://localhost:4000/v1/files \ + -H "x-litellm-model: gpt-4o-account-2" \ + -F purpose="batch" \ + -F file="@batch2.jsonl" | jq -r '.id') + +# Create batch on Account 1 (auto-routed via encoded file ID) +BATCH_1=$(curl -s http://localhost:4000/v1/batches \ + -d "{\"input_file_id\": \"$FILE_1\", \"endpoint\": \"/v1/chat/completions\", \"completion_window\": \"24h\"}" | jq -r '.id') + +# Create batch on Account 2 (auto-routed via encoded file ID) +BATCH_2=$(curl -s http://localhost:4000/v1/batches \ + -d "{\"input_file_id\": \"$FILE_2\", \"endpoint\": \"/v1/chat/completions\", \"completion_window\": \"24h\"}" | jq -r '.id') + +# Retrieve both batches (auto-routed to correct accounts) +curl http://localhost:4000/v1/batches/$BATCH_1 +curl http://localhost:4000/v1/batches/$BATCH_2 + +# List batches per account +curl "http://localhost:4000/v1/batches?model=gpt-4o-account-1" +curl "http://localhost:4000/v1/batches?model=gpt-4o-account-2" +``` + +### SDK Usage with Model Routing + +```python +import litellm +import asyncio + +# Upload file with model routing +file_obj = await litellm.acreate_file( + file=open("batch.jsonl", "rb"), + purpose="batch", + model="gpt-4o-account-1", # Route to specific account +) + +print(f"File ID: {file_obj.id}") +# File ID is encoded with model info + +# Create batch - automatically uses gpt-4o-account-1 credentials +batch = await litellm.acreate_batch( + completion_window="24h", + endpoint="/v1/chat/completions", + input_file_id=file_obj.id, # Model info embedded in ID +) + +print(f"Batch ID: {batch.id}") +# Batch ID is also encoded + +# Retrieve batch - automatically routes to correct account +retrieved = await litellm.aretrieve_batch( + batch_id=batch.id, # Model info embedded in ID +) + +print(f"Batch status: {retrieved.status}") + +# Or explicitly specify model +batch2 = await litellm.acreate_batch( + completion_window="24h", + endpoint="/v1/chat/completions", + input_file_id="file-regular-id", + model="gpt-4o-account-2", # Explicit routing +) +``` + +### How ID Encoding Works + +LiteLLM encodes model information into file and batch IDs using base64: + +``` +Original: file-abc123 +Encoded: file-bGl0ZWxsbTpmaWxlLWFiYzEyMzttb2RlbCxncHQtNG8tdGVzdA + └─┬─┘ └──────────────────┬──────────────────────┘ + prefix base64(litellm:file-abc123;model,gpt-4o-test) + +Original: batch_xyz789 +Encoded: batch_bGl0ZWxsbTpiYXRjaF94eXo3ODk7bW9kZWwsZ3B0LTRvLXRlc3Q + └──┬──┘ └──────────────────┬──────────────────────┘ + prefix base64(litellm:batch_xyz789;model,gpt-4o-test) +``` + +The encoding: +- ✅ Preserves OpenAI-compatible prefixes (`file-`, `batch_`) +- ✅ Is transparent to clients +- ✅ Enables automatic routing without additional parameters +- ✅ Works across all batch and file endpoints + +### Supported Endpoints + +All batch and file endpoints support model-based routing: + +| Endpoint | Method | Model Routing | 
+|----------|--------|---------------| +| `/v1/files` | POST | ✅ Via header/query/body | +| `/v1/files/{file_id}` | GET | ✅ Auto from encoded ID + header/query | +| `/v1/files/{file_id}/content` | GET | ✅ Auto from encoded ID + header/query | +| `/v1/files/{file_id}` | DELETE | ✅ Auto from encoded ID | +| `/v1/batches` | POST | ✅ Auto from file ID + header/query/body | +| `/v1/batches` | GET | ✅ Via header/query | +| `/v1/batches/{batch_id}` | GET | ✅ Auto from encoded ID | +| `/v1/batches/{batch_id}/cancel` | POST | ✅ Auto from encoded ID | + ## **Supported Providers**: ### [Azure OpenAI](./providers/azure#azure-batches-api) ### [OpenAI](#quick-start) ### [Vertex AI](./providers/vertex#batch-apis) ### [Bedrock](./providers/bedrock_batches) +### [vLLM](./providers/vllm_batches) ## How Cost Tracking for Batches API Works diff --git a/docs/my-website/docs/benchmarks.md b/docs/my-website/docs/benchmarks.md index f60fa4fcd14b..a1489081b4c9 100644 --- a/docs/my-website/docs/benchmarks.md +++ b/docs/my-website/docs/benchmarks.md @@ -48,6 +48,28 @@ In these tests the baseline latency characteristics are measured against a fake- - High-percentile latencies drop significantly: P95 630 ms → 150 ms, P99 1,200 ms → 240 ms. - Setting workers equal to CPU count gives optimal performance. +## `/realtime` API Benchmarks + +End-to-end latency benchmarks for the `/realtime` endpoint tested against a fake realtime endpoint. + +### Performance Metrics + +| Metric | Value | +| --------------- | ---------- | +| Median latency | 59 ms | +| p95 latency | 67 ms | +| p99 latency | 99 ms | +| Average latency | 63 ms | +| RPS | 1,207 | + +### Test Setup + +| Category | Specification | +|----------|---------------| +| **Load Testing** | Locust: 1,000 concurrent users, 500 ramp-up | +| **System** | 4 vCPUs, 8 GB RAM, 4 workers, 4 instances | +| **Database** | PostgreSQL (Redis unused) | + ## Machine Spec used for testing Each machine deploying LiteLLM had the following specs: @@ -60,6 +82,58 @@ Each machine deploying LiteLLM had the following specs: - Database: PostgreSQL - Redis: Not used +## Infrastructure Recommendations + +Recommended specifications based on benchmark results and industry standards for API gateway deployments. + +### PostgreSQL + +Required for authentication, key management, and usage tracking. + +| Workload | CPU | RAM | Storage | Connections | +|----------|-----|-----|---------|-------------| +| 1-2K RPS | 4-8 cores | 16GB | 200GB SSD (3000+ IOPS) | 100-200 | +| 2-5K RPS | 8 cores | 16-32GB | 500GB SSD (5000+ IOPS) | 200-500 | +| 5K+ RPS | 16+ cores | 32-64GB | 1TB+ SSD (10000+ IOPS) | 500+ | + +**Configuration:** Set `proxy_batch_write_at: 60` to batch writes and reduce DB load. Total connections = pool limit × instances. + +### Redis (Recommended) + +Redis was not used in these benchmarks but provides significant production benefits: 60-80% reduced DB load. + +| Workload | CPU | RAM | +|----------|-----|-----| +| 1-2K RPS | 2-4 cores | 8GB | +| 2-5K RPS | 4 cores | 16GB | +| 5K+ RPS | 8+ cores | 32GB+ | + +**Requirements:** Redis 7.0+, AOF persistence enabled, `allkeys-lru` eviction policy. 
+ +**Configuration:** +```yaml +router_settings: + redis_host: os.environ/REDIS_HOST + redis_port: os.environ/REDIS_PORT + redis_password: os.environ/REDIS_PASSWORD + +litellm_settings: + cache: True + cache_params: + type: redis + host: os.environ/REDIS_HOST + port: os.environ/REDIS_PORT + password: os.environ/REDIS_PASSWORD +``` + +:::tip +Use `redis_host`, `redis_port`, and `redis_password` instead of `redis_url` for ~80 RPS better performance. +::: + +**Scaling:** DB connections scale linearly with instances. Consider PostgreSQL read replicas beyond 5K RPS. + +See [Production Configuration](./proxy/prod) for detailed best practices. + ## Locust Settings - 1000 Users @@ -122,10 +196,57 @@ class MyUser(HttpUser): ``` +## LiteLLM vs Portkey Performance Comparison + +**Test Configuration**: 4 CPUs, 8 GB RAM per instance | Load: 1k concurrent users, 500 ramp-up +**Versions:** Portkey **v1.14.0** | LiteLLM **v1.79.1-stable** +**Test Duration:** 5 minutes + +### Multi-Instance (4×) Performance + +| Metric | Portkey (no DB) | LiteLLM (with DB) | Comment | +| ------------------- | --------------- | ----------------- | -------------- | +| **Total Requests** | 293,796 | 312,405 | LiteLLM higher | +| **Failed Requests** | 0 | 0 | Same | +| **Median Latency** | 100 ms | 100 ms | Same | +| **p95 Latency** | 230 ms | 150 ms | LiteLLM lower | +| **p99 Latency** | 500 ms | 240 ms | LiteLLM lower | +| **Average Latency** | 123 ms | 111 ms | LiteLLM lower | +| **Current RPS** | 1,170.9 | 1,170 | Same | + + +*Lower is better for latency metrics; higher is better for requests and RPS.* + +### Technical Insights + +**Portkey** + +**Pros** + +* Low memory footprint +* Stable latency with minimal spikes + +**Cons** + +* CPU utilization capped around ~40%, indicating underutilization of available compute resources +* Experienced three I/O timeout outages + +**LiteLLM** + +**Pros** + +* Fully utilizes available CPU capacity +* Strong connection handling and low latency after initial warm-up spikes + +**Cons** + +* High memory usage during initialization and per request + + ## Logging Callbacks -### [GCS Bucket Logging](https://docs.litellm.ai/docs/proxy/bucket) +### [GCS Bucket Logging](https://docs.litellm.ai/docs/observability/gcs_bucket_integration) Using GCS Bucket has **no impact on latency, RPS compared to Basic Litellm Proxy** diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md index 0548c331f805..37fb8bc360a2 100644 --- a/docs/my-website/docs/caching/all_caches.md +++ b/docs/my-website/docs/caching/all_caches.md @@ -105,6 +105,14 @@ Then simply initialize: litellm.cache = Cache(type="redis") ``` +:::info +Use `REDIS_*` environment variables as the primary mechanism for configuring all Redis client library parameters. This approach automatically maps environment variables to Redis client kwargs and is the suggested way to toggle Redis settings. +::: + +:::warning +If you need to pass non-string Redis parameters (integers, booleans, complex objects), avoid `REDIS_*` environment variables as they may fail during Redis client initialization. Instead, pass them directly as kwargs to the `Cache()` constructor. 
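+
+For example, a minimal sketch, assuming (as described above) that extra kwargs like `ssl` are forwarded to the underlying Redis client:
+
+```python
+import litellm
+from litellm.caching.caching import Cache
+
+# Non-string params (ints, booleans) passed directly as kwargs
+litellm.cache = Cache(
+    type="redis",
+    host="localhost",
+    port=6379,   # int, not the string "6379"
+    ssl=True,    # bool
+)
+```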
+::: + diff --git a/docs/my-website/docs/completion/drop_params.md b/docs/my-website/docs/completion/drop_params.md index 590d9a459554..cc32d3bbd320 100644 --- a/docs/my-website/docs/completion/drop_params.md +++ b/docs/my-website/docs/completion/drop_params.md @@ -5,6 +5,14 @@ import TabItem from '@theme/TabItem'; Drop unsupported OpenAI params by your LLM Provider. +## Default Behavior + +**By default, LiteLLM raises an exception** if you send a parameter to a model that doesn't support it. + +For example, if you send `temperature=0.2` to a model that doesn't support the `temperature` parameter, LiteLLM will raise an exception. + +**When `drop_params=True` is set**, LiteLLM will drop the unsupported parameter instead of raising an exception. This allows your code to work seamlessly across different providers without having to customize parameters for each one. + ## Quick Start ```python @@ -109,6 +117,56 @@ response = litellm.completion( **additional_drop_params**: List or null - Is a list of openai params you want to drop when making a call to the model. +### Nested Field Removal + +Drop nested fields within complex objects using JSONPath-like notation: + + + + +```python +import litellm + +response = litellm.completion( + model="bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0", + messages=[{"role": "user", "content": "Hello"}], + tools=[{ + "name": "search", + "description": "Search files", + "input_schema": {"type": "object", "properties": {"query": {"type": "string"}}}, + "input_examples": [{"query": "test"}] # Will be removed + }], + additional_drop_params=["tools[*].input_examples"] # Remove from all tools +) +``` + + + + +```yaml +model_list: + - model_name: my-bedrock-model + litellm_params: + model: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0 + additional_drop_params: ["tools[*].input_examples"] # Remove from all tools +``` + + + + +**Supported syntax:** +- `field` - Top-level field +- `parent.child` - Nested object field +- `array[*]` - All array elements +- `array[0]` - Specific array index +- `tools[*].input_examples` - Field in all array elements +- `tools[0].metadata.field` - Specific index + nested field + +**Example use cases:** +- Remove `input_examples` from tool definitions (Claude Code + AWS Bedrock) +- Drop provider-specific fields from nested structures +- Clean up nested parameters before sending to LLM + ## Specify allowed openai params in a request Tell litellm to allow specific openai params in a request. Use this if you get a `litellm.UnsupportedParamsError` and want to allow a param. LiteLLM will pass the param as is to the model. diff --git a/docs/my-website/docs/completion/image_generation_chat.md b/docs/my-website/docs/completion/image_generation_chat.md index 58ae70e2fffc..83488ac7ce8e 100644 --- a/docs/my-website/docs/completion/image_generation_chat.md +++ b/docs/my-website/docs/completion/image_generation_chat.md @@ -15,16 +15,22 @@ Supported Providers: - Google AI Studio (`gemini`) - Vertex AI (`vertex_ai/`) -LiteLLM will standardize the `image` response in the assistant message for models that support image generation during chat completions. +LiteLLM will standardize the `images` response in the assistant message for models that support image generation during chat completions. ```python title="Example response from litellm" "message": { ... 
"content": "Here's the image you requested:", - "image": { - "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...", - "detail": "auto" - } + "images": [ + { + "image_url": { + "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...", + "detail": "auto" + }, + "index": 0, + "type": "image_url" + } + ] } ``` @@ -47,7 +53,7 @@ response = completion( ) print(response.choices[0].message.content) # Text response -print(response.choices[0].message.image) # Image data +print(response.choices[0].message.images) # List of image objects ``` @@ -103,10 +109,16 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "message": { "content": "Here's the image you requested:", "role": "assistant", - "image": { - "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...", - "detail": "auto" - } + "images": [ + { + "image_url": { + "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...", + "detail": "auto" + }, + "index": 0, + "type": "image_url" + } + ] } } ], @@ -141,8 +153,8 @@ response = completion( ) for chunk in response: - if hasattr(chunk.choices[0].delta, "image") and chunk.choices[0].delta.image is not None: - print("Generated image:", chunk.choices[0].delta.image["url"]) + if hasattr(chunk.choices[0].delta, "images") and chunk.choices[0].delta.images is not None: + print("Generated image:", chunk.choices[0].delta.images[0]["image_url"]["url"]) break ``` @@ -175,7 +187,7 @@ data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1723323084 data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1723323084,"model":"gemini/gemini-2.5-flash-image-preview","choices":[{"index":0,"delta":{"content":"Here's the image you requested:"},"finish_reason":null}]} -data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1723323084,"model":"gemini/gemini-2.5-flash-image-preview","choices":[{"index":0,"delta":{"image":{"url":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","detail":"auto"}},"finish_reason":null}]} +data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1723323084,"model":"gemini/gemini-2.5-flash-image-preview","choices":[{"index":0,"delta":{"images":[{"image_url":{"url":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","detail":"auto"},"index":0,"type":"image_url"}]},"finish_reason":null}]} data: {"id":"chatcmpl-123","object":"chat.completion.chunk","created":1723323084,"model":"gemini/gemini-2.5-flash-image-preview","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]} @@ -200,8 +212,8 @@ async def generate_image(): ) print(response.choices[0].message.content) # Text response - print(response.choices[0].message.image) # Image data - + print(response.choices[0].message.images) # List of image objects + return response # Run the async function @@ -212,21 +224,31 @@ asyncio.run(generate_image()) | Provider | Model | |----------|--------| -| Google AI Studio | `gemini/gemini-2.5-flash-image-preview` | -| Vertex AI | `vertex_ai/gemini-2.5-flash-image-preview` | +| Google AI Studio | `gemini/gemini-2.0-flash-preview-image-generation`, `gemini/gemini-2.5-flash-image-preview`, `gemini/gemini-3-pro-image-preview` | +| Vertex AI | `vertex_ai/gemini-2.0-flash-preview-image-generation`, `vertex_ai/gemini-2.5-flash-image-preview`, `vertex_ai/gemini-3-pro-image-preview` | -## Spec +## Spec -The `image` field in the response follows this structure: +The `images` field in the response follows this structure: ```python -"image": { - "url": "data:image/png;base64,", - "detail": "auto" -} +"images": [ + { + "image_url": { + "url": 
"data:image/png;base64,", + "detail": "auto" + }, + "index": 0, + "type": "image_url" + } +] ``` -- `url` - str: Base64 encoded image data in data URI format -- `detail` - str: Image detail level (always "auto" for generated images) +- `images` - List[ImageURLListItem]: Array of generated images + - `image_url` - ImageURLObject: Container for image data + - `url` - str: Base64 encoded image data in data URI format + - `detail` - str: Image detail level (always "auto" for generated images) + - `index` - int: Index of the image in the response + - `type` - str: Type identifier (always "image_url") -The image is returned as a base64-encoded data URI that can be directly used in HTML `` tags or saved to a file. +The images are returned as base64-encoded data URIs that can be directly used in HTML `` tags or saved to files. diff --git a/docs/my-website/docs/completion/input.md b/docs/my-website/docs/completion/input.md index bdbd0b049296..cc0589352215 100644 --- a/docs/my-website/docs/completion/input.md +++ b/docs/my-website/docs/completion/input.md @@ -142,7 +142,47 @@ def completion( - `tool_call_id`: *str (optional)* - Tool call that this message is responding to. -[**See All Message Values**](https://github.com/BerriAI/litellm/blob/8600ec77042dacad324d3879a2bd918fc6a719fa/litellm/types/llms/openai.py#L392) +[**See All Message Values**](https://github.com/BerriAI/litellm/blob/main/litellm/types/llms/openai.py#L664) + +#### Content Types + +`content` can be a string (text only) or a list of content blocks (multimodal): + +| Type | Description | Docs | +|------|-------------|------| +| `text` | Text content | [Type Definition](https://github.com/BerriAI/litellm/blob/main/litellm/types/llms/openai.py#L598) | +| `image_url` | Images | [Vision](./vision.md) | +| `input_audio` | Audio input | [Audio](./audio.md) | +| `video_url` | Video input | [Type Definition](https://github.com/BerriAI/litellm/blob/main/litellm/types/llms/openai.py#L625) | +| `file` | Files | [Document Understanding](./document_understanding.md) | +| `document` | Documents/PDFs | [Document Understanding](./document_understanding.md) | + +**Examples:** +```python +# Text +messages=[{"role": "user", "content": [{"type": "text", "text": "Hello!"}]}] + +# Image +messages=[{"role": "user", "content": [{"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}]}] + +# Audio +messages=[{"role": "user", "content": [{"type": "input_audio", "input_audio": {"data": "", "format": "wav"}}]}] + +# Video +messages=[{"role": "user", "content": [{"type": "video_url", "video_url": {"url": "https://example.com/video.mp4"}}]}] + +# File +messages=[{"role": "user", "content": [{"type": "file", "file": {"file_id": "https://example.com/doc.pdf"}}]}] + +# Document +messages=[{"role": "user", "content": [{"type": "document", "source": {"type": "text", "media_type": "application/pdf", "data": ""}}]}] + +# Combining multiple types (multimodal) +messages=[{"role": "user", "content": [ + {"type": "text", "text": "Generate a product description based on this image"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} +]}] +``` ## Optional Fields @@ -159,6 +199,8 @@ def completion( - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value. 
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens. + + **Note**: OpenAI supports a maximum of 4 stop sequences. If you provide more than 4, LiteLLM will automatically truncate the list to the first 4 elements. To disable this automatic truncation, set `litellm.disable_stop_sequence_limit = True`. - `max_completion_tokens`: *integer (optional)* - An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens. @@ -174,11 +216,11 @@ def completion( - `seed`: *integer or null (optional)* - This feature is in Beta. If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed, and you should refer to the `system_fingerprint` response parameter to monitor changes in the backend. -- `tools`: *array (optional)* - A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of functions the model may generate JSON inputs for. +- `tools`: *array (optional)* - A list of tools the model may call. Use this to provide a list of functions the model may generate JSON inputs for. - - `type`: *string* - The type of the tool. Currently, only function is supported. + - `type`: *string* - The type of the tool. You can set this to `"function"` or `"mcp"` (matching the `/responses` schema) to call LiteLLM-registered MCP servers directly from `/chat/completions`. - - `function`: *object* - Required. + - `function`: *object* - Required for function tools. - `tool_choice`: *string or object (optional)* - Controls which (if any) function is called by the model. none means the model will not call a function and instead generates a message. auto means the model can pick between generating a message or calling a function. Specifying a particular function via `{"type": "function", "function": {"name": "my_function"}}` forces the model to call that function. @@ -247,4 +289,3 @@ def completion( - `eos_token`: *string (optional)* - Initial string applied at the end of a sequence - `hf_model_name`: *string (optional)* - [Sagemaker Only] The corresponding huggingface name of the model, used to pull the right chat template for the model. - diff --git a/docs/my-website/docs/completion/json_mode.md b/docs/my-website/docs/completion/json_mode.md index c86a1e59893f..14477f991536 100644 --- a/docs/my-website/docs/completion/json_mode.md +++ b/docs/my-website/docs/completion/json_mode.md @@ -126,6 +126,8 @@ resp = completion( ) print("Received={}".format(resp)) + +events_list = EventsList.model_validate_json(resp.choices[0].message.content) ``` @@ -339,4 +341,90 @@ curl http://0.0.0.0:4000/v1/chat/completions \ ``` - \ No newline at end of file + + +## Gemini - Native JSON Schema Format (Gemini 2.0+) + +Gemini 2.0+ models automatically use the native `responseJsonSchema` parameter, which provides better compatibility with standard JSON Schema format. 
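+
+If you prefer not to hand-write the schema, you can also pass a Pydantic model directly as `response_format`. A minimal sketch (assuming LiteLLM converts the model via `model_json_schema()`, as noted in the benefits below):
+
+```python
+from pydantic import BaseModel
+from litellm import completion
+
+class UserInfo(BaseModel):
+    name: str
+    age: int
+
+response = completion(
+    model="gemini/gemini-2.0-flash",
+    messages=[{"role": "user", "content": "Extract: John is 25 years old"}],
+    response_format=UserInfo,  # Pydantic model passed directly
+)
+
+user = UserInfo.model_validate_json(response.choices[0].message.content)
+```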
+ +### Benefits (Gemini 2.0+): +- Standard JSON Schema format (lowercase types like `string`, `object`) +- Supports `additionalProperties: false` for stricter validation +- Better compatibility with Pydantic's `model_json_schema()` +- No `propertyOrdering` required + +### Usage + + + + +```python +from litellm import completion +from pydantic import BaseModel + +class UserInfo(BaseModel): + name: str + age: int + +response = completion( + model="gemini/gemini-2.0-flash", + messages=[{"role": "user", "content": "Extract: John is 25 years old"}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "user_info", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"], + "additionalProperties": False # Supported on Gemini 2.0+ + } + } + } +) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "gemini-2.0-flash", + "messages": [ + {"role": "user", "content": "Extract: John is 25 years old"} + ], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "user_info", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"], + "additionalProperties": false + } + } + } + }' +``` + + + + +### Model Behavior + +| Model | Format Used | `additionalProperties` Support | +|-------|-------------|-------------------------------| +| Gemini 2.0+ | `responseJsonSchema` (JSON Schema) | ✅ Yes | +| Gemini 1.5 | `responseSchema` (OpenAPI) | ❌ No | + +LiteLLM automatically selects the appropriate format based on the model version. \ No newline at end of file diff --git a/docs/my-website/docs/completion/knowledgebase.md b/docs/my-website/docs/completion/knowledgebase.md index 3040f7f1cc0b..7dc3132ad771 100644 --- a/docs/my-website/docs/completion/knowledgebase.md +++ b/docs/my-website/docs/completion/knowledgebase.md @@ -18,8 +18,11 @@ LiteLLM integrates with vector stores, allowing your models to access your organ ## Supported Vector Stores - [Bedrock Knowledge Bases](https://aws.amazon.com/bedrock/knowledge-bases/) - [OpenAI Vector Stores](https://platform.openai.com/docs/api-reference/vector-stores/search) -- [Azure Vector Stores](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python#vector-stores) (Cannot be directly queried. Only available for calling in Assistants messages. We will be adding Azure AI Search Vector Store API support soon.) +- [Azure Vector Stores](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python#vector-stores) (Cannot be directly queried. Only available for calling in Assistants messages.) 
+- [Azure AI Search](/docs/providers/azure_ai_vector_stores) (Vector search with Azure AI Search indexes) - [Vertex AI RAG API](https://cloud.google.com/vertex-ai/generative-ai/docs/rag-overview) +- [Gemini File Search](https://ai.google.dev/gemini-api/docs/file-search) +- [RAGFlow Datasets](/docs/providers/ragflow_vector_store.md) (Dataset management only, search not supported) ## Quick Start diff --git a/docs/my-website/docs/completion/prompt_caching.md b/docs/my-website/docs/completion/prompt_caching.md index c8adf4bcccf6..630c9e58d24e 100644 --- a/docs/my-website/docs/completion/prompt_caching.md +++ b/docs/my-website/docs/completion/prompt_caching.md @@ -27,7 +27,7 @@ For the supported providers, LiteLLM follows the OpenAI prompt caching usage obj } ``` -- `prompt_tokens`: These are the non-cached prompt tokens (same as Anthropic, equivalent to Deepseek `prompt_cache_miss_tokens`). +- `prompt_tokens`: These are all prompt tokens including cache-miss and cache-hit input tokens. - `completion_tokens`: These are the output tokens generated by the model. - `total_tokens`: Sum of prompt_tokens + completion_tokens. - `prompt_tokens_details`: Object containing cached_tokens. diff --git a/docs/my-website/docs/completion/token_usage.md b/docs/my-website/docs/completion/token_usage.md index 0bec6b3f9020..d99564765a15 100644 --- a/docs/my-website/docs/completion/token_usage.md +++ b/docs/my-website/docs/completion/token_usage.md @@ -100,7 +100,7 @@ from litellm import cost_per_token prompt_tokens = 5 completion_tokens = 10 -prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model="gpt-3.5-turbo", prompt_tokens=prompt_tokens, completion_tokens=completion_tokens)) +prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar = cost_per_token(model="gpt-3.5-turbo", prompt_tokens=prompt_tokens, completion_tokens=completion_tokens) print(prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar) ``` @@ -162,7 +162,7 @@ print(model_cost) # {'gpt-3.5-turbo': {'max_tokens': 4000, 'input_cost_per_token **Dictionary** ```python -from litellm import register_model +import litellm litellm.register_model({ "gpt-4": { diff --git a/docs/my-website/docs/completion/vision.md b/docs/my-website/docs/completion/vision.md index 76700084868e..90d6b2393fbc 100644 --- a/docs/my-website/docs/completion/vision.md +++ b/docs/my-website/docs/completion/vision.md @@ -31,7 +31,7 @@ response = completion( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -92,7 +92,7 @@ response = client.chat.completions.create( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -230,7 +230,7 @@ response = completion( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png", "format": "image/jpeg" } } @@ -292,7 
+292,7 @@ response = client.chat.completions.create( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png", "format": "image/jpeg" } } diff --git a/docs/my-website/docs/completion/web_search.md b/docs/my-website/docs/completion/web_search.md index b0d8fcdf4c09..db50c7b5bc5c 100644 --- a/docs/my-website/docs/completion/web_search.md +++ b/docs/my-website/docs/completion/web_search.md @@ -371,6 +371,22 @@ model_list: web_search_options: {} # Enables web search with default settings ``` +### Advanced +You can configure LiteLLM's router to optionally drop models that do not support WebSearch, for example +```yaml + - model_name: gpt-4.1 + litellm_params: + model: openai/gpt-4.1 + - model_name: gpt-4.1 + litellm_params: + model: azure/gpt-4.1 + api_base: "x.openai.azure.com/" + api_version: 2025-03-01-preview + model_info: + supports_web_search: False <---- KEY CHANGE! +``` +In this example, LiteLLM will still route LLM requests to both deployments, but for WebSearch, will solely route to OpenAI. + diff --git a/docs/my-website/docs/contact.md b/docs/my-website/docs/contact.md index 947ec86991c8..b0aa9c6ce6aa 100644 --- a/docs/my-website/docs/contact.md +++ b/docs/my-website/docs/contact.md @@ -2,6 +2,6 @@ [![](https://dcbadge.vercel.app/api/server/wuPM9dRgDw)](https://discord.gg/wuPM9dRgDw) -* [Community Slack 💭](https://join.slack.com/share/enQtOTE0ODczMzk2Nzk4NC01YjUxNjY2YjBlYTFmNDRiZTM3NDFiYTM3MzVkODFiMDVjOGRjMmNmZTZkZTMzOWQzZGQyZWIwYjQ0MWExYmE3) +* [Community Slack 💭](https://www.litellm.ai/support) * [Meet with us 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) * Contact us at ishaan@berri.ai / krrish@berri.ai diff --git a/docs/my-website/docs/container_files.md b/docs/my-website/docs/container_files.md new file mode 100644 index 000000000000..1ef7687ea776 --- /dev/null +++ b/docs/my-website/docs/container_files.md @@ -0,0 +1,384 @@ +--- +id: container_files +title: /containers/files +--- + +# Container Files API + +Manage files within Code Interpreter containers. Files are created automatically when code interpreter generates outputs (charts, CSVs, images, etc.). + +:::tip +Looking for how to use Code Interpreter? See the [Code Interpreter Guide](/docs/guides/code_interpreter). +::: + +| Feature | Supported | +|---------|-----------| +| Cost Tracking | ✅ | +| Logging | ✅ | +| Supported Providers | `openai` | + +## Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/v1/containers/{container_id}/files` | POST | Upload file to container | +| `/v1/containers/{container_id}/files` | GET | List files in container | +| `/v1/containers/{container_id}/files/{file_id}` | GET | Get file metadata | +| `/v1/containers/{container_id}/files/{file_id}/content` | GET | Download file content | +| `/v1/containers/{container_id}/files/{file_id}` | DELETE | Delete file | + +## LiteLLM Python SDK + +### Upload Container File + +Upload files directly to a container session. This is useful when `/chat/completions` or `/responses` sends files to the container but the input file type is limited to PDF. This endpoint lets you work with other file types like CSV, Excel, Python scripts, etc. 
+ +```python showLineNumbers title="upload_container_file.py" +from litellm import upload_container_file + +# Upload a CSV file +file = upload_container_file( + container_id="cntr_123...", + file=("data.csv", open("data.csv", "rb").read(), "text/csv"), + custom_llm_provider="openai" +) + +print(f"Uploaded: {file.id}") +print(f"Path: {file.path}") +``` + +**Async:** + +```python showLineNumbers title="aupload_container_file.py" +from litellm import aupload_container_file + +file = await aupload_container_file( + container_id="cntr_123...", + file=("script.py", b"print('hello world')", "text/x-python"), + custom_llm_provider="openai" +) +``` + +**Supported file formats:** +- CSV (`.csv`) +- Excel (`.xlsx`) +- Python scripts (`.py`) +- JSON (`.json`) +- Markdown (`.md`) +- Text files (`.txt`) +- And more... + +### List Container Files + +```python showLineNumbers title="list_container_files.py" +from litellm import list_container_files + +files = list_container_files( + container_id="cntr_123...", + custom_llm_provider="openai" +) + +for file in files.data: + print(f" - {file.id}: {file.filename}") +``` + +**Async:** + +```python showLineNumbers title="alist_container_files.py" +from litellm import alist_container_files + +files = await alist_container_files( + container_id="cntr_123...", + custom_llm_provider="openai" +) +``` + +### Retrieve Container File + +```python showLineNumbers title="retrieve_container_file.py" +from litellm import retrieve_container_file + +file = retrieve_container_file( + container_id="cntr_123...", + file_id="cfile_456...", + custom_llm_provider="openai" +) + +print(f"File: {file.filename}") +print(f"Size: {file.bytes} bytes") +``` + +### Download File Content + +```python showLineNumbers title="retrieve_container_file_content.py" +from litellm import retrieve_container_file_content + +content = retrieve_container_file_content( + container_id="cntr_123...", + file_id="cfile_456...", + custom_llm_provider="openai" +) + +# content is raw bytes +with open("output.png", "wb") as f: + f.write(content) +``` + +### Delete Container File + +```python showLineNumbers title="delete_container_file.py" +from litellm import delete_container_file + +result = delete_container_file( + container_id="cntr_123...", + file_id="cfile_456...", + custom_llm_provider="openai" +) + +print(f"Deleted: {result.deleted}") +``` + +## LiteLLM AI Gateway (Proxy) + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +### Upload File + + + + +```python showLineNumbers title="upload_file.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +file = client.containers.files.create( + container_id="cntr_123...", + file=open("data.csv", "rb") +) + +print(f"Uploaded: {file.id}") +print(f"Path: {file.path}") +``` + + + + +```bash showLineNumbers title="upload_file.sh" +curl "http://localhost:4000/v1/containers/cntr_123.../files" \ + -H "Authorization: Bearer sk-1234" \ + -F file="@data.csv" +``` + + + + +### List Files + + + + +```python showLineNumbers title="list_files.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +files = client.containers.files.list( + container_id="cntr_123..." 
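+    # optional pagination params: after, limit, order (see the "List Files" parameters table below)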
+) + +for file in files.data: + print(f" - {file.id}: {file.filename}") +``` + + + + +```bash showLineNumbers title="list_files.sh" +curl "http://localhost:4000/v1/containers/cntr_123.../files" \ + -H "Authorization: Bearer sk-1234" +``` + + + + +### Retrieve File Metadata + + + + +```python showLineNumbers title="retrieve_file.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +file = client.containers.files.retrieve( + container_id="cntr_123...", + file_id="cfile_456..." +) + +print(f"File: {file.filename}") +print(f"Size: {file.bytes} bytes") +``` + + + + +```bash showLineNumbers title="retrieve_file.sh" +curl "http://localhost:4000/v1/containers/cntr_123.../files/cfile_456..." \ + -H "Authorization: Bearer sk-1234" +``` + + + + +### Download File Content + + + + +```python showLineNumbers title="download_content.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +content = client.containers.files.content( + container_id="cntr_123...", + file_id="cfile_456..." +) + +with open("output.png", "wb") as f: + f.write(content.read()) +``` + + + + +```bash showLineNumbers title="download_content.sh" +curl "http://localhost:4000/v1/containers/cntr_123.../files/cfile_456.../content" \ + -H "Authorization: Bearer sk-1234" \ + --output downloaded_file.png +``` + + + + +### Delete File + + + + +```python showLineNumbers title="delete_file.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +result = client.containers.files.delete( + container_id="cntr_123...", + file_id="cfile_456..." +) + +print(f"Deleted: {result.deleted}") +``` + + + + +```bash showLineNumbers title="delete_file.sh" +curl -X DELETE "http://localhost:4000/v1/containers/cntr_123.../files/cfile_456..." \ + -H "Authorization: Bearer sk-1234" +``` + + + + +## Parameters + +### Upload File + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `container_id` | string | Yes | Container ID | +| `file` | FileTypes | Yes | File to upload. 
Can be a tuple of (filename, content, content_type), file-like object, or bytes | + +### List Files + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `container_id` | string | Yes | Container ID | +| `after` | string | No | Pagination cursor | +| `limit` | integer | No | Items to return (1-100, default: 20) | +| `order` | string | No | Sort order: `asc` or `desc` | + +### Retrieve/Delete File + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `container_id` | string | Yes | Container ID | +| `file_id` | string | Yes | File ID | + +## Response Objects + +### ContainerFileObject + +```json showLineNumbers title="ContainerFileObject" +{ + "id": "cfile_456...", + "object": "container.file", + "container_id": "cntr_123...", + "bytes": 12345, + "created_at": 1234567890, + "filename": "chart.png", + "path": "/mnt/data/chart.png", + "source": "code_interpreter" +} +``` + +### ContainerFileListResponse + +```json showLineNumbers title="ContainerFileListResponse" +{ + "object": "list", + "data": [...], + "first_id": "cfile_456...", + "last_id": "cfile_789...", + "has_more": false +} +``` + +### DeleteContainerFileResponse + +```json showLineNumbers title="DeleteContainerFileResponse" +{ + "id": "cfile_456...", + "object": "container.file.deleted", + "deleted": true +} +``` + +## Supported Providers + +| Provider | Status | +|----------|--------| +| OpenAI | ✅ Supported | + +## Related + +- [Containers API](/docs/containers) - Manage containers +- [Code Interpreter Guide](/docs/guides/code_interpreter) - Using Code Interpreter with LiteLLM diff --git a/docs/my-website/docs/containers.md b/docs/my-website/docs/containers.md new file mode 100644 index 000000000000..2bfe179ff6b6 --- /dev/null +++ b/docs/my-website/docs/containers.md @@ -0,0 +1,474 @@ +# /containers + +Manage OpenAI code interpreter containers (sessions) for executing code in isolated environments. + +:::tip +Looking for how to use Code Interpreter? See the [Code Interpreter Guide](/docs/guides/code_interpreter). +::: + +| Feature | Supported | +|---------|-----------| +| Cost Tracking | ✅ | +| Logging | ✅ (Full request/response logging) | +| Load Balancing | ✅ | +| Proxy Server Support | ✅ Full proxy integration with virtual keys | +| Spend Management | ✅ Budget tracking and rate limiting | +| Supported Providers | `openai`| + +:::tip + +Containers provide isolated execution environments for code interpreter sessions. You can create, list, retrieve, and delete containers. + +::: + +## **LiteLLM Python SDK Usage** + +### Quick Start + +**Create a Container** + +```python +import litellm +import os + +# setup env +os.environ["OPENAI_API_KEY"] = "sk-.." + +container = litellm.create_container( + name="My Code Interpreter Container", + custom_llm_provider="openai", + expires_after={ + "anchor": "last_active_at", + "minutes": 20 + } +) + +print(f"Container ID: {container.id}") +print(f"Container Name: {container.name}") +``` + +### Async Usage + +```python +from litellm import acreate_container +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +container = await acreate_container( + name="My Code Interpreter Container", + custom_llm_provider="openai", + expires_after={ + "anchor": "last_active_at", + "minutes": 20 + } +) + +print(f"Container ID: {container.id}") +print(f"Container Name: {container.name}") +``` + +### List Containers + +```python +from litellm import list_containers +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." 
+ +containers = list_containers( + custom_llm_provider="openai", + limit=20, + order="desc" +) + +print(f"Found {len(containers.data)} containers") +for container in containers.data: + print(f" - {container.id}: {container.name}") +``` + +**Async Usage:** + +```python +from litellm import alist_containers + +containers = await alist_containers( + custom_llm_provider="openai", + limit=20, + order="desc" +) + +print(f"Found {len(containers.data)} containers") +for container in containers.data: + print(f" - {container.id}: {container.name}") +``` + +### Retrieve a Container + +```python +from litellm import retrieve_container +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +container = retrieve_container( + container_id="cntr_123...", + custom_llm_provider="openai" +) + +print(f"Container: {container.name}") +print(f"Status: {container.status}") +print(f"Created: {container.created_at}") +``` + +**Async Usage:** + +```python +from litellm import aretrieve_container + +container = await aretrieve_container( + container_id="cntr_123...", + custom_llm_provider="openai" +) + +print(f"Container: {container.name}") +print(f"Status: {container.status}") +print(f"Created: {container.created_at}") +``` + +### Delete a Container + +```python +from litellm import delete_container +import os + +os.environ["OPENAI_API_KEY"] = "sk-.." + +result = delete_container( + container_id="cntr_123...", + custom_llm_provider="openai" +) + +print(f"Deleted: {result.deleted}") +print(f"Container ID: {result.id}") +``` + +**Async Usage:** + +```python +from litellm import adelete_container + +result = await adelete_container( + container_id="cntr_123...", + custom_llm_provider="openai" +) + +print(f"Deleted: {result.deleted}") +print(f"Container ID: {result.id}") +``` + +## **LiteLLM Proxy Usage** + +LiteLLM provides OpenAI API compatible container endpoints for managing code interpreter sessions: + +- `/v1/containers` - Create and list containers +- `/v1/containers/{container_id}` - Retrieve and delete containers + +**Setup** + +```bash +$ export OPENAI_API_KEY="sk-..." + +$ litellm + +# RUNNING on http://0.0.0.0:4000 +``` + +**Custom Provider Specification** + +You can specify the custom LLM provider in multiple ways (priority order): +1. Header: `-H "custom-llm-provider: openai"` +2. Query param: `?custom_llm_provider=openai` +3. Request body: `{"custom_llm_provider": "openai", ...}` +4. Defaults to "openai" if not specified + +**Create a Container** + +```bash +# Default provider (openai) +curl -X POST "http://localhost:4000/v1/containers" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "My Container", + "expires_after": { + "anchor": "last_active_at", + "minutes": 20 + } + }' +``` + +```bash +# Via header +curl -X POST "http://localhost:4000/v1/containers" \ + -H "Authorization: Bearer sk-1234" \ + -H "custom-llm-provider: openai" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "My Container" + }' +``` + +```bash +# Via query parameter +curl -X POST "http://localhost:4000/v1/containers?custom_llm_provider=openai" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "name": "My Container" + }' +``` + +**List Containers** + +```bash +curl "http://localhost:4000/v1/containers?limit=20&order=desc" \ + -H "Authorization: Bearer sk-1234" +``` + +**Retrieve a Container** + +```bash +curl "http://localhost:4000/v1/containers/cntr_123..." 
\ + -H "Authorization: Bearer sk-1234" +``` + +**Delete a Container** + +```bash +curl -X DELETE "http://localhost:4000/v1/containers/cntr_123..." \ + -H "Authorization: Bearer sk-1234" +``` + +## **Using OpenAI Client with LiteLLM Proxy** + +You can use the standard OpenAI Python client to interact with LiteLLM's container endpoints. This provides a familiar interface while leveraging LiteLLM's proxy features. + +### Setup + +First, configure your OpenAI client to point to your LiteLLM proxy: + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # Your LiteLLM proxy key + base_url="http://localhost:4000" # LiteLLM proxy URL +) +``` + +### Create a Container + +```python +container = client.containers.create( + name="test-container", + expires_after={ + "anchor": "last_active_at", + "minutes": 20 + }, + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Container ID: {container.id}") +print(f"Container Name: {container.name}") +print(f"Created at: {container.created_at}") +``` + +### List Containers + +```python +containers = client.containers.list( + limit=20, + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Found {len(containers.data)} containers") +for container in containers.data: + print(f" - {container.id}: {container.name}") +``` + +### Retrieve a Container + +```python +container = client.containers.retrieve( + container_id="cntr_6901d28b3c8881908b702815828a5bde0380b3408aeae8c7", + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Container: {container.name}") +print(f"Status: {container.status}") +print(f"Last active: {container.last_active_at}") +``` + +### Delete a Container + +```python +result = client.containers.delete( + container_id="cntr_6901d28b3c8881908b702815828a5bde0380b3408aeae8c7", + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Deleted: {result.deleted}") +print(f"Container ID: {result.id}") +``` + +### Complete Workflow Example + +Here's a complete example showing the full container management workflow: + +```python +from openai import OpenAI + +# Initialize client +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +# 1. Create a container +print("Creating container...") +container = client.containers.create( + name="My Code Interpreter Session", + expires_after={ + "anchor": "last_active_at", + "minutes": 20 + }, + extra_body={"custom_llm_provider": "openai"} +) + +container_id = container.id +print(f"Container created. ID: {container_id}") + +# 2. List all containers +print("\nListing containers...") +containers = client.containers.list( + extra_body={"custom_llm_provider": "openai"} +) + +for c in containers.data: + print(f" - {c.id}: {c.name} (Status: {c.status})") + +# 3. Retrieve specific container +print(f"\nRetrieving container {container_id}...") +retrieved = client.containers.retrieve( + container_id=container_id, + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Container: {retrieved.name}") +print(f"Status: {retrieved.status}") +print(f"Last active: {retrieved.last_active_at}") + +# 4. 
Delete container +print(f"\nDeleting container {container_id}...") +result = client.containers.delete( + container_id=container_id, + extra_body={"custom_llm_provider": "openai"} +) + +print(f"Deleted: {result.deleted}") +``` + +## Container Parameters + +### Create Container Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `name` | string | Yes | Name of the container | +| `expires_after` | object | No | Container expiration settings | +| `expires_after.anchor` | string | No | Anchor point for expiration (e.g., "last_active_at") | +| `expires_after.minutes` | integer | No | Minutes until expiration from anchor | +| `file_ids` | array | No | List of file IDs to include in the container | +| `custom_llm_provider` | string | No | LLM provider to use (default: "openai") | + +### List Container Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `after` | string | No | Cursor for pagination | +| `limit` | integer | No | Number of items to return (1-100, default: 20) | +| `order` | string | No | Sort order: "asc" or "desc" (default: "desc") | +| `custom_llm_provider` | string | No | LLM provider to use (default: "openai") | + +### Retrieve/Delete Container Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `container_id` | string | Yes | ID of the container to retrieve/delete | +| `custom_llm_provider` | string | No | LLM provider to use (default: "openai") | + +## Response Objects + +### ContainerObject + +```json +{ + "id": "cntr_123...", + "object": "container", + "created_at": 1234567890, + "name": "My Container", + "status": "active", + "last_active_at": 1234567890, + "expires_at": 1234569090, + "file_ids": [] +} +``` + +### ContainerListResponse + +```json +{ + "object": "list", + "data": [ + { + "id": "cntr_123...", + "object": "container", + "created_at": 1234567890, + "name": "My Container", + "status": "active" + } + ], + "first_id": "cntr_123...", + "last_id": "cntr_456...", + "has_more": false +} +``` + +### DeleteContainerResult + +```json +{ + "id": "cntr_123...", + "object": "container.deleted", + "deleted": true +} +``` + +## **Supported Providers** + +| Provider | Support Status | Notes | +|-------------|----------------|-------| +| OpenAI | ✅ Supported | Full support for all container operations | + +:::info + +Currently, only OpenAI supports container management for code interpreter sessions. Support for additional providers may be added in the future. + +::: + +## Related + +- [Container Files API](/docs/container_files) - Manage files within containers +- [Code Interpreter Guide](/docs/guides/code_interpreter) - Using Code Interpreter with LiteLLM + diff --git a/docs/my-website/docs/contribute_integration/custom_webhook_api.md b/docs/my-website/docs/contribute_integration/custom_webhook_api.md new file mode 100644 index 000000000000..158937d2a43e --- /dev/null +++ b/docs/my-website/docs/contribute_integration/custom_webhook_api.md @@ -0,0 +1,114 @@ +# Contribute Custom Webhook API + +If your API just needs a Webhook event from LiteLLM, here's how to add a 'native' integration for it on LiteLLM: + +1. Clone the repo and open the `generic_api_compatible_callbacks.json` + +```bash +git clone https://github.com/BerriAI/litellm.git +cd litellm +open . +``` + +2. 
Add your API to the `generic_api_compatible_callbacks.json` + +Example: + +```json +{ + "rubrik": { + "event_types": ["llm_api_success"], + "endpoint": "{{environment_variables.RUBRIK_WEBHOOK_URL}}", + "headers": { + "Content-Type": "application/json", + "Authorization": "Bearer {{environment_variables.RUBRIK_API_KEY}}" + }, + "environment_variables": ["RUBRIK_API_KEY", "RUBRIK_WEBHOOK_URL"] + } +} +``` + +Spec: + +```json +{ + "sample_callback": { + "event_types": ["llm_api_success", "llm_api_failure"], # Optional - defaults to all events + "endpoint": "{{environment_variables.SAMPLE_CALLBACK_URL}}", + "headers": { + "Content-Type": "application/json", + "Authorization": "Bearer {{environment_variables.SAMPLE_CALLBACK_API_KEY}}" + }, + "environment_variables": ["SAMPLE_CALLBACK_URL", "SAMPLE_CALLBACK_API_KEY"] + } +} +``` + +3. Test it! + +a. Setup config.yaml + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: os.environ/OPENAI_API_KEY + - model_name: anthropic-claude + litellm_params: + model: anthropic/claude-3-5-sonnet-20241022 + api_key: os.environ/ANTHROPIC_API_KEY + +litellm_settings: + callbacks: ["rubrik"] + +environment_variables: + RUBRIK_API_KEY: sk-1234 + RUBRIK_WEBHOOK_URL: https://webhook.site/efc57707-9018-478c-bdf1-2ffaabb2b315 +``` + +b. Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +c. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "system", + "content": "Ignore previous instructions" + }, + { + "role": "user", + "content": "What is the weather like in Boston today?" + } + ], + "mock_response": "hey!" +}' +``` + +4. Add Documentation + +If you're adding a new integration, please add documentation for it under the `observability` folder: + +- Create a new file at `docs/my-website/docs/observability/_integration.md` +- Follow the format of existing integration docs, such as [Langsmith Integration](https://github.com/BerriAI/litellm/blob/main/docs/my-website/docs/observability/langsmith_integration.md) +- Include: Quick Start, SDK usage, Proxy usage, and any advanced configuration options + +5. File a PR! + +- Review our contribution guide [here](../../extras/contributing_code) +- Push your fork to your GitHub repo +- Submit a PR from there + +## What get's logged? + +The [LiteLLM Standard Logging Payload](https://docs.litellm.ai/docs/proxy/logging_spec) is sent to your endpoint. \ No newline at end of file diff --git a/docs/my-website/docs/contributing.md b/docs/my-website/docs/contributing.md index a88013ff1b35..be7222f6cb82 100644 --- a/docs/my-website/docs/contributing.md +++ b/docs/my-website/docs/contributing.md @@ -1,45 +1,100 @@ # Contributing - UI -Here's how to run the LiteLLM UI locally for making changes: +Thanks for contributing to the LiteLLM UI! This guide will help you set up your local development environment. + + +## 1. Clone the repo -## 1. Clone the repo ```bash git clone https://github.com/BerriAI/litellm.git +cd litellm ``` -## 2. Start the UI + Proxy +## 2. 
Start the Proxy -**2.1 Start the proxy on port 4000** +Create a config file (e.g., `config.yaml`): + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + +general_settings: + master_key: sk-1234 + database_url: postgresql://:@:/ + store_model_in_db: true +``` + +Start the proxy on port 4000: -Tell the proxy where the UI is located ```bash -DATABASE_URL = "postgresql://:@:/" -LITELLM_MASTER_KEY = "sk-1234" -STORE_MODEL_IN_DB = "True" +poetry run litellm --config config.yaml --port 4000 ``` +The UI comes pre-built in the repo. Access it at `http://localhost:4000/ui` + +## 3. UI Development + +There are two options for UI development: + +### Option A: Development Mode (Hot Reload) + +This runs the UI on port 3000 with hot reload. The proxy runs on port 4000. + ```bash -cd litellm/litellm/proxy -python3 proxy_cli.py --config /path/to/config.yaml --port 4000 +cd ui/litellm-dashboard +npm install +npm run dev ``` -**2.2 Start the UI** +**Login flow:** +1. Go to `http://localhost:3000` +2. You'll be redirected to `http://localhost:4000/ui` for login +3. After logging in, manually navigate back to `http://localhost:3000/` +4. You're now authenticated and can develop with hot reload + +:::note +If you experience redirect loops or authentication issues, clear your browser cookies for localhost or use Build Mode instead. +::: -Set the mode as development (this will assume the proxy is running on localhost:4000) +### Option B: Build Mode + +This builds the UI and copies it to the proxy. Changes require rebuilding. + +1. Make your code changes in `ui/litellm-dashboard/src/` + +2. Build the UI ```bash -npm install # install dependencies +cd ui/litellm-dashboard +npm install +npm run build ``` +After building, copy the output to the proxy: + ```bash -cd litellm/ui/litellm-dashboard +cp -r out/* ../../litellm/proxy/_experimental/out/ +``` -npm run dev +Then restart the proxy and access the UI at `http://localhost:4000/ui` -# starts on http://0.0.0.0:3000 +## 4. Submitting a PR + +1. Create a new branch for your changes: +```bash +git checkout -b feat/your-feature-name ``` -## 3. Go to local UI +2. Stage and commit your changes: +```bash +git add . +git commit -m "feat: description of your changes" +``` +3. Push to your fork: ```bash -http://0.0.0.0:3000 -``` \ No newline at end of file +git push origin feat/your-feature-name +``` + +4. Create a Pull Request on GitHub following the [PR template](https://github.com/BerriAI/litellm/blob/main/.github/pull_request_template.md) diff --git a/docs/my-website/docs/contributing/adding_openai_compatible_providers.md b/docs/my-website/docs/contributing/adding_openai_compatible_providers.md new file mode 100644 index 000000000000..bb89eea35bf1 --- /dev/null +++ b/docs/my-website/docs/contributing/adding_openai_compatible_providers.md @@ -0,0 +1,130 @@ +# Adding OpenAI-Compatible Providers + +For simple OpenAI-compatible providers (like Hyperbolic, Nscale, etc.), you can add support by editing a single JSON file. + +## Quick Start + +1. Edit `litellm/llms/openai_like/providers.json` +2. Add your provider configuration +3. Test with: `litellm.completion(model="your_provider/model-name", ...)` + +## Basic Configuration + +For a fully OpenAI-compatible provider: + +```json +{ + "your_provider": { + "base_url": "https://api.yourprovider.com/v1", + "api_key_env": "YOUR_PROVIDER_API_KEY" + } +} +``` + +That's it! The provider is now available. 
+ +## Configuration Options + +### Required Fields + +- `base_url` - API endpoint (e.g., `https://api.provider.com/v1`) +- `api_key_env` - Environment variable name for API key (e.g., `PROVIDER_API_KEY`) + +### Optional Fields + +- `api_base_env` - Environment variable to override `base_url` +- `base_class` - Use `"openai_gpt"` (default) or `"openai_like"` +- `param_mappings` - Map OpenAI parameter names to provider-specific names +- `constraints` - Parameter value constraints (min/max) +- `special_handling` - Special behaviors like content format conversion + +## Examples + +### Simple Provider (Fully Compatible) + +```json +{ + "hyperbolic": { + "base_url": "https://api.hyperbolic.xyz/v1", + "api_key_env": "HYPERBOLIC_API_KEY" + } +} +``` + +### Provider with Parameter Mapping + +```json +{ + "publicai": { + "base_url": "https://api.publicai.co/v1", + "api_key_env": "PUBLICAI_API_KEY", + "param_mappings": { + "max_completion_tokens": "max_tokens" + } + } +} +``` + +### Provider with Constraints + +```json +{ + "custom_provider": { + "base_url": "https://api.custom.com/v1", + "api_key_env": "CUSTOM_API_KEY", + "constraints": { + "temperature_max": 1.0, + "temperature_min": 0.0 + } + } +} +``` + +## Usage + +```python +import litellm +import os + +# Set your API key +os.environ["YOUR_PROVIDER_API_KEY"] = "your-key-here" + +# Use the provider +response = litellm.completion( + model="your_provider/model-name", + messages=[{"role": "user", "content": "Hello"}], +) +``` + +## When to Use Python Instead + +Use a Python config class if you need: + +- Custom authentication flows (OAuth, JWT, etc.) +- Complex request/response transformations +- Provider-specific streaming logic +- Advanced tool calling modifications + +For these cases, create a config class in `litellm/llms/your_provider/chat/transformation.py` that inherits from `OpenAIGPTConfig` or `OpenAILikeChatConfig`. + +## Testing + +Test your provider: + +```bash +# Quick test +python -c " +import litellm +import os +os.environ['PROVIDER_API_KEY'] = 'your-key' +response = litellm.completion( + model='provider/model-name', + messages=[{'role': 'user', 'content': 'test'}] +) +print(response.choices[0].message.content) +" +``` + +## Reference + +See existing providers in `litellm/llms/openai_like/providers.json` for examples. diff --git a/docs/my-website/docs/data_retention.md b/docs/my-website/docs/data_retention.md index 04d4675199ea..3cfdd247258d 100644 --- a/docs/my-website/docs/data_retention.md +++ b/docs/my-website/docs/data_retention.md @@ -10,7 +10,7 @@ This policy outlines the requirements and controls/procedures LiteLLM Cloud has For Customers 1. Active Accounts -- Customer data is retained for as long as the customer’s account is in active status. This includes data such as prompts, generated content, logs, and usage metrics. +- Customer data is retained for as long as the customer’s account is in active status. This includes data such as prompts, generated content, logs, and usage metrics. By default, we do not store the message / response content of your API requests or responses. Cloud users need to explicitly opt in to store the message / response content of your API requests or responses. 2. 
Voluntary Account Closure diff --git a/docs/my-website/docs/embedding/supported_embedding.md b/docs/my-website/docs/embedding/supported_embedding.md index e63d94036650..11ca4da48a4e 100644 --- a/docs/my-website/docs/embedding/supported_embedding.md +++ b/docs/my-website/docs/embedding/supported_embedding.md @@ -10,6 +10,26 @@ import os os.environ['OPENAI_API_KEY'] = "" response = embedding(model='text-embedding-ada-002', input=["good morning from litellm"]) ``` + +## Async Usage - `aembedding()` + +LiteLLM provides an asynchronous version of the `embedding` function called `aembedding`: + +```python +from litellm import aembedding +import asyncio + +async def get_embedding(): + response = await aembedding( + model='text-embedding-ada-002', + input=["good morning from litellm"] + ) + return response + +response = asyncio.run(get_embedding()) +print(response) +``` + ## Proxy Usage **NOTE** @@ -263,6 +283,8 @@ print(response) | Model Name | Function Call | |----------------------|---------------------------------------------| +| Amazon Nova Multimodal Embeddings | `embedding(model="bedrock/amazon.nova-2-multimodal-embeddings-v1:0", input=input)` | [Nova Docs](../providers/bedrock_embedding#amazon-nova-multimodal-embeddings) | +| Amazon Nova (Async) | `embedding(model="bedrock/async_invoke/amazon.nova-2-multimodal-embeddings-v1:0", input=input, input_type="text", output_s3_uri="s3://bucket/")` | [Nova Async Docs](../providers/bedrock_embedding#asynchronous-embeddings-with-segmentation) | | Titan Embeddings - G1 | `embedding(model="amazon.titan-embed-text-v1", input=input)` | | Cohere Embeddings - English | `embedding(model="cohere.embed-english-v3", input=input)` | | Cohere Embeddings - Multilingual | `embedding(model="cohere.embed-multilingual-v3", input=input)` | diff --git a/docs/my-website/docs/enterprise.md b/docs/my-website/docs/enterprise.md index cc3466fc1032..2eed0f53e597 100644 --- a/docs/my-website/docs/enterprise.md +++ b/docs/my-website/docs/enterprise.md @@ -3,7 +3,8 @@ import Image from '@theme/IdealImage'; # Enterprise :::info -✨ SSO is free for up to 5 users. After that, an enterprise license is required. [Get Started with Enterprise here](https://www.litellm.ai/enterprise) +- ✨ SSO is free for up to 5 users. After that, an enterprise license is required. [Get Started with Enterprise here](https://www.litellm.ai/enterprise) +- Who is Enterprise for? Companies giving access to 100+ users **OR** 10+ AI use-cases. If you're not sure, [get in touch with us](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat) to discuss your needs. ::: For companies that need SSO, user management and professional support for LiteLLM Proxy @@ -16,7 +17,7 @@ Get free 7-day trial key [here](https://www.litellm.ai/enterprise#trial) Includes all enterprise features. - + [**Procurement available via AWS / Azure Marketplace**](./data_security.md#legalcompliance-faqs) @@ -40,7 +41,7 @@ Self-Managed Enterprise deployments require our team to understand your exact ne ### How does deployment with Enterprise License work? -You just deploy [our docker image](https://docs.litellm.ai/docs/proxy/deploy) and get an enterprise license key to add to your environment to unlock additional functionality (SSO, Prometheus metrics, etc.). +You just deploy [our docker image](https://docs.litellm.ai/docs/proxy/deploy) and get an enterprise license key to add to your environment to unlock additional functionality (SSO, etc.). ```env LITELLM_LICENSE="eyJ..." 
diff --git a/docs/my-website/docs/exception_mapping.md b/docs/my-website/docs/exception_mapping.md index 2342f444e17d..efdada2a1eb7 100644 --- a/docs/my-website/docs/exception_mapping.md +++ b/docs/my-website/docs/exception_mapping.md @@ -112,6 +112,85 @@ except openai.APITimeoutError as e: print(f"should_retry: {should_retry}") ``` +## Advanced + +### Accessing Provider-Specific Error Details + +LiteLLM exceptions include a `provider_specific_fields` attribute that contains additional error information specific to each provider. This is particularly useful for Azure OpenAI, which provides detailed content filtering information. + +#### Azure OpenAI - Content Policy Violation Inner Error Access + +When Azure OpenAI returns content policy violations, you can access the detailed content filtering results through the `innererror` field: + +```python +import litellm +from litellm.exceptions import ContentPolicyViolationError + +try: + response = litellm.completion( + model="azure/gpt-4", + messages=[ + { + "role": "user", + "content": "Some content that might violate policies" + } + ] + ) +except ContentPolicyViolationError as e: + # Access Azure-specific error details + if e.provider_specific_fields and "innererror" in e.provider_specific_fields: + innererror = e.provider_specific_fields["innererror"] + + # Access content filter results + content_filter_result = innererror.get("content_filter_result", {}) + + print(f"Content filter code: {innererror.get('code')}") + print(f"Hate filtered: {content_filter_result.get('hate', {}).get('filtered')}") + print(f"Violence severity: {content_filter_result.get('violence', {}).get('severity')}") + print(f"Sexual content filtered: {content_filter_result.get('sexual', {}).get('filtered')}") +``` + +**Example Response Structure:** + +When calling the LiteLLM proxy, content policy violations will return detailed filtering information: + +```json +{ + "error": { + "message": "litellm.ContentPolicyViolationError: AzureException - The response was filtered due to the prompt triggering Azure OpenAI's content management policy...", + "type": null, + "param": null, + "code": "400", + "provider_specific_fields": { + "innererror": { + "code": "ResponsibleAIPolicyViolation", + "content_filter_result": { + "hate": { + "filtered": true, + "severity": "high" + }, + "jailbreak": { + "filtered": false, + "detected": false + }, + "self_harm": { + "filtered": false, + "severity": "safe" + }, + "sexual": { + "filtered": false, + "severity": "safe" + }, + "violence": { + "filtered": true, + "severity": "medium" + } + } + } + } + } +} + ## Details To see how it's implemented - [check out the code](https://github.com/BerriAI/litellm/blob/a42c197e5a6de56ea576c73715e6c7c6b19fa249/litellm/utils.py#L1217) diff --git a/docs/my-website/docs/extras/contributing_code.md b/docs/my-website/docs/extras/contributing_code.md index f3a8271b14b8..930a47eec7eb 100644 --- a/docs/my-website/docs/extras/contributing_code.md +++ b/docs/my-website/docs/extras/contributing_code.md @@ -107,3 +107,18 @@ docker run \ litellm_test_image \ --config /app/config.yaml --detailed_debug ``` +### Running LiteLLM Proxy Locally + +1. cd into the `proxy/` directory + +``` +cd litellm/litellm/proxy +``` + +2. 
Run the proxy + +```shell +python3 proxy_cli.py --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` \ No newline at end of file diff --git a/docs/my-website/docs/files_endpoints.md b/docs/my-website/docs/files_endpoints.md index 88493fe0bbdf..30677c748a95 100644 --- a/docs/my-website/docs/files_endpoints.md +++ b/docs/my-website/docs/files_endpoints.md @@ -16,7 +16,137 @@ Use this to call the provider's `/files` endpoints directly, in the OpenAI forma - Delete File - Get File Content +## Multi-Account Support (Multiple OpenAI Keys) + +Use different OpenAI API keys for files and batches by specifying a `model` parameter that references entries in your `model_list`. This approach works **without requiring a database** and allows you to route files/batches to different OpenAI accounts. + +### How It Works + +1. Define models in `model_list` with different API keys +2. Pass `model` parameter when creating files +3. LiteLLM returns encoded IDs that contain routing information +4. Use encoded IDs for all subsequent operations (retrieve, delete, batches) +5. No need to specify model again - routing info is in the ID + +### Setup + +```yaml +model_list: + # litellm OpenAI Account + - model_name: "gpt-4o-litellm" + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_LITELLM_API_KEY + + # Free OpenAI Account + - model_name: "gpt-4o-free" + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_FREE_API_KEY +``` + +### Usage Example + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # Your LiteLLM proxy key + base_url="http://0.0.0.0:4000" +) + +# Create file using litellm account +file_response = client.files.create( + file=open("batch_data.jsonl", "rb"), + purpose="batch", + extra_body={"model": "gpt-4o-litellm"} # Routes to litellm key +) +print(f"File ID: {file_response.id}") +# Returns encoded ID like: file-bGl0ZWxsbTpmaWxlLWFiYzEyMzttb2RlbCxncHQtNG8taWZvb2Q + +# Create batch using the encoded file ID +# No need to specify model again - it's embedded in the file ID +batch_response = client.batches.create( + input_file_id=file_response.id, # Encoded ID + endpoint="/v1/chat/completions", + completion_window="24h" +) +print(f"Batch ID: {batch_response.id}") +# Returns encoded batch ID with routing info + +# Retrieve batch - routing happens automatically +batch_status = client.batches.retrieve(batch_response.id) +print(f"Status: {batch_status.status}") + +# List files for a specific account +files = client.files.list( + extra_body={"model": "gpt-4o-free"} # List free files +) + +# List batches for a specific account +batches = client.batches.list( + extra_query={"model": "gpt-4o-litellm"} # List litellm batches +) +``` + +### Parameter Options + +You can pass the `model` parameter via: +- **Request body**: `extra_body={"model": "gpt-4o-litellm"}` +- **Query parameter**: `?model=gpt-4o-litellm` +- **Header**: `x-litellm-model: gpt-4o-litellm` + +### How Encoded IDs Work + +- When you create a file/batch with a `model` parameter, LiteLLM encodes the model name into the returned ID +- The encoded ID is base64-encoded and looks like: `file-bGl0ZWxsbTpmaWxlLWFiYzEyMzttb2RlbCxncHQtNG8taWZvb2Q` +- When you use this ID in subsequent operations (retrieve, delete, batch create), LiteLLM automatically: + 1. Decodes the ID + 2. Extracts the model name + 3. Looks up the credentials + 4. 
Routes the request to the correct OpenAI account +- The original provider file/batch ID is preserved internally +### Benefits + +✅ **No Database Required** - All routing info stored in the ID +✅ **Stateless** - Works across proxy restarts +✅ **Simple** - Just pass the ID around like normal +✅ **Backward Compatible** - Existing `custom_llm_provider` and `files_settings` still work +✅ **Future-Proof** - Aligns with managed batches approach + +### Migration from files_settings + +**Old approach (still works):** +```yaml +files_settings: + - custom_llm_provider: openai + api_key: os.environ/OPENAI_KEY +``` + +```python +# Had to specify provider on every call +client.files.create(..., extra_headers={"custom-llm-provider": "openai"}) +client.files.retrieve(file_id, extra_headers={"custom-llm-provider": "openai"}) +``` + +**New approach (recommended):** +```yaml +model_list: + - model_name: "gpt-4o-account1" + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_KEY +``` + +```python +# Specify model once on create +file = client.files.create(..., extra_body={"model": "gpt-4o-account1"}) + +# Then just use the ID - routing is automatic +client.files.retrieve(file.id) # No need to specify account +client.batches.create(input_file_id=file.id) # Routes correctly +``` @@ -171,6 +301,17 @@ content = await litellm.afile_content( print("file content=", content) ``` +**Get File Content (Bedrock)** +```python +# For Bedrock batch output files stored in S3 +content = await litellm.afile_content( + file_id="s3://bucket-name/path/to/file.jsonl", # S3 URI or unified file ID + custom_llm_provider="bedrock", + aws_region_name="us-west-2" +) +print("file content=", content.text) +``` + @@ -183,4 +324,6 @@ print("file content=", content) ### [Vertex AI](./providers/vertex#batch-apis) +### [Bedrock](./providers/bedrock_batches#4-retrieve-batch-results) + ## [Swagger API Reference](https://litellm-api.up.railway.app/#/files) diff --git a/docs/my-website/docs/getting_started.md b/docs/my-website/docs/getting_started.md deleted file mode 100644 index 6b2c1fd531e4..000000000000 --- a/docs/my-website/docs/getting_started.md +++ /dev/null @@ -1,108 +0,0 @@ -# Getting Started - -import QuickStart from '../src/components/QuickStart.js' - -LiteLLM simplifies LLM API calls by mapping them all to the [OpenAI ChatCompletion format](https://platform.openai.com/docs/api-reference/chat). - -## basic usage - -By default we provide a free $10 community-key to try all providers supported on LiteLLM. - -```python -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "your-api-key" -os.environ["COHERE_API_KEY"] = "your-api-key" - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -# openai call -response = completion(model="gpt-3.5-turbo", messages=messages) - -# cohere call -response = completion("command-nightly", messages) -``` - -**Need a dedicated key?** -Email us @ krrish@berri.ai - -Next Steps 👉 [Call all supported models - e.g. Claude-2, Llama2-70b, etc.](./proxy_api.md#supported-models) - -More details 👉 - -- [Completion() function details](./completion/) -- [Overview of supported models / providers on LiteLLM](./providers/) -- [Search all models / providers](https://models.litellm.ai/) -- [Build your own OpenAI proxy](https://github.com/BerriAI/liteLLM-proxy/tree/main) - -## streaming - -Same example from before. Just pass in `stream=True` in the completion args. 
- -```python -from litellm import completion - -## set ENV variables -os.environ["OPENAI_API_KEY"] = "openai key" -os.environ["COHERE_API_KEY"] = "cohere key" - -messages = [{ "content": "Hello, how are you?","role": "user"}] - -# openai call -response = completion(model="gpt-3.5-turbo", messages=messages, stream=True) - -# cohere call -response = completion("command-nightly", messages, stream=True) - -print(response) -``` - -More details 👉 - -- [streaming + async](./completion/stream.md) -- [tutorial for streaming Llama2 on TogetherAI](./tutorials/TogetherAI_liteLLM.md) - -## exception handling - -LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with LiteLLM. - -```python -from openai.error import OpenAIError -from litellm import completion - -os.environ["ANTHROPIC_API_KEY"] = "bad-key" -try: - # some code - completion(model="claude-instant-1", messages=[{"role": "user", "content": "Hey, how's it going?"}]) -except OpenAIError as e: - print(e) -``` - -## Logging Observability - Log LLM Input/Output ([Docs](https://docs.litellm.ai/docs/observability/callbacks)) - -LiteLLM exposes pre defined callbacks to send data to MLflow, Lunary, Langfuse, Helicone, Promptlayer, Traceloop, Slack - -```python -from litellm import completion - -## set env variables for logging tools (API key set up is not required when using MLflow) -os.environ["LUNARY_PUBLIC_KEY"] = "your-lunary-public-key" # get your public key at https://app.lunary.ai/settings -os.environ["HELICONE_API_KEY"] = "your-helicone-key" -os.environ["LANGFUSE_PUBLIC_KEY"] = "" -os.environ["LANGFUSE_SECRET_KEY"] = "" - -os.environ["OPENAI_API_KEY"] - -# set callbacks -litellm.success_callback = ["lunary", "mlflow", "langfuse", "helicone"] # log input/output to MLflow, langfuse, lunary, helicone - -#openai call -response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}]) -``` - -More details 👉 - -- [exception mapping](./exception_mapping.md) -- [retries + model fallbacks for completion()](./completion/reliable_completions.md) -- [tutorial for model fallbacks with completion()](./tutorials/fallbacks.md) diff --git a/docs/my-website/docs/guides/code_interpreter.md b/docs/my-website/docs/guides/code_interpreter.md new file mode 100644 index 000000000000..44349a6e3079 --- /dev/null +++ b/docs/my-website/docs/guides/code_interpreter.md @@ -0,0 +1,168 @@ +import Image from '@theme/IdealImage'; + +# Code Interpreter + +Use OpenAI's Code Interpreter tool to execute Python code in a secure, sandboxed environment. 
+ +| Feature | Supported | +|---------|-----------| +| LiteLLM Python SDK | ✅ | +| LiteLLM AI Gateway | ✅ | +| Supported Providers | `openai` | + +## LiteLLM AI Gateway + +### API (OpenAI SDK) + +Use the OpenAI SDK pointed at your LiteLLM Gateway: + +```python showLineNumbers title="code_interpreter_gateway.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # Your LiteLLM API key + base_url="http://localhost:4000" +) + +response = client.responses.create( + model="openai/gpt-4o", + tools=[{"type": "code_interpreter"}], + input="Calculate the first 20 fibonacci numbers and plot them" +) + +print(response) +``` + +#### Streaming + +```python showLineNumbers title="code_interpreter_streaming.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +stream = client.responses.create( + model="openai/gpt-4o", + tools=[{"type": "code_interpreter"}], + input="Generate sample sales data CSV and create a visualization", + stream=True +) + +for event in stream: + print(event) +``` + +#### Get Generated File Content + +```python showLineNumbers title="get_file_content_gateway.py" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000" +) + +# 1. Run code interpreter +response = client.responses.create( + model="openai/gpt-4o", + tools=[{"type": "code_interpreter"}], + input="Create a scatter plot and save as PNG" +) + +# 2. Get container_id from response +container_id = response.output[0].container_id + +# 3. List files +files = client.containers.files.list(container_id=container_id) + +# 4. Download file content +for file in files.data: + content = client.containers.files.content( + container_id=container_id, + file_id=file.id + ) + + with open(file.filename, "wb") as f: + f.write(content.read()) + print(f"Downloaded: {file.filename}") +``` + +### AI Gateway UI + +The LiteLLM Admin UI includes built-in Code Interpreter support. + + + +**Steps:** + +1. Go to **Playground** in the LiteLLM UI +2. Select an **OpenAI model** (e.g., `openai/gpt-4o`) +3. Select `/v1/responses` as the endpoint under **Endpoint Type** +4. Toggle **Code Interpreter** in the left panel +5. Send a prompt requesting code execution or file generation + +The UI will display: +- Executed Python code (collapsible) +- Generated images inline +- Download links for files (CSVs, etc.) + +## LiteLLM Python SDK + +### Run Code Interpreter + +```python showLineNumbers title="code_interpreter.py" +import litellm + +response = litellm.responses( + model="openai/gpt-4o", + input="Generate a bar chart of quarterly sales and save as PNG", + tools=[{"type": "code_interpreter"}] +) + +print(response) +``` + +### Get Generated File Content + +After Code Interpreter runs, retrieve the generated files: + +```python showLineNumbers title="get_file_content.py" +import litellm + +# 1. Run code interpreter +response = litellm.responses( + model="openai/gpt-4o", + input="Create a pie chart of market share and save as PNG", + tools=[{"type": "code_interpreter"}] +) + +# 2. Extract container_id from response +container_id = response.output[0].container_id # e.g. "cntr_abc123..." + +# 3. List files in container +files = litellm.list_container_files( + container_id=container_id, + custom_llm_provider="openai" +) + +# 4. 
Download each file +for file in files.data: + content = litellm.retrieve_container_file_content( + container_id=container_id, + file_id=file.id, + custom_llm_provider="openai" + ) + + with open(file.filename, "wb") as f: + f.write(content) + print(f"Downloaded: {file.filename}") +``` + + +## Related + +- [Containers API](/docs/containers) - Manage containers +- [Container Files API](/docs/container_files) - Manage files within containers +- [OpenAI Code Interpreter Docs](https://platform.openai.com/docs/guides/tools-code-interpreter) - Official OpenAI documentation diff --git a/docs/my-website/docs/guides/security_settings.md b/docs/my-website/docs/guides/security_settings.md index d6397a7c1970..3b6d44b00879 100644 --- a/docs/my-website/docs/guides/security_settings.md +++ b/docs/my-website/docs/guides/security_settings.md @@ -187,4 +187,37 @@ export AIOHTTP_TRUST_ENV='True' ``` +## 7. Per-Service SSL Verification +LiteLLM allows you to override SSL verification settings for specific services or provider calls. This is useful when different services (e.g., an internal guardrail vs. a public LLM provider) require different CA certificates. + +### Bedrock (SDK) +You can pass `ssl_verify` directly in the `completion` call. + +```python +import litellm + +response = litellm.completion( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + messages=[{"role": "user", "content": "hi"}], + ssl_verify="path/to/bedrock_cert.pem" # Or False to disable +) +``` + +### AIM Guardrail (Proxy) +You can configure `ssl_verify` per guardrail in your `config.yaml`. + +```yaml +guardrails: + - guardrail_name: aim-protected-app + litellm_params: + guardrail: aim + ssl_verify: "/path/to/aim_cert.pem" # Use specific cert for AIM +``` + +### Priority Logic +LiteLLM resolves `ssl_verify` using the following priority: +1. **Explicit Parameter**: Passed in `completion()` or guardrail config. +2. **Environment Variable**: `SSL_VERIFY` environment variable. +3. **Global Setting**: `litellm.ssl_verify` setting. +4. **System Standard**: `SSL_CERT_FILE` environment variable. diff --git a/docs/my-website/docs/image_edits.md b/docs/my-website/docs/image_edits.md index 84dddd5e4ad7..a84383345425 100644 --- a/docs/my-website/docs/image_edits.md +++ b/docs/my-website/docs/image_edits.md @@ -14,9 +14,9 @@ LiteLLM provides image editing functionality that maps to OpenAI's `/images/edit | Fallbacks | ✅ | Works between supported models | | Loadbalancing | ✅ | Works between supported models | | Supported operations | Create image edits | Single and multiple images supported | -| Supported LiteLLM SDK Versions | 1.63.8+ | | -| Supported LiteLLM Proxy Versions | 1.71.1+ | | -| Supported LLM providers | **OpenAI** | Currently only `openai` is supported | +| Supported LiteLLM SDK Versions | 1.63.8+ | Gemini support requires 1.79.3+ | +| Supported LiteLLM Proxy Versions | 1.71.1+ | Gemini support requires 1.79.3+ | +| Supported LLM providers | **OpenAI**, **Gemini (Google AI Studio)**, **Vertex AI**, **Stability AI**, **AWS Bedrock (Stability)** | Gemini supports the new `gemini-2.5-flash-image` family. Vertex AI supports both Gemini and Imagen models. Stability AI and Bedrock Stability support various image editing operations. 
| #### ⚡️See all supported models and providers at [models.litellm.ai](https://models.litellm.ai/) @@ -149,6 +149,101 @@ for i, image_data in enumerate(response.data): print(f"Image {i+1}: {image_data.url}") ``` +``` + + + + + +#### Basic Image Edit +```python showLineNumbers title="Gemini Image Edit" +import base64 +import os +from litellm import image_edit + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +response = image_edit( + model="gemini/gemini-2.5-flash-image", + image=open("original_image.png", "rb"), + prompt="Add aurora borealis to the night sky", + size="1792x1024", # mapped to aspectRatio=16:9 for Gemini +) + +edited_image_bytes = base64.b64decode(response.data[0].b64_json) +with open("edited_image.png", "wb") as f: + f.write(edited_image_bytes) +``` + +#### Multiple Images Edit +```python showLineNumbers title="Gemini Multiple Images Edit" +import base64 +import os +from litellm import image_edit + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +response = image_edit( + model="gemini/gemini-2.5-flash-image", + image=[ + open("scene.png", "rb"), + open("style_reference.png", "rb"), + ], + prompt="Blend the reference style into the scene while keeping the subject sharp.", +) + +for idx, image_obj in enumerate(response.data): + with open(f"gemini_edit_{idx}.png", "wb") as f: + f.write(base64.b64decode(image_obj.b64_json)) +``` + + + + + +#### Basic Image Edit (Gemini) +```python showLineNumbers title="Vertex AI Gemini Image Edit" +import os +import litellm + +# Set Vertex AI credentials +os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json" + +response = litellm.image_edit( + model="vertex_ai/gemini-2.5-flash", + image=open("original_image.png", "rb"), + prompt="Add neon lights in the background", + size="1024x1024", +) + +print(response) +``` + +#### Image Edit with Imagen (Supports Masks) +```python showLineNumbers title="Vertex AI Imagen Image Edit" +import os +import litellm + +# Set Vertex AI credentials +os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json" + +# Imagen supports mask for inpainting +response = litellm.image_edit( + model="vertex_ai/imagen-3.0-capability-001", + image=open("original_image.png", "rb"), + mask=open("mask_image.png", "rb"), # Optional: for inpainting + prompt="Turn this into watercolor style scenery", + n=2, # Number of variations + size="1024x1024", +) + +print(response) +``` + @@ -224,6 +319,85 @@ curl -X POST "http://localhost:4000/v1/images/edits" \ -F "response_format=url" ``` +``` + + + + + +1. Add the Gemini image edit model to your `config.yaml`: +```yaml showLineNumbers title="Gemini Proxy Configuration" +model_list: + - model_name: gemini-image-edit + litellm_params: + model: gemini/gemini-2.5-flash-image + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start the LiteLLM proxy server: +```bash showLineNumbers title="Start LiteLLM Proxy Server" +litellm --config /path/to/config.yaml +``` + +3. Make an image edit request (Gemini responses are base64-only): +```bash showLineNumbers title="Gemini Proxy Image Edit" +curl -X POST "http://0.0.0.0:4000/v1/images/edits" \ + -H "Authorization: Bearer " \ + -F "model=gemini-image-edit" \ + -F "image=@original_image.png" \ + -F "prompt=Add a warm golden-hour glow to the scene" \ + -F "size=1024x1024" +``` + + + + + +1. 
Add Vertex AI image edit models to your `config.yaml`: +```yaml showLineNumbers title="Vertex AI Proxy Configuration" +model_list: + - model_name: vertex-gemini-image-edit + litellm_params: + model: vertex_ai/gemini-2.5-flash + vertex_project: os.environ/VERTEXAI_PROJECT + vertex_location: os.environ/VERTEXAI_LOCATION + vertex_credentials: os.environ/GOOGLE_APPLICATION_CREDENTIALS + + - model_name: vertex-imagen-image-edit + litellm_params: + model: vertex_ai/imagen-3.0-capability-001 + vertex_project: os.environ/VERTEXAI_PROJECT + vertex_location: os.environ/VERTEXAI_LOCATION + vertex_credentials: os.environ/GOOGLE_APPLICATION_CREDENTIALS +``` + +2. Start the LiteLLM proxy server: +```bash showLineNumbers title="Start LiteLLM Proxy Server" +litellm --config /path/to/config.yaml +``` + +3. Make an image edit request: +```bash showLineNumbers title="Vertex AI Gemini Proxy Image Edit" +curl -X POST "http://0.0.0.0:4000/v1/images/edits" \ + -H "Authorization: Bearer " \ + -F "model=vertex-gemini-image-edit" \ + -F "image=@original_image.png" \ + -F "prompt=Add neon lights in the background" \ + -F "size=1024x1024" +``` + +4. Imagen image edit with mask: +```bash showLineNumbers title="Vertex AI Imagen Proxy Image Edit with Mask" +curl -X POST "http://0.0.0.0:4000/v1/images/edits" \ + -H "Authorization: Bearer " \ + -F "model=vertex-imagen-image-edit" \ + -F "image=@original_image.png" \ + -F "mask=@mask_image.png" \ + -F "prompt=Turn this into watercolor style scenery" \ + -F "n=2" \ + -F "size=1024x1024" +``` + diff --git a/docs/my-website/docs/image_generation.md b/docs/my-website/docs/image_generation.md index b4eaef365214..7f27f48f9101 100644 --- a/docs/my-website/docs/image_generation.md +++ b/docs/my-website/docs/image_generation.md @@ -15,7 +15,7 @@ import TabItem from '@theme/TabItem'; | Fallbacks | ✅ | Works between supported models | | Loadbalancing | ✅ | Works between supported models | | Guardrails | ✅ | Applies to input prompts (non-streaming only) | -| Supported Providers | OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock, Recraft, Xinference, Nscale | | +| Supported Providers | OpenAI, Azure, Google AI Studio, Vertex AI, AWS Bedrock, Recraft, OpenRouter, Xinference, Nscale | | ## Quick Start @@ -238,6 +238,27 @@ print(response) See Recraft usage with LiteLLM [here](./providers/recraft.md#image-generation) +## OpenRouter Image Generation Models + +Use this for image generation models available through OpenRouter (e.g., Google Gemini image generation models) + +#### Usage + +```python showLineNumbers +from litellm import image_generation +import os + +os.environ['OPENROUTER_API_KEY'] = "your-api-key" + +response = image_generation( + model="openrouter/google/gemini-2.5-flash-image", + prompt="A beautiful sunset over a calm ocean", + size="1024x1024", + quality="high", +) +print(response) +``` + ## OpenAI Compatible Image Generation Models Use this for calling `/image_generation` endpoints on OpenAI Compatible Servers, example https://github.com/xorbitsai/inference @@ -301,5 +322,6 @@ print(f"response: {response}") | Vertex AI | [Vertex AI Image Generation →](./providers/vertex_image) | | AWS Bedrock | [Bedrock Image Generation →](./providers/bedrock) | | Recraft | [Recraft Image Generation →](./providers/recraft#image-generation) | +| OpenRouter | [OpenRouter Image Generation →](./providers/openrouter#image-generation) | | Xinference | [Xinference Image Generation →](./providers/xinference#image-generation) | | Nscale | [Nscale Image Generation 
→](./providers/nscale#image-generation) | \ No newline at end of file diff --git a/docs/my-website/docs/index.md b/docs/my-website/docs/index.md index 11d2963b7a32..ba605e316d3c 100644 --- a/docs/my-website/docs/index.md +++ b/docs/my-website/docs/index.md @@ -7,42 +7,42 @@ https://github.com/BerriAI/litellm ## **Call 100+ LLMs using the OpenAI Input/Output Format** -- Translate inputs to provider's `completion`, `embedding`, and `image_generation` endpoints -- [Consistent output](https://docs.litellm.ai/docs/completion/output), text responses will always be available at `['choices'][0]['message']['content']` +- Translate inputs to provider's endpoints (`/chat/completions`, `/responses`, `/embeddings`, `/images`, `/audio`, `/batches`, and more) +- [Consistent output](https://docs.litellm.ai/docs/supported_endpoints) - same response format regardless of which provider you use - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) - Track spend & set budgets per project [LiteLLM Proxy Server](https://docs.litellm.ai/docs/simple_proxy) ## How to use LiteLLM -You can use litellm through either: -1. [LiteLLM Proxy Server](#litellm-proxy-server-llm-gateway) - Server (LLM Gateway) to call 100+ LLMs, load balance, cost tracking across projects -2. [LiteLLM python SDK](#basic-usage) - Python Client to call 100+ LLMs, load balance, cost tracking -### **When to use LiteLLM Proxy Server (LLM Gateway)** +You can use LiteLLM through either the Proxy Server or Python SDK. Both gives you a unified interface to access multiple LLMs (100+ LLMs). Choose the option that best fits your needs: + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| | LiteLLM Proxy Server | LiteLLM Python SDK |
+|---|---|---|
+| **Use Case** | Central service (LLM Gateway) to access multiple LLMs | Use LiteLLM directly in your Python code |
+| **Who Uses It?** | Gen AI Enablement / ML Platform Teams | Developers building LLM projects |
+| **Key Features** | • Centralized API gateway with authentication & authorization <br/> • Multi-tenant cost tracking and spend management per project/user <br/> • Per-project customization (logging, guardrails, caching) <br/> • Virtual keys for secure access control <br/> • Admin dashboard UI for monitoring and management | • Direct Python library integration in your codebase <br/> • Router with retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) <br/> • Application-level load balancing and cost tracking <br/> • Exception handling with OpenAI-compatible errors <br/> • Observability callbacks (Lunary, MLflow, Langfuse, etc.) |
-:::tip - -Use LiteLLM Proxy Server if you want a **central service (LLM Gateway) to access multiple LLMs** - -Typically used by Gen AI Enablement / ML PLatform Teams - -::: - - - LiteLLM Proxy gives you a unified interface to access multiple LLMs (100+ LLMs) - - Track LLM Usage and setup guardrails - - Customize Logging, Guardrails, Caching per project - -### **When to use LiteLLM Python SDK** - -:::tip - - Use LiteLLM Python SDK if you want to use LiteLLM in your **python code** - -Typically used by developers building llm projects - -::: - - - LiteLLM SDK gives you a unified interface to access multiple LLMs (100+ LLMs) - - Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing) ## **LiteLLM Python SDK** @@ -245,7 +245,7 @@ response = completion( -### Response Format (OpenAI Format) +### Response Format (OpenAI Chat Completions Format) ```json { @@ -514,15 +514,22 @@ response = completion( LiteLLM maps exceptions across all supported providers to the OpenAI exceptions. All our exceptions inherit from OpenAI's exception types, so any error-handling you have for that, should work out of the box with LiteLLM. ```python -from openai.error import OpenAIError +import litellm from litellm import completion +import os os.environ["ANTHROPIC_API_KEY"] = "bad-key" try: - # some code - completion(model="claude-instant-1", messages=[{"role": "user", "content": "Hey, how's it going?"}]) -except OpenAIError as e: - print(e) + completion(model="anthropic/claude-instant-1", messages=[{"role": "user", "content": "Hey, how's it going?"}]) +except litellm.AuthenticationError as e: + # Thrown when the API key is invalid + print(f"Authentication failed: {e}") +except litellm.RateLimitError as e: + # Thrown when you've exceeded your rate limit + print(f"Rate limited: {e}") +except litellm.APIError as e: + # Thrown for general API errors + print(f"API error: {e}") ``` ### See How LiteLLM Transforms Your Requests @@ -650,7 +657,7 @@ docker run \ -e AZURE_API_KEY=d6*********** \ -e AZURE_API_BASE=https://openai-***********/ \ -p 4000:4000 \ - ghcr.io/berriai/litellm:main-latest \ + docker.litellm.ai/berriai/litellm:main-latest \ --config /app/config.yaml --detailed_debug ``` diff --git a/docs/my-website/docs/integrations/community.md b/docs/my-website/docs/integrations/community.md new file mode 100644 index 000000000000..76a8403e9459 --- /dev/null +++ b/docs/my-website/docs/integrations/community.md @@ -0,0 +1,30 @@ +# Be an Integration Partner + +Welcome, integration partners! 👋 + +We're excited to have you contribute to LiteLLM. To get started and connect with the LiteLLM community: + +## Get Support & Connect + +**Fill out our support form to join the community:** + +👉 [**https://www.litellm.ai/support**](https://www.litellm.ai/support) + +By filling out this form, you'll be able to: +- Join our **OSS Slack community** for real-time discussions +- Get help and feedback on your integration +- Connect with other developers and contributors +- Stay updated on the latest LiteLLM developments + +## What We Offer Integration Partners + +- **Direct support** from the LiteLLM team +- **Feedback** on your integration implementation +- **Collaboration** with a growing community of LLM developers +- **Visibility** for your integration in our documentation + +## Questions? + +Once you've joined our Slack community, head over to the **`#integration-partners`** channel to introduce yourself and ask questions. 
Our team and community members are happy to help you build great integrations with LiteLLM. + +We look forward to working with you! 🚀 diff --git a/docs/my-website/docs/interactions.md b/docs/my-website/docs/interactions.md new file mode 100644 index 000000000000..32c82a1589c8 --- /dev/null +++ b/docs/my-website/docs/interactions.md @@ -0,0 +1,269 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# /interactions + +| Feature | Supported | Notes | +|---------|-----------|-------| +| Logging | ✅ | Works across all integrations | +| Streaming | ✅ | | +| Loadbalancing | ✅ | Between supported models | +| Supported LLM providers | **All LiteLLM supported CHAT COMPLETION providers** | `openai`, `anthropic`, `bedrock`, `vertex_ai`, `gemini`, `azure`, `azure_ai` etc. | + +## **LiteLLM Python SDK Usage** + +### Quick Start + +```python showLineNumbers title="Create Interaction" +from litellm import create_interaction +import os + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +response = create_interaction( + model="gemini/gemini-2.5-flash", + input="Tell me a short joke about programming." +) + +print(response.outputs[-1].text) +``` + +### Async Usage + +```python showLineNumbers title="Async Create Interaction" +from litellm import acreate_interaction +import os +import asyncio + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +async def main(): + response = await acreate_interaction( + model="gemini/gemini-2.5-flash", + input="Tell me a short joke about programming." + ) + print(response.outputs[-1].text) + +asyncio.run(main()) +``` + +### Streaming + +```python showLineNumbers title="Streaming Interaction" +from litellm import create_interaction +import os + +os.environ["GEMINI_API_KEY"] = "your-api-key" + +response = create_interaction( + model="gemini/gemini-2.5-flash", + input="Write a 3 paragraph story about a robot.", + stream=True +) + +for chunk in response: + print(chunk) +``` + +## **LiteLLM AI Gateway (Proxy) Usage** + +### Setup + +Add this to your litellm proxy config.yaml: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: gemini-flash + litellm_params: + model: gemini/gemini-2.5-flash + api_key: os.environ/GEMINI_API_KEY +``` + +Start litellm: + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +### Test Request + + + + +```bash showLineNumbers title="Create Interaction" +curl -X POST "http://localhost:4000/v1beta/interactions" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemini/gemini-2.5-flash", + "input": "Tell me a short joke about programming." 
+ }' +``` + +**Streaming:** + +```bash showLineNumbers title="Streaming Interaction" +curl -N -X POST "http://localhost:4000/v1beta/interactions" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemini/gemini-2.5-flash", + "input": "Write a 3 paragraph story about a robot.", + "stream": true + }' +``` + +**Get Interaction:** + +```bash showLineNumbers title="Get Interaction by ID" +curl "http://localhost:4000/v1beta/interactions/{interaction_id}" \ + -H "Authorization: Bearer sk-1234" +``` + + + + + +Point the Google GenAI SDK to LiteLLM Proxy: + +```python showLineNumbers title="Google GenAI SDK with LiteLLM Proxy" +from google import genai +import os + +# Point SDK to LiteLLM Proxy +os.environ["GOOGLE_GENAI_BASE_URL"] = "http://localhost:4000" +os.environ["GEMINI_API_KEY"] = "sk-1234" # Your LiteLLM API key + +client = genai.Client() + +# Create an interaction +interaction = client.interactions.create( + model="gemini/gemini-2.5-flash", + input="Tell me a short joke about programming." +) + +print(interaction.outputs[-1].text) +``` + +**Streaming:** + +```python showLineNumbers title="Google GenAI SDK Streaming" +from google import genai +import os + +os.environ["GOOGLE_GENAI_BASE_URL"] = "http://localhost:4000" +os.environ["GEMINI_API_KEY"] = "sk-1234" + +client = genai.Client() + +for chunk in client.interactions.create_stream( + model="gemini/gemini-2.5-flash", + input="Write a story about space exploration.", +): + print(chunk) +``` + + + + +## **Request/Response Format** + +### Request Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Model to use (e.g., `gemini/gemini-2.5-flash`) | +| `input` | string | Yes | The input text for the interaction | +| `stream` | boolean | No | Enable streaming responses | +| `tools` | array | No | Tools available to the model | +| `system_instruction` | string | No | System instructions for the model | +| `generation_config` | object | No | Generation configuration | +| `previous_interaction_id` | string | No | ID of previous interaction for context | + +### Response Format + +```json +{ + "id": "interaction_abc123", + "object": "interaction", + "model": "gemini-2.5-flash", + "status": "completed", + "created": "2025-01-15T10:30:00Z", + "updated": "2025-01-15T10:30:05Z", + "role": "model", + "outputs": [ + { + "type": "text", + "text": "Why do programmers prefer dark mode? Because light attracts bugs!" + } + ], + "usage": { + "total_input_tokens": 10, + "total_output_tokens": 15, + "total_tokens": 25 + } +} +``` + +## **Calling non-Interactions API endpoints (`/interactions` to `/responses` Bridge)** + +LiteLLM allows you to call non-Interactions API models via a bridge to LiteLLM's `/responses` endpoint. This is useful for calling OpenAI, Anthropic, and other providers that don't natively support the Interactions API. + +#### Python SDK Usage + +```python showLineNumbers title="SDK Usage" +import litellm +import os + +# Set API key +os.environ["OPENAI_API_KEY"] = "your-openai-api-key" + +# Non-streaming interaction +response = litellm.interactions.create( + model="gpt-4o", + input="Tell me a short joke about programming." 
+) + +print(response.outputs[-1].text) +``` + +#### LiteLLM Proxy Usage + +**Setup Config:** + +```yaml showLineNumbers title="Example Configuration" +model_list: +- model_name: openai-model + litellm_params: + model: gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + +**Start Proxy:** + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +**Make Request:** + +```bash showLineNumbers title="non-Interactions API Model Request" +curl http://localhost:4000/v1beta/interactions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "openai-model", + "input": "Tell me a short joke about programming." + }' +``` + +## **Supported Providers** + +| Provider | Link to Usage | +|----------|---------------| +| Google AI Studio | [Usage](#quick-start) | +| All other LiteLLM providers | [Bridge Usage](#calling-non-interactions-api-endpoints-interactions-to-responses-bridge) | diff --git a/docs/my-website/docs/mcp.md b/docs/my-website/docs/mcp.md index 10405c1b493b..d63b55ee29e1 100644 --- a/docs/my-website/docs/mcp.md +++ b/docs/my-website/docs/mcp.md @@ -17,10 +17,15 @@ LiteLLM Proxy provides an MCP Gateway that allows you to use a fixed endpoint fo ## Overview | Feature | Description | |---------|-------------| -| MCP Operations | • List Tools
• Call Tools |
+| MCP Operations | • List Tools <br/> • Call Tools <br/> • Prompts <br/> • Resources |
| Supported MCP Transports | • Streamable HTTP <br/> • SSE <br/> • Standard Input/Output (stdio) |
| LiteLLM Permission Management | • By Key <br/> • By Team <br/> • By Organization |

+:::caution MCP protocol update
+Starting in LiteLLM v1.80.18, the LiteLLM MCP protocol version is `2025-11-25`.
+LiteLLM namespaces multiple MCP servers by prefixing each tool name with its MCP server name, so newly created servers now must use names that comply with SEP-986—noncompliant names cannot be added anymore. Existing servers that still violate SEP-986 only emit warnings today, but future MCP-side rollouts may block those names entirely, so we recommend updating any legacy server names proactively before MCP enforcement makes them unusable. +::: + ## Adding your MCP ### Prerequisites @@ -60,6 +65,8 @@ model_list: If `supported_db_objects` is not set, all object types are loaded from the database (default behavior). +For diagnosing connectivity problems after setup, see the [MCP Troubleshooting Guide](./mcp_troubleshoot.md). + @@ -107,6 +114,42 @@ For stdio MCP servers, select "Standard Input/Output (stdio)" as the transport t style={{width: '80%', display: 'block', margin: '0'}} /> +
+
+ +### OAuth Configuration & Overrides + +LiteLLM attempts [OAuth 2.0 Authorization Server Discovery](https://datatracker.ietf.org/doc/html/rfc8414) by default. When you create an MCP server in the UI and set `Authentication: OAuth`, LiteLLM will locate the provider metadata, dynamically register a client, and perform PKCE-based authorization without you providing any additional details. + +**Customize the OAuth flow when needed:** + + + +- **Provide explicit client credentials** – If the MCP provider does not offer dynamic client registration or you prefer to manage the client yourself, fill in `client_id`, `client_secret`, and the desired `scopes`. +- **Override discovery URLs** – In some environments, LiteLLM might not be able to reach the provider's metadata endpoints. Use the optional `authorization_url`, `token_url`, and `registration_url` fields to point LiteLLM directly to the correct endpoints. + +
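If you need either override, the same fields can also be set in `config.yaml`. A minimal sketch, where the server name and URLs are placeholders and every field shown is an optional override:

```yaml title="config.yaml" showLineNumbers
mcp_servers:
  my_oauth_mcp:
    url: "https://my-mcp-server.com/mcp"
    auth_type: oauth2
    # Explicit client credentials, if the provider lacks dynamic client registration
    client_id: os.environ/MY_MCP_CLIENT_ID
    client_secret: os.environ/MY_MCP_CLIENT_SECRET
    scopes: ["tool.read", "tool.write"]
    # Discovery overrides, if LiteLLM cannot reach the provider's metadata endpoints
    authorization_url: "https://my-mcp-server.com/oauth/authorize"
    token_url: "https://my-mcp-server.com/oauth/token"
    registration_url: "https://my-mcp-server.com/oauth/register"
```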
+ +### Static Headers + +Sometimes your MCP server needs specific headers on every request. Maybe it's an API key, maybe it's a custom header the server expects. Instead of configuring auth, you can just set them directly. + + + +These headers get sent with every request to the server. That's it. + + +**When to use this:** +- Your server needs custom headers that don't fit the standard auth patterns +- You want full control over exactly what headers are sent +- You're debugging and need to quickly add headers without changing auth configuration +
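A minimal `config.yaml` sketch of the same idea (the header names and values here are placeholders; the full version also appears in the auth examples further down):

```yaml title="config.yaml" showLineNumbers
mcp_servers:
  my_mcp_server:
    url: "https://my-mcp-server.com/mcp"
    static_headers:        # sent with every request to this MCP server
      X-API-Key: "abc123"
      X-Custom-Header: "some-value"
```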
@@ -162,6 +205,7 @@ mcp_servers: - `http` - Streamable HTTP transport - `stdio` - Standard Input/Output transport - **Command**: The command to execute for stdio transport (required for stdio) +- **allow_all_keys**: Set to `true` to make the server available to every LiteLLM API key, even if the key/team doesn't list the server in its MCP permissions. - **Args**: Array of arguments to pass to the command (optional for stdio) - **Env**: Environment variables to set for the stdio process (optional for stdio) - **Description**: Optional description for the server @@ -175,6 +219,7 @@ mcp_servers: | `authorization` | `Authorization: ` | - **Extra Headers**: Optional list of additional header names that should be forwarded from client to the MCP server +- **Static Headers**: Optional map of header key/value pairs to include every request to the MCP server. - **Spec Version**: Optional MCP specification version (defaults to `2025-06-18`) Examples for each auth type: @@ -190,11 +235,12 @@ mcp_servers: oauth2_example: url: "https://my-mcp-server.com/mcp" auth_type: "oauth2" # 👈 KEY CHANGE - authorization_url: "https://my-mcp-server.com/oauth/authorize" # optional for client-credentials - token_url: "https://my-mcp-server.com/oauth/token" # required + authorization_url: "https://my-mcp-server.com/oauth/authorize" # optional override + token_url: "https://my-mcp-server.com/oauth/token" # optional override + registration_url: "https://my-mcp-server.com/oauth/register" # optional override client_id: os.environ/OAUTH_CLIENT_ID client_secret: os.environ/OAUTH_CLIENT_SECRET - scopes: ["tool.read", "tool.write"] # optional + scopes: ["tool.read", "tool.write"] # optional override bearer_example: url: "https://my-mcp-server.com/mcp" @@ -217,27 +263,49 @@ mcp_servers: auth_type: "bearer_token" auth_value: "ghp_example_token" extra_headers: ["custom_key", "x-custom-header"] # These headers will be forwarded from client -``` - -### Static Headers - -Sometimes your MCP server needs specific headers on every request. Maybe it's an API key, maybe it's a custom header the server expects. Instead of configuring auth, you can just set them directly. -```yaml title="config.yaml" showLineNumbers -mcp_servers: + # Example with static headers my_mcp_server: url: "https://my-mcp-server.com/mcp" - static_headers: + static_headers: # These headers will be requested to the MCP server X-API-Key: "abc123" X-Custom-Header: "some-value" ``` -These headers get sent with every request to the server. That's it. +### MCP Walkthroughs + +- **Strands (STDIO)** – [watch tutorial](https://screen.studio/share/ruv4D73F) + +> Add it from the UI + +```json title="strands-mcp" showLineNumbers +{ + "mcpServers": { + "strands-agents": { + "command": "uvx", + "args": ["strands-agents-mcp-server"], + "env": { + "FASTMCP_LOG_LEVEL": "INFO" + }, + "disabled": false, + "autoApprove": ["search_docs", "fetch_doc"] + } + } +} +``` + +> config.yml + +```yaml title="config.yml – strands MCP" showLineNumbers +mcp_servers: + strands_mcp: + transport: "stdio" + command: "uvx" + args: ["strands-agents-mcp-server"] + env: + FASTMCP_LOG_LEVEL: "INFO" +``` -**When to use this:** -- Your server needs custom headers that don't fit the standard auth patterns -- You want full control over exactly what headers are sent -- You're debugging and need to quickly add headers without changing auth configuration ### MCP Aliases @@ -265,18 +333,19 @@ litellm_settings:
+ ## Converting OpenAPI Specs to MCP Servers LiteLLM can automatically convert OpenAPI specifications into MCP servers, allowing you to expose any REST API as MCP tools. This is useful when you have existing APIs with OpenAPI/Swagger documentation and want to make them available as MCP tools. -### Benefits +**Benefits:** - **Rapid Integration**: Convert existing APIs to MCP tools without writing custom MCP server code - **Automatic Tool Generation**: LiteLLM automatically generates MCP tools from your OpenAPI spec - **Unified Interface**: Use the same MCP interface for both native MCP servers and OpenAPI-based APIs - **Easy Testing**: Test and iterate on API integrations quickly -### Configuration +**Configuration:** Add your OpenAPI-based MCP server to your `config.yaml`: @@ -309,7 +378,7 @@ mcp_servers: auth_value: "your-bearer-token" ``` -### Configuration Parameters +**Configuration Parameters:** | Parameter | Required | Description | |-----------|----------|-------------| @@ -317,6 +386,10 @@ mcp_servers: | `spec_path` | Yes | Path or URL to your OpenAPI specification file (JSON or YAML) | | `auth_type` | No | Authentication type: `none`, `api_key`, `bearer_token`, `basic`, `authorization` | | `auth_value` | No | Authentication value (required if `auth_type` is set) | +| `authorization_url` | No | For `auth_type: oauth2`. Optional override; if omitted LiteLLM auto-discovers it. | +| `token_url` | No | For `auth_type: oauth2`. Optional override; if omitted LiteLLM auto-discovers it. | +| `registration_url` | No | For `auth_type: oauth2`. Optional override; if omitted LiteLLM auto-discovers it. | +| `scopes` | No | For `auth_type: oauth2`. Optional override; if omitted LiteLLM uses the scopes advertised by the server. | | `description` | No | Optional description for the MCP server | | `allowed_tools` | No | List of specific tools to allow (see [MCP Tool Filtering](#mcp-tool-filtering)) | | `disallowed_tools` | No | List of specific tools to block (see [MCP Tool Filtering](#mcp-tool-filtering)) | @@ -417,7 +490,7 @@ curl --location 'https://api.openai.com/v1/responses' \ -### How It Works +**How It Works** 1. **Spec Loading**: LiteLLM loads your OpenAPI specification from the provided `spec_path` 2. **Tool Generation**: Each API endpoint in the spec becomes an MCP tool @@ -425,7 +498,7 @@ curl --location 'https://api.openai.com/v1/responses' \ 4. **Request Handling**: When a tool is called, LiteLLM converts the MCP request to the appropriate HTTP request 5. 
**Response Translation**: API responses are converted back to MCP format -### OpenAPI Spec Requirements +**OpenAPI Spec Requirements** Your OpenAPI specification should follow standard OpenAPI/Swagger conventions: - **Supported versions**: OpenAPI 3.0.x, OpenAPI 3.1.x, Swagger 2.0 @@ -433,585 +506,94 @@ Your OpenAPI specification should follow standard OpenAPI/Swagger conventions: - **Operation IDs**: Each operation should have a unique `operationId` (this becomes the tool name) - **Parameters**: Request parameters should be properly documented with types and descriptions -### Example OpenAPI Spec Structure - -```yaml title="sample-openapi.yaml" showLineNumbers -openapi: 3.0.0 -info: - title: My API - version: 1.0.0 -paths: - /pets/{petId}: - get: - operationId: getPetById - summary: Get a pet by ID - parameters: - - name: petId - in: path - required: true - schema: - type: integer - responses: - '200': - description: Successful response - content: - application/json: - schema: - type: object -``` - -## Allow/Disallow MCP Tools - -Control which tools are available from your MCP servers. You can either allow only specific tools or block dangerous ones. - - - - -Use `allowed_tools` to specify exactly which tools users can access. All other tools will be blocked. - -```yaml title="config.yaml" showLineNumbers -mcp_servers: - github_mcp: - url: "https://api.githubcopilot.com/mcp" - auth_type: oauth2 - authorization_url: https://github.com/login/oauth/authorize - token_url: https://github.com/login/oauth/access_token - client_id: os.environ/GITHUB_OAUTH_CLIENT_ID - client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET - scopes: ["public_repo", "user:email"] - allowed_tools: ["list_tools"] - # only list_tools will be available -``` - -**Use this when:** -- You want strict control over which tools are available -- You're in a high-security environment -- You're testing a new MCP server with limited tools +## MCP Oauth - - +LiteLLM v 1.77.6 added support for OAuth 2.0 Client Credentials for MCP servers. -Use `disallowed_tools` to block specific tools. All other tools will be available. +You can configure this either in `config.yaml` or directly from the LiteLLM UI (MCP Servers → Authentication → OAuth). -```yaml title="config.yaml" showLineNumbers +```yaml mcp_servers: github_mcp: url: "https://api.githubcopilot.com/mcp" auth_type: oauth2 - authorization_url: https://github.com/login/oauth/authorize - token_url: https://github.com/login/oauth/access_token client_id: os.environ/GITHUB_OAUTH_CLIENT_ID client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET - scopes: ["public_repo", "user:email"] - disallowed_tools: ["repo_delete"] - # only repo_delete will be blocked ``` -**Use this when:** -- Most tools are safe, but you want to block a few dangerous ones -- You want to prevent expensive API calls -- You're gradually adding restrictions to an existing server - - - - -### Important Notes - -- If you specify both `allowed_tools` and `disallowed_tools`, the allowed list takes priority -- Tool names are case-sensitive - ---- - -## Allow/Disallow MCP Tool Parameters - -Control which parameters are allowed for specific MCP tools using the `allowed_params` configuration. This provides fine-grained control over tool usage by restricting the parameters that can be passed to each tool. - -### Configuration - -`allowed_params` is a dictionary that maps tool names to lists of allowed parameter names. 
When configured, only the specified parameters will be accepted for that tool - any other parameters will be rejected with a 403 error. - -```yaml title="config.yaml with allowed_params" showLineNumbers -mcp_servers: - deepwiki_mcp: - url: https://mcp.deepwiki.com/mcp - transport: "http" - auth_type: "none" - allowed_params: - # Tool name: list of allowed parameters - read_wiki_contents: ["status"] - - my_api_mcp: - url: "https://my-api-server.com" - auth_type: "api_key" - auth_value: "my-key" - allowed_params: - # Using unprefixed tool name - getpetbyid: ["status"] - # Using prefixed tool name (both formats work) - my_api_mcp-findpetsbystatus: ["status", "limit"] - # Another tool with multiple allowed params - create_issue: ["title", "body", "labels"] -``` +[**See Claude Code Tutorial**](./tutorials/claude_responses_api#connecting-mcp-servers) ### How It Works -1. **Tool-specific filtering**: Each tool can have its own list of allowed parameters -2. **Flexible naming**: Tool names can be specified with or without the server prefix (e.g., both `"getpetbyid"` and `"my_api_mcp-getpetbyid"` work) -3. **Whitelist approach**: Only parameters in the allowed list are permitted -4. **Unlisted tools**: If `allowed_params` is not set, all parameters are allowed -5. **Error handling**: Requests with disallowed parameters receive a 403 error with details about which parameters are allowed - -### Example Request Behavior - -With the configuration above, here's how requests would be handled: - -**✅ Allowed Request:** -```json -{ - "tool": "read_wiki_contents", - "arguments": { - "status": "active" - } -} -``` - -**❌ Rejected Request:** -```json -{ - "tool": "read_wiki_contents", - "arguments": { - "status": "active", - "limit": 10 // This parameter is not allowed - } -} -``` - -**Error Response:** -```json -{ - "error": "Parameters ['limit'] are not allowed for tool read_wiki_contents. Allowed parameters: ['status']. Contact proxy admin to allow these parameters." -} -``` - -### Use Cases - -- **Security**: Prevent users from accessing sensitive parameters or dangerous operations -- **Cost control**: Restrict expensive parameters (e.g., limiting result counts) -- **Compliance**: Enforce parameter usage policies for regulatory requirements -- **Staged rollouts**: Gradually enable parameters as tools are tested -- **Multi-tenant isolation**: Different parameter access for different user groups - -### Combining with Tool Filtering - -`allowed_params` works alongside `allowed_tools` and `disallowed_tools` for complete control: - -```yaml title="Combined filtering example" showLineNumbers -mcp_servers: - github_mcp: - url: "https://api.githubcopilot.com/mcp" - auth_type: oauth2 - authorization_url: https://github.com/login/oauth/authorize - token_url: https://github.com/login/oauth/access_token - client_id: os.environ/GITHUB_OAUTH_CLIENT_ID - client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET - scopes: ["public_repo", "user:email"] - # Only allow specific tools - allowed_tools: ["create_issue", "list_issues", "search_issues"] - # Block dangerous operations - disallowed_tools: ["delete_repo"] - # Restrict parameters per tool - allowed_params: - create_issue: ["title", "body", "labels"] - list_issues: ["state", "sort", "perPage"] - search_issues: ["query", "sort", "order", "perPage"] -``` - -This configuration ensures that: -1. Only the three listed tools are available -2. The `delete_repo` tool is explicitly blocked -3. 
Each tool can only use its specified parameters - ---- - -## MCP Server Access Control - -LiteLLM Proxy provides two methods for controlling access to specific MCP servers: - -1. **URL-based Namespacing** - Use URL paths to directly access specific servers or access groups -2. **Header-based Namespacing** - Use the `x-mcp-servers` header to specify which servers to access - ---- - -### Method 1: URL-based Namespacing - -LiteLLM Proxy supports URL-based namespacing for MCP servers using the format `/mcp/`. This allows you to: - -- **Direct URL Access**: Point MCP clients directly to specific servers or access groups via URL -- **Simplified Configuration**: Use URLs instead of headers for server selection -- **Access Group Support**: Use access group names in URLs for grouped server access - -#### URL Format - -``` -/mcp/ -``` - -**Examples:** -- `/mcp/github` - Access tools from the "github" MCP server -- `/mcp/zapier` - Access tools from the "zapier" MCP server -- `/mcp/dev_group` - Access tools from all servers in the "dev_group" access group -- `/mcp/github,zapier` - Access tools from multiple specific servers - -#### Usage Examples - - - - -```bash title="cURL Example with URL Namespacing" showLineNumbers -curl --location 'https://api.openai.com/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $OPENAI_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "/mcp/github", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -This example uses URL namespacing to access only the "github" MCP server. - - - - - -```bash title="cURL Example with URL Namespacing" showLineNumbers -curl --location '/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $LITELLM_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "/mcp/dev_group", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -This example uses URL namespacing to access all servers in the "dev_group" access group. - - - - - -```json title="Cursor MCP Configuration with URL Namespacing" showLineNumbers -{ - "mcpServers": { - "LiteLLM": { - "url": "/mcp/github,zapier", - "headers": { - "x-litellm-api-key": "Bearer $LITELLM_API_KEY" - } - } - } -} -``` - -This configuration uses URL namespacing to access tools from both "github" and "zapier" MCP servers. - - - - -#### Benefits of URL Namespacing - -- **Direct Access**: No need for additional headers to specify servers -- **Clean URLs**: Self-documenting URLs that clearly indicate which servers are accessible -- **Access Group Support**: Use access group names for grouped server access -- **Multiple Servers**: Specify multiple servers in a single URL with comma separation -- **Simplified Configuration**: Easier setup for MCP clients that prefer URL-based configuration - ---- - -### Method 2: Header-based Namespacing - -You can choose to access specific MCP servers and only list their tools using the `x-mcp-servers` header. 
This header allows you to: -- Limit tool access to one or more specific MCP servers -- Control which tools are available in different environments or use cases - -The header accepts a comma-separated list of server aliases: `"alias_1,Server2,Server3"` - -**Notes:** -- If the header is not provided, tools from all available MCP servers will be accessible -- This method works with the standard LiteLLM MCP endpoint - - - - -```bash title="cURL Example with Header Namespacing" showLineNumbers -curl --location 'https://api.openai.com/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $OPENAI_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "/mcp/", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", - "x-mcp-servers": "alias_1" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -In this example, the request will only have access to tools from the "alias_1" MCP server. - - - - - -```bash title="cURL Example with Header Namespacing" showLineNumbers -curl --location '/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $LITELLM_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "/mcp/", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", - "x-mcp-servers": "alias_1,Server2" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -This configuration restricts the request to only use tools from the specified MCP servers. - - - - - -```json title="Cursor MCP Configuration with Header Namespacing" showLineNumbers -{ - "mcpServers": { - "LiteLLM": { - "url": "/mcp/", - "headers": { - "x-litellm-api-key": "Bearer $LITELLM_API_KEY", - "x-mcp-servers": "alias_1,Server2" - } - } - } -} +```mermaid +sequenceDiagram + participant Browser as User-Agent (Browser) + participant Client as Client + participant LiteLLM as LiteLLM Proxy + participant MCP as MCP Server (Resource Server) + participant Auth as Authorization Server + + Note over Client,LiteLLM: Step 1 – Resource discovery + Client->>LiteLLM: GET /.well-known/oauth-protected-resource/{mcp_server_name}/mcp + LiteLLM->>Client: Return resource metadata + + Note over Client,LiteLLM: Step 2 – Authorization server discovery + Client->>LiteLLM: GET /.well-known/oauth-authorization-server/{mcp_server_name} + LiteLLM->>Client: Return authorization server metadata + + Note over Client,Auth: Step 3 – Dynamic client registration + Client->>LiteLLM: POST /{mcp_server_name}/register + LiteLLM->>Auth: Forward registration request + Auth->>LiteLLM: Issue client credentials + LiteLLM->>Client: Return client credentials + + Note over Client,Browser: Step 4 – User authorization (PKCE) + Client->>Browser: Open authorization URL + code_challenge + resource + Browser->>Auth: Authorization request + Note over Auth: User authorizes + Auth->>Browser: Redirect with authorization code + Browser->>LiteLLM: Callback to LiteLLM with code + LiteLLM->>Browser: Redirect back with authorization code + Browser->>Client: Callback with authorization code + + Note over Client,Auth: Step 5 – Token exchange + Client->>LiteLLM: Token request + code_verifier + resource + LiteLLM->>Auth: Forward token request + Auth->>LiteLLM: Access (and refresh) token + LiteLLM->>Client: Return tokens + + Note over Client,MCP: 
Step 6 – Authenticated MCP call + Client->>LiteLLM: MCP request with access token + LiteLLM API key + LiteLLM->>MCP: MCP request with Bearer token + MCP-->>LiteLLM: MCP response + LiteLLM-->>Client: Return MCP response ``` -This configuration in Cursor IDE settings will limit tool access to only the specified MCP servers. +**Participants** - - +- **Client** – The MCP-capable AI agent (e.g., Claude Code, Cursor, or another IDE/agent) that initiates OAuth discovery, authorization, and tool invocations on behalf of the user. +- **LiteLLM Proxy** – Mediates all OAuth discovery, registration, token exchange, and MCP traffic while protecting stored credentials. +- **Authorization Server** – Issues OAuth 2.0 tokens via dynamic client registration, PKCE authorization, and token endpoints. +- **MCP Server (Resource Server)** – The protected MCP endpoint that receives LiteLLM’s authenticated JSON-RPC requests. +- **User-Agent (Browser)** – Temporarily involved so the end user can grant consent during the authorization step. ---- +**Flow Steps** -### Comparison: Header vs URL Namespacing +1. **Resource Discovery**: The client fetches MCP resource metadata from LiteLLM’s `.well-known/oauth-protected-resource` endpoint to understand scopes and capabilities. +2. **Authorization Server Discovery**: The client retrieves the OAuth server metadata (token endpoint, authorization endpoint, supported PKCE methods) through LiteLLM’s `.well-known/oauth-authorization-server` endpoint. +3. **Dynamic Client Registration**: The client registers through LiteLLM, which forwards the request to the authorization server (RFC 7591). If the provider doesn’t support dynamic registration, you can pre-store `client_id`/`client_secret` in LiteLLM (e.g., GitHub MCP) and the flow proceeds the same way. +4. **User Authorization**: The client launches a browser session (with code challenge and resource hints). The user approves access, the authorization server sends the code through LiteLLM back to the client. +5. **Token Exchange**: The client calls LiteLLM with the authorization code, code verifier, and resource. LiteLLM exchanges them with the authorization server and returns the issued access/refresh tokens. +6. **MCP Invocation**: With a valid token, the client sends the MCP JSON-RPC request (plus LiteLLM API key) to LiteLLM, which forwards it to the MCP server and relays the tool response. 
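As a rough illustration of steps 1 and 2, the discovery endpoints from the diagram can be queried directly against the proxy. This sketch assumes a proxy running at `http://localhost:4000` and an MCP server registered as `github_mcp` (both illustrative):

```bash title="OAuth Discovery Endpoints" showLineNumbers
# Step 1 – resource metadata for the MCP server
curl http://localhost:4000/.well-known/oauth-protected-resource/github_mcp/mcp

# Step 2 – authorization server metadata (authorization/token endpoints, PKCE support)
curl http://localhost:4000/.well-known/oauth-authorization-server/github_mcp
```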
-| Feature | Header Namespacing | URL Namespacing | -|---------|-------------------|-----------------| -| **Method** | Uses `x-mcp-servers` header | Uses URL path `/mcp/` | -| **Endpoint** | Standard `litellm_proxy` endpoint | Custom `/mcp/` endpoint | -| **Configuration** | Requires additional header | Self-contained in URL | -| **Multiple Servers** | Comma-separated in header | Comma-separated in URL path | -| **Access Groups** | Supported via header | Supported via URL path | -| **Client Support** | Works with all MCP clients | Works with URL-aware MCP clients | -| **Use Case** | Dynamic server selection | Fixed server configuration | - - - - -```bash title="cURL Example with Server Segregation" showLineNumbers -curl --location 'https://api.openai.com/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $OPENAI_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "/mcp/", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", - "x-mcp-servers": "alias_1" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -In this example, the request will only have access to tools from the "alias_1" MCP server. - - - - - -```bash title="cURL Example with Server Segregation" showLineNumbers -curl --location '/v1/responses' \ ---header 'Content-Type: application/json' \ ---header "Authorization: Bearer $LITELLM_API_KEY" \ ---data '{ - "model": "gpt-4o", - "tools": [ - { - "type": "mcp", - "server_label": "litellm", - "server_url": "litellm_proxy", - "require_approval": "never", - "headers": { - "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", - "x-mcp-servers": "alias_1,Server2" - } - } - ], - "input": "Run available tools", - "tool_choice": "required" -}' -``` - -This configuration restricts the request to only use tools from the specified MCP servers. - - - - - -```json title="Cursor MCP Configuration with Server Segregation" showLineNumbers -{ - "mcpServers": { - "LiteLLM": { - "url": "litellm_proxy", - "headers": { - "x-litellm-api-key": "Bearer $LITELLM_API_KEY", - "x-mcp-servers": "alias_1,Server2" - } - } - } -} -``` - -This configuration in Cursor IDE settings will limit tool access to only the specified MCP server. - - - - -### Grouping MCPs (Access Groups) - -MCP Access Groups allow you to group multiple MCP servers together for easier management. - -#### 1. Create an Access Group - -##### A. Creating Access Groups using Config: - -```yaml title="Creating access groups for MCP using the config" showLineNumbers -mcp_servers: - "deepwiki_mcp": - url: https://mcp.deepwiki.com/mcp - transport: "http" - auth_type: "none" - access_groups: ["dev_group"] -``` - -While adding `mcp_servers` using the config: -- Pass in a list of strings inside `access_groups` -- These groups can then be used for segregating access using keys, teams and MCP clients using headers - -##### B. Creating Access Groups using UI - -To create an access group: -- Go to MCP Servers in the LiteLLM UI -- Click "Add a New MCP Server" -- Under "MCP Access Groups", create a new group (e.g., "dev_group") by typing it -- Add the same group name to other servers to group them together - - - -#### 2. 
Use Access Group in Cursor - -Include the access group name in the `x-mcp-servers` header: - -```json title="Cursor Configuration with Access Groups" showLineNumbers -{ - "mcpServers": { - "LiteLLM": { - "url": "litellm_proxy", - "headers": { - "x-litellm-api-key": "Bearer $LITELLM_API_KEY", - "x-mcp-servers": "dev_group" - } - } - } -} -``` - -This gives you access to all servers in the "dev_group" access group. -- Which means that if deepwiki server (and any other servers) which have the access group `dev_group` assigned to them will be available for tool calling - -#### Advanced: Connecting Access Groups to API Keys - -When creating API keys, you can assign them to specific access groups for permission management: - -- Go to "Keys" in the LiteLLM UI and click "Create Key" -- Select the desired MCP access groups from the dropdown -- The key will have access to all MCP servers in those groups -- This is reflected in the Test Key page - - +See the official [MCP Authorization Flow](https://modelcontextprotocol.io/specification/2025-06-18/basic/authorization#authorization-flow-steps) for additional reference. ## Forwarding Custom Headers to MCP Servers LiteLLM supports forwarding additional custom headers from MCP clients to backend MCP servers using the `extra_headers` configuration parameter. This allows you to pass custom authentication tokens, API keys, or other headers that your MCP server requires. -### Configuration +**Configuration** @@ -1097,7 +679,7 @@ if __name__ == "__main__": -### Client Usage +#### Client Usage When connecting from MCP clients, include the custom headers that match the `extra_headers` configuration: @@ -1182,52 +764,40 @@ curl --location 'http://localhost:4000/github_mcp/mcp' \ -### How It Works +#### How It Works 1. **Configuration**: Define `extra_headers` in your MCP server config with the header names you want to forward 2. **Client Headers**: Include the corresponding headers in your MCP client requests 3. **Header Forwarding**: LiteLLM automatically forwards matching headers to the backend MCP server 4. **Authentication**: The backend MCP server receives both the configured auth headers and the custom headers -### Use Cases -- **Custom Authentication**: Forward custom API keys or tokens required by specific MCP servers -- **Request Context**: Pass user identification, session data, or request tracking headers -- **Third-party Integration**: Include headers required by external services that your MCP server integrates with -- **Multi-tenant Systems**: Forward tenant-specific headers for proper request routing +### Passing Request Headers to STDIO env Vars -### Security Considerations +If your stdio MCP server needs per-request credentials, you can map HTTP headers from the client request directly into the environment for the launched stdio process. Reference the header name in the env value using the `${X-HEADER_NAME}` syntax. LiteLLM will read that header from the incoming request and set the env var before starting the command. -- Only headers listed in `extra_headers` are forwarded to maintain security -- Sensitive headers should be passed through environment variables when possible -- Consider using server-specific auth headers for better security isolation - ---- - -## MCP Oauth - -LiteLLM v 1.77.6 added support for OAuth 2.0 Client Credentials for MCP servers. - - -This configuration is currently available on the config.yaml, with UI support coming soon. 
- -```yaml -mcp_servers: - github_mcp: - url: "https://api.githubcopilot.com/mcp" - auth_type: oauth2 - authorization_url: https://github.com/login/oauth/authorize - token_url: https://github.com/login/oauth/access_token - client_id: os.environ/GITHUB_OAUTH_CLIENT_ID - client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET - scopes: ["public_repo", "user:email"] +```json title="Forward X-GITHUB_PERSONAL_ACCESS_TOKEN header to stdio env" showLineNumbers +{ + "mcpServers": { + "github": { + "command": "docker", + "args": [ + "run", + "-i", + "--rm", + "-e", + "GITHUB_PERSONAL_ACCESS_TOKEN", + "ghcr.io/github/github-mcp-server" + ], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "${X-GITHUB_PERSONAL_ACCESS_TOKEN}" + } + } + } +} ``` -**Note** -In the future, users will only need to specify the `url` of the MCP server. -LiteLLM will automatically resolve the corresponding `authorization_url`, `token_url`, and `registration_url` based on the MCP server metadata (e.g., `.well-known/oauth-authorization-server` or `oauth-protected-resource`). - -[**See Claude Code Tutorial**](./tutorials/claude_responses_api#connecting-mcp-servers) +In this example, when a client makes a request with the `X-GITHUB_PERSONAL_ACCESS_TOKEN` header, the proxy forwards that value into the stdio process as the `GITHUB_PERSONAL_ACCESS_TOKEN` environment variable. ## Using your MCP with client side credentials @@ -1617,6 +1187,37 @@ curl --location '/v1/responses' \ }' ``` +## Use MCP tools with `/chat/completions` + +:::tip Works with all providers +This flow is **provider-agnostic**: the same MCP tool definition works for _every_ LLM backend behind LiteLLM (OpenAI, Azure OpenAI, Anthropic, Amazon Bedrock, Vertex, self-hosted deployments, etc.). +::: + +LiteLLM Proxy also supports MCP-aware tooling on the classic `/v1/chat/completions` endpoint. Provide the MCP tool definition directly in the `tools` array and LiteLLM will fetch and transform the MCP server's tools into OpenAI-compatible function calls. When `require_approval` is set to `"never"`, the proxy automatically executes the returned tool calls and feeds the results back into the model before returning the assistant response. + +```bash title="Chat Completions with MCP Tools" showLineNumbers +curl --location '/v1/chat/completions' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $LITELLM_API_KEY" \ +--data '{ + "model": "gpt-4o-mini", + "messages": [ + {"role": "user", "content": "Summarize the latest open PR."} + ], + "tools": [ + { + "type": "mcp", + "server_url": "litellm_proxy/mcp/github", + "server_label": "github_mcp", + "require_approval": "never" + } + ] +}' +``` + +If you omit `require_approval` or set it to any value other than `"never"`, the MCP tool calls are returned to the client so that you can review and execute them manually, matching the upstream OpenAI behavior. + + ## LiteLLM Proxy - Walk through MCP Gateway LiteLLM exposes an MCP Gateway for admins to add all their MCP servers to LiteLLM. The key benefits of using LiteLLM Proxy with MCP are: @@ -1879,4 +1480,18 @@ async with stdio_client(server_params) as (read, write): ``` - \ No newline at end of file + + +## FAQ + +**Q: How do I use OAuth2 client_credentials (machine-to-machine) with MCP servers behind LiteLLM?** + +At the moment LiteLLM only forwards whatever `Authorization` header/value you configure for the MCP server; it does not issue OAuth2 tokens by itself. 
If your MCP requires the Client Credentials grant, obtain the access token directly from the authorization server and set that bearer token as the MCP server’s Authorization header value. LiteLLM does not yet fetch or refresh those machine-to-machine tokens on your behalf, but we plan to add first-class client_credentials support in a future release so the proxy can manage those tokens automatically. + +**Q: When I fetch an OAuth token from the LiteLLM UI, where is it stored?** + +The UI keeps only transient state in `sessionStorage` so the OAuth redirect flow can finish; the token is not persisted in the server or database. + +**Q: I'm seeing MCP connection errors—what should I check?** + +Walk through the [MCP Troubleshooting Guide](./mcp_troubleshoot.md) for step-by-step isolation (Client → LiteLLM vs. LiteLLM → MCP), log examples, and verification methods like MCP Inspector and `curl`. diff --git a/docs/my-website/docs/mcp_control.md b/docs/my-website/docs/mcp_control.md index 484cb13708cd..96c71ef9278f 100644 --- a/docs/my-website/docs/mcp_control.md +++ b/docs/my-website/docs/mcp_control.md @@ -13,6 +13,7 @@ LiteLLM provides fine-grained permission management for MCP servers, allowing yo - **Restrict MCP access by entity**: Control which keys, teams, or organizations can access specific MCP servers - **Tool-level filtering**: Automatically filter available tools based on entity permissions - **Centralized control**: Manage all MCP permissions from the LiteLLM Admin UI or API +- **One-click public MCPs**: Mark specific servers as available to every LiteLLM API key when you don't need per-key restrictions This ensures that only authorized entities can discover and use MCP tools, providing an additional security layer for your MCP infrastructure. @@ -35,6 +36,596 @@ When Creating a Key, Team, or Organization, you can select the allowed MCP Serve /> +## Allow/Disallow MCP Tools + +Control which tools are available from your MCP servers. You can either allow only specific tools or block dangerous ones. + + + + +Use `allowed_tools` to specify exactly which tools users can access. All other tools will be blocked. + +```yaml title="config.yaml" showLineNumbers +mcp_servers: + github_mcp: + url: "https://api.githubcopilot.com/mcp" + auth_type: oauth2 + authorization_url: https://github.com/login/oauth/authorize + token_url: https://github.com/login/oauth/access_token + client_id: os.environ/GITHUB_OAUTH_CLIENT_ID + client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET + scopes: ["public_repo", "user:email"] + allowed_tools: ["list_tools"] + # only list_tools will be available +``` + +**Use this when:** +- You want strict control over which tools are available +- You're in a high-security environment +- You're testing a new MCP server with limited tools + + + + +Use `disallowed_tools` to block specific tools. All other tools will be available. 
+ +```yaml title="config.yaml" showLineNumbers +mcp_servers: + github_mcp: + url: "https://api.githubcopilot.com/mcp" + auth_type: oauth2 + authorization_url: https://github.com/login/oauth/authorize + token_url: https://github.com/login/oauth/access_token + client_id: os.environ/GITHUB_OAUTH_CLIENT_ID + client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET + scopes: ["public_repo", "user:email"] + disallowed_tools: ["repo_delete"] + # only repo_delete will be blocked +``` + +**Use this when:** +- Most tools are safe, but you want to block a few dangerous ones +- You want to prevent expensive API calls +- You're gradually adding restrictions to an existing server + + + + +### Important Notes + +- If you specify both `allowed_tools` and `disallowed_tools`, the allowed list takes priority +- Tool names are case-sensitive + +## Public MCP Servers (allow_all_keys) + +Some MCP servers are meant to be shared broadly—think internal knowledge bases, calendar integrations, or other low-risk utilities where every team should be able to connect without requesting access. Instead of adding those servers to every key, team, or organization, enable the new `allow_all_keys` toggle. + + + + +1. Open **MCP Servers → Add / Edit** in the Admin UI. +2. Expand **Permission Management / Access Control**. +3. Toggle **Allow All LiteLLM Keys** on. + +MCP server configuration in Admin UI + +The toggle makes the server “public” without touching existing access groups. + + + + +Set `allow_all_keys: true` to mark the server as public: + +```yaml title="Make an MCP server public" showLineNumbers +mcp_servers: + deepwiki: + url: https://mcp.deepwiki.com/mcp + allow_all_keys: true +``` + + + + +### When to use it + +- You have shared MCP utilities where fine-grained ACLs would only add busywork. +- You want a “default enabled” experience for internal users, while still being able to layer tool-level restrictions. +- You’re onboarding new teams and want the safest MCPs available out of the box. + +Once enabled, LiteLLM automatically includes the server for every key during tool discovery/calls—no extra virtual-key or team configuration is required. + +--- + +## Allow/Disallow MCP Tool Parameters + +Control which parameters are allowed for specific MCP tools using the `allowed_params` configuration. This provides fine-grained control over tool usage by restricting the parameters that can be passed to each tool. + +### Configuration + +`allowed_params` is a dictionary that maps tool names to lists of allowed parameter names. When configured, only the specified parameters will be accepted for that tool - any other parameters will be rejected with a 403 error. + +```yaml title="config.yaml with allowed_params" showLineNumbers +mcp_servers: + deepwiki_mcp: + url: https://mcp.deepwiki.com/mcp + transport: "http" + auth_type: "none" + allowed_params: + # Tool name: list of allowed parameters + read_wiki_contents: ["status"] + + my_api_mcp: + url: "https://my-api-server.com" + auth_type: "api_key" + auth_value: "my-key" + allowed_params: + # Using unprefixed tool name + getpetbyid: ["status"] + # Using prefixed tool name (both formats work) + my_api_mcp-findpetsbystatus: ["status", "limit"] + # Another tool with multiple allowed params + create_issue: ["title", "body", "labels"] +``` + +### How It Works + +1. **Tool-specific filtering**: Each tool can have its own list of allowed parameters +2. 
**Flexible naming**: Tool names can be specified with or without the server prefix (e.g., both `"getpetbyid"` and `"my_api_mcp-getpetbyid"` work) +3. **Whitelist approach**: Only parameters in the allowed list are permitted +4. **Unlisted tools**: If `allowed_params` is not set, all parameters are allowed +5. **Error handling**: Requests with disallowed parameters receive a 403 error with details about which parameters are allowed + +### Example Request Behavior + +With the configuration above, here's how requests would be handled: + +**✅ Allowed Request:** +```json +{ + "tool": "read_wiki_contents", + "arguments": { + "status": "active" + } +} +``` + +**❌ Rejected Request:** +```json +{ + "tool": "read_wiki_contents", + "arguments": { + "status": "active", + "limit": 10 // This parameter is not allowed + } +} +``` + +**Error Response:** +```json +{ + "error": "Parameters ['limit'] are not allowed for tool read_wiki_contents. Allowed parameters: ['status']. Contact proxy admin to allow these parameters." +} +``` + +### Use Cases + +- **Security**: Prevent users from accessing sensitive parameters or dangerous operations +- **Cost control**: Restrict expensive parameters (e.g., limiting result counts) +- **Compliance**: Enforce parameter usage policies for regulatory requirements +- **Staged rollouts**: Gradually enable parameters as tools are tested +- **Multi-tenant isolation**: Different parameter access for different user groups + +### Combining with Tool Filtering + +`allowed_params` works alongside `allowed_tools` and `disallowed_tools` for complete control: + +```yaml title="Combined filtering example" showLineNumbers +mcp_servers: + github_mcp: + url: "https://api.githubcopilot.com/mcp" + auth_type: oauth2 + authorization_url: https://github.com/login/oauth/authorize + token_url: https://github.com/login/oauth/access_token + client_id: os.environ/GITHUB_OAUTH_CLIENT_ID + client_secret: os.environ/GITHUB_OAUTH_CLIENT_SECRET + scopes: ["public_repo", "user:email"] + # Only allow specific tools + allowed_tools: ["create_issue", "list_issues", "search_issues"] + # Block dangerous operations + disallowed_tools: ["delete_repo"] + # Restrict parameters per tool + allowed_params: + create_issue: ["title", "body", "labels"] + list_issues: ["state", "sort", "perPage"] + search_issues: ["query", "sort", "order", "perPage"] +``` + +This configuration ensures that: +1. Only the three listed tools are available +2. The `delete_repo` tool is explicitly blocked +3. Each tool can only use its specified parameters + +--- + +## MCP Server Access Control + +LiteLLM Proxy provides two methods for controlling access to specific MCP servers: + +1. **URL-based Namespacing** - Use URL paths to directly access specific servers or access groups +2. **Header-based Namespacing** - Use the `x-mcp-servers` header to specify which servers to access + +--- + +### Method 1: URL-based Namespacing + +LiteLLM Proxy supports URL-based namespacing for MCP servers using the format `//mcp`. 
This allows you to: + +- **Direct URL Access**: Point MCP clients directly to specific servers or access groups via URL +- **Simplified Configuration**: Use URLs instead of headers for server selection +- **Access Group Support**: Use access group names in URLs for grouped server access + +#### URL Format + +``` +//mcp +``` + +**Examples:** +- `/github_mcp/mcp` - Access tools from the "github_mcp" MCP server +- `/zapier/mcp` - Access tools from the "zapier" MCP server +- `/dev_group/mcp` - Access tools from all servers in the "dev_group" access group +- `/github_mcp,zapier/mcp` - Access tools from multiple specific servers + +#### Usage Examples + + + + +```bash title="cURL Example with URL Namespacing" showLineNumbers +curl --location 'https://api.openai.com/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $OPENAI_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "/github_mcp/mcp", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +This example uses URL namespacing to access only the "github" MCP server. + + + + + +```bash title="cURL Example with URL Namespacing" showLineNumbers +curl --location '/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $LITELLM_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "/dev_group/mcp", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +This example uses URL namespacing to access all servers in the "dev_group" access group. + + + + + +```json title="Cursor MCP Configuration with URL Namespacing" showLineNumbers +{ + "mcpServers": { + "LiteLLM": { + "url": "/github_mcp,zapier/mcp", + "headers": { + "x-litellm-api-key": "Bearer $LITELLM_API_KEY" + } + } + } +} +``` + +This configuration uses URL namespacing to access tools from both "github" and "zapier" MCP servers. + + + + +#### Benefits of URL Namespacing + +- **Direct Access**: No need for additional headers to specify servers +- **Clean URLs**: Self-documenting URLs that clearly indicate which servers are accessible +- **Access Group Support**: Use access group names for grouped server access +- **Multiple Servers**: Specify multiple servers in a single URL with comma separation +- **Simplified Configuration**: Easier setup for MCP clients that prefer URL-based configuration + +--- + +### Method 2: Header-based Namespacing + +You can choose to access specific MCP servers and only list their tools using the `x-mcp-servers` header. 
This header allows you to: +- Limit tool access to one or more specific MCP servers +- Control which tools are available in different environments or use cases + +The header accepts a comma-separated list of server aliases: `"alias_1,Server2,Server3"` + +**Notes:** +- If the header is not provided, tools from all available MCP servers will be accessible +- This method works with the standard LiteLLM MCP endpoint + + + + +```bash title="cURL Example with Header Namespacing" showLineNumbers +curl --location 'https://api.openai.com/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $OPENAI_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "/mcp/", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", + "x-mcp-servers": "alias_1" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +In this example, the request will only have access to tools from the "alias_1" MCP server. + + + + + +```bash title="cURL Example with Header Namespacing" showLineNumbers +curl --location '/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $LITELLM_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "/mcp/", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", + "x-mcp-servers": "alias_1,Server2" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +This configuration restricts the request to only use tools from the specified MCP servers. + + + + + +```json title="Cursor MCP Configuration with Header Namespacing" showLineNumbers +{ + "mcpServers": { + "LiteLLM": { + "url": "/mcp/", + "headers": { + "x-litellm-api-key": "Bearer $LITELLM_API_KEY", + "x-mcp-servers": "alias_1,Server2" + } + } + } +} +``` + +This configuration in Cursor IDE settings will limit tool access to only the specified MCP servers. + + + + +--- + +### Comparison: Header vs URL Namespacing + +| Feature | Header Namespacing | URL Namespacing | +|---------|-------------------|-----------------| +| **Method** | Uses `x-mcp-servers` header | Uses URL path `//mcp` | +| **Endpoint** | Standard `litellm_proxy` endpoint | Custom `//mcp` endpoint | +| **Configuration** | Requires additional header | Self-contained in URL | +| **Multiple Servers** | Comma-separated in header | Comma-separated in URL path | +| **Access Groups** | Supported via header | Supported via URL path | +| **Client Support** | Works with all MCP clients | Works with URL-aware MCP clients | +| **Use Case** | Dynamic server selection | Fixed server configuration | + + + + +```bash title="cURL Example with Server Segregation" showLineNumbers +curl --location 'https://api.openai.com/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $OPENAI_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "/mcp/", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", + "x-mcp-servers": "alias_1" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +In this example, the request will only have access to tools from the "alias_1" MCP server. 
+ + + + + +```bash title="cURL Example with Server Segregation" showLineNumbers +curl --location '/v1/responses' \ +--header 'Content-Type: application/json' \ +--header "Authorization: Bearer $LITELLM_API_KEY" \ +--data '{ + "model": "gpt-4o", + "tools": [ + { + "type": "mcp", + "server_label": "litellm", + "server_url": "litellm_proxy", + "require_approval": "never", + "headers": { + "x-litellm-api-key": "Bearer YOUR_LITELLM_API_KEY", + "x-mcp-servers": "alias_1,Server2" + } + } + ], + "input": "Run available tools", + "tool_choice": "required" +}' +``` + +This configuration restricts the request to only use tools from the specified MCP servers. + + + + + +```json title="Cursor MCP Configuration with Server Segregation" showLineNumbers +{ + "mcpServers": { + "LiteLLM": { + "url": "litellm_proxy", + "headers": { + "x-litellm-api-key": "Bearer $LITELLM_API_KEY", + "x-mcp-servers": "alias_1,Server2" + } + } + } +} +``` + +This configuration in Cursor IDE settings will limit tool access to only the specified MCP server. + + + + +### Grouping MCPs (Access Groups) + +MCP Access Groups allow you to group multiple MCP servers together for easier management. + +#### 1. Create an Access Group + +##### A. Creating Access Groups using Config: + +```yaml title="Creating access groups for MCP using the config" showLineNumbers +mcp_servers: + "deepwiki_mcp": + url: https://mcp.deepwiki.com/mcp + transport: "http" + auth_type: "none" + access_groups: ["dev_group"] +``` + +While adding `mcp_servers` using the config: +- Pass in a list of strings inside `access_groups` +- These groups can then be used for segregating access using keys, teams and MCP clients using headers + +##### B. Creating Access Groups using UI + +To create an access group: +- Go to MCP Servers in the LiteLLM UI +- Click "Add a New MCP Server" +- Under "MCP Access Groups", create a new group (e.g., "dev_group") by typing it +- Add the same group name to other servers to group them together + + + +#### 2. Use Access Group in Cursor + +Include the access group name in the `x-mcp-servers` header: + +```json title="Cursor Configuration with Access Groups" showLineNumbers +{ + "mcpServers": { + "LiteLLM": { + "url": "litellm_proxy", + "headers": { + "x-litellm-api-key": "Bearer $LITELLM_API_KEY", + "x-mcp-servers": "dev_group" + } + } + } +} +``` + +This gives you access to all servers in the "dev_group" access group. +- Which means that if deepwiki server (and any other servers) which have the access group `dev_group` assigned to them will be available for tool calling + +#### Advanced: Connecting Access Groups to API Keys + +When creating API keys, you can assign them to specific access groups for permission management: + +- Go to "Keys" in the LiteLLM UI and click "Create Key" +- Select the desired MCP access groups from the dropdown +- The key will have access to all MCP servers in those groups +- This is reflected in the Test Key page + + + + + ## Set Allowed Tools for a Key, Team, or Organization Control which tools different teams can access from the same MCP server. For example, give your Engineering team access to `list_repositories`, `create_issue`, and `search_code`, while Sales only gets `search_code` and `close_issue`. @@ -43,3 +634,31 @@ Control which tools different teams can access from the same MCP server. For exa This video shows how to set allowed tools for a Key, Team, or Organization. 
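If you prefer to script this instead of clicking through the UI, the same restriction can be expressed when generating a key. Treat the sketch below as illustrative only: `/key/generate` is LiteLLM's key-creation endpoint, but the `object_permission` / `mcp_tool_permissions` field names are assumptions for this example; confirm them against your LiteLLM version, or use the Admin UI flow shown in the video above.

```bash title="Sketch: limit a key to specific MCP tools" showLineNumbers
# Hypothetical payload shape – verify the object_permission field names
# against your LiteLLM version before relying on this.
curl -X POST 'http://localhost:4000/key/generate' \
  --header "Authorization: Bearer $LITELLM_MASTER_KEY" \
  --header 'Content-Type: application/json' \
  --data '{
    "key_alias": "sales-github-key",
    "object_permission": {
      "mcp_servers": ["github_mcp"],
      "mcp_tool_permissions": {
        "github_mcp": ["search_code", "close_issue"]
      }
    }
  }'
```

A team-level version (for example via `/team/new`) would follow the same pattern under the same assumption, letting Engineering and Sales draw different tool lists from one GitHub MCP server.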
+ + +## Dashboard View Modes + +Proxy admins can also control what non-admins see inside the MCP dashboard via `general_settings.user_mcp_management_mode`: + +- `restricted` *(default)* – users only see servers that their team explicitly has access to. +- `view_all` – every dashboard user can see the full MCP server list. + +```yaml title="Config example" +general_settings: + user_mcp_management_mode: view_all +``` + +This is useful when you want discoverability for MCP offerings without granting additional execution privileges. + + +## Publish MCP Registry + +If you want other systems—for example external agent frameworks such as MCP-capable IDEs running outside your network—to automatically discover the MCP servers hosted on LiteLLM, you can expose a Model Context Protocol Registry endpoint. This registry lists the built-in LiteLLM MCP server and every server you have configured, using the [official MCP Registry spec](https://github.com/modelcontextprotocol/registry). + +1. Set `enable_mcp_registry: true` under `general_settings` in your proxy config (or DB settings) and restart the proxy. +2. LiteLLM will serve the registry at `GET /v1/mcp/registry.json`. +3. Each entry points to either `/mcp` (built-in server) or `/{mcp_server_name}/mcp` for your custom servers, so clients can connect directly using the advertised Streamable HTTP URL. + +:::note Permissions still apply +The registry only advertises server URLs. Actual access control is still enforced by LiteLLM when the client connects to `/mcp` or `/{server}/mcp`, so publishing the registry does not bypass per-key permissions. +::: diff --git a/docs/my-website/docs/mcp_guardrail.md b/docs/my-website/docs/mcp_guardrail.md index f71ea2fe5efa..9ce3fb2bcf84 100644 --- a/docs/my-website/docs/mcp_guardrail.md +++ b/docs/my-website/docs/mcp_guardrail.md @@ -85,4 +85,5 @@ MCP guardrails work with all LiteLLM-supported guardrail providers: - **Bedrock**: AWS Bedrock guardrails - **Lakera**: Content moderation - **Aporia**: Custom guardrails +- **Noma**: Noma Security - **Custom**: Your own guardrail implementations \ No newline at end of file diff --git a/docs/my-website/docs/mcp_troubleshoot.md b/docs/my-website/docs/mcp_troubleshoot.md new file mode 100644 index 000000000000..27ba0e4d7877 --- /dev/null +++ b/docs/my-website/docs/mcp_troubleshoot.md @@ -0,0 +1,99 @@ +import Image from '@theme/IdealImage'; + +# MCP Troubleshooting Guide + +When LiteLLM acts as an MCP proxy, traffic normally flows `Client → LiteLLM Proxy → MCP Server`, while OAuth-enabled setups add an authorization server for metadata discovery. + +For provisioning steps, transport options, and configuration fields, refer to [mcp.md](./mcp.md). + +## Locate the Error Source + +Pin down where the failure occurs before adjusting settings so you do not mix symptoms from separate hops. + +### LiteLLM UI / Playground Errors (LiteLLM → MCP) +Failures shown on the MCP creation form or within the MCP Tool Testing Playground mean the LiteLLM proxy cannot reach the MCP server. Typical causes are misconfiguration (transport, headers, credentials), MCP/server outages, network/firewall blocks, or inaccessible OAuth metadata. + + + +
+ +**Actions** +- Capture LiteLLM proxy logs alongside MCP-server logs (see [Error Log Example](./mcp_troubleshoot#error-log-example-failed-mcp-call)) to inspect the request/response pair and stack traces. +- From the LiteLLM server, run Method 2 ([`curl` smoke test](./mcp_troubleshoot#curl-smoke-test)) against the MCP endpoint to confirm basic connectivity. + +### Client Traffic Issues (Client → LiteLLM) +If only real client requests fail, determine whether LiteLLM ever reaches the MCP hop. + +#### MCP Protocol Sessions +Clients such as IDEs or agent runtimes speak the MCP protocol directly with LiteLLM. + +**Actions** +- Inspect LiteLLM access logs (see [Access Log Example](./mcp_troubleshoot#access-log-example-successful-mcp-call)) to verify the client request reached the proxy and which MCP server it targeted. +- Review LiteLLM error logs (see [Error Log Example](./mcp_troubleshoot#error-log-example-failed-mcp-call)) for TLS, authentication, or routing errors that block the request before the MCP call starts. +- Use the [MCP Inspector](./mcp_troubleshoot#mcp-inspector) to confirm the MCP server is reachable outside of the failing client. + +#### Responses/Completions with Embedded MCP Calls +During `/responses` or `/chat/completions`, LiteLLM may trigger MCP tool calls mid-request. An error could occur before the MCP call begins or after the MCP responds. + +**Actions** +- Check LiteLLM request logs (see [Access Log Example](./mcp_troubleshoot#access-log-example-successful-mcp-call)) to see whether an MCP attempt was recorded; if not, the problem lies in `Client → LiteLLM`. +- Validate MCP connectivity with the [MCP Inspector](./mcp_troubleshoot#mcp-inspector) to ensure the server responds. +- Reproduce the same MCP call via the LiteLLM Playground to confirm LiteLLM can complete the MCP hop independently. + + + +### OAuth Metadata Discovery +LiteLLM performs metadata discovery per the MCP spec ([section 2.3](https://modelcontextprotocol.info/specification/draft/basic/authorization/#23-server-metadata-discovery)). When OAuth is enabled, confirm the authorization server exposes the metadata URL and that LiteLLM can fetch it. + +**Actions** +- Use `curl ` (or similar) from the LiteLLM host to ensure the discovery document is reachable and contains the expected authorization/token endpoints. +- Record the exact metadata URL, requested scopes, and any static client credentials so support can replay the discovery step if needed. + +## Verify Connectivity + +Run lightweight validations before impacting production traffic. + +### MCP Inspector +Use the MCP Inspector when you need to test both `Client → LiteLLM` and `Client → MCP` communications in one place; it makes isolating the failing hop straightforward. + +1. Execute `npx @modelcontextprotocol/inspector` on your workstation. +2. Configure and connect: + - **Transport Type:** choose the transport the client uses (Streamable HTTP for LiteLLM). + - **URL:** the endpoint under test (LiteLLM MCP URL for `Client → LiteLLM`, or the MCP server URL for `Client → MCP`). + - **Custom Headers:** e.g., `Authorization: Bearer `. +3. Open the **Tools** tab and click **List Tools** to verify the MCP alias responds. + +### `curl` Smoke Test +`curl` is ideal on servers where installing the Inspector is impractical. It replicates the MCP tool call LiteLLM would make—swap in the domain of the system under test (LiteLLM or the MCP server). 
+ +```bash +curl -X POST https://your-target-domain.example.com/mcp \ + -H "Content-Type: application/json" \ + -H "Accept: application/json, text/event-stream" \ + -d '{"jsonrpc":"2.0","id":1,"method":"tools/list","params":{}}' +``` + +Add `-H "Authorization: Bearer "` when the target is a LiteLLM endpoint that requires authentication. Adjust the headers, or payload to target other MCP methods. Matching failures between `curl` and LiteLLM confirm that the MCP server or network/OAuth layer is the culprit. + +## Review Logs + +Well-scoped logs make it clear whether LiteLLM reached the MCP server and what happened next. + +### Access Log Example (successful MCP call) +```text +INFO: 127.0.0.1:57230 - "POST /everything/mcp HTTP/1.1" 200 OK +``` + +### Error Log Example (failed MCP call) +```text +07:22:00 - LiteLLM:ERROR: client.py:224 - MCP client list_tools failed - Error Type: ExceptionGroup, Error: unhandled errors in a TaskGroup (1 sub-exception), Server: http://localhost:3001/mcp, Transport: MCPTransport.http + httpcore.ConnectError: All connection attempts failed +ERROR:LiteLLM:MCP client list_tools failed - Error Type: ExceptionGroup, Error: unhandled errors in a TaskGroup (1 sub-exception)... + httpx.ConnectError: All connection attempts failed +``` diff --git a/docs/my-website/docs/moderation.md b/docs/my-website/docs/moderation.md index f9c2810bc8aa..1f67b0a7543d 100644 --- a/docs/my-website/docs/moderation.md +++ b/docs/my-website/docs/moderation.md @@ -22,10 +22,19 @@ response = moderation( For `/moderations` endpoint, there is **no need to specify `model` in the request or on the litellm config.yaml** -Start litellm proxy server +1. Setup config.yaml +```yaml +model_list: + - model_name: text-moderation-stable + litellm_params: + model: openai/omni-moderation-latest ``` -litellm + +2. Start litellm proxy server + +``` +litellm --config /path/to/config.yaml ``` @@ -41,7 +50,7 @@ client = OpenAI(api_key="", base_url="http://0.0.0.0:4000") response = client.moderations.create( input="hello from litellm", - model="text-moderation-stable" # optional, defaults to `omni-moderation-latest` + model="text-moderation-stable" ) print(response) diff --git a/docs/my-website/docs/observability/arize_integration.md b/docs/my-website/docs/observability/arize_integration.md index a654a1b4de3a..b3ccf98ea3b4 100644 --- a/docs/my-website/docs/observability/arize_integration.md +++ b/docs/my-website/docs/observability/arize_integration.md @@ -7,13 +7,6 @@ import TabItem from '@theme/TabItem'; AI Observability and Evaluation Platform -:::tip - -This is community maintained, Please make an issue if you run into a bug -https://github.com/BerriAI/litellm - -::: - @@ -53,7 +46,7 @@ response = litellm.completion( ) ``` -### Using with LiteLLM Proxy +## Using with LiteLLM Proxy 1. Setup config.yaml ```yaml @@ -71,10 +64,11 @@ general_settings: master_key: "sk-1234" # can also be set as an environment variable environment_variables: - ARIZE_SPACE_KEY: "d0*****" + ARIZE_SPACE_ID: "d0*****" ARIZE_API_KEY: "141a****" ARIZE_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize GRPC api endpoint ARIZE_HTTP_ENDPOINT: "https://otlp.arize.com/v1" # OPTIONAL - your custom arize HTTP api endpoint. Set either this or ARIZE_ENDPOINT or Neither (defaults to https://otlp.arize.com/v1 on grpc) + ARIZE_PROJECT_NAME: "my-litellm-project" # OPTIONAL - sets the arize project name ``` 2. 
Start the proxy @@ -96,7 +90,8 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ Supported parameters: - `arize_api_key` -- `arize_space_key` +- `arize_space_key` *(deprecated, use `arize_space_id` instead)* +- `arize_space_id` @@ -117,8 +112,8 @@ response = litellm.completion( messages=[ {"role": "user", "content": "Hi 👋 - i'm openai"} ], - arize_api_key=os.getenv("ARIZE_SPACE_2_API_KEY"), - arize_space_key=os.getenv("ARIZE_SPACE_2_KEY"), + arize_api_key=os.getenv("ARIZE_API_KEY"), + arize_space_id=os.getenv("ARIZE_SPACE_ID"), ) ``` @@ -159,8 +154,8 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ -d '{ "model": "gpt-4", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}], - "arize_api_key": "ARIZE_SPACE_2_API_KEY", - "arize_space_key": "ARIZE_SPACE_2_KEY" + "arize_api_key": "ARIZE_API_KEY", + "arize_space_id": "ARIZE_SPACE_ID" }' ``` @@ -183,8 +178,8 @@ response = client.chat.completions.create( } ], extra_body={ - "arize_api_key": "ARIZE_SPACE_2_API_KEY", - "arize_space_key": "ARIZE_SPACE_2_KEY" + "arize_api_key": "ARIZE_API_KEY", + "arize_space_id": "ARIZE_SPACE_ID" } ) @@ -199,5 +194,5 @@ print(response) - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) - [Community Discord 💭](https://discord.gg/wuPM9dRgDw) -- Our numbers 📞 +1 (770) 8783-106 / ‭+1 (412) 618-6238‬ +- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238 - Our emails ✉️ ishaan@berri.ai / krrish@berri.ai diff --git a/docs/my-website/docs/observability/azure_sentinel.md b/docs/my-website/docs/observability/azure_sentinel.md new file mode 100644 index 000000000000..6e7e05417952 --- /dev/null +++ b/docs/my-website/docs/observability/azure_sentinel.md @@ -0,0 +1,238 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Azure Sentinel + + + +LiteLLM supports logging to Azure Sentinel via the Azure Monitor Logs Ingestion API. Azure Sentinel uses Log Analytics workspaces for data storage, so logs sent to the workspace will be available in Sentinel for security monitoring and analysis. + +## Azure Sentinel Integration + +| Feature | Details | +|---------|---------| +| **What is logged** | [StandardLoggingPayload](../proxy/logging_spec) | +| **Events** | Success + Failure | +| **Product Link** | [Azure Sentinel](https://learn.microsoft.com/en-us/azure/sentinel/overview) | +| **API Reference** | [Logs Ingestion API](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/logs-ingestion-api-overview) | + +We will use the `--config` to set `litellm.callbacks = ["azure_sentinel"]` this will log all successful and failed LLM calls to Azure Sentinel. + +**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `callbacks` + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo +litellm_settings: + callbacks: ["azure_sentinel"] # logs llm success + failure logs to Azure Sentinel +``` + +**Step 2**: Set Up Azure Resources + +Before using the Logs Ingestion API, you need to set up the following in Azure: + +1. **Create a Log Analytics Workspace** (if you don't have one) +2. **Create a Custom Table** in your Log Analytics workspace (e.g., `LiteLLM_CL`) +3. **Create a Data Collection Rule (DCR)** with: + - Stream declaration matching your data structure + - Transformation to map data to your custom table + - Access granted to your app registration +4. 
**Register an Application** in Microsoft Entra ID (Azure AD) with: + - Client ID + - Client Secret + - Permissions to write to the DCR + +For detailed setup instructions, see the [Microsoft documentation on Logs Ingestion API](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/logs-ingestion-api-overview). + +**Step 3**: Set Required Environment Variables + +Set the following environment variables with your Azure credentials: + +```shell showLineNumbers title="Environment Variables" +# Required: Data Collection Rule (DCR) configuration +AZURE_SENTINEL_DCR_IMMUTABLE_ID="dcr-xxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # DCR Immutable ID from Azure portal +AZURE_SENTINEL_STREAM_NAME="Custom-LiteLLM_CL_CL" # Stream name from your DCR +AZURE_SENTINEL_ENDPOINT="https://your-dcr-endpoint.eastus-1.ingest.monitor.azure.com" # DCR logs ingestion endpoint (NOT the DCE endpoint) + +# Required: OAuth2 Authentication (App Registration) +AZURE_SENTINEL_TENANT_ID="your-tenant-id" # Azure Tenant ID +AZURE_SENTINEL_CLIENT_ID="your-client-id" # Application (client) ID +AZURE_SENTINEL_CLIENT_SECRET="your-client-secret" # Client secret value + +``` + +**Note**: The `AZURE_SENTINEL_ENDPOINT` should be the DCR's logs ingestion endpoint (found in the DCR Overview page), NOT the Data Collection Endpoint (DCE). The DCR endpoint is associated with your specific DCR and looks like: `https://your-dcr-endpoint.{region}-1.ingest.monitor.azure.com` + +**Step 4**: Start the proxy and make a test request + +Start proxy + +```shell showLineNumbers title="Start Proxy" +litellm --config config.yaml --debug +``` + +Test Request + +```shell showLineNumbers title="Test Request" +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + "metadata": { + "your-custom-metadata": "custom-field", + } +}' +``` + +**Step 5**: View logs in Azure Sentinel + +1. Navigate to your Azure Sentinel workspace in the Azure portal +2. Go to "Logs" and query your custom table (e.g., `LiteLLM_CL`) +3. Run a query like: + +```kusto showLineNumbers title="KQL Query" +LiteLLM_CL +| where TimeGenerated > ago(1h) +| project TimeGenerated, model, status, total_tokens, response_cost +| order by TimeGenerated desc +``` + +You should see following logs in Azure Workspace. + + + +## Environment Variables + +| Environment Variable | Description | Default Value | Required | +|---------------------|-------------|---------------|----------| +| `AZURE_SENTINEL_DCR_IMMUTABLE_ID` | Data Collection Rule (DCR) Immutable ID | None | ✅ Yes | +| `AZURE_SENTINEL_ENDPOINT` | DCR logs ingestion endpoint URL (from DCR Overview page) | None | ✅ Yes | +| `AZURE_SENTINEL_STREAM_NAME` | Stream name from DCR (e.g., "Custom-LiteLLM_CL_CL") | "Custom-LiteLLM" | ❌ No | +| `AZURE_SENTINEL_TENANT_ID` | Azure Tenant ID for OAuth2 authentication | None (falls back to `AZURE_TENANT_ID`) | ✅ Yes | +| `AZURE_SENTINEL_CLIENT_ID` | Application (client) ID for OAuth2 authentication | None (falls back to `AZURE_CLIENT_ID`) | ✅ Yes | +| `AZURE_SENTINEL_CLIENT_SECRET` | Client secret for OAuth2 authentication | None (falls back to `AZURE_CLIENT_SECRET`) | ✅ Yes | + +## How It Works + +The Azure Sentinel integration uses the [Azure Monitor Logs Ingestion API](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/logs-ingestion-api-overview) to send logs to your Log Analytics workspace. 
The integration: + +- Authenticates using OAuth2 client credentials flow with your app registration +- Sends logs to the Data Collection Rule (DCR) endpoint +- Batches logs for efficient transmission +- Sends logs in the [StandardLoggingPayload](../proxy/logging_spec) format +- Automatically handles both success and failure events +- Caches OAuth2 tokens and refreshes them automatically + +Logs sent to the Log Analytics workspace are automatically available in Azure Sentinel for security monitoring, threat detection, and analysis. + +## Azure Sentinel Setup Guide + +Follow this step-by-step guide to set up Azure Sentinel with LiteLLM. + +### Step 1: Create a Log Analytics Workspace + +1. Navigate to [https://portal.azure.com/#home](https://portal.azure.com/#home) + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/5659f6f5-a166-4b26-a991-73352274e3bb/ascreenshot.jpeg?tl_px=0,210&br_px=2618,1673&force_format=jpeg&q=100&width=1120.0) + +2. Search for "Log Analytics workspaces" and click "Create" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/a827ba10-a391-486a-a36a-51816c6255de/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=21,106) + +3. Enter a name for your workspace (e.g., "litellm-sentinel-prod") + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/943458f1-fd4c-47dd-a273-ea5a04734ed9/ascreenshot.jpeg?tl_px=0,420&br_px=2618,1884&force_format=jpeg&q=100&width=1120.0) + +4. Click "Review + Create" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/c54828fb-f895-4eb7-b810-cacf437617bd/ascreenshot.jpeg?tl_px=0,420&br_px=2618,1884&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=40,564) + +### Step 2: Create a Custom Table + +1. Go to your Log Analytics workspace and click "Tables" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/72d65f70-75c0-471f-95e9-947c72e173cc/ascreenshot.jpeg?tl_px=0,142&br_px=2618,1605&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=330,277) + +2. Click "Create" → "New custom log (Direct Ingest)" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/863ad29b-2c3a-4b7c-9a6b-36d3a76c9f32/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=526,146) + +3. Enter a table name (e.g., "LITELLM_PROD_CL") + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/ef2f1c52-aa36-46a1-91e6-9bd868891b15/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0) + +### Step 3: Create a Data Collection Rule (DCR) + +1. 
Click "Create a new data collection rule" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/f2abc0d3-8be8-4057-9290-946d10cfd183/ascreenshot.jpeg?tl_px=0,420&br_px=2618,1884&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=264,404) + +2. Enter a name for the DCR (e.g., "litellm-prod") + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/79bbebdc-e4d9-46ff-a270-1930619050a1/ascreenshot.jpeg?tl_px=0,8&br_px=2618,1471&force_format=jpeg&q=100&width=1120.0) + +3. Select a Data Collection Endpoint + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/f3112e9a-551e-415c-a7f9-55aad801bc8a/ascreenshot.jpeg?tl_px=0,420&br_px=2618,1884&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=332,480) + +4. Upload the sample JSON file for schema (use the [example_standard_logging_payload.json](https://github.com/BerriAI/litellm/blob/main/litellm/integrations/azure_sentinel/example_standard_logging_payload.json) file) + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/703c0762-840a-4f1f-a60f-876dc24b7a03/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=518,272) + +5. Click "Next" and then "Create" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/0bca0200-5c64-4fbd-8061-9308aa6656b8/ascreenshot.jpeg?tl_px=0,420&br_px=2618,1884&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=128,560) + +### Step 4: Get the DCR Immutable ID and Logs Ingestion Endpoint + +1. Go to "Data Collection Rules" and select your DCR + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/11c06a0d-584f-4d22-b36e-9c338d43812c/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=94,258) + +2. Copy the **DCR Immutable ID** (starts with `dcr-`) + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/cd0ad69a-4d95-4b6a-9533-7720908ba809/ascreenshot.jpeg?tl_px=1160,92&br_px=2618,907&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=530,277) + +3. Copy the **Logs Ingestion Endpoint** URL + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/3d3752ed-08ea-4490-8c98-a97d33947ea7/ascreenshot.jpeg?tl_px=1160,464&br_px=2618,1279&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=532,277) + +### Step 5: Get the Stream Name + +1. 
Click "JSON View" in the DCR + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/fd8a5504-4769-4f23-983e-520f256ee308/ascreenshot.jpeg?tl_px=1160,0&br_px=2618,814&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=965,257) + +2. Find the **Stream Name** in the `streamDeclarations` section (e.g., "Custom-LITELLM_PROD_CL_CL") + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-17/a4052b32-2028-4d12-8930-bfcdf6f47652/ascreenshot.jpeg?tl_px=405,270&br_px=2115,1225&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=523,277) + +### Step 6: Register an App and Grant Permissions + +1. Go to **Microsoft Entra ID** → **App registrations** → **New registration** +2. Create a new app and note the **Client ID** and **Tenant ID** +3. Go to **Certificates & secrets** → Create a new client secret and copy the **Secret Value** +4. Go back to your DCR → **Access Control (IAM)** → **Add role assignment** +5. Assign the **"Monitoring Metrics Publisher"** role to your app registration + +### Summary: Where to Find Each Value + +| Environment Variable | Where to Find It | +|---------------------|------------------| +| `AZURE_SENTINEL_DCR_IMMUTABLE_ID` | DCR Overview page → Immutable ID (starts with `dcr-`) | +| `AZURE_SENTINEL_ENDPOINT` | DCR Overview page → Logs Ingestion Endpoint | +| `AZURE_SENTINEL_STREAM_NAME` | DCR JSON View → `streamDeclarations` section | +| `AZURE_SENTINEL_TENANT_ID` | App Registration → Overview → Directory (tenant) ID | +| `AZURE_SENTINEL_CLIENT_ID` | App Registration → Overview → Application (client) ID | +| `AZURE_SENTINEL_CLIENT_SECRET` | App Registration → Certificates & secrets → Secret Value | + +For more details, refer to the [Microsoft Logs Ingestion API documentation](https://learn.microsoft.com/en-us/azure/azure-monitor/logs/logs-ingestion-api-overview). diff --git a/docs/my-website/docs/observability/cloudzero.md b/docs/my-website/docs/observability/cloudzero.md index f213ef64e131..19f6d80ca8bd 100644 --- a/docs/my-website/docs/observability/cloudzero.md +++ b/docs/my-website/docs/observability/cloudzero.md @@ -65,6 +65,52 @@ Start your LiteLLM proxy with the configuration: litellm --config /path/to/config.yaml ``` +## Setup on UI + +1\. Click "Settings" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/5ac36280-c688-41a3-8d0e-23e19c6a470b/ascreenshot.jpeg?tl_px=0,332&br_px=1308,1064&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=119,444) + + +2\. Click "Logging & Alerts" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/13f76b09-e0c4-4738-ba05-2d5111c6ad3e/ascreenshot.jpeg?tl_px=0,332&br_px=1308,1064&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=58,507) + + +3\. 
Click "CloudZero Cost Tracking" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/f96cc1e5-7bc0-4d7c-9aeb-5cbbec549b12/ascreenshot.jpeg?tl_px=0,0&br_px=1308,731&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=389,56) + + +4\. Click "Add CloudZero Integration" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/04fbc748-0e6f-43bb-8a57-dd2e83dbfcb5/ascreenshot.jpeg?tl_px=0,90&br_px=1308,821&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=616,277) + + +5\. Enter your CloudZero API Key. + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/080e82f1-f94f-4ed7-8014-e495380336f3/ascreenshot.jpeg?tl_px=0,0&br_px=1308,731&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=506,129) + + +6\. Enter your CloudZero Connection ID. + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/af417aa2-67a8-4dee-a014-84b1892dc07e/ascreenshot.jpeg?tl_px=0,0&br_px=1308,731&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=488,213) + + +7\. Click "Create" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/647e672f-9a4a-4754-a7b0-abf1397abad4/ascreenshot.jpeg?tl_px=0,88&br_px=1308,819&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=711,277) + + +8\. Test your payload with "Run Dry Run Simulation" + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/7447cbe0-3450-4be5-bdc4-37fb8280aa58/ascreenshot.jpeg?tl_px=0,125&br_px=1308,856&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=334,277) + + +10\. Click "Export Data Now" to export to CLoudZero + +![](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-22/7be9bd48-6e27-4c68-bc75-946f3ab593d9/ascreenshot.jpeg?tl_px=0,130&br_px=1308,861&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=518,277) + ## Testing Your Setup ### Dry Run Export diff --git a/docs/my-website/docs/observability/custom_callback.md b/docs/my-website/docs/observability/custom_callback.md index cfe97ca42c01..ae8926212709 100644 --- a/docs/my-website/docs/observability/custom_callback.md +++ b/docs/my-website/docs/observability/custom_callback.md @@ -203,7 +203,11 @@ asyncio.run(test_chat_openai()) ## What's Available in kwargs? -The kwargs dictionary contains all the details about your API call: +The kwargs dictionary contains all the details about your API call. 
+ +:::info +For the complete logging payload specification, see the [Standard Logging Payload Spec](https://docs.litellm.ai/docs/proxy/logging_spec). +::: ```python def custom_callback(kwargs, completion_response, start_time, end_time): diff --git a/docs/my-website/docs/observability/datadog.md b/docs/my-website/docs/observability/datadog.md index 08ebf8b28ce7..6f785be1013b 100644 --- a/docs/my-website/docs/observability/datadog.md +++ b/docs/my-website/docs/observability/datadog.md @@ -7,6 +7,7 @@ import TabItem from '@theme/TabItem'; LiteLLM Supports logging to the following Datdog Integrations: - `datadog` [Datadog Logs](https://docs.datadoghq.com/logs/) - `datadog_llm_observability` [Datadog LLM Observability](https://www.datadoghq.com/product/llm-observability/) +- `datadog_cost_management` [Datadog Cloud Cost Management](#datadog-cloud-cost-management) - `ddtrace-run` [Datadog Tracing](#datadog-tracing) ## Datadog Logs @@ -56,12 +57,37 @@ litellm_settings: **Step 2**: Set Required env variables for datadog +#### Direct API + +Send logs directly to Datadog API: + ```shell DD_API_KEY="5f2d0f310***********" # your datadog API Key DD_SITE="us5.datadoghq.com" # your datadog base url DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments ``` +#### Via DataDog Agent + +Send logs through a local DataDog agent (useful for containerized environments): + +```shell +LITELLM_DD_AGENT_HOST="localhost" # hostname or IP of DataDog agent +LITELLM_DD_AGENT_PORT="10518" # [OPTIONAL] port of DataDog agent (default: 10518) +DD_API_KEY="5f2d0f310***********" # [OPTIONAL] your datadog API Key (Agent handles auth for Logs. REQUIRED for LLM Observability) +DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source +``` + +When `LITELLM_DD_AGENT_HOST` is set, logs are sent to the agent instead of directly to DataDog API. This is useful for: +- Centralized log shipping in containerized environments +- Reducing direct API calls from multiple services +- Leveraging agent-side processing and filtering + +**Note:** We use `LITELLM_DD_AGENT_HOST` instead of `DD_AGENT_HOST` to avoid conflicts with `ddtrace` which automatically sets `DD_AGENT_HOST` for APM tracing. + +> [!IMPORTANT] +> **Datadog LLM Observability**: `DD_API_KEY` is **REQUIRED** even when using the Datadog Agent (`LITELLM_DD_AGENT_HOST`). The agent acts as a proxy but the API key header is mandatory for the LLM Observability endpoint. + **Step 3**: Start the proxy, make a test request Start proxy @@ -139,6 +165,50 @@ On the Datadog LLM Observability page, you should see that both input messages a + + + +## Datadog Cloud Cost Management + +| Feature | Details | +|---------|---------| +| **What is logged** | Aggregated LLM Costs (FOCUS format) | +| **Events** | Periodic Uploads of Aggregated Cost Data | +| **Product Link** | [Datadog Cloud Cost Management](https://docs.datadoghq.com/cost_management/) | + +We will use the `--config` to set `litellm.callbacks = ["datadog_cost_management"]`. This will periodically upload aggregated LLM cost data to Datadog. 
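Steps 1-3 below walk through the proxy setup. If you want a quick sanity check from the Python SDK first, the same callback string can in principle be registered directly; the sketch below is a hedged example (it assumes the callback behaves the same outside the proxy), not the documented path:

```python
import os
import litellm

# Assumed credentials - these mirror the env vars documented in Step 2 below
os.environ["DD_API_KEY"] = "your-api-key"
os.environ["DD_APP_KEY"] = "your-app-key"  # needed for the Custom Costs API
os.environ["DD_SITE"] = "us5.datadoghq.com"
# ...plus the provider key for whichever model you call, e.g. OPENAI_API_KEY

# Register the cost-management callback (same string used in config.yaml)
litellm.callbacks = ["datadog_cost_management"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)

# Costs are aggregated in-memory (by provider, model, date, and tags) and
# flushed periodically, so uploads only happen in a long-running process.
```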
+ +**Step 1**: Create a `config.yaml` file and set `litellm_settings`: `success_callback` + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo +litellm_settings: + callbacks: ["datadog_cost_management"] +``` + +**Step 2**: Set Required env variables + +```shell +DD_API_KEY="your-api-key" +DD_APP_KEY="your-app-key" # REQUIRED for Cost Management +DD_SITE="us5.datadoghq.com" +``` + +**Step 3**: Start the proxy + +```shell +litellm --config config.yaml +``` + +**How it works** +* LiteLLM aggregates costs in-memory by Provider, Model, Date, and Tags. +* Requires `DD_APP_KEY` for the Custom Costs API. +* Costs are uploaded periodically (flushed). + + ### Datadog Tracing Use `ddtrace-run` to enable [Datadog Tracing](https://ddtrace.readthedocs.io/en/stable/installation_quickstart.html) on litellm proxy @@ -159,7 +229,7 @@ docker run \ -e USE_DDTRACE=true \ -e USE_DDPROFILER=true \ -p 4000:4000 \ - ghcr.io/berriai/litellm:main-latest \ + docker.litellm.ai/berriai/litellm:main-latest \ --config /app/config.yaml --detailed_debug ``` @@ -169,8 +239,10 @@ LiteLLM supports customizing the following Datadog environment variables | Environment Variable | Description | Default Value | Required | |---------------------|-------------|---------------|----------| -| `DD_API_KEY` | Your Datadog API key for authentication | None | ✅ Yes | -| `DD_SITE` | Your Datadog site (e.g., "us5.datadoghq.com") | None | ✅ Yes | +| `DD_API_KEY` | Your Datadog API key for authentication (required for direct API, optional for agent) | None | Conditional* | +| `DD_SITE` | Your Datadog site (e.g., "us5.datadoghq.com") (required for direct API) | None | Conditional* | +| `LITELLM_DD_AGENT_HOST` | Hostname or IP of DataDog agent (e.g., "localhost"). When set, logs are sent to agent instead of direct API | None | ❌ No | +| `LITELLM_DD_AGENT_PORT` | Port of DataDog agent for log intake | "10518" | ❌ No | | `DD_ENV` | Environment tag for your logs (e.g., "production", "staging") | "unknown" | ❌ No | | `DD_SERVICE` | Service name for your logs | "litellm-server" | ❌ No | | `DD_SOURCE` | Source name for your logs | "litellm" | ❌ No | @@ -178,3 +250,6 @@ LiteLLM supports customizing the following Datadog environment variables | `HOSTNAME` | Hostname tag for your logs | "" | ❌ No | | `POD_NAME` | Pod name tag (useful for Kubernetes deployments) | "unknown" | ❌ No | +\* **Required when using Direct API** (default): `DD_API_KEY` and `DD_SITE` are required +\* **Optional when using DataDog Agent**: Set `LITELLM_DD_AGENT_HOST` to use agent mode; `DD_API_KEY` and `DD_SITE` are not required for **Datadog Logs**. (**Note: `DD_API_KEY` IS REQUIRED for Datadog LLM Observability**) + diff --git a/docs/my-website/docs/observability/focus.md b/docs/my-website/docs/observability/focus.md new file mode 100644 index 000000000000..c282f4a220c2 --- /dev/null +++ b/docs/my-website/docs/observability/focus.md @@ -0,0 +1,93 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Focus Export (Experimental) + +:::caution Experimental feature +Focus Format export is under active development and currently considered experimental. +Interfaces, schema mappings, and configuration options may change as we iterate based on user feedback. +Please treat this integration as a preview and report any issues or suggestions to help us stabilize and improve the workflow. 
+::: + +LiteLLM can emit usage data in the [FinOps FOCUS format](https://focus.finops.org/focus-specification/v1-2/) and push artifacts (for example Parquet files) to destinations such as Amazon S3. This enables downstream cost-analysis tooling to ingest a standardised dataset directly from LiteLLM. + +LiteLLM currently conforms to the FinOps FOCUS v1.2 specification when emitting this dataset. + +## Overview + +| Property | Details | +|----------|---------| +| Destination | Export LiteLLM usage data in FOCUS format to managed storage (currently S3) | +| Callback name | `focus` | +| Supported operations | Automatic scheduled export | +| Data format | FOCUS Normalised Dataset (Parquet) | + +## Environment Variables + +### Common settings + +| Variable | Required | Description | +|----------|----------|-------------| +| `FOCUS_PROVIDER` | No | Destination provider (defaults to `s3`). | +| `FOCUS_FORMAT` | No | Output format (currently only `parquet`). | +| `FOCUS_FREQUENCY` | No | Export cadence. Prefer `hourly` or `daily` for production; `interval` is intended for short test loops. Defaults to `hourly`. | +| `FOCUS_CRON_OFFSET` | No | Minute offset used for hourly/daily cron triggers. Defaults to `5`. | +| `FOCUS_INTERVAL_SECONDS` | No | Interval (seconds) when `FOCUS_FREQUENCY="interval"`. | +| `FOCUS_PREFIX` | No | Object key prefix/folder. Defaults to `focus_exports`. | + +### S3 destination + +| Variable | Required | Description | +|----------|----------|-------------| +| `FOCUS_S3_BUCKET_NAME` | Yes | Destination bucket for exported files. | +| `FOCUS_S3_REGION_NAME` | No | AWS region for the bucket. | +| `FOCUS_S3_ENDPOINT_URL` | No | Custom endpoint (useful for S3-compatible storage). | +| `FOCUS_S3_ACCESS_KEY` | Yes | AWS access key for uploads. | +| `FOCUS_S3_SECRET_KEY` | Yes | AWS secret key for uploads. | +| `FOCUS_S3_SESSION_TOKEN` | No | AWS session token if using temporary credentials. | + +## Setup via Config + +### Configure environment variables + +```bash +export FOCUS_PROVIDER="s3" +export FOCUS_PREFIX="focus_exports" + +# S3 example +export FOCUS_S3_BUCKET_NAME="my-litellm-focus-bucket" +export FOCUS_S3_REGION_NAME="us-east-1" +export FOCUS_S3_ACCESS_KEY="AKIA..." +export FOCUS_S3_SECRET_KEY="..." +``` + +### Update LiteLLM config + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: sk-your-key + +litellm_settings: + callbacks: ["focus"] +``` + +### Start the proxy + +```bash +litellm --config /path/to/config.yaml +``` + +During boot LiteLLM registers the Focus logger and a background job that runs according to the configured frequency. + +## Planned Enhancements +- Add "Setup on UI" flow alongside the current configuration-based setup. +- Add GCS / Azure Blob to the Destination options. +- Support CSV output alongside Parquet. + +## Related Links + +- [Focus](https://focus.finops.org/) + diff --git a/docs/my-website/docs/observability/generic_api.md b/docs/my-website/docs/observability/generic_api.md new file mode 100644 index 000000000000..93a0762591ae --- /dev/null +++ b/docs/my-website/docs/observability/generic_api.md @@ -0,0 +1,169 @@ +# Generic API Callback (Webhook) + +Send LiteLLM logs to any HTTP endpoint. 
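Any service that can accept an HTTP POST can act as the destination. For reference, a minimal receiver might look like the sketch below; FastAPI and the `/logs` route are purely illustrative, and the handler assumes the default `json_array` format described later on this page:

```python
# Minimal sketch of a webhook receiver for the generic_api callback.
# Assumes the default `json_array` log_format: each POST body is a JSON
# array of Standard Logging Payload objects.
from typing import Any

from fastapi import FastAPI, Request

app = FastAPI()


@app.post("/logs")
async def receive_logs(request: Request) -> dict[str, Any]:
    logs: list[dict[str, Any]] = await request.json()
    for log in logs:
        # Each entry carries fields like id, model, cost, startTime, endTime
        print(log.get("id"), log.get("model"), log.get("cost"))
    return {"received": len(logs)}
```

Run it with something like `uvicorn receiver:app --port 8080` (where `receiver` is whatever you name the module) and point `endpoint` at `http://localhost:8080/logs`.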
+ +## Quick Start + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + callbacks: ["custom_api_name"] + +callback_settings: + custom_api_name: + callback_type: generic_api + endpoint: https://your-endpoint.com/logs + headers: + Authorization: Bearer sk-1234 +``` + +## Configuration + +### Basic Setup + +```yaml +callback_settings: + : + callback_type: generic_api + endpoint: https://your-endpoint.com # required + headers: # optional + Authorization: Bearer + Custom-Header: value + event_types: # optional, defaults to all events + - llm_api_success + - llm_api_failure +``` + +### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `callback_type` | string | Yes | Must be `generic_api` | +| `endpoint` | string | Yes | HTTP endpoint to send logs to | +| `headers` | dict | No | Custom headers for the request | +| `event_types` | list | No | Filter events: `llm_api_success`, `llm_api_failure`. Defaults to all events. | +| `log_format` | string | No | Output format: `json_array` (default), `ndjson`, or `single`. Controls how logs are batched and sent. | + +## Pre-configured Callbacks + +Use built-in configurations from `generic_api_compatible_callbacks.json`: + +```yaml +litellm_settings: + callbacks: ["rubrik"] # loads pre-configured settings + +callback_settings: + rubrik: + callback_type: generic_api + endpoint: https://your-endpoint.com # override defaults + headers: + Authorization: Bearer ${RUBRIK_API_KEY} +``` + +## Payload Format + +Logs are sent as `StandardLoggingPayload` [objects](https://docs.litellm.ai/docs/proxy/logging_spec) in JSON format: + +```json +[ + { + "id": "chatcmpl-123", + "call_type": "litellm.completion", + "model": "gpt-3.5-turbo", + "messages": [...], + "response": {...}, + "usage": {...}, + "cost": 0.0001, + "startTime": "2024-01-01T00:00:00", + "endTime": "2024-01-01T00:00:01", + "metadata": {...} + } +] +``` + +## Environment Variables + +Set via environment variables instead of config: + +```bash +export GENERIC_LOGGER_ENDPOINT=https://your-endpoint.com +export GENERIC_LOGGER_HEADERS="Authorization=Bearer token,Custom-Header=value" +``` + +## Batch Settings + +Control batching behavior (inherits from `CustomBatchLogger`): + +```yaml +callback_settings: + my_api: + callback_type: generic_api + endpoint: https://your-endpoint.com + batch_size: 100 # default: 100 + flush_interval: 60 # seconds, default: 60 +``` + +## Log Format Options + +Control how logs are formatted and sent to your endpoint. + +### JSON Array (Default) + +```yaml +callback_settings: + my_api: + callback_type: generic_api + endpoint: https://your-endpoint.com + log_format: json_array # default if not specified +``` + +Sends all logs in a batch as a single JSON array `[{log1}, {log2}, ...]`. This is the default behavior and maintains backward compatibility. + +**When to use**: Most HTTP endpoints expecting batched JSON data. + +### NDJSON (Newline-Delimited JSON) + +```yaml +callback_settings: + my_api: + callback_type: generic_api + endpoint: https://your-endpoint.com + log_format: ndjson +``` + +Sends logs as newline-delimited JSON (one record per line): +``` +{log1} +{log2} +{log3} +``` + +**When to use**: Log aggregation services like Sumo Logic, Splunk, or Datadog that support field extraction on individual records. 
+ +**Benefits**: +- Each log is ingested as a separate message +- Field Extraction Rules work at ingest time +- Better parsing and querying performance + +### Single + +```yaml +callback_settings: + my_api: + callback_type: generic_api + endpoint: https://your-endpoint.com + log_format: single +``` + +Sends each log as an individual HTTP request in parallel when the batch is flushed. + +**When to use**: Endpoints that expect individual records, or when you need maximum compatibility. + +**Note**: This mode sends N HTTP requests per batch (more overhead). Consider using `ndjson` instead if your endpoint supports it. + + diff --git a/docs/my-website/docs/observability/helicone_integration.md b/docs/my-website/docs/observability/helicone_integration.md index 22ea051f7cde..92d0f5c3ebfa 100644 --- a/docs/my-website/docs/observability/helicone_integration.md +++ b/docs/my-website/docs/observability/helicone_integration.md @@ -10,7 +10,7 @@ https://github.com/BerriAI/litellm ::: -[Helicone](https://helicone.ai/) is an open source observability platform that proxies your LLM requests and provides key insights into your usage, spend, latency and more. +[Helicone](https://helicone.ai/) is an open sourced observability platform providing key insights into your usage, spend, latency and more. ## Quick Start @@ -25,14 +25,10 @@ from litellm import completion ## Set env variables os.environ["HELICONE_API_KEY"] = "your-helicone-key" -os.environ["OPENAI_API_KEY"] = "your-openai-key" - -# Set callbacks -litellm.success_callback = ["helicone"] # OpenAI call response = completion( - model="gpt-4o", + model="helicone/gpt-4o-mini", messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}], ) @@ -54,7 +50,7 @@ model_list: # Add Helicone callback litellm_settings: success_callback: ["helicone"] - + # Set Helicone API key environment_variables: HELICONE_API_KEY: "your-helicone-key" @@ -72,12 +68,12 @@ litellm --config config.yaml There are two main approaches to integrate Helicone with LiteLLM: -1. **Callbacks**: Log to Helicone while using any provider -2. **Proxy Mode**: Use Helicone as a proxy for advanced features +1. **As a Provider**: Use Helicone to log requests for [all models supported ](../providers/helicone) +2. **Callbacks**: Log to Helicone while using any provider ### Supported LLM Providers -Helicone can log requests across [various LLM providers](https://docs.helicone.ai/getting-started/quick-start), including: +Helicone can log requests across [all major LLM providers](https://helicone.ai/models), including: - OpenAI - Azure @@ -88,156 +84,149 @@ Helicone can log requests across [various LLM providers](https://docs.helicone.a - Replicate - And more -## Method 1: Using Callbacks +## Method 1: Using Helicone as a Provider -Log requests to Helicone while using any LLM provider directly. +Helicone's AI Gateway provides [advanced functionality](https://docs.helicone.ai) like caching, rate limiting, LLM security, and more. 
- - -```python -import os -import litellm -from litellm import completion - -## Set env variables -os.environ["HELICONE_API_KEY"] = "your-helicone-key" -os.environ["OPENAI_API_KEY"] = "your-openai-key" -# os.environ["HELICONE_API_BASE"] = "" # [OPTIONAL] defaults to `https://api.helicone.ai` - -# Set callbacks -litellm.success_callback = ["helicone"] - -# OpenAI call -response = completion( - model="gpt-4o", - messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}], -) - -print(response) -``` - - - - -```yaml title="config.yaml" -model_list: - - model_name: gpt-4 - litellm_params: - model: gpt-4 - api_key: os.environ/OPENAI_API_KEY - - model_name: claude-3 - litellm_params: - model: anthropic/claude-3-sonnet-20240229 - api_key: os.environ/ANTHROPIC_API_KEY - -# Add Helicone logging -litellm_settings: - success_callback: ["helicone"] - -# Environment variables -environment_variables: - HELICONE_API_KEY: "your-helicone-key" - OPENAI_API_KEY: "your-openai-key" - ANTHROPIC_API_KEY: "your-anthropic-key" -``` - -Start the proxy: -```bash -litellm --config config.yaml -``` - -Make requests to your proxy: -```python -import openai - -client = openai.OpenAI( - api_key="anything", # proxy doesn't require real API key - base_url="http://localhost:4000" -) - -response = client.chat.completions.create( - model="gpt-4", # This gets logged to Helicone - messages=[{"role": "user", "content": "Hello!"}] -) -``` - - + + + Set Helicone as your base URL and pass authentication headers: + + ```python + import os + import litellm + from litellm import completion + + os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + + messages = [{"content": "What is the capital of France?", "role": "user"}] + + # Helicone call - routes through Helicone gateway to any model + response = completion( + model="helicone/gpt-4o-mini", # or any 100+ models + messages=messages + ) + + print(response) + ``` + + ### Advanced Usage + + You can add custom metadata and properties to your requests using Helicone headers. 
Here are some examples: + + ```python + litellm.metadata = { + "Helicone-User-Id": "user-abc", # Specify the user making the request + "Helicone-Property-App": "web", # Custom property to add additional information + "Helicone-Property-Custom": "any-value", # Add any custom property + "Helicone-Prompt-Id": "prompt-supreme-court", # Assign an ID to associate this prompt with future versions + "Helicone-Cache-Enabled": "true", # Enable caching of responses + "Cache-Control": "max-age=3600", # Set cache limit to 1 hour + "Helicone-RateLimit-Policy": "10;w=60;s=user", # Set rate limit policy + "Helicone-Retry-Enabled": "true", # Enable retry mechanism + "helicone-retry-num": "3", # Set number of retries + "helicone-retry-factor": "2", # Set exponential backoff factor + "Helicone-Model-Override": "gpt-3.5-turbo-0613", # Override the model used for cost calculation + "Helicone-Session-Id": "session-abc-123", # Set session ID for tracking + "Helicone-Session-Path": "parent-trace/child-trace", # Set session path for hierarchical tracking + "Helicone-Omit-Response": "false", # Include response in logging (default behavior) + "Helicone-Omit-Request": "false", # Include request in logging (default behavior) + "Helicone-LLM-Security-Enabled": "true", # Enable LLM security features + "Helicone-Moderations-Enabled": "true", # Enable content moderation + } + ``` + + ### Caching and Rate Limiting + + Enable caching and set up rate limiting policies: + + ```python + litellm.metadata = { + "Helicone-Cache-Enabled": "true", # Enable caching of responses + "Cache-Control": "max-age=3600", # Set cache limit to 1 hour + "Helicone-RateLimit-Policy": "100;w=3600;s=user", # Set rate limit policy + } + ``` + + -## Method 2: Using Helicone as a Proxy +## Method 2: Using Callbacks -Helicone's proxy provides [advanced functionality](https://docs.helicone.ai/getting-started/proxy-vs-async) like caching, rate limiting, LLM security through [PromptArmor](https://promptarmor.com/) and more. +Log requests to Helicone while using any LLM provider directly. - - -Set Helicone as your base URL and pass authentication headers: - -```python -import os -import litellm -from litellm import completion - -# Configure LiteLLM to use Helicone proxy -litellm.api_base = "https://oai.hconeai.com/v1" -litellm.headers = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", -} - -# Set your OpenAI API key -os.environ["OPENAI_API_KEY"] = "your-openai-key" - -response = completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "How does a court case get to the Supreme Court?"}] -) - -print(response) -``` - -### Advanced Usage - -You can add custom metadata and properties to your requests using Helicone headers. 
Here are some examples: - -```python -litellm.metadata = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API - "Helicone-User-Id": "user-abc", # Specify the user making the request - "Helicone-Property-App": "web", # Custom property to add additional information - "Helicone-Property-Custom": "any-value", # Add any custom property - "Helicone-Prompt-Id": "prompt-supreme-court", # Assign an ID to associate this prompt with future versions - "Helicone-Cache-Enabled": "true", # Enable caching of responses - "Cache-Control": "max-age=3600", # Set cache limit to 1 hour - "Helicone-RateLimit-Policy": "10;w=60;s=user", # Set rate limit policy - "Helicone-Retry-Enabled": "true", # Enable retry mechanism - "helicone-retry-num": "3", # Set number of retries - "helicone-retry-factor": "2", # Set exponential backoff factor - "Helicone-Model-Override": "gpt-3.5-turbo-0613", # Override the model used for cost calculation - "Helicone-Session-Id": "session-abc-123", # Set session ID for tracking - "Helicone-Session-Path": "parent-trace/child-trace", # Set session path for hierarchical tracking - "Helicone-Omit-Response": "false", # Include response in logging (default behavior) - "Helicone-Omit-Request": "false", # Include request in logging (default behavior) - "Helicone-LLM-Security-Enabled": "true", # Enable LLM security features - "Helicone-Moderations-Enabled": "true", # Enable content moderation - "Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', # Set fallback models -} -``` - -### Caching and Rate Limiting - -Enable caching and set up rate limiting policies: - -```python -litellm.metadata = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", # Authenticate to send requests to Helicone API - "Helicone-Cache-Enabled": "true", # Enable caching of responses - "Cache-Control": "max-age=3600", # Set cache limit to 1 hour - "Helicone-RateLimit-Policy": "100;w=3600;s=user", # Set rate limit policy -} -``` - - + + + ```python + import os + import litellm + from litellm import completion + + ## Set env variables + os.environ["HELICONE_API_KEY"] = "your-helicone-key" + os.environ["OPENAI_API_KEY"] = "your-openai-key" + # os.environ["HELICONE_API_BASE"] = "" # [OPTIONAL] defaults to `https://api.helicone.ai` + + # Set callbacks + litellm.success_callback = ["helicone"] + + # OpenAI call + response = completion( + model="gpt-4o", + messages=[{"role": "user", "content": "Hi 👋 - I'm OpenAI"}], + ) + + print(response) + ``` + + + + + ```yaml title="config.yaml" + model_list: + - model_name: gpt-4 + litellm_params: + model: gpt-4 + api_key: os.environ/OPENAI_API_KEY + - model_name: claude-3 + litellm_params: + model: anthropic/claude-3-sonnet-20240229 + api_key: os.environ/ANTHROPIC_API_KEY + + # Add Helicone logging + litellm_settings: + success_callback: ["helicone"] + + # Environment variables + environment_variables: + HELICONE_API_KEY: "your-helicone-key" + OPENAI_API_KEY: "your-openai-key" + ANTHROPIC_API_KEY: "your-anthropic-key" + ``` + + Start the proxy: + ```bash + litellm --config config.yaml + ``` + + Make requests to your proxy: + ```python + import openai + + client = openai.OpenAI( + api_key="anything", # proxy doesn't require real API key + base_url="http://localhost:4000" + ) + + response = client.chat.completions.create( + model="gpt-4", # This gets logged to Helicone + messages=[{"role": "user", "content": "Hello!"}] + ) + ``` + + ## Session Tracking and Tracing @@ -245,57 +234,62 @@ litellm.metadata = { Track multi-step 
and agentic LLM interactions using session IDs and paths: - - -```python -import litellm - -litellm.api_base = "https://oai.hconeai.com/v1" -litellm.metadata = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", - "Helicone-Session-Id": "session-abc-123", - "Helicone-Session-Path": "parent-trace/child-trace", -} - -response = litellm.completion( - model="gpt-3.5-turbo", - messages=[{"role": "user", "content": "Start a conversation"}] -) -``` - - - - -```python -import openai - -client = openai.OpenAI( - api_key="anything", - base_url="http://localhost:4000" -) - -# First request in session -response1 = client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": "Hello"}], - extra_headers={ - "Helicone-Session-Id": "session-abc-123", - "Helicone-Session-Path": "conversation/greeting" - } -) - -# Follow-up request in same session -response2 = client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": "Tell me more"}], - extra_headers={ - "Helicone-Session-Id": "session-abc-123", - "Helicone-Session-Path": "conversation/follow-up" - } -) -``` - - + + + ```python + import os + import litellm + from litellm import completion + + os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + + messages = [{"content": "What is the capital of France?", "role": "user"}] + + response = completion( + model="helicone/gpt-4", + messages=messages, + metadata={ + "Helicone-Session-Id": "session-abc-123", + "Helicone-Session-Path": "parent-trace/child-trace", + } + ) + + print(response) + ``` + + + + + ```python + import openai + + client = openai.OpenAI( + api_key="anything", + base_url="http://localhost:4000" + ) + + # First request in session + response1 = client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Hello"}], + extra_headers={ + "Helicone-Session-Id": "session-abc-123", + "Helicone-Session-Path": "conversation/greeting" + } + ) + + # Follow-up request in same session + response2 = client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Tell me more"}], + extra_headers={ + "Helicone-Session-Id": "session-abc-123", + "Helicone-Session-Path": "conversation/follow-up" + } + ) + ``` + + - `Helicone-Session-Id`: Unique identifier for the session to group related requests @@ -304,52 +298,50 @@ response2 = client.chat.completions.create( ## Retry and Fallback Mechanisms - - -```python -import litellm - -litellm.api_base = "https://oai.hconeai.com/v1" -litellm.metadata = { - "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}", - "Helicone-Retry-Enabled": "true", - "helicone-retry-num": "3", - "helicone-retry-factor": "2", # Exponential backoff - "Helicone-Fallbacks": '["gpt-3.5-turbo", "gpt-4"]', -} - -response = litellm.completion( - model="gpt-4", - messages=[{"role": "user", "content": "Hello"}] -) -``` - - - - -```yaml title="config.yaml" -model_list: - - model_name: gpt-4 - litellm_params: - model: gpt-4 - api_key: os.environ/OPENAI_API_KEY - api_base: "https://oai.hconeai.com/v1" - -default_litellm_params: - headers: - Helicone-Auth: "Bearer ${HELICONE_API_KEY}" - Helicone-Retry-Enabled: "true" - helicone-retry-num: "3" - helicone-retry-factor: "2" - Helicone-Fallbacks: '["gpt-3.5-turbo", "gpt-4"]' - -environment_variables: - HELICONE_API_KEY: "your-helicone-key" - OPENAI_API_KEY: "your-openai-key" -``` - - + + + ```python + import litellm + + litellm.api_base = "https://ai-gateway.helicone.ai/" + litellm.metadata = { + "Helicone-Retry-Enabled": 
"true", + "helicone-retry-num": "3", + "helicone-retry-factor": "2", + } + + response = litellm.completion( + model="helicone/gpt-4o-mini/openai,claude-3-5-sonnet-20241022/anthropic", # Try OpenAI first, then fallback to Anthropic, then continue with other models + messages=[{"role": "user", "content": "Hello"}] + ) + ``` + + + + + ```yaml title="config.yaml" + model_list: + - model_name: gpt-4 + litellm_params: + model: gpt-4 + api_key: os.environ/OPENAI_API_KEY + api_base: "https://oai.hconeai.com/v1" + + default_litellm_params: + headers: + Helicone-Auth: "Bearer ${HELICONE_API_KEY}" + Helicone-Retry-Enabled: "true" + helicone-retry-num: "3" + helicone-retry-factor: "2" + Helicone-Fallbacks: '["gpt-3.5-turbo", "gpt-4"]' + + environment_variables: + HELICONE_API_KEY: "your-helicone-key" + OPENAI_API_KEY: "your-openai-key" + ``` + + -> **Supported Headers** - For a full list of supported Helicone headers and their descriptions, please refer to the [Helicone documentation](https://docs.helicone.ai/getting-started/quick-start). +> **Supported Headers** - For a full list of supported Helicone headers and their descriptions, please refer to the [Helicone documentation](https://docs.helicone.ai/features/advanced-usage/custom-properties). > By utilizing these headers and metadata options, you can gain deeper insights into your LLM usage, optimize performance, and better manage your AI workflows with Helicone and LiteLLM. diff --git a/docs/my-website/docs/observability/levo_integration.md b/docs/my-website/docs/observability/levo_integration.md new file mode 100644 index 000000000000..3e46cf6b921f --- /dev/null +++ b/docs/my-website/docs/observability/levo_integration.md @@ -0,0 +1,162 @@ +--- +sidebar_label: Levo AI +--- + +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Levo AI + +
+ +[Levo](https://levo.ai/) is an AI observability and compliance platform that provides comprehensive monitoring, analysis, and compliance tracking for LLM applications. + +## Quick Start + +Send all your LLM requests and responses to Levo for monitoring and analysis using LiteLLM's built-in Levo integration. + +### What You'll Get + +- **Complete visibility** into all LLM API calls across all providers +- **Request and response data** including prompts, completions, and metadata +- **Usage and cost tracking** with token counts and cost breakdowns +- **Error monitoring** and performance metrics +- **Compliance tracking** for audit and governance + +### Setup Steps + +**1. Install OpenTelemetry dependencies:** + +```bash +pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http opentelemetry-exporter-otlp-proto-grpc +``` + +**2. Enable Levo callback in your LiteLLM config:** + +Add to your `litellm_config.yaml`: + +```yaml +litellm_settings: + callbacks: ["levo"] +``` + +**3. Configure environment variables:** + +[Contact Levo support](mailto:support@levo.ai) to get your collector endpoint URL, API key, organization ID, and workspace ID. + +Set these required environment variables: + +```bash +export LEVOAI_API_KEY="" +export LEVOAI_ORG_ID="" +export LEVOAI_WORKSPACE_ID="" +export LEVOAI_COLLECTOR_URL="" +``` + +**Note:** The collector URL should be the full endpoint URL provided by Levo support. It will be used exactly as provided. + +**4. Start LiteLLM:** + +```bash +litellm --config config.yaml +``` + +**5. Make requests - they'll automatically be sent to Levo!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "Hello, this is a test message" + } + ] + }' +``` + +## What Data is Captured + +| Feature | Details | +|---------|---------| +| **What is logged** | OpenTelemetry Trace Data (OTLP format) | +| **Events** | Success + Failure | +| **Format** | OTLP (OpenTelemetry Protocol) | +| **Headers** | Automatically includes `Authorization: Bearer {LEVOAI_API_KEY}`, `x-levo-organization-id`, and `x-levo-workspace-id` | + +## Configuration Reference + +### Required Environment Variables + +| Variable | Description | Example | +|----------|-------------|---------| +| `LEVOAI_API_KEY` | Your Levo API key | `levo_abc123...` | +| `LEVOAI_ORG_ID` | Your Levo organization ID | `org-123456` | +| `LEVOAI_WORKSPACE_ID` | Your Levo workspace ID | `workspace-789` | +| `LEVOAI_COLLECTOR_URL` | Full collector endpoint URL from Levo support | `https://collector.levo.ai/v1/traces` | + +### Optional Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `LEVOAI_ENV_NAME` | Environment name for tagging traces | `None` | + +**Note:** The collector URL is used exactly as provided by Levo support. No path manipulation is performed. + +## Troubleshooting + +### Not seeing traces in Levo? + +1. **Verify Levo callback is enabled**: Check LiteLLM startup logs for `initializing callbacks=['levo']` + +2. **Check required environment variables**: Ensure all required variables are set: + ```bash + echo $LEVOAI_API_KEY + echo $LEVOAI_ORG_ID + echo $LEVOAI_WORKSPACE_ID + echo $LEVOAI_COLLECTOR_URL + ``` + +3. **Verify collector connectivity**: Test if your collector is reachable: + ```bash + curl /health + ``` + +4. 
**Check for initialization errors**: Look for errors in LiteLLM startup logs. Common issues: + - Missing OpenTelemetry packages: Install with `pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http opentelemetry-exporter-otlp-proto-grpc` + - Missing required environment variables: All four required variables must be set + - Invalid collector URL: Ensure the URL is correct and reachable + +5. **Enable debug logging**: + ```bash + export LITELLM_LOG="DEBUG" + ``` + +6. **Wait for async export**: OTLP sends traces asynchronously. Wait 10-15 seconds after making requests before checking Levo. + +### Common Errors + +**Error: "LEVOAI_COLLECTOR_URL environment variable is required"** +- Solution: Set the `LEVOAI_COLLECTOR_URL` environment variable with your collector endpoint URL from Levo support. + +**Error: "No module named 'opentelemetry'"** +- Solution: Install OpenTelemetry packages: `pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http opentelemetry-exporter-otlp-proto-grpc` + +## Additional Resources + +- [Levo Documentation](https://docs.levo.ai) +- [OpenTelemetry Specification](https://opentelemetry.io/docs/specs/otel/) + +## Need Help? + +For issues or questions about the Levo integration with LiteLLM, please [contact Levo support](mailto:support@levo.ai) or open an issue on the [LiteLLM GitHub repository](https://github.com/BerriAI/litellm/issues). diff --git a/docs/my-website/docs/observability/logfire_integration.md b/docs/my-website/docs/observability/logfire_integration.md index b75c5bfd496d..a1bd43a4bc4a 100644 --- a/docs/my-website/docs/observability/logfire_integration.md +++ b/docs/my-website/docs/observability/logfire_integration.md @@ -40,6 +40,10 @@ import os # from https://logfire.pydantic.dev/ os.environ["LOGFIRE_TOKEN"] = "" +# Optionally customize the base url +# from https://logfire.pydantic.dev/ +os.environ["LOGFIRE_BASE_URL"] = "" + # LLM API Keys os.environ['OPENAI_API_KEY']="" diff --git a/docs/my-website/docs/observability/opentelemetry_integration.md b/docs/my-website/docs/observability/opentelemetry_integration.md index 23532ab6e808..80ef1bcc9896 100644 --- a/docs/my-website/docs/observability/opentelemetry_integration.md +++ b/docs/my-website/docs/observability/opentelemetry_integration.md @@ -4,10 +4,24 @@ import TabItem from '@theme/TabItem'; # OpenTelemetry - Tracing LLMs with any observability tool -OpenTelemetry is a CNCF standard for observability. It connects to any observability tool, such as Jaeger, Zipkin, Datadog, New Relic, Traceloop and others. +OpenTelemetry is a CNCF standard for observability. It connects to any observability tool, such as Jaeger, Zipkin, Datadog, New Relic, Traceloop, Levo AI and others. +:::note Change in v1.81.0 + +From v1.81.0, the request/response will be set as attributes on the parent "Received Proxy Server Request" span by default. This allows you to see the request/response in the parent span in your observability tool. + +**Note:** When making multiple LLM calls within an external OTEL span context, the last call's attributes will overwrite previous calls' attributes on the parent span. 
+ +To use the older behavior with nested "litellm_request" spans (which creates separate spans for each call), set the following environment variable: + +```shell +USE_OTEL_LITELLM_REQUEST_SPAN=true +``` + +::: + ## Getting Started Install the OpenTelemetry SDK: @@ -49,6 +63,8 @@ OTEL_EXPORTER_OTLP_PROTOCOL=grpc OTEL_EXPORTER_OTLP_HEADERS="api-key=key,other-config-value=value" ``` +> Note: OTLP gRPC requires `grpcio`. Install via `pip install "litellm[grpc]"` (or `grpcio`). + @@ -59,6 +75,8 @@ OTEL_ENDPOINT="https://api.lmnr.ai:8443" OTEL_HEADERS="authorization=Bearer " ``` +> Note: OTLP gRPC requires `grpcio`. Install via `pip install "litellm[grpc]"` (or `grpcio`). +
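Putting the pieces together on the SDK side, a minimal sketch might look like the following; the endpoint, protocol, and model are placeholders, so reuse the exporter settings from whichever tab above matches your backend:

```python
import os
import litellm

# Placeholders - reuse the exporter settings from the tab you followed above
os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "http://localhost:4317"
os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "grpc"  # gRPC needs `grpcio`, see the note above

# Optional: restore the pre-v1.81.0 nested "litellm_request" spans
# os.environ["USE_OTEL_LITELLM_REQUEST_SPAN"] = "true"

litellm.callbacks = ["otel"]  # register the callback before making any calls

response = litellm.completion(
    model="gpt-4o-mini",  # any model/provider you have configured
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)
```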
@@ -114,4 +132,4 @@ If you don't see traces landing on your integration, set `OTEL_DEBUG="True"` in export OTEL_DEBUG="True" ``` -This will emit any logging issues to the console. \ No newline at end of file +This will emit any logging issues to the console. diff --git a/docs/my-website/docs/observability/phoenix_integration.md b/docs/my-website/docs/observability/phoenix_integration.md index d15eea9a8341..191f1f8044a6 100644 --- a/docs/my-website/docs/observability/phoenix_integration.md +++ b/docs/my-website/docs/observability/phoenix_integration.md @@ -6,7 +6,7 @@ Open source tracing and evaluation platform :::tip -This is community maintained, Please make an issue if you run into a bug +This is community maintained. Please make an issue if you run into a bug: https://github.com/BerriAI/litellm ::: @@ -31,17 +31,16 @@ litellm.callbacks = ["arize_phoenix"] import litellm import os -os.environ["PHOENIX_API_KEY"] = "" # Necessary only using Phoenix Cloud -os.environ["PHOENIX_COLLECTOR_HTTP_ENDPOINT"] = "" # The URL of your Phoenix OSS instance e.g. http://localhost:6006/v1/traces -# This defaults to https://app.phoenix.arize.com/v1/traces for Phoenix Cloud +# Set env variables +os.environ["PHOENIX_API_KEY"] = "d0*****" # Set the Phoenix API key here. It is necessary only when using Phoenix Cloud. +os.environ["PHOENIX_COLLECTOR_HTTP_ENDPOINT"] = "https://app.phoenix.arize.com/s//v1/traces" # Set the URL of your Phoenix OSS instance, otherwise tracer would use https://app.phoenix.arize.com/v1/traces for Phoenix Cloud. +os.environ["PHOENIX_PROJECT_NAME"] = "litellm" # Configure the project name, otherwise traces would go to "default" project. +os.environ['OPENAI_API_KEY'] = "fake-key" # Set the OpenAI API key here. -# LLM API Keys -os.environ['OPENAI_API_KEY']="" - -# set arize as a callback, litellm will send the data to arize +# Set arize_phoenix as a callback & LiteLLM will send the data to Phoenix. litellm.callbacks = ["arize_phoenix"] - -# openai call + +# OpenAI call response = litellm.completion( model="gpt-3.5-turbo", messages=[ @@ -50,8 +49,9 @@ response = litellm.completion( ) ``` -### Using with LiteLLM Proxy +## Using with LiteLLM Proxy +1. Setup config.yaml ```yaml model_list: @@ -64,12 +64,65 @@ model_list: litellm_settings: callbacks: ["arize_phoenix"] +general_settings: + master_key: "sk-1234" + environment_variables: PHOENIX_API_KEY: "d0*****" - PHOENIX_COLLECTOR_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the GRPC endpoint - PHOENIX_COLLECTOR_HTTP_ENDPOINT: "https://app.phoenix.arize.com/v1/traces" # OPTIONAL, for setting the HTTP endpoint + PHOENIX_COLLECTOR_ENDPOINT: "https://app.phoenix.arize.com/s//v1/traces" # OPTIONAL - For setting the gRPC endpoint + PHOENIX_COLLECTOR_HTTP_ENDPOINT: "https://app.phoenix.arize.com/s//v1/traces" # OPTIONAL - For setting the HTTP endpoint +``` + +> Note: If you set the gRPC endpoint, install `grpcio` via `pip install "litellm[grpc]"` (or `grpcio`). + +2. Start the proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ "model": "gpt-4o", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}' +``` + +## Supported Phoenix Endpoints +Phoenix now supports multiple deployment types. The correct endpoint depends on which version of Phoenix Cloud you are using. 
+ +**Phoenix Cloud (With Spaces - New Version)** +Use this if your Phoenix URL contains `/s/` path. + +```bash +https://app.phoenix.arize.com/s//v1/traces +``` + +**Phoenix Cloud (Legacy - Deprecated)** +Use this only if your deployment still shows the `/legacy` pattern. + +```bash +https://app.phoenix.arize.com/legacy/v1/traces ``` +**Phoenix Cloud (Without Spaces - Old Version)** +Use this if your Phoenix Cloud URL does not contain `/s/` or `/legacy` path. + +```bash +https://app.phoenix.arize.com/v1/traces +``` + +**Self-Hosted Phoenix (Local Instance)** +Use this when running Phoenix on your machine or a private server. + +```bash +http://localhost:6006/v1/traces +``` + +Depending on which Phoenix Cloud version or deployment you are using, you should set the corresponding endpoint in `PHOENIX_COLLECTOR_HTTP_ENDPOINT` or `PHOENIX_COLLECTOR_ENDPOINT`. + ## Support & Talk to Founders - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) diff --git a/docs/my-website/docs/observability/qualifire_integration.md b/docs/my-website/docs/observability/qualifire_integration.md new file mode 100644 index 000000000000..cf866f467bf2 --- /dev/null +++ b/docs/my-website/docs/observability/qualifire_integration.md @@ -0,0 +1,122 @@ +import Image from '@theme/IdealImage'; + +# Qualifire - LLM Evaluation, Guardrails & Observability + +[Qualifire](https://qualifire.ai/) provides real-time Agentic evaluations, guardrails and observability for production AI applications. + +**Key Features:** + +- **Evaluation** - Systematically assess AI behavior to detect hallucinations, jailbreaks, policy breaches, and other vulnerabilities +- **Guardrails** - Real-time interventions to prevent risks like brand damage, data leaks, and compliance breaches +- **Observability** - Complete tracing and logging for RAG pipelines, chatbots, and AI agents +- **Prompt Management** - Centralized prompt management with versioning and no-code studio + +:::tip + +Looking for Qualifire Guardrails? Check out the [Qualifire Guardrails Integration](../proxy/guardrails/qualifire.md) for real-time content moderation, prompt injection detection, PII checks, and more. + +::: + +## Pre-Requisites + +1. Create an account on [Qualifire](https://app.qualifire.ai/) +2. Get your API key and webhook URL from the Qualifire dashboard + +```bash +pip install litellm +``` + +## Quick Start + +Use just 2 lines of code to instantly log your responses **across all providers** with Qualifire. + +```python +litellm.callbacks = ["qualifire_eval"] +``` + +```python +import litellm +import os + +# Set Qualifire credentials +os.environ["QUALIFIRE_API_KEY"] = "your-qualifire-api-key" +os.environ["QUALIFIRE_WEBHOOK_URL"] = "https://your-qualifire-webhook-url" + +# LLM API Keys +os.environ['OPENAI_API_KEY'] = "your-openai-api-key" + +# Set qualifire_eval as a callback & LiteLLM will send the data to Qualifire +litellm.callbacks = ["qualifire_eval"] + +# OpenAI call +response = litellm.completion( + model="gpt-5", + messages=[ + {"role": "user", "content": "Hi 👋 - i'm openai"} + ] +) +``` + +## Using with LiteLLM Proxy + +1. 
Setup config.yaml + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + callbacks: ["qualifire_eval"] + +general_settings: + master_key: "sk-1234" + +environment_variables: + QUALIFIRE_API_KEY: "your-qualifire-api-key" + QUALIFIRE_WEBHOOK_URL: "https://app.qualifire.ai/api/v1/webhooks/evaluations" +``` + +2. Start the proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ "model": "gpt-4o", "messages": [{"role": "user", "content": "Hi 👋 - i'm openai"}]}' +``` + +## Environment Variables + +| Variable | Description | +| ----------------------- | ------------------------------------------------------ | +| `QUALIFIRE_API_KEY` | Your Qualifire API key for authentication | +| `QUALIFIRE_WEBHOOK_URL` | The Qualifire webhook endpoint URL from your dashboard | + +## What Gets Logged? + +The [LiteLLM Standard Logging Payload](https://docs.litellm.ai/docs/proxy/logging_spec) is sent to your Qualifire endpoint on each successful LLM API call. + +This includes: + +- Request messages and parameters +- Response content and metadata +- Token usage statistics +- Latency metrics +- Model information +- Cost data + +Once data is in Qualifire, you can: + +- Run evaluations to detect hallucinations, toxicity, and policy violations +- Set up guardrails to block or modify responses in real-time +- View traces across your entire AI pipeline +- Track performance and quality metrics over time diff --git a/docs/my-website/docs/observability/signoz.md b/docs/my-website/docs/observability/signoz.md new file mode 100644 index 000000000000..f306b143ef0d --- /dev/null +++ b/docs/my-website/docs/observability/signoz.md @@ -0,0 +1,398 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# SigNoz LiteLLM Integration + +For more details on setting up observability for LiteLLM, check out the [SigNoz LiteLLM observability docs](https://signoz.io/docs/litellm-observability/). + + +## Overview + +This guide walks you through setting up observability and monitoring for LiteLLM SDK and Proxy Server using [OpenTelemetry](https://opentelemetry.io/) and exporting logs, traces, and metrics to SigNoz. With this integration, you can observe various models performance, capture request/response details, and track system-level metrics in SigNoz, giving you real-time visibility into latency, error rates, and usage trends for your LiteLLM applications. + +Instrumenting LiteLLM in your AI applications with telemetry ensures full observability across your AI workflows, making it easier to debug issues, optimize performance, and understand user interactions. By leveraging SigNoz, you can analyze correlated traces, logs, and metrics in unified dashboards, configure alerts, and gain actionable insights to continuously improve reliability, responsiveness, and user experience. 
+ +## Prerequisites + +- A [SigNoz Cloud account](https://signoz.io/teams/) with an active ingestion key +- Internet access to send telemetry data to SigNoz Cloud +- [LiteLLM](https://www.litellm.ai/) SDK or Proxy integration +- For Python: `pip` installed for managing Python packages and _(optional but recommended)_ a Python virtual environment to isolate dependencies + +## Monitoring LiteLLM + +LiteLLM can be monitored in two ways: using the **LiteLLM SDK** (directly embedded in your Python application code for programmatic LLM calls) or the **LiteLLM Proxy Server** (a standalone server that acts as a centralized gateway for managing and routing LLM requests across your infrastructure). + + + + +For more detailed info on instrumenting your LiteLLM SDK applications click [here](https://docs.litellm.ai/docs/observability/opentelemetry_integration). + + + + + +No-code auto-instrumentation is recommended for quick setup with minimal code changes. It's ideal when you want to get observability up and running without modifying your application code and are leveraging standard instrumentor libraries. + +**Step 1:** Install the necessary packages in your Python environment. + +```bash +pip install \ + opentelemetry-api \ + opentelemetry-distro \ + opentelemetry-exporter-otlp \ + httpx \ + opentelemetry-instrumentation-httpx \ + litellm +``` + +**Step 2:** Add Automatic Instrumentation + +```bash +opentelemetry-bootstrap --action=install +``` + +**Step 3:** Instrument your LiteLLM SDK application + +Initialize LiteLLM SDK instrumentation by calling `litellm.callbacks = ["otel"]`: + +```python +from litellm import litellm + +litellm.callbacks = ["otel"] +``` + +This call enables automatic tracing, logs, and metrics collection for all LiteLLM SDK calls in your application. + +> 📌 Note: Ensure this is called before any LiteLLM related calls to properly configure instrumentation of your application + +**Step 4:** Run an example + +```python +from litellm import completion, litellm + +litellm.callbacks = ["otel"] + +response = completion( + model="openai/gpt-4o", + messages=[{ "content": "What is SigNoz","role": "user"}] +) + +print(response) +``` + +> 📌 Note: LiteLLM supports a [variety of model providers](https://docs.litellm.ai/docs/providers) for LLMs. In this example, we're using OpenAI. Before running this code, ensure that you have set the environment variable `OPENAI_API_KEY` with your generated API key. + +**Step 5:** Run your application with auto-instrumentation + +```bash +OTEL_RESOURCE_ATTRIBUTES="service.name=" \ +OTEL_EXPORTER_OTLP_ENDPOINT="https://ingest..signoz.cloud:443" \ +OTEL_EXPORTER_OTLP_HEADERS="signoz-ingestion-key=" \ +OTEL_EXPORTER_OTLP_PROTOCOL=grpc \ +OTEL_TRACES_EXPORTER=otlp \ +OTEL_METRICS_EXPORTER=otlp \ +OTEL_LOGS_EXPORTER=otlp \ +OTEL_PYTHON_LOG_CORRELATION=true \ +OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED=true \ +OTEL_PYTHON_DISABLED_INSTRUMENTATIONS=openai \ +opentelemetry-instrument +``` + +> Note: OTLP gRPC requires `grpcio`. Install via `pip install "litellm[grpc]"` (or `grpcio`). + +> 📌 Note: We're using `OTEL_PYTHON_DISABLED_INSTRUMENTATIONS=openai` in the run command to disable the OpenAI instrumentor for tracing. This avoids conflicts with LiteLLM's native telemetry/instrumentation, ensuring that telemetry is captured exclusively through LiteLLM's built-in instrumentation. 
+ +- **``** is the name of your service +- Set the `` to match your SigNoz Cloud [region](https://signoz.io/docs/ingestion/signoz-cloud/overview/#endpoint) +- Replace `` with your SigNoz [ingestion key](https://signoz.io/docs/ingestion/signoz-cloud/keys/) +- Replace `` with the actual command you would use to run your application. For example: `python main.py` + +> 📌 Note: Using self-hosted SigNoz? Most steps are identical. To adapt this guide, update the endpoint and remove the ingestion key header as shown in [Cloud → Self-Hosted](https://signoz.io/docs/ingestion/cloud-vs-self-hosted/#cloud-to-self-hosted). + + + + + + +Code-based instrumentation gives you fine-grained control over your telemetry configuration. Use this approach when you need to customize resource attributes, sampling strategies, or integrate with existing observability infrastructure. + +**Step 1:** Install the necessary packages in your Python environment. + +```bash +pip install \ + opentelemetry-api \ + opentelemetry-sdk \ + opentelemetry-exporter-otlp \ + opentelemetry-instrumentation-httpx \ + opentelemetry-instrumentation-system-metrics \ + litellm +``` + +**Step 2:** Import the necessary modules in your Python application + +**Traces:** + +```python +from opentelemetry import trace +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +``` + +**Logs:** + +```python +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter +from opentelemetry._logs import set_logger_provider +import logging +``` + +**Metrics:** + +```python +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry import metrics +from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor +``` + +**Step 3:** Set up the OpenTelemetry Tracer Provider to send traces directly to SigNoz Cloud + +```python +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry import trace +import os + +resource = Resource.create({"service.name": ""}) +provider = TracerProvider(resource=resource) +span_exporter = OTLPSpanExporter( + endpoint= os.getenv("OTEL_EXPORTER_TRACES_ENDPOINT"), + headers={"signoz-ingestion-key": os.getenv("SIGNOZ_INGESTION_KEY")}, +) +processor = BatchSpanProcessor(span_exporter) +provider.add_span_processor(processor) +trace.set_tracer_provider(provider) +``` + +- **``** is the name of your service +- **`OTEL_EXPORTER_TRACES_ENDPOINT`** → SigNoz Cloud trace endpoint with appropriate [region](https://signoz.io/docs/ingestion/signoz-cloud/overview/#endpoint):`https://ingest..signoz.cloud:443/v1/traces` +- **`SIGNOZ_INGESTION_KEY`** → Your SigNoz [ingestion key](https://signoz.io/docs/ingestion/signoz-cloud/keys/) + + +> 📌 Note: Using self-hosted SigNoz? Most steps are identical. 
To adapt this guide, update the endpoint and remove the ingestion key header as shown in [Cloud → Self-Hosted](https://signoz.io/docs/ingestion/cloud-vs-self-hosted/#cloud-to-self-hosted). + + +**Step 4**: Setup Logs + +```python +import logging +from opentelemetry.sdk.resources import Resource +from opentelemetry._logs import set_logger_provider +from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler +from opentelemetry.sdk._logs.export import BatchLogRecordProcessor +from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter +import os + +resource = Resource.create({"service.name": ""}) +logger_provider = LoggerProvider(resource=resource) +set_logger_provider(logger_provider) + +otlp_log_exporter = OTLPLogExporter( + endpoint= os.getenv("OTEL_EXPORTER_LOGS_ENDPOINT"), + headers={"signoz-ingestion-key": os.getenv("SIGNOZ_INGESTION_KEY")}, +) +logger_provider.add_log_record_processor( + BatchLogRecordProcessor(otlp_log_exporter) +) +# Attach OTel logging handler to root logger +handler = LoggingHandler(level=logging.INFO, logger_provider=logger_provider) +logging.basicConfig(level=logging.INFO, handlers=[handler]) + +logger = logging.getLogger(__name__) +``` + +- **``** is the name of your service +- **`OTEL_EXPORTER_LOGS_ENDPOINT`** → SigNoz Cloud endpoint with appropriate [region](https://signoz.io/docs/ingestion/signoz-cloud/overview/#endpoint):`https://ingest..signoz.cloud:443/v1/logs` +- **`SIGNOZ_INGESTION_KEY`** → Your SigNoz [ingestion key](https://signoz.io/docs/ingestion/signoz-cloud/keys/) + +> 📌 Note: Using self-hosted SigNoz? Most steps are identical. To adapt this guide, update the endpoint and remove the ingestion key header as shown in [Cloud → Self-Hosted](https://signoz.io/docs/ingestion/cloud-vs-self-hosted/#cloud-to-self-hosted). + + +**Step 5**: Setup Metrics + +```python +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry import metrics +from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor +import os + +resource = Resource.create({"service.name": ""}) +metric_exporter = OTLPMetricExporter( + endpoint= os.getenv("OTEL_EXPORTER_METRICS_ENDPOINT"), + headers={"signoz-ingestion-key": os.getenv("SIGNOZ_INGESTION_KEY")}, +) +reader = PeriodicExportingMetricReader(metric_exporter) +metric_provider = MeterProvider(metric_readers=[reader], resource=resource) +metrics.set_meter_provider(metric_provider) + +meter = metrics.get_meter(__name__) + +# turn on out-of-the-box metrics +SystemMetricsInstrumentor().instrument() +HTTPXClientInstrumentor().instrument() +``` + +- **``** is the name of your service +- **`OTEL_EXPORTER_METRICS_ENDPOINT`** → SigNoz Cloud endpoint with appropriate [region](https://signoz.io/docs/ingestion/signoz-cloud/overview/#endpoint):`https://ingest..signoz.cloud:443/v1/metrics` +- **`SIGNOZ_INGESTION_KEY`** → Your SigNoz [ingestion key](https://signoz.io/docs/ingestion/signoz-cloud/keys/) + +> 📌 Note: Using self-hosted SigNoz? Most steps are identical. To adapt this guide, update the endpoint and remove the ingestion key header as shown in [Cloud → Self-Hosted](https://signoz.io/docs/ingestion/cloud-vs-self-hosted/#cloud-to-self-hosted). 
+ + +> 📌 Note: SystemMetricsInstrumentor provides system metrics (CPU, memory, etc.), and HTTPXClientInstrumentor provides outbound HTTP request metrics such as request duration. If you want to add custom metrics to your LiteLLM application, see [Python Custom Metrics](https://signoz.io/opentelemetry/python-custom-metrics/). + +**Step 6:** Instrument your LiteLLM application + +Initialize LiteLLM SDK instrumentation by calling `litellm.callbacks = ["otel"]`: + +```python +from litellm import litellm + +litellm.callbacks = ["otel"] +``` + +This call enables automatic tracing, logs, and metrics collection for all LiteLLM SDK calls in your application. + +> 📌 Note: Ensure this is called before any LiteLLM related calls to properly configure instrumentation of your application + +**Step 7:** Run an example + +```python +from litellm import completion, litellm + +litellm.callbacks = ["otel"] + +response = completion( + model="openai/gpt-4o", + messages=[{ "content": "What is SigNoz","role": "user"}] +) + +print(response) +``` + +> 📌 Note: LiteLLM supports a [variety of model providers](https://docs.litellm.ai/docs/providers) for LLMs. In this example, we're using OpenAI. Before running this code, ensure that you have set the environment variable `OPENAI_API_KEY` with your generated API key. + + + + +## View Traces, Logs, and Metrics in SigNoz + +Your LiteLLM commands should now automatically emit traces, logs, and metrics. + +You should be able to view traces in Signoz Cloud under the traces tab: + +![LiteLLM SDK Trace View](https://signoz.io/img/docs/llm/litellm/litellmsdk-traces.webp) + +When you click on a trace in SigNoz, you'll see a detailed view of the trace, including all associated spans, along with their events and attributes. + +![LiteLLM SDK Detailed Trace View](https://signoz.io/img/docs/llm/litellm/litellmsdk-detailed-traces.webp) + +You should be able to view logs in Signoz Cloud under the logs tab. You can also view logs by clicking on the “Related Logs” button in the trace view to see correlated logs: + +![LiteLLM SDK Logs View](https://signoz.io/img/docs/llm/litellm/litellmsdk-logs.webp) + +When you click on any of these logs in SigNoz, you'll see a detailed view of the log, including attributes: + +![LiteLLM SDK Detailed Logs View](https://signoz.io/img/docs/llm/litellm/litellmsdk-detailed-logs.webp) + +You should be able to see LiteLLM related metrics in Signoz Cloud under the metrics tab: + +![LiteLLM SDK Metrics View](https://signoz.io/img/docs/llm/litellm/litellmsdk-metrics.webp) + +When you click on any of these metrics in SigNoz, you'll see a detailed view of the metric, including attributes: + +![LiteLLM Detailed Metrics View](https://signoz.io/img/docs/llm/litellm/litellmsdk-detailed-metrics.webp) + +## Dashboard + +You can also check out our custom LiteLLM SDK dashboard [here](https://signoz.io/docs/dashboards/dashboard-templates/litellm-sdk-dashboard/) which provides specialized visualizations for monitoring your LiteLLM usage in applications. The dashboard includes pre-built charts specifically tailored for LLM usage, along with import instructions to get started quickly. + +![LiteLLM SDK Dashboard Template](https://signoz.io/img/docs/llm/litellm/litellm-sdk-dashboard.webp) + + + + + +**Step 1:** Install the necessary packages in your Python environment. 
+ +```bash +pip install opentelemetry-api \ + opentelemetry-sdk \ + opentelemetry-exporter-otlp \ + 'litellm[proxy]' +``` + +**Step 2:** Configure otel for the LiteLLM Proxy Server + +Add the following to `config.yaml`: + +```yaml +litellm_settings: + callbacks: ['otel'] +``` + +**Step 3:** Set the following environment variables: + +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT="https://ingest..signoz.cloud:443" +export OTEL_EXPORTER_OTLP_HEADERS="signoz-ingestion-key=" +export OTEL_EXPORTER_OTLP_PROTOCOL="grpc" +export OTEL_TRACES_EXPORTER="otlp" +export OTEL_METRICS_EXPORTER="otlp" +export OTEL_LOGS_EXPORTER="otlp" +``` + +> Note: OTLP gRPC requires `grpcio`. Install via `pip install "litellm[grpc]"` (or `grpcio`). + +- Set the `` to match your SigNoz Cloud [region](https://signoz.io/docs/ingestion/signoz-cloud/overview/#endpoint) +- Replace `` with your SigNoz [ingestion key](https://signoz.io/docs/ingestion/signoz-cloud/keys/) + +> 📌 Note: Using self-hosted SigNoz? Most steps are identical. To adapt this guide, update the endpoint and remove the ingestion key header as shown in [Cloud → Self-Hosted](https://signoz.io/docs/ingestion/cloud-vs-self-hosted/#cloud-to-self-hosted). + + +**Step 4:** Run the proxy server using the config file: + +```bash +litellm --config config.yaml +``` + +Now any calls made through your LiteLLM proxy server will be traced and sent to SigNoz. + +You should be able to view traces in Signoz Cloud under the traces tab: + +![LiteLLM Proxy Trace View](https://signoz.io/img/docs/llm/litellm/litellmproxy-traces.webp) + +When you click on a trace in SigNoz, you'll see a detailed view of the trace, including all associated spans, along with their events and attributes. + +![LiteLLM Proxy Detailed Trace View](https://signoz.io/img/docs/llm/litellm/litellmproxy-detailed-traces.webp) + +## Dashboard + +You can also check out our custom LiteLLM Proxy dashboard [here](https://signoz.io/docs/dashboards/dashboard-templates/litellm-proxy-dashboard/) which provides specialized visualizations for monitoring your LiteLLM Proxy usage in applications. The dashboard includes pre-built charts specifically tailored for LLM usage, along with import instructions to get started quickly. + +![LiteLLM Proxy Dashboard Template](https://signoz.io/img/docs/llm/litellm/litellm-proxy-dashboard.webp) + + + diff --git a/docs/my-website/docs/observability/sumologic_integration.md b/docs/my-website/docs/observability/sumologic_integration.md new file mode 100644 index 000000000000..c30ee94dad4e --- /dev/null +++ b/docs/my-website/docs/observability/sumologic_integration.md @@ -0,0 +1,332 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Sumo Logic + +Send LiteLLM logs to Sumo Logic for observability, monitoring, and analysis. + +Sumo Logic is a cloud-native machine data analytics platform that provides real-time insights into your applications and infrastructure. +https://www.sumologic.com/ + +:::info +We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or +join our [discord](https://discord.gg/wuPM9dRgDw) +::: + +## Pre-Requisites + +1. Create a Sumo Logic account at https://www.sumologic.com/ +2. 
Set up an HTTP Logs and Metrics Source in Sumo Logic: + - Go to **Manage Data** > **Collection** > **Collection** + - Click **Add Source** next to a Hosted Collector + - Select **HTTP Logs & Metrics** + - Copy the generated URL (it contains the authentication token) + +For more details, see the [HTTP Logs & Metrics Source](https://www.sumologic.com/help/docs/send-data/hosted-collectors/http-source/logs-metrics/) documentation. + +```shell +pip install litellm +``` + +## Quick Start + +Use just 2 lines of code to instantly log your LLM responses to Sumo Logic. + +The Sumo Logic HTTP Source URL includes the authentication token, so no separate API key is required. + + + + +```python +litellm.callbacks = ["sumologic"] +``` + +```python +import litellm +import os + +# Sumo Logic HTTP Source URL (includes auth token) +os.environ["SUMOLOGIC_WEBHOOK_URL"] = "https://collectors.sumologic.com/receiver/v1/http/your-token-here" + +# LLM API Keys +os.environ['OPENAI_API_KEY'] = "" + +# Set sumologic as a callback +litellm.callbacks = ["sumologic"] + +# OpenAI call +response = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + {"role": "user", "content": "Hi 👋 - I'm testing Sumo Logic integration"} + ] +) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: openai/gpt-3.5-turbo + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + callbacks: ["sumologic"] + +environment_variables: + SUMOLOGIC_WEBHOOK_URL: os.environ/SUMOLOGIC_WEBHOOK_URL +``` + +2. Start LiteLLM Proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "Hey, how are you?" + } + ] +}' +``` + + + + +## What Data is Logged? + +LiteLLM sends the [Standard Logging Payload](https://docs.litellm.ai/docs/proxy/logging_spec) to Sumo Logic, which includes: + +- **Request details**: Model, messages, parameters +- **Response details**: Completion text, token usage, latency +- **Metadata**: User ID, custom metadata, timestamps +- **Cost tracking**: Response cost based on token usage + +Example payload: + +```json +{ + "id": "chatcmpl-123", + "call_type": "litellm.completion", + "model": "gpt-3.5-turbo", + "messages": [ + {"role": "user", "content": "Hello"} + ], + "response": { + "choices": [{ + "message": { + "role": "assistant", + "content": "Hi there!" + } + }] + }, + "usage": { + "prompt_tokens": 10, + "completion_tokens": 5, + "total_tokens": 15 + }, + "response_cost": 0.0001, + "start_time": "2024-01-01T00:00:00", + "end_time": "2024-01-01T00:00:01" +} +``` + +## Advanced Configuration + +### Log Format + +The Sumo Logic integration uses **NDJSON (newline-delimited JSON)** format by default. This format is optimal for Sumo Logic's parsing capabilities and allows Field Extraction Rules to work at ingest time. 
+ +#### NDJSON Format + +Each log entry is sent as a separate line in the HTTP request: +``` +{"id":"chatcmpl-1","model":"gpt-3.5-turbo","response_cost":0.0001,...} +{"id":"chatcmpl-2","model":"gpt-4","response_cost":0.0003,...} +{"id":"chatcmpl-3","model":"gpt-3.5-turbo","response_cost":0.0001,...} +``` + +#### Benefits for Field Extraction Rules (FERs) + +With NDJSON format, you can create Field Extraction Rules directly: + +``` +_sourceCategory=litellm/logs +| json field=_raw "model", "response_cost", "user" as model, cost, user +``` + +**Before NDJSON** (with JSON array format): +- Required `parse regex ... multi` workaround +- FERs couldn't parse at ingest time +- Query-time parsing impacted dashboard performance + +**After NDJSON**: +- ✅ FERs parse fields at ingest time +- ✅ No query-time workarounds needed +- ✅ Better dashboard performance +- ✅ Simpler query syntax + +#### Changing the Log Format (Advanced) + +If you need to change the log format (not recommended for Sumo Logic): + +```yaml +callback_settings: + sumologic: + callback_type: generic_api + callback_name: sumologic + log_format: json_array # Override to use JSON array instead +``` + +### Batching Settings + +Control how LiteLLM batches logs before sending to Sumo Logic: + + + + +```python +import litellm + +os.environ["SUMOLOGIC_WEBHOOK_URL"] = "https://collectors.sumologic.com/receiver/v1/http/your-token" + +litellm.callbacks = ["sumologic"] + +# Configure batch settings (optional) +# These are inherited from CustomBatchLogger +# Default batch_size: 100 +# Default flush_interval: 60 seconds +``` + + + + +```yaml +litellm_settings: + callbacks: ["sumologic"] + +environment_variables: + SUMOLOGIC_WEBHOOK_URL: os.environ/SUMOLOGIC_WEBHOOK_URL +``` + + + + +### Compressed Data + +Sumo Logic supports compressed data (gzip or deflate). LiteLLM automatically handles compression when beneficial. + +Benefits: +- Reduced network usage +- Faster message delivery +- Lower data transfer costs + +### Query Logs in Sumo Logic + +Once logs are flowing to Sumo Logic, you can query them using the Sumo Logic Query Language: + +```sql +_sourceCategory=litellm +| json "model", "response_cost", "usage.total_tokens" as model, cost, tokens +| sum(cost) by model +``` + +Example queries: + +**Total cost by model:** +```sql +_sourceCategory=litellm +| json "model", "response_cost" as model, cost +| sum(cost) as total_cost by model +| sort by total_cost desc +``` + +**Average response time:** +```sql +_sourceCategory=litellm +| json "start_time", "end_time" as start, end +| parse regex field=start "(?\d+)" +| parse regex field=end "(?\d+)" +| (end_ms - start_ms) as response_time_ms +| avg(response_time_ms) as avg_response_time +``` + +**Requests per user:** +```sql +_sourceCategory=litellm +| json "model_parameters.user" as user +| count by user +``` + +## Authentication + +The Sumo Logic HTTP Source URL includes the authentication token, so you only need to set the `SUMOLOGIC_WEBHOOK_URL` environment variable. + +**Security Best Practices:** +- Keep your HTTP Source URL private (it contains the auth token) +- Store it in environment variables or secrets management +- Regenerate the URL if it's compromised (in Sumo Logic UI) +- Use separate HTTP Sources for different environments (dev, staging, prod) + +## Getting Your Sumo Logic URL + +1. Log in to [Sumo Logic](https://www.sumologic.com/) +2. Go to **Manage Data** > **Collection** > **Collection** +3. Click **Add Source** next to a Hosted Collector +4. Select **HTTP Logs & Metrics** +5. 
Configure the source: + - **Name**: LiteLLM Logs + - **Source Category**: litellm (optional, but helps with queries) +6. Click **Save** +7. Copy the displayed URL - it will look like: + ``` + https://collectors.sumologic.com/receiver/v1/http/ZaVnC4dhaV39Tn37... + ``` + +## Troubleshooting + +### Logs not appearing in Sumo Logic + +1. **Verify the URL**: Make sure `SUMOLOGIC_WEBHOOK_URL` is set correctly +2. **Check the HTTP Source**: Ensure it's active in Sumo Logic UI +3. **Wait for batching**: Logs are sent in batches, wait 60 seconds +4. **Check for errors**: Enable debug logging in LiteLLM: + ```python + litellm.set_verbose = True + ``` + +### URL Format + +The URL must be the complete HTTP Source URL from Sumo Logic: +- ✅ Correct: `https://collectors.sumologic.com/receiver/v1/http/ZaVnC4dhaV39Tn37...` + +### No authentication errors + +If you get authentication errors, regenerate the HTTP Source URL in Sumo Logic: +1. Go to your HTTP Source in Sumo Logic +2. Click the settings icon +3. Click **Show URL** +4. Click **Regenerate URL** +5. Update your `SUMOLOGIC_WEBHOOK_URL` environment variable + +## Support & Talk to Founders + +- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) +- [Community Discord 💭](https://discord.gg/wuPM9dRgDw) +- Our numbers 📞 +1 (770) 8783-106 / ‭+1 (412) 618-6238‬ +- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai diff --git a/docs/my-website/docs/ocr.md b/docs/my-website/docs/ocr.md index 2cb87edc4612..93cb74ee69fc 100644 --- a/docs/my-website/docs/ocr.md +++ b/docs/my-website/docs/ocr.md @@ -5,7 +5,7 @@ | Cost Tracking | ✅ | | Logging | ✅ (Basic Logging not supported) | | Load Balancing | ✅ | -| Supported Providers | `mistral`, `azure_ai` | +| Supported Providers | `mistral`, `azure_ai`, `vertex_ai` | :::tip @@ -262,4 +262,5 @@ The response follows Mistral's OCR format with the following structure: |-------------|--------------------| | Mistral AI | [Usage](#quick-start) | | Azure AI | [Usage](../docs/providers/azure_ocr) | +| Vertex AI | [Usage](../docs/providers/vertex_ocr) | diff --git a/docs/my-website/docs/oidc.md b/docs/my-website/docs/oidc.md index 3db4b6ecdc5d..b541329aa386 100644 --- a/docs/my-website/docs/oidc.md +++ b/docs/my-website/docs/oidc.md @@ -106,7 +106,7 @@ model_list: aws_region_name: us-west-2 aws_session_name: "my-test-session" aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci" - aws_web_identity_token: "oidc/circleci_v2/" + aws_web_identity_token: "oidc/example-provider/" ``` #### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock diff --git a/docs/my-website/docs/pass_through/anthropic_completion.md b/docs/my-website/docs/pass_through/anthropic_completion.md index e644b7d348f7..38c42ed990df 100644 --- a/docs/my-website/docs/pass_through/anthropic_completion.md +++ b/docs/my-website/docs/pass_through/anthropic_completion.md @@ -1,13 +1,13 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Anthropic SDK +# Anthropic Passthrough Pass-through endpoints for Anthropic - call provider-specific endpoint, in native format (no translation). 
| Feature | Supported | Notes | |-------|-------|-------| -| Cost Tracking | ✅ | supports all models on `/messages` endpoint | +| Cost Tracking | ✅ | supports all models on `/messages`, `/v1/messages/batches` endpoint | | Logging | ✅ | works across all integrations | | End-user Tracking | ✅ | disable prometheus tracking via `litellm.disable_end_user_cost_tracking_prometheus_only`| | Streaming | ✅ | | @@ -263,6 +263,19 @@ curl https://api.anthropic.com/v1/messages/batches \ }' ``` +:::note Configuration Required for Batch Cost Tracking +For batch passthrough cost tracking to work properly, you need to define the Anthropic model in your `proxy_config.yaml`: + +```yaml +model_list: + - model_name: claude-sonnet-4-5-20250929 # or any alias + litellm_params: + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +This ensures the polling mechanism can correctly identify the provider and retrieve batch status for cost calculation. +::: ## Advanced diff --git a/docs/my-website/docs/pass_through/openai_passthrough.md b/docs/my-website/docs/pass_through/openai_passthrough.md index 271236957516..49026f8aa2d6 100644 --- a/docs/my-website/docs/pass_through/openai_passthrough.md +++ b/docs/my-website/docs/pass_through/openai_passthrough.md @@ -1,6 +1,6 @@ # OpenAI Passthrough -Pass-through endpoints for `/openai` +Pass-through endpoints for direct OpenAI API access ## Overview @@ -10,15 +10,33 @@ Pass-through endpoints for `/openai` | Logging | ✅ | Works across all integrations | | Streaming | ✅ | Fully supported | -### When to use this? +## Available Endpoints + +### `/openai_passthrough` - Recommended +Dedicated passthrough endpoint that guarantees direct routing to OpenAI without conflicts. + +**Use this for:** +- OpenAI Responses API (`/v1/responses`) +- Any endpoint where you need guaranteed passthrough +- When `/openai` routes are conflicting with LiteLLM's native implementations + +### `/openai` - Legacy +Standard passthrough endpoint that may conflict with LiteLLM's native implementations. + +**Note:** Some endpoints like `/openai/v1/responses` will be routed to LiteLLM's native implementation instead of OpenAI. + +## When to use this? - For 90% of your use cases, you should use the [native LiteLLM OpenAI Integration](https://docs.litellm.ai/docs/providers/openai) (`/chat/completions`, `/embeddings`, `/completions`, `/images`, `/batches`, etc.) -- Use this passthrough to call less popular or newer OpenAI endpoints that LiteLLM doesn't fully support yet, such as `/assistants`, `/threads`, `/vector_stores` +- Use `/openai_passthrough` to call less popular or newer OpenAI endpoints that LiteLLM doesn't fully support yet, such as `/assistants`, `/threads`, `/vector_stores`, `/responses` -Simply replace `https://api.openai.com` with `LITELLM_PROXY_BASE_URL/openai` +Simply replace `https://api.openai.com` with `LITELLM_PROXY_BASE_URL/openai_passthrough` ## Usage Examples +Requirements: +Set `OPENAI_API_KEY` in your environment variables. 
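+
+### Responses API
+
+Because `/openai/v1/responses` is routed to LiteLLM's native implementation rather than passed through, the Responses API is a natural fit for the dedicated passthrough route. The sketch below reuses the same client setup as the Assistants example that follows; the model name and prompt are illustrative, and `sk-anything` stands in for your LiteLLM proxy key.
+
+```python
+import openai
+
+# Point the OpenAI client at the dedicated passthrough route
+client = openai.OpenAI(
+    base_url="http://0.0.0.0:4000/openai_passthrough",  # LITELLM_PROXY_BASE_URL/openai_passthrough
+    api_key="sk-anything",  # your LiteLLM proxy key
+)
+
+# This request is forwarded to OpenAI as-is through the passthrough route
+response = client.responses.create(
+    model="gpt-4o",
+    input="Write a one-sentence summary of LiteLLM.",
+)
+
+print(response.output_text)
+```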
+ ### Assistants API #### Create OpenAI Client @@ -31,7 +49,7 @@ Make sure you do the following: import openai client = openai.OpenAI( - base_url="http://0.0.0.0:4000/openai", # /openai + base_url="http://0.0.0.0:4000/openai_passthrough", # /openai_passthrough api_key="sk-anything" # ) ``` diff --git a/docs/my-website/docs/pass_through/vertex_ai.md b/docs/my-website/docs/pass_through/vertex_ai.md index 2efef60070da..00df6def704c 100644 --- a/docs/my-website/docs/pass_through/vertex_ai.md +++ b/docs/my-website/docs/pass_through/vertex_ai.md @@ -45,7 +45,7 @@ model_list: litellm_params: model: vertex_ai/gemini-1.0-pro vertex_project: adroit-crow-413218 - vertex_region: us-central1 + vertex_location: us-central1 vertex_credentials: /path/to/credentials.json use_in_pass_through: true # 👈 KEY CHANGE ``` @@ -57,9 +57,9 @@ model_list: ```yaml -default_vertex_config: +default_vertex_config: vertex_project: adroit-crow-413218 - vertex_region: us-central1 + vertex_location: us-central1 vertex_credentials: /path/to/credentials.json ``` @@ -461,3 +461,48 @@ generateContent(); + +### Using Anthropic Beta Features on Vertex AI + +When using Anthropic models via Vertex AI passthrough (e.g., Claude on Vertex), you can enable Anthropic beta features like extended context windows. + +The `anthropic-beta` header is automatically forwarded to Vertex AI when calling Anthropic models. + +```bash +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -H "anthropic-beta: context-1m-2025-08-07" \ + -d '{ + "anthropic_version": "vertex-2023-10-16", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 500 + }' +``` + +### Forwarding Custom Headers with `x-pass-` Prefix + +You can forward any custom header to the provider by prefixing it with `x-pass-`. The prefix is stripped before the header is sent to the provider. + +For example: +- `x-pass-anthropic-beta: value` becomes `anthropic-beta: value` +- `x-pass-custom-header: value` becomes `custom-header: value` + +This is useful when you need to send provider-specific headers that aren't in the default allowlist. + +```bash +curl http://localhost:4000/vertex_ai/v1/projects/${PROJECT_ID}/locations/us-east5/publishers/anthropic/models/claude-3-5-sonnet:rawPredict \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -H "x-pass-anthropic-beta: context-1m-2025-08-07" \ + -H "x-pass-custom-feature: enabled" \ + -d '{ + "anthropic_version": "vertex-2023-10-16", + "messages": [{"role": "user", "content": "Hello"}], + "max_tokens": 500 + }' +``` + +:::info +The `x-pass-` prefix works for all LLM pass-through endpoints, not just Vertex AI. +::: diff --git a/docs/my-website/docs/pass_through/vertex_ai_search_datastores.md b/docs/my-website/docs/pass_through/vertex_ai_search_datastores.md index 90a85312fab5..20501d71f97d 100644 --- a/docs/my-website/docs/pass_through/vertex_ai_search_datastores.md +++ b/docs/my-website/docs/pass_through/vertex_ai_search_datastores.md @@ -50,7 +50,7 @@ Register your datastore once. Reference it by ID. 
vector_store_registry: - vector_store_name: "vertex-ai-litellm-website-knowledgebase" litellm_params: - vector_store_id: "litellm-docs_1761094140318" + vector_store_id: "my-datastore" custom_llm_provider: "vertex_ai/search_api" vertex_app_id: "test-litellm-app_1761094730750" vertex_project: "test-vector-store-db" @@ -120,3 +120,20 @@ for result in response.json().get("results", []): print(f"{data['title']}: {data['link']}") ``` +### Use with Chat Completion + +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "claude-3-5-sonnet", + "messages": [{"role": "user", "content": "What is litellm?"}], + "tools": [ + { + "type": "file_search", + "vector_store_ids": ["my-datastore"] + } + ] + }' +``` \ No newline at end of file diff --git a/docs/my-website/docs/projects/Agent Lightning.md b/docs/my-website/docs/projects/Agent Lightning.md new file mode 100644 index 000000000000..28e5546e398b --- /dev/null +++ b/docs/my-website/docs/projects/Agent Lightning.md @@ -0,0 +1,10 @@ + +# Agent Lightning + +[Agent Lightning](https://github.com/microsoft/agent-lightning) is Microsoft's open-source framework for training and optimizing AI agents with Reinforcement Learning, Automatic Prompt Optimization, and Supervised Fine-tuning — with almost zero code changes. + +It works with any agent framework including LangChain, OpenAI Agents SDK, AutoGen, and CrewAI. Agent Lightning uses LiteLLM Proxy under the hood to route LLM requests and collect traces that power its training algorithms. + +- [GitHub](https://github.com/microsoft/agent-lightning) +- [Docs](https://microsoft.github.io/agent-lightning/) +- [arXiv Paper](https://arxiv.org/abs/2508.03680) diff --git a/docs/my-website/docs/projects/Google ADK.md b/docs/my-website/docs/projects/Google ADK.md new file mode 100644 index 000000000000..25e910dcbad4 --- /dev/null +++ b/docs/my-website/docs/projects/Google ADK.md @@ -0,0 +1,21 @@ + +# Google ADK (Agent Development Kit) + +[Google ADK](https://github.com/google/adk-python) is an open-source, code-first Python framework for building, evaluating, and deploying sophisticated AI agents. While optimized for Gemini, ADK is model-agnostic and supports LiteLLM for using 100+ providers. + +```python +from google.adk.agents.llm_agent import Agent +from google.adk.models.lite_llm import LiteLlm + +root_agent = Agent( + model=LiteLlm(model="openai/gpt-4o"), # Or any LiteLLM-supported model + name="my_agent", + description="An agent using LiteLLM", + instruction="You are a helpful assistant.", + tools=[your_tools], +) +``` + +- [GitHub](https://github.com/google/adk-python) +- [Documentation](https://google.github.io/adk-docs) +- [LiteLLM Samples](https://github.com/google/adk-python/tree/main/contributing/samples/hello_world_litellm) diff --git a/docs/my-website/docs/projects/GraphRAG.md b/docs/my-website/docs/projects/GraphRAG.md new file mode 100644 index 000000000000..6c5e3dea3343 --- /dev/null +++ b/docs/my-website/docs/projects/GraphRAG.md @@ -0,0 +1,8 @@ + +# Microsoft GraphRAG + +GraphRAG is a data pipeline and transformation suite that extracts meaningful, structured data from unstructured text using the power of LLMs. It uses a graph-based approach to RAG (Retrieval-Augmented Generation) that leverages knowledge graphs to improve reasoning over private datasets. 
+ +- [Github](https://github.com/microsoft/graphrag) +- [Docs](https://microsoft.github.io/graphrag/) +- [Paper](https://arxiv.org/pdf/2404.16130) diff --git a/docs/my-website/docs/projects/Harbor.md b/docs/my-website/docs/projects/Harbor.md new file mode 100644 index 000000000000..684dfa93720d --- /dev/null +++ b/docs/my-website/docs/projects/Harbor.md @@ -0,0 +1,24 @@ + +# Harbor + +[Harbor](https://github.com/laude-institute/harbor) is a framework from the creators of Terminal-Bench for evaluating and optimizing agents and language models. It uses LiteLLM to call 100+ LLM providers. + +```bash +# Install +pip install harbor + +# Run a benchmark with any LiteLLM-supported model +harbor run --dataset terminal-bench@2.0 \ + --agent claude-code \ + --model anthropic/claude-opus-4-1 \ + --n-concurrent 4 +``` + +Key features: +- Evaluate agents like Claude Code, OpenHands, Codex CLI +- Build and share benchmarks and environments +- Run experiments in parallel across cloud providers (Daytona, Modal) +- Generate rollouts for RL optimization + +- [GitHub](https://github.com/laude-institute/harbor) +- [Documentation](https://harborframework.com/docs) diff --git a/docs/my-website/docs/projects/Softgen.md b/docs/my-website/docs/projects/Softgen.md new file mode 100644 index 000000000000..2e5024a0770f --- /dev/null +++ b/docs/my-website/docs/projects/Softgen.md @@ -0,0 +1,7 @@ +# Softgen + +`Softgen` is an AI-powered platform that builds full-stack web apps from your plain instructions. +LiteLLM helps `Softgen` users to choose and use different LLMs. + +- [Softgen](https://softgen.ai) +- [Academy](hhttps://academy.softgen.ai) diff --git a/docs/my-website/docs/projects/mini-swe-agent.md b/docs/my-website/docs/projects/mini-swe-agent.md new file mode 100644 index 000000000000..525f541899bb --- /dev/null +++ b/docs/my-website/docs/projects/mini-swe-agent.md @@ -0,0 +1,17 @@ +# mini-swe-agent + +**mini-swe-agent** The 100 line AI agent that solves GitHub issues & more. + +Key features: +- Just 100 lines of Python - radically simple and hackable +- Uses bash only (no custom tools) for maximum flexibility +- Built on LiteLLM for model flexibility +- Comes with CLI and Python bindings +- Deployable anywhere: local, docker, podman, apptainer + +Perfect for researchers, developers who want readable tools, and engineers who need easy deployment. + +- [Website](https://mini-swe-agent.com/latest/) +- [GitHub](https://github.com/SWE-agent/mini-swe-agent) +- [Quick Start](https://mini-swe-agent.com/latest/quickstart/) +- [Documentation](https://mini-swe-agent.com/latest/) diff --git a/docs/my-website/docs/projects/openai-agents.md b/docs/my-website/docs/projects/openai-agents.md new file mode 100644 index 000000000000..95a2191b8831 --- /dev/null +++ b/docs/my-website/docs/projects/openai-agents.md @@ -0,0 +1,22 @@ + +# OpenAI Agents SDK + +The [OpenAI Agents SDK](https://github.com/openai/openai-agents-python) is a lightweight framework for building multi-agent workflows. +It includes an official LiteLLM extension that lets you use any of the 100+ supported providers (Anthropic, Gemini, Mistral, Bedrock, etc.) 
+ +```python +from agents import Agent, Runner +from agents.extensions.models.litellm_model import LitellmModel + +agent = Agent( + name="Assistant", + instructions="You are a helpful assistant.", + model=LitellmModel(model="provider/model-name") +) + +result = Runner.run_sync(agent, "your_prompt_here") +print("Result:", result.final_output) +``` + +- [GitHub](https://github.com/openai/openai-agents-python) +- [LiteLLM Extension Docs](https://openai.github.io/openai-agents-python/ref/extensions/litellm/) diff --git a/docs/my-website/docs/provider_registration/add_model_pricing.md b/docs/my-website/docs/provider_registration/add_model_pricing.md new file mode 100644 index 000000000000..ebf35c42e329 --- /dev/null +++ b/docs/my-website/docs/provider_registration/add_model_pricing.md @@ -0,0 +1,124 @@ +--- +title: "Add Model Pricing & Context Window" +--- + +To add pricing or context window information for a model, simply make a PR to this file: + +**[model_prices_and_context_window.json](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)** + +### Sample Spec + +Here's the full specification with all available fields: + +```json +{ + "sample_spec": { + "code_interpreter_cost_per_session": 0.0, + "computer_use_input_cost_per_1k_tokens": 0.0, + "computer_use_output_cost_per_1k_tokens": 0.0, + "deprecation_date": "date when the model becomes deprecated in the format YYYY-MM-DD", + "file_search_cost_per_1k_calls": 0.0, + "file_search_cost_per_gb_per_day": 0.0, + "input_cost_per_audio_token": 0.0, + "input_cost_per_token": 0.0, + "litellm_provider": "one of https://docs.litellm.ai/docs/providers", + "max_input_tokens": "max input tokens, if the provider specifies it. if not default to max_tokens", + "max_output_tokens": "max output tokens, if the provider specifies it. if not default to max_tokens", + "max_tokens": "LEGACY parameter. set to max_output_tokens if provider specifies it. 
IF not set to max_input_tokens, if provider specifies it.", + "mode": "one of: chat, embedding, completion, image_generation, audio_transcription, audio_speech, image_generation, moderation, rerank, search", + "output_cost_per_reasoning_token": 0.0, + "output_cost_per_token": 0.0, + "search_context_cost_per_query": { + "search_context_size_high": 0.0, + "search_context_size_low": 0.0, + "search_context_size_medium": 0.0 + }, + "supported_regions": [ + "global", + "us-west-2", + "eu-west-1", + "ap-southeast-1", + "ap-northeast-1" + ], + "supports_audio_input": true, + "supports_audio_output": true, + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_prompt_caching": true, + "supports_reasoning": true, + "supports_response_schema": true, + "supports_system_messages": true, + "supports_vision": true, + "supports_web_search": true, + "vector_store_cost_per_gb_per_day": 0.0 + } +} +``` + +### Examples + +#### Anthropic Claude + +```json +{ + "claude-3-5-haiku-20241022": { + "cache_creation_input_token_cost": 1e-06, + "cache_creation_input_token_cost_above_1hr": 6e-06, + "cache_read_input_token_cost": 8e-08, + "deprecation_date": "2025-10-01", + "input_cost_per_token": 8e-07, + "litellm_provider": "anthropic", + "max_input_tokens": 200000, + "max_output_tokens": 8192, + "max_tokens": 8192, + "mode": "chat", + "output_cost_per_token": 4e-06, + "search_context_cost_per_query": { + "search_context_size_high": 0.01, + "search_context_size_low": 0.01, + "search_context_size_medium": 0.01 + }, + "supports_assistant_prefill": true, + "supports_function_calling": true, + "supports_pdf_input": true, + "supports_prompt_caching": true, + "supports_vision": true + } +} +``` + +#### Vertex AI Gemini + +```json +{ + "vertex_ai/gemini-3-pro-preview": { + "cache_read_input_token_cost": 2e-07, + "cache_read_input_token_cost_above_200k_tokens": 4e-07, + "cache_creation_input_token_cost_above_200k_tokens": 2.5e-07, + "input_cost_per_token": 2e-06, + "input_cost_per_token_above_200k_tokens": 4e-06, + "input_cost_per_token_batches": 1e-06, + "litellm_provider": "vertex_ai", + "max_audio_length_hours": 8.4, + "max_audio_per_prompt": 1, + "max_images_per_prompt": 3000, + "max_input_tokens": 1048576, + "max_output_tokens": 65535, + "max_pdf_size_mb": 30, + "max_tokens": 65535, + "max_video_length": 1, + "max_videos_per_prompt": 10, + "mode": "chat", + "output_cost_per_token": 1.2e-05, + "output_cost_per_token_above_200k_tokens": 1.8e-05, + "output_cost_per_token_batches": 6e-06, + "supports_function_calling": true, + "supports_parallel_function_calling": true, + "supports_prompt_caching": true, + "supports_system_messages": true, + "supports_vision": true + } +} +``` + +That's it! Your PR will be reviewed and merged. diff --git a/docs/my-website/docs/provider_registration/index.md b/docs/my-website/docs/provider_registration/index.md index 66f61554783c..60570dee7b71 100644 --- a/docs/my-website/docs/provider_registration/index.md +++ b/docs/my-website/docs/provider_registration/index.md @@ -2,6 +2,12 @@ title: "Integrate as a Model Provider" --- +## Quick Start for OpenAI-Compatible Providers + +If your API is OpenAI-compatible, you can add support by editing a single JSON file. See [Adding OpenAI-Compatible Providers](/docs/contributing/adding_openai_compatible_providers) for the simple approach. + +--- + This guide focuses on how to setup the classes and configuration necessary to act as a chat provider. 
Please see this guide first and look at the existing code in the codebase to understand how to act as a different provider, e.g. handling embeddings or image-generation. diff --git a/docs/my-website/docs/providers/abliteration.md b/docs/my-website/docs/providers/abliteration.md new file mode 100644 index 000000000000..a0fc7f393103 --- /dev/null +++ b/docs/my-website/docs/providers/abliteration.md @@ -0,0 +1,109 @@ +# Abliteration + +## Overview + +| Property | Details | +|-------|-------| +| Description | Abliteration provides an OpenAI-compatible `/chat/completions` endpoint. | +| Provider Route on LiteLLM | `abliteration/` | +| Link to Provider Doc | [Abliteration](https://abliteration.ai) | +| Base URL | `https://api.abliteration.ai/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+ +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["ABLITERATION_API_KEY"] = "" # your Abliteration API key +``` + +## Sample Usage + +```python showLineNumbers title="Abliteration Completion" +import os +from litellm import completion + +os.environ["ABLITERATION_API_KEY"] = "" + +response = completion( + model="abliteration/abliterated-model", + messages=[{"role": "user", "content": "Hello from LiteLLM"}], +) + +print(response) +``` + +## Sample Usage - Streaming + +```python showLineNumbers title="Abliteration Streaming Completion" +import os +from litellm import completion + +os.environ["ABLITERATION_API_KEY"] = "" + +response = completion( + model="abliteration/abliterated-model", + messages=[{"role": "user", "content": "Stream a short reply"}], + stream=True, +) + +for chunk in response: + print(chunk) +``` + +## Usage with LiteLLM Proxy Server + +1. Add the model to your proxy config: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: abliteration-chat + litellm_params: + model: abliteration/abliterated-model + api_key: os.environ/ABLITERATION_API_KEY +``` + +2. Start the proxy: + +```bash +litellm --config /path/to/config.yaml +``` + +## Direct API Usage (Bearer Token) + +Use the environment variable as a Bearer token against the OpenAI-compatible endpoint: +`https://api.abliteration.ai/v1/chat/completions`. + +```bash showLineNumbers title="cURL" +export ABLITERATION_API_KEY="" +curl https://api.abliteration.ai/v1/chat/completions \ + -H "Authorization: Bearer ${ABLITERATION_API_KEY}" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "abliterated-model", + "messages": [{"role": "user", "content": "Hello from Abliteration"}] + }' +``` + +```python showLineNumbers title="Python (requests)" +import os +import requests + +api_key = os.environ["ABLITERATION_API_KEY"] + +response = requests.post( + "https://api.abliteration.ai/v1/chat/completions", + headers={ + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + }, + json={ + "model": "abliterated-model", + "messages": [{"role": "user", "content": "Hello from Abliteration"}], + }, + timeout=60, +) + +print(response.json()) +``` diff --git a/docs/my-website/docs/providers/amazon_nova.md b/docs/my-website/docs/providers/amazon_nova.md new file mode 100644 index 000000000000..509127036df5 --- /dev/null +++ b/docs/my-website/docs/providers/amazon_nova.md @@ -0,0 +1,291 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Amazon Nova + +| Property | Details | +|-------|-------| +| Description | Amazon Nova is a family of foundation models built by Amazon that deliver frontier intelligence and industry-leading price performance. | +| Provider Route on LiteLLM | `amazon_nova/` | +| Provider Doc | [Amazon Nova ↗](https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html) | +| Supported OpenAI Endpoints | `/chat/completions`, `v1/responses` | +| Other Supported Endpoints | `v1/messages`, `/generateContent` | + +## Authentication + +Amazon Nova uses API key authentication. You can obtain your API key from the [Amazon Nova developer console ↗](https://nova.amazon.com/dev/documentation). 
+ +```bash +export AMAZON_NOVA_API_KEY="your-api-key" +``` + +## Usage + + + + +```python +import os +from litellm import completion + +# Set your API key +os.environ["AMAZON_NOVA_API_KEY"] = "your-api-key" + +response = completion( + model="amazon_nova/nova-micro-v1", + messages=[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello, how are you?"} + ] +) + +print(response) +``` + + + + +### 1. Setup config.yaml + +```yaml +model_list: + - model_name: amazon-nova-micro + litellm_params: + model: amazon_nova/nova-micro-v1 + api_key: os.environ/AMAZON_NOVA_API_KEY +``` +### 2. Start the proxy +```bash +litellm --config /path/to/config.yaml +``` + +### 3. Test it + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "amazon-nova-micro", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ] +}' +``` + + + + +## Supported Models + +| Model Name | Usage | Context Window | +|------------|-------|----------------| +| Nova Micro | `completion(model="amazon_nova/nova-micro-v1", messages=messages)` | 128K tokens | +| Nova Lite | `completion(model="amazon_nova/nova-lite-v1", messages=messages)` | 300K tokens | +| Nova Pro | `completion(model="amazon_nova/nova-pro-v1", messages=messages)` | 300K tokens | +| Nova Premier | `completion(model="amazon_nova/nova-premier-v1", messages=messages)` | 1M tokens | + +## Usage - Streaming + + + + +```python +import os +from litellm import completion + +os.environ["AMAZON_NOVA_API_KEY"] = "your-api-key" + +response = completion( + model="amazon_nova/nova-micro-v1", + messages=[ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Tell me about machine learning"} + ], + stream=True +) + +for chunk in response: + print(chunk.choices[0].delta.content or "", end="") +``` + + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "amazon-nova-micro", + "messages": [ + { + "role": "user", + "content": "Tell me about machine learning" + } + ], + "stream": true +}' +``` + + + + +## Usage - Function Calling / Tool Usage + + + + +```python +import os +from litellm import completion + +os.environ["AMAZON_NOVA_API_KEY"] = "your-api-key" + +tools = [ + { + "type": "function", + "function": { + "name": "getCurrentWeather", + "description": "Get the current weather in a given city", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. San Francisco, CA" + } + }, + "required": ["location"] + } + } + } +] + +response = completion( + model="amazon_nova/nova-micro-v1", + messages=[ + {"role": "user", "content": "What's the weather like in San Francisco?"} + ], + tools=tools +) + +print(response) +``` + + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "amazon-nova-micro", + "messages": [ + { + "role": "user", + "content": "What'\''s the weather like in San Francisco?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "getCurrentWeather", + "description": "Get the current weather in a given city", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "City and country e.g. 
San Francisco, CA" + } + }, + "required": ["location"] + } + } + } + ] +}' +``` + + + + +## Set temperature, top_p, etc. + + + + +```python +import os +from litellm import completion + +os.environ["AMAZON_NOVA_API_KEY"] = "your-api-key" + +response = completion( + model="amazon_nova/nova-pro-v1", + messages=[ + {"role": "user", "content": "Write a creative story"} + ], + temperature=0.8, + max_tokens=500, + top_p=0.9 +) + +print(response) +``` + + + + +**Set on yaml** + +```yaml +model_list: + - model_name: amazon-nova-pro + litellm_params: + model: amazon_nova/nova-pro-v1 + temperature: 0.8 + max_tokens: 500 + top_p: 0.9 +``` +**Set on request** +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "amazon-nova-pro", + "messages": [ + { + "role": "user", + "content": "Write a creative story" + } + ], + "temperature": 0.8, + "max_tokens": 500, + "top_p": 0.9 +}' +``` + + + + +## Model Comparison + +| Model | Best For | Speed | Cost | Context | +|-------|----------|-------|------|---------| +| **Nova Micro** | Simple tasks, high throughput | Fastest | Lowest | 128K | +| **Nova Lite** | Balanced performance | Fast | Low | 300K | +| **Nova Pro** | Complex reasoning | Medium | Medium | 300K | +| **Nova Premier** | Most advanced tasks | Slower | Higher | 1M | + +## Error Handling + +Common error codes and their meanings: + +- `401 Unauthorized`: Invalid API key +- `429 Too Many Requests`: Rate limit exceeded +- `400 Bad Request`: Invalid request format +- `500 Internal Server Error`: Service temporarily unavailable \ No newline at end of file diff --git a/docs/my-website/docs/providers/anthropic.md b/docs/my-website/docs/providers/anthropic.md index 1663d32ddfcd..446d663c5ac9 100644 --- a/docs/my-website/docs/providers/anthropic.md +++ b/docs/my-website/docs/providers/anthropic.md @@ -5,6 +5,7 @@ import TabItem from '@theme/TabItem'; LiteLLM supports all anthropic models. - `claude-sonnet-4-5-20250929` +- `claude-opus-4-5-20251101` - `claude-opus-4-1-20250805` - `claude-4` (`claude-opus-4-20250514`, `claude-sonnet-4-20250514`) - `claude-3.7` (`claude-3-7-sonnet-20250219`) @@ -17,11 +18,11 @@ LiteLLM supports all anthropic models. | Property | Details | |-------|-------| -| Description | Claude is a highly performant, trustworthy, and intelligent AI platform built by Anthropic. Claude excels at tasks involving language, reasoning, analysis, coding, and more. | -| Provider Route on LiteLLM | `anthropic/` (add this prefix to the model name, to route any requests to Anthropic - e.g. `anthropic/claude-3-5-sonnet-20240620`) | -| Provider Doc | [Anthropic ↗](https://docs.anthropic.com/en/docs/build-with-claude/overview) | -| API Endpoint for Provider | https://api.anthropic.com | -| Supported Endpoints | `/chat/completions` | +| Description | Claude is a highly performant, trustworthy, and intelligent AI platform built by Anthropic. Claude excels at tasks involving language, reasoning, analysis, coding, and more. Also available via Azure Foundry. | +| Provider Route on LiteLLM | `anthropic/` (add this prefix to the model name, to route any requests to Anthropic - e.g. `anthropic/claude-3-5-sonnet-20240620`). 
For Azure Foundry deployments, use `azure/claude-*` (see [Azure Anthropic documentation](../providers/azure/azure_anthropic)) | +| Provider Doc | [Anthropic ↗](https://docs.anthropic.com/en/docs/build-with-claude/overview), [Azure Foundry Claude ↗](https://learn.microsoft.com/en-us/azure/ai-services/foundry-models/claude) | +| API Endpoint for Provider | https://api.anthropic.com (or Azure Foundry endpoint: `https://.services.ai.azure.com/anthropic`) | +| Supported Endpoints | `/chat/completions`, `/v1/messages` (passthrough) | ## Supported OpenAI Parameters @@ -40,15 +41,120 @@ Check this in code, [here](../completion/input.md#translated-openai-params) "extra_headers", "parallel_tool_calls", "response_format", -"user" +"user", +"reasoning_effort", ``` :::info -Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed. +**Notes:** +- Anthropic API fails requests when `max_tokens` are not passed. Due to this litellm passes `max_tokens=4096` when no `max_tokens` are passed. +- `response_format` is fully supported for Claude Sonnet 4.5 and Opus 4.1 models (see [Structured Outputs](#structured-outputs) section) +- `reasoning_effort` is automatically mapped to `output_config={"effort": ...}` for Claude Opus 4.5 models (see [Effort Parameter](./anthropic_effort.md)) ::: +## **Structured Outputs** + +LiteLLM supports Anthropic's [structured outputs feature](https://platform.claude.com/docs/en/build-with-claude/structured-outputs) for Claude Sonnet 4.5 and Opus 4.1 models. When you use `response_format` with these models, LiteLLM automatically: +- Adds the required `structured-outputs-2025-11-13` beta header +- Transforms OpenAI's `response_format` to Anthropic's `output_format` format + +### Supported Models +- `sonnet-4-5` or `sonnet-4.5` (all Sonnet 4.5 variants) +- `opus-4-1` or `opus-4.1` (all Opus 4.1 variants) + - `opus-4-5` or `opus-4.5` (all Opus 4.5 variants) + +### Example Usage + + + + +```python +from litellm import completion + +response = completion( + model="claude-sonnet-4-5-20250929", + messages=[{"role": "user", "content": "What is the capital of France?"}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "capital_response", + "strict": True, + "schema": { + "type": "object", + "properties": { + "country": {"type": "string"}, + "capital": {"type": "string"} + }, + "required": ["country", "capital"], + "additionalProperties": False + } + } + } +) + +print(response.choices[0].message.content) +# Output: {"country": "France", "capital": "Paris"} +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-sonnet-4-5 + litellm_params: + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "claude-sonnet-4-5", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "capital_response", + "strict": true, + "schema": { + "type": "object", + "properties": { + "country": {"type": "string"}, + "capital": {"type": "string"} + }, + "required": ["country", "capital"], + "additionalProperties": false + } + } + } + }' +``` + + + + +:::info +When using structured outputs with supported models, LiteLLM automatically: +- Converts OpenAI's `response_format` to Anthropic's `output_schema` +- Adds the `anthropic-beta: structured-outputs-2025-11-13` header +- Creates a tool with the schema and forces the model to use it +::: + ## API Keys ```python @@ -59,6 +165,22 @@ os.environ["ANTHROPIC_API_KEY"] = "your-api-key" # os.environ["LITELLM_ANTHROPIC_DISABLE_URL_SUFFIX"] = "true" # [OPTIONAL] Disable automatic URL suffix appending ``` +:::tip Azure Foundry Support + +Claude models are also available via Microsoft Azure Foundry. Use the `azure/` prefix instead of `anthropic/` and configure Azure authentication. See the [Azure Anthropic documentation](../providers/azure/azure_anthropic) for details. + +Example: +```python +response = completion( + model="azure/claude-sonnet-4-5", + api_base="https://.services.ai.azure.com/anthropic", + api_key="your-azure-api-key", + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +::: + ### Custom API Base When using a custom API base for Anthropic (e.g., a proxy or custom endpoint), LiteLLM automatically appends the appropriate suffix (`/v1/messages` or `/v1/complete`) to your base URL. @@ -79,6 +201,30 @@ Without `LITELLM_ANTHROPIC_DISABLE_URL_SUFFIX`: With `LITELLM_ANTHROPIC_DISABLE_URL_SUFFIX=true`: - Base URL `https://my-proxy.com/custom/path` → `https://my-proxy.com/custom/path` (unchanged) +### Azure AI Foundry (Alternative Method) + +:::tip Recommended Method +For full Azure support including Azure AD authentication, use the dedicated [Azure Anthropic provider](./azure/azure_anthropic) with `azure_ai/` prefix. +::: + +As an alternative, you can use the `anthropic/` provider directly with your Azure endpoint since Azure exposes Claude using Anthropic's native API. + +```python +from litellm import completion + +response = completion( + model="anthropic/claude-sonnet-4-5", + api_base="https://.services.ai.azure.com/anthropic", + api_key="", + messages=[{"role": "user", "content": "Hello!"}], +) +print(response) +``` + +:::info +**Finding your Azure endpoint:** Go to Azure AI Foundry → Your deployment → Overview. Your base URL will be `https://.services.ai.azure.com/anthropic` +::: + ## Usage ```python @@ -298,7 +444,7 @@ Here's what a sample Raw Request from LiteLLM for Anthropic Context Caching look POST Request Sent from LiteLLM: curl -X POST \ https://api.anthropic.com/v1/messages \ --H 'accept: application/json' -H 'anthropic-version: 2023-06-01' -H 'content-type: application/json' -H 'x-api-key: sk-...' -H 'anthropic-beta: prompt-caching-2024-07-31' \ +-H 'accept: application/json' -H 'anthropic-version: 2023-06-01' -H 'content-type: application/json' -H 'x-api-key: sk-...' 
\ -d '{'model': 'claude-3-5-sonnet-20240620', [ { "role": "user", @@ -326,6 +472,8 @@ https://api.anthropic.com/v1/messages \ "max_tokens": 10 }' ``` + +**Note:** Anthropic no longer requires the `anthropic-beta: prompt-caching-2024-07-31` header. Prompt caching now works automatically when you use `cache_control` in your messages. ::: ### Caching - Large Context Caching @@ -953,7 +1101,31 @@ except Exception as e: s/o @[Shekhar Patnaik](https://www.linkedin.com/in/patnaikshekhar) for requesting this! -### Anthropic Hosted Tools (Computer, Text Editor, Web Search) +### Context Management (Beta) + +Anthropic’s [context editing](https://docs.claude.com/en/docs/build-with-claude/context-editing) API lets you automatically clear older tool results or thinking blocks. LiteLLM now forwards the native `context_management` payload when you call Anthropic models, and automatically attaches the required `context-management-2025-06-27` beta header. + +```python +from litellm import completion + +response = completion( + model="anthropic/claude-sonnet-4-20250514", + messages=[{"role": "user", "content": "Summarize the latest tool results"}], + context_management={ + "edits": [ + { + "type": "clear_tool_uses_20250919", + "trigger": {"type": "input_tokens", "value": 30000}, + "keep": {"type": "tool_uses", "value": 3}, + "clear_at_least": {"type": "input_tokens", "value": 5000}, + "exclude_tools": ["web_search"], + } + ] + }, +) +``` + +### Anthropic Hosted Tools (Computer, Text Editor, Web Search, Memory) @@ -1183,6 +1355,72 @@ curl http://0.0.0.0:4000/v1/chat/completions \ + + + +:::info +The Anthropic Memory tool is currently in beta. +::: + + + + +```python +from litellm import completion + +tools = [{ + "type": "memory_20250818", + "name": "memory" +}] + +model = "claude-sonnet-4-5-20250929" +messages = [{"role": "user", "content": "Please remember that my favorite color is blue."}] + +response = completion( + model=model, + messages=messages, + tools=tools, +) + +print(response) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: claude-memory-model + litellm_params: + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "claude-memory-model", + "messages": [{"role": "user", "content": "Please remember that my favorite color is blue."}], + "tools": [{"type": "memory_20250818", "name": "memory"}] + }' +``` + + + + + @@ -1454,9 +1692,9 @@ Assistant: ``` -## Usage - PDF +## Usage - PDF -Pass base64 encoded PDF files to Anthropic models using the `image_url` field. +Pass base64 encoded PDF files to Anthropic models using the `file` content type with a `file_data` field. @@ -1700,3 +1938,87 @@ curl http://0.0.0.0:4000/v1/chat/completions \ + +## Usage - Agent Skills + +LiteLLM supports using Agent Skills with the API + + + + +```python +response = completion( + model="claude-sonnet-4-5-20250929", + messages=messages, + tools= [ + { + "type": "code_execution_20250825", + "name": "code_execution" + } + ], + container= { + "skills": [ + { + "type": "anthropic", + "skill_id": "pptx", + "version": "latest" + } + ] + } +) +``` + + + +1. 
Setup config.yaml + +```yaml +model_list: + - model_name: claude-sonnet-4-5-20250929 + litellm_params: + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +2. Start Proxy + +``` +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl --location 'http://localhost:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer ' \ +--data '{ + "model": "claude-sonnet-4-5-20250929", + "messages": [ + { + "role": "user", + "content": "Hi" + } + ], + "tools": [ + { + "type": "code_execution_20250825", + "name": "code_execution" + } + ], + "container": { + "skills": [ + { + "type": "anthropic", + "skill_id": "pptx", + "version": "latest" + } + ] + } +}' +``` + + + + +The container and its "id" will be present in "provider_specific_fields" in streaming/non-streaming response \ No newline at end of file diff --git a/docs/my-website/docs/providers/anthropic_effort.md b/docs/my-website/docs/providers/anthropic_effort.md new file mode 100644 index 000000000000..e4bfd50e6c2d --- /dev/null +++ b/docs/my-website/docs/providers/anthropic_effort.md @@ -0,0 +1,286 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Anthropic Effort Parameter + +Control how many tokens Claude uses when responding with the `effort` parameter, trading off between response thoroughness and token efficiency. + +## Overview + +The `effort` parameter allows you to control how eager Claude is about spending tokens when responding to requests. This gives you the ability to trade off between response thoroughness and token efficiency, all with a single model. + +**Note**: The effort parameter is currently in beta and only supported by Claude Opus 4.5. LiteLLM automatically adds the `effort-2025-11-24` beta header when: +- `reasoning_effort` parameter is provided (for Claude Opus 4.5 only) + +For Claude Opus 4.5, `reasoning_effort="medium"`—both are automatically mapped to the correct format. + +## How Effort Works + +By default, Claude uses maximum effort—spending as many tokens as needed for the best possible outcome. By lowering the effort level, you can instruct Claude to be more conservative with token usage, optimizing for speed and cost while accepting some reduction in capability. + +**Tip**: Setting `effort` to `"high"` produces exactly the same behavior as omitting the `effort` parameter entirely. + +The effort parameter affects **all tokens** in the response, including: +- Text responses and explanations +- Tool calls and function arguments +- Extended thinking (when enabled) + +This approach has two major advantages: +1. It doesn't require thinking to be enabled in order to use it. +2. It can affect all token spend including tool calls. For example, lower effort would mean Claude makes fewer tool calls. + +This gives a much greater degree of control over efficiency. + +## Effort Levels + +| Level | Description | Typical use case | +|-------|-------------|------------------| +| `high` | Maximum capability—Claude uses as many tokens as needed for the best possible outcome. Equivalent to not setting the parameter. | Complex reasoning, difficult coding problems, agentic tasks | +| `medium` | Balanced approach with moderate token savings. | Agentic tasks that require a balance of speed, cost, and performance | +| `low` | Most efficient—significant token savings with some capability reduction. 
| Simpler tasks that need the best speed and lowest costs, such as subagents | + +## Quick Start + +### Using LiteLLM SDK + + + + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{ + "role": "user", + "content": "Analyze the trade-offs between microservices and monolithic architectures" + }], + reasoning_effort="medium" # Automatically mapped to output_config for Opus 4.5 +) + +print(response.choices[0].message.content) +``` + + + + +```typescript +import Anthropic from "@anthropic-ai/sdk"; + +const client = new Anthropic({ + apiKey: process.env.ANTHROPIC_API_KEY, +}); + +const response = await client.messages.create({ + model: "claude-opus-4-5-20251101", + max_tokens: 4096, + messages: [{ + role: "user", + content: "Analyze the trade-offs between microservices and monolithic architectures" + }], + output_config: { + effort: "medium" + } +}); + +console.log(response.content[0].text); +``` + + + + +### Using LiteLLM Proxy + +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "anthropic/claude-opus-4-5-20251101", + "messages": [{ + "role": "user", + "content": "Analyze the trade-offs between microservices and monolithic architectures" + }], + "output_config": { + "effort": "medium" + } + }' +``` + +### Direct Anthropic API Call + +```bash +curl https://api.anthropic.com/v1/messages \ + --header "x-api-key: $ANTHROPIC_API_KEY" \ + --header "anthropic-version: 2023-06-01" \ + --header "anthropic-beta: effort-2025-11-24" \ + --header "content-type: application/json" \ + --data '{ + "model": "claude-opus-4-5-20251101", + "max_tokens": 4096, + "messages": [{ + "role": "user", + "content": "Analyze the trade-offs between microservices and monolithic architectures" + }], + "output_config": { + "effort": "medium" + } + }' +``` + +## Model Compatibility + +The effort parameter is currently only supported by: +- **Claude Opus 4.5** (`claude-opus-4-5-20251101`) + +## When Should I Adjust the Effort Parameter? + +- Use **high effort** (the default) when you need Claude's best work—complex reasoning, nuanced analysis, difficult coding problems, or any task where quality is the top priority. + +- Use **medium effort** as a balanced option when you want solid performance without the full token expenditure of high effort. + +- Use **low effort** when you're optimizing for speed (because Claude answers with fewer tokens) or cost—for example, simple classification tasks, quick lookups, or high-volume use cases where marginal quality improvements don't justify additional latency or spend. + +## Effort with Tool Use + +When using tools, the effort parameter affects both the explanations around tool calls and the tool calls themselves. 
Lower effort levels tend to: +- Combine multiple operations into fewer tool calls +- Make fewer tool calls +- Proceed directly to action + +Example with tools: + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{ + "role": "user", + "content": "Check the weather in multiple cities" + }], + tools=[{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + }], + output_config={ + "effort": "low" # Will make fewer tool calls + } +) +``` + +## Effort with Extended Thinking + +The effort parameter works seamlessly with extended thinking. When both are enabled, effort controls the token budget across all response types: + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{ + "role": "user", + "content": "Solve this complex problem" + }], + thinking={ + "type": "enabled", + "budget_tokens": 5000 + }, + output_config={ + "effort": "medium" # Affects both thinking and response tokens + } +) +``` + +## Best Practices + +1. **Start with the default (high)** for new tasks, then experiment with lower effort levels if you're looking to optimize costs. + +2. **Use medium effort for production agentic workflows** where you need a balance of quality and efficiency. + +3. **Reserve low effort for high-volume, simple tasks** like classification, routing, or data extraction where speed matters more than nuanced responses. + +4. **Monitor token usage** to understand the actual savings from different effort levels for your specific use cases. + +5. **Test with your specific prompts** as the impact of effort levels can vary based on task complexity. + +## Provider Support + +The effort parameter is supported across all Anthropic-compatible providers: + +- **Standard Anthropic API**: ✅ Supported (Claude Opus 4.5) +- **Azure Anthropic / Microsoft Foundry**: ✅ Supported (Claude Opus 4.5) +- **Amazon Bedrock**: ✅ Supported (Claude Opus 4.5) +- **Google Cloud Vertex AI**: ✅ Supported (Claude Opus 4.5) + +LiteLLM automatically handles: +- Beta header injection (`effort-2025-11-24`) for all providers +- Parameter mapping: `reasoning_effort` → `output_config={"effort": ...}` for Claude Opus 4.5 + +## Usage and Pricing + +Token usage with different effort levels is tracked in the standard usage object. Lower effort levels result in fewer output tokens, which directly reduces costs: + +```python +response = litellm.completion( + model="anthropic/claude-opus-4-5-20251101", + messages=[{"role": "user", "content": "Analyze this"}], + output_config={"effort": "low"} +) + +print(f"Output tokens: {response.usage.completion_tokens}") +print(f"Total tokens: {response.usage.total_tokens}") +``` + +## Troubleshooting + +### Beta header not being added + +LiteLLM automatically adds the `effort-2025-11-24` beta header when: +- `reasoning_effort` parameter is provided (for Claude Opus 4.5 only) + +If you're not seeing the header: + +1. Ensure you're using `reasoning_effort` parameter +2. Verify the model is Claude Opus 4.5 +3. Check that LiteLLM version supports this feature + +### Invalid effort value error + +Only three values are accepted: `"high"`, `"medium"`, `"low"`. 
Any other value will raise a validation error: + +```python +# ❌ This will raise an error +output_config={"effort": "very_low"} + +# ✅ Use one of the valid values +output_config={"effort": "low"} +``` + +### Model not supported + +Currently, only Claude Opus 4.5 supports the effort parameter. Using it with other models may result in the parameter being ignored or an error. + +## Related Features + +- [Extended Thinking](/docs/providers/anthropic_extended_thinking) - Control Claude's reasoning process +- [Tool Use](/docs/providers/anthropic_tools) - Enable Claude to use tools and functions +- [Programmatic Tool Calling](/docs/providers/anthropic_programmatic_tool_calling) - Let Claude write code that calls tools +- [Prompt Caching](/docs/providers/anthropic_prompt_caching) - Cache prompts to reduce costs + +## Additional Resources + +- [Anthropic Effort Documentation](https://docs.anthropic.com/en/docs/build-with-claude/effort) +- [LiteLLM Anthropic Provider Guide](/docs/providers/anthropic) +- [Cost Optimization Best Practices](/docs/guides/cost_optimization) + diff --git a/docs/my-website/docs/providers/anthropic_programmatic_tool_calling.md b/docs/my-website/docs/providers/anthropic_programmatic_tool_calling.md new file mode 100644 index 000000000000..574dd7b0935e --- /dev/null +++ b/docs/my-website/docs/providers/anthropic_programmatic_tool_calling.md @@ -0,0 +1,435 @@ +# Anthropic Programmatic Tool Calling + +Programmatic tool calling allows Claude to write code that calls your tools programmatically within a code execution container, rather than requiring round trips through the model for each tool invocation. This reduces latency for multi-tool workflows and decreases token consumption by allowing Claude to filter or process data before it reaches the model's context window. + +:::info +Programmatic tool calling is currently in public beta. LiteLLM automatically detects tools with the `allowed_callers` field and adds the appropriate beta header based on your provider: + +- **Anthropic API & Microsoft Foundry**: `advanced-tool-use-2025-11-20` +- **Amazon Bedrock**: `advanced-tool-use-2025-11-20` +- **Google Cloud Vertex AI**: Not supported + +This feature requires the code execution tool to be enabled. +::: + +## Model Compatibility + +Programmatic tool calling is available on the following models: + +| Model | Tool Version | +|-------|--------------| +| Claude Opus 4.5 (`claude-opus-4-5-20251101`) | `code_execution_20250825` | +| Claude Sonnet 4.5 (`claude-sonnet-4-5-20250929`) | `code_execution_20250825` | + +## Quick Start + +Here's a simple example where Claude programmatically queries a database multiple times and aggregates results: + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[ + { + "role": "user", + "content": "Query sales data for the West, East, and Central regions, then tell me which region had the highest revenue" + } + ], + tools=[ + { + "type": "code_execution_20250825", + "name": "code_execution" + }, + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query against the sales database. 
Returns a list of rows as JSON objects.", + "parameters": { + "type": "object", + "properties": { + "sql": { + "type": "string", + "description": "SQL query to execute" + } + }, + "required": ["sql"] + } + }, + "allowed_callers": ["code_execution_20250825"] + } + ] +) + +print(response) +``` + +## How It Works + +When you configure a tool to be callable from code execution and Claude decides to use that tool: + +1. Claude writes Python code that invokes the tool as a function, potentially including multiple tool calls and pre/post-processing logic +2. Claude runs this code in a sandboxed container via code execution +3. When a tool function is called, code execution pauses and the API returns a `tool_use` block with a `caller` field +4. You provide the tool result, and code execution continues (intermediate results are not loaded into Claude's context window) +5. Once all code execution completes, Claude receives the final output and continues working on the task + +This approach is particularly useful for: + +- **Large data processing**: Filter or aggregate tool results before they reach Claude's context +- **Multi-step workflows**: Save tokens and latency by calling tools serially or in a loop without sampling Claude in-between tool calls +- **Conditional logic**: Make decisions based on intermediate tool results + +## The `allowed_callers` Field + +The `allowed_callers` field specifies which contexts can invoke a tool: + +```python +{ + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query against the database", + "parameters": {...} + }, + "allowed_callers": ["code_execution_20250825"] +} +``` + +**Possible values:** + +- `["direct"]` - Only Claude can call this tool directly (default if omitted) +- `["code_execution_20250825"]` - Only callable from within code execution +- `["direct", "code_execution_20250825"]` - Callable both directly and from code execution + +:::tip +We recommend choosing either `["direct"]` or `["code_execution_20250825"]` for each tool rather than enabling both, as this provides clearer guidance to Claude for how best to use the tool. +::: + +## The `caller` Field in Responses + +Every tool use block includes a `caller` field indicating how it was invoked: + +**Direct invocation (traditional tool use):** + +```python +{ + "type": "tool_use", + "id": "toolu_abc123", + "name": "query_database", + "input": {"sql": ""}, + "caller": {"type": "direct"} +} +``` + +**Programmatic invocation:** + +```python +{ + "type": "tool_use", + "id": "toolu_xyz789", + "name": "query_database", + "input": {"sql": ""}, + "caller": { + "type": "code_execution_20250825", + "tool_id": "srvtoolu_abc123" + } +} +``` + +The `tool_id` references the code execution tool that made the programmatic call. + +## Container Lifecycle + +Programmatic tool calling uses code execution containers: + +- **Container creation**: A new container is created for each session unless you reuse an existing one +- **Expiration**: Containers expire after approximately 4.5 minutes of inactivity (subject to change) +- **Container ID**: Pass the `container` parameter to reuse an existing container +- **Reuse**: Pass the container ID to maintain state across requests + +```python +# First request - creates a new container +response1 = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[{"role": "user", "content": "Query the database"}], + tools=[...] 
+) + +# Get container ID from response (if available in response metadata) +container_id = response1.get("container", {}).get("id") + +# Second request - reuse the same container +response2 = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[...], + tools=[...], + container=container_id # Reuse container +) +``` + +:::warning +When a tool is called programmatically and the container is waiting for your tool result, you must respond before the container expires. Monitor the `expires_at` field. If the container expires, Claude may treat the tool call as timed out and retry it. +::: + +## Example Workflow + +### Step 1: Initial Request + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[{ + "role": "user", + "content": "Query customer purchase history from the last quarter and identify our top 5 customers by revenue" + }], + tools=[ + { + "type": "code_execution_20250825", + "name": "code_execution" + }, + { + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query against the sales database. Returns a list of rows as JSON objects.", + "parameters": { + "type": "object", + "properties": { + "sql": {"type": "string", "description": "SQL query to execute"} + }, + "required": ["sql"] + } + }, + "allowed_callers": ["code_execution_20250825"] + } + ] +) +``` + +### Step 2: API Response with Tool Call + +Claude writes code that calls your tool. The response includes: + +```python +{ + "role": "assistant", + "content": [ + { + "type": "text", + "text": "I'll query the purchase history and analyze the results." + }, + { + "type": "server_tool_use", + "id": "srvtoolu_abc123", + "name": "code_execution", + "input": { + "code": "results = await query_database('')\ntop_customers = sorted(results, key=lambda x: x['revenue'], reverse=True)[:5]" + } + }, + { + "type": "tool_use", + "id": "toolu_def456", + "name": "query_database", + "input": {"sql": ""}, + "caller": { + "type": "code_execution_20250825", + "tool_id": "srvtoolu_abc123" + } + } + ], + "stop_reason": "tool_use" +} +``` + +### Step 3: Provide Tool Result + +```python +# Add assistant's response and tool result to conversation +messages = [ + {"role": "user", "content": "Query customer purchase history..."}, + { + "role": "assistant", + "content": response.choices[0].message.content, + "tool_calls": response.choices[0].message.tool_calls + }, + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "toolu_def456", + "content": '[{"customer_id": "C1", "revenue": 45000}, ...]' + } + ] + } +] + +# Continue the conversation +response2 = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=messages, + tools=[...] +) +``` + +### Step 4: Final Response + +Once code execution completes, Claude provides the final response: + +```python +{ + "content": [ + { + "type": "code_execution_tool_result", + "tool_use_id": "srvtoolu_abc123", + "content": { + "type": "code_execution_result", + "stdout": "Top 5 customers by revenue:\n1. Customer C1: $45,000\n...", + "stderr": "", + "return_code": 0 + } + }, + { + "type": "text", + "text": "I've analyzed the purchase history from last quarter. Your top 5 customers generated $167,500 in total revenue..." 
+ } + ], + "stop_reason": "end_turn" +} +``` + +## Advanced Patterns + +### Batch Processing with Loops + +Claude can write code that processes multiple items efficiently: + +```python +# Claude writes code like this: +regions = ["West", "East", "Central", "North", "South"] +results = {} +for region in regions: + data = await query_database(f"SELECT SUM(revenue) FROM sales WHERE region='{region}'") + results[region] = data[0]["total"] + +top_region = max(results.items(), key=lambda x: x[1]) +print(f"Top region: {top_region[0]} with ${top_region[1]:,}") +``` + +This pattern: +- Reduces model round-trips from N (one per region) to 1 +- Processes large result sets programmatically before returning to Claude +- Saves tokens by only returning aggregated conclusions + +### Early Termination + +Claude can stop processing as soon as success criteria are met: + +```python +endpoints = ["us-east", "eu-west", "apac"] +for endpoint in endpoints: + status = await check_health(endpoint) + if status == "healthy": + print(f"Found healthy endpoint: {endpoint}") + break # Stop early +``` + +### Data Filtering + +```python +logs = await fetch_logs(server_id) +errors = [log for log in logs if "ERROR" in log] +print(f"Found {len(errors)} errors") +for error in errors[-10:]: # Only return last 10 errors + print(error) +``` + +## Best Practices + +### Tool Design + +- **Provide detailed output descriptions**: Since Claude deserializes tool results in code, clearly document the format (JSON structure, field types, etc.) +- **Return structured data**: JSON or other easily parseable formats work best for programmatic processing +- **Keep responses concise**: Return only necessary data to minimize processing overhead + +### When to Use Programmatic Calling + +**Good use cases:** + +- Processing large datasets where you only need aggregates or summaries +- Multi-step workflows with 3+ dependent tool calls +- Operations requiring filtering, sorting, or transformation of tool results +- Tasks where intermediate data shouldn't influence Claude's reasoning +- Parallel operations across many items (e.g., checking 50 endpoints) + +**Less ideal use cases:** + +- Single tool calls with simple responses +- Tools that need immediate user feedback +- Very fast operations where code execution overhead would outweigh the benefit + +## Token Efficiency + +Programmatic tool calling can significantly reduce token consumption: + +- **Tool results from programmatic calls are not added to Claude's context** - only the final code output is +- **Intermediate processing happens in code** - filtering, aggregation, etc. don't consume model tokens +- **Multiple tool calls in one code execution** - reduces overhead compared to separate model turns + +For example, calling 10 tools directly uses ~10x the tokens of calling them programmatically and returning a summary. + +## Provider Support + +LiteLLM supports programmatic tool calling across the following Anthropic-compatible providers: + +- **Standard Anthropic API** (`anthropic/claude-sonnet-4-5-20250929`) ✅ +- **Azure Anthropic / Microsoft Foundry** (`azure/claude-sonnet-4-5-20250929`) ✅ +- **Amazon Bedrock** (`bedrock/invoke/anthropic.claude-sonnet-4-5-20250929-v1:0`) ✅ +- **Google Cloud Vertex AI** (`vertex_ai/claude-sonnet-4-5-20250929`) ❌ Not supported + +The beta header (`advanced-tool-use-2025-11-20`) is automatically added when LiteLLM detects tools with the `allowed_callers` field. 
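If you want to pin the beta header yourself (for example, while debugging header injection behind another gateway), you can pass it explicitly. A minimal sketch, assuming `litellm.completion` forwards `extra_headers` to Anthropic the same way the Messages API examples elsewhere in these docs do; the `check_health` tool is a hypothetical example:

```python
import litellm

response = litellm.completion(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "Check which of our endpoints are healthy"}],
    tools=[
        {"type": "code_execution_20250825", "name": "code_execution"},
        {
            "type": "function",
            "function": {
                "name": "check_health",  # hypothetical tool, shown for illustration
                "description": "Check the health of a service endpoint. Returns 'healthy' or 'unhealthy'.",
                "parameters": {
                    "type": "object",
                    "properties": {"endpoint": {"type": "string"}},
                    "required": ["endpoint"],
                },
            },
            "allowed_callers": ["code_execution_20250825"],
        },
    ],
    # Normally injected automatically when `allowed_callers` is detected;
    # included here only to make the required header explicit.
    extra_headers={"anthropic-beta": "advanced-tool-use-2025-11-20"},
)
```

When automatic injection is working (see Troubleshooting below), the explicit header should be redundant.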
+ +## Limitations + +### Feature Incompatibilities + +- **Structured outputs**: Tools with `strict: true` are not supported with programmatic calling +- **Tool choice**: You cannot force programmatic calling of a specific tool via `tool_choice` +- **Parallel tool use**: `disable_parallel_tool_use: true` is not supported with programmatic calling + +### Tool Restrictions + +The following tools cannot currently be called programmatically: + +- Web search +- Web fetch +- Tools provided by an MCP connector + +## Troubleshooting + +### Common Issues + +**"Tool not allowed" error** + +- Verify your tool definition includes `"allowed_callers": ["code_execution_20250825"]` +- Check that you're using a compatible model (Claude Sonnet 4.5 or Opus 4.5) + +**Container expiration** + +- Ensure you respond to tool calls within the container's lifetime (~4.5 minutes) +- Consider implementing faster tool execution + +**Beta header not added** + +- LiteLLM automatically adds the beta header when it detects `allowed_callers` +- If you're manually setting headers, ensure you include `advanced-tool-use-2025-11-20` + +## Related Features + +- [Anthropic Tool Search](./anthropic_tool_search.md) - Dynamically discover and load tools on-demand +- [Anthropic Provider](./anthropic.md) - General Anthropic provider documentation + diff --git a/docs/my-website/docs/providers/anthropic_tool_input_examples.md b/docs/my-website/docs/providers/anthropic_tool_input_examples.md new file mode 100644 index 000000000000..39f4d8555f48 --- /dev/null +++ b/docs/my-website/docs/providers/anthropic_tool_input_examples.md @@ -0,0 +1,445 @@ +# Anthropic Tool Input Examples + +Provide concrete examples of valid tool inputs to help Claude understand how to use your tools more effectively. This is particularly useful for complex tools with nested objects, optional parameters, or format-sensitive inputs. + +:::info +Tool input examples is a beta feature. LiteLLM automatically detects tools with the `input_examples` field and adds the appropriate beta header based on your provider: + +- **Anthropic API & Microsoft Foundry**: `advanced-tool-use-2025-11-20` +- **Amazon Bedrock**: `advanced-tool-use-2025-11-20` (Claude Opus 4.5 only) +- **Google Cloud Vertex AI**: Not supported + +You don't need to manually specify beta headers—LiteLLM handles this automatically. +::: + +## When to Use Input Examples + +Input examples are most helpful for: + +- **Complex nested objects**: Tools with deeply nested parameter structures +- **Optional parameters**: Showing when optional parameters should be included +- **Format-sensitive inputs**: Demonstrating expected formats (dates, addresses, etc.) +- **Enum values**: Illustrating valid enum choices in context +- **Edge cases**: Showing how to handle special cases + +:::tip +**Prioritize descriptions first!** Clear, detailed tool descriptions are more important than examples. Use `input_examples` as a supplement for complex tools where descriptions alone may not be sufficient. 
+::: + +## Quick Start + +Add an `input_examples` field to your tool definition with an array of example input objects: + +```python +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[ + {"role": "user", "content": "What's the weather like in San Francisco?"} + ], + tools=[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + }, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "description": "The unit of temperature" + } + }, + "required": ["location"] + } + }, + "input_examples": [ + { + "location": "San Francisco, CA", + "unit": "fahrenheit" + }, + { + "location": "Tokyo, Japan", + "unit": "celsius" + }, + { + "location": "New York, NY" # 'unit' is optional + } + ] + } + ] +) + +print(response) +``` + +## How It Works + +When you provide `input_examples`: + +1. **LiteLLM detects** the `input_examples` field in your tool definition +2. **Beta header added automatically**: The `advanced-tool-use-2025-11-20` header is injected +3. **Examples included in prompt**: Anthropic includes the examples alongside your tool schema +4. **Claude learns patterns**: The model uses examples to understand proper tool usage +5. **Better tool calls**: Claude makes more accurate tool calls with correct parameter formats + +## Example Formats + +### Simple Tool with Examples + +```python +{ + "type": "function", + "function": { + "name": "send_email", + "description": "Send an email to a recipient", + "parameters": { + "type": "object", + "properties": { + "to": {"type": "string", "description": "Email address"}, + "subject": {"type": "string"}, + "body": {"type": "string"} + }, + "required": ["to", "subject", "body"] + } + }, + "input_examples": [ + { + "to": "user@example.com", + "subject": "Meeting Reminder", + "body": "Don't forget our meeting tomorrow at 2 PM." + }, + { + "to": "team@company.com", + "subject": "Weekly Update", + "body": "Here's this week's progress report..." 
+ } + ] +} +``` + +### Complex Nested Objects + +```python +{ + "type": "function", + "function": { + "name": "create_calendar_event", + "description": "Create a new calendar event", + "parameters": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "start": { + "type": "object", + "properties": { + "date": {"type": "string"}, + "time": {"type": "string"} + } + }, + "attendees": { + "type": "array", + "items": { + "type": "object", + "properties": { + "email": {"type": "string"}, + "optional": {"type": "boolean"} + } + } + } + }, + "required": ["title", "start"] + } + }, + "input_examples": [ + { + "title": "Team Standup", + "start": { + "date": "2025-01-15", + "time": "09:00" + }, + "attendees": [ + {"email": "alice@example.com", "optional": False}, + {"email": "bob@example.com", "optional": True} + ] + }, + { + "title": "Lunch Break", + "start": { + "date": "2025-01-15", + "time": "12:00" + } + # No attendees - showing optional field + } + ] +} +``` + +### Format-Sensitive Parameters + +```python +{ + "type": "function", + "function": { + "name": "search_flights", + "description": "Search for available flights", + "parameters": { + "type": "object", + "properties": { + "origin": {"type": "string", "description": "Airport code"}, + "destination": {"type": "string", "description": "Airport code"}, + "date": {"type": "string", "description": "Date in YYYY-MM-DD format"}, + "passengers": {"type": "integer"} + }, + "required": ["origin", "destination", "date"] + } + }, + "input_examples": [ + { + "origin": "SFO", + "destination": "JFK", + "date": "2025-03-15", + "passengers": 2 + }, + { + "origin": "LAX", + "destination": "ORD", + "date": "2025-04-20", + "passengers": 1 + } + ] +} +``` + +## Requirements and Limitations + +### Schema Validation + +- Each example **must be valid** according to the tool's `input_schema` +- Invalid examples will return a **400 error** from Anthropic +- Validation happens server-side (LiteLLM passes examples through) + +### Server-Side Tools Not Supported + +Input examples are **only supported for user-defined tools**. The following server-side tools do NOT support `input_examples`: + +- `web_search` (web search tool) +- `code_execution` (code execution tool) +- `computer_use` (computer use tool) +- `bash_tool` (bash execution tool) +- `text_editor` (text editor tool) + +### Token Costs + +Examples add to your prompt tokens: + +- **Simple examples**: ~20-50 tokens per example +- **Complex nested objects**: ~100-200 tokens per example +- **Trade-off**: Higher token cost for better tool call accuracy + +### Model Compatibility + +Input examples work with all Claude models that support the `advanced-tool-use-2025-11-20` beta header: + +- Claude Opus 4.5 (`claude-opus-4-5-20251101`) +- Claude Sonnet 4.5 (`claude-sonnet-4-5-20250929`) +- Claude Opus 4.1 (`claude-opus-4-1-20250805`) + +:::note +On Google Cloud's Vertex AI and Amazon Bedrock, only Claude Opus 4.5 supports tool input examples. +::: + +## Best Practices + +### 1. Show Diverse Examples + +Include examples that demonstrate different use cases: + +```python +"input_examples": [ + {"location": "San Francisco, CA", "unit": "fahrenheit"}, # US city + {"location": "Tokyo, Japan", "unit": "celsius"}, # International + {"location": "New York, NY"} # Optional param omitted +] +``` + +### 2. 
Demonstrate Optional Parameters + +Show when optional parameters should and shouldn't be included: + +```python +"input_examples": [ + { + "query": "machine learning", + "filters": {"year": 2024, "category": "research"} # With optional filters + }, + { + "query": "artificial intelligence" # Without optional filters + } +] +``` + +### 3. Illustrate Format Requirements + +Make format expectations clear through examples: + +```python +"input_examples": [ + { + "phone": "+1-555-123-4567", # Shows expected phone format + "date": "2025-01-15", # Shows date format (YYYY-MM-DD) + "time": "14:30" # Shows time format (HH:MM) + } +] +``` + +### 4. Keep Examples Realistic + +Use realistic, production-like examples rather than placeholder data: + +```python +# ✅ Good - realistic examples +"input_examples": [ + {"email": "alice@company.com", "role": "admin"}, + {"email": "bob@company.com", "role": "user"} +] + +# ❌ Bad - placeholder examples +"input_examples": [ + {"email": "test@test.com", "role": "role1"}, + {"email": "example@example.com", "role": "role2"} +] +``` + +### 5. Limit Example Count + +Provide 2-5 examples per tool: + +- **Too few** (1): May not show enough variation +- **Just right** (2-5): Demonstrates patterns without bloating tokens +- **Too many** (10+): Wastes tokens, diminishing returns + +## Integration with Other Features + +Input examples work seamlessly with other Anthropic tool features: + +### With Tool Search + +```python +{ + "type": "function", + "function": { + "name": "query_database", + "description": "Execute a SQL query", + "parameters": {...} + }, + "defer_loading": True, # Tool search + "input_examples": [ # Input examples + {"sql": "SELECT * FROM users WHERE id = 1"} + ] +} +``` + +### With Programmatic Tool Calling + +```python +{ + "type": "function", + "function": { + "name": "fetch_data", + "description": "Fetch data from API", + "parameters": {...} + }, + "allowed_callers": ["code_execution_20250825"], # Programmatic calling + "input_examples": [ # Input examples + {"endpoint": "/api/users", "method": "GET"} + ] +} +``` + +### All Features Combined + +```python +{ + "type": "function", + "function": { + "name": "advanced_tool", + "description": "A complex tool", + "parameters": {...} + }, + "defer_loading": True, # Tool search + "allowed_callers": ["code_execution_20250825"], # Programmatic calling + "input_examples": [ # Input examples + {"param1": "value1", "param2": "value2"} + ] +} +``` + +## Provider Support + +LiteLLM supports input examples across the following Anthropic-compatible providers: + +- **Standard Anthropic API** (`anthropic/claude-sonnet-4-5-20250929`) ✅ +- **Azure Anthropic / Microsoft Foundry** (`azure/claude-sonnet-4-5-20250929`) ✅ +- **Amazon Bedrock** (`bedrock/invoke/anthropic.claude-opus-4-5-20251101-v1:0`) ✅ (Opus 4.5 only) +- **Google Cloud Vertex AI** (`vertex_ai/claude-sonnet-4-5-20250929`) ❌ Not supported + +The beta header (`advanced-tool-use-2025-11-20`) is automatically added when LiteLLM detects tools with the `input_examples` field. + +## Troubleshooting + +### "Invalid request" error with examples + +**Problem**: Receiving 400 error when using input examples + +**Solution**: Ensure each example is valid according to your `input_schema`: + +```python +# Check that: +# 1. All required fields are present in examples +# 2. Field types match the schema +# 3. Enum values are valid +# 4. 
Nested objects follow the schema structure +``` + +### Examples not improving tool calls + +**Problem**: Adding examples doesn't seem to help + +**Solution**: +1. **Check descriptions first**: Ensure tool descriptions are detailed and clear +2. **Review example quality**: Make sure examples are realistic and diverse +3. **Verify schema**: Confirm examples actually match your schema +4. **Add more variation**: Include examples showing different use cases + +### Token usage too high + +**Problem**: Input examples consuming too many tokens + +**Solution**: +1. **Reduce example count**: Use 2-3 examples instead of 5+ +2. **Simplify examples**: Remove unnecessary fields from examples +3. **Consider descriptions**: If descriptions are clear, examples may not be needed + +## When NOT to Use Input Examples + +Skip input examples if: + +- **Tool is simple**: Single parameter tools with clear descriptions +- **Schema is self-explanatory**: Well-structured schema with good descriptions +- **Token budget is tight**: Examples add 20-200 tokens each +- **Server-side tools**: web_search, code_execution, etc. don't support examples + +## Related Features + +- [Anthropic Tool Search](./anthropic_tool_search.md) - Dynamically discover and load tools on-demand +- [Anthropic Programmatic Tool Calling](./anthropic_programmatic_tool_calling.md) - Call tools from code execution +- [Anthropic Provider](./anthropic.md) - General Anthropic provider documentation + diff --git a/docs/my-website/docs/providers/anthropic_tool_search.md b/docs/my-website/docs/providers/anthropic_tool_search.md new file mode 100644 index 000000000000..203a2947ebcd --- /dev/null +++ b/docs/my-website/docs/providers/anthropic_tool_search.md @@ -0,0 +1,542 @@ +# Tool Search + +Tool search enables Claude to dynamically discover and load tools on-demand from large tool catalogs (10,000+ tools). Instead of loading all tool definitions into the context window upfront, Claude searches your tool catalog and loads only the tools it needs. + +## Supported Providers + +| Provider | Chat Completions API | Messages API | +|----------|---------------------|--------------| +| **Anthropic API** | ✅ | ✅ | +| **Azure Anthropic** (Microsoft Foundry) | ✅ | ✅ | +| **Google Cloud Vertex AI** | ✅ | ✅ | +| **Amazon Bedrock** | ✅ (Invoke API only, Opus 4.5 only) | ✅ (Invoke API only, Opus 4.5 only) | + + +## Benefits + +- **Context efficiency**: Avoid consuming massive portions of your context window with tool definitions +- **Better tool selection**: Claude's tool selection accuracy degrades with more than 30-50 tools. Tool search maintains accuracy even with thousands of tools +- **On-demand loading**: Tools are only loaded when Claude needs them + +## Tool Search Variants + +LiteLLM supports both tool search variants: + +### 1. Regex Tool Search (`tool_search_tool_regex_20251119`) + +Claude constructs regex patterns to search for tools. Best for exact pattern matching (faster). + +### 2. BM25 Tool Search (`tool_search_tool_bm25_20251119`) + +Claude uses natural language queries to search for tools using the BM25 algorithm. Best for natural language semantic search. + +**Note**: BM25 variant is not supported on Bedrock. 
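The provider-specific examples below show the full end-to-end flow. As a quick orientation, here is a minimal sketch of how a large catalog might be wired up, using the same tool format as those examples; `tool_catalog` is a hypothetical stand-in for your own tool definitions:

```python
import litellm

# Hypothetical catalog entries; in practice this list could hold thousands of tools.
tool_catalog = [
    {
        "name": "get_weather",
        "description": "Get the weather at a specific location",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
    {
        "name": "get_stock_price",
        "description": "Get the current stock price for a ticker symbol",
        "parameters": {
            "type": "object",
            "properties": {"ticker": {"type": "string"}},
            "required": ["ticker"],
        },
    },
]

# The tool search tool goes first; every catalog entry is marked for deferred loading.
tools = [{"type": "tool_search_tool_regex_20251119", "name": "tool_search_tool_regex"}]
tools += [
    {
        "type": "function",
        "function": {
            "name": spec["name"],
            "description": spec["description"],
            "parameters": spec["parameters"],
        },
        "defer_loading": True,  # only loaded if Claude's search selects it
    }
    for spec in tool_catalog
]

response = litellm.completion(
    model="anthropic/claude-sonnet-4-5-20250929",
    messages=[{"role": "user", "content": "What's AAPL trading at right now?"}],
    tools=tools,
)
print(response.choices[0].message)
```

Only the tool search tool and whatever tools Claude actually loads should count against the context window, which is what makes catalogs of this size practical.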
+ +--- + +## Chat Completions API + +### SDK Usage + +#### Basic Example with Regex Tool Search + +```python showLineNumbers title="Basic Tool Search Example" +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[ + {"role": "user", "content": "What is the weather in San Francisco?"} + ], + tools=[ + # Tool search tool (regex variant) + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + # Deferred tool - will be loaded on-demand + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the weather at a specific location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"}, + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["location"] + } + }, + "defer_loading": True # Mark for deferred loading + } + ] +) + +print(response.choices[0].message.content) +``` + +#### BM25 Tool Search Example + +```python showLineNumbers title="BM25 Tool Search" +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[ + {"role": "user", "content": "Search for Python files containing 'authentication'"} + ], + tools=[ + # Tool search tool (BM25 variant) + { + "type": "tool_search_tool_bm25_20251119", + "name": "tool_search_tool_bm25" + }, + # Deferred tools... + { + "type": "function", + "function": { + "name": "search_codebase", + "description": "Search through codebase files by content and filename", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"}, + "file_pattern": {"type": "string"} + }, + "required": ["query"] + } + }, + "defer_loading": True + } + ] +) +``` + +#### Azure Anthropic Example + +```python showLineNumbers title="Azure Anthropic Tool Search" +import litellm + +response = litellm.completion( + model="azure_anthropic/claude-sonnet-4-5", + api_base="https://.services.ai.azure.com/anthropic", + api_key="your-azure-api-key", + messages=[ + {"role": "user", "content": "What's the weather like?"} + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + }, + "defer_loading": True + } + ] +) +``` + +#### Vertex AI Example + +```python showLineNumbers title="Vertex AI Tool Search" +import litellm + +response = litellm.completion( + model="vertex_ai/claude-sonnet-4-5", + vertex_project="your-project-id", + vertex_location="us-central1", + messages=[ + {"role": "user", "content": "Search my documents"} + ], + tools=[ + { + "type": "tool_search_tool_bm25_20251119", + "name": "tool_search_tool_bm25" + }, + # Your deferred tools... 
+ ] +) +``` + +#### Streaming Support + +```python showLineNumbers title="Streaming with Tool Search" +import litellm + +response = litellm.completion( + model="anthropic/claude-sonnet-4-5-20250929", + messages=[ + {"role": "user", "content": "Get the weather"} + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather information", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + }, + "defer_loading": True + } + ], + stream=True +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + +### AI Gateway Usage + +Tool search works automatically through the LiteLLM proxy. + +#### Proxy Configuration + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: claude-sonnet + litellm_params: + model: anthropic/claude-sonnet-4-5-20250929 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +#### Client Request + +```python showLineNumbers title="Client Request via Proxy" +from anthropic import Anthropic + +client = Anthropic( + api_key="your-litellm-proxy-key", + base_url="http://0.0.0.0:4000" +) + +response = client.messages.create( + model="claude-sonnet", + max_tokens=1024, + messages=[ + {"role": "user", "content": "What's the weather?"} + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_weather", + "description": "Get weather information", + "input_schema": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + }, + "defer_loading": True + } + ] +) +``` + +--- + +## Messages API + +The Messages API provides native Anthropic-style tool search support via the `litellm.anthropic.messages` interface. + +### SDK Usage + +#### Basic Example + +```python showLineNumbers title="Messages API - Basic Tool Search" +import litellm + +response = await litellm.anthropic.messages.acreate( + model="anthropic/claude-sonnet-4-20250514", + messages=[ + { + "role": "user", + "content": "What's the weather in San Francisco?" + } + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_weather", + "description": "Get the current weather for a location", + "input_schema": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + } + }, + "required": ["location"] + }, + "defer_loading": True + } + ], + max_tokens=1024, + extra_headers={"anthropic-beta": "advanced-tool-use-2025-11-20"} +) + +print(response) +``` + +#### Azure Anthropic Messages Example + +```python showLineNumbers title="Azure Anthropic Messages API" +import litellm + +response = await litellm.anthropic.messages.acreate( + model="azure_anthropic/claude-sonnet-4-20250514", + messages=[ + { + "role": "user", + "content": "What's the stock price of Apple?" + } + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_stock_price", + "description": "Get the current stock price for a ticker symbol", + "input_schema": { + "type": "object", + "properties": { + "ticker": { + "type": "string", + "description": "The stock ticker symbol, e.g. 
AAPL" + } + }, + "required": ["ticker"] + }, + "defer_loading": True + } + ], + max_tokens=1024, + extra_headers={"anthropic-beta": "advanced-tool-use-2025-11-20"} +) +``` + +#### Vertex AI Messages Example + +```python showLineNumbers title="Vertex AI Messages API" +import litellm + +response = await litellm.anthropic.messages.acreate( + model="vertex_ai/claude-sonnet-4@20250514", + messages=[ + { + "role": "user", + "content": "Search the web for information about AI" + } + ], + tools=[ + { + "type": "tool_search_tool_bm25_20251119", + "name": "tool_search_tool_bm25" + }, + { + "name": "search_web", + "description": "Search the web for information", + "input_schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query" + } + }, + "required": ["query"] + }, + "defer_loading": True + } + ], + max_tokens=1024, + extra_headers={"anthropic-beta": "tool-search-tool-2025-10-19"} +) +``` + +#### Bedrock Messages Example + +```python showLineNumbers title="Bedrock Messages API (Invoke)" +import litellm + +response = await litellm.anthropic.messages.acreate( + model="bedrock/invoke/anthropic.claude-opus-4-20250514-v1:0", + messages=[ + { + "role": "user", + "content": "What's the weather?" + } + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_weather", + "description": "Get weather information", + "input_schema": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + }, + "defer_loading": True + } + ], + max_tokens=1024, + extra_headers={"anthropic-beta": "tool-search-tool-2025-10-19"} +) +``` + +#### Streaming Support + +```python showLineNumbers title="Messages API - Streaming" +import litellm +import json + +response = await litellm.anthropic.messages.acreate( + model="anthropic/claude-sonnet-4-20250514", + messages=[ + { + "role": "user", + "content": "What's the weather in Tokyo?" + } + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_weather", + "description": "Get weather information", + "input_schema": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + }, + "defer_loading": True + } + ], + max_tokens=1024, + stream=True, + extra_headers={"anthropic-beta": "advanced-tool-use-2025-11-20"} +) + +async for chunk in response: + if isinstance(chunk, bytes): + chunk_str = chunk.decode("utf-8") + for line in chunk_str.split("\n"): + if line.startswith("data: "): + try: + json_data = json.loads(line[6:]) + print(json_data) + except json.JSONDecodeError: + pass +``` + +### AI Gateway Usage + +Configure the proxy to use Messages API endpoints. + +#### Proxy Configuration + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: claude-sonnet-messages + litellm_params: + model: anthropic/claude-sonnet-4-20250514 + api_key: os.environ/ANTHROPIC_API_KEY +``` + +#### Client Request + +```python showLineNumbers title="Client Request via Proxy (Messages API)" +from anthropic import Anthropic + +client = Anthropic( + api_key="your-litellm-proxy-key", + base_url="http://0.0.0.0:4000" +) + +response = client.messages.create( + model="claude-sonnet-messages", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": "What's the weather?" 
+ } + ], + tools=[ + { + "type": "tool_search_tool_regex_20251119", + "name": "tool_search_tool_regex" + }, + { + "name": "get_weather", + "description": "Get weather information", + "input_schema": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + }, + "defer_loading": True + } + ], + extra_headers={"anthropic-beta": "advanced-tool-use-2025-11-20"} +) + +print(response) +``` + +--- + +## Additional Resources + +- [Anthropic Tool Search Documentation](https://docs.anthropic.com/en/docs/build-with-claude/tool-use/tool-search) +- [LiteLLM Tool Calling Guide](https://docs.litellm.ai/docs/completion/function_call) diff --git a/docs/my-website/docs/providers/apertis.md b/docs/my-website/docs/providers/apertis.md new file mode 100644 index 000000000000..967de8147e21 --- /dev/null +++ b/docs/my-website/docs/providers/apertis.md @@ -0,0 +1,129 @@ +# Apertis AI (Stima API) + +## Overview + +| Property | Details | +|-------|-------| +| Description | Apertis AI (formerly Stima API) is a unified API platform providing access to 430+ AI models through a single interface, with cost savings of up to 50%. | +| Provider Route on LiteLLM | `apertis/` | +| Link to Provider Doc | [Apertis AI Website ↗](https://api.stima.tech) | +| Base URL | `https://api.stima.tech/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+ +## What is Apertis AI? + +Apertis AI is a unified API platform that lets developers: +- **Access 430+ AI Models**: All models through a single API +- **Save 50% on Costs**: Competitive pricing with significant discounts +- **Unified Billing**: Single bill for all model usage +- **Quick Setup**: Start with just $2 registration +- **GitHub Integration**: Link with your GitHub account + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["STIMA_API_KEY"] = "" # your Apertis AI API key +``` + +Get your Apertis AI API key from [api.stima.tech](https://api.stima.tech). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Apertis AI Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["STIMA_API_KEY"] = "" # your Apertis AI API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# Apertis AI call +response = completion( + model="apertis/model-name", # Replace with actual model name + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Apertis AI Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["STIMA_API_KEY"] = "" # your Apertis AI API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# Apertis AI call with streaming +response = completion( + model="apertis/model-name", # Replace with actual model name + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export STIMA_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: apertis-model + litellm_params: + model: apertis/model-name # Replace with actual model name + api_key: os.environ/STIMA_API_KEY +``` + +## Supported OpenAI Parameters + +Apertis AI supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID from 430+ available models | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. 
Control tool/function calling | + +## Cost Benefits + +Apertis AI offers significant cost advantages: +- **50% Cost Savings**: Save money compared to direct provider costs +- **Unified Billing**: Single invoice for all your AI model usage +- **Low Entry**: Start with just $2 registration + +## Model Availability + +With access to 430+ AI models, Apertis AI provides: +- Multiple providers through one API +- Latest model releases +- Various model types (text, image, video) + +## Additional Resources + +- [Apertis AI Website](https://api.stima.tech) +- [Apertis AI Enterprise](https://api.stima.tech/enterprise) diff --git a/docs/my-website/docs/providers/aws_polly.md b/docs/my-website/docs/providers/aws_polly.md new file mode 100644 index 000000000000..21b0fa679bf1 --- /dev/null +++ b/docs/my-website/docs/providers/aws_polly.md @@ -0,0 +1,364 @@ +# AWS Polly Text to Speech (tts) + +## Overview + +| Property | Details | +|-------|-------| +| Description | Convert text to natural-sounding speech using AWS Polly's neural and standard TTS engines | +| Provider Route on LiteLLM | `aws_polly/` | +| Supported Operations | `/audio/speech` | +| Link to Provider Doc | [AWS Polly SynthesizeSpeech ↗](https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html) | + +## Quick Start + +### **LiteLLM SDK** + +```python showLineNumbers title="SDK Usage" +import litellm +from pathlib import Path +import os + +# Set environment variables +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "us-east-1" + +# AWS Polly call +speech_file_path = Path(__file__).parent / "speech.mp3" +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="the quick brown fox jumped over the lazy dogs", +) +response.stream_to_file(speech_file_path) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: polly-neural + litellm_params: + model: aws_polly/neural + aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" + aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" + aws_region_name: "us-east-1" +``` + +## Polly Engines + +AWS Polly supports different speech synthesis engines. 
Specify the engine in the model name: + +| Model | Engine | Cost (per 1M chars) | Description | +|-------|--------|---------------------|-------------| +| `aws_polly/standard` | Standard | $4.00 | Original Polly voices, faster and lowest cost | +| `aws_polly/neural` | Neural | $16.00 | More natural, human-like speech (recommended) | +| `aws_polly/generative` | Generative | $30.00 | Most expressive, highest quality (limited voices) | +| `aws_polly/long-form` | Long-form | $100.00 | Optimized for long content like articles | + +### **LiteLLM SDK** + +```python showLineNumbers title="Using Different Engines" +import litellm + +# Neural engine (recommended) +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello world", +) + +# Standard engine (lower cost) +response = litellm.speech( + model="aws_polly/standard", + voice="Joanna", + input="Hello world", +) + +# Generative engine (highest quality) +response = litellm.speech( + model="aws_polly/generative", + voice="Matthew", + input="Hello world", +) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: polly-neural + litellm_params: + model: aws_polly/neural + aws_region_name: "us-east-1" + - model_name: polly-standard + litellm_params: + model: aws_polly/standard + aws_region_name: "us-east-1" + - model_name: polly-generative + litellm_params: + model: aws_polly/generative + aws_region_name: "us-east-1" +``` + +## Available Voices + +### Native Polly Voices + +AWS Polly has many voices across different languages. Here are popular US English voices: + +| Voice | Gender | Engine Support | +|-------|--------|----------------| +| `Joanna` | Female | Neural, Standard | +| `Matthew` | Male | Neural, Standard, Generative | +| `Ivy` | Female (child) | Neural, Standard | +| `Kendra` | Female | Neural, Standard | +| `Amy` | Female (British) | Neural, Standard | +| `Brian` | Male (British) | Neural, Standard | + +### **LiteLLM SDK** + +```python showLineNumbers title="Using Native Polly Voices" +import litellm + +# US English female +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello from Joanna", +) + +# US English male +response = litellm.speech( + model="aws_polly/neural", + voice="Matthew", + input="Hello from Matthew", +) + +# British English female +response = litellm.speech( + model="aws_polly/neural", + voice="Amy", + input="Hello from Amy", +) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: polly-joanna + litellm_params: + model: aws_polly/neural + voice: "Joanna" + aws_region_name: "us-east-1" + - model_name: polly-matthew + litellm_params: + model: aws_polly/neural + voice: "Matthew" + aws_region_name: "us-east-1" +``` + +### OpenAI Voice Mappings + +LiteLLM also supports OpenAI voice names, which are automatically mapped to Polly voices: + +| OpenAI Voice | Maps to Polly Voice | +|--------------|---------------------| +| `alloy` | Joanna | +| `echo` | Matthew | +| `fable` | Amy | +| `onyx` | Brian | +| `nova` | Ivy | +| `shimmer` | Kendra | + +### **LiteLLM SDK** + +```python showLineNumbers title="Using OpenAI Voice Names" +import litellm + +# These are equivalent +response = litellm.speech( + model="aws_polly/neural", + voice="alloy", # Maps to Joanna + input="Hello world", +) + +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", # Native Polly voice + input="Hello world", +) +``` + +## SSML Support + +AWS Polly supports SSML 
(Speech Synthesis Markup Language) for advanced control over speech output. LiteLLM automatically detects SSML input. + +### **LiteLLM SDK** + +```python showLineNumbers title="SSML Example" +import litellm + +ssml_input = """ + + Hello, + this is a test with emphasis + and slower speech. + +""" + +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input=ssml_input, +) +``` + +### **LiteLLM PROXY** + +```bash showLineNumbers title="cURL Request with SSML" +curl -X POST http://localhost:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "polly-neural", + "voice": "Joanna", + "input": "Hello world" + }' \ + --output speech.mp3 +``` + +## Supported Parameters + +```python showLineNumbers title="All Parameters" +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", # Required: Voice selection + input="text to convert", # Required: Input text (or SSML) + response_format="mp3", # Optional: mp3, ogg_vorbis, pcm + + # AWS-specific parameters + language_code="en-US", # Optional: Language code + sample_rate="22050", # Optional: Sample rate in Hz +) +``` + +## Response Formats + +| Format | Description | +|--------|-------------| +| `mp3` | MP3 audio (default) | +| `ogg_vorbis` | Ogg Vorbis audio | +| `pcm` | Raw PCM audio | + +### **LiteLLM SDK** + +```python showLineNumbers title="Different Response Formats" +import litellm + +# MP3 (default) +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello", + response_format="mp3", +) + +# Ogg Vorbis +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello", + response_format="ogg_vorbis", +) +``` + +## AWS Authentication + +LiteLLM supports multiple AWS authentication methods. 
+ +### **LiteLLM SDK** + +```python showLineNumbers title="Authentication Options" +import litellm +import os + +# Option 1: Environment variables (recommended) +os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-key" +os.environ["AWS_REGION_NAME"] = "us-east-1" + +response = litellm.speech(model="aws_polly/neural", voice="Joanna", input="Hello") + +# Option 2: Pass credentials directly +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello", + aws_access_key_id="your-access-key", + aws_secret_access_key="your-secret-key", + aws_region_name="us-east-1", +) + +# Option 3: IAM Role (when running on AWS) +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello", + aws_region_name="us-east-1", +) + +# Option 4: AWS Profile +response = litellm.speech( + model="aws_polly/neural", + voice="Joanna", + input="Hello", + aws_profile_name="my-profile", +) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + # Using environment variables + - model_name: polly-neural + litellm_params: + model: aws_polly/neural + aws_access_key_id: "os.environ/AWS_ACCESS_KEY_ID" + aws_secret_access_key: "os.environ/AWS_SECRET_ACCESS_KEY" + aws_region_name: "us-east-1" + + # Using IAM Role (when proxy runs on AWS) + - model_name: polly-neural-iam + litellm_params: + model: aws_polly/neural + aws_region_name: "us-east-1" + + # Using AWS Profile + - model_name: polly-neural-profile + litellm_params: + model: aws_polly/neural + aws_profile_name: "my-profile" +``` + +## Async Support + +```python showLineNumbers title="Async Usage" +import litellm +import asyncio + +async def main(): + response = await litellm.aspeech( + model="aws_polly/neural", + voice="Joanna", + input="Hello from async AWS Polly", + aws_region_name="us-east-1", + ) + + with open("output.mp3", "wb") as f: + f.write(response.content) + +asyncio.run(main()) +``` diff --git a/docs/my-website/docs/providers/azure/azure.md b/docs/my-website/docs/providers/azure/azure.md index 2f8453573280..12ddc1bd98e8 100644 --- a/docs/my-website/docs/providers/azure/azure.md +++ b/docs/my-website/docs/providers/azure/azure.md @@ -9,10 +9,10 @@ import TabItem from '@theme/TabItem'; | Property | Details | |-------|-------| -| Description | Azure OpenAI Service provides REST API access to OpenAI's powerful language models including o1, o1-mini, GPT-5, GPT-4o, GPT-4o mini, GPT-4 Turbo with Vision, GPT-4, GPT-3.5-Turbo, and Embeddings model series | -| Provider Route on LiteLLM | `azure/`, [`azure/o_series/`](#o-series-models), [`azure/gpt5_series/`](#gpt-5-models) | -| Supported Operations | [`/chat/completions`](#azure-openai-chat-completion-models), [`/responses`](./azure_responses), [`/completions`](#azure-instruct-models), [`/embeddings`](./azure_embedding), [`/audio/speech`](azure_speech), [`/audio/transcriptions`](../audio_transcription), `/fine_tuning`, [`/batches`](#azure-batches-api), `/files`, [`/images`](../image_generation#azure-openai-image-generation-models) | -| Link to Provider Doc | [Azure OpenAI ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview) +| Description | Azure OpenAI Service provides REST API access to OpenAI's powerful language models including o1, o1-mini, GPT-5, GPT-4o, GPT-4o mini, GPT-4 Turbo with Vision, GPT-4, GPT-3.5-Turbo, and Embeddings model series. Also supports Claude models via Azure Foundry. 
| +| Provider Route on LiteLLM | `azure/`, [`azure/o_series/`](#o-series-models), [`azure/gpt5_series/`](#gpt-5-models), [`azure/claude-*`](./azure_anthropic) (Claude models via Azure Foundry) | +| Supported Operations | [`/chat/completions`](#azure-openai-chat-completion-models), [`/responses`](./azure_responses), [`/completions`](#azure-instruct-models), [`/embeddings`](./azure_embedding), [`/audio/speech`](azure_speech), [`/audio/transcriptions`](../audio_transcription), `/fine_tuning`, [`/batches`](#azure-batches-api), `/files`, [`/images`](../image_generation#azure-openai-image-generation-models), [`/anthropic/v1/messages`](./azure_anthropic) | +| Link to Provider Doc | [Azure OpenAI ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview), [Azure Foundry Claude ↗](https://learn.microsoft.com/en-us/azure/ai-services/foundry-models/claude) ## API Keys, Params api_key, api_base, api_version etc can be passed directly to `litellm.completion` - see here or set as `litellm.api_key` params see here @@ -27,6 +27,12 @@ os.environ["AZURE_AD_TOKEN"] = "" os.environ["AZURE_API_TYPE"] = "" ``` +:::info Azure Foundry Claude Models + +Azure also supports Claude models via Azure Foundry. Use `azure/claude-*` model names (e.g., `azure/claude-sonnet-4-5`) with Azure authentication. See the [Azure Anthropic documentation](./azure_anthropic) for details. + +::: + ## **Usage - LiteLLM Python SDK** Open In Colab @@ -251,7 +257,7 @@ response = completion( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -543,7 +549,8 @@ print(response) ### Entra ID - use `azure_ad_token` -This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls +This is a walkthrough on how to use Azure Active Directory Tokens - Microsoft Entra ID to make `litellm.completion()` calls. +> **Note:** You can follow the same process below to use Azure Active Directory Tokens for all other Azure endpoints (e.g., chat, embeddings, image, audio, etc.) with LiteLLM. Step 1 - Download Azure CLI Installation instructions: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli diff --git a/docs/my-website/docs/providers/azure/azure_anthropic.md b/docs/my-website/docs/providers/azure/azure_anthropic.md new file mode 100644 index 000000000000..4c722b303979 --- /dev/null +++ b/docs/my-website/docs/providers/azure/azure_anthropic.md @@ -0,0 +1,378 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Azure Anthropic (Claude via Azure Foundry) + +LiteLLM supports Claude models deployed via Microsoft Azure Foundry, including Claude Sonnet 4.5, Claude Haiku 4.5, and Claude Opus 4.1. + +## Available Models + +Azure Foundry supports the following Claude models: + +- `claude-sonnet-4-5` - Anthropic's most capable model for building real-world agents and handling complex, long-horizon tasks +- `claude-haiku-4-5` - Near-frontier performance with the right speed and cost for high-volume use cases +- `claude-opus-4-1` - Industry leader for coding, delivering sustained performance on long-running tasks + +| Property | Details | +|-------|-------| +| Description | Claude models deployed via Microsoft Azure Foundry. 
Uses the same API as Anthropic's Messages API but with Azure authentication. |
+| Provider Route on LiteLLM | `azure_ai/` (add this prefix to Claude model names - e.g. `azure_ai/claude-sonnet-4-5`) |
+| Provider Doc | [Azure Foundry Claude Models ↗](https://learn.microsoft.com/en-us/azure/ai-services/foundry-models/claude) |
+| API Endpoint | `https://<your-resource>.services.ai.azure.com/anthropic/v1/messages` |
+| Supported Endpoints | `/chat/completions`, `/anthropic/v1/messages` |
+
+## Key Features
+
+- **Extended thinking**: Enhanced reasoning capabilities for complex tasks
+- **Image and text input**: Strong vision capabilities for analyzing charts, graphs, technical diagrams, and reports
+- **Code generation**: Advanced thinking with code generation, analysis, and debugging (Claude Sonnet 4.5 and Claude Opus 4.1)
+- **Same API as Anthropic**: All request/response transformations are identical to the main Anthropic provider
+
+## Authentication
+
+Azure Anthropic supports two authentication methods:
+
+1. **API Key**: Use the `api-key` header
+2. **Azure AD Token**: Use the `Authorization: Bearer <token>` header (Microsoft Entra ID)
+
+## API Keys and Configuration
+
+```python
+import os
+
+# Option 1: API Key authentication
+os.environ["AZURE_API_KEY"] = "your-azure-api-key"
+os.environ["AZURE_API_BASE"] = "https://<your-resource>.services.ai.azure.com/anthropic"
+
+# Option 2: Azure AD Token authentication
+os.environ["AZURE_AD_TOKEN"] = "your-azure-ad-token"
+os.environ["AZURE_API_BASE"] = "https://<your-resource>.services.ai.azure.com/anthropic"
+
+# Optional: Azure AD Token Provider (for automatic token refresh)
+os.environ["AZURE_TENANT_ID"] = "your-tenant-id"
+os.environ["AZURE_CLIENT_ID"] = "your-client-id"
+os.environ["AZURE_CLIENT_SECRET"] = "your-client-secret"
+os.environ["AZURE_SCOPE"] = "https://cognitiveservices.azure.com/.default"
+```
+
+## Usage - LiteLLM Python SDK
+
+### Basic Completion
+
+```python
+import os
+
+from litellm import completion
+
+# Set environment variables
+os.environ["AZURE_API_KEY"] = "your-azure-api-key"
+os.environ["AZURE_API_BASE"] = "https://<your-resource>.services.ai.azure.com/anthropic"
+
+# Make a completion request
+response = completion(
+    model="azure_ai/claude-sonnet-4-5",
+    messages=[
+        {"role": "user", "content": "What are 3 things to visit in Seattle?"}
+    ],
+    max_tokens=1000,
+    temperature=0.7,
+)
+
+print(response)
+```
+
+### Completion with API Key Parameter
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="azure_ai/claude-sonnet-4-5",
+    api_base="https://<your-resource>.services.ai.azure.com/anthropic",
+    api_key="your-azure-api-key",
+    messages=[
+        {"role": "user", "content": "Hello!"}
+    ],
+    max_tokens=1000,
+)
+```
+
+### Completion with Azure AD Token
+
+```python
+import litellm
+
+response = litellm.completion(
+    model="azure_ai/claude-sonnet-4-5",
+    api_base="https://<your-resource>.services.ai.azure.com/anthropic",
+    azure_ad_token="your-azure-ad-token",
+    messages=[
+        {"role": "user", "content": "Hello!"}
+    ],
+    max_tokens=1000,
+)
+```
+
+### Streaming
+
+```python
+from litellm import completion
+
+response = completion(
+    model="azure_ai/claude-sonnet-4-5",
+    messages=[
+        {"role": "user", "content": "Write a short story"}
+    ],
+    stream=True,
+    max_tokens=1000,
+)
+
+for chunk in response:
+    if chunk.choices[0].delta.content:
+        print(chunk.choices[0].delta.content, end="", flush=True)
+```
+
+### Tool Calling
+
+```python
+from litellm import completion
+
+response = completion(
+    model="azure_ai/claude-sonnet-4-5",
+    messages=[
+        {"role": "user", "content": "What's the weather in Seattle?"}
+    ],
+    
tools=[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA" + } + }, + "required": ["location"] + } + } + } + ], + tool_choice="auto", + max_tokens=1000, +) + +print(response) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export AZURE_API_KEY="your-azure-api-key" +export AZURE_API_BASE="https://.services.ai.azure.com/anthropic" +``` + +### 2. Configure the proxy + +```yaml +model_list: + - model_name: claude-sonnet-4-5 + litellm_params: + model: azure_ai/claude-sonnet-4-5 + api_base: https://.services.ai.azure.com/anthropic + api_key: os.environ/AZURE_API_KEY +``` + +### 3. Test it + + + + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "claude-sonnet-4-5", + "messages": [ + { + "role": "user", + "content": "Hello!" + } + ], + "max_tokens": 1000 +}' +``` + + + + +```python +from openai import OpenAI + +client = OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[ + {"role": "user", "content": "Hello!"} + ], + max_tokens=1000 +) + +print(response) +``` + + + + +## Messages API + +Azure Anthropic also supports the native Anthropic Messages API. The endpoint structure is the same as Anthropic's `/v1/messages` API. + +### Using Anthropic SDK + +```python +from anthropic import Anthropic + +client = Anthropic( + api_key="your-azure-api-key", + base_url="https://.services.ai.azure.com/anthropic" +) + +response = client.messages.create( + model="claude-sonnet-4-5", + max_tokens=1000, + messages=[ + {"role": "user", "content": "Hello, world"} + ] +) + +print(response) +``` + +### Using LiteLLM Proxy + +```bash +curl --request POST \ + --url http://0.0.0.0:4000/anthropic/v1/messages \ + --header 'accept: application/json' \ + --header 'content-type: application/json' \ + --header "Authorization: bearer sk-anything" \ + --data '{ + "model": "claude-sonnet-4-5", + "max_tokens": 1024, + "messages": [ + {"role": "user", "content": "Hello, world"} + ] +}' +``` + +## Supported OpenAI Parameters + +Azure Anthropic supports the same parameters as the main Anthropic provider: + +``` +"stream", +"stop", +"temperature", +"top_p", +"max_tokens", +"max_completion_tokens", +"tools", +"tool_choice", +"extra_headers", +"parallel_tool_calls", +"response_format", +"user", +"thinking", +"reasoning_effort" +``` + +:::info + +Azure Anthropic API requires `max_tokens` to be passed. LiteLLM automatically passes `max_tokens=4096` when no `max_tokens` are provided. + +::: + +## Differences from Standard Anthropic Provider + +The only difference between Azure Anthropic and the standard Anthropic provider is authentication: + +- **Standard Anthropic**: Uses `x-api-key` header +- **Azure Anthropic**: Uses `api-key` header or `Authorization: Bearer ` for Azure AD authentication + +All other request/response transformations, tool calling, streaming, and feature support are identical. + +## API Base URL Format + +The API base URL should follow this format: + +``` +https://.services.ai.azure.com/anthropic +``` + +LiteLLM will automatically append `/v1/messages` if not already present in the URL. 
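+
+For example, both of the following `api_base` values are expected to reach the same `/v1/messages` endpoint. This is a small sketch assuming a hypothetical resource named `my-resource`:
+
+```python
+import litellm
+
+# Base URL without the messages path - LiteLLM appends /v1/messages for you
+response = litellm.completion(
+    model="azure_ai/claude-sonnet-4-5",
+    api_base="https://my-resource.services.ai.azure.com/anthropic",
+    api_key="your-azure-api-key",
+    messages=[{"role": "user", "content": "Hello!"}],
+    max_tokens=100,
+)
+
+# Base URL that already ends in /v1/messages - used as-is
+response = litellm.completion(
+    model="azure_ai/claude-sonnet-4-5",
+    api_base="https://my-resource.services.ai.azure.com/anthropic/v1/messages",
+    api_key="your-azure-api-key",
+    messages=[{"role": "user", "content": "Hello!"}],
+    max_tokens=100,
+)
+```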
+ +## Example: Full Configuration + +```python +import os +from litellm import completion + +# Configure Azure Anthropic +os.environ["AZURE_API_KEY"] = "your-azure-api-key" +os.environ["AZURE_API_BASE"] = "https://my-resource.services.ai.azure.com/anthropic" + +# Make a request +response = completion( + model="azure_ai/claude-sonnet-4-5", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain quantum computing in simple terms."} + ], + max_tokens=1000, + temperature=0.7, + stream=False, +) + +print(response.choices[0].message.content) +``` + +## Troubleshooting + +### Missing API Base Error + +If you see an error about missing API base, ensure you've set: + +```python +os.environ["AZURE_API_BASE"] = "https://.services.ai.azure.com/anthropic" +``` + +Or pass it directly: + +```python +response = completion( + model="azure_ai/claude-sonnet-4-5", + api_base="https://.services.ai.azure.com/anthropic", + # ... +) +``` + +### Authentication Errors + +- **API Key**: Ensure `AZURE_API_KEY` is set or passed as `api_key` parameter +- **Azure AD Token**: Ensure `AZURE_AD_TOKEN` is set or passed as `azure_ad_token` parameter +- **Token Provider**: For automatic token refresh, configure `AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, and `AZURE_CLIENT_SECRET` + +## Related Documentation + +- [Anthropic Provider Documentation](./anthropic.md) - For standard Anthropic API usage +- [Azure OpenAI Documentation](./azure.md) - For Azure OpenAI models +- [Azure Authentication Guide](../secret_managers/azure_key_vault.md) - For Azure AD token setup + diff --git a/docs/my-website/docs/providers/azure/videos.md b/docs/my-website/docs/providers/azure/videos.md index d088c63f7108..62f8d0df182d 100644 --- a/docs/my-website/docs/providers/azure/videos.md +++ b/docs/my-website/docs/providers/azure/videos.md @@ -25,7 +25,6 @@ LiteLLM supports Azure OpenAI's video generation models including Sora with full import os os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-api-key" os.environ["AZURE_OPENAI_API_BASE"] = "https://your-resource.openai.azure.com/" -os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-15-preview" ``` ### Basic Usage @@ -37,7 +36,6 @@ import time os.environ["AZURE_OPENAI_API_KEY"] = "your-azure-api-key" os.environ["AZURE_OPENAI_API_BASE"] = "https://your-resource.openai.azure.com/" -os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-15-preview" # Generate video response = video_generation( @@ -53,8 +51,7 @@ print(f"Initial Status: {response.status}") # Check status until video is ready while True: status_response = video_status( - video_id=response.id, - model="azure/sora-2" + video_id=response.id ) print(f"Current Status: {status_response.status}") @@ -69,8 +66,7 @@ while True: # Download video content when ready video_bytes = video_content( - video_id=response.id, - model="azure/sora-2" + video_id=response.id ) # Save to file @@ -87,7 +83,6 @@ Here's how to call Azure video generation models with the LiteLLM Proxy Server ```bash export AZURE_OPENAI_API_KEY="your-azure-api-key" export AZURE_OPENAI_API_BASE="https://your-resource.openai.azure.com/" -export AZURE_OPENAI_API_VERSION="2024-02-15-preview" ``` ### 2. 
Start the proxy @@ -102,7 +97,6 @@ model_list: model: azure/sora-2 api_key: os.environ/AZURE_OPENAI_API_KEY api_base: os.environ/AZURE_OPENAI_API_BASE - api_version: "2024-02-15-preview" ``` @@ -211,8 +205,7 @@ general_settings: ```python # Download video content video_bytes = video_content( - video_id="video_1234567890", - model="azure/sora-2" + video_id="video_1234567890" ) # Save to file @@ -243,8 +236,7 @@ def generate_and_download_video(prompt): # Step 3: Download video video_bytes = litellm.video_content( - video_id=video_id, - model="azure/sora-2" + video_id=video_id ) # Step 4: Save to file @@ -264,9 +256,9 @@ video_file = generate_and_download_video( ```python # Video editing with reference image response = litellm.video_remix( + video_id="video_456", prompt="Make the cat jump higher", input_reference=open("path/to/image.jpg", "rb"), # Reference image as file object - model="azure/sora-2", seconds="8" ) diff --git a/docs/my-website/docs/providers/azure_ai.md b/docs/my-website/docs/providers/azure_ai.md index b1b5de5bb34a..68e2df676e66 100644 --- a/docs/my-website/docs/providers/azure_ai.md +++ b/docs/my-website/docs/providers/azure_ai.md @@ -312,6 +312,82 @@ LiteLLM supports **ALL** azure ai models. Here's a few examples: | mistral-large-latest | `completion(model="azure_ai/mistral-large-latest", messages)` | | AI21-Jamba-Instruct | `completion(model="azure_ai/ai21-jamba-instruct", messages)` | +## Usage - Azure Anthropic (Azure Foundry Claude) + +LiteLLM funnels Azure Claude deployments through the `azure_ai/` provider so Claude Opus models on Azure Foundry keep working with Tool Search, Effort, streaming, and the rest of the advanced feature set. Point `AZURE_AI_API_BASE` to `https://.services.ai.azure.com/anthropic` (LiteLLM appends `/v1/messages` automatically) and authenticate with `AZURE_AI_API_KEY` or an Azure AD token. + + + + +```python +import os +from litellm import completion + +# Configure Azure credentials +os.environ["AZURE_AI_API_KEY"] = "your-azure-ai-api-key" +os.environ["AZURE_AI_API_BASE"] = "https://my-resource.services.ai.azure.com/anthropic" + +response = completion( + model="azure_ai/claude-opus-4-1", + messages=[{"role": "user", "content": "Explain how Azure Anthropic hosts Claude Opus differently from the public Anthropic API."}], + max_tokens=1200, + temperature=0.7, + stream=True, +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + + + + +**1. Set environment variables** + +```bash +export AZURE_AI_API_KEY="your-azure-ai-api-key" +export AZURE_AI_API_BASE="https://my-resource.services.ai.azure.com/anthropic" +``` + +**2. Configure the proxy** + +```yaml +model_list: + - model_name: claude-4-azure + litellm_params: + model: azure_ai/claude-opus-4-1 + api_key: os.environ/AZURE_AI_API_KEY + api_base: os.environ/AZURE_AI_API_BASE +``` + +**3. Start LiteLLM** + +```bash +litellm --config /path/to/config.yaml +``` + +**4. Test the Azure Claude route** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer $LITELLM_KEY' \ + --data '{ + "model": "claude-4-azure", + "messages": [ + { + "role": "user", + "content": "How do I use Claude Opus 4 via Azure Anthropic in LiteLLM?" 
+ } + ], + "max_tokens": 1024 + }' +``` + + + + ## Rerank Endpoint @@ -397,4 +473,5 @@ curl http://0.0.0.0:4000/rerank \ ``` - \ No newline at end of file + + diff --git a/docs/my-website/docs/providers/azure_ai/azure_ai_vector_stores_passthrough.md b/docs/my-website/docs/providers/azure_ai/azure_ai_vector_stores_passthrough.md new file mode 100644 index 000000000000..a528b1ccfcfa --- /dev/null +++ b/docs/my-website/docs/providers/azure_ai/azure_ai_vector_stores_passthrough.md @@ -0,0 +1,391 @@ +# Azure AI Search - Vector Store (Passthrough API) + +Use this to allow developers to **create** and **search** vector stores using the Azure AI Search API in the **native** Azure AI Search API format, without giving them the Azure AI credentials. + +This is for the proxy only. + +## Admin Flow + +### 1. Add the vector store to LiteLLM + +```yaml +model_list: + - model_name: embedding-model + litellm_params: + model: openai/text-embedding-3-large + + +vector_store_registry: + - vector_store_name: "azure-ai-search" + litellm_params: + vector_store_id: "can-be-anything" # vector store id can be anything for the purpose of passthrough api + custom_llm_provider: "azure_ai" + api_key: os.environ/AZURE_SEARCH_API_KEY + api_base: https://azure-kb-search.search.windows.net + litellm_embedding_model: "azure/text-embedding-3-large" + litellm_embedding_config: + api_base: https://krris-mh44uf7y-eastus2.cognitiveservices.azure.com/ + api_key: os.environ/AZURE_API_KEY + api_version: "2025-09-01" + +general_settings: + database_url: "postgresql://user:password@host:port/database" + master_key: "sk-1234" +``` + +Add your vector store credentials to LiteLLM. + +### 2. Start the proxy. + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +### 3. Create a virtual index. + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/indexes' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "index_name": "dall-e-4", + "litellm_params": { + "vector_store_index": "real-index-name-2", + "vector_store_name": "azure-ai-search" + } + +}' +``` + +This is a virtual index, which the developer can use to create and search vector stores. + +### 4. Create a key with the vector store permissions. + +```bash +curl -L -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "allowed_vector_store_indexes": [{"index_name": "dall-e-4", "index_permissions": ["write", "read"]}], + "models": ["embedding-model"] +}' +``` + +Give the key access to the virtual index and the embedding model. + +**Expected response** + +```json +{ + "key": "sk-my-virtual-key" +} +``` + +## Developer Flow + +### 1. Create a vector store with some documents. + +Note: Use the '/azure_ai' endpoint for the passthrough api that uses the `azure_ai` provider in your `_new_secret_config.yaml` file. 
+ +```python +import requests +import json + +# ---------------------------- +# 🔐 CONFIGURATION +# ---------------------------- +# Azure OpenAI (for embeddings) +AZURE_OPENAI_ENDPOINT = "http://0.0.0.0:4000" +AZURE_OPENAI_KEY = "sk-my-virtual-key" +EMBEDDING_DEPLOYMENT_NAME = "embedding-model" + +# Azure AI Search +AZURE_AI_SEARCH_ENDPOINT = "http://0.0.0.0:4000/azure_ai" # IMPORTANT: Use the '/azure_ai' endpoint for the passthrough api to Azure +SEARCH_API_KEY = "sk-my-virtual-key" +INDEX_NAME = "dall-e-4" + + + +# Vector dimensions (text-embedding-3-large uses 3072 dimensions) +VECTOR_DIMENSIONS = 3072 + +# Example docs (replace with your own) +documents = [ + {"id": "1", "content": "Refunds must be requested within 30 days."}, + {"id": "2", "content": "We offer 24/7 support for all enterprise customers."}, +] + + +# ---------------------------- +# 📋 STEP 0 — Create Index Schema +# ---------------------------- +def delete_index_if_exists(): + """Delete the index if it exists""" + index_url = f"{AZURE_AI_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}?api-version=2024-07-01" + headers = {"api-key": SEARCH_API_KEY} + + response = requests.delete(index_url, headers=headers) + + if response.status_code == 204: + print(f"🗑️ Deleted existing index '{INDEX_NAME}'") + return True + elif response.status_code == 404: + print(f"ℹ️ Index '{INDEX_NAME}' does not exist yet") + return False + else: + print(f"⚠️ Delete response: {response.status_code}") + print(f" Message: {response.text}") + return False + + +def create_index(): + """Create the Azure AI Search index with proper schema""" + index_url = f"{AZURE_AI_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}?api-version=2024-07-01" + headers = {"Content-Type": "application/json", "api-key": SEARCH_API_KEY} + + index_schema = { + "name": INDEX_NAME, + "fields": [ + {"name": "id", "type": "Edm.String", "key": True, "filterable": True}, + { + "name": "content", + "type": "Edm.String", + "searchable": True, + "filterable": False, + }, + { + "name": "contentVector", + "type": "Collection(Edm.Single)", + "searchable": True, + "dimensions": VECTOR_DIMENSIONS, + "vectorSearchProfile": "my-vector-profile", + }, + ], + "vectorSearch": { + "algorithms": [ + { + "name": "my-hnsw-algorithm", + "kind": "hnsw", + "hnswParameters": { + "metric": "cosine", + "m": 4, + "efConstruction": 400, + "efSearch": 500, + }, + } + ], + "profiles": [ + {"name": "my-vector-profile", "algorithm": "my-hnsw-algorithm"} + ], + }, + } + + # Create the index + response = requests.put(index_url, headers=headers, json=index_schema) + + if response.status_code == 201: + print(f"✅ Index '{INDEX_NAME}' created successfully.") + return True + elif response.status_code == 204: + print(f"✅ Index '{INDEX_NAME}' updated successfully.") + return True + else: + print(f"❌ Failed to create index: {response.status_code}") + print(f" Message: {response.text}") + return False + + +# Delete and recreate the index with correct schema +print("🔄 Setting up Azure AI Search index...") +delete_index_if_exists() +if not create_index(): + print("❌ Could not create index. 
Exiting.") + exit(1) + + +# ---------------------------- +# 🧠 STEP 1 — Generate Embeddings +# ---------------------------- +def get_embedding(text: str): + url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{EMBEDDING_DEPLOYMENT_NAME}/embeddings?api-version=2024-10-21" + headers = {"Content-Type": "application/json", "api-key": AZURE_OPENAI_KEY} + payload = {"input": text} + response = requests.post(url, headers=headers, json=payload) + + if response.status_code != 200: + raise Exception(f"Embedding failed: {response.status_code}\n{response.text}") + return response.json()["data"][0]["embedding"] + + +# Generate embeddings for each document +for doc in documents: + doc["contentVector"] = get_embedding(doc["content"]) + print(f"✅ Embedded doc {doc['id']} (vector length: {len(doc['contentVector'])})") + +# ---------------------------- +# 📤 STEP 2 — Upload to Azure AI Search +# ---------------------------- +upload_url = f"{AZURE_AI_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}/docs/index?api-version=2024-07-01" +headers = {"Content-Type": "application/json", "api-key": SEARCH_API_KEY} + +payload = { + "value": [ + { + "@search.action": "upload", + "id": doc["id"], + "content": doc["content"], + "contentVector": doc["contentVector"], + } + for doc in documents + ] +} + +response = requests.post(upload_url, headers=headers, data=json.dumps(payload)) + +# ---------------------------- +# 🧾 RESULT +# ---------------------------- +if response.status_code == 200: + print("✅ Documents uploaded successfully.") +else: + print(f"❌ Upload failed: {response.status_code}") + print(response.text) + +``` + + +### 2. Search the vector store. + + +```python +import requests +import json + +# ---------------------------- +# 🔐 CONFIGURATION +# ---------------------------- +# Azure OpenAI (for embeddings) +AZURE_OPENAI_ENDPOINT = "http://0.0.0.0:4000" +AZURE_OPENAI_KEY = "sk-my-virtual-key" +EMBEDDING_DEPLOYMENT_NAME = "embedding-model" + +# Azure AI Search +AZURE_AI_SEARCH_ENDPOINT = "http://0.0.0.0:4000/azure_ai" +SEARCH_API_KEY = "sk-my-virtual-key" +INDEX_NAME = "dall-e-4" + + +# ---------------------------- +# 🧠 Generate Query Embedding +# ---------------------------- +def get_embedding(text: str): + """Generate embedding for the query text""" + url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{EMBEDDING_DEPLOYMENT_NAME}/embeddings?api-version=2024-10-21" + headers = {"Content-Type": "application/json", "api-key": AZURE_OPENAI_KEY} + payload = {"input": text} + response = requests.post(url, headers=headers, json=payload) + + if response.status_code != 200: + raise Exception(f"Embedding failed: {response.status_code}\n{response.text}") + return response.json()["data"][0]["embedding"] + + +# ---------------------------- +# 🔍 Vector Search Function +# ---------------------------- +def search_knowledge_base(query: str, top_k: int = 3): + """ + Search the knowledge base using vector similarity + + Args: + query: The search query string + top_k: Number of top results to return (default: 3) + + Returns: + List of search results with content and scores + """ + print(f"🔍 Searching for: '{query}'") + + # Step 1: Generate embedding for the query + print(" Generating query embedding...") + query_vector = get_embedding(query) + + # Step 2: Perform vector search + search_url = f"{AZURE_AI_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}/docs/search?api-version=2024-07-01" + headers = {"Content-Type": "application/json", "api-key": SEARCH_API_KEY} + + # Build the search request with vector search + search_payload = { + "search": "*", # Get 
all documents + "vectorQueries": [ + { + "vector": query_vector, + "fields": "contentVector", + "kind": "vector", + "k": top_k, # Number of nearest neighbors to return + } + ], + "select": "id,content", # Fields to return + "top": top_k, + } + + # Execute the search + response = requests.post(search_url, headers=headers, json=search_payload) + + if response.status_code != 200: + raise Exception(f"Search failed: {response.status_code}\n{response.text}") + + # Parse and return results + results = response.json() + return results.get("value", []) + + +# ---------------------------- +# 📊 Display Results +# ---------------------------- +def display_results(results): + """Pretty print the search results""" + if not results: + print("\n❌ No results found.") + return + + print(f"\n✅ Found {len(results)} results:\n") + print("=" * 80) + + for i, result in enumerate(results, 1): + print(f"\n📄 Result #{i}") + print(f" ID: {result.get('id', 'N/A')}") + print(f" Score: {result.get('@search.score', 'N/A')}") + print(f" Content: {result.get('content', 'N/A')}") + print("-" * 80) + + +# ---------------------------- +# 🎯 MAIN - Example Queries +# ---------------------------- +if __name__ == "__main__": + # Example 1: Search for refund policy + print("\n" + "=" * 80) + print("EXAMPLE 1: Refund Policy Query") + print("=" * 80) + results = search_knowledge_base("How do I get a refund?", top_k=2) + display_results(results) + + # Example 2: Search for customer support + print("\n\n" + "=" * 80) + print("EXAMPLE 2: Customer Support Query") + print("=" * 80) + results = search_knowledge_base("When can I contact support?", top_k=2) + display_results(results) + + # Example 3: Custom query - uncomment to use + # print("\n\n" + "=" * 80) + # print("CUSTOM QUERY") + # print("=" * 80) + # custom_query = input("Enter your query: ") + # results = search_knowledge_base(custom_query, top_k=3) + # display_results(results) + +``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/azure_ai/azure_model_router.md b/docs/my-website/docs/providers/azure_ai/azure_model_router.md new file mode 100644 index 000000000000..16bc1afb70eb --- /dev/null +++ b/docs/my-website/docs/providers/azure_ai/azure_model_router.md @@ -0,0 +1,281 @@ +# Azure Model Router + +Azure Model Router is a feature in Azure AI Foundry that automatically routes your requests to the best available model based on your requirements. This allows you to use a single endpoint that intelligently selects the optimal model for each request. 
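+
+As a quick sketch (assuming a deployment named `azure-model-router` and an `AZURE_MODEL_ROUTER_API_KEY` environment variable), a single call to the router deployment comes back tagged with whichever underlying model Azure actually selected:
+
+```python
+import litellm
+import os
+
+response = litellm.completion(
+    model="azure_ai/model_router/azure-model-router",  # one endpoint, Azure picks the model
+    messages=[{"role": "user", "content": "Hello!"}],
+    api_base="https://your-endpoint.cognitiveservices.azure.com/openai/v1/",
+    api_key=os.getenv("AZURE_MODEL_ROUTER_API_KEY"),
+)
+
+# The response reports the model the router actually used
+print(response.model)  # e.g. "azure_ai/gpt-4.1-nano-2025-04-14"
+```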
+ +## Key Features + +- **Automatic Model Selection**: Azure Model Router dynamically selects the best model for your request +- **Cost Tracking**: LiteLLM automatically tracks costs based on the actual model used (e.g., `gpt-4.1-nano`), plus the Model Router infrastructure fee +- **Streaming Support**: Full support for streaming responses with accurate cost calculation +- **Simple Configuration**: Easy to set up via UI or config file + +## Model Naming Pattern + +Use the pattern: `azure_ai/model_router/` + +**Components:** +- `azure_ai` - The provider identifier +- `model_router` - Indicates this is a Model Router deployment +- `` - Your actual deployment name from Azure AI Foundry (e.g., `azure-model-router`) + +**Example:** `azure_ai/model_router/azure-model-router` + +**How it works:** +- LiteLLM automatically strips the `model_router/` prefix when sending requests to Azure +- Only your deployment name (e.g., `azure-model-router`) is sent to the Azure API +- The full path is preserved in responses and logs for proper cost tracking + +## LiteLLM Python SDK + +### Basic Usage + +Use the pattern `azure_ai/model_router/` where `` is your Azure deployment name: + +```python +import litellm +import os + +response = litellm.completion( + model="azure_ai/model_router/azure-model-router", # Use your deployment name + messages=[{"role": "user", "content": "Hello!"}], + api_base="https://your-endpoint.cognitiveservices.azure.com/openai/v1/", + api_key=os.getenv("AZURE_MODEL_ROUTER_API_KEY"), +) + +print(response) +``` + +**Pattern Explanation:** +- `azure_ai` - The provider +- `model_router` - Indicates this is a model router deployment +- `azure-model-router` - Your actual deployment name from Azure AI Foundry + +LiteLLM will automatically strip the `model_router/` prefix when sending the request to Azure, so only `azure-model-router` is sent to the API. + +### Streaming with Usage Tracking + +```python +import litellm +import os + +response = await litellm.acompletion( + model="azure_ai/model_router/azure-model-router", # Use your deployment name + messages=[{"role": "user", "content": "hi"}], + api_base="https://your-endpoint.cognitiveservices.azure.com/openai/v1/", + api_key=os.getenv("AZURE_MODEL_ROUTER_API_KEY"), + stream=True, + stream_options={"include_usage": True}, +) + +async for chunk in response: + print(chunk) +``` + +## LiteLLM Proxy (AI Gateway) + +### config.yaml + +```yaml +model_list: + - model_name: azure-model-router # Public name for your users + litellm_params: + model: azure_ai/model_router/azure-model-router # Use your deployment name + api_base: https://your-endpoint.cognitiveservices.azure.com/openai/v1/ + api_key: os.environ/AZURE_MODEL_ROUTER_API_KEY +``` + +**Note:** Replace `azure-model-router` in the model path with your actual deployment name from Azure AI Foundry. + +### Start Proxy + +```bash +litellm --config config.yaml +``` + +### Test Request + +```bash +curl -X POST http://localhost:4000/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "azure-model-router", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +## Add Azure Model Router via LiteLLM UI + +This walkthrough shows how to add an Azure Model Router endpoint to LiteLLM using the Admin Dashboard. + +### Quick Start + +1. Navigate to the **Models** page in the LiteLLM UI +2. Select **"Azure AI Foundry (Studio)"** as the provider +3. Enter your deployment name (e.g., `azure-model-router`) +4. 
LiteLLM will automatically format it as `azure_ai/model_router/azure-model-router` +5. Add your API base URL and API key +6. Test and save + +### Detailed Walkthrough + +#### Step 1: Select Provider + +Navigate to the Models page and select "Azure AI Foundry (Studio)" as the provider. + +##### Navigate to Models Page + +![Navigate to Models](./img/azure_model_router_01.jpeg) + +##### Click Provider Dropdown + +![Click Provider](./img/azure_model_router_02.jpeg) + +##### Choose Azure AI Foundry + +![Select Azure AI Foundry](./img/azure_model_router_03.jpeg) + +#### Step 2: Enter Deployment Name + +**New Simplified Method:** Just enter your deployment name directly in the text field. If your deployment name contains "model-router" or "model_router", LiteLLM will automatically format it as `azure_ai/model_router/`. + +**Example:** +- Enter: `azure-model-router` +- LiteLLM creates: `azure_ai/model_router/azure-model-router` + +##### Copy Deployment Name from Azure Portal + +Switch to Azure AI Foundry and copy your model router deployment name. + +![Azure Portal Model Name](./img/azure_model_router_09.jpeg) + +![Copy Model Name](./img/azure_model_router_10.jpeg) + +##### Enter Deployment Name in LiteLLM + +Paste your deployment name (e.g., `azure-model-router`) directly into the text field. + +![Enter Deployment Name](./img/azure_model_router_04.jpeg) + +**What happens behind the scenes:** +- You enter: `azure-model-router` +- LiteLLM automatically detects this is a model router deployment +- The full model path becomes: `azure_ai/model_router/azure-model-router` +- When making API calls, only `azure-model-router` is sent to Azure + +#### Step 3: Configure API Base and Key + +Copy the endpoint URL and API key from Azure portal. + +##### Copy API Base URL from Azure + +![Copy API Base](./img/azure_model_router_12.jpeg) + +##### Enter API Base in LiteLLM + +![Click API Base Field](./img/azure_model_router_13.jpeg) + +![Paste API Base](./img/azure_model_router_14.jpeg) + +##### Copy API Key from Azure + +![Copy API Key](./img/azure_model_router_15.jpeg) + +##### Enter API Key in LiteLLM + +![Enter API Key](./img/azure_model_router_16.jpeg) + +#### Step 4: Test and Add Model + +Verify your configuration works and save the model. + +##### Test Connection + +![Test Connection](./img/azure_model_router_17.jpeg) + +##### Close Test Dialog + +![Close Dialog](./img/azure_model_router_18.jpeg) + +##### Add Model + +![Add Model](./img/azure_model_router_19.jpeg) + +#### Step 5: Verify in Playground + +Test your model and verify cost tracking is working. + +##### Open Playground + +![Go to Playground](./img/azure_model_router_20.jpeg) + +##### Select Model + +![Select Model](./img/azure_model_router_21.jpeg) + +##### Send Test Message + +![Send Message](./img/azure_model_router_22.jpeg) + +##### View Logs + +![View Logs](./img/azure_model_router_23.jpeg) + +##### Verify Cost Tracking + +Cost is tracked based on the actual model used (e.g., `gpt-4.1-nano`), plus a flat infrastructure cost of $0.14 per million input tokens for using the Model Router. + +![Verify Cost](./img/azure_model_router_24.jpeg) + +## Cost Tracking + +LiteLLM automatically handles cost tracking for Azure Model Router by: + +1. **Detecting the actual model**: When Azure Model Router routes your request to a specific model (e.g., `gpt-4.1-nano-2025-04-14`), LiteLLM extracts this from the response +2. 
**Calculating accurate costs**: Costs are calculated based on: + - The actual model used (e.g., `gpt-4.1-nano` token costs) + - Plus a flat infrastructure cost of **$0.14 per million input tokens** for using the Model Router +3. **Streaming support**: Cost tracking works correctly for both streaming and non-streaming requests + +### Cost Breakdown + +When you use Azure Model Router, the total cost includes: + +- **Model Cost**: Based on the actual model that handled your request (e.g., `gpt-4.1-nano`) +- **Router Flat Cost**: $0.14 per million input tokens (Azure AI Foundry infrastructure fee) + +### Example Response with Cost + +```python +import litellm + +response = litellm.completion( + model="azure_ai/model_router/azure-model-router", + messages=[{"role": "user", "content": "Hello!"}], + api_base="https://your-endpoint.cognitiveservices.azure.com/openai/v1/", + api_key="your-api-key", +) + +# The response will show the actual model used +print(f"Model used: {response.model}") # e.g., "azure_ai/gpt-4.1-nano-2025-04-14" + +# Get cost (includes both model cost and router flat cost) +from litellm import completion_cost +cost = completion_cost(completion_response=response) +print(f"Total cost: ${cost}") + +# Access detailed cost breakdown +if hasattr(response, '_hidden_params') and 'response_cost' in response._hidden_params: + print(f"Response cost: ${response._hidden_params['response_cost']}") +``` + +### Viewing Cost Breakdown in UI + +When viewing logs in the LiteLLM UI, you'll see: +- **Model Cost**: The cost for the actual model used +- **Azure Model Router Flat Cost**: The $0.14/M input tokens infrastructure fee +- **Total Cost**: Sum of both costs + +This breakdown helps you understand exactly what you're paying for when using the Model Router. 
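+
+To make the arithmetic behind these line items concrete, here is a rough sketch of how the two components combine. The per-token prices for the routed model are placeholders (check the actual rates for whichever model served your request); only the $0.14 per million input tokens router fee is fixed, as described above.
+
+```python
+# Rough cost sketch - hypothetical model pricing, plus the $0.14/M-input-token router fee
+input_tokens = 2_000_000
+output_tokens = 50_000
+
+# Placeholder per-token prices for the routed model (e.g. gpt-4.1-nano) - verify against current pricing
+model_input_price = 0.10 / 1_000_000
+model_output_price = 0.40 / 1_000_000
+
+model_cost = input_tokens * model_input_price + output_tokens * model_output_price
+router_fee = input_tokens * (0.14 / 1_000_000)   # Azure Model Router infrastructure fee
+
+total_cost = model_cost + router_fee
+print(f"Model cost ${model_cost:.4f} + router fee ${router_fee:.4f} = ${total_cost:.4f}")
+```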
+ + diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_01.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_01.jpeg new file mode 100644 index 000000000000..42654600f740 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_01.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_02.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_02.jpeg new file mode 100644 index 000000000000..b9feab050ec3 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_02.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_03.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_03.jpeg new file mode 100644 index 000000000000..3f55ebf01215 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_03.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_04.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_04.jpeg new file mode 100644 index 000000000000..1626c78bd1b4 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_04.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_05.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_05.jpeg new file mode 100644 index 000000000000..bef736e361d2 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_05.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_06.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_06.jpeg new file mode 100644 index 000000000000..bfeb767eea71 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_06.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_07.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_07.jpeg new file mode 100644 index 000000000000..eed742a8c681 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_07.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_08.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_08.jpeg new file mode 100644 index 000000000000..e72a6e92e77c Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_08.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_09.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_09.jpeg new file mode 100644 index 000000000000..5fe1421c2a44 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_09.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_10.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_10.jpeg new file mode 100644 index 000000000000..60aa80063fc8 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_10.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_11.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_11.jpeg new file mode 100644 index 000000000000..98694fbb9be8 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_11.jpeg differ diff --git 
a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_12.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_12.jpeg new file mode 100644 index 000000000000..77922ccea01c Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_12.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_13.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_13.jpeg new file mode 100644 index 000000000000..2cb80d0826a2 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_13.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_14.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_14.jpeg new file mode 100644 index 000000000000..8225023658c9 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_14.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_15.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_15.jpeg new file mode 100644 index 000000000000..7bd728528812 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_15.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_16.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_16.jpeg new file mode 100644 index 000000000000..e3dbd75acaed Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_16.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_17.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_17.jpeg new file mode 100644 index 000000000000..ba5fd5391385 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_17.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_18.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_18.jpeg new file mode 100644 index 000000000000..1ead4bee962b Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_18.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_19.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_19.jpeg new file mode 100644 index 000000000000..ec7fa9c3bcb3 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_19.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_20.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_20.jpeg new file mode 100644 index 000000000000..2999fcd678e8 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_20.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_21.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_21.jpeg new file mode 100644 index 000000000000..1226e29d648c Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_21.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_22.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_22.jpeg new file mode 100644 index 000000000000..4455b552b81f Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_22.jpeg differ diff --git 
a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_23.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_23.jpeg new file mode 100644 index 000000000000..4fa88bdb9651 Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_23.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai/img/azure_model_router_24.jpeg b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_24.jpeg new file mode 100644 index 000000000000..7fb61d1cce1e Binary files /dev/null and b/docs/my-website/docs/providers/azure_ai/img/azure_model_router_24.jpeg differ diff --git a/docs/my-website/docs/providers/azure_ai_agents.md b/docs/my-website/docs/providers/azure_ai_agents.md new file mode 100644 index 000000000000..23ee5a39521c --- /dev/null +++ b/docs/my-website/docs/providers/azure_ai_agents.md @@ -0,0 +1,427 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Azure AI Foundry Agents + +Call Azure AI Foundry Agents in the OpenAI Request/Response format. + +| Property | Details | +|----------|---------| +| Description | Azure AI Foundry Agents provides hosted agent runtimes that can execute agentic workflows with foundation models, tools, and code interpreters. | +| Provider Route on LiteLLM | `azure_ai/agents/{AGENT_ID}` | +| Provider Doc | [Azure AI Foundry Agents ↗](https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart) | + +## Authentication + +Azure AI Foundry Agents require **Azure AD authentication** (not API keys). You can authenticate using: + +### Option 1: Service Principal (Recommended for Production) + +Set these environment variables: + +```bash +export AZURE_TENANT_ID="your-tenant-id" +export AZURE_CLIENT_ID="your-client-id" +export AZURE_CLIENT_SECRET="your-client-secret" +``` + +LiteLLM will automatically obtain an Azure AD token using these credentials. + +### Option 2: Azure AD Token (Manual) + +Pass a token directly via `api_key`: + +```bash +# Get token via Azure CLI +az account get-access-token --resource "https://ai.azure.com" --query accessToken -o tsv +``` + +### Required Azure Role + +Your Service Principal or user must have the **Azure AI Developer** or **Azure AI User** role on your Azure AI Foundry project. + +To assign via Azure CLI: +```bash +az role assignment create \ + --assignee-object-id "" \ + --assignee-principal-type "ServicePrincipal" \ + --role "Azure AI Developer" \ + --scope "/subscriptions//resourceGroups//providers/Microsoft.CognitiveServices/accounts/" +``` + +Or add via **Azure AI Foundry Portal** → Your Project → **Project users** → **+ New user**. + +## Quick Start + +### Model Format to LiteLLM + +To call an Azure AI Foundry Agent through LiteLLM, use the following model format. + +Here the `model=azure_ai/agents/` tells LiteLLM to call the Azure AI Foundry Agent Service API. + +```shell showLineNumbers title="Model Format to LiteLLM" +azure_ai/agents/{AGENT_ID} +``` + +**Example:** +- `azure_ai/agents/asst_abc123` + +You can find the Agent ID in your Azure AI Foundry portal under Agents. 
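+
+### Optional: Fetch an Azure AD Token in Python
+
+If you use Option 2 (manual Azure AD token) but prefer Python over the Azure CLI, a sketch like the following can obtain the token with the `azure-identity` package and pass it to LiteLLM as `api_key`, before moving on to the SDK examples below. The `https://ai.azure.com/.default` scope mirrors the `--resource` value in the CLI example above; treat the exact scope as an assumption to verify for your tenant.
+
+```python showLineNumbers title="Manual Azure AD token via azure-identity (sketch)"
+import os
+
+import litellm
+from azure.identity import ClientSecretCredential  # assumes the azure-identity package is installed
+
+credential = ClientSecretCredential(
+    tenant_id=os.environ["AZURE_TENANT_ID"],
+    client_id=os.environ["AZURE_CLIENT_ID"],
+    client_secret=os.environ["AZURE_CLIENT_SECRET"],
+)
+
+# Scope assumed to match the --resource used with `az account get-access-token`
+token = credential.get_token("https://ai.azure.com/.default").token
+
+response = litellm.completion(
+    model="azure_ai/agents/asst_abc123",
+    messages=[{"role": "user", "content": "Hello"}],
+    api_base="https://your-resource.services.ai.azure.com/api/projects/your-project",
+    api_key=token,  # Azure AD token passed directly as the API key
+)
+print(response.choices[0].message.content)
+```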
+ +### LiteLLM Python SDK + +```python showLineNumbers title="Basic Agent Completion" +import litellm + +# Make a completion request to your Azure AI Foundry Agent +# Uses AZURE_TENANT_ID, AZURE_CLIENT_ID, AZURE_CLIENT_SECRET env vars for auth +response = litellm.completion( + model="azure_ai/agents/asst_abc123", + messages=[ + { + "role": "user", + "content": "Explain machine learning in simple terms" + } + ], + api_base="https://your-resource.services.ai.azure.com/api/projects/your-project", +) + +print(response.choices[0].message.content) +print(f"Usage: {response.usage}") +``` + +```python showLineNumbers title="Streaming Agent Responses" +import litellm + +# Stream responses from your Azure AI Foundry Agent +response = await litellm.acompletion( + model="azure_ai/agents/asst_abc123", + messages=[ + { + "role": "user", + "content": "What are the key principles of software architecture?" + } + ], + api_base="https://your-resource.services.ai.azure.com/api/projects/your-project", + stream=True, +) + +async for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + +### LiteLLM Proxy + +#### 1. Configure your model in config.yaml + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration" +model_list: + - model_name: azure-agent-1 + litellm_params: + model: azure_ai/agents/asst_abc123 + api_base: https://your-resource.services.ai.azure.com/api/projects/your-project + # Service Principal auth (recommended) + tenant_id: os.environ/AZURE_TENANT_ID + client_id: os.environ/AZURE_CLIENT_ID + client_secret: os.environ/AZURE_CLIENT_SECRET + + - model_name: azure-agent-math-tutor + litellm_params: + model: azure_ai/agents/asst_def456 + api_base: https://your-resource.services.ai.azure.com/api/projects/your-project + # Or pass Azure AD token directly + api_key: os.environ/AZURE_AD_TOKEN +``` + + + + +#### 2. Start the LiteLLM Proxy + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml +``` + +#### 3. Make requests to your Azure AI Foundry Agents + + + + +```bash showLineNumbers title="Basic Agent Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "azure-agent-1", + "messages": [ + { + "role": "user", + "content": "Summarize the main benefits of cloud computing" + } + ] + }' +``` + +```bash showLineNumbers title="Streaming Agent Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "azure-agent-math-tutor", + "messages": [ + { + "role": "user", + "content": "What is 25 * 4?" + } + ], + "stream": true + }' +``` + + + + + +```python showLineNumbers title="Using OpenAI SDK with LiteLLM Proxy" +from openai import OpenAI + +# Initialize client with your LiteLLM proxy URL +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +# Make a completion request to your Azure AI Foundry Agent +response = client.chat.completions.create( + model="azure-agent-1", + messages=[ + { + "role": "user", + "content": "What are best practices for API design?" 
+ } + ] +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Streaming with OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +# Stream Agent responses +stream = client.chat.completions.create( + model="azure-agent-math-tutor", + messages=[ + { + "role": "user", + "content": "Explain the Pythagorean theorem" + } + ], + stream=True +) + +for chunk in stream: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `AZURE_TENANT_ID` | Azure AD tenant ID for Service Principal auth | +| `AZURE_CLIENT_ID` | Application (client) ID of your Service Principal | +| `AZURE_CLIENT_SECRET` | Client secret for your Service Principal | + +```bash +export AZURE_TENANT_ID="your-tenant-id" +export AZURE_CLIENT_ID="your-client-id" +export AZURE_CLIENT_SECRET="your-client-secret" +``` + +## Conversation Continuity (Thread Management) + +Azure AI Foundry Agents use threads to maintain conversation context. LiteLLM automatically manages threads for you, but you can also pass an existing thread ID to continue a conversation. + +```python showLineNumbers title="Continuing a Conversation" +import litellm + +# First message creates a new thread +response1 = await litellm.acompletion( + model="azure_ai/agents/asst_abc123", + messages=[{"role": "user", "content": "My name is Alice"}], + api_base="https://your-resource.services.ai.azure.com/api/projects/your-project", +) + +# Get the thread_id from the response +thread_id = response1._hidden_params.get("thread_id") + +# Continue the conversation using the same thread +response2 = await litellm.acompletion( + model="azure_ai/agents/asst_abc123", + messages=[{"role": "user", "content": "What's my name?"}], + api_base="https://your-resource.services.ai.azure.com/api/projects/your-project", + thread_id=thread_id, # Pass the thread_id to continue conversation +) + +print(response2.choices[0].message.content) # Should mention "Alice" +``` + +## Provider-specific Parameters + +Azure AI Foundry Agents support additional parameters that can be passed to customize the agent invocation. 
+ + + + +```python showLineNumbers title="Using Agent-specific parameters" +from litellm import completion + +response = litellm.completion( + model="azure_ai/agents/asst_abc123", + messages=[ + { + "role": "user", + "content": "Analyze this data and provide insights", + } + ], + api_base="https://your-resource.services.ai.azure.com/api/projects/your-project", + thread_id="thread_abc123", # Optional: Continue existing conversation + instructions="Be concise and focus on key insights", # Optional: Override agent instructions +) +``` + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration with Parameters" +model_list: + - model_name: azure-agent-analyst + litellm_params: + model: azure_ai/agents/asst_abc123 + api_base: https://your-resource.services.ai.azure.com/api/projects/your-project + tenant_id: os.environ/AZURE_TENANT_ID + client_id: os.environ/AZURE_CLIENT_ID + client_secret: os.environ/AZURE_CLIENT_SECRET + instructions: "Be concise and focus on key insights" +``` + + + + +### Available Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `thread_id` | string | Optional thread ID to continue an existing conversation | +| `instructions` | string | Optional instructions to override the agent's default instructions for this run | + +## LiteLLM A2A Gateway + +You can also connect to Azure AI Foundry Agents through LiteLLM's A2A (Agent-to-Agent) Gateway UI. This provides a visual way to register and test agents without writing code. + +### 1. Navigate to Agents + +From the sidebar, click "Agents" to open the agent management page, then click "+ Add New Agent". + +![Add New Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/f8efe335-a08a-4f2b-9f7f-de28e4d58b05/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=217,118) + +### 2. Select Azure AI Foundry Agent Type + +Click "A2A Standard" to see available agent types, then select "Azure AI Foundry". + +![Select A2A Standard](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/ede38044-3e18-43b9-afe3-b7513bf9963e/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=409,143) + +![Select Azure AI Foundry](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/33c396fc-a927-4b03-8ee2-ea04950b12c1/ascreenshot.jpeg?tl_px=0,86&br_px=2201,1317&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=433,277) + +### 3. Configure the Agent + +Fill in the following fields: + +#### Agent Name + +Enter a friendly agent name - callers will see this name as the agent available. + +![Enter Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/18c02804-7612-40c4-9ba4-3f1a4c0725d5/ascreenshot.jpeg?tl_px=0,0&br_px=2617,1463&force_format=jpeg&q=100&width=1120.0) + +#### Agent ID + +Get the Agent ID from your Azure AI Foundry portal: + +1. 
Go to [https://ai.azure.com/](https://ai.azure.com/) and click "Agents" + +![Azure Agents](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/5e29fc48-c0f7-4b6d-8313-2063d1240d15/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=39,187) + +2. Copy the "ID" of the agent you want to add (e.g., `asst_hbnoK9BOCcHhC3lC4MDroVGG`) + +![Copy Agent ID](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/bf17dfec-a627-41c6-9121-3935e86d3700/ascreenshot.jpeg?tl_px=0,0&br_px=2618,1463&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=504,241) + +3. Paste the Agent ID in LiteLLM - this tells LiteLLM which agent to invoke on Azure Foundry + +![Paste Agent ID](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/45230c28-54f6-441c-9a20-4ef8b74076e2/ascreenshot.jpeg?tl_px=0,97&br_px=2617,1560&force_format=jpeg&q=100&width=1120.0) + +#### Azure AI API Base + +Get your API base URL from Azure AI Foundry: + +1. Go to [https://ai.azure.com/](https://ai.azure.com/) and click "Overview" +2. Under libraries, select Microsoft Foundry +3. Get your endpoint - it should look like `https://.services.ai.azure.com/api/projects/` + +![Get API Base](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/60e2c735-4480-44b7-ab12-d69f4200b12c/ascreenshot.jpeg?tl_px=0,40&br_px=2618,1503&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=278,277) + +4. Paste the URL in LiteLLM + +![Paste API Base](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/e9c6f48e-7602-449a-9261-0df4a0a66876/ascreenshot.jpeg?tl_px=267,456&br_px=2468,1687&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,277) + +#### Authentication + +Add your Azure AD credentials for authentication: +- **Azure Tenant ID** +- **Azure Client ID** +- **Azure Client Secret** + +![Add Auth](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/e5e2b636-cf2e-4283-a1cc-8d497d349243/ascreenshot.jpeg?tl_px=0,653&br_px=2201,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=339,405) + +Click "Create Agent" to save. + +![Create Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/799a720a-639e-4217-a6f5-51687fc07611/ascreenshot.jpeg?tl_px=416,653&br_px=2618,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=693,519) + +### 4. Test in Playground + +Go to "Playground" in the sidebar to test your agent. 
+ +![Go to Playground](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/7da84247-db1c-4d55-9015-6e3d60ea63ce/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=63,106) + +Change the endpoint type to `/v1/a2a/message/send`. + +![Select A2A Endpoint](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/733265a8-412d-4eac-bc19-03436d7846c4/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=286,234) + +### 5. Select Your Agent and Send a Message + +Pick your Azure AI Foundry agent from the dropdown and send a test message. + +![Select Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/59a8e66e-6f82-42e3-ab48-78355464e6be/ascreenshot.jpeg?tl_px=0,28&br_px=2201,1259&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=269,277) + +The agent responds with its capabilities. You can now interact with your Azure AI Foundry agent through the A2A protocol. + +![Agent Response](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-14/a0aafb69-6c28-4977-8210-96f9de750cdf/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=487,272) + +## Further Reading + +- [Azure AI Foundry Agents Documentation](https://learn.microsoft.com/en-us/azure/ai-services/agents/) +- [Create Thread and Run API Reference](https://learn.microsoft.com/en-us/rest/api/aifoundry/aiagents/create-thread-and-run/create-thread-and-run) +- [A2A Agent Gateway](../a2a.md) +- [A2A Cost Tracking](../a2a_cost_tracking.md) diff --git a/docs/my-website/docs/providers/azure_ai_img.md b/docs/my-website/docs/providers/azure_ai_img.md index 8e2f52268667..513bbe858d0c 100644 --- a/docs/my-website/docs/providers/azure_ai_img.md +++ b/docs/my-website/docs/providers/azure_ai_img.md @@ -1,7 +1,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Azure AI Image Generation +# Azure AI Image Generation (Black Forest Labs - Flux) Azure AI provides powerful image generation capabilities using FLUX models from Black Forest Labs to create high-quality images from text descriptions. @@ -12,7 +12,7 @@ Azure AI provides powerful image generation capabilities using FLUX models from | Description | Azure AI Image Generation uses FLUX models to generate high-quality images from text descriptions. | | Provider Route on LiteLLM | `azure_ai/` | | Provider Doc | [Azure AI FLUX Models ↗](https://techcommunity.microsoft.com/blog/azure-ai-foundry-blog/black-forest-labs-flux-1-kontext-pro-and-flux1-1-pro-now-available-in-azure-ai-f/4434659) | -| Supported Operations | [`/images/generations`](#image-generation) | +| Supported Operations | [`/images/generations`](#image-generation), [`/images/edits`](#image-editing) | ## Setup @@ -33,6 +33,7 @@ Get your API key and endpoint from [Azure AI Studio](https://ai.azure.com/). 
|------------|-------------|----------------| | `azure_ai/FLUX-1.1-pro` | Latest FLUX 1.1 Pro model for high-quality image generation | $0.04 | | `azure_ai/FLUX.1-Kontext-pro` | FLUX 1 Kontext Pro model with enhanced context understanding | $0.04 | +| `azure_ai/flux.2-pro` | FLUX 2 Pro model for next-generation image generation | $0.04 | ## Image Generation @@ -85,6 +86,32 @@ print(response.data[0].url) + + +```python showLineNumbers title="FLUX 2 Pro Image Generation" +import litellm +import os + +# Set your API credentials +os.environ["AZURE_AI_API_KEY"] = "your-api-key-here" +os.environ["AZURE_AI_API_BASE"] = "your-azure-ai-endpoint" # e.g., https://litellm-ci-cd-prod.services.ai.azure.com + +# Generate image with FLUX 2 Pro +response = litellm.image_generation( + model="azure_ai/flux.2-pro", + prompt="A photograph of a red fox in an autumn forest", + api_base=os.environ["AZURE_AI_API_BASE"], + api_key=os.environ["AZURE_AI_API_KEY"], + api_version="preview", + size="1024x1024", + n=1 +) + +print(response.data[0].b64_json) # FLUX 2 returns base64 encoded images +``` + + + ```python showLineNumbers title="Async Image Generation" @@ -165,6 +192,15 @@ model_list: model_info: mode: image_generation + - model_name: azure-flux-2-pro + litellm_params: + model: azure_ai/flux.2-pro + api_key: os.environ/AZURE_AI_API_KEY + api_base: os.environ/AZURE_AI_API_BASE + api_version: preview + model_info: + mode: image_generation + general_settings: master_key: sk-1234 ``` @@ -239,6 +275,103 @@ curl --location 'http://localhost:4000/v1/images/generations' \ +## Image Editing + +FLUX 2 Pro supports image editing by passing an input image along with a prompt describing the desired modifications. + +### Usage - LiteLLM Python SDK + + + + +```python showLineNumbers title="Basic Image Editing with FLUX 2 Pro" +import litellm +import os + +# Set your API credentials +os.environ["AZURE_AI_API_KEY"] = "your-api-key-here" +os.environ["AZURE_AI_API_BASE"] = "your-azure-ai-endpoint" # e.g., https://litellm-ci-cd-prod.services.ai.azure.com + +# Edit an existing image +response = litellm.image_edit( + model="azure_ai/flux.2-pro", + prompt="Add a red hat to the subject", + image=open("input_image.png", "rb"), + api_base=os.environ["AZURE_AI_API_BASE"], + api_key=os.environ["AZURE_AI_API_KEY"], + api_version="preview", +) + +print(response.data[0].b64_json) # FLUX 2 returns base64 encoded images +``` + + + + + +```python showLineNumbers title="Async Image Editing" +import litellm +import asyncio +import os + +async def edit_image(): + os.environ["AZURE_AI_API_KEY"] = "your-api-key-here" + os.environ["AZURE_AI_API_BASE"] = "your-azure-ai-endpoint" + + response = await litellm.aimage_edit( + model="azure_ai/flux.2-pro", + prompt="Change the background to a sunset beach", + image=open("input_image.png", "rb"), + api_base=os.environ["AZURE_AI_API_BASE"], + api_key=os.environ["AZURE_AI_API_KEY"], + api_version="preview", + ) + + return response + +asyncio.run(edit_image()) +``` + + + + +### Usage - LiteLLM Proxy Server + + + + +```bash showLineNumbers title="Image Edit via Proxy - cURL" +curl --location 'http://localhost:4000/v1/images/edits' \ +--header 'Authorization: Bearer sk-1234' \ +--form 'model="azure-flux-2-pro"' \ +--form 'prompt="Add sunglasses to the person"' \ +--form 'image=@"input_image.png"' +``` + + + + + +```python showLineNumbers title="Image Edit via Proxy - OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="sk-1234" +) + +response = 
client.images.edit( + model="azure-flux-2-pro", + prompt="Make the sky more dramatic with storm clouds", + image=open("input_image.png", "rb"), +) + +print(response.data[0].b64_json) +``` + + + + ## Supported Parameters Azure AI Image Generation supports the following OpenAI-compatible parameters: diff --git a/docs/my-website/docs/providers/azure_ai_speech.md b/docs/my-website/docs/providers/azure_ai_speech.md index 434a796a2fb2..22db98cfac54 100644 --- a/docs/my-website/docs/providers/azure_ai_speech.md +++ b/docs/my-website/docs/providers/azure_ai_speech.md @@ -136,6 +136,89 @@ response = speech( | `wav` | riff-24khz-16bit-mono-pcm | 24kHz | | `pcm` | raw-24khz-16bit-mono-pcm | 24kHz | +## Passing Raw SSML + +LiteLLM automatically detects when your `input` contains SSML (by checking for `` tags) and passes it through to Azure without any transformation. This gives you complete control over speech synthesis. + +**When to use raw SSML:** +- Using the `` element with multilingual voices to translate text (e.g., English text → Spanish speech) +- Complex SSML structures with multiple voices or prosody changes +- Fine-grained control over pronunciation, breaks, emphasis, and other speech features + +### LiteLLM SDK + +```python showLineNumbers title="Raw SSML for Multilingual Translation" +from litellm import speech + +# Use element to convert English text to Spanish speech +# The element forces the output language regardless of input text language +language_code = "es-ES" +text = "Hello, how are you today?" # English text +voice = "en-US-AvaMultilingualNeural" + +ssml = f""" + + {text} + +""" + +response = speech( + model="azure/speech/azure-tts", + voice=voice, + input=ssml, # LiteLLM auto-detects SSML and sends as-is + api_base="https://eastus.tts.speech.microsoft.com", + api_key=os.environ["AZURE_TTS_API_KEY"], +) +response.stream_to_file("speech.mp3") +``` + +```python showLineNumbers title="Raw SSML with Complex Features" +from litellm import speech + +# Complex SSML with multiple prosody adjustments +ssml = """ + + + + Welcome to our service! + + + + + How can I help you today? + + +""" + +response = speech( + model="azure/speech/azure-tts", + voice="en-US-JennyNeural", + input=ssml, # LiteLLM detects and passes through unchanged + api_base="https://eastus.tts.speech.microsoft.com", + api_key=os.environ["AZURE_TTS_API_KEY"], +) +response.stream_to_file("speech.mp3") +``` + +### LiteLLM Proxy + +```bash +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "azure-speech", + "voice": "en-US-AvaMultilingualNeural", + "input": "Hello, how are you today?" + }' \ + --output speech.mp3 +``` + + ## Sending Azure-Specific Params Azure AI Speech supports advanced SSML features through optional parameters: diff --git a/docs/my-website/docs/providers/azure_ai_vector_stores.md b/docs/my-website/docs/providers/azure_ai_vector_stores.md index d3abb78bbe4f..b9dfa3bdc9c7 100644 --- a/docs/my-website/docs/providers/azure_ai_vector_stores.md +++ b/docs/my-website/docs/providers/azure_ai_vector_stores.md @@ -1,9 +1,9 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Azure AI Search - Vector Store +# Azure AI Search - Vector Store (Unified API) -Use Azure AI Search as a vector store for RAG. +Use this to **search** Azure AI Search Vector Stores, with LiteLLM's unified `/chat/completions` API. 
## Quick Start diff --git a/docs/my-website/docs/providers/azure_document_intelligence.md b/docs/my-website/docs/providers/azure_document_intelligence.md new file mode 100644 index 000000000000..edc3c616fa73 --- /dev/null +++ b/docs/my-website/docs/providers/azure_document_intelligence.md @@ -0,0 +1,408 @@ +# Azure Document Intelligence OCR + +## Overview + +| Property | Details | +|-------|-------| +| Description | Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities including text extraction, layout analysis, and structure recognition | +| Provider Route on LiteLLM | `azure_ai/doc-intelligence/` | +| Supported Operations | `/ocr` | +| Link to Provider Doc | [Azure Document Intelligence ↗](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/) + +Extract text and analyze document structure using Azure Document Intelligence's powerful prebuilt models. + +## Quick Start + +### **LiteLLM SDK** + +```python showLineNumbers title="SDK Usage" +import litellm +import os + +# Set environment variables +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +# OCR with PDF URL +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) + +# Access extracted text +for page in response.pages: + print(f"Page {page.index}:") + print(page.markdown) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-doc-intel + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-layout + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Start Proxy** +```bash +litellm --config proxy_config.yaml +``` + +**Call OCR via Proxy** +```bash showLineNumbers title="cURL Request" +curl -X POST http://localhost:4000/ocr \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "model": "azure-doc-intel", + "document": { + "type": "document_url", + "document_url": "https://arxiv.org/pdf/2201.04234" + } + }' +``` + +## How It Works + +Azure Document Intelligence uses an asynchronous API pattern. LiteLLM AI Gateway handles the request/response transformation and polling automatically. + +### Complete Flow Diagram + +```mermaid +sequenceDiagram + participant Client + box rgb(200, 220, 255) LiteLLM AI Gateway + participant LiteLLM + end + participant Azure as Azure Document Intelligence + + Client->>LiteLLM: POST /ocr (Mistral format) + Note over LiteLLM: Transform to Azure format + + LiteLLM->>Azure: POST :analyze + Azure-->>LiteLLM: 202 Accepted + polling URL + + Note over LiteLLM: Automatic Polling + loop Every 2-10 seconds + LiteLLM->>Azure: GET polling URL + Azure-->>LiteLLM: Status: running + end + + LiteLLM->>Azure: GET polling URL + Azure-->>LiteLLM: Status: succeeded + results + + Note over LiteLLM: Transform to Mistral format + LiteLLM-->>Client: OCR Response (Mistral format) +``` + +### What LiteLLM Does For You + +When you call `litellm.ocr()` via SDK or `/ocr` via Proxy: + +1. **Request Transformation**: Converts Mistral OCR format → Azure Document Intelligence format +2. **Submits Document**: Sends transformed request to Azure DI API +3. 
**Handles 202 Response**: Captures the `Operation-Location` URL from response headers +4. **Automatic Polling**: + - Polls the operation URL at intervals specified by `retry-after` header (default: 2 seconds) + - Continues until status is `succeeded` or `failed` + - Respects Azure's rate limiting via `retry-after` headers +5. **Response Transformation**: Converts Azure DI format → Mistral OCR format +6. **Returns Result**: Sends unified Mistral format response to client + +**Polling Configuration:** +- Default timeout: 120 seconds +- Configurable via `AZURE_OPERATION_POLLING_TIMEOUT` environment variable +- Uses sync (`time.sleep()`) or async (`await asyncio.sleep()`) based on call type + +:::info +**Typical processing time**: 2-10 seconds depending on document size and complexity +::: + +## Supported Models + +Azure Document Intelligence offers several prebuilt models optimized for different use cases: + +### prebuilt-layout (Recommended) + +Best for general document OCR with structure preservation. + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + + + + +```python showLineNumbers title="Layout Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-layout + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-layout + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-layout", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Features:** +- Text extraction with markdown formatting +- Table detection and extraction +- Document structure analysis +- Paragraph and section recognition + +**Pricing:** $10 per 1,000 pages + +### prebuilt-read + +Optimized for reading text from documents - fastest and most cost-effective. 
+ + + + +```python showLineNumbers title="Read Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-read", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-read + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-read + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-read", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Features:** +- Fast text extraction +- Optimized for reading-heavy documents +- Basic structure recognition + +**Pricing:** $1.50 per 1,000 pages + +### prebuilt-document + +General-purpose document analysis with key-value pairs. + + + + +```python showLineNumbers title="Document Model - SDK" +import litellm +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"] = "your-api-key" +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://your-resource.cognitiveservices.azure.com" + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-document", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + + + + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: azure-document + litellm_params: + model: azure_ai/doc-intelligence/prebuilt-document + api_key: os.environ/AZURE_DOCUMENT_INTELLIGENCE_API_KEY + api_base: os.environ/AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT + model_info: + mode: ocr +``` + +**Usage:** +```bash +curl -X POST http://localhost:4000/ocr \ + -H "Authorization: Bearer your-api-key" \ + -d '{"model": "azure-document", "document": {"type": "document_url", "document_url": "https://example.com/doc.pdf"}}' +``` + + + + +**Pricing:** $10 per 1,000 pages + +## Document Types + +Azure Document Intelligence supports various document formats. 
+ +### PDF Documents + +```python showLineNumbers title="PDF OCR" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) +``` + +### Image Documents + +```python showLineNumbers title="Image OCR" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "image_url", + "image_url": "https://example.com/image.png" + } +) +``` + +**Supported image formats:** JPEG, PNG, BMP, TIFF + +### Base64 Encoded Documents + +```python showLineNumbers title="Base64 PDF" +import base64 + +# Read and encode PDF +with open("document.pdf", "rb") as f: + pdf_base64 = base64.b64encode(f.read()).decode() + +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": f"data:application/pdf;base64,{pdf_base64}" + } +) +``` + +## Response Format + +```python showLineNumbers title="Response Structure" +# Response has the following structure +response.pages # List of pages with extracted text +response.model # Model used +response.object # "ocr" +response.usage_info # Token usage information + +# Access page content +for page in response.pages: + print(f"Page {page.index}:") + print(page.markdown) + + # Page dimensions (in pixels) + if page.dimensions: + print(f"Width: {page.dimensions.width}px") + print(f"Height: {page.dimensions.height}px") +``` + +## Async Support + +```python showLineNumbers title="Async Usage" +import litellm +import asyncio + +async def process_document(): + response = await litellm.aocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } + ) + return response + +# Run async function +response = asyncio.run(process_document()) +``` + +## Cost Tracking + +LiteLLM automatically tracks costs for Azure Document Intelligence OCR: + +| Model | Cost per 1,000 Pages | +|-------|---------------------| +| prebuilt-read | $1.50 | +| prebuilt-layout | $10.00 | +| prebuilt-document | $10.00 | + +```python showLineNumbers title="View Cost" +response = litellm.ocr( + model="azure_ai/doc-intelligence/prebuilt-layout", + document={"type": "document_url", "document_url": "https://..."} +) + +# Access cost information +print(f"Cost: ${response._hidden_params.get('response_cost', 0)}") +``` + +## Additional Resources + +- [Azure Document Intelligence Documentation](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/) +- [Pricing Details](https://azure.microsoft.com/en-us/pricing/details/ai-document-intelligence/) +- [Supported File Formats](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/concept-model-overview) +- [LiteLLM OCR Documentation](https://docs.litellm.ai/docs/ocr) + diff --git a/docs/my-website/docs/providers/azure_ocr.md b/docs/my-website/docs/providers/azure_ocr.md index c93e995c43ec..5d79cc05338f 100644 --- a/docs/my-website/docs/providers/azure_ocr.md +++ b/docs/my-website/docs/providers/azure_ocr.md @@ -1,4 +1,4 @@ -# Azure AI OCR +# Azure AI OCR (Mistral) ## Overview diff --git a/docs/my-website/docs/providers/bedrock.md b/docs/my-website/docs/providers/bedrock.md index f0b89615a0db..487212ad6559 100644 --- a/docs/my-website/docs/providers/bedrock.md +++ b/docs/my-website/docs/providers/bedrock.md @@ -7,7 +7,7 @@ ALL Bedrock models (Anthropic, Meta, Deepseek, Mistral, Amazon, etc.) 
are Suppor | Property | Details | |-------|-------| | Description | Amazon Bedrock is a fully managed service that offers a choice of high-performing foundation models (FMs). | -| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1), [`bedrock/qwen3/`](#qwen3-imported-models) | +| Provider Route on LiteLLM | `bedrock/`, [`bedrock/converse/`](#set-converse--invoke-route), [`bedrock/invoke/`](#set-invoke-route), [`bedrock/converse_like/`](#calling-via-internal-proxy), [`bedrock/llama/`](#deepseek-not-r1), [`bedrock/deepseek_r1/`](#deepseek-r1), [`bedrock/qwen3/`](#qwen3-imported-models), [`bedrock/qwen2/`](./bedrock_imported.md#qwen2-imported-models), [`bedrock/openai/`](./bedrock_imported.md#openai-compatible-imported-models-qwen-25-vl-etc), [`bedrock/moonshot`](./bedrock_imported.md#moonshot-kimi-k2-thinking) | | Provider Doc | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) | | Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings`, `/images/generations` | | Rerank Endpoint | `/rerank` | @@ -43,6 +43,8 @@ export AWS_BEARER_TOKEN_BEDROCK="your-api-key" Option 2: use the api_key parameter to pass in API key for completion, embedding, image_generation API calls. + + ```python response = completion( model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", @@ -50,7 +52,17 @@ response = completion( api_key="your-api-key" ) ``` - + + +```yaml +model_list: + - model_name: bedrock-claude-3-sonnet + litellm_params: + model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0 + api_key: os.environ/AWS_BEARER_TOKEN_BEDROCK +``` + + ## Usage @@ -945,6 +957,89 @@ curl http://0.0.0.0:4000/v1/chat/completions \ +## Usage - Service Tier + +Control the processing tier for your Bedrock requests using `serviceTier`. Valid values are `priority`, `default`, or `flex`. + +- `priority`: Higher priority processing with guaranteed capacity +- `default`: Standard processing tier +- `flex`: Cost-optimized processing for batch workloads + +[Bedrock ServiceTier API Reference](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ServiceTier.html) + +### OpenAI-compatible `service_tier` parameter + +LiteLLM also supports the OpenAI-style `service_tier` parameter, which is automatically translated to Bedrock's native `serviceTier` format: + +| OpenAI `service_tier` | Bedrock `serviceTier` | +|-----------------------|----------------------| +| `"priority"` | `{"type": "priority"}` | +| `"default"` | `{"type": "default"}` | +| `"flex"` | `{"type": "flex"}` | +| `"auto"` | `{"type": "default"}` | + +```python +from litellm import completion + +# Using OpenAI-style service_tier parameter +response = completion( + model="bedrock/converse/anthropic.claude-3-sonnet-20240229-v1:0", + messages=[{"role": "user", "content": "Hello!"}], + service_tier="priority" # Automatically translated to serviceTier={"type": "priority"} +) +``` + +### Native Bedrock `serviceTier` parameter + + + + +```python +from litellm import completion + +response = completion( + model="bedrock/converse/qwen.qwen3-235b-a22b-2507-v1:0", + messages=[{"role": "user", "content": "What is the capital of France?"}], + serviceTier={"type": "priority"}, +) +``` + + + + +1. 
Setup config.yaml + +```yaml +model_list: + - model_name: qwen3-235b-priority + litellm_params: + model: bedrock/converse/qwen.qwen3-235b-a22b-2507-v1:0 + aws_region_name: ap-northeast-1 + serviceTier: + type: priority +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "qwen3-235b-priority", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "serviceTier": {"type": "priority"} + }' +``` + + + ## Usage - Bedrock Guardrails Example of using [Bedrock Guardrails with LiteLLM](https://docs.aws.amazon.com/bedrock/latest/userguide/guardrails-use-converse-api.html) @@ -1598,117 +1693,66 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ -## Bedrock Imported Models (Deepseek, Deepseek R1) - -### Deepseek R1 - -This is a separate route, as the chat template is different. +### OpenAI GPT OSS | Property | Details | |----------|---------| -| Provider Route | `bedrock/deepseek_r1/{model_arn}` | -| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | +| Provider Route | `bedrock/converse/openai.gpt-oss-20b-1:0`, `bedrock/converse/openai.gpt-oss-120b-1:0` | +| Provider Documentation | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) | -```python +```python title="GPT OSS SDK Usage" showLineNumbers from litellm import completion import os +# Set AWS credentials +os.environ["AWS_ACCESS_KEY_ID"] = "your-aws-access-key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-aws-secret-key" +os.environ["AWS_REGION_NAME"] = "us-east-1" + +# GPT OSS 20B model response = completion( - model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/deepseek_r1/{your-model-arn} - messages=[{"role": "user", "content": "Tell me a joke"}], + model="bedrock/converse/openai.gpt-oss-20b-1:0", + messages=[{"role": "user", "content": "Hello, how are you?"}], ) -``` - - - - - - -**1. Add to config** - -```yaml -model_list: - - model_name: DeepSeek-R1-Distill-Llama-70B - litellm_params: - model: bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n - -``` - -**2. Start proxy** - -```bash -litellm --config /path/to/config.yaml - -# RUNNING at http://0.0.0.0:4000 -``` - -**3. 
Test it!** - -```bash -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ], - }' -``` - - - - - -### Deepseek (not R1) - -| Property | Details | -|----------|---------| -| Provider Route | `bedrock/llama/{model_arn}` | -| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | - - - -Use this route to call Bedrock Imported Models that follow the `llama` Invoke Request / Response spec - - - - - -```python -from litellm import completion -import os +print(response.choices[0].message.content) +# GPT OSS 120B model response = completion( - model="bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/llama/{your-model-arn} - messages=[{"role": "user", "content": "Tell me a joke"}], + model="bedrock/converse/openai.gpt-oss-120b-1:0", + messages=[{"role": "user", "content": "Explain machine learning in simple terms"}], ) +print(response.choices[0].message.content) ``` - **1. Add to config** -```yaml +```yaml title="config.yaml" showLineNumbers model_list: - - model_name: DeepSeek-R1-Distill-Llama-70B - litellm_params: - model: bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n - + - model_name: gpt-oss-20b + litellm_params: + model: bedrock/converse/openai.gpt-oss-20b-1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: os.environ/AWS_REGION_NAME + + - model_name: gpt-oss-120b + litellm_params: + model: bedrock/converse/openai.gpt-oss-120b-1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: os.environ/AWS_REGION_NAME ``` **2. Start proxy** -```bash +```bash title="Start LiteLLM Proxy" showLineNumbers litellm --config /path/to/config.yaml # RUNNING at http://0.0.0.0:4000 @@ -1716,99 +1760,47 @@ litellm --config /path/to/config.yaml **3. Test it!** -```bash +```bash title="Test GPT OSS via Proxy" showLineNumbers curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ], - }' + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "gpt-oss-20b", + "messages": [ + { + "role": "user", + "content": "What are the key benefits of open source AI?" + } + ] + }' ``` -### Qwen3 Imported Models +## TwelveLabs Pegasus - Video Understanding + +TwelveLabs Pegasus 1.2 is a video understanding model that can analyze and describe video content. LiteLLM supports this model through Bedrock's `/invoke` endpoint. 
| Property | Details | |----------|---------| -| Provider Route | `bedrock/qwen3/{model_arn}` | -| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Qwen3 Models](https://aws.amazon.com/about-aws/whats-new/2025/09/qwen3-models-fully-managed-amazon-bedrock/) | - - - +| Provider Route | `bedrock/us.twelvelabs.pegasus-1-2-v1:0`, `bedrock/eu.twelvelabs.pegasus-1-2-v1:0` | +| Provider Documentation | [TwelveLabs Pegasus Docs ↗](https://docs.twelvelabs.io/docs/models/pegasus) | +| Supported Parameters | `max_tokens`, `temperature`, `response_format` | +| Media Input | S3 URI or base64-encoded video | -```python -from litellm import completion -import os +### Supported Features -response = completion( - model="bedrock/qwen3/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen3-model", # bedrock/qwen3/{your-model-arn} - messages=[{"role": "user", "content": "Tell me a joke"}], - max_tokens=100, - temperature=0.7 -) -``` +- **Video Analysis**: Analyze video content from S3 or base64 input +- **Structured Output**: Support for JSON schema response format +- **S3 Integration**: Support for S3 video URLs with bucket owner specification - - - - -**1. Add to config** - -```yaml -model_list: - - model_name: Qwen3-32B - litellm_params: - model: bedrock/qwen3/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen3-model - -``` - -**2. Start proxy** - -```bash -litellm --config /path/to/config.yaml - -# RUNNING at http://0.0.0.0:4000 -``` - -**3. Test it!** - -```bash -curl --location 'http://0.0.0.0:4000/chat/completions' \ - --header 'Authorization: Bearer sk-1234' \ - --header 'Content-Type: application/json' \ - --data '{ - "model": "Qwen3-32B", # 👈 the 'model_name' in config - "messages": [ - { - "role": "user", - "content": "what llm are you" - } - ], - }' -``` - - - - -### OpenAI GPT OSS - -| Property | Details | -|----------|---------| -| Provider Route | `bedrock/converse/openai.gpt-oss-20b-1:0`, `bedrock/converse/openai.gpt-oss-120b-1:0` | -| Provider Documentation | [Amazon Bedrock ↗](https://docs.aws.amazon.com/bedrock/latest/userguide/what-is-bedrock.html) | +### Usage with S3 Video -```python title="GPT OSS SDK Usage" showLineNumbers +```python title="TwelveLabs Pegasus SDK Usage" showLineNumbers from litellm import completion import os @@ -1817,18 +1809,18 @@ os.environ["AWS_ACCESS_KEY_ID"] = "your-aws-access-key" os.environ["AWS_SECRET_ACCESS_KEY"] = "your-aws-secret-key" os.environ["AWS_REGION_NAME"] = "us-east-1" -# GPT OSS 20B model response = completion( - model="bedrock/converse/openai.gpt-oss-20b-1:0", - messages=[{"role": "user", "content": "Hello, how are you?"}], + model="bedrock/us.twelvelabs.pegasus-1-2-v1:0", + messages=[{"role": "user", "content": "Describe what happens in this video."}], + mediaSource={ + "s3Location": { + "uri": "s3://your-bucket/video.mp4", + "bucketOwner": "123456789012", # 12-digit AWS account ID + } + }, + temperature=0.2 ) -print(response.choices[0].message.content) -# GPT OSS 120B model -response = completion( - model="bedrock/converse/openai.gpt-oss-120b-1:0", - messages=[{"role": "user", "content": "Explain machine learning in simple terms"}], -) print(response.choices[0].message.content) ``` @@ -1840,16 +1832,9 @@ print(response.choices[0].message.content) ```yaml title="config.yaml" showLineNumbers model_list: - - model_name: gpt-oss-20b - litellm_params: - model: bedrock/converse/openai.gpt-oss-20b-1:0 - aws_access_key_id: 
os.environ/AWS_ACCESS_KEY_ID - aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY - aws_region_name: os.environ/AWS_REGION_NAME - - - model_name: gpt-oss-120b + - model_name: pegasus-video litellm_params: - model: bedrock/converse/openai.gpt-oss-120b-1:0 + model: bedrock/us.twelvelabs.pegasus-1-2-v1:0 aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY aws_region_name: os.environ/AWS_REGION_NAME @@ -1865,24 +1850,59 @@ litellm --config /path/to/config.yaml **3. Test it!** -```bash title="Test GPT OSS via Proxy" showLineNumbers +```bash title="Test Pegasus via Proxy" showLineNumbers curl --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ --header 'Content-Type: application/json' \ --data '{ - "model": "gpt-oss-20b", + "model": "pegasus-video", "messages": [ { - "role": "user", - "content": "What are the key benefits of open source AI?" + "role": "user", + "content": "Describe what happens in this video." } - ] + ], + "mediaSource": { + "s3Location": { + "uri": "s3://your-bucket/video.mp4", + "bucketOwner": "123456789012" + } + }, + "temperature": 0.2 }' ``` +### Usage with Base64 Video + +You can also pass video content directly as base64: + +```python title="Base64 Video Input" showLineNumbers +from litellm import completion +import base64 + +# Read video file and encode to base64 +with open("video.mp4", "rb") as video_file: + video_base64 = base64.b64encode(video_file.read()).decode("utf-8") + +response = completion( + model="bedrock/us.twelvelabs.pegasus-1-2-v1:0", + messages=[{"role": "user", "content": "What is happening in this video?"}], + mediaSource={ + "base64String": video_base64 + }, + temperature=0.2, +) + +print(response.choices[0].message.content) +``` + +### Important Notes + +- **Response Format**: The model supports structured output via `response_format` with JSON schema + ## Provisioned throughput models To use provisioned throughput Bedrock models pass - `model=bedrock/`, example `model=bedrock/anthropic.claude-v2`. Set `model` to any of the [Supported AWS models](#supported-aws-bedrock-models) @@ -1943,6 +1963,9 @@ Here's an example of using a bedrock model with LiteLLM. 
For a complete list, re | Meta Llama 2 Chat 70b | `completion(model='bedrock/meta.llama2-70b-chat-v1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Mistral 7B Instruct | `completion(model='bedrock/mistral.mistral-7b-instruct-v0:2', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | | Mixtral 8x7B Instruct | `completion(model='bedrock/mistral.mixtral-8x7b-instruct-v0:1', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | +| TwelveLabs Pegasus 1.2 (US) | `completion(model='bedrock/us.twelvelabs.pegasus-1-2-v1:0', messages=messages, mediaSource={...})` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | +| TwelveLabs Pegasus 1.2 (EU) | `completion(model='bedrock/eu.twelvelabs.pegasus-1-2-v1:0', messages=messages, mediaSource={...})` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | +| Moonshot Kimi K2 Thinking | `completion(model='bedrock/moonshot.kimi-k2-thinking', messages=messages)` or `completion(model='bedrock/invoke/moonshot.kimi-k2-thinking', messages=messages)` | `os.environ['AWS_ACCESS_KEY_ID']`, `os.environ['AWS_SECRET_ACCESS_KEY']`, `os.environ['AWS_REGION_NAME']` | ## Bedrock Embedding @@ -2210,6 +2233,53 @@ response = completion( | `aws_role_name` | `RoleArn` | The Amazon Resource Name (ARN) of the role to assume | [AssumeRole API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts.html#STS.Client.assume_role) | | `aws_session_name` | `RoleSessionName` | An identifier for the assumed role session | [AssumeRole API](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sts.html#STS.Client.assume_role) | +### IAM Roles Anywhere (On-Premise / External Workloads) + +[IAM Roles Anywhere](https://docs.aws.amazon.com/rolesanywhere/latest/userguide/introduction.html) extends IAM roles to workloads **outside of AWS** (on-premise servers, edge devices, other clouds). It uses the same STS mechanism as regular IAM roles but authenticates via X.509 certificates instead of AWS credentials. 
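Because the signing helper plugs into the standard AWS credential chain as a `credential_process`, you can verify that the certificate-based profile resolves to temporary STS credentials once the setup below is complete (a quick sanity check, using the profile name from these examples):

```bash
# Should print the assumed role's account and ARN if the certificate exchange works
aws sts get-caller-identity --profile litellm-roles-anywhere
```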
+ +**Setup**: Configure the [AWS Signing Helper](https://docs.aws.amazon.com/rolesanywhere/latest/userguide/credential-helper.html) as a credential process in `~/.aws/config`: + +```ini +[profile litellm-roles-anywhere] +credential_process = aws_signing_helper credential-process \ + --certificate /path/to/certificate.pem \ + --private-key /path/to/private-key.pem \ + --trust-anchor-arn arn:aws:rolesanywhere:us-east-1:123456789012:trust-anchor/abc123 \ + --profile-arn arn:aws:rolesanywhere:us-east-1:123456789012:profile/def456 \ + --role-arn arn:aws:iam::123456789012:role/MyBedrockRole +``` + +**Usage**: Reference the profile in LiteLLM: + + + + +```python +from litellm import completion + +response = completion( + model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + messages=[{"role": "user", "content": "Hello!"}], + aws_profile_name="litellm-roles-anywhere", +) +``` + + + + +```yaml +model_list: + - model_name: bedrock-claude + litellm_params: + model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0 + aws_profile_name: "litellm-roles-anywhere" +``` + + + + +See the [IAM Roles Anywhere Getting Started Guide](https://docs.aws.amazon.com/rolesanywhere/latest/userguide/getting-started.html) for trust anchor and profile setup. + Make the bedrock completion call diff --git a/docs/my-website/docs/providers/bedrock_agentcore.md b/docs/my-website/docs/providers/bedrock_agentcore.md new file mode 100644 index 000000000000..e3e352f7ab68 --- /dev/null +++ b/docs/my-website/docs/providers/bedrock_agentcore.md @@ -0,0 +1,252 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Bedrock AgentCore + +Call Bedrock AgentCore in the OpenAI Request/Response format. + +| Property | Details | +|----------|---------| +| Description | Amazon Bedrock AgentCore provides direct access to hosted agent runtimes for executing agentic workflows with foundation models. | +| Provider Route on LiteLLM | `bedrock/agentcore/{AGENT_RUNTIME_ARN}` | +| Provider Doc | [AWS Bedrock AgentCore ↗](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agentcore_InvokeAgentRuntime.html) | + +:::info + +This documentation is for **AgentCore Agents** (agent runtimes). If you want to use AgentCore MCP servers, add them as you would any other MCP server. See the [MCP documentation](https://docs.litellm.ai/docs/mcp) for details. + +::: + +## Quick Start + +### Model Format to LiteLLM + +To call a bedrock agent runtime through LiteLLM, use the following model format. + +Here the `model=bedrock/agentcore/` tells LiteLLM to call the bedrock `InvokeAgentRuntime` API. + +```shell showLineNumbers title="Model Format to LiteLLM" +bedrock/agentcore/{AGENT_RUNTIME_ARN} +``` + +**Example:** +- `bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime` + +You can find the Agent Runtime ARN in your AWS Bedrock console under AgentCore. 
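The SDK and proxy examples below assume AWS credentials are available through the standard credential chain. One option (static keys are shown purely as an illustration; IAM roles, profiles, or any other Bedrock auth method works the same way) is to export them before running the snippets:

```bash showLineNumbers title="AWS credentials for AgentCore"
export AWS_ACCESS_KEY_ID="your-aws-access-key"
export AWS_SECRET_ACCESS_KEY="your-aws-secret-key"
export AWS_REGION_NAME="us-west-2"  # region of your agent runtime ARN
```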
+ +### LiteLLM Python SDK + +```python showLineNumbers title="Basic AgentCore Completion" +import litellm + +# Make a completion request to your AgentCore runtime +response = litellm.completion( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime", + messages=[ + { + "role": "user", + "content": "Explain machine learning in simple terms" + } + ], +) + +print(response.choices[0].message.content) +print(f"Usage: {response.usage}") +``` + +```python showLineNumbers title="Streaming AgentCore Responses" +import litellm + +# Stream responses from your AgentCore runtime +response = litellm.completion( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime", + messages=[ + { + "role": "user", + "content": "What are the key principles of software architecture?" + } + ], + stream=True, +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + +### LiteLLM Proxy + +#### 1. Configure your model in config.yaml + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration" +model_list: + - model_name: agentcore-runtime-1 + litellm_params: + model: bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 + + - model_name: agentcore-runtime-2 + litellm_params: + model: bedrock/agentcore/arn:aws:bedrock-agentcore:us-east-1:987654321098:runtime/production-runtime + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-east-1 +``` + + + + +#### 2. Start the LiteLLM Proxy + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml +``` + +#### 3. Make requests to your AgentCore runtimes + + + + +```bash showLineNumbers title="Basic AgentCore Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "agentcore-runtime-1", + "messages": [ + { + "role": "user", + "content": "Summarize the main benefits of cloud computing" + } + ] + }' +``` + +```bash showLineNumbers title="Streaming AgentCore Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "agentcore-runtime-2", + "messages": [ + { + "role": "user", + "content": "Explain the differences between SQL and NoSQL databases" + } + ], + "stream": true + }' +``` + + + + + +```python showLineNumbers title="Using OpenAI SDK with LiteLLM Proxy" +from openai import OpenAI + +# Initialize client with your LiteLLM proxy URL +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +# Make a completion request to your AgentCore runtime +response = client.chat.completions.create( + model="agentcore-runtime-1", + messages=[ + { + "role": "user", + "content": "What are best practices for API design?" 
+ } + ] +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Streaming with OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +# Stream AgentCore responses +stream = client.chat.completions.create( + model="agentcore-runtime-2", + messages=[ + { + "role": "user", + "content": "Describe the microservices architecture pattern" + } + ], + stream=True +) + +for chunk in stream: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + +## Provider-specific Parameters + +AgentCore supports additional parameters that can be passed to customize the runtime invocation. + + + + +```python showLineNumbers title="Using AgentCore-specific parameters" +from litellm import completion + +response = litellm.completion( + model="bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime", + messages=[ + { + "role": "user", + "content": "Analyze this data and provide insights", + } + ], + qualifier="production", # PROVIDER-SPECIFIC: Runtime qualifier/version + runtimeSessionId="session-abc-123", # PROVIDER-SPECIFIC: Custom session ID +) +``` + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration with Parameters" +model_list: + - model_name: agentcore-runtime-prod + litellm_params: + model: bedrock/agentcore/arn:aws:bedrock-agentcore:us-west-2:123456789012:runtime/my-agent-runtime + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 + qualifier: production +``` + + + + +### Available Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `qualifier` | string | Optional runtime qualifier/version to invoke a specific version of the agent runtime | +| `runtimeSessionId` | string | Optional custom session ID (must be 33+ characters). If not provided, LiteLLM generates one automatically | + +## Further Reading + +- [AWS Bedrock AgentCore Documentation](https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agentcore_InvokeAgentRuntime.html) +- [LiteLLM Authentication to Bedrock](https://docs.litellm.ai/docs/providers/bedrock#boto3---authentication) + diff --git a/docs/my-website/docs/providers/bedrock_batches.md b/docs/my-website/docs/providers/bedrock_batches.md index c262eef0e864..19446fda8372 100644 --- a/docs/my-website/docs/providers/bedrock_batches.md +++ b/docs/my-website/docs/providers/bedrock_batches.md @@ -40,6 +40,8 @@ model_list: s3_access_key_id: os.environ/AWS_ACCESS_KEY_ID s3_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY aws_batch_role_arn: arn:aws:iam::888602223428:role/service-role/AmazonBedrockExecutionRoleForAgents_BB9HNW6V4CV + # Optional: Custom KMS encryption key for S3 output + # s3_encryption_key_id: arn:aws:kms:us-west-2:123456789012:key/12345678-1234-1234-1234-123456789012 model_info: mode: batch # 👈 SPECIFY MODE AS BATCH, to tell user this is a batch model ``` @@ -55,6 +57,12 @@ model_list: | `aws_batch_role_arn` | IAM role ARN for Bedrock batch operations. Bedrock Batch APIs require an IAM role ARN to be set. | | `mode: batch` | Indicates to LiteLLM this is a batch model | +**Optional Parameters:** + +| Parameter | Description | +|-----------|-------------| +| `s3_encryption_key_id` | Custom KMS encryption key ID for S3 output data. If not specified, Bedrock uses AWS managed encryption keys. | + ### 2. 
Create Virtual Key ```bash showLineNumbers title="create_virtual_key.sh" @@ -164,6 +172,97 @@ curl http://localhost:4000/v1/batches \ +### 4. Retrieve batch results + +Once the batch job is completed, download the results from S3: + + + + +```python showLineNumbers title="bedrock_batch.py" +... +# Wait for batch completion (check status periodically) +batch_status = client.batches.retrieve(batch_id=batch.id) + +if batch_status.status == "completed": + # Download the output file + result = client.files.content( + file_id=batch_status.output_file_id, + extra_headers={"custom-llm-provider": "bedrock"} + ) + + # Save or process the results + with open("batch_output.jsonl", "wb") as f: + f.write(result.content) + + # Parse JSONL results + for line in result.text.strip().split('\n'): + record = json.loads(line) + print(f"Record ID: {record['recordId']}") + print(f"Output: {record.get('modelOutput', {})}") +``` + + + + +```bash showLineNumbers title="Download Batch Results" +# First retrieve batch to get output_file_id +curl http://localhost:4000/v1/batches/batch_abc123 \ + -H "Authorization: Bearer sk-1234" + +# Then download the output file +curl http://localhost:4000/v1/files/{output_file_id}/content \ + -H "Authorization: Bearer sk-1234" \ + -H "custom-llm-provider: bedrock" \ + -o batch_output.jsonl +``` + + + + +```python showLineNumbers title="bedrock_batch.py" +import litellm +from litellm import file_content + +# Download using litellm directly (bypasses proxy managed files) +result = file_content( + file_id=batch_status.output_file_id, # Can be S3 URI or unified file ID + custom_llm_provider="bedrock", + aws_region_name="us-west-2", +) + +# Process results +print(result.text) +``` + + + + +**Output Format:** + +The batch output file is in JSONL format with each line containing: + +```json +{ + "recordId": "request-1", + "modelInput": { + "messages": [...], + "max_tokens": 1000 + }, + "modelOutput": { + "content": [...], + "id": "msg_abc123", + "model": "claude-3-5-sonnet-20240620-v1:0", + "role": "assistant", + "stop_reason": "end_turn", + "usage": { + "input_tokens": 15, + "output_tokens": 10 + } + } +} +``` + ## FAQ ### Where are my files written? @@ -174,6 +273,29 @@ When a `target_model_names` is specified, the file is written to the S3 bucket c LiteLLM only supports Bedrock Anthropic Models for Batch API. If you want other bedrock models file an issue [here](https://github.com/BerriAI/litellm/issues/new/choose). +### How do I use a custom KMS encryption key? + +If your S3 bucket requires a custom KMS encryption key, you can specify it in your configuration using `s3_encryption_key_id`. This is useful for enterprise customers with specific encryption requirements. + +You can set the encryption key in 2 ways: + +1. **In config.yaml** (recommended): +```yaml +model_list: + - model_name: "bedrock-batch-claude" + litellm_params: + model: bedrock/us.anthropic.claude-3-5-sonnet-20240620-v1:0 + s3_encryption_key_id: arn:aws:kms:us-west-2:123456789012:key/12345678-1234-1234-1234-123456789012 + # ... other params +``` + +2. 
**As an environment variable**: +```bash +export AWS_S3_ENCRYPTION_KEY_ID=arn:aws:kms:us-west-2:123456789012:key/12345678-1234-1234-1234-123456789012 +``` + + + ## Further Reading - [AWS Bedrock Batch Inference Documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/batch-inference.html) diff --git a/docs/my-website/docs/providers/bedrock_embedding.md b/docs/my-website/docs/providers/bedrock_embedding.md index 76c9606533ec..3c618fe06417 100644 --- a/docs/my-website/docs/providers/bedrock_embedding.md +++ b/docs/my-website/docs/providers/bedrock_embedding.md @@ -4,7 +4,8 @@ | Provider | LiteLLM Route | AWS Documentation | Cost Tracking | |----------|---------------|-------------------|---------------| -| Amazon Titan | `bedrock/amazon.*` | [Amazon Titan Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) | ✅ | +| Amazon Titan | `bedrock/amazon.titan-*` | [Amazon Titan Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html) | ✅ | +| Amazon Nova | `bedrock/amazon.nova-*` | [Amazon Nova Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/nova-embed.html) | ✅ | | Cohere | `bedrock/cohere.*` | [Cohere Embeddings](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-cohere-embed.html) | ✅ | | TwelveLabs | `bedrock/us.twelvelabs.*` | [TwelveLabs](https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-twelvelabs.html) | ✅ | @@ -16,6 +17,7 @@ LiteLLM supports AWS Bedrock's async-invoke feature for embedding models that re | Provider | Async Invoke Route | Use Case | |----------|-------------------|----------| +| Amazon Nova | `bedrock/async_invoke/amazon.nova-2-multimodal-embeddings-v1:0` | Multimodal embeddings with segmentation for long text, video, and audio | | TwelveLabs Marengo | `bedrock/async_invoke/us.twelvelabs.marengo-embed-2-7-v1:0` | Video, audio, image, and text embeddings | ### Required Parameters @@ -116,7 +118,7 @@ def check_async_job_status(invocation_arn, aws_region_name="us-east-1"): """Check the status of an async invoke job using LiteLLM batch API""" try: response = retrieve_batch( - batch_id=invocation_arn, + batch_id=invocation_arn, # Pass the invocation ARN here custom_llm_provider="bedrock", aws_region_name=aws_region_name ) @@ -128,11 +130,166 @@ def check_async_job_status(invocation_arn, aws_region_name="us-east-1"): # Check status status = check_async_job_status(invocation_arn, "us-east-1") if status: - print(f"Job Status: {status.status}") - print(f"Output Location: {status.output_file_id}") + print(f"Job Status: {status.status}") # "in_progress", "completed", or "failed" + print(f"Output Location: {status.metadata['output_file_id']}") # S3 URI where results are stored +``` + +#### Polling Until Complete + +Here's a complete example of polling for job completion: + +```python +def wait_for_async_job(invocation_arn, aws_region_name="us-east-1", max_wait=3600): + """Poll job status until completion""" + start_time = time.time() + + while True: + status = retrieve_batch( + batch_id=invocation_arn, + custom_llm_provider="bedrock", + aws_region_name=aws_region_name, + ) + + if status.status == "completed": + print("✅ Job completed!") + return status + elif status.status == "failed": + error_msg = status.metadata.get('failure_message', 'Unknown error') + raise Exception(f"❌ Job failed: {error_msg}") + else: + elapsed = time.time() - start_time + if elapsed > max_wait: + raise TimeoutError(f"Job timed out after {max_wait} seconds") + + 
print(f"⏳ Job still processing... (elapsed: {elapsed:.0f}s)") + time.sleep(10) # Wait 10 seconds before checking again + +# Wait for completion +completed_status = wait_for_async_job(invocation_arn) +output_s3_uri = completed_status.metadata['output_file_id'] +print(f"Results available at: {output_s3_uri}") +``` + +**Note:** The actual embedding results are stored in S3. When the job is completed, download the results from the S3 location specified in `status.metadata['output_file_id']`. The results will be in JSON/JSONL format containing the embedding vectors. + +## Amazon Nova Multimodal Embeddings + +Amazon Nova supports multimodal embeddings for text, images, video, and audio. It offers flexible embedding dimensions and purposes optimized for different use cases. + +### Supported Features + +- **Modalities**: Text, Image, Video, Audio +- **Dimensions**: 256, 384, 1024, 3072 (default: 3072) +- **Embedding Purposes**: + - `GENERIC_INDEX` (default) + - `GENERIC_RETRIEVAL` + - `TEXT_RETRIEVAL` + - `IMAGE_RETRIEVAL` + - `VIDEO_RETRIEVAL` + - `AUDIO_RETRIEVAL` + - `CLASSIFICATION` + - `CLUSTERING` + +### Text Embedding + +```python +from litellm import embedding + +response = embedding( + model="bedrock/amazon.nova-2-multimodal-embeddings-v1:0", + input=["Hello, world!"], + aws_region_name="us-east-1", + dimensions=1024, # Optional: 256, 384, 1024, or 3072 +) + +print(response.data[0].embedding) +``` + +### Image Embedding with Base64 + +Amazon Nova accepts images in base64 format using the standard data URL format: + +```python +import base64 +from litellm import embedding + +# Method 1: Load image from file +with open("image.jpg", "rb") as image_file: + image_data = base64.b64encode(image_file.read()).decode('utf-8') + # Create data URL with proper format + image_base64 = f"data:image/jpeg;base64,{image_data}" + +response = embedding( + model="bedrock/amazon.nova-2-multimodal-embeddings-v1:0", + input=[image_base64], + aws_region_name="us-east-1", + dimensions=1024, +) + +print(f"Image embedding: {response.data[0].embedding[:10]}...") # First 10 dimensions ``` -**Note:** The actual embedding results are stored in S3. The `output_file_id` from the batch status can be used to locate the results file in your S3 bucket. +#### Supported Image Formats + +Nova supports the following image formats: +- JPEG: `data:image/jpeg;base64,...` +- PNG: `data:image/png;base64,...` +- GIF: `data:image/gif;base64,...` +- WebP: `data:image/webp;base64,...` + +#### Complete Example with Error Handling + +```python +import base64 +from litellm import embedding + +def get_image_embedding(image_path, dimensions=1024): + """ + Get embedding for an image file. 
+ + Args: + image_path: Path to the image file + dimensions: Embedding dimension (256, 384, 1024, or 3072) + + Returns: + List of embedding values + """ + try: + # Determine image format from file extension + if image_path.lower().endswith('.png'): + mime_type = "image/png" + elif image_path.lower().endswith(('.jpg', '.jpeg')): + mime_type = "image/jpeg" + elif image_path.lower().endswith('.gif'): + mime_type = "image/gif" + elif image_path.lower().endswith('.webp'): + mime_type = "image/webp" + else: + raise ValueError(f"Unsupported image format: {image_path}") + + # Read and encode image + with open(image_path, "rb") as image_file: + image_data = base64.b64encode(image_file.read()).decode('utf-8') + image_base64 = f"data:{mime_type};base64,{image_data}" + + # Get embedding + response = embedding( + model="bedrock/amazon.nova-2-multimodal-embeddings-v1:0", + input=[image_base64], + aws_region_name="us-east-1", + dimensions=dimensions, + ) + + return response.data[0].embedding + + except Exception as e: + print(f"Error getting image embedding: {e}") + raise + +# Example usage +image_embedding = get_image_embedding("photo.jpg", dimensions=1024) +print(f"Got embedding with {len(image_embedding)} dimensions") +``` ### Error Handling @@ -179,7 +336,7 @@ except Exception as e: ### Limitations -- Async-invoke is currently only supported for TwelveLabs Marengo models +- Async-invoke is supported for TwelveLabs Marengo and Amazon Nova models - Results are stored in S3 and must be retrieved separately using the output file ID - Job status checking requires using LiteLLM's `retrieve_batch()` function - No built-in polling mechanism in LiteLLM (must implement your own status checking loop) @@ -259,6 +416,7 @@ print(response) | Model Name | Usage | Supported Additional OpenAI params | |----------------------|---------------------------------------------|-----| +| **Amazon Nova Multimodal Embeddings** | `embedding(model="bedrock/amazon.nova-2-multimodal-embeddings-v1:0", input=input)` | Supports multimodal input (text, image, video, audio), multiple purposes, dimensions (256, 384, 1024, 3072) | | Titan Embeddings V2 | `embedding(model="bedrock/amazon.titan-embed-text-v2:0", input=input)` | [here](https://github.com/BerriAI/litellm/blob/f5905e100068e7a4d61441d7453d7cf5609c2121/litellm/llms/bedrock/embed/amazon_titan_v2_transformation.py#L59) | | Titan Embeddings - V1 | `embedding(model="bedrock/amazon.titan-embed-text-v1", input=input)` | [here](https://github.com/BerriAI/litellm/blob/f5905e100068e7a4d61441d7453d7cf5609c2121/litellm/llms/bedrock/embed/amazon_titan_g1_transformation.py#L53) | Titan Multimodal Embeddings | `embedding(model="bedrock/amazon.titan-embed-image-v1", input=input)` | [here](https://github.com/BerriAI/litellm/blob/f5905e100068e7a4d61441d7453d7cf5609c2121/litellm/llms/bedrock/embed/amazon_titan_multimodal_transformation.py#L28) | diff --git a/docs/my-website/docs/providers/bedrock_imported.md b/docs/my-website/docs/providers/bedrock_imported.md new file mode 100644 index 000000000000..709736e61097 --- /dev/null +++ b/docs/my-website/docs/providers/bedrock_imported.md @@ -0,0 +1,610 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Bedrock Imported Models + +Bedrock Imported Models (Deepseek, Deepseek R1, Qwen, OpenAI-compatible models) + +### Deepseek R1 + +This is a separate route, as the chat template is different. 
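+
+Streaming is requested the same way as for any other LiteLLM model. A minimal sketch, assuming your imported Deepseek R1 deployment supports response streaming on Bedrock (the model ARN below is the same placeholder used in the examples that follow):
+
+```python
+from litellm import completion
+
+response = completion(
+    model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n",  # bedrock/deepseek_r1/{your-model-arn}
+    messages=[{"role": "user", "content": "Tell me a joke"}],
+    stream=True,
+)
+
+for chunk in response:
+    if chunk.choices[0].delta.content:
+        print(chunk.choices[0].delta.content, end="")
+```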
+ +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/deepseek_r1/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | + + + + +```python +from litellm import completion +import os + +response = completion( + model="bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/deepseek_r1/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], +) +``` + + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: DeepSeek-R1-Distill-Llama-70B + litellm_params: + model: bedrock/deepseek_r1/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n + +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + + +### Deepseek (not R1) + +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/llama/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Deepseek Bedrock Imported Model](https://aws.amazon.com/blogs/machine-learning/deploy-deepseek-r1-distilled-llama-models-with-amazon-bedrock-custom-model-import/) | + + + +Use this route to call Bedrock Imported Models that follow the `llama` Invoke Request / Response spec + + + + + +```python +from litellm import completion +import os + +response = completion( + model="bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n", # bedrock/llama/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], +) +``` + + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: DeepSeek-R1-Distill-Llama-70B + litellm_params: + model: bedrock/llama/arn:aws:bedrock:us-east-1:086734376398:imported-model/r4c4kewx2s0n + +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. 
Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "DeepSeek-R1-Distill-Llama-70B", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + +### Qwen3 Imported Models + +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/qwen3/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html), [Qwen3 Models](https://aws.amazon.com/about-aws/whats-new/2025/09/qwen3-models-fully-managed-amazon-bedrock/) | + + + + +```python +from litellm import completion +import os + +response = completion( + model="bedrock/qwen3/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen3-model", # bedrock/qwen3/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], + max_tokens=100, + temperature=0.7 +) +``` + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: Qwen3-32B + litellm_params: + model: bedrock/qwen3/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen3-model + +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "Qwen3-32B", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + +### Qwen2 Imported Models + +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/qwen2/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html) | +| Note | Qwen2 and Qwen3 architectures are mostly similar. The main difference is in the response format: Qwen2 uses "text" field while Qwen3 uses "generation" field. | + + + + +```python +from litellm import completion +import os + +response = completion( + model="bedrock/qwen2/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen2-model", # bedrock/qwen2/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], + max_tokens=100, + temperature=0.7 +) +``` + + + + + +**1. Add to config** + +```yaml +model_list: + - model_name: Qwen2-72B + litellm_params: + model: bedrock/qwen2/arn:aws:bedrock:us-east-1:086734376398:imported-model/your-qwen2-model + +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "Qwen2-72B", # 👈 the 'model_name' in config + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' +``` + + + + +### OpenAI-Compatible Imported Models (Qwen 2.5 VL, etc.) + +Use this route for Bedrock imported models that follow the **OpenAI Chat Completions API spec**. This includes models like Qwen 2.5 VL that accept OpenAI-formatted messages with support for vision (images), tool calling, and other OpenAI features. 
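+
+Because these models accept the OpenAI schema, tool definitions are passed through in the standard OpenAI format. A minimal sketch (the model ARN is the same placeholder used in the examples below; whether and how tools are invoked depends on the imported model itself):
+
+```python
+from litellm import completion
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get the current weather in a location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {"type": "string", "description": "The city name"}
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+
+response = completion(
+    model="bedrock/openai/arn:aws:bedrock:us-east-1:046319184608:imported-model/0m2lasirsp6z",  # bedrock/openai/{your-model-arn}
+    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
+    tools=tools,
+)
+
+if response.choices[0].message.tool_calls:
+    print(response.choices[0].message.tool_calls[0].function.name)
+```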
+ +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/openai/{model_arn}` | +| Provider Documentation | [Bedrock Imported Models](https://docs.aws.amazon.com/bedrock/latest/userguide/model-customization-import-model.html) | +| Supported Features | Vision (images), tool calling, streaming, system messages | + +#### LiteLLMSDK Usage + +**Basic Usage** + +```python +from litellm import completion + +response = completion( + model="bedrock/openai/arn:aws:bedrock:us-east-1:046319184608:imported-model/0m2lasirsp6z", # bedrock/openai/{your-model-arn} + messages=[{"role": "user", "content": "Tell me a joke"}], + max_tokens=300, + temperature=0.5 +) +``` + +**With Vision (Images)** + +```python +import base64 +from litellm import completion + +# Load and encode image +with open("image.jpg", "rb") as f: + image_base64 = base64.b64encode(f.read()).decode("utf-8") + +response = completion( + model="bedrock/openai/arn:aws:bedrock:us-east-1:046319184608:imported-model/0m2lasirsp6z", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that can analyze images." + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"} + } + ] + } + ], + max_tokens=300, + temperature=0.5 +) +``` + +**Comparing Multiple Images** + +```python +import base64 +from litellm import completion + +# Load images +with open("image1.jpg", "rb") as f: + image1_base64 = base64.b64encode(f.read()).decode("utf-8") +with open("image2.jpg", "rb") as f: + image2_base64 = base64.b64encode(f.read()).decode("utf-8") + +response = completion( + model="bedrock/openai/arn:aws:bedrock:us-east-1:046319184608:imported-model/0m2lasirsp6z", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that can analyze images." + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Spot the difference between these two images?"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image1_base64}"} + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image2_base64}"} + } + ] + } + ], + max_tokens=300, + temperature=0.5 +) +``` + +#### LiteLLM Proxy Usage (AI Gateway) + +**1. Add to config** + +```yaml +model_list: + - model_name: qwen-25vl-72b + litellm_params: + model: bedrock/openai/arn:aws:bedrock:us-east-1:046319184608:imported-model/0m2lasirsp6z +``` + +**2. Start proxy** + +```bash +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +Basic text request: + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "qwen-25vl-72b", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + "max_tokens": 300 + }' +``` + +With vision (image): + +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "qwen-25vl-72b", + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant that can analyze images." 
+ }, + { + "role": "user", + "content": [ + {"type": "text", "text": "What is in this image?"}, + { + "type": "image_url", + "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZ..."} + } + ] + } + ], + "max_tokens": 300, + "temperature": 0.5 + }' +``` + +### Moonshot Kimi K2 Thinking + +Moonshot AI's Kimi K2 Thinking model is now available on Amazon Bedrock. This model features advanced reasoning capabilities with automatic reasoning content extraction. + +| Property | Details | +|----------|---------| +| Provider Route | `bedrock/moonshot.kimi-k2-thinking`, `bedrock/invoke/moonshot.kimi-k2-thinking` | +| Provider Documentation | [AWS Bedrock Moonshot Announcement ↗](https://aws.amazon.com/about-aws/whats-new/2025/12/amazon-bedrock-fully-managed-open-weight-models/) | +| Supported Parameters | `temperature`, `max_tokens`, `top_p`, `stream`, `tools`, `tool_choice` | +| Special Features | Reasoning content extraction, Tool calling | + +#### Supported Features + +- **Reasoning Content Extraction**: Automatically extracts `` tags and returns them as `reasoning_content` (similar to OpenAI's o1 models) +- **Tool Calling**: Full support for function/tool calling with tool responses +- **Streaming**: Both streaming and non-streaming responses +- **System Messages**: System message support + +#### Basic Usage + + + + +```python title="Moonshot Kimi K2 SDK Usage" showLineNumbers +from litellm import completion +import os + +os.environ["AWS_ACCESS_KEY_ID"] = "your-aws-access-key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-aws-secret-key" +os.environ["AWS_REGION_NAME"] = "us-west-2" # or your preferred region + +# Basic completion +response = completion( + model="bedrock/moonshot.kimi-k2-thinking", # or bedrock/invoke/moonshot.kimi-k2-thinking + messages=[ + {"role": "user", "content": "What is 2+2? Think step by step."} + ], + temperature=0.7, + max_tokens=200 +) + +print(response.choices[0].message.content) + +# Access reasoning content if present +if response.choices[0].message.reasoning_content: + print("Reasoning:", response.choices[0].message.reasoning_content) +``` + + + + +**1. Add to config** + +```yaml title="config.yaml" showLineNumbers +model_list: + - model_name: kimi-k2 + litellm_params: + model: bedrock/moonshot.kimi-k2-thinking + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 +``` + +**2. Start proxy** + +```bash title="Start LiteLLM Proxy" showLineNumbers +litellm --config /path/to/config.yaml + +# RUNNING at http://0.0.0.0:4000 +``` + +**3. Test it!** + +```bash title="Test Kimi K2 via Proxy" showLineNumbers +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "kimi-k2", + "messages": [ + { + "role": "user", + "content": "What is 2+2? Think step by step." 
+ } + ], + "temperature": 0.7, + "max_tokens": 200 + }' +``` + + + + +#### Tool Calling Example + +```python title="Kimi K2 with Tool Calling" showLineNumbers +from litellm import completion +import os + +os.environ["AWS_ACCESS_KEY_ID"] = "your-aws-access-key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-aws-secret-key" +os.environ["AWS_REGION_NAME"] = "us-west-2" + +# Tool calling example +response = completion( + model="bedrock/moonshot.kimi-k2-thinking", + messages=[ + {"role": "user", "content": "What's the weather in Tokyo?"} + ], + tools=[ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city name" + } + }, + "required": ["location"] + } + } + } + ] +) + +if response.choices[0].message.tool_calls: + tool_call = response.choices[0].message.tool_calls[0] + print(f"Tool called: {tool_call.function.name}") + print(f"Arguments: {tool_call.function.arguments}") +``` + +#### Streaming Example + +```python title="Kimi K2 Streaming" showLineNumbers +from litellm import completion +import os + +os.environ["AWS_ACCESS_KEY_ID"] = "your-aws-access-key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your-aws-secret-key" +os.environ["AWS_REGION_NAME"] = "us-west-2" + +response = completion( + model="bedrock/moonshot.kimi-k2-thinking", + messages=[ + {"role": "user", "content": "Explain quantum computing in simple terms."} + ], + stream=True, + temperature=0.7 +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") + + # Check for reasoning content in streaming + if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content: + print(f"\n[Reasoning: {chunk.choices[0].delta.reasoning_content}]") +``` + +#### Supported Parameters + +| Parameter | Type | Description | Supported | +|-----------|------|-------------|-----------| +| `temperature` | float (0-1) | Controls randomness in output | ✅ | +| `max_tokens` | integer | Maximum tokens to generate | ✅ | +| `top_p` | float | Nucleus sampling parameter | ✅ | +| `stream` | boolean | Enable streaming responses | ✅ | +| `tools` | array | Tool/function definitions | ✅ | +| `tool_choice` | string/object | Tool choice specification | ✅ | +| `stop` | array | Stop sequences | ❌ (Not supported on Bedrock) | \ No newline at end of file diff --git a/docs/my-website/docs/providers/bedrock_vector_store.md b/docs/my-website/docs/providers/bedrock_vector_store.md index 39e1aec5ab83..5fae0c76c11a 100644 --- a/docs/my-website/docs/providers/bedrock_vector_store.md +++ b/docs/my-website/docs/providers/bedrock_vector_store.md @@ -138,6 +138,125 @@ print(response.choices[0].message.content) +## Filter Results + +Filter by metadata attributes. 
+ +**Operators** (OpenAI-style, auto-translated): +- `eq`, `ne`, `gt`, `gte`, `lt`, `lte`, `in`, `nin` + +**AWS operators** (use directly): +- `equals`, `notEquals`, `greaterThan`, `greaterThanOrEquals`, `lessThan`, `lessThanOrEquals`, `in`, `notIn`, `startsWith`, `listContains`, `stringContains` + + + + +```python +response = await litellm.acompletion( + model="anthropic/claude-3-5-sonnet", + messages=[{"role": "user", "content": "What are the latest updates?"}], + tools=[{ + "type": "file_search", + "vector_store_ids": ["YOUR_KNOWLEDGE_BASE_ID"], + "filters": { + "key": "category", + "value": "updates", + "operator": "eq" + } + }] +) +``` + + + + + +```python +response = await litellm.acompletion( + model="anthropic/claude-3-5-sonnet", + messages=[{"role": "user", "content": "What are the policies?"}], + tools=[{ + "type": "file_search", + "vector_store_ids": ["YOUR_KNOWLEDGE_BASE_ID"], + "filters": { + "and": [ + {"key": "category", "value": "policy", "operator": "eq"}, + {"key": "year", "value": 2024, "operator": "gte"} + ] + } + }] +) +``` + + + + + +```python +response = await litellm.acompletion( + model="anthropic/claude-3-5-sonnet", + messages=[{"role": "user", "content": "Show me technical docs"}], + tools=[{ + "type": "file_search", + "vector_store_ids": ["YOUR_KNOWLEDGE_BASE_ID"], + "filters": { + "or": [ + {"key": "category", "value": "api", "operator": "eq"}, + {"key": "category", "value": "sdk", "operator": "eq"} + ] + } + }] +) +``` + + + + + +```python +response = await litellm.acompletion( + model="anthropic/claude-3-5-sonnet", + messages=[{"role": "user", "content": "Find docs"}], + tools=[{ + "type": "file_search", + "vector_store_ids": ["YOUR_KNOWLEDGE_BASE_ID"], + "filters": { + "and": [ + {"key": "title", "value": "Guide", "operator": "stringContains"}, + {"key": "tags", "value": "important", "operator": "listContains"} + ] + } + }] +) +``` + + + + + +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "claude-3-5-sonnet", + "messages": [{"role": "user", "content": "What are our policies?"}], + "tools": [{ + "type": "file_search", + "vector_store_ids": ["YOUR_KNOWLEDGE_BASE_ID"], + "filters": { + "and": [ + {"key": "department", "value": "engineering", "operator": "eq"}, + {"key": "type", "value": "policy", "operator": "eq"} + ] + } + }] + }' +``` + + + + ## Accessing Search Results See how to access vector store search results in your response: diff --git a/docs/my-website/docs/providers/bedrock_writer.md b/docs/my-website/docs/providers/bedrock_writer.md new file mode 100644 index 000000000000..00d77a37f44e --- /dev/null +++ b/docs/my-website/docs/providers/bedrock_writer.md @@ -0,0 +1,316 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Bedrock - Writer Palmyra + +## Overview + +| Property | Details | +|-------|-------| +| Description | Writer Palmyra X5 and X4 foundation models on Amazon Bedrock, offering advanced reasoning, tool calling, and document processing capabilities | +| Provider Route on LiteLLM | `bedrock/` | +| Supported Operations | `/chat/completions` | +| Link to Provider Doc | [Writer on AWS Bedrock ↗](https://aws.amazon.com/bedrock/writer/) | + +## Quick Start + +### LiteLLM SDK + +```python showLineNumbers title="SDK Usage" +import litellm +import os + +os.environ["AWS_ACCESS_KEY_ID"] = "" +os.environ["AWS_SECRET_ACCESS_KEY"] = "" +os.environ["AWS_REGION_NAME"] = "us-west-2" + +response = 
litellm.completion( + model="bedrock/us.writer.palmyra-x5-v1:0", + messages=[{"role": "user", "content": "Hello, how are you?"}] +) + +print(response.choices[0].message.content) +``` + +### LiteLLM Proxy + +**1. Setup config.yaml** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: writer-palmyra-x5 + litellm_params: + model: bedrock/us.writer.palmyra-x5-v1:0 + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_region_name: us-west-2 +``` + +**2. Start the proxy** + +```bash showLineNumbers title="Start Proxy" +litellm --config config.yaml +``` + +**3. Call the proxy** + + + + +```bash showLineNumbers title="curl Request" +curl -X POST http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "writer-palmyra-x5", + "messages": [{"role": "user", "content": "Hello, how are you?"}] + }' +``` + + + + +```python showLineNumbers title="OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000/v1" +) + +response = client.chat.completions.create( + model="writer-palmyra-x5", + messages=[{"role": "user", "content": "Hello, how are you?"}] +) + +print(response.choices[0].message.content) +``` + + + + +## Tool Calling + +Writer Palmyra models support multi-step tool calling for complex workflows. + +### LiteLLM SDK + +```python showLineNumbers title="Tool Calling - SDK" +import litellm + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state" + } + }, + "required": ["location"] + } + } + } +] + +response = litellm.completion( + model="bedrock/us.writer.palmyra-x5-v1:0", + messages=[{"role": "user", "content": "What's the weather in Boston?"}], + tools=tools +) +``` + +### LiteLLM Proxy + + + + +```bash showLineNumbers title="Tool Calling - curl" +curl -X POST http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "writer-palmyra-x5", + "messages": [{"role": "user", "content": "What'\''s the weather in Boston?"}], + "tools": [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string", "description": "The city and state"} + }, + "required": ["location"] + } + } + }] + }' +``` + + + + +```python showLineNumbers title="Tool Calling - OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000/v1" +) + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get the current weather in a location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state" + } + }, + "required": ["location"] + } + } + } +] + +response = client.chat.completions.create( + model="writer-palmyra-x5", + messages=[{"role": "user", "content": "What's the weather in Boston?"}], + tools=tools +) +``` + + + + +## Document Input + +Writer Palmyra models support document inputs including PDFs. 
+ +### LiteLLM SDK + +```python showLineNumbers title="PDF Document Input - SDK" +import litellm +import base64 + +# Read and encode PDF +with open("document.pdf", "rb") as f: + pdf_base64 = base64.b64encode(f.read()).decode("utf-8") + +response = litellm.completion( + model="bedrock/us.writer.palmyra-x5-v1:0", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:application/pdf;base64,{pdf_base64}" + } + }, + { + "type": "text", + "text": "Summarize this document" + } + ] + } + ] +) +``` + +### LiteLLM Proxy + + + + +```bash showLineNumbers title="PDF Document Input - curl" +# First, base64 encode your PDF +PDF_BASE64=$(base64 -i document.pdf) + +curl -X POST http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "writer-palmyra-x5", + "messages": [{ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": "data:application/pdf;base64,'$PDF_BASE64'"} + }, + { + "type": "text", + "text": "Summarize this document" + } + ] + }] + }' +``` + + + + +```python showLineNumbers title="PDF Document Input - OpenAI SDK" +from openai import OpenAI +import base64 + +client = OpenAI( + api_key="sk-1234", + base_url="http://localhost:4000/v1" +) + +# Read and encode PDF +with open("document.pdf", "rb") as f: + pdf_base64 = base64.b64encode(f.read()).decode("utf-8") + +response = client.chat.completions.create( + model="writer-palmyra-x5", + messages=[ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:application/pdf;base64,{pdf_base64}" + } + }, + { + "type": "text", + "text": "Summarize this document" + } + ] + } + ] +) +``` + + + + +## Supported Models + +| Model ID | Context Window | Input Cost (per 1K tokens) | Output Cost (per 1K tokens) | +|----------|---------------|---------------------------|----------------------------| +| `bedrock/us.writer.palmyra-x5-v1:0` | 1M tokens | $0.0006 | $0.006 | +| `bedrock/us.writer.palmyra-x4-v1:0` | 128K tokens | $0.0025 | $0.010 | +| `bedrock/writer.palmyra-x5-v1:0` | 1M tokens | $0.0006 | $0.006 | +| `bedrock/writer.palmyra-x4-v1:0` | 128K tokens | $0.0025 | $0.010 | + +:::info Cross-Region Inference +The `us.writer.*` model IDs use cross-region inference profiles. Use these for production workloads. +::: diff --git a/docs/my-website/docs/providers/chatgpt.md b/docs/my-website/docs/providers/chatgpt.md new file mode 100644 index 000000000000..156bbf99df6f --- /dev/null +++ b/docs/my-website/docs/providers/chatgpt.md @@ -0,0 +1,84 @@ +# ChatGPT Subscription + +Use ChatGPT Pro/Max subscription models through LiteLLM with OAuth device flow authentication. + +| Property | Details | +|-------|-------| +| Description | ChatGPT subscription access (Codex + GPT-5.2 family) via ChatGPT backend API | +| Provider Route on LiteLLM | `chatgpt/` | +| Supported Endpoints | `/responses`, `/chat/completions` (bridged to Responses for supported models) | +| API Reference | https://chatgpt.com | + +ChatGPT subscription access is native to the Responses API. Chat Completions requests are bridged to Responses for supported models (for example `chatgpt/gpt-5.2`). + +Notes: +- The ChatGPT subscription backend rejects token limit fields (`max_tokens`, `max_output_tokens`, `max_completion_tokens`) and `metadata`. LiteLLM strips these fields for this provider. +- `/v1/chat/completions` honors `stream`. 
When `stream` is false (default), LiteLLM aggregates the Responses stream into a single JSON response. + +## Authentication + +ChatGPT subscription access uses an OAuth device code flow: + +1. LiteLLM prints a device code and verification URL +2. Open the URL, sign in, and enter the code +3. Tokens are stored locally for reuse + +## Usage - LiteLLM Python SDK + +### Responses (recommended for Codex models) + +```python showLineNumbers title="ChatGPT Responses" +import litellm + +response = litellm.responses( + model="chatgpt/gpt-5.2-codex", + input="Write a Python hello world" +) + +print(response) +``` + +### Chat Completions (bridged to Responses) + +```python showLineNumbers title="ChatGPT Chat Completions" +import litellm + +response = litellm.completion( + model="chatgpt/gpt-5.2", + messages=[{"role": "user", "content": "Write a Python hello world"}] +) + +print(response) +``` + +## Usage - LiteLLM Proxy + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: chatgpt/gpt-5.2 + model_info: + mode: responses + litellm_params: + model: chatgpt/gpt-5.2 + - model_name: chatgpt/gpt-5.2-codex + model_info: + mode: responses + litellm_params: + model: chatgpt/gpt-5.2-codex +``` + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml +``` + +## Configuration + +### Environment Variables + +- `CHATGPT_TOKEN_DIR`: Custom token storage directory +- `CHATGPT_AUTH_FILE`: Auth file name (default: `auth.json`) +- `CHATGPT_API_BASE`: Override API base (default: `https://chatgpt.com/backend-api/codex`) +- `OPENAI_CHATGPT_API_BASE`: Alias for `CHATGPT_API_BASE` +- `CHATGPT_ORIGINATOR`: Override the `originator` header value +- `CHATGPT_USER_AGENT`: Override the `User-Agent` header value +- `CHATGPT_USER_AGENT_SUFFIX`: Optional suffix appended to the `User-Agent` header diff --git a/docs/my-website/docs/providers/chutes.md b/docs/my-website/docs/providers/chutes.md new file mode 100644 index 000000000000..e2b81837c346 --- /dev/null +++ b/docs/my-website/docs/providers/chutes.md @@ -0,0 +1,172 @@ +# Chutes + +## Overview + +| Property | Details | +|-------|-------| +| Description | Chutes is a cloud-native AI deployment platform that allows you to deploy, run, and scale LLM applications with OpenAI-compatible APIs using pre-built templates for popular frameworks like vLLM and SGLang. | +| Provider Route on LiteLLM | `chutes/` | +| Link to Provider Doc | [Chutes Website ↗](https://chutes.ai) | +| Base URL | `https://llm.chutes.ai/v1/` | +| Supported Operations | [`/chat/completions`](#sample-usage), Embeddings | + +
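+Embeddings go through the same `chutes/` prefix as chat completions. A minimal sketch, assuming your Chutes deployment exposes an embedding model (the model name is a placeholder; see the setup sections below for `CHUTES_API_KEY`):
+
+```python showLineNumbers title="Chutes Embedding"
+import os
+from litellm import embedding
+
+os.environ["CHUTES_API_KEY"] = ""  # your Chutes API key
+
+response = embedding(
+    model="chutes/embedding-model-name",  # Replace with your deployed embedding model
+    input=["Hello, world!"],
+)
+
+print(response.data[0].embedding[:5])
+```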
+ +## What is Chutes? + +Chutes is a powerful AI deployment and serving platform that provides: +- **Pre-built Templates**: Ready-to-use configurations for vLLM, SGLang, diffusion models, and embeddings +- **OpenAI-Compatible APIs**: Use standard OpenAI SDKs and clients +- **Multi-GPU Scaling**: Support for large models across multiple GPUs +- **Streaming Responses**: Real-time model outputs +- **Custom Configurations**: Override any parameter for your specific needs +- **Performance Optimization**: Pre-configured optimization settings + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["CHUTES_API_KEY"] = "" # your Chutes API key +``` + +Get your Chutes API key from [chutes.ai](https://chutes.ai). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Chutes Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["CHUTES_API_KEY"] = "" # your Chutes API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# Chutes call +response = completion( + model="chutes/model-name", # Replace with actual model name + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Chutes Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["CHUTES_API_KEY"] = "" # your Chutes API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# Chutes call with streaming +response = completion( + model="chutes/model-name", # Replace with actual model name + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export CHUTES_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: chutes-model + litellm_params: + model: chutes/model-name # Replace with actual model name + api_key: os.environ/CHUTES_API_KEY +``` + +## Supported OpenAI Parameters + +Chutes supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID or HuggingFace model identifier | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. Control tool/function calling | +| `response_format` | object | Optional. 
Response format specification | + +## Support Frameworks + +Chutes provides optimized templates for popular AI frameworks: + +### vLLM (High-Performance LLM Serving) +- OpenAI-compatible endpoints +- Multi-GPU scaling support +- Advanced optimization settings +- Best for production workloads + +### SGLang (Advanced LLM Serving) +- Structured generation capabilities +- Advanced features and controls +- Custom configuration options +- Best for complex use cases + +### Diffusion Models (Image Generation) +- Pre-configured image generation templates +- Optimized settings for best results +- Support for popular diffusion models + +### Embedding Models +- Text embedding templates +- Vector search optimization +- Support for popular embedding models + +## Authentication + +Chutes supports multiple authentication methods: +- API Key via `X-API-Key` header +- Bearer token via `Authorization` header + +Example for LiteLLM (uses environment variable): +```python +os.environ["CHUTES_API_KEY"] = "your-api-key" +``` + +## Performance Optimization + +Chutes offers hardware selection and optimization: +- **Small Models (7B-13B)**: 1 GPU with 24GB VRAM +- **Medium Models (30B-70B)**: 4 GPUs with 80GB VRAM each +- **Large Models (100B+)**: 8 GPUs with 140GB+ VRAM each + +Engine optimization parameters available for fine-tuning performance. + +## Deployment Options + +Chutes provides flexible deployment: +- **Quick Setup**: Use pre-built templates for instant deployment +- **Custom Images**: Deploy with custom Docker images +- **Scaling**: Configure max instances and auto-scaling thresholds +- **Hardware**: Choose specific GPU types and configurations + +## Additional Resources + +- [Chutes Documentation](https://chutes.ai/docs) +- [Chutes Getting Started](https://chutes.ai/docs/getting-started/running-a-chute) +- [Chutes API Reference](https://chutes.ai/docs/sdk-reference) diff --git a/docs/my-website/docs/providers/custom.md b/docs/my-website/docs/providers/custom.md deleted file mode 100644 index 81b92f0a0310..000000000000 --- a/docs/my-website/docs/providers/custom.md +++ /dev/null @@ -1,69 +0,0 @@ -# Custom LLM API-Endpoints -LiteLLM supports Custom deploy api endpoints - -LiteLLM Expects the following input and output for custom LLM API endpoints - -### Model Details - -For calls to your custom API base ensure: -* Set `api_base="your-api-base"` -* Add `custom/` as a prefix to the `model` param. 
If your API expects `meta-llama/Llama-2-13b-hf` set `model=custom/meta-llama/Llama-2-13b-hf` - -| Model Name | Function Call | -|------------------|--------------------------------------------| -| meta-llama/Llama-2-13b-hf | `response = completion(model="custom/meta-llama/Llama-2-13b-hf", messages=messages, api_base="https://your-custom-inference-endpoint")` | -| meta-llama/Llama-2-13b-hf | `response = completion(model="custom/meta-llama/Llama-2-13b-hf", messages=messages, api_base="https://api.autoai.dev/inference")` | - -### Example Call to Custom LLM API using LiteLLM -```python -from litellm import completion -response = completion( - model="custom/meta-llama/Llama-2-13b-hf", - messages= [{"content": "what is custom llama?", "role": "user"}], - temperature=0.2, - max_tokens=10, - api_base="https://api.autoai.dev/inference", - request_timeout=300, -) -print("got response\n", response) -``` - -#### Setting your Custom API endpoint - -Inputs to your custom LLM api bases should follow this format: - -```python -resp = requests.post( - your-api_base, - json={ - 'model': 'meta-llama/Llama-2-13b-hf', # model name - 'params': { - 'prompt': ["The capital of France is P"], - 'max_tokens': 32, - 'temperature': 0.7, - 'top_p': 1.0, - 'top_k': 40, - } - } -) -``` - -Outputs from your custom LLM api bases should follow this format: -```python -{ - 'data': [ - { - 'prompt': 'The capital of France is P', - 'output': [ - 'The capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France is PARIS.\nThe capital of France' - ], - 'params': { - 'temperature': 0.7, - 'top_k': 40, - 'top_p': 1 - } - } - ], - 'message': 'ok' -} -``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/custom_llm_server.md b/docs/my-website/docs/providers/custom_llm_server.md index 61099d1a358d..4fcbf8942ce9 100644 --- a/docs/my-website/docs/providers/custom_llm_server.md +++ b/docs/my-website/docs/providers/custom_llm_server.md @@ -17,6 +17,7 @@ Supported Routes: - `/v1/completions` -> `litellm.atext_completion` - `/v1/embeddings` -> `litellm.aembedding` - `/v1/images/generations` -> `litellm.aimage_generation` +- `/v1/images/edits` -> `litellm.aimage_edit` - `/v1/messages` -> `litellm.acompletion` @@ -263,6 +264,83 @@ Expected Response } ``` +## Image Edit + +1. Setup your `custom_handler.py` file +```python +import litellm +from litellm import CustomLLM +from litellm.types.utils import ImageResponse, ImageObject +import time + +class MyCustomLLM(CustomLLM): + async def aimage_edit( + self, + model: str, + image: Any, + prompt: str, + model_response: ImageResponse, + api_key: Optional[str], + api_base: Optional[str], + optional_params: dict, + logging_obj: Any, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> ImageResponse: + # Your custom image edit logic here + # e.g., call Stability AI, Black Forest Labs, etc. + return ImageResponse( + created=int(time.time()), + data=[ImageObject(url="https://example.com/edited-image.png")], + ) + +my_custom_llm = MyCustomLLM() +``` + + +2. 
Add to `config.yaml` + +In the config below, we pass + +python_filename: `custom_handler.py` +custom_handler_instance_name: `my_custom_llm`. This is defined in Step 1 + +custom_handler: `custom_handler.my_custom_llm` + +```yaml +model_list: + - model_name: "my-custom-image-edit-model" + litellm_params: + model: "my-custom-llm/my-model" + +litellm_settings: + custom_provider_map: + - {"provider": "my-custom-llm", "custom_handler": custom_handler.my_custom_llm} +``` + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/images/edits' \ +-H 'Authorization: Bearer sk-1234' \ +-F 'model=my-custom-image-edit-model' \ +-F 'image=@/path/to/image.png' \ +-F 'prompt=Make the sky blue' +``` + +Expected Response + +``` +{ + "created": 1721955063, + "data": [{"url": "https://example.com/edited-image.png"}], +} +``` + ## Anthropic `/v1/messages` - Write the integration for .acompletion @@ -517,4 +595,34 @@ class CustomLLM(BaseLLM): client: Optional[AsyncHTTPHandler] = None, ) -> ImageResponse: raise CustomLLMError(status_code=500, message="Not implemented yet!") + + def image_edit( + self, + model: str, + image: Any, + prompt: str, + model_response: ImageResponse, + api_key: Optional[str], + api_base: Optional[str], + optional_params: dict, + logging_obj: Any, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + ) -> ImageResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") + + async def aimage_edit( + self, + model: str, + image: Any, + prompt: str, + model_response: ImageResponse, + api_key: Optional[str], + api_base: Optional[str], + optional_params: dict, + logging_obj: Any, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> ImageResponse: + raise CustomLLMError(status_code=500, message="Not implemented yet!") ``` diff --git a/docs/my-website/docs/providers/databricks.md b/docs/my-website/docs/providers/databricks.md index 921b06a17b73..2791d55dff1e 100644 --- a/docs/my-website/docs/providers/databricks.md +++ b/docs/my-website/docs/providers/databricks.md @@ -11,6 +11,99 @@ LiteLLM supports all models on Databricks ::: +## Authentication + +LiteLLM supports multiple authentication methods for Databricks, listed in order of preference: + +### OAuth M2M (Recommended for Production) + +OAuth Machine-to-Machine authentication using Service Principal credentials is the **recommended method for production** deployments per Databricks Partner requirements. + +```python +import os +from litellm import completion + +# Set OAuth credentials (Service Principal) +os.environ["DATABRICKS_CLIENT_ID"] = "your-service-principal-application-id" +os.environ["DATABRICKS_CLIENT_SECRET"] = "your-service-principal-secret" +os.environ["DATABRICKS_API_BASE"] = "https://adb-xxx.azuredatabricks.net/serving-endpoints" + +response = completion( + model="databricks/databricks-dbrx-instruct", + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +### Personal Access Token (PAT) + +PAT authentication is supported for development and testing scenarios. + +```python +import os +from litellm import completion + +os.environ["DATABRICKS_API_KEY"] = "dapi..." 
# Your Personal Access Token +os.environ["DATABRICKS_API_BASE"] = "https://adb-xxx.azuredatabricks.net/serving-endpoints" + +response = completion( + model="databricks/databricks-dbrx-instruct", + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +### Databricks SDK Authentication (Automatic) + +If no credentials are provided, LiteLLM will use the Databricks SDK for automatic authentication. This supports OAuth, Azure AD, and other unified auth methods configured in your environment. + +```python +from litellm import completion + +# No environment variables needed - uses Databricks SDK unified auth +# Requires: pip install databricks-sdk +response = completion( + model="databricks/databricks-dbrx-instruct", + messages=[{"role": "user", "content": "Hello!"}], +) +``` + +## Custom User-Agent for Partner Attribution + +If you're building a product on top of LiteLLM that integrates with Databricks, you can pass your own partner identifier for proper attribution in Databricks telemetry. + +The partner name will be prefixed to the LiteLLM user agent: + +```python +# Via parameter +response = completion( + model="databricks/databricks-dbrx-instruct", + messages=[{"role": "user", "content": "Hello!"}], + user_agent="mycompany/1.0.0", +) +# Resulting User-Agent: mycompany_litellm/1.79.1 + +# Via environment variable +os.environ["DATABRICKS_USER_AGENT"] = "mycompany/1.0.0" +# Resulting User-Agent: mycompany_litellm/1.79.1 +``` + +| Input | Resulting User-Agent | +|-------|---------------------| +| (none) | `litellm/1.79.1` | +| `mycompany/1.0.0` | `mycompany_litellm/1.79.1` | +| `partner_product/2.5.0` | `partner_product_litellm/1.79.1` | +| `acme` | `acme_litellm/1.79.1` | + +**Note:** The version from your custom user agent is ignored; LiteLLM's version is always used. + +## Security + +LiteLLM automatically redacts sensitive information (tokens, secrets, API keys) from all debug logs to prevent credential leakage. 
This includes: + +- Authorization headers +- API keys and tokens +- Client secrets +- Personal access tokens (PATs) + ## Usage @@ -51,6 +144,7 @@ response = completion( model: databricks/databricks-dbrx-instruct api_key: os.environ/DATABRICKS_API_KEY api_base: os.environ/DATABRICKS_API_BASE + user_agent: "mycompany/1.0.0" # Optional: for partner attribution ``` diff --git a/docs/my-website/docs/providers/deepseek.md b/docs/my-website/docs/providers/deepseek.md index 31efb36c21f1..1214431386d8 100644 --- a/docs/my-website/docs/providers/deepseek.md +++ b/docs/my-website/docs/providers/deepseek.md @@ -58,9 +58,56 @@ We support ALL Deepseek models, just set `deepseek/` as a prefix when sending co ## Reasoning Models | Model Name | Function Call | |--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| deepseek-reasoner | `completion(model="deepseek/deepseek-reasoner", messages)` | +| deepseek-reasoner | `completion(model="deepseek/deepseek-reasoner", messages)` | +### Thinking / Reasoning Mode +Enable thinking mode for DeepSeek reasoner models using `thinking` or `reasoning_effort` parameters: + + + + +```python +from litellm import completion +import os + +os.environ['DEEPSEEK_API_KEY'] = "" + +resp = completion( + model="deepseek/deepseek-reasoner", + messages=[{"role": "user", "content": "What is 2+2?"}], + thinking={"type": "enabled"}, +) +print(resp.choices[0].message.reasoning_content) # Model's reasoning +print(resp.choices[0].message.content) # Final answer +``` + + + + +```python +from litellm import completion +import os + +os.environ['DEEPSEEK_API_KEY'] = "" + +resp = completion( + model="deepseek/deepseek-reasoner", + messages=[{"role": "user", "content": "What is 2+2?"}], + reasoning_effort="medium", # low, medium, high all map to thinking enabled +) +print(resp.choices[0].message.reasoning_content) # Model's reasoning +print(resp.choices[0].message.content) # Final answer +``` + + + + +:::note +DeepSeek only supports `{"type": "enabled"}` - unlike Anthropic, it doesn't support `budget_tokens`. Any `reasoning_effort` value other than `"none"` enables thinking mode. +::: + +### Basic Usage diff --git a/docs/my-website/docs/providers/docker_model_runner.md b/docs/my-website/docs/providers/docker_model_runner.md new file mode 100644 index 000000000000..fcd4c74f8f49 --- /dev/null +++ b/docs/my-website/docs/providers/docker_model_runner.md @@ -0,0 +1,277 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Docker Model Runner + +## Overview + +| Property | Details | +|-------|-------| +| Description | Docker Model Runner allows you to run large language models locally using Docker Desktop. | +| Provider Route on LiteLLM | `docker_model_runner/` | +| Link to Provider Doc | [Docker Model Runner ↗](https://docs.docker.com/ai/model-runner/) | +| Base URL | `http://localhost:22088` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+
+ +https://docs.docker.com/ai/model-runner/ + +**We support ALL Docker Model Runner models, just set `docker_model_runner/` as a prefix when sending completion requests** + +## Quick Start + +Docker Model Runner is a Docker Desktop feature that lets you run AI models locally. It provides better performance than other local solutions while maintaining OpenAI compatibility. + +### Installation + +1. Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) +2. Enable Docker Model Runner in Docker Desktop settings +3. Download your preferred model through Docker Desktop + +## Environment Variables + +```python showLineNumbers title="Environment Variables" +os.environ["DOCKER_MODEL_RUNNER_API_BASE"] = "http://localhost:22088/engines/llama.cpp" # Optional - defaults to this +os.environ["DOCKER_MODEL_RUNNER_API_KEY"] = "dummy-key" # Optional - Docker Model Runner may not require auth for local instances +``` + +**Note:** +- Docker Model Runner typically runs locally and may not require authentication. LiteLLM will use a dummy key by default if no key is provided. +- The API base should include the engine path (e.g., `/engines/llama.cpp`) + +## API Base Structure + +Docker Model Runner uses a unique URL structure: + +``` +http://model-runner.docker.internal/engines/{engine}/v1/chat/completions +``` + +Where `{engine}` is the engine you want to use (typically `llama.cpp`). + +**Important:** Specify the engine in your `api_base` URL, not in the model name: +- ✅ Correct: `api_base="http://localhost:22088/engines/llama.cpp"`, `model="docker_model_runner/llama-3.1"` +- ❌ Incorrect: `api_base="http://localhost:22088"`, `model="docker_model_runner/llama.cpp/llama-3.1"` + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Docker Model Runner Non-streaming Completion" +import os +import litellm +from litellm import completion + +# Specify the engine in the api_base URL +os.environ["DOCKER_MODEL_RUNNER_API_BASE"] = "http://localhost:22088/engines/llama.cpp" + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# Docker Model Runner call +response = completion( + model="docker_model_runner/llama-3.1", + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Docker Model Runner Streaming Completion" +import os +import litellm +from litellm import completion + +# Specify the engine in the api_base URL +os.environ["DOCKER_MODEL_RUNNER_API_BASE"] = "http://localhost:22088/engines/llama.cpp" + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# Docker Model Runner call with streaming +response = completion( + model="docker_model_runner/llama-3.1", + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +### Custom API Base and Engine + +```python showLineNumbers title="Custom API Base with Different Engine" +import litellm +from litellm import completion + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# Specify the engine in the api_base URL +# Using a different host and engine +response = completion( + model="docker_model_runner/llama-3.1", + messages=messages, + api_base="http://model-runner.docker.internal/engines/llama.cpp" +) + +print(response) +``` + +### Using Different Engines + +```python showLineNumbers title="Using a Different Engine" +import litellm +from litellm import completion + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# To use a different engine, specify it in the api_base +# For example, 
if Docker Model Runner supports other engines: +response = completion( + model="docker_model_runner/mistral-7b", + messages=messages, + api_base="http://localhost:22088/engines/custom-engine" +) + +print(response) +``` + +## Usage - LiteLLM Proxy + +Add the following to your LiteLLM Proxy configuration file: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: llama-3.1 + litellm_params: + model: docker_model_runner/llama-3.1 + api_base: http://localhost:22088/engines/llama.cpp + + - model_name: mistral-7b + litellm_params: + model: docker_model_runner/mistral-7b + api_base: http://localhost:22088/engines/llama.cpp +``` + +Start your LiteLLM Proxy server: + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + + +```python showLineNumbers title="Docker Model Runner via Proxy - Non-streaming" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-proxy-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.chat.completions.create( + model="llama-3.1", + messages=[{"role": "user", "content": "hello from litellm"}] +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Docker Model Runner via Proxy - Streaming" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-proxy-api-key" # Your proxy API key +) + +# Streaming response +response = client.chat.completions.create( + model="llama-3.1", + messages=[{"role": "user", "content": "hello from litellm"}], + stream=True +) + +for chunk in response: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + + +```python showLineNumbers title="Docker Model Runner via Proxy - LiteLLM SDK" +import litellm + +# Configure LiteLLM to use your proxy +response = litellm.completion( + model="litellm_proxy/llama-3.1", + messages=[{"role": "user", "content": "hello from litellm"}], + api_base="http://localhost:4000", + api_key="your-proxy-api-key" +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Docker Model Runner via Proxy - LiteLLM SDK Streaming" +import litellm + +# Configure LiteLLM to use your proxy with streaming +response = litellm.completion( + model="litellm_proxy/llama-3.1", + messages=[{"role": "user", "content": "hello from litellm"}], + api_base="http://localhost:4000", + api_key="your-proxy-api-key", + stream=True +) + +for chunk in response: + if hasattr(chunk.choices[0], 'delta') and chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + + +```bash showLineNumbers title="Docker Model Runner via Proxy - cURL" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-proxy-api-key" \ + -d '{ + "model": "llama-3.1", + "messages": [{"role": "user", "content": "hello from litellm"}] + }' +``` + +```bash showLineNumbers title="Docker Model Runner via Proxy - cURL Streaming" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-proxy-api-key" \ + -d '{ + "model": "llama-3.1", + "messages": [{"role": "user", "content": "hello from litellm"}], + "stream": true + }' +``` + + + + +For more detailed information on using the 
LiteLLM Proxy, see the [LiteLLM Proxy documentation](../providers/litellm_proxy). + +## API Reference + +For detailed API information, see the [Docker Model Runner API Reference](https://docs.docker.com/ai/model-runner/api-reference/). + diff --git a/docs/my-website/docs/providers/elevenlabs.md b/docs/my-website/docs/providers/elevenlabs.md index e80ea534f551..5cf62f51203b 100644 --- a/docs/my-website/docs/providers/elevenlabs.md +++ b/docs/my-website/docs/providers/elevenlabs.md @@ -7,10 +7,10 @@ ElevenLabs provides high-quality AI voice technology, including speech-to-text c | Property | Details | |----------|---------| -| Description | ElevenLabs offers advanced AI voice technology with speech-to-text transcription capabilities that support multiple languages and speaker diarization. | +| Description | ElevenLabs offers advanced AI voice technology with speech-to-text transcription and text-to-speech capabilities that support multiple languages and speaker diarization. | | Provider Route on LiteLLM | `elevenlabs/` | | Provider Doc | [ElevenLabs API ↗](https://elevenlabs.io/docs/api-reference) | -| Supported Endpoints | `/audio/transcriptions` | +| Supported Endpoints | `/audio/transcriptions`, `/audio/speech` | ## Quick Start @@ -228,4 +228,241 @@ ElevenLabs returns transcription responses in OpenAI-compatible format: 1. **Invalid API Key**: Ensure `ELEVENLABS_API_KEY` is set correctly +--- + +## Text-to-Speech (TTS) + +ElevenLabs provides high-quality text-to-speech capabilities through their TTS API, supporting multiple voices, languages, and audio formats. + +### Overview + +| Property | Details | +|----------|---------| +| Description | Convert text to natural-sounding speech using ElevenLabs' advanced TTS models | +| Provider Route on LiteLLM | `elevenlabs/` | +| Supported Operations | `/audio/speech` | +| Link to Provider Doc | [ElevenLabs TTS API ↗](https://elevenlabs.io/docs/api-reference/text-to-speech) | + +### Quick Start + +#### LiteLLM Python SDK + +```python showLineNumbers title="ElevenLabs Text-to-Speech with SDK" +import litellm +import os + +os.environ["ELEVENLABS_API_KEY"] = "your-elevenlabs-api-key" + +# Basic usage with voice mapping +audio = litellm.speech( + model="elevenlabs/eleven_multilingual_v2", + input="Testing ElevenLabs speech from LiteLLM.", + voice="alloy", # Maps to ElevenLabs voice ID automatically +) + +# Save audio to file +with open("test_output.mp3", "wb") as f: + f.write(audio.read()) +``` + +#### Advanced Usage: Overriding Parameters and ElevenLabs-Specific Features + +```python showLineNumbers title="Advanced TTS with custom parameters" +import litellm +import os + +os.environ["ELEVENLABS_API_KEY"] = "your-elevenlabs-api-key" + +# Example showing parameter overriding and ElevenLabs-specific parameters +audio = litellm.speech( + model="elevenlabs/eleven_multilingual_v2", + input="Testing ElevenLabs speech from LiteLLM.", + voice="alloy", # Can use mapped voice name or raw ElevenLabs voice_id + response_format="pcm", # Maps to ElevenLabs output_format + speed=1.1, # Maps to voice_settings.speed + # ElevenLabs-specific parameters - passed directly to API + pronunciation_dictionary_locators=[ + {"pronunciation_dictionary_id": "dict_123", "version_id": "v1"} + ], + model_id="eleven_multilingual_v2", # Override model if needed +) + +# Save audio to file +with open("test_output.mp3", "wb") as f: + f.write(audio.read()) +``` + +### Voice Mapping + +LiteLLM automatically maps common OpenAI voice names to ElevenLabs voice IDs: + +| OpenAI Voice | 
ElevenLabs Voice ID | Description | +|--------------|---------------------|-------------| +| `alloy` | `21m00Tcm4TlvDq8ikWAM` | Rachel - Neutral and balanced | +| `amber` | `5Q0t7uMcjvnagumLfvZi` | Paul - Warm and friendly | +| `ash` | `AZnzlk1XvdvUeBnXmlld` | Domi - Energetic | +| `august` | `D38z5RcWu1voky8WS1ja` | Fin - Professional | +| `blue` | `2EiwWnXFnvU5JabPnv8n` | Clyde - Deep and authoritative | +| `coral` | `9BWtsMINqrJLrRacOk9x` | Aria - Expressive | +| `lily` | `EXAVITQu4vr4xnSDxMaL` | Sarah - Friendly | +| `onyx` | `29vD33N1CtxCmqQRPOHJ` | Drew - Strong | +| `sage` | `CwhRBWXzGAHq8TQ4Fs17` | Roger - Calm | +| `verse` | `CYw3kZ02Hs0563khs1Fj` | Dave - Conversational | + +**Using Custom Voice IDs**: You can also pass any ElevenLabs voice ID directly. If the voice name is not in the mapping, LiteLLM will use it as-is: + +```python showLineNumbers title="Using custom ElevenLabs voice ID" +audio = litellm.speech( + model="elevenlabs/eleven_multilingual_v2", + input="Testing with a custom voice.", + voice="21m00Tcm4TlvDq8ikWAM", # Direct ElevenLabs voice ID +) +``` + +### Response Format Mapping + +LiteLLM maps OpenAI response formats to ElevenLabs output formats: + +| OpenAI Format | ElevenLabs Format | +|---------------|-------------------| +| `mp3` | `mp3_44100_128` | +| `pcm` | `pcm_44100` | +| `opus` | `opus_48000_128` | + +You can also pass ElevenLabs-specific output formats directly using the `output_format` parameter. + +### Supported Parameters + +```python showLineNumbers title="All Supported Parameters" +audio = litellm.speech( + model="elevenlabs/eleven_multilingual_v2", # Required + input="Text to convert to speech", # Required + voice="alloy", # Required: Voice selection (mapped or raw ID) + response_format="mp3", # Optional: Audio format (mp3, pcm, opus) + speed=1.0, # Optional: Speech speed (maps to voice_settings.speed) + # ElevenLabs-specific parameters (passed directly): + model_id="eleven_multilingual_v2", # Optional: Override model + voice_settings={ # Optional: Voice customization + "stability": 0.5, + "similarity_boost": 0.75, + "speed": 1.0 + }, + pronunciation_dictionary_locators=[ # Optional: Custom pronunciation + {"pronunciation_dictionary_id": "dict_123", "version_id": "v1"} + ], +) +``` + +### LiteLLM Proxy + +#### 1. Configure your proxy + +```yaml showLineNumbers title="ElevenLabs TTS configuration in config.yaml" +model_list: + - model_name: elevenlabs-tts + litellm_params: + model: elevenlabs/eleven_multilingual_v2 + api_key: os.environ/ELEVENLABS_API_KEY + +general_settings: + master_key: your-master-key +``` + +#### 2. 
Make TTS requests + +##### Simple Usage (OpenAI Parameters) + +You can use standard OpenAI-compatible parameters without any provider-specific configuration: + +```bash showLineNumbers title="Simple TTS request with curl" +curl http://localhost:4000/v1/audio/speech \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "elevenlabs-tts", + "input": "Testing ElevenLabs speech via the LiteLLM proxy.", + "voice": "alloy", + "response_format": "mp3" + }' \ + --output speech.mp3 +``` + +```python showLineNumbers title="Simple TTS with OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +response = client.audio.speech.create( + model="elevenlabs-tts", + input="Testing ElevenLabs speech via the LiteLLM proxy.", + voice="alloy", + response_format="mp3" +) + +# Save audio +with open("speech.mp3", "wb") as f: + f.write(response.content) +``` + +##### Advanced Usage (ElevenLabs-Specific Parameters) + +**Note**: When using the proxy, provider-specific parameters (like `pronunciation_dictionary_locators`, `voice_settings`, etc.) must be passed in the `extra_body` field. + +```bash showLineNumbers title="Advanced TTS request with curl" +curl http://localhost:4000/v1/audio/speech \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "elevenlabs-tts", + "input": "Testing ElevenLabs speech via the LiteLLM proxy.", + "voice": "alloy", + "response_format": "pcm", + "extra_body": { + "pronunciation_dictionary_locators": [ + {"pronunciation_dictionary_id": "dict_123", "version_id": "v1"} + ], + "voice_settings": { + "speed": 1.1, + "stability": 0.5, + "similarity_boost": 0.75 + } + } + }' \ + --output speech.mp3 +``` + +```python showLineNumbers title="Advanced TTS with OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +response = client.audio.speech.create( + model="elevenlabs-tts", + input="Testing ElevenLabs speech via the LiteLLM proxy.", + voice="alloy", + response_format="pcm", + extra_body={ + "pronunciation_dictionary_locators": [ + {"pronunciation_dictionary_id": "dict_123", "version_id": "v1"} + ], + "voice_settings": { + "speed": 1.1, + "stability": 0.5, + "similarity_boost": 0.75 + } + } +) + +# Save audio +with open("speech.mp3", "wb") as f: + f.write(response.content) +``` + + diff --git a/docs/my-website/docs/providers/fal_ai.md b/docs/my-website/docs/providers/fal_ai.md index d42182b57a11..da0fd19123b8 100644 --- a/docs/my-website/docs/providers/fal_ai.md +++ b/docs/my-website/docs/providers/fal_ai.md @@ -31,9 +31,14 @@ Get your API key from [fal.ai](https://fal.ai/). 
| Model Name | Description | Documentation | |------------|-------------|---------------| +| `fal_ai/fal-ai/flux-pro/v1.1` | FLUX Pro v1.1 - Balanced speed and quality | [Docs ↗](https://fal.ai/models/fal-ai/flux-pro/v1.1) | +| `fal_ai/flux/schnell` | Flux Schnell - Low-latency generation with `image_size` support | [Docs ↗](https://fal.ai/models/fal-ai/flux/schnell) | +| `fal_ai/fal-ai/bytedance/seedream/v3/text-to-image` | ByteDance Seedream v3 - Text-to-image with `image_size` control | [Docs ↗](https://fal.ai/models/fal-ai/bytedance/seedream/v3/text-to-image) | +| `fal_ai/fal-ai/bytedance/dreamina/v3.1/text-to-image` | ByteDance Dreamina v3.1 - Text-to-image with `image_size` control | [Docs ↗](https://fal.ai/models/fal-ai/bytedance/dreamina/v3.1/text-to-image) | | `fal_ai/fal-ai/flux-pro/v1.1-ultra` | FLUX Pro v1.1 Ultra - High-quality image generation | [Docs ↗](https://fal.ai/models/fal-ai/flux-pro/v1.1-ultra) | | `fal_ai/fal-ai/imagen4/preview` | Google's Imagen 4 - Highest quality model | [Docs ↗](https://fal.ai/models/fal-ai/imagen4/preview) | | `fal_ai/fal-ai/recraft/v3/text-to-image` | Recraft v3 - Multiple style options | [Docs ↗](https://fal.ai/models/fal-ai/recraft/v3/text-to-image) | +| `fal_ai/fal-ai/ideogram/v3` | Ideogram v3 - Lettering-first creative model (Balanced: $0.06/image) | [Docs ↗](https://fal.ai/models/fal-ai/ideogram/v3) | | `fal_ai/fal-ai/stable-diffusion-v35-medium` | Stable Diffusion v3.5 Medium | [Docs ↗](https://fal.ai/models/fal-ai/stable-diffusion-v35-medium) | | `fal_ai/bria/text-to-image/3.2` | Bria 3.2 - Commercial-grade generation | [Docs ↗](https://fal.ai/models/bria/text-to-image/3.2) | diff --git a/docs/my-website/docs/providers/fireworks_ai.md b/docs/my-website/docs/providers/fireworks_ai.md index 98d7c33ce7e6..4589066031aa 100644 --- a/docs/my-website/docs/providers/fireworks_ai.md +++ b/docs/my-website/docs/providers/fireworks_ai.md @@ -13,7 +13,7 @@ import TabItem from '@theme/TabItem'; | Description | The fastest and most efficient inference engine to build production-ready, compound AI systems. | | Provider Route on LiteLLM | `fireworks_ai/` | | Provider Doc | [Fireworks AI ↗](https://docs.fireworks.ai/getting-started/introduction) | -| Supported OpenAI Endpoints | `/chat/completions`, `/embeddings`, `/completions`, `/audio/transcriptions` | +| Supported OpenAI Endpoints | `/chat/completions`, `/embeddings`, `/completions`, `/audio/transcriptions`, `/rerank` | ## Overview @@ -204,7 +204,7 @@ from litellm import completion import os os.environ["FIREWORKS_AI_API_KEY"] = "YOUR_API_KEY" -os.environ["FIREWORKS_AI_API_BASE"] = "https://audio-prod.us-virginia-1.direct.fireworks.ai/v1" +os.environ["FIREWORKS_AI_API_BASE"] = "https://audio-prod.api.fireworks.ai/v1" completion = litellm.completion( model="fireworks_ai/accounts/fireworks/models/llama-v3p3-70b-instruct", @@ -300,6 +300,51 @@ litellm_settings:
+## Reasoning Effort + +The `reasoning_effort` parameter is supported on select Fireworks AI models. Supported models include: + + + + +```python +from litellm import completion +import os + +os.environ["FIREWORKS_AI_API_KEY"] = "YOUR_API_KEY" + +response = completion( + model="fireworks_ai/accounts/fireworks/models/qwen3-8b", + messages=[ + {"role": "user", "content": "What is the capital of France?"} + ], + reasoning_effort="low", +) +print(response) +``` + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_KEY" \ + -d '{ + "model": "fireworks_ai/accounts/fireworks/models/qwen3-8b", + "messages": [ + { + "role": "user", + "content": "What is the capital of France?" + } + ], + "reasoning_effort": "low" + }' +``` + + + + ## Supported Models - ALL Fireworks AI Models Supported! :::info @@ -343,7 +388,7 @@ from litellm import transcription import os os.environ["FIREWORKS_AI_API_KEY"] = "YOUR_API_KEY" -os.environ["FIREWORKS_AI_API_BASE"] = "https://audio-prod.us-virginia-1.direct.fireworks.ai/v1" +os.environ["FIREWORKS_AI_API_BASE"] = "https://audio-prod.api.fireworks.ai/v1" response = transcription( model="fireworks_ai/whisper-v3", @@ -363,7 +408,7 @@ model_list: - model_name: whisper-v3 litellm_params: model: fireworks_ai/whisper-v3 - api_base: https://audio-prod.us-virginia-1.direct.fireworks.ai/v1 + api_base: https://audio-prod.api.fireworks.ai/v1 api_key: os.environ/FIREWORKS_API_KEY model_info: mode: audio_transcription @@ -386,4 +431,87 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/audio/transcriptions' \ ``` -
\ No newline at end of file + + +## Rerank + +### Quick Start + + + + +```python +from litellm import rerank +import os + +os.environ["FIREWORKS_AI_API_KEY"] = "YOUR_API_KEY" + +query = "What is the capital of France?" +documents = [ + "Paris is the capital and largest city of France, home to the Eiffel Tower and the Louvre Museum.", + "France is a country in Western Europe known for its wine, cuisine, and rich history.", + "The weather in Europe varies significantly between northern and southern regions.", + "Python is a popular programming language used for web development and data science.", +] + +response = rerank( + model="fireworks_ai/fireworks/qwen3-reranker-8b", + query=query, + documents=documents, + top_n=3, + return_documents=True, +) +print(response) +``` + +[Pass API Key/API Base in `.rerank`](../set_keys.md#passing-args-to-completion) + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: qwen3-reranker-8b + litellm_params: + model: fireworks_ai/fireworks/qwen3-reranker-8b + api_key: os.environ/FIREWORKS_API_KEY + model_info: + mode: rerank +``` + +2. Start Proxy + +``` +litellm --config config.yaml +``` + +3. Test it + +```bash +curl http://0.0.0.0:4000/rerank \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen3-reranker-8b", + "query": "What is the capital of France?", + "documents": [ + "Paris is the capital and largest city of France, home to the Eiffel Tower and the Louvre Museum.", + "France is a country in Western Europe known for its wine, cuisine, and rich history.", + "The weather in Europe varies significantly between northern and southern regions.", + "Python is a popular programming language used for web development and data science." + ], + "top_n": 3, + "return_documents": true + }' +``` + + + + +### Supported Models + +| Model Name | Function Call | +|------------|---------------| +| fireworks/qwen3-reranker-8b | `rerank(model="fireworks_ai/fireworks/qwen3-reranker-8b", query=query, documents=documents)` | \ No newline at end of file diff --git a/docs/my-website/docs/providers/gemini.md b/docs/my-website/docs/providers/gemini.md index 40d646565286..b9ad7820dd47 100644 --- a/docs/my-website/docs/providers/gemini.md +++ b/docs/my-website/docs/providers/gemini.md @@ -10,11 +10,22 @@ import TabItem from '@theme/TabItem'; | Provider Route on LiteLLM | `gemini/` | | Provider Doc | [Google AI Studio ↗](https://aistudio.google.com/) | | API Endpoint for Provider | https://generativelanguage.googleapis.com | -| Supported OpenAI Endpoints | `/chat/completions`, [`/embeddings`](../embedding/supported_embedding#gemini-ai-embedding-models), `/completions` | +| Supported OpenAI Endpoints | `/chat/completions`, [`/embeddings`](../embedding/supported_embedding#gemini-ai-embedding-models), `/completions`, [`/videos`](./gemini/videos.md), [`/images/edits`](../image_edits.md) | | Pass-through Endpoint | [Supported](../pass_through/google_ai_studio.md) |
+:::tip Gemini API vs Vertex AI +| Model Format | Provider | Auth Required | +|-------------|----------|---------------| +| `gemini/gemini-2.0-flash` | Gemini API | `GEMINI_API_KEY` (simple API key) | +| `vertex_ai/gemini-2.0-flash` | Vertex AI | GCP credentials + project | +| `gemini-2.0-flash` (no prefix) | Vertex AI | GCP credentials + project | + +**If you just want to use an API key** (like OpenAI), use the `gemini/` prefix. + +Models without a prefix default to Vertex AI which requires full GCP authentication. +::: ## API Keys @@ -64,16 +75,40 @@ response = completion( LiteLLM translates OpenAI's `reasoning_effort` to Gemini's `thinking` parameter. [Code](https://github.com/BerriAI/litellm/blob/620664921902d7a9bfb29897a7b27c1a7ef4ddfb/litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py#L362) -Added an additional non-OpenAI standard "disable" value for non-reasoning Gemini requests. +**Cost Optimization:** Use `reasoning_effort="none"` (OpenAI standard) for significant cost savings - up to 96% cheaper. [Google's docs](https://ai.google.dev/gemini-api/docs/openai) + +:::info +Note: Reasoning cannot be turned off on Gemini 2.5 Pro models. +::: -**Mapping** +:::tip Gemini 3 Models +For **Gemini 3+ models** (e.g., `gemini-3-pro-preview`), LiteLLM automatically maps `reasoning_effort` to the new `thinking_level` parameter instead of `thinking_budget`. The `thinking_level` parameter uses `"low"` or `"high"` values for better control over reasoning depth. +::: -| reasoning_effort | thinking | -| ---------------- | -------- | -| "disable" | "budget_tokens": 0 | -| "low" | "budget_tokens": 1024 | -| "medium" | "budget_tokens": 2048 | -| "high" | "budget_tokens": 4096 | +:::warning Image Models +**Gemini image models** (e.g., `gemini-3-pro-image-preview`, `gemini-2.0-flash-exp-image-generation`) do **not** support the `thinking_level` parameter. LiteLLM automatically excludes image models from receiving thinking configuration to prevent API errors. +::: + +**Mapping for Gemini 2.5 and earlier models** + +| reasoning_effort | thinking | Notes | +| ---------------- | -------- | ----- | +| "none" | "budget_tokens": 0, "includeThoughts": false | 💰 **Recommended for cost optimization** - OpenAI-compatible, always 0 | +| "disable" | "budget_tokens": DEFAULT (0), "includeThoughts": false | LiteLLM-specific, configurable via env var | +| "low" | "budget_tokens": 1024 | | +| "medium" | "budget_tokens": 2048 | | +| "high" | "budget_tokens": 4096 | | + +**Mapping for Gemini 3+ models** + +| reasoning_effort | thinking_level | Notes | +| ---------------- | -------------- | ----- | +| "minimal" | "low" | Minimizes latency and cost | +| "low" | "low" | Best for simple instruction following or chat | +| "medium" | "high" | Maps to high (medium not yet available) | +| "high" | "high" | Maximizes reasoning depth | +| "disable" | "low" | Cannot fully disable thinking in Gemini 3 | +| "none" | "low" | Cannot fully disable thinking in Gemini 3 | @@ -81,6 +116,14 @@ Added an additional non-OpenAI standard "disable" value for non-reasoning Gemini ```python from litellm import completion +# Cost-optimized: Use reasoning_effort="none" for best pricing +resp = completion( + model="gemini/gemini-2.0-flash-thinking-exp-01-21", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort="none", # Up to 96% cheaper! 
+) + +# Or use other levels: "low", "medium", "high" resp = completion( model="gemini/gemini-2.5-flash-preview-04-17", messages=[{"role": "user", "content": "What is the capital of France?"}], @@ -124,6 +167,59 @@ curl http://0.0.0.0:4000/v1/chat/completions \ +### Gemini 3+ Models - `thinking_level` Parameter + +For Gemini 3+ models (e.g., `gemini-3-pro-preview`), you can use the new `thinking_level` parameter directly: + + + + +```python +from litellm import completion + +# Use thinking_level for Gemini 3 models +resp = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "Solve this complex math problem step by step."}], + reasoning_effort="high", # Options: "low" or "high" +) + +# Low thinking level for faster, simpler tasks +resp = completion( + model="gemini/gemini-3-pro-preview", + messages=[{"role": "user", "content": "What is the weather today?"}], + reasoning_effort="low", # Minimizes latency and cost +) +``` + + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "gemini-3-pro-preview", + "messages": [{"role": "user", "content": "Solve this complex problem."}], + "reasoning_effort": "high" + }' +``` + + + + +:::warning +**Temperature Recommendation for Gemini 3 Models** + +For Gemini 3 models, LiteLLM defaults `temperature` to `1.0` and strongly recommends keeping it at this default. Setting `temperature < 1.0` can cause: +- Infinite loops +- Degraded reasoning performance +- Failure on complex tasks + +LiteLLM will automatically set `temperature=1.0` if not specified for Gemini 3+ models. +::: **Expected Response** @@ -934,9 +1030,462 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +### Computer Use Tool + + +```python +from litellm import completion +import os +os.environ["GEMINI_API_KEY"] = "your-api-key" + +# Computer Use tool with browser environment +tools = [ + { + "type": "computer_use", + "environment": "browser", # optional: "browser" or "unspecified" + "excluded_predefined_functions": ["drag_and_drop"] # optional + } +] + +messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Navigate to google.com and search for 'LiteLLM'" + }, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,..." # screenshot of current browser state + } + } + ] + } +] + +response = completion( + model="gemini/gemini-2.5-computer-use-preview-10-2025", + messages=messages, + tools=tools, +) + +print(response) + +# Handling tool responses with screenshots +# When the model makes a tool call, send the response back with a screenshot: +if response.choices[0].message.tool_calls: + tool_call = response.choices[0].message.tool_calls[0] + + # Add assistant message with tool call + messages.append(response.choices[0].message.model_dump()) + + # Add tool response with screenshot + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": [ + { + "type": "text", + "text": '{"url": "https://example.com", "status": "completed"}' + }, + { + "type": "input_image", + "image_url": "data:image/png;base64,..." # New screenshot after action (Can send an image url as well, litellm handles the conversion) + } + ] + }) + + # Continue conversation with updated screenshot + response = completion( + model="gemini/gemini-2.5-computer-use-preview-10-2025", + messages=messages, + tools=tools, + ) +``` + + + + +1. 
Add model to config.yaml + +```yaml +model_list: + - model_name: gemini-computer-use + litellm_params: + model: gemini/gemini-2.5-computer-use-preview-10-2025 + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Make request + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-computer-use", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Click on the search button" + }, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,..." + } + } + ] + } + ], + "tools": [ + { + "type": "computer_use", + "environment": "browser" + } + ] + }' +``` + +**Tool Response Format:** + +When responding to Computer Use tool calls, include the URL and screenshot: + +```json +{ + "role": "tool", + "tool_call_id": "call_abc123", + "content": [ + { + "type": "text", + "text": "{\"url\": \"https://example.com\", \"status\": \"completed\"}" + }, + { + "type": "input_image", + "image_url": "data:image/png;base64,..." + } + ] +} +``` + + + + +### Environment Mapping + +| LiteLLM Input | Gemini API Value | +|--------------|------------------| +| `"browser"` | `ENVIRONMENT_BROWSER` | +| `"unspecified"` | `ENVIRONMENT_UNSPECIFIED` | +| `ENVIRONMENT_BROWSER` | `ENVIRONMENT_BROWSER` (passed through) | +| `ENVIRONMENT_UNSPECIFIED` | `ENVIRONMENT_UNSPECIFIED` (passed through) | + + + +## Thought Signatures + +Thought signatures are encrypted representations of the model's internal reasoning process for a given turn in a conversation. By passing thought signatures back to the model in subsequent requests, you provide it with the context of its previous thoughts, allowing it to build upon its reasoning and maintain a coherent line of inquiry. + +Thought signatures are particularly important for multi-turn function calling scenarios where the model needs to maintain context across multiple tool invocations. 
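+
+A minimal sketch of what this looks like from the Python SDK, assuming a Gemini model that returns a tool call with thinking enabled (the exact field layout is shown in the response examples below):
+
+```python
+from litellm import completion
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_current_temperature",
+        "description": "Gets the current temperature for a location.",
+        "parameters": {
+            "type": "object",
+            "properties": {"location": {"type": "string"}},
+            "required": ["location"],
+        },
+    },
+}]
+
+response = completion(
+    model="gemini/gemini-2.5-flash",
+    messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
+    tools=tools,
+    reasoning_effort="low",  # thinking must be enabled for signatures to be returned
+)
+
+# LiteLLM stores the signature on the tool call's provider_specific_fields
+tool_call = response.choices[0].message.tool_calls[0]
+fields = getattr(tool_call, "provider_specific_fields", None) or {}
+print(fields.get("thought_signature"))  # encrypted signature string, or None if absent
+```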
+ +### How Thought Signatures Work + +- **Function calls with signatures**: When Gemini returns a function call, it includes a `thought_signature` in the response +- **Preservation**: LiteLLM automatically extracts and stores thought signatures in `provider_specific_fields` of tool calls +- **Return in conversation history**: When you include the assistant's message with tool calls in subsequent requests, LiteLLM automatically preserves and returns the thought signatures to Gemini +- **Parallel function calls**: Only the first function call in a parallel set has a thought signature +- **Sequential function calls**: Each function call in a multi-step sequence has its own signature + +### Enabling Thought Signatures + +To enable thought signatures, you need to enable thinking/reasoning: + + + + +```python +from litellm import completion + +response = completion( + model="gemini/gemini-2.5-flash", + messages=[{"role": "user", "content": "What's the weather in Tokyo?"}], + tools=[...], + reasoning_effort="low", # Enable thinking to get thought signatures +) +``` + + + + +```bash +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-2.5-flash", + "messages": [{"role": "user", "content": "What'\''s the weather in Tokyo?"}], + "tools": [...], + "reasoning_effort": "low" + }' +``` + + + + +### Multi-Turn Function Calling with Thought Signatures + +When building conversation history for multi-turn function calling, you must include the thought signatures from previous responses. LiteLLM handles this automatically when you append the full assistant message to your conversation history. + + + + +```python +from openai import OpenAI +import json + +client = OpenAI(api_key="sk-1234", base_url="http://localhost:4000") + +def get_current_temperature(location: str) -> dict: + """Gets the current weather temperature for a given location.""" + return {"temperature": 30, "unit": "celsius"} + +def set_thermostat_temperature(temperature: int) -> dict: + """Sets the thermostat to a desired temperature.""" + return {"status": "success"} + +get_weather_declaration = { + "name": "get_current_temperature", + "description": "Gets the current weather temperature for a given location.", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string"}}, + "required": ["location"], + }, +} + +set_thermostat_declaration = { + "name": "set_thermostat_temperature", + "description": "Sets the thermostat to a desired temperature.", + "parameters": { + "type": "object", + "properties": {"temperature": {"type": "integer"}}, + "required": ["temperature"], + }, +} + +# Initial request +messages = [ + {"role": "user", "content": "If it's too hot or too cold in London, set the thermostat to a comfortable level."} +] + +response = client.chat.completions.create( + model="gemini-2.5-flash", + messages=messages, + tools=[get_weather_declaration, set_thermostat_declaration], + reasoning_effort="low" +) + +# Append the assistant's message (includes thought signatures automatically) +messages.append(response.choices[0].message) + +# Execute tool calls and append results +for tool_call in response.choices[0].message.tool_calls: + if tool_call.function.name == "get_current_temperature": + result = get_current_temperature(**json.loads(tool_call.function.arguments)) + messages.append({ + "role": "tool", + "content": json.dumps(result), + "tool_call_id": tool_call.id + }) + +# Second request - thought signatures are 
automatically preserved +response2 = client.chat.completions.create( + model="gemini-2.5-flash", + messages=messages, + tools=[get_weather_declaration, set_thermostat_declaration], + reasoning_effort="low" +) + +print(response2.choices[0].message.content) +``` + + + + +```bash +# Step 1: Initial request +curl --location 'http://localhost:4000/v1/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --data '{ + "model": "gemini-2.5-flash", + "messages": [ + { + "role": "user", + "content": "If it'\''s too hot or too cold in London, set the thermostat to a comfortable level." + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_temperature", + "description": "Gets the current weather temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + }, + { + "type": "function", + "function": { + "name": "set_thermostat_temperature", + "description": "Sets the thermostat to a desired temperature.", + "parameters": { + "type": "object", + "properties": { + "temperature": {"type": "integer"} + }, + "required": ["temperature"] + } + } + } + ], + "tool_choice": "auto", + "reasoning_effort": "low" + }' +``` + +The response will include tool calls with thought signatures in `provider_specific_fields`: + +```json +{ + "choices": [{ + "message": { + "role": "assistant", + "tool_calls": [{ + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_current_temperature", + "arguments": "{\"location\": \"London\"}" + }, + "index": 0, + "provider_specific_fields": { + "thought_signature": "CpcHAdHtim9+q4rstcbvQC0ic4x1/vqQlCJWgE+UZ6dTLYGHMMBkF/AxqL5UmP6SY46uYC8t4BTFiXG5zkw6EMJ...==" + } + }] + } + }] +} +``` + +```bash +# Step 2: Follow-up request with tool response +# Include the assistant message from Step 1 (with thought signatures in provider_specific_fields) +curl --location 'http://localhost:4000/v1/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --data '{ + "model": "gemini-2.5-flash", + "messages": [ + { + "role": "user", + "content": "If it'\''s too hot or too cold in London, set the thermostat to a comfortable level." 
+ }, + { + "role": "assistant", + "content": null, + "tool_calls": [ + { + "id": "call_c130b9f8c2c042e9b65e39a88245", + "type": "function", + "function": { + "name": "get_current_temperature", + "arguments": "{\"location\": \"London\"}" + }, + "index": 0, + "provider_specific_fields": { + "thought_signature": "CpcHAdHtim9+q4rstcbvQC0ic4x1/vqQlCJWgE+UZ6dTLYGHMMBkF/AxqL5UmP6SY46uYC8t4BTFiXG5zkw6EMJ...==" + } + } + ] + }, + { + "role": "tool", + "content": "{\"temperature\": 30, \"unit\": \"celsius\"}", + "tool_call_id": "call_c130b9f8c2c042e9b65e39a88245" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_temperature", + "description": "Gets the current weather temperature for a given location.", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + }, + { + "type": "function", + "function": { + "name": "set_thermostat_temperature", + "description": "Sets the thermostat to a desired temperature.", + "parameters": { + "type": "object", + "properties": { + "temperature": {"type": "integer"} + }, + "required": ["temperature"] + } + } + } + ], + "tool_choice": "auto", + "reasoning_effort": "low" + }' +``` + + + + +### Important Notes + +1. **Automatic Handling**: LiteLLM automatically extracts thought signatures from Gemini responses and preserves them when you include assistant messages in conversation history. You don't need to manually extract or manage them. + +2. **Parallel Function Calls**: When the model makes parallel function calls, only the first function call will have a thought signature. Subsequent parallel calls won't have signatures. + +3. **Sequential Function Calls**: In multi-step function calling scenarios, each step's first function call will have its own thought signature that must be preserved. + +4. **Required for Context**: Thought signatures are essential for maintaining reasoning context across multi-turn conversations with function calling. Without them, the model may lose context of its previous reasoning. + +5. **Format**: Thought signatures are stored in `provider_specific_fields.thought_signature` of tool calls in the response, and are automatically included when you append the assistant message to your conversation history. + +6. **Chat Completions Clients**: With chat completions clients where you cannot control whether or not the previous assistant message is included as-is (ex langchain's ChatOpenAI), LiteLLM also preserves the thought signature by appending it to the tool call id (`call_123__thought__`) and extracting it back out before sending the outbound request to Gemini. ## JSON Mode @@ -1009,6 +1558,244 @@ LiteLLM Supports the following image types passed in `url` - Images with direct links - https://storage.googleapis.com/github-repo/img/gemini/intro/landmark3.jpg - Image in local storage - ./localimage.jpeg +## Media Resolution Control (Images & Videos) + +For Gemini 3+ models, LiteLLM supports per-part media resolution control using OpenAI's `detail` parameter. This allows you to specify different resolution levels for individual images and videos in your request, whether using `image_url` or `file` content types. 
+ +**Supported `detail` values:** +- `"low"` - Maps to `media_resolution: "low"` (280 tokens for images, 70 tokens per frame for videos) +- `"medium"` - Maps to `media_resolution: "medium"` +- `"high"` - Maps to `media_resolution: "high"` (1120 tokens for images) +- `"ultra_high"` - Maps to `media_resolution: "ultra_high"` +- `"auto"` or `None` - Model decides optimal resolution (no `media_resolution` set) + +**Usage Examples:** + + + + +```python +from litellm import completion + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://example.com/chart.png", + "detail": "high" # High resolution for detailed chart analysis + } + }, + { + "type": "text", + "text": "Analyze this chart" + }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/icon.png", + "detail": "low" # Low resolution for simple icon + } + } + ] + } +] + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=messages, +) +``` + + + + +```python +from litellm import completion + +messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze this video" + }, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "detail": "high" # High resolution for detailed video analysis + } + } + ] + } +] + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=messages, +) +``` + + + + +:::info +**Per-Part Resolution:** Each image or video in your request can have its own `detail` setting, allowing mixed-resolution requests (e.g., a high-res chart alongside a low-res icon). This feature works with both `image_url` and `file` content types, and is only available for Gemini 3+ models. +::: + +## Video Metadata Control + +For Gemini 3+ models, LiteLLM supports fine-grained video processing control through the `video_metadata` field. This allows you to specify frame extraction rates and time ranges for video analysis. 
+ +**Supported `video_metadata` parameters:** + +| Parameter | Type | Description | Example | +|-----------|------|-------------|---------| +| `fps` | Number | Frame extraction rate (frames per second) | `5` | +| `start_offset` | String | Start time for video clip processing | `"10s"` | +| `end_offset` | String | End time for video clip processing | `"60s"` | + +:::note +**Field Name Conversion:** LiteLLM automatically converts snake_case field names to camelCase for the Gemini API: +- `start_offset` → `startOffset` +- `end_offset` → `endOffset` +- `fps` remains unchanged +::: + +:::warning +- **Gemini 3+ Only:** This feature is only available for Gemini 3.0 and newer models +- **Video Files Recommended:** While `video_metadata` is designed for video files, error handling for other media types is delegated to the Vertex AI API +- **File Formats Supported:** Works with `gs://`, `https://`, and base64-encoded video files +::: + +**Usage Examples:** + + + + +```python +from litellm import completion + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video clip"}, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "video_metadata": { + "fps": 5, # Extract 5 frames per second + "start_offset": "10s", # Start from 10 seconds + "end_offset": "60s" # End at 60 seconds + } + } + } + ] + } + ] +) + +print(response.choices[0].message.content) +``` + + + + +```python +from litellm import completion + +response = completion( + model="gemini/gemini-3-pro-preview", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Provide detailed analysis of this video segment"}, + { + "type": "file", + "file": { + "file_id": "https://example.com/presentation.mp4", + "format": "video/mp4", + "detail": "high", # High resolution for detailed analysis + "video_metadata": { + "fps": 10, # Extract 10 frames per second + "start_offset": "30s", # Start from 30 seconds + "end_offset": "90s" # End at 90 seconds + } + } + } + ] + } + ] +) + +print(response.choices[0].message.content) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gemini-3-pro + litellm_params: + model: gemini/gemini-3-pro-preview + api_key: os.environ/GEMINI_API_KEY +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. 
Make request + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "gemini-3-pro", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video clip"}, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "detail": "high", + "video_metadata": { + "fps": 5, + "start_offset": "10s", + "end_offset": "60s" + } + } + } + ] + } + ] + }' +``` + + + + ## Sample Usage ```python import os @@ -1053,6 +1840,57 @@ content = response.get('choices', [{}])[0].get('message', {}).get('content') print(content) ``` +## gemini-robotics-er-1.5-preview Usage + +```python +from litellm import api_base +from openai import OpenAI +import os +import base64 + +client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-12345") +base64_image = base64.b64encode(open("closeup-object-on-table-many-260nw-1216144471.webp", "rb").read()).decode() + +import json +import re +tools = [{"codeExecution": {}}] +response = client.chat.completions.create( + model="gemini/gemini-robotics-er-1.5-preview", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Point to no more than 10 items in the image. The label returned should be an identifying name for the object detected. The answer should follow the json format: [{\"point\": [y, x], \"label\": }, ...]. The points are in [y, x] format normalized to 0-1000." + }, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"} + } + ] + } + ], + tools=tools +) + +# Extract JSON from markdown code block if present +content = response.choices[0].message.content +# Look for triple-backtick JSON block +match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL) +if match: + json_str = match.group(1) +else: + json_str = content + +try: + data = json.loads(json_str) + print(json.dumps(data, indent=2)) +except Exception as e: + print("Error parsing response as JSON:", e) + print("Response content:", content) +``` + ## Usage - PDF / Videos / etc. Files ### Inline Data (e.g. audio stream) @@ -1580,3 +2418,34 @@ curl -L -X POST 'http://localhost:4000/v1/chat/completions' \ +### Image Generation Pricing + +Gemini image generation models (like `gemini-3-pro-image-preview`) return `image_tokens` in the response usage. These tokens are priced differently from text tokens: + +| Token Type | Price per 1M tokens | Price per token | +|------------|---------------------|-----------------| +| Text output | $12 | $0.000012 | +| Image output | $120 | $0.00012 | + +The number of image tokens depends on the output resolution: + +| Resolution | Tokens per image | Cost per image | +|------------|------------------|----------------| +| 1K-2K (1024x1024 to 2048x2048) | 1,120 | $0.134 | +| 4K (4096x4096) | 2,000 | $0.24 | + +LiteLLM automatically calculates costs using `output_cost_per_image_token` from the model pricing configuration. + +**Example response usage:** +```json +{ + "completion_tokens_details": { + "reasoning_tokens": 225, + "text_tokens": 0, + "image_tokens": 1120 + } +} +``` + +For more details, see [Google's Gemini pricing documentation](https://ai.google.dev/gemini-api/docs/pricing). 
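+
+A minimal sketch of the arithmetic, using the per-token prices from the table above (treat the constants as assumptions if your model pricing configuration differs):
+
+```python
+# Estimate output cost from the completion_tokens_details block shown above.
+OUTPUT_COST_PER_IMAGE_TOKEN = 0.00012   # $120 per 1M image tokens
+OUTPUT_COST_PER_TEXT_TOKEN = 0.000012   # $12 per 1M text tokens
+
+# Mirrors the relevant fields of completion_tokens_details from the response usage
+completion_tokens_details = {"text_tokens": 0, "image_tokens": 1120}
+
+image_cost = completion_tokens_details["image_tokens"] * OUTPUT_COST_PER_IMAGE_TOKEN
+text_cost = completion_tokens_details["text_tokens"] * OUTPUT_COST_PER_TEXT_TOKEN
+print(f"image output: ${image_cost:.4f}, text output: ${text_cost:.6f}")
+# 1,120 image tokens * $0.00012 ≈ $0.134, matching the 1K-2K resolution row
+```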
+ diff --git a/docs/my-website/docs/providers/gemini/videos.md b/docs/my-website/docs/providers/gemini/videos.md new file mode 100644 index 000000000000..5b5d5a8a6369 --- /dev/null +++ b/docs/my-website/docs/providers/gemini/videos.md @@ -0,0 +1,409 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Gemini Video Generation (Veo) + +LiteLLM supports Google's Veo video generation models through a unified API interface. + +| Property | Details | +|-------|-------| +| Description | Google's Veo AI video generation models | +| Provider Route on LiteLLM | `gemini/` | +| Supported Models | `veo-3.0-generate-preview`, `veo-3.1-generate-preview` | +| Cost Tracking | ✅ Duration-based pricing | +| Logging Support | ✅ Full request/response logging | +| Proxy Server Support | ✅ Full proxy integration with virtual keys | +| Spend Management | ✅ Budget tracking and rate limiting | +| Link to Provider Doc | [Google Veo Documentation ↗](https://ai.google.dev/gemini-api/docs/video) | + +## Quick Start + +### Required API Keys + +```python +import os +os.environ["GEMINI_API_KEY"] = "your-google-api-key" +# OR +os.environ["GOOGLE_API_KEY"] = "your-google-api-key" +``` + +### Basic Usage + +```python +from litellm import video_generation, video_status, video_content +import os +import time + +os.environ["GEMINI_API_KEY"] = "your-google-api-key" + +# Step 1: Generate video +response = video_generation( + model="gemini/veo-3.0-generate-preview", + prompt="A cat playing with a ball of yarn in a sunny garden" +) + +print(f"Video ID: {response.id}") +print(f"Initial Status: {response.status}") # "processing" + +# Step 2: Poll for completion +while True: + status_response = video_status( + video_id=response.id + ) + + print(f"Current Status: {status_response.status}") + + if status_response.status == "completed": + break + elif status_response.status == "failed": + print("Video generation failed") + break + + time.sleep(10) # Wait 10 seconds before checking again + +# Step 3: Download video content +video_bytes = video_content( + video_id=response.id +) + +# Save to file +with open("generated_video.mp4", "wb") as f: + f.write(video_bytes) + +print("Video downloaded successfully!") +``` + +## Supported Models + +| Model Name | Description | Max Duration | Status | +|------------|-------------|--------------|--------| +| veo-3.0-generate-preview | Veo 3.0 video generation | 8 seconds | Preview | +| veo-3.1-generate-preview | Veo 3.1 video generation | 8 seconds | Preview | + +## Video Generation Parameters + +LiteLLM automatically maps OpenAI-style parameters to Veo's format: + +| OpenAI Parameter | Veo Parameter | Description | Example | +|------------------|---------------|-------------|---------| +| `prompt` | `prompt` | Text description of the video | "A cat playing" | +| `size` | `aspectRatio` | Video dimensions → aspect ratio | "1280x720" → "16:9" | +| `seconds` | `durationSeconds` | Duration in seconds | "8" → 8 | +| `input_reference` | `image` | Reference image to animate | File object or path | +| `model` | `model` | Model to use | "gemini/veo-3.0-generate-preview" | + +### Size to Aspect Ratio Mapping + +LiteLLM automatically converts size dimensions to Veo's aspect ratio format: +- `"1280x720"`, `"1920x1080"` → `"16:9"` (landscape) +- `"720x1280"`, `"1080x1920"` → `"9:16"` (portrait) + +### Supported Veo Parameters + +Based on Veo's API: +- **prompt** (required): Text description with optional audio cues +- **aspectRatio**: `"16:9"` (default) or `"9:16"` +- **resolution**: 
`"720p"` (default) or `"1080p"` (Veo 3.1 only, 16:9 aspect ratio only) +- **durationSeconds**: Video length (max 8 seconds for most models) +- **image**: Reference image for animation +- **negativePrompt**: What to exclude from the video (Veo 3.1) +- **referenceImages**: Style and content references (Veo 3.1 only) + +## Complete Workflow Example + +```python +import litellm +import time + +def generate_and_download_veo_video( + prompt: str, + output_file: str = "video.mp4", + size: str = "1280x720", + seconds: str = "8" +): + """ + Complete workflow for Veo video generation. + + Args: + prompt: Text description of the video + output_file: Where to save the video + size: Video dimensions (e.g., "1280x720" for 16:9) + seconds: Duration in seconds + + Returns: + bool: True if successful + """ + print(f"🎬 Generating video: {prompt}") + + # Step 1: Initiate generation + response = litellm.video_generation( + model="gemini/veo-3.0-generate-preview", + prompt=prompt, + size=size, # Maps to aspectRatio + seconds=seconds # Maps to durationSeconds + ) + + video_id = response.id + print(f"✓ Video generation started (ID: {video_id})") + + # Step 2: Wait for completion + max_wait_time = 600 # 10 minutes + start_time = time.time() + + while time.time() - start_time < max_wait_time: + status_response = litellm.video_status(video_id=video_id) + + if status_response.status == "completed": + print("✓ Video generation completed!") + break + elif status_response.status == "failed": + print("✗ Video generation failed") + return False + + print(f"⏳ Status: {status_response.status}") + time.sleep(10) + else: + print("✗ Timeout waiting for video generation") + return False + + # Step 3: Download video + print("⬇️ Downloading video...") + video_bytes = litellm.video_content(video_id=video_id) + + with open(output_file, "wb") as f: + f.write(video_bytes) + + print(f"✓ Video saved to {output_file}") + return True + +# Use it +generate_and_download_veo_video( + prompt="A serene lake at sunset with mountains in the background", + output_file="sunset_lake.mp4" +) +``` + +## Async Usage + +```python +from litellm import avideo_generation, avideo_status, avideo_content +import asyncio + +async def async_video_workflow(): + # Generate video + response = await avideo_generation( + model="gemini/veo-3.0-generate-preview", + prompt="A cat playing with a ball of yarn" + ) + + # Poll for completion + while True: + status = await avideo_status(video_id=response.id) + if status.status == "completed": + break + await asyncio.sleep(10) + + # Download content + video_bytes = await avideo_content(video_id=response.id) + + with open("video.mp4", "wb") as f: + f.write(video_bytes) + +# Run it +asyncio.run(async_video_workflow()) +``` + +## LiteLLM Proxy Usage + +### Configuration + +Add Veo models to your `config.yaml`: + +```yaml +model_list: + - model_name: veo-3 + litellm_params: + model: gemini/veo-3.0-generate-preview + api_key: os.environ/GEMINI_API_KEY +``` + +Start the proxy: + +```bash +litellm --config config.yaml +# Server running on http://0.0.0.0:4000 +``` + +### Making Requests + + + + +```bash +# Step 1: Generate video +curl --location 'http://0.0.0.0:4000/v1/videos' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer sk-1234' \ +--data '{ + "model": "veo-3", + "prompt": "A cat playing with a ball of yarn in a sunny garden" +}' + +# Response: {"id": "gemini::operations/generate_12345::...", "status": "processing", ...} + +# Step 2: Check status +curl --location 
'http://localhost:4000/v1/videos/{video_id}' \ +--header 'x-litellm-api-key: sk-1234' + +# Step 3: Download video (when status is "completed") +curl --location 'http://localhost:4000/v1/videos/{video_id}/content' \ +--header 'x-litellm-api-key: sk-1234' \ +--output video.mp4 +``` + + + + +```python +import litellm + +litellm.api_base = "http://0.0.0.0:4000" +litellm.api_key = "sk-1234" + +# Generate video +response = litellm.video_generation( + model="veo-3", + prompt="A cat playing with a ball of yarn in a sunny garden" +) + +# Check status +import time +while True: + status = litellm.video_status(video_id=response.id) + if status.status == "completed": + break + time.sleep(10) + +# Download video +video_bytes = litellm.video_content(video_id=response.id) +with open("video.mp4", "wb") as f: + f.write(video_bytes) +``` + + + + +## Cost Tracking + +LiteLLM automatically tracks costs for Veo video generation: + +```python +response = litellm.video_generation( + model="gemini/veo-3.0-generate-preview", + prompt="A beautiful sunset" +) + +# Cost is calculated based on video duration +# Veo pricing: ~$0.10 per second (estimated) +# Default video duration: ~5 seconds +# Estimated cost: ~$0.50 +``` + +## Differences from OpenAI Video API + +| Feature | OpenAI (Sora) | Gemini (Veo) | +|---------|---------------|--------------| +| Reference Images | ✅ Supported | ❌ Not supported | +| Size Control | ✅ Supported | ❌ Not supported | +| Duration Control | ✅ Supported | ❌ Not supported | +| Video Remix/Edit | ✅ Supported | ❌ Not supported | +| Video List | ✅ Supported | ❌ Not supported | +| Prompt-based Generation | ✅ Supported | ✅ Supported | +| Async Operations | ✅ Supported | ✅ Supported | + +## Error Handling + +```python +from litellm import video_generation, video_status, video_content +from litellm.exceptions import APIError, Timeout + +try: + response = video_generation( + model="gemini/veo-3.0-generate-preview", + prompt="A beautiful landscape" + ) + + # Poll with timeout + max_attempts = 60 # 10 minutes (60 * 10s) + for attempt in range(max_attempts): + status = video_status(video_id=response.id) + + if status.status == "completed": + video_bytes = video_content(video_id=response.id) + with open("video.mp4", "wb") as f: + f.write(video_bytes) + break + elif status.status == "failed": + raise APIError("Video generation failed") + + time.sleep(10) + else: + raise Timeout("Video generation timed out") + +except APIError as e: + print(f"API Error: {e}") +except Timeout as e: + print(f"Timeout: {e}") +except Exception as e: + print(f"Unexpected error: {e}") +``` + +## Best Practices + +1. **Always poll for completion**: Veo video generation is asynchronous and can take several minutes +2. **Set reasonable timeouts**: Allow at least 5-10 minutes for video generation +3. **Handle failures gracefully**: Check for `failed` status and implement retry logic +4. **Use descriptive prompts**: More detailed prompts generally produce better results +5. 
**Store video IDs**: Save the operation ID/video ID to resume polling if your application restarts + +## Troubleshooting + +### Video generation times out + +```python +# Increase polling timeout +max_wait_time = 900 # 15 minutes instead of 10 +``` + +### Video not found when downloading + +```python +# Make sure video is completed before downloading +status = video_status(video_id=video_id) +if status.status != "completed": + print("Video not ready yet!") +``` + +### API key errors + +```python +# Verify your API key is set +import os +print(os.environ.get("GEMINI_API_KEY")) + +# Or pass it explicitly +response = video_generation( + model="gemini/veo-3.0-generate-preview", + prompt="...", + api_key="your-api-key-here" +) +``` + +## See Also + +- [OpenAI Video Generation](../openai/videos.md) +- [Azure Video Generation](../azure/videos.md) +- [Vertex AI Video Generation](../vertex_ai/videos.md) +- [Video Generation API Reference](/docs/videos) +- [Veo Pass-through Endpoints](/docs/pass_through/google_ai_studio#example-4-video-generation-with-veo) + diff --git a/docs/my-website/docs/providers/gemini_file_search.md b/docs/my-website/docs/providers/gemini_file_search.md new file mode 100644 index 000000000000..947715218a3c --- /dev/null +++ b/docs/my-website/docs/providers/gemini_file_search.md @@ -0,0 +1,414 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Gemini File Search + +Use Google Gemini's File Search for Retrieval Augmented Generation (RAG) with LiteLLM. + +Gemini File Search imports, chunks, and indexes your data to enable fast retrieval of relevant information based on user prompts. This information is then provided as context to the model for more accurate and relevant answers. + +[Official Gemini File Search Documentation](https://ai.google.dev/gemini-api/docs/file-search) + +## Features + +| Feature | Supported | Notes | +|---------|-----------|-------| +| Cost Tracking | ❌ | Cost calculation not yet implemented | +| Logging | ✅ | Full request/response logging | +| RAG Ingest API | ✅ | Upload → Chunk → Embed → Store | +| Vector Store Search | ✅ | Search with metadata filters | +| Custom Chunking | ✅ | Configure chunk size and overlap | +| Metadata Filtering | ✅ | Filter by custom metadata | +| Citations | ✅ | Extract from grounding metadata | + +## Quick Start + +### Setup + +Set your Gemini API key: + +```bash +export GEMINI_API_KEY="your-api-key" +# or +export GOOGLE_API_KEY="your-api-key" +``` + +### Basic RAG Ingest + + + + +```python +import litellm + +# Ingest a document +response = await litellm.aingest( + ingest_options={ + "name": "my-document-store", + "vector_store": { + "custom_llm_provider": "gemini" + } + }, + file_data=("document.txt", b"Your document content", "text/plain") +) + +print(f"Vector Store ID: {response['vector_store_id']}") +print(f"File ID: {response['file_id']}") +``` + + + + + +```bash +curl -X POST "http://localhost:4000/v1/rag/ingest" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "file": { + "filename": "document.txt", + "content": "'$(base64 -i document.txt)'", + "content_type": "text/plain" + }, + "ingest_options": { + "name": "my-document-store", + "vector_store": { + "custom_llm_provider": "gemini" + } + } + }' +``` + + + + +### Search Vector Store + + + + +```python +import litellm + +# Search the vector store +response = await litellm.vector_stores.asearch( + vector_store_id="fileSearchStores/your-store-id", + query="What is the main topic?", + 
custom_llm_provider="gemini", + max_num_results=5 +) + +for result in response["data"]: + print(f"Score: {result.get('score')}") + print(f"Content: {result['content'][0]['text']}") +``` + + + + + +```bash +curl -X POST "http://localhost:4000/v1/vector_stores/fileSearchStores/your-store-id/search" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What is the main topic?", + "custom_llm_provider": "gemini", + "max_num_results": 5 + }' +``` + + + + +## Advanced Features + +### Custom Chunking Configuration + +Control how documents are split into chunks: + +```python +import litellm + +response = await litellm.aingest( + ingest_options={ + "name": "custom-chunking-store", + "vector_store": { + "custom_llm_provider": "gemini" + }, + "chunking_strategy": { + "white_space_config": { + "max_tokens_per_chunk": 200, + "max_overlap_tokens": 20 + } + } + }, + file_data=("document.txt", document_content, "text/plain") +) +``` + +**Chunking Parameters:** +- `max_tokens_per_chunk`: Maximum tokens per chunk (default: 800, min: 100, max: 4096) +- `max_overlap_tokens`: Overlap between chunks (default: 400) + +### Metadata Filtering + +Attach custom metadata to files and filter searches: + +#### Attach Metadata During Ingest + +```python +import litellm + +response = await litellm.aingest( + ingest_options={ + "name": "metadata-store", + "vector_store": { + "custom_llm_provider": "gemini", + "custom_metadata": [ + {"key": "author", "string_value": "John Doe"}, + {"key": "year", "numeric_value": 2024}, + {"key": "category", "string_value": "documentation"} + ] + } + }, + file_data=("document.txt", document_content, "text/plain") +) +``` + +#### Search with Metadata Filter + +```python +import litellm + +response = await litellm.vector_stores.asearch( + vector_store_id="fileSearchStores/your-store-id", + query="What is LiteLLM?", + custom_llm_provider="gemini", + filters={"author": "John Doe", "category": "documentation"} +) +``` + +**Filter Syntax:** +- Simple equality: `{"key": "value"}` +- Gemini converts to: `key="value"` +- Multiple filters combined with AND + +### Using Existing Vector Store + +Ingest into an existing File Search store: + +```python +import litellm + +# First, create a store +create_response = await litellm.vector_stores.acreate( + name="My Persistent Store", + custom_llm_provider="gemini" +) +store_id = create_response["id"] + +# Then ingest multiple documents into it +for doc in documents: + await litellm.aingest( + ingest_options={ + "vector_store": { + "custom_llm_provider": "gemini", + "vector_store_id": store_id # Reuse existing store + } + }, + file_data=(doc["name"], doc["content"], doc["type"]) + ) +``` + +### Citation Extraction + +Gemini provides grounding metadata with citations: + +```python +import litellm + +response = await litellm.vector_stores.asearch( + vector_store_id="fileSearchStores/your-store-id", + query="Explain the concept", + custom_llm_provider="gemini" +) + +for result in response["data"]: + # Access citation information + if "attributes" in result: + print(f"URI: {result['attributes'].get('uri')}") + print(f"Title: {result['attributes'].get('title')}") + + # Content with relevance score + print(f"Score: {result.get('score')}") + print(f"Text: {result['content'][0]['text']}") +``` + +## Complete Example + +End-to-end workflow: + +```python +import litellm + +# 1. 
Create a File Search store +store_response = await litellm.vector_stores.acreate( + name="Knowledge Base", + custom_llm_provider="gemini" +) +store_id = store_response["id"] +print(f"Created store: {store_id}") + +# 2. Ingest documents with custom chunking and metadata +documents = [ + { + "name": "intro.txt", + "content": b"Introduction to LiteLLM...", + "metadata": [ + {"key": "section", "string_value": "intro"}, + {"key": "priority", "numeric_value": 1} + ] + }, + { + "name": "advanced.txt", + "content": b"Advanced features...", + "metadata": [ + {"key": "section", "string_value": "advanced"}, + {"key": "priority", "numeric_value": 2} + ] + } +] + +for doc in documents: + ingest_response = await litellm.aingest( + ingest_options={ + "name": f"ingest-{doc['name']}", + "vector_store": { + "custom_llm_provider": "gemini", + "vector_store_id": store_id, + "custom_metadata": doc["metadata"] + }, + "chunking_strategy": { + "white_space_config": { + "max_tokens_per_chunk": 300, + "max_overlap_tokens": 50 + } + } + }, + file_data=(doc["name"], doc["content"], "text/plain") + ) + print(f"Ingested: {doc['name']}") + +# 3. Search with filters +search_response = await litellm.vector_stores.asearch( + vector_store_id=store_id, + query="How do I get started?", + custom_llm_provider="gemini", + filters={"section": "intro"}, + max_num_results=3 +) + +# 4. Process results +for i, result in enumerate(search_response["data"]): + print(f"\nResult {i+1}:") + print(f" Score: {result.get('score')}") + print(f" File: {result.get('filename')}") + print(f" Content: {result['content'][0]['text'][:100]}...") +``` + +## Supported File Types + +Gemini File Search supports a wide range of file formats: + +### Documents +- PDF (`application/pdf`) +- Microsoft Word (`.docx`, `.doc`) +- Microsoft Excel (`.xlsx`, `.xls`) +- Microsoft PowerPoint (`.pptx`) +- OpenDocument formats (`.odt`, `.ods`, `.odp`) + +### Text Files +- Plain text (`text/plain`) +- Markdown (`text/markdown`) +- HTML (`text/html`) +- CSV (`text/csv`) +- JSON (`application/json`) +- XML (`application/xml`) + +### Code Files +- Python, JavaScript, TypeScript, Java, C/C++, Go, Rust, etc. +- Most common programming languages supported + +See [Gemini's full list of supported file types](https://ai.google.dev/gemini-api/docs/file-search#supported-file-types). + +## Pricing + +- **Indexing**: $0.15 per 1M tokens (embedding pricing) +- **Storage**: Free +- **Query embeddings**: Free +- **Retrieved tokens**: Charged as regular context tokens + +## Supported Models + +File Search works with: +- `gemini-3-pro-preview` +- `gemini-2.5-pro` +- `gemini-2.5-flash` (and preview versions) +- `gemini-2.5-flash-lite` (and preview versions) + +## Troubleshooting + +### Authentication Errors + +```python +# Ensure API key is set +import os +os.environ["GEMINI_API_KEY"] = "your-api-key" + +# Or pass explicitly +response = await litellm.aingest( + ingest_options={ + "vector_store": { + "custom_llm_provider": "gemini", + "api_key": "your-api-key" + } + }, + file_data=(...) +) +``` + +### Store Not Found + +Ensure you're using the full store name format: +- ✅ `fileSearchStores/abc123` +- ❌ `abc123` + +### Large Files + +For files >100MB, split them into smaller chunks before ingestion. + +### Slow Indexing + +After ingestion, Gemini may need time to index documents. Wait a few seconds before searching: + +```python +import time + +# After ingest +await litellm.aingest(...) + +# Wait for indexing +time.sleep(5) + +# Then search +await litellm.vector_stores.asearch(...) 
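# A fixed sleep is only a heuristic. A more patient sketch (assuming the same
# vector_store_id / query / custom_llm_provider arguments as in the examples above)
# is to poll until the store returns results, backing off between attempts:
for attempt in range(5):
    results = await litellm.vector_stores.asearch(...)
    if results["data"]:
        break
    time.sleep(2 * (attempt + 1))  # back off 2s, 4s, 6s, ...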
+``` + +## Related Resources + +- [Gemini File Search Official Docs](https://ai.google.dev/gemini-api/docs/file-search) +- [LiteLLM RAG Ingest API](/docs/rag_ingest) +- [LiteLLM Vector Store Search](/docs/vector_stores/search) +- [Using Vector Stores with Chat](/docs/completion/knowledgebase) + diff --git a/docs/my-website/docs/providers/gigachat.md b/docs/my-website/docs/providers/gigachat.md new file mode 100644 index 000000000000..13eec298c25d --- /dev/null +++ b/docs/my-website/docs/providers/gigachat.md @@ -0,0 +1,283 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# GigaChat +https://developers.sber.ru/docs/ru/gigachat/api/overview + +GigaChat is Sber AI's large language model, Russia's leading LLM provider. + +:::tip + +**We support ALL GigaChat models, just set `model=gigachat/` as a prefix when sending litellm requests** + +::: + +:::warning + +GigaChat API uses self-signed SSL certificates. You must pass `ssl_verify=False` in your requests. + +::: + +## Supported Features + +| Feature | Supported | +|---------|-----------| +| Chat Completion | Yes | +| Streaming | Yes | +| Async | Yes | +| Function Calling / Tools | Yes | +| Structured Output (JSON Schema) | Yes (via function call emulation) | +| Image Input | Yes (base64 and URL) - GigaChat-2-Max, GigaChat-2-Pro only | +| Embeddings | Yes | + +## API Key + +GigaChat uses OAuth authentication. Set your credentials as environment variables: + +```python +import os + +# Required: Set credentials (base64-encoded client_id:client_secret) +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +# Optional: Set scope (default is GIGACHAT_API_PERS for personal use) +os.environ['GIGACHAT_SCOPE'] = "GIGACHAT_API_PERS" # or GIGACHAT_API_B2B for business +``` + +Get your credentials at: https://developers.sber.ru/studio/ + +## Sample Usage + +```python +from litellm import completion +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +response = completion( + model="gigachat/GigaChat-2-Max", + messages=[ + {"role": "user", "content": "Hello from LiteLLM!"} + ], + ssl_verify=False, # Required for GigaChat +) +print(response) +``` + +## Sample Usage - Streaming + +```python +from litellm import completion +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +response = completion( + model="gigachat/GigaChat-2-Max", + messages=[ + {"role": "user", "content": "Hello from LiteLLM!"} + ], + stream=True, + ssl_verify=False, # Required for GigaChat +) + +for chunk in response: + print(chunk) +``` + +## Sample Usage - Function Calling + +```python +from litellm import completion +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "description": "Get weather for a city", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "City name"} + }, + "required": ["city"] + } + } +}] + +response = completion( + model="gigachat/GigaChat-2-Max", + messages=[{"role": "user", "content": "What's the weather in Moscow?"}], + tools=tools, + ssl_verify=False, # Required for GigaChat +) +print(response) +``` + +## Sample Usage - Structured Output + +GigaChat supports structured output via JSON schema (emulated through function calling): + +```python +from litellm import completion +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +response = completion( + model="gigachat/GigaChat-2-Max", + messages=[{"role": 
"user", "content": "Extract info: John is 30 years old"}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + } + } + } + }, + ssl_verify=False, # Required for GigaChat +) +print(response) # Returns JSON: {"name": "John", "age": 30} +``` + +## Sample Usage - Image Input + +GigaChat supports image input via base64 or URL (GigaChat-2-Max and GigaChat-2-Pro only): + +```python +from litellm import completion +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +response = completion( + model="gigachat/GigaChat-2-Max", # Vision requires GigaChat-2-Max or GigaChat-2-Pro + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + }], + ssl_verify=False, # Required for GigaChat +) +print(response) +``` + +## Sample Usage - Embeddings + +```python +from litellm import embedding +import os + +os.environ['GIGACHAT_CREDENTIALS'] = "your-credentials-here" + +response = embedding( + model="gigachat/Embeddings", + input=["Hello world", "How are you?"], + ssl_verify=False, # Required for GigaChat +) +print(response) +``` + +## Usage with LiteLLM Proxy + +### 1. Set GigaChat Models on config.yaml + +```yaml +model_list: + - model_name: gigachat + litellm_params: + model: gigachat/GigaChat-2-Max + api_key: "os.environ/GIGACHAT_CREDENTIALS" + ssl_verify: false + - model_name: gigachat-lite + litellm_params: + model: gigachat/GigaChat-2-Lite + api_key: "os.environ/GIGACHAT_CREDENTIALS" + ssl_verify: false + - model_name: gigachat-embeddings + litellm_params: + model: gigachat/Embeddings + api_key: "os.environ/GIGACHAT_CREDENTIALS" + ssl_verify: false +``` + +### 2. Start Proxy + +```bash +litellm --config config.yaml +``` + +### 3. Test it + + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "gigachat", + "messages": [ + { + "role": "user", + "content": "Hello!" + } + ] +}' +``` + + + +```python +import openai +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="gigachat", + messages=[{"role": "user", "content": "Hello!"}] +) +print(response) +``` + + + +## Supported Models + +### Chat Models + +| Model Name | Context Window | Vision | Description | +|------------|----------------|--------|-------------| +| gigachat/GigaChat-2-Lite | 128K | No | Fast, lightweight model | +| gigachat/GigaChat-2-Pro | 128K | Yes | Professional model with vision | +| gigachat/GigaChat-2-Max | 128K | Yes | Maximum capability model | + +### Embedding Models + +| Model Name | Max Input | Dimensions | Description | +|------------|-----------|------------|-------------| +| gigachat/Embeddings | 512 | 1024 | Standard embeddings | +| gigachat/Embeddings-2 | 512 | 1024 | Updated embeddings | +| gigachat/EmbeddingsGigaR | 4096 | 2560 | High-dimensional embeddings | + +:::note +Available models may vary depending on your API access level (personal or business). 
+::: + +## Limitations + +- Only one function call per request (GigaChat API limitation) +- Maximum 1 image per message, 10 images total per conversation +- GigaChat API uses self-signed SSL certificates - `ssl_verify=False` is required diff --git a/docs/my-website/docs/providers/github_copilot.md b/docs/my-website/docs/providers/github_copilot.md index 2ebe6eacb1c4..306c9f949ec3 100644 --- a/docs/my-website/docs/providers/github_copilot.md +++ b/docs/my-website/docs/providers/github_copilot.md @@ -15,7 +15,7 @@ https://docs.github.com/en/copilot |-------|-------| | Description | GitHub Copilot Chat API provides access to GitHub's AI-powered coding assistant. | | Provider Route on LiteLLM | `github_copilot/` | -| Supported Endpoints | `/chat/completions` | +| Supported Endpoints | `/chat/completions`, `/embeddings` | | API Reference | [GitHub Copilot docs](https://docs.github.com/en/copilot) | ## Authentication @@ -62,6 +62,34 @@ for chunk in stream: print(chunk.choices[0].delta.content, end="") ``` +### Responses + +For GPT Codex models, only responses API is supported. + +```python showLineNumbers title="GitHub Copilot Responses" +import litellm + +response = await litellm.aresponses( + model="github_copilot/gpt-5.1-codex", + input="Write a Python hello world", + max_output_tokens=500 +) + +print(response) +``` + +### Embedding + +```python showLineNumbers title="GitHub Copilot Embedding" +import litellm + +response = litellm.embedding( + model="github_copilot/text-embedding-3-small", + input=["good morning from litellm"] +) +print(response) +``` + ## Usage - LiteLLM Proxy Add the following to your LiteLLM Proxy configuration file: @@ -71,6 +99,16 @@ model_list: - model_name: github_copilot/gpt-4 litellm_params: model: github_copilot/gpt-4 + - model_name: github_copilot/gpt-5.1-codex + model_info: + mode: responses + litellm_params: + model: github_copilot/gpt-5.1-codex + - model_name: github_copilot/text-embedding-ada-002 + model_info: + mode: embedding + litellm_params: + model: github_copilot/text-embedding-ada-002 ``` Start your LiteLLM Proxy server: @@ -180,7 +218,7 @@ extra_headers = { "editor-version": "vscode/1.85.1", # Editor version "editor-plugin-version": "copilot/1.155.0", # Plugin version "Copilot-Integration-Id": "vscode-chat", # Integration ID - "user-agent": "GithubCopilot/1.155.0" # User agent + "user-agent": "GithubCopilot/1.155.0" # User agent } ``` diff --git a/docs/my-website/docs/providers/gmi.md b/docs/my-website/docs/providers/gmi.md new file mode 100644 index 000000000000..8e321463239d --- /dev/null +++ b/docs/my-website/docs/providers/gmi.md @@ -0,0 +1,140 @@ +# GMI Cloud + +## Overview + +| Property | Details | +|-------|-------| +| Description | GMI Cloud is a GPU cloud infrastructure provider offering access to top AI models including Claude, GPT, DeepSeek, Gemini, and more through OpenAI-compatible APIs. | +| Provider Route on LiteLLM | `gmi/` | +| Link to Provider Doc | [GMI Cloud Docs ↗](https://docs.gmicloud.ai) | +| Base URL | `https://api.gmi-serving.com/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage), [`/models`](#supported-models) | + +
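Since the gateway is OpenAI-compatible, a quick way to confirm your key works and to see the live model list is to call the `/models` route on the base URL above. This is a sketch only; it assumes standard OpenAI-style bearer authentication, which the table does not spell out:

```python
import os
from openai import OpenAI

# Point the OpenAI SDK at GMI Cloud's OpenAI-compatible base URL
client = OpenAI(
    base_url="https://api.gmi-serving.com/v1",
    api_key=os.environ["GMI_API_KEY"],  # assumes the key is exported as shown below
)

# List the models your key can access
for model in client.models.list():
    print(model.id)
```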
+ +## What is GMI Cloud? + +GMI Cloud is a venture-backed digital infrastructure company ($82M+ funding) providing: +- **Top-tier GPU Access**: NVIDIA H100 GPUs for AI workloads +- **Multiple AI Models**: Claude, GPT, DeepSeek, Gemini, Kimi, Qwen, and more +- **OpenAI-Compatible API**: Drop-in replacement for OpenAI SDK +- **Global Infrastructure**: Data centers in US (Colorado) and APAC (Taiwan) + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["GMI_API_KEY"] = "" # your GMI Cloud API key +``` + +Get your GMI Cloud API key from [console.gmicloud.ai](https://console.gmicloud.ai). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="GMI Cloud Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["GMI_API_KEY"] = "" # your GMI Cloud API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# GMI Cloud call +response = completion( + model="gmi/deepseek-ai/DeepSeek-V3.2", + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="GMI Cloud Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["GMI_API_KEY"] = "" # your GMI Cloud API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# GMI Cloud call with streaming +response = completion( + model="gmi/anthropic/claude-sonnet-4.5", + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export GMI_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: deepseek-v3 + litellm_params: + model: gmi/deepseek-ai/DeepSeek-V3.2 + api_key: os.environ/GMI_API_KEY + - model_name: claude-sonnet + litellm_params: + model: gmi/anthropic/claude-sonnet-4.5 + api_key: os.environ/GMI_API_KEY +``` + +## Supported Models + +| Model | Model ID | Context Length | +|-------|----------|----------------| +| Claude Opus 4.5 | `gmi/anthropic/claude-opus-4.5` | 409K | +| Claude Sonnet 4.5 | `gmi/anthropic/claude-sonnet-4.5` | 409K | +| Claude Sonnet 4 | `gmi/anthropic/claude-sonnet-4` | 409K | +| Claude Opus 4 | `gmi/anthropic/claude-opus-4` | 409K | +| GPT-5.2 | `gmi/openai/gpt-5.2` | 409K | +| GPT-5.1 | `gmi/openai/gpt-5.1` | 409K | +| GPT-5 | `gmi/openai/gpt-5` | 409K | +| GPT-4o | `gmi/openai/gpt-4o` | 131K | +| GPT-4o-mini | `gmi/openai/gpt-4o-mini` | 131K | +| DeepSeek V3.2 | `gmi/deepseek-ai/DeepSeek-V3.2` | 163K | +| DeepSeek V3 0324 | `gmi/deepseek-ai/DeepSeek-V3-0324` | 163K | +| Gemini 3 Pro | `gmi/google/gemini-3-pro-preview` | 1M | +| Gemini 3 Flash | `gmi/google/gemini-3-flash-preview` | 1M | +| Kimi K2 Thinking | `gmi/moonshotai/Kimi-K2-Thinking` | 262K | +| MiniMax M2.1 | `gmi/MiniMaxAI/MiniMax-M2.1` | 196K | +| Qwen3-VL 235B | `gmi/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8` | 262K | +| GLM-4.7 | `gmi/zai-org/GLM-4.7-FP8` | 202K | + +## Supported OpenAI Parameters + +GMI Cloud supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID from available models | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. 
Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `response_format` | object | Optional. JSON mode with `{"type": "json_object"}` | + +## Additional Resources + +- [GMI Cloud Website](https://www.gmicloud.ai) +- [GMI Cloud Documentation](https://docs.gmicloud.ai) +- [GMI Cloud Console](https://console.gmicloud.ai) diff --git a/docs/my-website/docs/providers/google_ai_studio/files.md b/docs/my-website/docs/providers/google_ai_studio/files.md index ce61ce1a90bb..17fe6e73d94d 100644 --- a/docs/my-website/docs/providers/google_ai_studio/files.md +++ b/docs/my-website/docs/providers/google_ai_studio/files.md @@ -159,3 +159,150 @@ print(completion.choices[0].message) +## Azure Blob Storage Integration + +LiteLLM supports using Azure Blob Storage as a target storage backend for Gemini file uploads. This allows you to store files in Azure Data Lake Storage Gen2 instead of Google's managed storage. + +### Step 1: Setup Azure Blob Storage + +Configure your Azure Blob Storage account by setting the following environment variables: + +**Required Environment Variables:** +- `AZURE_STORAGE_ACCOUNT_NAME` - Your Azure Storage account name +- `AZURE_STORAGE_FILE_SYSTEM` - The container/filesystem name where files will be stored +- `AZURE_STORAGE_ACCOUNT_KEY` - Your account key + +### Step 2: Pass Azure Blob Storage as Target Storage + +When uploading files, specify `target_storage: "azure_storage"` to use Azure Blob Storage instead of the default storage. + +**Supported File Types:** + +Azure Blob Storage supports all Gemini-compatible file types: + +- **Images**: PNG, JPEG, WEBP +- **Audio**: AAC, FLAC, MP3, MPA, MPEG, MPGA, OPUS, PCM, WAV, WEBM +- **Video**: FLV, MOV, MPEG, MPEGPS, MPG, MP4, WEBM, WMV, 3GPP +- **Documents**: PDF, TXT + +> **Note:** Only small files can be sent as inline data because the total request size limit is 20 MB. + + +### Step 3: Upload Files with Azure Blob Storage for Gemini + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: "gemini-2.5-flash" + litellm_params: + model: gemini/gemini-2.5-flash + api_key: os.environ/GEMINI_API_KEY +``` + +2. Set environment variables + +```bash +export AZURE_STORAGE_ACCOUNT_NAME="your-storage-account" +export AZURE_STORAGE_FILE_SYSTEM="your-container-name" +export AZURE_STORAGE_ACCOUNT_KEY="your-account-key" +``` +or add them in your `.env` + +3. Start proxy + +```bash +litellm --config config.yaml +``` + +4. 
Upload file with Azure Blob Storage + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://0.0.0.0:4000", + api_key="sk-1234" +) + +# Upload file to Azure Blob Storage +file = client.files.create( + file=open("document.pdf", "rb"), + purpose="user_data", + extra_body={ + "target_model_names": "gemini-2.0-flash", + "target_storage": "azure_storage" # 👈 Use Azure Blob Storage + } +) + +print(f"File uploaded to Azure Blob Storage: {file.id}") + +# Use the file with Gemini +completion = client.chat.completions.create( + model="gemini-2.0-flash", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Summarize this document"}, + { + "type": "file", + "file": { + "file_id": file.id, + } + } + ] + } + ] +) + +print(completion.choices[0].message.content) +``` + + + + +```bash +# Upload file with Azure Blob Storage +curl -X POST "http://0.0.0.0:4000/v1/files" \ + -H "Authorization: Bearer sk-1234" \ + -F "file=@document.pdf" \ + -F "purpose=user_data" \ + -F "target_storage=azure_storage" \ + -F "target_model_names=gemini-2.0-flash" \ + -F "custom_llm_provider=gemini" + +# Use the file with Gemini +curl -X POST "http://0.0.0.0:4000/v1/chat/completions" \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "gemini-2.0-flash", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Summarize this document"}, + { + "type": "file", + "file": { + "file_id": "file-id-from-upload", + "format": "application/pdf" + } + } + ] + } + ] + }' +``` + + + + +:::info +Files uploaded to Azure Blob Storage are stored in your Azure account and can be accessed via the returned file ID. The file URL format is: `https://{account}.blob.core.windows.net/{container}/{path}` +::: + diff --git a/docs/my-website/docs/providers/groq.md b/docs/my-website/docs/providers/groq.md index 59668b5eb5fc..55c222635d24 100644 --- a/docs/my-website/docs/providers/groq.md +++ b/docs/my-website/docs/providers/groq.md @@ -150,15 +150,15 @@ We support ALL Groq models, just set `groq/` as a prefix when sending completion | Model Name | Usage | |--------------------|---------------------------------------------------------| -| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` | -| llama-3.1-70b-versatile | `completion(model="groq/llama-3.1-70b-versatile", messages)` | -| llama3-8b-8192 | `completion(model="groq/llama3-8b-8192", messages)` | -| llama3-70b-8192 | `completion(model="groq/llama3-70b-8192", messages)` | -| llama2-70b-4096 | `completion(model="groq/llama2-70b-4096", messages)` | -| mixtral-8x7b-32768 | `completion(model="groq/mixtral-8x7b-32768", messages)` | -| gemma-7b-it | `completion(model="groq/gemma-7b-it", messages)` | -| moonshotai/kimi-k2-instruct | `completion(model="groq/moonshotai/kimi-k2-instruct", messages)` | -| qwen3-32b | `completion(model="groq/qwen/qwen3-32b", messages)` | +| llama-3.3-70b-versatile | `completion(model="groq/llama-3.3-70b-versatile", messages)` | +| llama-3.1-8b-instant | `completion(model="groq/llama-3.1-8b-instant", messages)` | +| meta-llama/llama-4-scout-17b-16e-instruct | `completion(model="groq/meta-llama/llama-4-scout-17b-16e-instruct", messages)` | +| meta-llama/llama-4-maverick-17b-128e-instruct | `completion(model="groq/meta-llama/llama-4-maverick-17b-128e-instruct", messages)` | +| meta-llama/llama-guard-4-12b | `completion(model="groq/meta-llama/llama-guard-4-12b", messages)` | +| qwen/qwen3-32b | `completion(model="groq/qwen/qwen3-32b", 
messages)` | +| moonshotai/kimi-k2-instruct-0905 | `completion(model="groq/moonshotai/kimi-k2-instruct-0905", messages)` | +| openai/gpt-oss-120b | `completion(model="groq/openai/gpt-oss-120b", messages)` | +| openai/gpt-oss-20b | `completion(model="groq/openai/gpt-oss-20b", messages)` | ## Groq - Tool / Function Calling Example @@ -261,36 +261,33 @@ if tool_calls: print("second response\n", second_response) ``` -## Groq - Vision Example +## Groq - Vision Example -Select Groq models support vision. Check out their [model list](https://console.groq.com/docs/vision) for more details. +Groq's Llama 4 models support vision. Check out their [model list](https://console.groq.com/docs/vision) for more details. ```python -from litellm import completion - -import os +import os from litellm import completion os.environ["GROQ_API_KEY"] = "your-api-key" -# openai call response = completion( - model = "groq/llama-3.2-11b-vision-preview", + model = "groq/meta-llama/llama-4-scout-17b-16e-instruct", messages=[ { "role": "user", "content": [ { "type": "text", - "text": "What’s in this image?" + "text": "What's in this image?" }, { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -342,7 +339,7 @@ response = client.chat.completions.create( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] diff --git a/docs/my-website/docs/providers/helicone.md b/docs/my-website/docs/providers/helicone.md new file mode 100644 index 000000000000..3f0cfcbcb28e --- /dev/null +++ b/docs/my-website/docs/providers/helicone.md @@ -0,0 +1,268 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Helicone + +## Overview + +| Property | Details | +|-------|-------| +| Description | Helicone is an AI gateway and observability platform that provides OpenAI-compatible endpoints with advanced monitoring, caching, and analytics capabilities. | +| Provider Route on LiteLLM | `helicone/` | +| Link to Provider Doc | [Helicone Documentation ↗](https://docs.helicone.ai) | +| Base URL | `https://ai-gateway.helicone.ai/` | +| Supported Operations | [`/chat/completions`](#sample-usage), [`/completions`](#text-completion), [`/embeddings`](#embeddings) | + +
+ +**We support [ALL models available](https://helicone.ai/models) through Helicone's AI Gateway. Use `helicone/` as a prefix when sending requests.** + +## What is Helicone? + +Helicone is an open-source observability platform for LLM applications that provides: +- **Request Monitoring**: Track all LLM requests with detailed metrics +- **Caching**: Reduce costs and latency with intelligent caching +- **Rate Limiting**: Control request rates per user/key +- **Cost Tracking**: Monitor spend across models and users +- **Custom Properties**: Tag requests with metadata for filtering and analysis +- **Prompt Management**: Version control for prompts + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["HELICONE_API_KEY"] = "" # your Helicone API key +``` + +Get your Helicone API key from your [Helicone dashboard](https://helicone.ai). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Helicone Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# Helicone call - routes through Helicone gateway to OpenAI +response = completion( + model="helicone/gpt-4", + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Helicone Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# Helicone call with streaming +response = completion( + model="helicone/gpt-4", + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +### With Metadata (Helicone Custom Properties) + +```python showLineNumbers title="Helicone with Custom Properties" +import os +import litellm +from litellm import completion + +os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + +response = completion( + model="helicone/gpt-4o-mini", + messages=[{"role": "user", "content": "What's the weather like?"}], + metadata={ + "Helicone-Property-Environment": "production", + "Helicone-Property-User-Id": "user_123", + "Helicone-Property-Session-Id": "session_abc" + } +) + +print(response) +``` + +### Text Completion + +```python showLineNumbers title="Helicone Text Completion" +import os +import litellm + +os.environ["HELICONE_API_KEY"] = "" # your Helicone API key + +response = litellm.completion( + model="helicone/gpt-4o-mini", # text completion model + prompt="Once upon a time" +) + +print(response) +``` + + +## Retry and Fallback Mechanisms + +```python +import litellm + +litellm.api_base = "https://ai-gateway.helicone.ai/" +litellm.metadata = { + "Helicone-Retry-Enabled": "true", + "helicone-retry-num": "3", + "helicone-retry-factor": "2", +} + +response = litellm.completion( + model="helicone/gpt-4o-mini/openai,claude-3-5-sonnet-20241022/anthropic", # Try OpenAI first, then fallback to Anthropic, then continue with other models, + messages=[{"role": "user", "content": "Hello"}] +) +``` + +## Supported OpenAI Parameters + +Helicone supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID (e.g., gpt-4, claude-3-opus, etc.) 
| +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `n` | integer | Optional. Number of completions to generate | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. Control tool/function calling | +| `response_format` | object | Optional. Response format specification | +| `user` | string | Optional. User identifier | + +## Helicone-Specific Headers + +Pass these as metadata to leverage Helicone features: + +| Header | Description | +|--------|-------------| +| `Helicone-Property-*` | Custom properties for filtering (e.g., `Helicone-Property-User-Id`) | +| `Helicone-Cache-Enabled` | Enable caching for this request | +| `Helicone-User-Id` | User identifier for tracking | +| `Helicone-Session-Id` | Session identifier for grouping requests | +| `Helicone-Prompt-Id` | Prompt identifier for versioning | +| `Helicone-Rate-Limit-Policy` | Rate limiting policy name | + +Example with headers: + +```python showLineNumbers title="Helicone with Custom Headers" +import litellm + +response = litellm.completion( + model="helicone/gpt-4", + messages=[{"role": "user", "content": "Hello"}], + metadata={ + "Helicone-Cache-Enabled": "true", + "Helicone-Property-Environment": "production", + "Helicone-Property-User-Id": "user_123", + "Helicone-Session-Id": "session_abc", + "Helicone-Prompt-Id": "prompt_v1" + } +) +``` + +## Advanced Usage + +### Using with Different Providers + +Helicone acts as a gateway and supports multiple providers: + +```python showLineNumbers title="Helicone with Anthropic" +import litellm + +# Set both Helicone and Anthropic keys +os.environ["HELICONE_API_KEY"] = "your-helicone-key" + +response = litellm.completion( + model="helicone/claude-3.5-haiku/anthropic", + messages=[{"role": "user", "content": "Hello"}] +) +``` + +### Caching + +Enable caching to reduce costs and latency: + +```python showLineNumbers title="Helicone Caching" +import litellm + +response = litellm.completion( + model="helicone/gpt-4", + messages=[{"role": "user", "content": "What is 2+2?"}], + metadata={ + "Helicone-Cache-Enabled": "true" + } +) + +# Subsequent identical requests will be served from cache +response2 = litellm.completion( + model="helicone/gpt-4", + messages=[{"role": "user", "content": "What is 2+2?"}], + metadata={ + "Helicone-Cache-Enabled": "true" + } +) +``` + +## Features + +### Request Monitoring +- Track all requests with detailed metrics +- View request/response pairs +- Monitor latency and errors +- Filter by custom properties + +### Cost Tracking +- Per-model cost tracking +- Per-user cost tracking +- Cost alerts and budgets +- Historical cost analysis + +### Rate Limiting +- Per-user rate limits +- Per-API key rate limits +- Custom rate limit policies +- Automatic enforcement + +### Analytics +- Request volume trends +- Cost trends +- Latency percentiles +- Error rates + +Visit [Helicone Pricing](https://helicone.ai/pricing) for details. 
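## Embeddings

The overview table lists `/embeddings` as a supported operation. A minimal sketch is below; the `helicone/text-embedding-3-small` model name is an illustrative assumption, so substitute whichever embedding model your gateway routes:

```python showLineNumbers title="Helicone Embeddings"
import os
import litellm

os.environ["HELICONE_API_KEY"] = ""  # your Helicone API key

# Embedding request routed through the Helicone gateway
response = litellm.embedding(
    model="helicone/text-embedding-3-small",  # assumed example model name
    input=["good morning from litellm"],
)

print(response)
```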
+ +## Additional Resources + +- [Helicone Official Documentation](https://docs.helicone.ai) +- [Helicone Dashboard](https://helicone.ai) +- [Helicone GitHub](https://github.com/Helicone/helicone) +- [API Reference](https://docs.helicone.ai/rest/ai-gateway/post-v1-chat-completions) + diff --git a/docs/my-website/docs/providers/huggingface.md b/docs/my-website/docs/providers/huggingface.md index 399d49b5f465..985351e9f694 100644 --- a/docs/my-website/docs/providers/huggingface.md +++ b/docs/my-website/docs/providers/huggingface.md @@ -130,7 +130,7 @@ messages=[ { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png", } }, ], @@ -250,7 +250,7 @@ messages=[ { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png", } }, ], diff --git a/docs/my-website/docs/providers/langgraph.md b/docs/my-website/docs/providers/langgraph.md new file mode 100644 index 000000000000..9b4b24cf8f50 --- /dev/null +++ b/docs/my-website/docs/providers/langgraph.md @@ -0,0 +1,297 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# LangGraph + +Call LangGraph agents through LiteLLM using the OpenAI chat completions format. + +| Property | Details | +|----------|---------| +| Description | LangGraph is a framework for building stateful, multi-actor applications with LLMs. LiteLLM supports calling LangGraph agents via their streaming and non-streaming endpoints. | +| Provider Route on LiteLLM | `langgraph/{agent_id}` | +| Provider Doc | [LangGraph Platform ↗](https://langchain-ai.github.io/langgraph/cloud/quick_start/) | + +**Prerequisites:** You need a running LangGraph server. See [Setting Up a Local LangGraph Server](#setting-up-a-local-langgraph-server) below. + +## Quick Start + +### Model Format + +```shell showLineNumbers title="Model Format" +langgraph/{agent_id} +``` + +**Example:** +- `langgraph/agent` - calls the default agent + +### LiteLLM Python SDK + +```python showLineNumbers title="Basic LangGraph Completion" +import litellm + +response = litellm.completion( + model="langgraph/agent", + messages=[ + {"role": "user", "content": "What is 25 * 4?"} + ], + api_base="http://localhost:2024", +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Streaming LangGraph Response" +import litellm + +response = litellm.completion( + model="langgraph/agent", + messages=[ + {"role": "user", "content": "What is the weather in Tokyo?"} + ], + api_base="http://localhost:2024", + stream=True, +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + +### LiteLLM Proxy + +#### 1. Configure your model in config.yaml + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration" +model_list: + - model_name: langgraph-agent + litellm_params: + model: langgraph/agent + api_base: http://localhost:2024 +``` + + + + +#### 2. Start the LiteLLM Proxy + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml +``` + +#### 3. 
Make requests to your LangGraph agent + + + + +```bash showLineNumbers title="Basic Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "langgraph-agent", + "messages": [ + {"role": "user", "content": "What is 25 * 4?"} + ] + }' +``` + +```bash showLineNumbers title="Streaming Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "langgraph-agent", + "messages": [ + {"role": "user", "content": "What is the weather in Tokyo?"} + ], + "stream": true + }' +``` + + + + + +```python showLineNumbers title="Using OpenAI SDK with LiteLLM Proxy" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +response = client.chat.completions.create( + model="langgraph-agent", + messages=[ + {"role": "user", "content": "What is 25 * 4?"} + ] +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Streaming with OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +stream = client.chat.completions.create( + model="langgraph-agent", + messages=[ + {"role": "user", "content": "What is the weather in Tokyo?"} + ], + stream=True +) + +for chunk in stream: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `LANGGRAPH_API_BASE` | Base URL of your LangGraph server (default: `http://localhost:2024`) | +| `LANGGRAPH_API_KEY` | Optional API key for authentication | + +## Supported Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | The agent ID in format `langgraph/{agent_id}` | +| `messages` | array | Chat messages in OpenAI format | +| `stream` | boolean | Enable streaming responses | +| `api_base` | string | LangGraph server URL | +| `api_key` | string | Optional API key | + + +## Setting Up a Local LangGraph Server + +Before using LiteLLM with LangGraph, you need a running LangGraph server. + +### Prerequisites + +- Python 3.11+ +- An LLM API key (OpenAI or Google Gemini) + +### 1. Install the LangGraph CLI + +```bash +pip install "langgraph-cli[inmem]" +``` + +### 2. Create a new LangGraph project + +```bash +langgraph new my-agent --template new-langgraph-project-python +cd my-agent +``` + +### 3. Install dependencies + +```bash +pip install -e . +``` + +### 4. Set your API key + +```bash +echo "OPENAI_API_KEY=your_key_here" > .env +``` + +### 5. Start the server + +```bash +langgraph dev +``` + +The server will start at `http://localhost:2024`. + +### Verify the server is running + +```bash +curl -s --request POST \ + --url "http://localhost:2024/runs/wait" \ + --header 'Content-Type: application/json' \ + --data '{ + "assistant_id": "agent", + "input": { + "messages": [{"role": "human", "content": "Hello!"}] + } + }' +``` + + + +## LiteLLM A2A Gateway + +You can also connect to LangGraph agents through LiteLLM's A2A (Agent-to-Agent) Gateway UI. This provides a visual way to register and test agents without writing code. + +### 1. Navigate to Agents + +From the sidebar, click "Agents" to open the agent management page, then click "+ Add New Agent". 
+ +![Navigate to Agents](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/27429cae-f743-440a-a6aa-29fa7ee013db/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=211,114) + +### 2. Select LangGraph Agent Type + +Click "A2A Standard" to see available agent types, then search for "langgraph" and select "Connect to LangGraph agents via the LangGraph Platform API". + +![Select A2A Standard](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/4add4088-683d-49ca-9374-23fd65dddf8e/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=511,139) + +![Select LangGraph](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/fd197907-47c7-4e05-959c-c0d42264263c/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=431,246) + +### 3. Configure the Agent + +Fill in the following fields: + +- **Agent Name** - A unique identifier (e.g., `lan-agent`) +- **LangGraph API Base** - Your LangGraph server URL, typically `http://127.0.0.1:2024/` +- **API Key** - Optional. LangGraph doesn't require an API key by default +- **Assistant ID** - Not used by LangGraph, you can enter any string here + +![Enter Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/adce3df9-a67c-4d23-b2b5-05120738bc46/ascreenshot.jpeg?tl_px=0,0&br_px=2617,1463&force_format=jpeg&q=100&width=1120.0) + +![Enter API Base](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/6a6a03a7-f235-41db-b4ba-d32ced330f25/ascreenshot.jpeg?tl_px=0,251&br_px=2617,1714&force_format=jpeg&q=100&width=1120.0) + +Click "Create Agent" to save. + +![Create Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/ddee4295-9a32-4cda-8e3f-543e5047eb6a/ascreenshot.jpeg?tl_px=416,653&br_px=2618,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=686,316) + +### 4. Test in Playground + +Go to "Playground" in the sidebar to test your agent. Change the endpoint type to `/v1/a2a/message/send`. + +![Go to Playground](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/c4262189-95ac-4fbc-b5af-8aba8126e4f7/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=41,104) + +![Select A2A Endpoint](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/6cbc8e93-7d0c-47fc-9ad4-562663f759d5/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=324,265) + +### 5. 
Select Your Agent and Send a Message + +Pick your LangGraph agent from the dropdown and send a test message. + +![Select Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/d01da2f1-3b89-47d7-ba95-de2dd8efbc1e/ascreenshot.jpeg?tl_px=0,92&br_px=2201,1323&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=348,277) + +![Send Message](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/79db724e-a99e-493a-9747-dc91cb398370/ascreenshot.jpeg?tl_px=51,653&br_px=2252,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,444) + +The agent responds with its capabilities. You can now interact with your LangGraph agent through the A2A protocol. + +![Agent Response](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-13/82aa546a-0eb5-4836-b986-9aefcfe09e10/ascreenshot.jpeg?tl_px=295,28&br_px=2496,1259&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,277) + +## Further Reading + +- [LangGraph Platform Documentation](https://langchain-ai.github.io/langgraph/cloud/quick_start/) +- [LangGraph GitHub](https://github.com/langchain-ai/langgraph) +- [A2A Agent Gateway](../a2a.md) +- [A2A Cost Tracking](../a2a_cost_tracking.md) + diff --git a/docs/my-website/docs/providers/llamagate.md b/docs/my-website/docs/providers/llamagate.md new file mode 100644 index 000000000000..bc3626947718 --- /dev/null +++ b/docs/my-website/docs/providers/llamagate.md @@ -0,0 +1,228 @@ +# LlamaGate + +## Overview + +| Property | Details | +|-------|-------| +| Description | LlamaGate is an OpenAI-compatible API gateway for open-source LLMs with credit-based billing. Access 26+ open-source models including Llama, Mistral, DeepSeek, and Qwen at competitive prices. | +| Provider Route on LiteLLM | `llamagate/` | +| Link to Provider Doc | [LlamaGate Documentation ↗](https://llamagate.dev/docs) | +| Base URL | `https://api.llamagate.dev/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage), [`/embeddings`](#embeddings) | + +
+ +## What is LlamaGate? + +LlamaGate provides access to open-source LLMs through an OpenAI-compatible API: +- **26+ Open-Source Models**: Llama 3.1/3.2, Mistral, Qwen, DeepSeek R1, and more +- **OpenAI-Compatible API**: Drop-in replacement for OpenAI SDK +- **Vision Models**: Qwen VL, LLaVA, olmOCR, UI-TARS for multimodal tasks +- **Reasoning Models**: DeepSeek R1, OpenThinker for complex problem-solving +- **Code Models**: CodeLlama, DeepSeek Coder, Qwen Coder, StarCoder2 +- **Embedding Models**: Nomic, Qwen3 Embedding for RAG and search +- **Competitive Pricing**: $0.02-$0.55 per 1M tokens + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["LLAMAGATE_API_KEY"] = "" # your LlamaGate API key +``` + +Get your API key from [llamagate.dev](https://llamagate.dev). + +## Supported Models + +### General Purpose +| Model | Model ID | +|-------|----------| +| Llama 3.1 8B | `llamagate/llama-3.1-8b` | +| Llama 3.2 3B | `llamagate/llama-3.2-3b` | +| Mistral 7B v0.3 | `llamagate/mistral-7b-v0.3` | +| Qwen 3 8B | `llamagate/qwen3-8b` | +| Dolphin 3 8B | `llamagate/dolphin3-8b` | + +### Reasoning Models +| Model | Model ID | +|-------|----------| +| DeepSeek R1 8B | `llamagate/deepseek-r1-8b` | +| DeepSeek R1 Distill Qwen 7B | `llamagate/deepseek-r1-7b-qwen` | +| OpenThinker 7B | `llamagate/openthinker-7b` | + +### Code Models +| Model | Model ID | +|-------|----------| +| Qwen 2.5 Coder 7B | `llamagate/qwen2.5-coder-7b` | +| DeepSeek Coder 6.7B | `llamagate/deepseek-coder-6.7b` | +| CodeLlama 7B | `llamagate/codellama-7b` | +| CodeGemma 7B | `llamagate/codegemma-7b` | +| StarCoder2 7B | `llamagate/starcoder2-7b` | + +### Vision Models +| Model | Model ID | +|-------|----------| +| Qwen 3 VL 8B | `llamagate/qwen3-vl-8b` | +| LLaVA 1.5 7B | `llamagate/llava-7b` | +| Gemma 3 4B | `llamagate/gemma3-4b` | +| olmOCR 7B | `llamagate/olmocr-7b` | +| UI-TARS 1.5 7B | `llamagate/ui-tars-7b` | + +### Embedding Models +| Model | Model ID | +|-------|----------| +| Nomic Embed Text | `llamagate/nomic-embed-text` | +| Qwen 3 Embedding 8B | `llamagate/qwen3-embedding-8b` | +| EmbeddingGemma 300M | `llamagate/embeddinggemma-300m` | + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="LlamaGate Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["LLAMAGATE_API_KEY"] = "" # your LlamaGate API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# LlamaGate call +response = completion( + model="llamagate/llama-3.1-8b", + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="LlamaGate Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["LLAMAGATE_API_KEY"] = "" # your LlamaGate API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# LlamaGate call with streaming +response = completion( + model="llamagate/llama-3.1-8b", + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +### Vision + +```python showLineNumbers title="LlamaGate Vision Completion" +import os +import litellm +from litellm import completion + +os.environ["LLAMAGATE_API_KEY"] = "" # your LlamaGate API key + +messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}} + ] + } +] + +# LlamaGate vision call +response = completion( + 
model="llamagate/qwen3-vl-8b", + messages=messages +) + +print(response) +``` + +### Embeddings + +```python showLineNumbers title="LlamaGate Embeddings" +import os +import litellm +from litellm import embedding + +os.environ["LLAMAGATE_API_KEY"] = "" # your LlamaGate API key + +# LlamaGate embedding call +response = embedding( + model="llamagate/nomic-embed-text", + input=["Hello world", "How are you?"] +) + +print(response) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export LLAMAGATE_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: llama-3.1-8b + litellm_params: + model: llamagate/llama-3.1-8b + api_key: os.environ/LLAMAGATE_API_KEY + - model_name: deepseek-r1 + litellm_params: + model: llamagate/deepseek-r1-8b + api_key: os.environ/LLAMAGATE_API_KEY + - model_name: qwen-coder + litellm_params: + model: llamagate/qwen2.5-coder-7b + api_key: os.environ/LLAMAGATE_API_KEY +``` + +## Supported OpenAI Parameters + +LlamaGate supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature (0-2) | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. Control tool/function calling | +| `response_format` | object | Optional. JSON mode or JSON schema | + +## Pricing + +LlamaGate offers competitive per-token pricing: + +| Model Category | Input (per 1M) | Output (per 1M) | +|----------------|----------------|-----------------| +| Embeddings | $0.02 | - | +| Small (3-4B) | $0.03-$0.04 | $0.08 | +| Medium (7-8B) | $0.03-$0.15 | $0.05-$0.55 | +| Code Models | $0.06-$0.10 | $0.12-$0.20 | +| Reasoning | $0.08-$0.10 | $0.15-$0.20 | + +## Additional Resources + +- [LlamaGate Documentation](https://llamagate.dev/docs) +- [LlamaGate Pricing](https://llamagate.dev/pricing) +- [LlamaGate API Reference](https://llamagate.dev/docs/api) diff --git a/docs/my-website/docs/providers/manus.md b/docs/my-website/docs/providers/manus.md new file mode 100644 index 000000000000..92bf2b9b966d --- /dev/null +++ b/docs/my-website/docs/providers/manus.md @@ -0,0 +1,369 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Manus + +Use Manus AI agents through LiteLLM's OpenAI-compatible Responses API. + +| Property | Details | +|----------|---------| +| Description | Manus is an AI agent platform for complex reasoning tasks, document analysis, and multi-step workflows with asynchronous task execution. 
| +| Provider Route on LiteLLM | `manus/{agent_profile}` | +| Supported Operations | `/responses` (Responses API), `/files` (Files API) | +| Provider Doc | [Manus API ↗](https://open.manus.im/docs/openai-compatibility) | + +## Model Format + +```shell +manus/{agent_profile} +``` + +**Examples:** +- `manus/manus-1.6` - General purpose agent +- `manus/manus-1.6-lite` - Lightweight agent for simple tasks +- `manus/manus-1.6-max` - Advanced agent for complex analysis + +## LiteLLM Python SDK + +```python showLineNumbers title="Basic Usage" +import litellm +import os +import time + +# Set API key +os.environ["MANUS_API_KEY"] = "your-manus-api-key" + +# Create task +response = litellm.responses( + model="manus/manus-1.6", + input="What's the capital of France?", +) + +print(f"Task ID: {response.id}") +print(f"Status: {response.status}") # "running" + +# Poll until complete +task_id = response.id +while response.status == "running": + time.sleep(5) + response = litellm.get_response( + response_id=task_id, + custom_llm_provider="manus", + ) + print(f"Status: {response.status}") + +# Get results +if response.status == "completed": + for message in response.output: + if message.role == "assistant": + print(message.content[0].text) +``` + +## LiteLLM AI Gateway + +### Setup + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: manus-agent + litellm_params: + model: manus/manus-1.6 + api_key: os.environ/MANUS_API_KEY +``` + +```bash title="Start Proxy" +litellm --config config.yaml +``` + +### Usage + + + + +```bash showLineNumbers title="Create Task" +# Create task +curl -X POST http://localhost:4000/responses \ + -H "Authorization: Bearer your-proxy-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "manus-agent", + "input": "What is the capital of France?" + }' + +# Response +{ + "id": "task_abc123", + "status": "running", + "metadata": { + "task_url": "https://manus.im/app/task_abc123" + } +} +``` + +```bash showLineNumbers title="Poll for Completion" +# Check status (repeat until status is "completed") +curl http://localhost:4000/responses/task_abc123 \ + -H "Authorization: Bearer your-proxy-key" + +# When completed +{ + "id": "task_abc123", + "status": "completed", + "output": [ + { + "role": "user", + "content": [{"text": "What is the capital of France?"}] + }, + { + "role": "assistant", + "content": [{"text": "The capital of France is Paris."}] + } + ] +} +``` + + + + +```python showLineNumbers title="Create Task and Poll" +import openai +import time + +client = openai.OpenAI( + base_url="http://localhost:4000", + api_key="your-proxy-key" +) + +# Create task +response = client.responses.create( + model="manus-agent", + input="What is the capital of France?" +) + +print(f"Task ID: {response.id}") +print(f"Status: {response.status}") # "running" + +# Poll until complete +task_id = response.id +while response.status == "running": + time.sleep(5) + response = client.responses.retrieve(response_id=task_id) + print(f"Status: {response.status}") + +# Get results +if response.status == "completed": + for message in response.output: + if message.role == "assistant": + print(message.content[0].text) +``` + + + + +## How It Works + +Manus operates as an **asynchronous agent API**: + +1. **Create Task**: When you call `litellm.responses()`, Manus creates a task and returns immediately with `status: "running"` +2. **Task Executes**: The agent works on your request in the background +3. 
**Poll for Completion**: You must repeatedly call `litellm.get_response()` or `client.responses.retrieve()` until the status changes to `"completed"` +4. **Get Results**: Once completed, the `output` field contains the full conversation + +**Task Statuses:** +- `running` - Agent is actively working +- `pending` - Agent is waiting for input +- `completed` - Task finished successfully +- `error` - Task failed + +:::tip Production Usage +For production applications, use [webhooks](https://open.manus.im/docs/webhooks) instead of polling to get notified when tasks complete. +::: + +## Supported Parameters + +| Parameter | Supported | Notes | +|-----------|-----------|-------| +| `input` | ✅ | Text, images, or structured content | +| `stream` | ✅ | Fake streaming (task runs async) | +| `max_output_tokens` | ✅ | Limits response length | +| `previous_response_id` | ✅ | For multi-turn conversations | + +## Files API + +Manus supports file uploads for document analysis and processing. Files can be uploaded and then referenced in Responses API calls. + +### LiteLLM Python SDK + +```python showLineNumbers title="Upload, Use, Retrieve, and Delete Files" +import litellm +import os + +# Set API key +os.environ["MANUS_API_KEY"] = "your-manus-api-key" + +# Upload file +file_content = b"This is a document for analysis." +created_file = await litellm.acreate_file( + file=("document.txt", file_content), + purpose="assistants", + custom_llm_provider="manus", +) +print(f"Uploaded file: {created_file.id}") + +# Use file with Responses API +response = await litellm.aresponses( + model="manus/manus-1.6", + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Summarize this document."}, + {"type": "input_file", "file_id": created_file.id}, + ], + }, + ], + extra_body={"task_mode": "agent", "agent_profile": "manus-1.6-agent"}, +) +print(f"Response: {response.id}") + +# Retrieve file +retrieved_file = await litellm.afile_retrieve( + file_id=created_file.id, + custom_llm_provider="manus", +) +print(f"File details: {retrieved_file.filename}, {retrieved_file.bytes} bytes") + +# Delete file +deleted_file = await litellm.afile_delete( + file_id=created_file.id, + custom_llm_provider="manus", +) +print(f"Deleted: {deleted_file.deleted}") +``` + +### LiteLLM AI Gateway + + + + +```bash showLineNumbers title="Upload File" +# Upload file +curl -X POST http://localhost:4000/v1/files \ + -H "Authorization: Bearer your-proxy-key" \ + -F "file=@document.txt" \ + -F "purpose=assistants" \ + -F "custom_llm_provider=manus" + +# Response +{ + "id": "file_abc123", + "object": "file", + "bytes": 1024, + "created_at": 1234567890, + "filename": "document.txt", + "purpose": "assistants", + "status": "uploaded" +} +``` + +```bash showLineNumbers title="Use File with Responses API" +# Create response with file +curl -X POST http://localhost:4000/responses \ + -H "Authorization: Bearer your-proxy-key" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "manus-agent", + "input": [ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Summarize this document."}, + {"type": "input_file", "file_id": "file_abc123"} + ] + } + ] + }' +``` + +```bash showLineNumbers title="Retrieve File" +# Get file details +curl http://localhost:4000/v1/files/file_abc123 \ + -H "Authorization: Bearer your-proxy-key" + +# Response +{ + "id": "file_abc123", + "object": "file", + "bytes": 1024, + "created_at": 1234567890, + "filename": "document.txt", + "purpose": "assistants", + "status": "uploaded" +} +``` + 
+```bash showLineNumbers title="Delete File" +# Delete file +curl -X DELETE http://localhost:4000/v1/files/file_abc123 \ + -H "Authorization: Bearer your-proxy-key" + +# Response +{ + "id": "file_abc123", + "object": "file", + "deleted": true +} +``` + + + + +```python showLineNumbers title="Upload, Use, Retrieve, and Delete Files" +import openai + +client = openai.OpenAI( + base_url="http://localhost:4000", + api_key="your-proxy-key" +) + +# Upload file +with open("document.txt", "rb") as f: + created_file = client.files.create( + file=f, + purpose="assistants", + extra_body={"custom_llm_provider": "manus"} + ) +print(f"Uploaded file: {created_file.id}") + +# Use file with Responses API +response = client.responses.create( + model="manus-agent", + input=[ + { + "role": "user", + "content": [ + {"type": "input_text", "text": "Summarize this document."}, + {"type": "input_file", "file_id": created_file.id} + ] + } + ] +) +print(f"Response: {response.id}") + +# Retrieve file +retrieved_file = client.files.retrieve(created_file.id) +print(f"File: {retrieved_file.filename}, {retrieved_file.bytes} bytes") + +# Delete file +deleted_file = client.files.delete(created_file.id) +print(f"Deleted: {deleted_file.deleted}") +``` + + + + +## Related Documentation + +- [LiteLLM Responses API](/docs/response_api) +- [LiteLLM Files API](/docs/proxy/litellm_managed_files) +- [Manus OpenAI Compatibility](https://open.manus.im/docs/openai-compatibility) diff --git a/docs/my-website/docs/providers/milvus_vector_stores.md b/docs/my-website/docs/providers/milvus_vector_stores.md new file mode 100644 index 000000000000..441735114839 --- /dev/null +++ b/docs/my-website/docs/providers/milvus_vector_stores.md @@ -0,0 +1,781 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Milvus - Vector Store + +Use Milvus as a vector store for RAG. + +## Quick Start + +You need three things: +1. A Milvus instance (cloud or self-hosted) +2. An embedding model (to convert your queries to vectors) +3. 
A Milvus collection with vector fields + +## Usage + + + + +### Basic Search + +```python +from litellm import vector_stores +import os + +# Set your credentials +os.environ["MILVUS_API_KEY"] = "your-milvus-api-key" +os.environ["MILVUS_API_BASE"] = "https://your-milvus-instance.milvus.io" + +# Search the vector store +response = vector_stores.search( + vector_store_id="my-collection-name", # Your Milvus collection name + query="What is the capital of France?", + custom_llm_provider="milvus", + litellm_embedding_model="azure/text-embedding-3-large", + litellm_embedding_config={ + "api_base": "your-embedding-endpoint", + "api_key": "your-embedding-api-key", + "api_version": "2025-09-01" + }, + milvus_text_field="book_intro", # Field name that contains text content + api_key=os.getenv("MILVUS_API_KEY"), +) + +print(response) +``` + +### Async Search + +```python +from litellm import vector_stores + +response = await vector_stores.asearch( + vector_store_id="my-collection-name", + query="What is the capital of France?", + custom_llm_provider="milvus", + litellm_embedding_model="azure/text-embedding-3-large", + litellm_embedding_config={ + "api_base": "your-embedding-endpoint", + "api_key": "your-embedding-api-key", + "api_version": "2025-09-01" + }, + milvus_text_field="book_intro", + api_key=os.getenv("MILVUS_API_KEY"), +) + +print(response) +``` + +### Advanced Options + +```python +from litellm import vector_stores + +response = vector_stores.search( + vector_store_id="my-collection-name", + query="What is the capital of France?", + custom_llm_provider="milvus", + litellm_embedding_model="azure/text-embedding-3-large", + litellm_embedding_config={ + "api_base": "your-embedding-endpoint", + "api_key": "your-embedding-api-key", + }, + milvus_text_field="book_intro", + api_key=os.getenv("MILVUS_API_KEY"), + # Milvus-specific parameters + limit=10, # Number of results to return + offset=0, # Pagination offset + dbName="default", # Database name + annsField="book_intro_vector", # Vector field name + outputFields=["id", "book_intro", "title"], # Fields to return + filter='book_id > 0', # Metadata filter expression + searchParams={"metric_type": "L2", "params": {"nprobe": 10}}, # Search parameters +) + +print(response) +``` + + + + + +### Setup Config + +Add this to your config.yaml: + +```yaml +vector_store_registry: + - vector_store_name: "milvus-knowledgebase" + litellm_params: + vector_store_id: "my-collection-name" + custom_llm_provider: "milvus" + api_key: os.environ/MILVUS_API_KEY + api_base: https://your-milvus-instance.milvus.io + litellm_embedding_model: "azure/text-embedding-3-large" + litellm_embedding_config: + api_base: https://your-endpoint.cognitiveservices.azure.com/ + api_key: os.environ/AZURE_API_KEY + api_version: "2025-09-01" + milvus_text_field: "book_intro" + # Optional Milvus parameters + annsField: "book_intro_vector" + limit: 10 +``` + +### Start Proxy + +```bash +litellm --config /path/to/config.yaml +``` + +### Search via API + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/vector_stores/my-collection-name/search' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "query": "What is the capital of France?" 
+}' +``` + + + + +## Required Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `vector_store_id` | string | Your Milvus collection name | +| `custom_llm_provider` | string | Set to `"milvus"` | +| `litellm_embedding_model` | string | Model to generate query embeddings (e.g., `"azure/text-embedding-3-large"`) | +| `litellm_embedding_config` | dict | Config for the embedding model (api_base, api_key, api_version) | +| `milvus_text_field` | string | Field name in your collection that contains text content | +| `api_key` | string | Your Milvus API key (or set `MILVUS_API_KEY` env var) | +| `api_base` | string | Your Milvus API base URL (or set `MILVUS_API_BASE` env var) | + +## Optional Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `dbName` | string | Database name (default: "default") | +| `annsField` | string | Vector field name to search (default: "book_intro_vector") | +| `limit` | integer | Maximum number of results to return | +| `offset` | integer | Pagination offset | +| `filter` | string | Filter expression for metadata filtering | +| `groupingField` | string | Field to group results by | +| `outputFields` | list | List of fields to return in results | +| `searchParams` | dict | Search parameters like metric type and search parameters | +| `partitionNames` | list | List of partition names to search | +| `consistencyLevel` | string | Consistency level for the search | + +## Supported Features + +| Feature | Status | Notes | +|---------|--------|-------| +| Logging | ✅ Supported | Full logging support available | +| Guardrails | ❌ Not Yet Supported | Guardrails are not currently supported for vector stores | +| Cost Tracking | ✅ Supported | Cost is $0 for Milvus searches | +| Unified API | ✅ Supported | Call via OpenAI compatible `/v1/vector_stores/search` endpoint | +| Passthrough | ✅ Supported | Use native Milvus API format | + +## Response Format + +The response follows the standard LiteLLM vector store format: + +```json +{ + "object": "vector_store.search_results.page", + "search_query": "What is the capital of France?", + "data": [ + { + "score": 0.95, + "content": [ + { + "text": "Paris is the capital of France...", + "type": "text" + } + ], + "file_id": null, + "filename": null, + "attributes": { + "id": "123", + "title": "France Geography" + } + } + ] +} +``` + +## Passthrough API (Native Milvus Format) + +Use this to allow developers to **create** and **search** vector stores using the native Milvus API format, without giving them the Milvus credentials. + +This is for the proxy only. + +### Admin Flow + +#### 1. Add the vector store to LiteLLM + +```yaml +model_list: + - model_name: embedding-model + litellm_params: + model: azure/text-embedding-3-large + api_base: https://your-endpoint.cognitiveservices.azure.com/ + api_key: os.environ/AZURE_API_KEY + api_version: "2025-09-01" + +vector_store_registry: + - vector_store_name: "milvus-store" + litellm_params: + vector_store_id: "can-be-anything" # vector store id can be anything for the purpose of passthrough api + custom_llm_provider: "milvus" + api_key: os.environ/MILVUS_API_KEY + api_base: https://your-milvus-instance.milvus.io + +general_settings: + database_url: "postgresql://user:password@host:port/database" + master_key: "sk-1234" +``` + +Add your vector store credentials to LiteLLM. + +#### 2. Start the proxy + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +#### 3. 
Create a virtual index + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/indexes' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "index_name": "dall-e-6", + "litellm_params": { + "vector_store_index": "real-collection-name", + "vector_store_name": "milvus-store" + } +}' +``` + +This is a virtual index, which the developer can use to create and search vector stores. + +#### 4. Create a key with the vector store permissions + +```bash +curl -L -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "allowed_vector_store_indexes": [{"index_name": "dall-e-6", "index_permissions": ["write", "read"]}], + "models": ["embedding-model"] +}' +``` + +Give the key access to the virtual index and the embedding model. + +**Expected response** + +```json +{ + "key": "sk-my-virtual-key" +} +``` + +### Developer Flow + +#### MilvusRESTClient + +To use the passthrough API, you need a simple REST client. Copy this `milvus_rest_client.py` file to your project: + +
+Click to expand milvus_rest_client.py + +```python +""" +Simple Milvus REST API v2 Client +Based on: https://milvus.io/api-reference/restful/v2.6.x/ +""" + +import requests +from typing import List, Dict, Any, Optional + + +class DataType: + """Milvus data types""" + + INT64 = "Int64" + FLOAT_VECTOR = "FloatVector" + VARCHAR = "VarChar" + BOOL = "Bool" + FLOAT = "Float" + + +class CollectionSchema: + """Collection schema builder""" + + def __init__(self): + self.fields = [] + + def add_field( + self, + field_name: str, + data_type: str, + is_primary: bool = False, + dim: Optional[int] = None, + description: str = "", + ): + """Add a field to the schema""" + field = { + "fieldName": field_name, + "dataType": data_type, + "isPrimary": is_primary, + "description": description, + } + if data_type == DataType.FLOAT_VECTOR and dim: + field["elementTypeParams"] = {"dim": str(dim)} + self.fields.append(field) + return self + + def to_dict(self): + """Convert schema to dict for API""" + return {"fields": self.fields} + + +class IndexParams: + """Index parameters builder""" + + def __init__(self): + self.indexes = [] + + def add_index( + self, field_name: str, metric_type: str = "L2", index_name: Optional[str] = None + ): + """Add an index""" + index = { + "fieldName": field_name, + "indexName": index_name or f"{field_name}_index", + "metricType": metric_type, + } + self.indexes.append(index) + return self + + def to_list(self): + """Convert to list for API""" + return self.indexes + + +class MilvusRESTClient: + """ + Simple Milvus REST API v2 Client + + Reference: https://milvus.io/api-reference/restful/v2.6.x/ + """ + + def __init__(self, uri: str, token: str, db_name: str = "default"): + """ + Initialize Milvus REST client + + Args: + uri: Milvus server URI (e.g., http://localhost:19530) + token: Authentication token + db_name: Database name + """ + self.base_url = uri.rstrip("/") + self.token = token + self.db_name = db_name + self.headers = { + "Authorization": f"Bearer {token}", + "Content-Type": "application/json", + } + + def _make_request(self, endpoint: str, data: Dict[str, Any]) -> Dict[str, Any]: + """Make a POST request to Milvus API""" + url = f"{self.base_url}{endpoint}" + + # Add dbName if not already in data and not default + if "dbName" not in data and self.db_name != "default": + data["dbName"] = self.db_name + + try: + response = requests.post(url, json=data, headers=self.headers) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + print(f"e.response.text: {e.response.content}") + raise e + + result = response.json() + + # Check for API errors + if result.get("code") != 0: + raise Exception( + f"Milvus API Error: {result.get('message', 'Unknown error')}" + ) + + return result + + def has_collection(self, collection_name: str) -> bool: + """ + Check if a collection exists + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Collection%20(v2)/Has.md + """ + try: + result = self._make_request( + "/v2/vectordb/collections/has", {"collectionName": collection_name} + ) + return result.get("data", {}).get("has", False) + except Exception: + return False + + def drop_collection(self, collection_name: str): + """ + Drop a collection + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Collection%20(v2)/Drop.md + """ + return self._make_request( + "/v2/vectordb/collections/drop", {"collectionName": collection_name} + ) + + def create_schema(self) -> CollectionSchema: + """Create a new collection schema""" + return CollectionSchema() + + def 
prepare_index_params(self) -> IndexParams: + """Create index parameters""" + return IndexParams() + + def create_collection( + self, + collection_name: str, + schema: CollectionSchema, + index_params: Optional[IndexParams] = None, + ): + """ + Create a collection + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Collection%20(v2)/Create.md + """ + data = {"collectionName": collection_name, "schema": schema.to_dict()} + + if index_params: + data["indexParams"] = index_params.to_list() + + return self._make_request("/v2/vectordb/collections/create", data) + + def describe_collection(self, collection_name: str) -> Dict[str, Any]: + """ + Describe a collection + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Collection%20(v2)/Describe.md + """ + result = self._make_request( + "/v2/vectordb/collections/describe", {"collectionName": collection_name} + ) + return result.get("data", {}) + + def insert( + self, + collection_name: str, + data: List[Dict[str, Any]], + partition_name: Optional[str] = None, + ): + """ + Insert data into a collection + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Vector%20(v2)/Insert.md + """ + payload = {"collectionName": collection_name, "data": data} + + if partition_name: + payload["partitionName"] = partition_name + + result = self._make_request("/v2/vectordb/entities/insert", payload) + return result.get("data", {}) + + def flush(self, collection_name: str): + """ + Flush collection data to storage + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Collection%20(v2)/Flush.md + """ + return self._make_request( + "/v2/vectordb/collections/flush", {"collectionName": collection_name} + ) + + def search( + self, + collection_name: str, + data: List[List[float]], + anns_field: str, + limit: int = 10, + search_params: Optional[Dict[str, Any]] = None, + output_fields: Optional[List[str]] = None, + ) -> List[List[Dict]]: + """ + Search for vectors + + Reference: https://milvus.io/api-reference/restful/v2.6.x/v2/Vector%20(v2)/Search.md + """ + payload = { + "collectionName": collection_name, + "data": data, + "annsField": anns_field, + "limit": limit, + } + + if search_params: + payload["searchParams"] = search_params + + if output_fields: + payload["outputFields"] = output_fields + + result = self._make_request("/v2/vectordb/entities/search", payload) + return result.get("data", []) +``` + +
+ +#### 1. Create a collection with schema + +Note: Use the `/milvus` endpoint for the passthrough api that uses the `milvus` provider in your config. + +```python +from milvus_rest_client import MilvusRESTClient, DataType # Use the client from above +import random +import time + +# Configuration +uri = "http://0.0.0.0:4000/milvus" # IMPORTANT: Use the '/milvus' endpoint for passthrough +token = "sk-my-virtual-key" +collection_name = "dall-e-6" # Virtual index name + +# Initialize client +milvus_client = MilvusRESTClient(uri=uri, token=token) +print(f"Connected to DB: {uri} successfully") + +# Check if the collection exists and drop if it does +check_collection = milvus_client.has_collection(collection_name) +if check_collection: + milvus_client.drop_collection(collection_name) + print(f"Dropped the existing collection {collection_name} successfully") + +# Define schema +dim = 64 # Vector dimension + +print("Start to create the collection schema") +schema = milvus_client.create_schema() +schema.add_field( + "book_id", DataType.INT64, is_primary=True, description="customized primary id" +) +schema.add_field("word_count", DataType.INT64, description="word count") +schema.add_field( + "book_intro", DataType.FLOAT_VECTOR, dim=dim, description="book introduction" +) + +# Prepare index parameters +print("Start to prepare index parameters with default AUTOINDEX") +index_params = milvus_client.prepare_index_params() +index_params.add_index("book_intro", metric_type="L2") + +# Create collection +print(f"Start to create example collection: {collection_name}") +milvus_client.create_collection( + collection_name, schema=schema, index_params=index_params +) +collection_property = milvus_client.describe_collection(collection_name) +print("Collection details: %s" % collection_property) +``` + +#### 2. Insert data into the collection + +```python +# Insert data with customized ids +nb = 1000 +insert_rounds = 2 +start = 0 # first primary key id +total_rt = 0 # total response time for insert + +print( + f"Start to insert {nb*insert_rounds} entities into example collection: {collection_name}" +) +for i in range(insert_rounds): + vector = [random.random() for _ in range(dim)] + rows = [ + {"book_id": i, "word_count": random.randint(1, 100), "book_intro": vector} + for i in range(start, start + nb) + ] + t0 = time.time() + milvus_client.insert(collection_name, rows) + ins_rt = time.time() - t0 + start += nb + total_rt += ins_rt +print(f"Insert completed in {round(total_rt, 4)} seconds") + +# Flush the collection +print("Start to flush") +start_flush = time.time() +milvus_client.flush(collection_name) +end_flush = time.time() +print(f"Flush completed in {round(end_flush - start_flush, 4)} seconds") +``` + +#### 3. 
Search the collection + +```python +# Search configuration +nq = 3 # Number of query vectors +search_params = {"metric_type": "L2", "params": {"level": 2}} +limit = 2 # Number of results to return + +# Perform searches +for i in range(5): + search_vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] + t0 = time.time() + results = milvus_client.search( + collection_name, + data=search_vectors, + limit=limit, + search_params=search_params, + anns_field="book_intro", + ) + t1 = time.time() + print(f"Search {i} results: {results}") + print(f"Search {i} latency: {round(t1-t0, 4)} seconds") +``` + +#### Complete Example + +Here's a full working example: + +```python +from milvus_rest_client import MilvusRESTClient, DataType # Use the client from above +import random +import time + +# ---------------------------- +# 🔐 CONFIGURATION +# ---------------------------- +uri = "http://0.0.0.0:4000/milvus" # IMPORTANT: Use the '/milvus' endpoint +token = "sk-my-virtual-key" +collection_name = "dall-e-6" # Your virtual index name + +# ---------------------------- +# 📋 STEP 1 — Initialize Client +# ---------------------------- +milvus_client = MilvusRESTClient(uri=uri, token=token) +print(f"✅ Connected to DB: {uri} successfully") + +# ---------------------------- +# 🗑️ STEP 2 — Drop Existing Collection (if needed) +# ---------------------------- +check_collection = milvus_client.has_collection(collection_name) +if check_collection: + milvus_client.drop_collection(collection_name) + print(f"🗑️ Dropped the existing collection {collection_name} successfully") + +# ---------------------------- +# 📐 STEP 3 — Create Collection Schema +# ---------------------------- +dim = 64 # Vector dimension + +print("📐 Creating the collection schema") +schema = milvus_client.create_schema() +schema.add_field( + "book_id", DataType.INT64, is_primary=True, description="customized primary id" +) +schema.add_field("word_count", DataType.INT64, description="word count") +schema.add_field( + "book_intro", DataType.FLOAT_VECTOR, dim=dim, description="book introduction" +) + +# ---------------------------- +# 🔍 STEP 4 — Create Index +# ---------------------------- +print("🔍 Preparing index parameters with default AUTOINDEX") +index_params = milvus_client.prepare_index_params() +index_params.add_index("book_intro", metric_type="L2") + +# ---------------------------- +# 🏗️ STEP 5 — Create Collection +# ---------------------------- +print(f"🏗️ Creating collection: {collection_name}") +milvus_client.create_collection( + collection_name, schema=schema, index_params=index_params +) +collection_property = milvus_client.describe_collection(collection_name) +print(f"✅ Collection created: {collection_property}") + +# ---------------------------- +# 📤 STEP 6 — Insert Data +# ---------------------------- +nb = 1000 +insert_rounds = 2 +start = 0 +total_rt = 0 + +print(f"📤 Inserting {nb*insert_rounds} entities into collection") +for i in range(insert_rounds): + vector = [random.random() for _ in range(dim)] + rows = [ + {"book_id": i, "word_count": random.randint(1, 100), "book_intro": vector} + for i in range(start, start + nb) + ] + t0 = time.time() + milvus_client.insert(collection_name, rows) + ins_rt = time.time() - t0 + start += nb + total_rt += ins_rt +print(f"✅ Insert completed in {round(total_rt, 4)} seconds") + +# ---------------------------- +# 💾 STEP 7 — Flush Collection +# ---------------------------- +print("💾 Flushing collection") +start_flush = time.time() +milvus_client.flush(collection_name) +end_flush = 
time.time()
+print(f"✅ Flush completed in {round(end_flush - start_flush, 4)} seconds")
+
+# ----------------------------
+# 🔍 STEP 8 — Search
+# ----------------------------
+nq = 3
+search_params = {"metric_type": "L2", "params": {"level": 2}}
+limit = 2
+
+print("🔍 Performing 5 search operations")
+for i in range(5):
+    search_vectors = [[random.random() for _ in range(dim)] for _ in range(nq)]
+    t0 = time.time()
+    results = milvus_client.search(
+        collection_name,
+        data=search_vectors,
+        limit=limit,
+        search_params=search_params,
+        anns_field="book_intro",
+    )
+    t1 = time.time()
+    print(f"✅ Search {i} results: {results}")
+    print(f"   Search {i} latency: {round(t1-t0, 4)} seconds")
+```
+
+## How It Works
+
+When you search:
+
+1. LiteLLM converts your query to a vector using the embedding model you specified
+2. It sends the vector to your Milvus instance via the `/v2/vectordb/entities/search` endpoint
+3. Milvus finds the most similar documents in your collection using vector similarity search
+4. Results come back with distance scores
+
+The embedding model can be any model supported by LiteLLM - Azure OpenAI, OpenAI, Bedrock, etc.
+
diff --git a/docs/my-website/docs/providers/minimax.md b/docs/my-website/docs/providers/minimax.md
new file mode 100644
index 000000000000..9505c26aadef
--- /dev/null
+++ b/docs/my-website/docs/providers/minimax.md
@@ -0,0 +1,639 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# MiniMax
+
+# MiniMax - v1/messages
+
+## Overview
+
+LiteLLM provides Anthropic `/v1/messages` spec-compatible support for MiniMax models.
+
+## Supported Models
+
+MiniMax offers three models through their Anthropic-compatible API:
+
+| Model | Description | Input Cost | Output Cost | Prompt Caching Read | Prompt Caching Write |
+|-------|-------------|------------|-------------|---------------------|----------------------|
+| **MiniMax-M2.1** | Powerful Multi-Language Programming with Enhanced Programming Experience (~60 tps) | $0.3/M tokens | $1.2/M tokens | $0.03/M tokens | $0.375/M tokens |
+| **MiniMax-M2.1-lightning** | Faster and More Agile (~100 tps) | $0.3/M tokens | $2.4/M tokens | $0.03/M tokens | $0.375/M tokens |
+| **MiniMax-M2** | Agentic capabilities, Advanced reasoning | $0.3/M tokens | $1.2/M tokens | $0.03/M tokens | $0.375/M tokens |
+
+
+## Usage Examples
+
+### Basic Chat Completion
+
+```python
+import litellm
+
+response = litellm.anthropic.messages.acreate(
+    model="minimax/MiniMax-M2.1",
+    messages=[{"role": "user", "content": "Hello, how are you?"}],
+    api_key="your-minimax-api-key",
+    api_base="https://api.minimax.io/anthropic/v1/messages",
+    max_tokens=1000
+)
+
+print(response.choices[0].message.content)
+```
+
+### Using Environment Variables
+
+```bash
+export MINIMAX_API_KEY="your-minimax-api-key"
+export MINIMAX_API_BASE="https://api.minimax.io/anthropic/v1/messages"
+```
+
+```python
+import litellm
+
+response = litellm.anthropic.messages.acreate(
+    model="minimax/MiniMax-M2.1",
+    messages=[{"role": "user", "content": "Hello!"}],
+    max_tokens=1000
+)
+```
+
+### With Thinking (M2.1 Feature)
+
+```python
+response = litellm.anthropic.messages.acreate(
+    model="minimax/MiniMax-M2.1",
+    messages=[{"role": "user", "content": "Solve: 2+2=?"}],
+    thinking={"type": "enabled", "budget_tokens": 1000},
+    api_key="your-minimax-api-key"
+)
+
+# Access thinking content
+for block in response.choices[0].message.content:
+    if hasattr(block, 'type') and block.type == 'thinking':
+        print(f"Thinking: {block.thinking}")
+```
+
+### With Tool Calling
+
+```python +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } + } +] + +response = litellm.anthropic.messages.acreate( + model="minimax/MiniMax-M2.1", + messages=[{"role": "user", "content": "What's the weather in SF?"}], + tools=tools, + api_key="your-minimax-api-key", + max_tokens=1000 +) +``` + + + +## Usage with LiteLLM Proxy + +You can use MiniMax models with the Anthropic SDK by routing through LiteLLM Proxy: + +| Step | Description | +|------|-------------| +| **1. Start LiteLLM Proxy** | Configure proxy with MiniMax models in `config.yaml` | +| **2. Set Environment Variables** | Point Anthropic SDK to proxy endpoint | +| **3. Use Anthropic SDK** | Call MiniMax models using native Anthropic SDK | + +### Step 1: Configure LiteLLM Proxy + +Create a `config.yaml`: + +```yaml +model_list: + - model_name: minimax/MiniMax-M2.1 + litellm_params: + model: minimax/MiniMax-M2.1 + api_key: os.environ/MINIMAX_API_KEY + api_base: https://api.minimax.io/anthropic/v1/messages +``` + +Start the proxy: + +```bash +litellm --config config.yaml +``` + +### Step 2: Use with Anthropic SDK + +```python +import os +os.environ["ANTHROPIC_BASE_URL"] = "http://localhost:4000" +os.environ["ANTHROPIC_API_KEY"] = "sk-1234" # Your LiteLLM proxy key + +import anthropic + +client = anthropic.Anthropic() + +message = client.messages.create( + model="minimax/MiniMax-M2.1", + max_tokens=1000, + system="You are a helpful assistant.", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Hi, how are you?" + } + ] + } + ] +) + +for block in message.content: + if block.type == "thinking": + print(f"Thinking:\n{block.thinking}\n") + elif block.type == "text": + print(f"Text:\n{block.text}\n") +``` + +# MiniMax - v1/chat/completions + +## Usage with LiteLLM SDK + +You can use MiniMax's OpenAI-compatible API directly with LiteLLM: + +### Basic Chat Completion + +```python +import litellm + +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, how are you?"} + ], + api_key="your-minimax-api-key", + api_base="https://api.minimax.io/v1" +) + +print(response.choices[0].message.content) +``` + +### Using Environment Variables + +```bash +export MINIMAX_API_KEY="your-minimax-api-key" +export MINIMAX_API_BASE="https://api.minimax.io/v1" +``` + +```python +import litellm + +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +### With Reasoning Split + +```python +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Solve: 2+2=?"} + ], + extra_body={"reasoning_split": True}, + api_key="your-minimax-api-key", + api_base="https://api.minimax.io/v1" +) + +# Access reasoning details if available +if hasattr(response.choices[0].message, 'reasoning_details'): + print(f"Thinking: {response.choices[0].message.reasoning_details}") +print(f"Response: {response.choices[0].message.content}") +``` + +### With Tool Calling + +```python +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather", + "parameters": { + "type": "object", + "properties": { + 
"location": {"type": "string"} + }, + "required": ["location"] + } + } + } +] + +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[{"role": "user", "content": "What's the weather in SF?"}], + tools=tools, + api_key="your-minimax-api-key", + api_base="https://api.minimax.io/v1" +) +``` + +### Streaming + +```python +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[{"role": "user", "content": "Tell me a story"}], + stream=True, + api_key="your-minimax-api-key", + api_base="https://api.minimax.io/v1" +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + + +## Usage with OpenAI SDK via LiteLLM Proxy + +You can also use MiniMax models with the OpenAI SDK by routing through LiteLLM Proxy: + +| Step | Description | +|------|-------------| +| **1. Start LiteLLM Proxy** | Configure proxy with MiniMax models in `config.yaml` | +| **2. Set Environment Variables** | Point OpenAI SDK to proxy endpoint | +| **3. Use OpenAI SDK** | Call MiniMax models using native OpenAI SDK | + +### Step 1: Configure LiteLLM Proxy + +Create a `config.yaml`: + +```yaml +model_list: + - model_name: minimax/MiniMax-M2.1 + litellm_params: + model: minimax/MiniMax-M2.1 + api_key: os.environ/MINIMAX_API_KEY + api_base: https://api.minimax.io/v1 +``` + +Start the proxy: + +```bash +litellm --config config.yaml +``` + +### Step 2: Use with OpenAI SDK + +```python +import os +os.environ["OPENAI_BASE_URL"] = "http://localhost:4000" +os.environ["OPENAI_API_KEY"] = "sk-1234" # Your LiteLLM proxy key + +from openai import OpenAI + +client = OpenAI() + +response = client.chat.completions.create( + model="minimax/MiniMax-M2.1", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hi, how are you?"}, + ], + # Set reasoning_split=True to separate thinking content + extra_body={"reasoning_split": True}, +) + +# Access thinking and response +if hasattr(response.choices[0].message, 'reasoning_details'): + print(f"Thinking:\n{response.choices[0].message.reasoning_details[0]['text']}\n") +print(f"Text:\n{response.choices[0].message.content}\n") +``` + +### Streaming with OpenAI SDK + +```python +from openai import OpenAI + +client = OpenAI() + +stream = client.chat.completions.create( + model="minimax/MiniMax-M2.1", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Tell me a story"}, + ], + extra_body={"reasoning_split": True}, + stream=True, +) + +reasoning_buffer = "" +text_buffer = "" + +for chunk in stream: + if hasattr(chunk.choices[0].delta, "reasoning_details") and chunk.choices[0].delta.reasoning_details: + for detail in chunk.choices[0].delta.reasoning_details: + if "text" in detail: + reasoning_text = detail["text"] + new_reasoning = reasoning_text[len(reasoning_buffer):] + if new_reasoning: + print(new_reasoning, end="", flush=True) + reasoning_buffer = reasoning_text + + if chunk.choices[0].delta.content: + content_text = chunk.choices[0].delta.content + new_text = content_text[len(text_buffer):] if text_buffer else content_text + if new_text: + print(new_text, end="", flush=True) + text_buffer = content_text +``` + +## Cost Calculation + +Cost calculation works automatically using the pricing information in `model_prices_and_context_window.json`. 
+ +Example: +```python +response = litellm.completion( + model="minimax/MiniMax-M2.1", + messages=[{"role": "user", "content": "Hello!"}], + api_key="your-minimax-api-key" +) + +# Access cost information +print(f"Cost: ${response._hidden_params.get('response_cost', 0)}") +``` + +# MiniMax - Text-to-Speech + +## Quick Start + +## **LiteLLM Python SDK Usage** + +### Basic Usage + +```python +from pathlib import Path +from litellm import speech +import os + +os.environ["MINIMAX_API_KEY"] = "your-api-key" + +speech_file_path = Path(__file__).parent / "speech.mp3" +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="The quick brown fox jumped over the lazy dogs", +) +response.stream_to_file(speech_file_path) +``` + +### Async Usage + +```python +from litellm import aspeech +from pathlib import Path +import os, asyncio + +os.environ["MINIMAX_API_KEY"] = "your-api-key" + +async def test_async_speech(): + speech_file_path = Path(__file__).parent / "speech.mp3" + response = await aspeech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="The quick brown fox jumped over the lazy dogs", + ) + response.stream_to_file(speech_file_path) + +asyncio.run(test_async_speech()) +``` + +### Voice Selection + +MiniMax supports many voices. LiteLLM provides OpenAI-compatible voice names that map to MiniMax voices: + +```python +from litellm import speech + +# OpenAI-compatible voice names +voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"] + +for voice in voices: + response = speech( + model="minimax/speech-2.6-hd", + voice=voice, + input=f"This is the {voice} voice", + ) + response.stream_to_file(f"speech_{voice}.mp3") +``` + +You can also use MiniMax-native voice IDs directly: + +```python +response = speech( + model="minimax/speech-2.6-hd", + voice="male-qn-qingse", # MiniMax native voice ID + input="Using native MiniMax voice ID", +) +``` + +### Custom Parameters + +MiniMax TTS supports additional parameters for fine-tuning audio output: + +```python +from litellm import speech + +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="Custom audio parameters", + speed=1.5, # Speed: 0.5 to 2.0 + response_format="mp3", # Format: mp3, pcm, wav, flac + extra_body={ + "vol": 1.2, # Volume: 0.1 to 10 + "pitch": 2, # Pitch adjustment: -12 to 12 + "sample_rate": 32000, # 16000, 24000, or 32000 + "bitrate": 128000, # For MP3: 64000, 128000, 192000, 256000 + "channel": 1, # 1 for mono, 2 for stereo + } +) +response.stream_to_file("custom_speech.mp3") +``` + +### Response Formats + +```python +from litellm import speech + +# MP3 format (default) +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="MP3 format audio", + response_format="mp3", +) + +# PCM format +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="PCM format audio", + response_format="pcm", +) + +# WAV format +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="WAV format audio", + response_format="wav", +) + +# FLAC format +response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="FLAC format audio", + response_format="flac", +) +``` + +## **LiteLLM Proxy Usage** + +LiteLLM provides an OpenAI-compatible `/audio/speech` endpoint for MiniMax TTS. 
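+
+Since the endpoint is OpenAI-compatible, you can also call it with the OpenAI Python SDK once the proxy is running. A minimal sketch, assuming the `tts` model alias and the `sk-1234` virtual key shown in the Setup section below:
+
+```python
+from openai import OpenAI
+
+# Sketch only: assumes the proxy configured below is running on
+# http://0.0.0.0:4000 with model alias "tts" and virtual key "sk-1234".
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+response = client.audio.speech.create(
+    model="tts",       # model_name alias from the config.yaml below
+    voice="alloy",     # LiteLLM maps this to a MiniMax voice ID
+    input="The quick brown fox jumped over the lazy dog.",
+)
+response.stream_to_file("speech.mp3")  # same helper used in the SDK examples above
+```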
+ +### Setup + +Add MiniMax to your proxy configuration: + +```yaml +model_list: + - model_name: tts + litellm_params: + model: minimax/speech-2.6-hd + api_key: os.environ/MINIMAX_API_KEY + + - model_name: tts-turbo + litellm_params: + model: minimax/speech-2.6-turbo + api_key: os.environ/MINIMAX_API_KEY +``` + +Start the proxy: + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +### Making Requests + +```bash +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "tts", + "input": "The quick brown fox jumped over the lazy dog.", + "voice": "alloy" + }' \ + --output speech.mp3 +``` + +With custom parameters: + +```bash +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "tts", + "input": "Custom parameters example.", + "voice": "nova", + "speed": 1.5, + "response_format": "mp3", + "extra_body": { + "vol": 1.2, + "pitch": 1, + "sample_rate": 32000 + } + }' \ + --output custom_speech.mp3 +``` + +## Voice Mappings + +LiteLLM maps OpenAI-compatible voice names to MiniMax voice IDs: + +| OpenAI Voice | MiniMax Voice ID | Description | +|--------------|------------------|-------------| +| alloy | male-qn-qingse | Male voice | +| echo | male-qn-jingying | Male voice | +| fable | female-shaonv | Female voice | +| onyx | male-qn-badao | Male voice | +| nova | female-yujie | Female voice | +| shimmer | female-tianmei | Female voice | + +You can also use any MiniMax-native voice ID directly by passing it as the `voice` parameter. + + +### Streaming (WebSocket) + +:::note +The current implementation uses MiniMax's HTTP endpoint. For WebSocket streaming support, please refer to MiniMax's official documentation at [https://platform.minimax.io/docs](https://platform.minimax.io/docs). +::: + +## Error Handling + +```python +from litellm import speech +import litellm + +try: + response = speech( + model="minimax/speech-2.6-hd", + voice="alloy", + input="Test input", + ) + response.stream_to_file("output.mp3") +except litellm.exceptions.BadRequestError as e: + print(f"Bad request: {e}") +except litellm.exceptions.AuthenticationError as e: + print(f"Authentication failed: {e}") +except Exception as e: + print(f"Error: {e}") +``` + +### Extra Body Parameters + +Pass these via `extra_body`: + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| vol | float | Volume (0.1 to 10) | 1.0 | +| pitch | int | Pitch adjustment (-12 to 12) | 0 | +| sample_rate | int | Sample rate: 16000, 24000, 32000 | 32000 | +| bitrate | int | Bitrate for MP3: 64000, 128000, 192000, 256000 | 128000 | +| channel | int | Audio channels: 1 (mono) or 2 (stereo) | 1 | +| output_format | string | Output format: "hex" or "url" (url returns a URL valid for 24 hours) | hex | diff --git a/docs/my-website/docs/providers/nano-gpt.md b/docs/my-website/docs/providers/nano-gpt.md new file mode 100644 index 000000000000..4e46c032c75a --- /dev/null +++ b/docs/my-website/docs/providers/nano-gpt.md @@ -0,0 +1,170 @@ +# NanoGPT + +## Overview + +| Property | Details | +|-------|-------| +| Description | NanoGPT is a pay-per-prompt and subscription based AI service providing instant access to over 200+ powerful AI models with no subscriptions or registration required. 
| +| Provider Route on LiteLLM | `nano-gpt/` | +| Link to Provider Doc | [NanoGPT Website ↗](https://nano-gpt.com) | +| Base URL | `https://nano-gpt.com/api/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage), [`/completions`](#text-completion), [`/embeddings`](#embeddings) | + +
+ +## What is NanoGPT? + +NanoGPT is a flexible AI API service that offers: +- **Pay-Per-Prompt Pricing**: No subscriptions, pay only for what you use +- **200+ AI Models**: Access to text, image, and video generation models +- **No Registration Required**: Get started instantly +- **OpenAI-Compatible API**: Easy integration with existing code +- **Streaming Support**: Real-time response streaming +- **Tool Calling**: Support for function calling + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["NANOGPT_API_KEY"] = "" # your NanoGPT API key +``` + +Get your NanoGPT API key from [nano-gpt.com](https://nano-gpt.com). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="NanoGPT Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["NANOGPT_API_KEY"] = "" # your NanoGPT API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# NanoGPT call +response = completion( + model="nano-gpt/model-name", # Replace with actual model name + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="NanoGPT Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["NANOGPT_API_KEY"] = "" # your NanoGPT API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# NanoGPT call with streaming +response = completion( + model="nano-gpt/model-name", # Replace with actual model name + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +### Tool Calling + +```python showLineNumbers title="NanoGPT Tool Calling" +import os +import litellm + +os.environ["NANOGPT_API_KEY"] = "" + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "Get current weather", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + } + } + } + } +] + +response = litellm.completion( + model="nano-gpt/model-name", + messages=[{"role": "user", "content": "What's the weather in Paris?"}], + tools=tools +) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export NANOGPT_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: nano-gpt-model + litellm_params: + model: nano-gpt/model-name # Replace with actual model name + api_key: os.environ/NANOGPT_API_KEY +``` + +## Supported OpenAI Parameters + +NanoGPT supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID from 200+ available models | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `n` | integer | Optional. Number of completions to generate | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. Control tool/function calling | +| `response_format` | object | Optional. 
Response format specification | +| `user` | string | Optional. User identifier | + +## Model Categories + +NanoGPT provides access to multiple model categories: +- **Text Generation**: 200+ LLMs for chat, completion, and analysis +- **Image Generation**: AI models for creating images +- **Video Generation**: AI models for video creation +- **Embedding Models**: Text embedding models for vector search + +## Pricing Model + +NanoGPT offers a flexible pricing structure: +- **Pay-Per-Prompt**: No subscription required +- **No Registration**: Get started immediately +- **Transparent Pricing**: Pay only for what you use + +## API Documentation + +For detailed API documentation, visit [docs.nano-gpt.com](https://docs.nano-gpt.com). + +## Additional Resources + +- [NanoGPT Website](https://nano-gpt.com) +- [NanoGPT API Documentation](https://nano-gpt.com/api) +- [NanoGPT Model List](https://docs.nano-gpt.com/api-reference/endpoint/models) diff --git a/docs/my-website/docs/providers/nvidia_nim_rerank.md b/docs/my-website/docs/providers/nvidia_nim_rerank.md index 7373014a9608..d28f056c24bc 100644 --- a/docs/my-website/docs/providers/nvidia_nim_rerank.md +++ b/docs/my-website/docs/providers/nvidia_nim_rerank.md @@ -141,6 +141,111 @@ curl -X POST http://0.0.0.0:4000/rerank \ }' ``` +## `/v1/ranking` Models (llama-3.2-nv-rerankqa-1b-v2) + +Some Nvidia NIM rerank models use the `/v1/ranking` endpoint instead of the default `/v1/retrieval/{model}/reranking` endpoint. + +Use the `ranking/` prefix to force requests to the `/v1/ranking` endpoint: + +### LiteLLM Python SDK + +```python showLineNumbers title="Force /v1/ranking endpoint with ranking/ prefix" +import litellm +import os + +os.environ['NVIDIA_NIM_API_KEY'] = "nvapi-..." + +# Use "ranking/" prefix to force /v1/ranking endpoint +response = litellm.rerank( + model="nvidia_nim/ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2", + query="which way did the traveler go?", + documents=[ + "two roads diverged in a yellow wood...", + "then took the other, as just as fair...", + "i shall be telling this with a sigh somewhere ages and ages hence..." + ], + top_n=3, + truncate="END", # Optional: truncate long text from the end +) + +print(response) +``` + +### LiteLLM Proxy + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: nvidia-ranking + litellm_params: + model: nvidia_nim/ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2 + api_key: os.environ/NVIDIA_NIM_API_KEY +``` + +```bash title="Request to LiteLLM Proxy" +curl -X POST http://0.0.0.0:4000/rerank \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "nvidia-ranking", + "query": "which way did the traveler go?", + "documents": [ + "two roads diverged in a yellow wood...", + "then took the other, as just as fair..." + ], + "top_n": 2 + }' +``` + +### Understanding Model Resolution + +**Ranking Endpoint (`/v1/ranking`):** + +``` +model: nvidia_nim/ranking/nvidia/llama-3.2-nv-rerankqa-1b-v2 + └────┬────┘ └──┬──┘ └─────────────┬──────────────────┘ + │ │ │ + │ │ └────▶ Model name sent to provider + │ │ + │ └────────────────────────▶ Tells LiteLLM the request/response and url should be sent to Nvidia NIM /v1/ranking endpoint + │ + └─────────────────────────────────▶ Provider prefix + +API URL: https://ai.api.nvidia.com/v1/ranking +``` + +**Visual Flow:** + +``` +Client Request LiteLLM Provider API +────────────── ──────────── ───────────── + +# Default reranking endpoint +model: "nvidia_nim/nvidia/model-name" + 1. Extracts model: nvidia/model-name + 2. 
Routes to default endpoint ──────▶ POST /v1/retrieval/nvidia/model-name/reranking + + +# Forced ranking endpoint +model: "nvidia_nim/ranking/nvidia/model-name" + 1. Detects "ranking/" prefix + 2. Extracts model: nvidia/model-name + 3. Routes to ranking endpoint ──────▶ POST /v1/ranking + Body: {"model": "nvidia/model-name", ...} +``` + +**When to use each endpoint:** + +| Endpoint | Model Prefix | Use Case | +|----------|--------------|----------| +| `/v1/retrieval/{model}/reranking` | `nvidia_nim/` | Default for most rerank models | +| `/v1/ranking` | `nvidia_nim/ranking/` | For models like `nvidia/llama-3.2-nv-rerankqa-1b-v2` that require this endpoint | + +:::tip + +Check the [Nvidia NIM model deployment page](https://build.nvidia.com/nvidia/llama-3_2-nv-rerankqa-1b-v2/deploy) to see which endpoint your model requires. + +::: + ## API Parameters ### Required Parameters @@ -203,16 +308,7 @@ response = litellm.rerank(
-## API Endpoint - -The rerank endpoint uses a different base URL than chat/embeddings: - -- **Chat/Embeddings:** `https://integrate.api.nvidia.com/v1/` -- **Rerank:** `https://ai.api.nvidia.com/v1/` - -LiteLLM automatically uses the correct endpoint for rerank requests. - -### Custom API Base URL +## Custom API Base URL You can override the default base URL in several ways: @@ -258,4 +354,3 @@ Get your Nvidia NIM API key from [Nvidia's website](https://developer.nvidia.com - [Nvidia NIM Chat Completions](./nvidia_nim#sample-usage) - [LiteLLM Rerank Endpoint](../rerank) - [Nvidia NIM Official Docs ↗](https://docs.api.nvidia.com/nim/reference/) - diff --git a/docs/my-website/docs/providers/oci.md b/docs/my-website/docs/providers/oci.md index 1f52fba04f3d..ce6fe18dd6ff 100644 --- a/docs/my-website/docs/providers/oci.md +++ b/docs/my-website/docs/providers/oci.md @@ -29,20 +29,40 @@ Check the [OCI Models List](https://docs.oracle.com/en-us/iaas/Content/generativ ## Authentication -LiteLLM uses OCI signing key authentication. Follow the [official Oracle tutorial](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to create a signing key and obtain the following parameters: +LiteLLM supports two authentication methods for OCI: + +### Method 1: Manual Credentials +Provide individual OCI credentials directly to LiteLLM. Follow the [official Oracle tutorial](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to create a signing key and obtain the following parameters: - `user` - `fingerprint` - `tenancy` - `region` -- `key_file` +- `key_file` or `key` +- `compartment_id` + +This is the default method for LiteLLM AI Gateway (LLM Proxy) access to OCI GenAI models. + +### Method 2: OCI SDK Signer +Use an OCI SDK `Signer` object for authentication. This method: +- Leverages the official [OCI SDK for signing](https://docs.oracle.com/en-us/iaas/tools/python/latest/api/signing.html) +- Supports additional authentication methods (instance principals, workload identity, etc.) + +To use this method, install the OCI SDK: +```bash +pip install oci +``` + +This method is an alternative when using the LiteLLM SDK on Oracle Cloud Infrastructure (instances or Oracle Kubernetes Engine). ## Usage -Input the parameters obtained from the OCI signing key creation process into the `completion` function. + + + +Input the parameters obtained from the OCI signing key creation process into the `completion` function: ```python -import os from litellm import completion messages = [{"role": "user", "content": "Hey! how's it going?"}] @@ -64,12 +84,119 @@ response = completion( print(response) ``` + + + +Use the OCI SDK `Signer` for authentication: + +```python +from litellm import completion +from oci.signer import Signer + +# Create an OCI Signer +signer = Signer( + tenancy="ocid1.tenancy.oc1..", + user="ocid1.user.oc1..", + fingerprint="xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx", + private_key_file_location="~/.oci/key.pem", + # Or use private_key_content="" +) + +messages = [{"role": "user", "content": "Hey! how's it going?"}] +response = completion( + model="oci/xai.grok-4", + messages=messages, + oci_signer=signer, + oci_region="us-chicago-1", # Optional, defaults to us-ashburn-1 + oci_serving_mode="ON_DEMAND", # Optional, default is "ON_DEMAND". 
Other option is "DEDICATED" + oci_compartment_id="", +) +print(response) +``` + +**Alternative: Use OCI Config File** + +The OCI SDK can automatically load credentials from `~/.oci/config`: + +```python +from litellm import completion +from oci.config import from_file +from oci.signer import Signer + +# Load config from file +config = from_file("~/.oci/config", "DEFAULT") # "DEFAULT" is the profile name +signer = Signer( + tenancy=config["tenancy"], + user=config["user"], + fingerprint=config["fingerprint"], + private_key_file_location=config["key_file"], + pass_phrase=config.get("pass_phrase") # Optional if key is encrypted +) + +messages = [{"role": "user", "content": "Hey! how's it going?"}] +response = completion( + model="oci/xai.grok-4", + messages=messages, + oci_signer=signer, + oci_region=config["region"], + oci_compartment_id="", +) +print(response) +``` + +**Instance Principal Authentication** + +For applications running on OCI compute instances: + +```python +from litellm import completion +from oci.auth.signers import InstancePrincipalsSecurityTokenSigner + +# Use instance principal authentication +signer = InstancePrincipalsSecurityTokenSigner() + +messages = [{"role": "user", "content": "Hey! how's it going?"}] +response = completion( + model="oci/xai.grok-4", + messages=messages, + oci_signer=signer, + oci_region="us-chicago-1", + oci_compartment_id="", +) +print(response) +``` + +**Workload Identity Authentication** + +For applications running in Oracle Kubernetes Engine (OKE): + +```python +from litellm import completion +from oci.auth.signers import get_oke_workload_identity_resource_principal_signer + +# Use workload identity authentication +signer = get_oke_workload_identity_resource_principal_signer() + +messages = [{"role": "user", "content": "Hey! how's it going?"}] +response = completion( + model="oci/xai.grok-4", + messages=messages, + oci_signer=signer, + oci_region="us-chicago-1", + oci_compartment_id="", +) +print(response) +``` + + ## Usage - Streaming Just set `stream=True` when calling completion. + + + ```python -import os from litellm import completion messages = [{"role": "user", "content": "Hey! how's it going?"}] @@ -93,23 +220,178 @@ for chunk in response: print(chunk["choices"][0]["delta"]["content"]) # same as openai format ``` + + + +```python +from litellm import completion +from oci.signer import Signer + +signer = Signer( + tenancy="ocid1.tenancy.oc1..", + user="ocid1.user.oc1..", + fingerprint="xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx", + private_key_file_location="~/.oci/key.pem", +) + +messages = [{"role": "user", "content": "Hey! 
how's it going?"}] +response = completion( + model="oci/xai.grok-4", + messages=messages, + stream=True, + oci_signer=signer, + oci_region="us-chicago-1", + oci_compartment_id="", +) +for chunk in response: + print(chunk["choices"][0]["delta"]["content"]) # same as openai format +``` + + + + ## Usage Examples by Model Type ### Using Cohere Models + + + +```python +from litellm import completion + +messages = [{"role": "user", "content": "Explain quantum computing"}] +response = completion( + model="oci/cohere.command-latest", + messages=messages, + oci_region="us-chicago-1", + oci_user=, + oci_fingerprint=, + oci_tenancy=, + oci_key=, + oci_compartment_id=, +) +print(response) +``` + + + + ```python from litellm import completion +from oci.signer import Signer + +signer = Signer( + tenancy="ocid1.tenancy.oc1..", + user="ocid1.user.oc1..", + fingerprint="xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx", + private_key_file_location="~/.oci/key.pem", +) messages = [{"role": "user", "content": "Explain quantum computing"}] response = completion( model="oci/cohere.command-latest", messages=messages, + oci_signer=signer, oci_region="us-chicago-1", + oci_compartment_id="", +) +print(response) +``` + + + + +## Using Dedicated Endpoints + +OCI supports dedicated endpoints for hosting models. Use the `oci_serving_mode="DEDICATED"` parameter along with `oci_endpoint_id` to specify the endpoint ID. + + + + +```python +from litellm import completion + +messages = [{"role": "user", "content": "Hey! how's it going?"}] +response = completion( + model="oci/xai.grok-4", # Must match the model type hosted on the endpoint + messages=messages, + oci_region=, oci_user=, oci_fingerprint=, oci_tenancy=, + oci_serving_mode="DEDICATED", + oci_endpoint_id="ocid1.generativeaiendpoint.oc1...", # Your dedicated endpoint OCID oci_key=, oci_compartment_id=, ) print(response) -``` \ No newline at end of file +``` + + + + +```python +from litellm import completion +from oci.signer import Signer + +signer = Signer( + tenancy="ocid1.tenancy.oc1..", + user="ocid1.user.oc1..", + fingerprint="xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx", + private_key_file_location="~/.oci/key.pem", +) + +messages = [{"role": "user", "content": "Hey! 
how's it going?"}] +response = completion( + model="oci/xai.grok-4", # Must match the model type hosted on the endpoint + messages=messages, + oci_signer=signer, + oci_region="us-chicago-1", + oci_serving_mode="DEDICATED", + oci_endpoint_id="ocid1.generativeaiendpoint.oc1...", # Your dedicated endpoint OCID + oci_compartment_id="", +) +print(response) +``` + + + + +**Important:** When using `oci_serving_mode="DEDICATED"`: +- The `model` parameter **must match the type of model hosted on your dedicated endpoint** (e.g., use `"oci/cohere.command-latest"` for Cohere models, `"oci/xai.grok-4"` for Grok models) +- The model name determines the API format and vendor-specific handling (Cohere vs Generic) +- The `oci_endpoint_id` parameter specifies your dedicated endpoint's OCID +- If `oci_endpoint_id` is not provided, the `model` parameter will be used as the endpoint ID (for backward compatibility) + +**Example with Cohere Dedicated Endpoint:** +```python +# For a dedicated endpoint hosting a Cohere model +response = completion( + model="oci/cohere.command-latest", # Use Cohere model name to get Cohere API format + messages=messages, + oci_region="us-chicago-1", + oci_user=, + oci_fingerprint=, + oci_tenancy=, + oci_serving_mode="DEDICATED", + oci_endpoint_id="ocid1.generativeaiendpoint.oc1...", # Your Cohere endpoint OCID + oci_key=, + oci_compartment_id=, +) +``` + +## Optional Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `oci_region` | string | `us-ashburn-1` | OCI region where the GenAI service is deployed | +| `oci_serving_mode` | string | `ON_DEMAND` | Service mode: `ON_DEMAND` for managed models or `DEDICATED` for dedicated endpoints | +| `oci_endpoint_id` | string | Same as `model` | (For DEDICATED mode) The OCID of your dedicated endpoint | +| `oci_compartment_id` | string | **Required** | The OCID of the OCI compartment containing your resources | +| `oci_user` | string | - | (Manual auth) The OCID of the OCI user | +| `oci_fingerprint` | string | - | (Manual auth) The fingerprint of the API signing key | +| `oci_tenancy` | string | - | (Manual auth) The OCID of your OCI tenancy | +| `oci_key` | string | - | (Manual auth) The private key content as a string | +| `oci_key_file` | string | - | (Manual auth) Path to the private key file | +| `oci_signer` | object | - | (SDK auth) OCI SDK Signer object for authentication | \ No newline at end of file diff --git a/docs/my-website/docs/providers/openai.md b/docs/my-website/docs/providers/openai.md index f9831c6d8be9..80645a51ac59 100644 --- a/docs/my-website/docs/providers/openai.md +++ b/docs/my-website/docs/providers/openai.md @@ -29,6 +29,18 @@ response = completion( ) ``` +:::info Metadata passthrough (preview) +When `litellm.enable_preview_features = True`, LiteLLM forwards only the values inside `metadata` to OpenAI. 
+ +```python +completion( + model="gpt-4o", + messages=[{"role": "user", "content": "hi"}], + metadata= {"custom_meta_key": "value"}, +) +``` +::: + ### Usage - LiteLLM Proxy Server Here's how to call OpenAI models with the LiteLLM Proxy Server @@ -176,6 +188,15 @@ os.environ["OPENAI_BASE_URL"] = "https://your_host/v1" # OPTIONAL | gpt-5-mini-2025-08-07 | `response = completion(model="gpt-5-mini-2025-08-07", messages=messages)` | | gpt-5-nano-2025-08-07 | `response = completion(model="gpt-5-nano-2025-08-07", messages=messages)` | | gpt-5-pro | `response = completion(model="gpt-5-pro", messages=messages)` | +| gpt-5.2 | `response = completion(model="gpt-5.2", messages=messages)` | +| gpt-5.2-2025-12-11 | `response = completion(model="gpt-5.2-2025-12-11", messages=messages)` | +| gpt-5.2-chat-latest | `response = completion(model="gpt-5.2-chat-latest", messages=messages)` | +| gpt-5.2-pro | `response = completion(model="gpt-5.2-pro", messages=messages)` | +| gpt-5.2-pro-2025-12-11 | `response = completion(model="gpt-5.2-pro-2025-12-11", messages=messages)` | +| gpt-5.1 | `response = completion(model="gpt-5.1", messages=messages)` | +| gpt-5.1-codex | `response = completion(model="gpt-5.1-codex", messages=messages)` | +| gpt-5.1-codex-mini | `response = completion(model="gpt-5.1-codex-mini", messages=messages)` | +| gpt-5.1-codex-max | `response = completion(model="gpt-5.1-codex-max", messages=messages)` | | gpt-4.1 | `response = completion(model="gpt-4.1", messages=messages)` | | gpt-4.1-mini | `response = completion(model="gpt-4.1-mini", messages=messages)` | | gpt-4.1-nano | `response = completion(model="gpt-4.1-nano", messages=messages)` | @@ -237,7 +258,7 @@ response = completion( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -410,6 +431,137 @@ Expected Response: ``` +### Advanced: Using `reasoning_effort` with `summary` field + +By default, `reasoning_effort` accepts a string value (`"none"`, `"minimal"`, `"low"`, `"medium"`, `"high"`, `"xhigh"`—`"xhigh"` is only supported on `gpt-5.1-codex-max` and `gpt-5.2` models) and only sets the effort level without including a reasoning summary. + +To opt-in to the `summary` feature, you can pass `reasoning_effort` as a dictionary. **Note:** The `summary` field requires your OpenAI organization to have verification status. Using `summary` without verification will result in a 400 error from OpenAI. 
+ + + +```python +# Option 1: String format (default - no summary) +response = litellm.completion( + model="openai/responses/gpt-5-mini", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort="high" # Only sets effort level +) + +# Option 2: Dict format (with optional summary - requires org verification) +response = litellm.completion( + model="openai/responses/gpt-5-mini", + messages=[{"role": "user", "content": "What is the capital of France?"}], + reasoning_effort={"effort": "high", "summary": "auto"} # "auto", "detailed", or "concise" (not all supported by all models) +) +``` + + + +```bash +# Option 1: String format (default - no summary) +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "openai/responses/gpt-5-mini", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "reasoning_effort": "high" +}' + +# Option 2: Dict format (with optional summary - requires org verification) +# summary options: "auto", "detailed", or "concise" (not all supported by all models) +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "openai/responses/gpt-5-mini", + "messages": [{"role": "user", "content": "What is the capital of France?"}], + "reasoning_effort": {"effort": "high", "summary": "auto"} +}' +``` + + + +**Summary field options:** +- `"auto"`: System automatically determines the appropriate summary level based on the model +- `"concise"`: Provides a shorter summary (not supported by GPT-5 series models) +- `"detailed"`: Offers a comprehensive reasoning summary + +**Note:** GPT-5 series models support `"auto"` and `"detailed"`, but do not support `"concise"`. O-series models (o3-pro, o4-mini, o3) support all three options. Some models like o3-mini and o1 do not support reasoning summaries at all. + +**Supported `reasoning_effort` values by model:** + +| Model | Default (when not set) | Supported Values | +|-------|----------------------|------------------| +| `gpt-5.1` | `none` | `none`, `low`, `medium`, `high` | +| `gpt-5` | `medium` | `minimal`, `low`, `medium`, `high` | +| `gpt-5-mini` | `medium` | `minimal`, `low`, `medium`, `high` | +| `gpt-5-nano` | `none` | `none`, `low`, `medium`, `high` | +| `gpt-5-codex` | `adaptive` | `low`, `medium`, `high` (no `minimal`) | +| `gpt-5.1-codex` | `adaptive` | `low`, `medium`, `high` (no `minimal`) | +| `gpt-5.1-codex-mini` | `adaptive` | `low`, `medium`, `high` (no `minimal`) | +| `gpt-5.1-codex-max` | `adaptive` | `low`, `medium`, `high`, `xhigh` (no `minimal`) | +| `gpt-5.2` | `medium` | `none`, `low`, `medium`, `high`, `xhigh` | +| `gpt-5.2-pro` | `high` | `low`, `medium`, `high`, `xhigh` | +| `gpt-5-pro` | `high` | `high` only | + +**Note:** +- GPT-5.1 introduced a new `reasoning_effort="none"` setting for faster, lower-latency responses. This replaces the `"minimal"` setting from GPT-5. +- `gpt-5.1-codex-max` and `gpt-5.2` models support `reasoning_effort="xhigh"`. All other models will reject this value. +- `gpt-5-pro` only accepts `reasoning_effort="high"`. Other values will return an error. +- When `reasoning_effort` is not set (None), OpenAI defaults to the value shown in the "Default" column. + +See [OpenAI Reasoning documentation](https://platform.openai.com/docs/guides/reasoning) for more details on organization verification requirements. 
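+
+To make the table above concrete, here is a minimal sketch (assuming these models are enabled for your OpenAI organization) that pairs each effort level with a model that accepts it, per the table:
+
+```python
+import litellm
+
+messages = [{"role": "user", "content": "Summarize the plot of Hamlet in two sentences."}]
+
+# gpt-5.2 supports the widest range, including the newer "xhigh" effort level
+response = litellm.completion(
+    model="gpt-5.2",
+    messages=messages,
+    reasoning_effort="xhigh",
+)
+
+# gpt-5.1 accepts "none" for faster, lower-latency responses (it replaces "minimal" from GPT-5)
+response = litellm.completion(
+    model="gpt-5.1",
+    messages=messages,
+    reasoning_effort="none",
+)
+
+# gpt-5-pro only accepts "high"; any other value is rejected by OpenAI
+response = litellm.completion(
+    model="gpt-5-pro",
+    messages=messages,
+    reasoning_effort="high",
+)
+```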
+ +### Verbosity Control for GPT-5 Models + +The `verbosity` parameter controls the length and detail of responses from GPT-5 family models. It accepts three values: `"low"`, `"medium"`, or `"high"`. + +**Supported models:** `gpt-5`, `gpt-5.1`, `gpt-5-mini`, `gpt-5-nano`, `gpt-5-pro` + +**Note:** GPT-5-Codex models (`gpt-5-codex`, `gpt-5.1-codex`, `gpt-5.1-codex-mini`, `gpt-5.1-codex-max`) do **not** support the `verbosity` parameter. + +**Use cases:** +- **`"low"`**: Best for concise answers or simple code generation (e.g., SQL queries) +- **`"medium"`**: Default - balanced output length +- **`"high"`**: Use when you need thorough explanations or extensive code refactoring + + + +```python +import litellm + +# Low verbosity - concise responses +response = litellm.completion( + model="gpt-5.1", + messages=[{"role": "user", "content": "Write a function to reverse a string"}], + verbosity="low" +) + +# High verbosity - detailed responses +response = litellm.completion( + model="gpt-5.1", + messages=[{"role": "user", "content": "Explain how neural networks work"}], + verbosity="high" +) +``` + + + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-5.1", + "messages": [{"role": "user", "content": "Write a function to reverse a string"}], + "verbosity": "low" +}' +``` + + + + ## OpenAI Chat Completion to Responses API Bridge Call any Responses API model from OpenAI's `/chat/completions` endpoint. @@ -846,4 +998,4 @@ response = completion( LiteLLM supports OpenAI's video generation models including Sora. -For detailed documentation on video generation, see [OpenAI Video Generation →](./openai/video_generation.md) \ No newline at end of file +For detailed documentation on video generation, see [OpenAI Video Generation →](./openai/video_generation.md) diff --git a/docs/my-website/docs/providers/openai/responses_api.md b/docs/my-website/docs/providers/openai/responses_api.md index 8d91ca674b71..75eab1afac54 100644 --- a/docs/my-website/docs/providers/openai/responses_api.md +++ b/docs/my-website/docs/providers/openai/responses_api.md @@ -623,6 +623,58 @@ display(styled_df) +## Function Calling + +```python showLineNumbers title="Function Calling with Parallel Tool Calls" +import litellm +import json + +tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Get current weather for a location", + "parameters": { + "type": "object", + "properties": { + "location": {"type": "string"} + }, + "required": ["location"] + } + } +] + +# Step 1: Request with tools (parallel_tool_calls=True allows multiple calls) +response = litellm.responses( + model="openai/gpt-4o", + input=[{"role": "user", "content": "What's the weather in Paris and Tokyo?"}], + tools=tools, + parallel_tool_calls=True, # Defaults = True +) + +# Step 2: Execute tool calls and collect results +tool_results = [] +for output in response.output: + if output.type == "function_call": + result = {"temperature": 15, "condition": "sunny"} # Your function logic here + tool_results.append({ + "type": "function_call_output", + "call_id": output.call_id, + "output": json.dumps(result) + }) + +# Step 3: Send results back +final_response = litellm.responses( + model="openai/gpt-4o", + input=tool_results, + tools=tools, +) + +print(final_response.output) +``` + +Set `parallel_tool_calls=False` to ensure zero or one tool is called per turn. 
[More details](https://platform.openai.com/docs/guides/function-calling#parallel-function-calling). + ## Free-form Function Calling @@ -633,7 +685,6 @@ display(styled_df) import litellm response = litellm.responses( - response = client.responses.create( model="gpt-5-mini", input="Please use the code_exec tool to calculate the area of a circle with radius equal to the number of 'r's in strawberry", text={"format": {"type": "text"}}, diff --git a/docs/my-website/docs/providers/openai/text_to_speech.md b/docs/my-website/docs/providers/openai/text_to_speech.md index a4aeb9e52575..f4507faa066d 100644 --- a/docs/my-website/docs/providers/openai/text_to_speech.md +++ b/docs/my-website/docs/providers/openai/text_to_speech.md @@ -46,7 +46,7 @@ os.environ["OPENAI_API_KEY"] = "sk-.." async def test_async_speech(): speech_file_path = Path(__file__).parent / "speech.mp3" - response = await litellm.aspeech( + response = await aspeech( model="openai/tts-1", voice="alloy", input="the quick brown fox jumped over the lazy dogs", diff --git a/docs/my-website/docs/providers/openai/videos.md b/docs/my-website/docs/providers/openai/videos.md index 473279e60ef9..202c79c2446e 100644 --- a/docs/my-website/docs/providers/openai/videos.md +++ b/docs/my-website/docs/providers/openai/videos.md @@ -36,7 +36,6 @@ print(f"Status: {response.status}") # Download video content when ready video_bytes = video_content( video_id=response.id, - model="sora-2" ) # Save to file @@ -44,6 +43,113 @@ with open("generated_video.mp4", "wb") as f: f.write(video_bytes) ``` +## **LiteLLM Proxy Usage** + +LiteLLM provides OpenAI API compatible video endpoints for complete video generation workflow: + +- `/videos/generations` - Generate new videos +- `/videos/remix` - Edit existing videos with reference images +- `/videos/status` - Check video generation status +- `/videos/retrieval` - Download completed videos + +**Setup** + +Add this to your litellm proxy config.yaml + +```yaml +model_list: + - model_name: sora-2 + litellm_params: + model: openai/sora-2 + api_key: os.environ/OPENAI_API_KEY +``` + +Start litellm + +```bash +litellm --config /path/to/config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +Test video generation request + +```bash +curl --location 'http://localhost:4000/v1/videos' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + "model": "sora-2", + "prompt": "A beautiful sunset over the ocean" +}' +``` + +Test video status request + +```bash +# Using custom-llm-provider header +curl --location 'http://localhost:4000/v1/videos/video_id' \ +--header 'Accept: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--header 'custom-llm-provider: openai' +``` + +Test video retrieval request + +```bash +# Using custom-llm-provider header +curl --location 'http://localhost:4000/v1/videos/video_id/content' \ +--header 'Accept: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--header 'custom-llm-provider: openai' \ +--output video.mp4 + +# Or using query parameter +curl --location 'http://localhost:4000/v1/videos/video_id/content?custom_llm_provider=openai' \ +--header 'Accept: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--output video.mp4 +``` + +Test video remix request + +```bash +# Using custom_llm_provider in request body +curl --location --request POST 'http://localhost:4000/v1/videos/video_id/remix' \ +--header 'Accept: application/json' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + 
"prompt": "New remix instructions", + "custom_llm_provider": "openai" +}' + +# Or using custom-llm-provider header +curl --location --request POST 'http://localhost:4000/v1/videos/video_id/remix' \ +--header 'Accept: application/json' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--header 'custom-llm-provider: openai' \ +--data '{ + "prompt": "New remix instructions" +}' +``` + +Test OpenAI video generation request + +```bash +curl http://localhost:4000/v1/videos \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sora-2", + "prompt": "A cat playing with a ball of yarn in a sunny garden", + "seconds": "8", + "size": "720x1280" + }' +``` + + ## Supported Models | Model Name | Description | Max Duration | Supported Sizes | @@ -64,8 +170,7 @@ with open("generated_video.mp4", "wb") as f: ```python # Download video content video_bytes = video_content( - video_id="video_1234567890", - model="sora-2" + video_id="video_1234567890" ) # Save to file @@ -96,8 +201,7 @@ def generate_and_download_video(prompt): # Step 3: Download video video_bytes = litellm.video_content( - video_id=video_id, - model="sora-2" + video_id=video_id ) # Step 4: Save to file @@ -112,6 +216,7 @@ video_file = generate_and_download_video( ) ``` + ## Video Editing with Reference Images ```python @@ -133,8 +238,7 @@ from litellm.exceptions import BadRequestError, AuthenticationError try: response = video_generation( - prompt="A cat playing with a ball of yarn", - model="sora-2" + prompt="A cat playing with a ball of yarn" ) except AuthenticationError as e: print(f"Authentication failed: {e}") diff --git a/docs/my-website/docs/providers/openai_compatible.md b/docs/my-website/docs/providers/openai_compatible.md index 2f11379a8db6..f67500f2b108 100644 --- a/docs/my-website/docs/providers/openai_compatible.md +++ b/docs/my-website/docs/providers/openai_compatible.md @@ -11,7 +11,7 @@ Selecting `openai` as the provider routes your request to an OpenAI-compatible e This library **requires** an API key for all requests, either through the `api_key` parameter or the `OPENAI_API_KEY` environment variable. -If you don’t want to provide a fake API key in each request, consider using a provider that directly matches your +If you don't want to provide a fake API key in each request, consider using a provider that directly matches your OpenAI-compatible endpoint, such as [`hosted_vllm`](/docs/providers/vllm) or [`llamafile`](/docs/providers/llamafile). ::: @@ -150,4 +150,4 @@ model_list: api_base: http://my-custom-base api_key: "" supports_system_message: False # 👈 KEY CHANGE -``` \ No newline at end of file +``` diff --git a/docs/my-website/docs/providers/openrouter.md b/docs/my-website/docs/providers/openrouter.md index 327634909b3d..38eb998c98b1 100644 --- a/docs/my-website/docs/providers/openrouter.md +++ b/docs/my-website/docs/providers/openrouter.md @@ -1,5 +1,5 @@ # OpenRouter -LiteLLM supports all the text / chat / vision models from [OpenRouter](https://openrouter.ai/docs) +LiteLLM supports all the text / chat / vision / embedding models from [OpenRouter](https://openrouter.ai/docs)
Open In Colab @@ -78,3 +78,135 @@ response = completion( route= "" ) ``` + +## Embedding + +```python +from litellm import embedding +import os + +os.environ["OPENROUTER_API_KEY"] = "your-api-key" + +response = embedding( + model="openrouter/openai/text-embedding-3-small", + input=["good morning from litellm", "this is another item"], +) +print(response) +``` + +## Image Generation + +OpenRouter supports image generation through select models like Google Gemini image generation models. LiteLLM transforms standard image generation requests to OpenRouter's chat completion format. + +### Supported Parameters + +- `size`: Maps to OpenRouter's `aspect_ratio` format + - `1024x1024` → `1:1` (square) + - `1536x1024` → `3:2` (landscape) + - `1024x1536` → `2:3` (portrait) + - `1792x1024` → `16:9` (wide landscape) + - `1024x1792` → `9:16` (tall portrait) + +- `quality`: Maps to OpenRouter's `image_size` format (Gemini models) + - `low` or `standard` → `1K` + - `medium` → `2K` + - `high` or `hd` → `4K` + +- `n`: Number of images to generate + +### Usage + +```python +from litellm import image_generation +import os + +os.environ["OPENROUTER_API_KEY"] = "your-api-key" + +# Basic image generation +response = image_generation( + model="openrouter/google/gemini-2.5-flash-image", + prompt="A beautiful sunset over a calm ocean", +) +print(response) +``` + +### Advanced Usage with Parameters + +```python +from litellm import image_generation +import os + +os.environ["OPENROUTER_API_KEY"] = "your-api-key" + +# Generate high-quality landscape image +response = image_generation( + model="openrouter/google/gemini-2.5-flash-image", + prompt="A serene mountain landscape with a lake", + size="1536x1024", # Landscape format + quality="high", # High quality (4K) +) + +# Access the generated image +image_data = response.data[0] +if image_data.b64_json: + # Base64 encoded image + print(f"Generated base64 image: {image_data.b64_json[:50]}...") +elif image_data.url: + # Image URL + print(f"Generated image URL: {image_data.url}") +``` + +### Using OpenRouter-Specific Parameters + +You can also pass OpenRouter-specific parameters directly using `image_config`: + +```python +from litellm import image_generation +import os + +os.environ["OPENROUTER_API_KEY"] = "your-api-key" + +response = image_generation( + model="openrouter/google/gemini-2.5-flash-image", + prompt="A futuristic cityscape at night", + image_config={ + "aspect_ratio": "16:9", # OpenRouter native format + "image_size": "4K" # OpenRouter native format + } +) +print(response) +``` + +### Response Format + +The response follows the standard LiteLLM ImageResponse format: + +```python +{ + "created": 1703658209, + "data": [{ + "b64_json": "iVBORw0KGgoAAAANSUhEUgAA...", # Base64 encoded image + "url": None, + "revised_prompt": None + }], + "usage": { + "input_tokens": 10, + "output_tokens": 1290, + "total_tokens": 1300 + } +} +``` + +### Cost Tracking + +OpenRouter provides cost information in the response, which LiteLLM automatically tracks: + +```python +response = image_generation( + model="openrouter/google/gemini-2.5-flash-image", + prompt="A cute baby sea otter", +) + +# Cost is available in the response metadata +print(f"Request cost: ${response._hidden_params['additional_headers']['llm_provider-x-litellm-response-cost']}") +``` diff --git a/docs/my-website/docs/providers/ovhcloud.md b/docs/my-website/docs/providers/ovhcloud.md index 6c42208f2ccc..94625b0f2edc 100644 --- a/docs/my-website/docs/providers/ovhcloud.md +++ 
b/docs/my-website/docs/providers/ovhcloud.md @@ -311,6 +311,21 @@ response = embedding( print(response.data) ``` +### Audio Transcription + +```python +from litellm import transcription + +audio_file = open("path/to/your/audio.wav", "rb") + +response = transcription( + model="ovhcloud/whisper-large-v3-turbo", + file=audio_file +) + +print(response.text) +``` + ## Usage with LiteLLM Proxy Server Here's how to call a OVHCloud AI Endpoints model with the LiteLLM Proxy Server diff --git a/docs/my-website/docs/providers/poe.md b/docs/my-website/docs/providers/poe.md new file mode 100644 index 000000000000..ba4089ae6a49 --- /dev/null +++ b/docs/my-website/docs/providers/poe.md @@ -0,0 +1,139 @@ +# Poe + +## Overview + +| Property | Details | +|-------|-------| +| Description | Poe is Quora's AI platform that provides access to more than 100 models across text, image, video, and voice modalities through a developer-friendly API. | +| Provider Route on LiteLLM | `poe/` | +| Link to Provider Doc | [Poe Website ↗](https://poe.com) | +| Base URL | `https://api.poe.com/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+ +## What is Poe? + +Poe is Quora's comprehensive AI platform that offers: +- **100+ Models**: Access to a wide variety of AI models +- **Multiple Modalities**: Text, image, video, and voice AI +- **Popular Models**: Including OpenAI's GPT series and Anthropic's Claude +- **Developer API**: Easy integration for applications +- **Extensive Reach**: Benefits from Quora's 400M monthly unique visitors + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["POE_API_KEY"] = "" # your Poe API key +``` + +Get your Poe API key from the [Poe platform](https://poe.com). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Poe Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["POE_API_KEY"] = "" # your Poe API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# Poe call +response = completion( + model="poe/model-name", # Replace with actual model name + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Poe Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["POE_API_KEY"] = "" # your Poe API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# Poe call with streaming +response = completion( + model="poe/model-name", # Replace with actual model name + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export POE_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: poe-model + litellm_params: + model: poe/model-name # Replace with actual model name + api_key: os.environ/POE_API_KEY +``` + +## Supported OpenAI Parameters + +Poe supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID from 100+ available models | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. Stop sequences | +| `tools` | array | Optional. List of available tools/functions | +| `tool_choice` | string/object | Optional. Control tool/function calling | +| `response_format` | object | Optional. Response format specification | +| `user` | string | Optional. 
User identifier | + +## Available Model Categories + +Poe provides access to models across multiple providers: +- **OpenAI Models**: Including GPT-4, GPT-4 Turbo, GPT-3.5 Turbo +- **Anthropic Models**: Including Claude 3 Opus, Sonnet, Haiku +- **Other Popular Models**: Various provider models available +- **Multi-Modal**: Text, image, video, and voice models + +## Platform Benefits + +Using Poe through LiteLLM offers several advantages: +- **Unified Access**: Single API for many different models +- **Quora Integration**: Access to large user base and content ecosystem +- **Content Sharing**: Capabilities to share model outputs with followers +- **Content Distribution**: Best AI content distributed to all users +- **Model Discovery**: Efficient way to explore new AI models + +## Developer Resources + +Poe is actively building developer features and welcomes early access requests for API integration. + +## Additional Resources + +- [Poe Website](https://poe.com) +- [Poe AI Quora Space](https://poeai.quora.com) +- [Quora Blog Post about Poe](https://quorablog.quora.com/Poe) diff --git a/docs/my-website/docs/providers/publicai.md b/docs/my-website/docs/providers/publicai.md new file mode 100644 index 000000000000..1ab8bd5a06c4 --- /dev/null +++ b/docs/my-website/docs/providers/publicai.md @@ -0,0 +1,209 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# PublicAI + +## Overview + +| Property | Details | +|-------|-------| +| Description | PublicAI provides large language models including essential models like the swiss-ai apertus model. | +| Provider Route on LiteLLM | `publicai/` | +| Link to Provider Doc | [PublicAI ↗](https://platform.publicai.co/) | +| Base URL | `https://platform.publicai.co/` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+
+ +https://platform.publicai.co/ + +**We support ALL PublicAI models, just set `publicai/` as a prefix when sending completion requests** + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["PUBLICAI_API_KEY"] = "" # your PublicAI API key +``` + +You can overwrite the base url with: + +``` +os.environ["PUBLICAI_API_BASE"] = "https://platform.publicai.co/v1" +``` + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="PublicAI Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["PUBLICAI_API_KEY"] = "" # your PublicAI API key + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# PublicAI call +response = completion( + model="publicai/swiss-ai/apertus-8b-instruct", + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="PublicAI Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["PUBLICAI_API_KEY"] = "" # your PublicAI API key + +messages = [{"content": "Hello, how are you?", "role": "user"}] + +# PublicAI call with streaming +response = completion( + model="publicai/swiss-ai/apertus-8b-instruct", + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy + +Add the following to your LiteLLM Proxy configuration file: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: swiss-ai-apertus-8b + litellm_params: + model: publicai/swiss-ai/apertus-8b-instruct + api_key: os.environ/PUBLICAI_API_KEY + + - model_name: swiss-ai-apertus-70b + litellm_params: + model: publicai/swiss-ai/apertus-70b-instruct + api_key: os.environ/PUBLICAI_API_KEY +``` + +Start your LiteLLM Proxy server: + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + + + + +```python showLineNumbers title="PublicAI via Proxy - Non-streaming" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-proxy-api-key" # Your proxy API key +) + +# Non-streaming response +response = client.chat.completions.create( + model="swiss-ai-apertus-8b", + messages=[{"role": "user", "content": "hello from litellm"}] +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="PublicAI via Proxy - Streaming" +from openai import OpenAI + +# Initialize client with your proxy URL +client = OpenAI( + base_url="http://localhost:4000", # Your proxy URL + api_key="your-proxy-api-key" # Your proxy API key +) + +# Streaming response +response = client.chat.completions.create( + model="swiss-ai-apertus-8b", + messages=[{"role": "user", "content": "hello from litellm"}], + stream=True +) + +for chunk in response: + if chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + + +```python showLineNumbers title="PublicAI via Proxy - LiteLLM SDK" +import litellm + +# Configure LiteLLM to use your proxy +response = litellm.completion( + model="litellm_proxy/swiss-ai-apertus-8b", + messages=[{"role": "user", "content": "hello from litellm"}], + api_base="http://localhost:4000", + api_key="your-proxy-api-key" +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="PublicAI via Proxy - LiteLLM SDK Streaming" +import litellm + +# Configure LiteLLM to use your proxy with streaming +response = 
litellm.completion( + model="litellm_proxy/swiss-ai-apertus-8b", + messages=[{"role": "user", "content": "hello from litellm"}], + api_base="http://localhost:4000", + api_key="your-proxy-api-key", + stream=True +) + +for chunk in response: + if hasattr(chunk.choices[0], 'delta') and chunk.choices[0].delta.content is not None: + print(chunk.choices[0].delta.content, end="") +``` + + + + + +```bash showLineNumbers title="PublicAI via Proxy - cURL" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-proxy-api-key" \ + -d '{ + "model": "swiss-ai-apertus-8b", + "messages": [{"role": "user", "content": "hello from litellm"}] + }' +``` + +```bash showLineNumbers title="PublicAI via Proxy - cURL Streaming" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-proxy-api-key" \ + -d '{ + "model": "swiss-ai-apertus-8b", + "messages": [{"role": "user", "content": "hello from litellm"}], + "stream": true + }' +``` + + + + +For more detailed information on using the LiteLLM Proxy, see the [LiteLLM Proxy documentation](../providers/litellm_proxy). diff --git a/docs/my-website/docs/providers/pydantic_ai_agent.md b/docs/my-website/docs/providers/pydantic_ai_agent.md new file mode 100644 index 000000000000..e96295faaf34 --- /dev/null +++ b/docs/my-website/docs/providers/pydantic_ai_agent.md @@ -0,0 +1,121 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Pydantic AI Agents + +Call Pydantic AI Agents via LiteLLM's A2A Gateway. + +| Property | Details | +|----------|---------| +| Description | Pydantic AI agents with native A2A support via the `to_a2a()` method. LiteLLM provides fake streaming support for agents that don't natively stream. | +| Provider Route on LiteLLM | A2A Gateway | +| Supported Endpoints | `/v1/a2a/message/send` | +| Provider Doc | [Pydantic AI Agents ↗](https://ai.pydantic.dev/agents/) | + +## LiteLLM A2A Gateway + +All Pydantic AI agents need to be exposed as A2A agents using the `to_a2a()` method. Once your agent server is running, you can add it to the LiteLLM Gateway. + +### 1. Setup Pydantic AI Agent Server + +LiteLLM requires Pydantic AI agents to follow the [A2A (Agent-to-Agent) protocol](https://github.com/google/A2A). Pydantic AI has native A2A support via the `to_a2a()` method, which exposes your agent as an A2A-compliant server. + +#### Install Dependencies + +```bash +pip install pydantic-ai fasta2a uvicorn +``` + +#### Create Agent + +```python title="agent.py" +from pydantic_ai import Agent + +agent = Agent('openai:gpt-4o-mini', instructions='Be helpful!') + +@agent.tool_plain +def get_weather(city: str) -> str: + """Get weather for a city.""" + return f"Weather in {city}: Sunny, 72°F" + +@agent.tool_plain +def calculator(expression: str) -> str: + """Evaluate a math expression.""" + return str(eval(expression)) + +# Native A2A server - Pydantic AI handles it automatically +app = agent.to_a2a() +``` + +#### Run Server + +```bash +uvicorn agent:app --host 0.0.0.0 --port 9999 +``` + +Server runs at `http://localhost:9999` + +### 2. Navigate to Agents + +From the sidebar, click "Agents" to open the agent management page, then click "+ Add New Agent". + +### 3. Select Pydantic AI Agent Type + +Click "A2A Standard" to see available agent types, then select "Pydantic AI". 
+ +![Select A2A Standard](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/1055acb1-064b-4465-8e6a-8278291bc661/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=395,147) + +![Select Pydantic AI](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/0998e38c-8534-40f1-931a-be96c2cae0ad/ascreenshot.jpeg?tl_px=0,52&br_px=2201,1283&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=421,277) + +### 4. Configure the Agent + +Fill in the following fields: + +- **Agent Name** - A unique identifier for your agent (e.g., `test-pydantic-agent`) +- **Agent URL** - The URL where your Pydantic AI agent is running. We use `http://localhost:9999` because that's where we started our Pydantic AI agent server in the previous step. + +![Enter Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/8cf3fbde-05f3-48d1-81b6-6f857bd6d360/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=443,225) + +![Configure Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/fb555808-4761-4c49-a415-200ac1bdb525/ascreenshot.jpeg?tl_px=0,0&br_px=2617,1463&force_format=jpeg&q=100&width=1120.0) + +![Enter Agent URL](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/303eae61-4352-4fb0-a537-806839c234ba/ascreenshot.jpeg?tl_px=0,212&br_px=2201,1443&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=456,277) + +### 5. Create Agent + +Click "Create Agent" to save your configuration. + +![Create Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/914f3367-df7d-4244-bd4d-e99ce0a6193a/ascreenshot.jpeg?tl_px=416,438&br_px=2618,1669&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=690,277) + +### 6. Test in Playground + +Go to "Playground" in the sidebar to test your agent. + +![Go to Playground](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/c73c9f3b-22af-4105-aafa-2d34c4986ef3/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=44,97) + +### 7. Select A2A Endpoint + +Click the endpoint dropdown and search for "a2a", then select `/v1/a2a/message/send`. 
+ +![Click Endpoint Dropdown](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/196d97ac-bcba-47f0-9880-97b80250e00c/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=261,230) + +![Search for A2A](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/26b68f21-29f9-4c4c-b8b5-d2e11cbfd14a/ascreenshot.jpeg?tl_px=0,0&br_px=2617,1463&force_format=jpeg&q=100&width=1120.0) + +![Select A2A Endpoint](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/41576fb1-d385-4fb2-84e9-142dd7fe5181/ascreenshot.jpeg?tl_px=0,0&br_px=2201,1230&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=307,270) + +### 8. Select Your Agent and Send a Message + +Pick your Pydantic AI agent from the dropdown and send a test message. + +![Click Agent Dropdown](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/a96d7967-3d54-4cbf-bd3e-b38f1be9df76/ascreenshot.jpeg?tl_px=0,54&br_px=2201,1285&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=274,277) + +![Select Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/e05a5a6e-d044-4480-b94e-7c03cfb92ac5/ascreenshot.jpeg?tl_px=0,113&br_px=2201,1344&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=290,277) + +![Send Message](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/29162702-968a-401a-aac1-c844bfc5f4a3/ascreenshot.jpeg?tl_px=91,653&br_px=2292,1883&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,436) + + +## Further Reading + +- [Pydantic AI Documentation](https://ai.pydantic.dev/) +- [Pydantic AI Agents](https://ai.pydantic.dev/agents/) +- [A2A Agent Gateway](../a2a.md) +- [A2A Cost Tracking](../a2a_cost_tracking.md) diff --git a/docs/my-website/docs/providers/ragflow.md b/docs/my-website/docs/providers/ragflow.md new file mode 100644 index 000000000000..73223bd07b5f --- /dev/null +++ b/docs/my-website/docs/providers/ragflow.md @@ -0,0 +1,244 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# RAGFlow + +Litellm supports Ragflow's chat completions APIs + +## Supported Features + +- ✅ Chat completions +- ✅ Streaming responses +- ✅ Both chat and agent endpoints +- ✅ Multiple credential sources (params, env vars, litellm_params) +- ✅ OpenAI-compatible API format + + +## API Key + +```python +# env variable +os.environ['RAGFLOW_API_KEY'] +``` + +## API Base + +```python +# env variable +os.environ['RAGFLOW_API_BASE'] +``` + +## Overview + +RAGFlow provides OpenAI-compatible APIs with unique path structures that include chat and agent IDs: + +- **Chat endpoint**: `/api/v1/chats_openai/{chat_id}/chat/completions` +- **Agent endpoint**: `/api/v1/agents_openai/{agent_id}/chat/completions` + +The model name format 
embeds the endpoint type and ID: +- Chat: `ragflow/chat/{chat_id}/{model_name}` +- Agent: `ragflow/agent/{agent_id}/{model_name}` + + +## Sample Usage - Chat Endpoint + +```python +from litellm import completion +import os + +os.environ['RAGFLOW_API_KEY'] = "your-ragflow-api-key" +os.environ['RAGFLOW_API_BASE'] = "http://localhost:9380" # or your hosted URL + +response = completion( + model="ragflow/chat/my-chat-id/gpt-4o-mini", + messages=[{"role": "user", "content": "How does the deep doc understanding work?"}] +) +print(response) +``` + +## Sample Usage - Agent Endpoint + +```python +from litellm import completion +import os + +os.environ['RAGFLOW_API_KEY'] = "your-ragflow-api-key" +os.environ['RAGFLOW_API_BASE'] = "http://localhost:9380" # or your hosted URL + +response = completion( + model="ragflow/agent/my-agent-id/gpt-4o-mini", + messages=[{"role": "user", "content": "What are the key features?"}] +) +print(response) +``` + +## Sample Usage - With Parameters + +You can also pass `api_key` and `api_base` directly as parameters: + +```python +from litellm import completion + +response = completion( + model="ragflow/chat/my-chat-id/gpt-4o-mini", + messages=[{"role": "user", "content": "Hello!"}], + api_key="your-ragflow-api-key", + api_base="http://localhost:9380" +) +print(response) +``` + +## Sample Usage - Streaming + +```python +from litellm import completion +import os + +os.environ['RAGFLOW_API_KEY'] = "your-ragflow-api-key" +os.environ['RAGFLOW_API_BASE'] = "http://localhost:9380" + +response = completion( + model="ragflow/agent/my-agent-id/gpt-4o-mini", + messages=[{"role": "user", "content": "Explain RAGFlow"}], + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Model Name Format + +The model name must follow one of these formats: + +### Chat Endpoint +``` +ragflow/chat/{chat_id}/{model_name} +``` + +Example: `ragflow/chat/my-chat-id/gpt-4o-mini` + +### Agent Endpoint +``` +ragflow/agent/{agent_id}/{model_name} +``` + +Example: `ragflow/agent/my-agent-id/gpt-4o-mini` + +Where: +- `{chat_id}` or `{agent_id}` is the ID of your chat or agent in RAGFlow +- `{model_name}` is the actual model name (e.g., `gpt-4o-mini`, `gpt-4o`, etc.) + +## Configuration Sources + +LiteLLM supports multiple ways to provide credentials, checked in this order: + +1. **Function parameters**: `api_key="..."`, `api_base="..."` +2. **litellm_params**: `litellm_params={"api_key": "...", "api_base": "..."}` +3. **Environment variables**: `RAGFLOW_API_KEY`, `RAGFLOW_API_BASE` +4. **Global litellm settings**: `litellm.api_key`, `litellm.api_base` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export RAGFLOW_API_KEY="your-ragflow-api-key" +export RAGFLOW_API_BASE="http://localhost:9380" +``` + +### 2. Start the proxy + + + + +```yaml +model_list: + - model_name: ragflow-chat-gpt4 + litellm_params: + model: ragflow/chat/my-chat-id/gpt-4o-mini + api_key: os.environ/RAGFLOW_API_KEY + api_base: os.environ/RAGFLOW_API_BASE + - model_name: ragflow-agent-gpt4 + litellm_params: + model: ragflow/agent/my-agent-id/gpt-4o-mini + api_key: os.environ/RAGFLOW_API_KEY + api_base: os.environ/RAGFLOW_API_BASE +``` + + + + +```bash +$ litellm --config /path/to/config.yaml + +# Server running on http://0.0.0.0:4000 +``` + + + + +### 3. 
Test it + + + + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "ragflow-chat-gpt4", + "messages": [ + {"role": "user", "content": "How does RAGFlow work?"} + ] + }' +``` + + + + +```python +from openai import OpenAI + +client = OpenAI( + api_key="sk-1234", # Your LiteLLM proxy key + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="ragflow-chat-gpt4", + messages=[ + {"role": "user", "content": "How does RAGFlow work?"} + ] +) +print(response) +``` + + + + +## API Base URL Handling + +The `api_base` parameter can be provided with or without `/v1` suffix. LiteLLM will automatically handle it: + +- `http://localhost:9380` → `http://localhost:9380/api/v1/chats_openai/{chat_id}/chat/completions` +- `http://localhost:9380/v1` → `http://localhost:9380/api/v1/chats_openai/{chat_id}/chat/completions` +- `http://localhost:9380/api/v1` → `http://localhost:9380/api/v1/chats_openai/{chat_id}/chat/completions` + +All three formats will work correctly. + +## Error Handling + +If you encounter errors: + +1. **Invalid model format**: Ensure your model name follows `ragflow/{chat|agent}/{id}/{model_name}` format +2. **Missing api_base**: Provide `api_base` via parameter, environment variable, or litellm_params +3. **Connection errors**: Verify your RAGFlow server is running and accessible at the provided `api_base` + +:::info + +For more information about passing provider-specific parameters, [go here](../completion/provider_specific_params.md) + +::: + diff --git a/docs/my-website/docs/providers/ragflow_vector_store.md b/docs/my-website/docs/providers/ragflow_vector_store.md new file mode 100644 index 000000000000..bc014cacbe6f --- /dev/null +++ b/docs/my-website/docs/providers/ragflow_vector_store.md @@ -0,0 +1,349 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +import Image from '@theme/IdealImage'; + +# RAGFlow Vector Stores + +Litellm support creation and management of datasets for document processing and knowledge base management in Ragflow. + +| Property | Details | +|----------|---------| +| Description | RAGFlow datasets enable document processing, chunking, and knowledge base management for RAG applications. | +| Provider Route on LiteLLM | `ragflow` in the litellm vector_store_registry | +| Provider Doc | [RAGFlow API Documentation ↗](https://ragflow.io/docs) | +| Supported Operations | Dataset Management (Create, List, Update, Delete) | +| Search/Retrieval | ❌ Not supported (management only) | + +## Quick Start + +### LiteLLM Python SDK + +```python showLineNumbers title="Example using LiteLLM Python SDK" +import os +import litellm + +# Set RAGFlow credentials +os.environ["RAGFLOW_API_KEY"] = "your-ragflow-api-key" +os.environ["RAGFLOW_API_BASE"] = "http://localhost:9380" # Optional, defaults to localhost:9380 + +# Create a RAGFlow dataset +response = litellm.vector_stores.create( + name="my-dataset", + custom_llm_provider="ragflow", + metadata={ + "description": "My knowledge base dataset", + "embedding_model": "BAAI/bge-large-zh-v1.5@BAAI", + "chunk_method": "naive" + } +) + +print(f"Created dataset ID: {response.id}") +print(f"Dataset name: {response.name}") +``` + +### LiteLLM Proxy + +#### 1. 
Configure your vector_store_registry + + + + +```yaml +model_list: + - model_name: gpt-4o-mini + litellm_params: + model: gpt-4o-mini + api_key: os.environ/OPENAI_API_KEY + +vector_store_registry: + - vector_store_name: "ragflow-knowledge-base" + litellm_params: + vector_store_id: "your-dataset-id" + custom_llm_provider: "ragflow" + api_key: os.environ/RAGFLOW_API_KEY + api_base: os.environ/RAGFLOW_API_BASE # Optional + vector_store_description: "RAGFlow dataset for knowledge base" + vector_store_metadata: + source: "Company documentation" +``` + + + + + +On the LiteLLM UI, Navigate to Experimental > Vector Stores > Create Vector Store. On this page you can create a vector store with a name, vector store id and credentials. + + + + + + +#### 2. Create a dataset via Proxy + + + + +```bash +curl http://localhost:4000/v1/vector_stores \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "name": "my-ragflow-dataset", + "custom_llm_provider": "ragflow", + "metadata": { + "description": "Test dataset", + "chunk_method": "naive" + } + }' +``` + + + + + +```python +from openai import OpenAI + +# Initialize client with your LiteLLM proxy URL +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +# Create a RAGFlow dataset +response = client.vector_stores.create( + name="my-ragflow-dataset", + custom_llm_provider="ragflow", + metadata={ + "description": "Test dataset", + "chunk_method": "naive" + } +) + +print(f"Created dataset: {response.id}") +``` + + + + +## Configuration + +### Environment Variables + +RAGFlow vector stores support configuration via environment variables: + +- `RAGFLOW_API_KEY` - Your RAGFlow API key (required) +- `RAGFLOW_API_BASE` - RAGFlow API base URL (optional, defaults to `http://localhost:9380`) + +### Parameters + +You can also pass these via `litellm_params`: + +- `api_key` - RAGFlow API key (overrides `RAGFLOW_API_KEY` env var) +- `api_base` - RAGFlow API base URL (overrides `RAGFLOW_API_BASE` env var) + +## Dataset Creation Options + +### Basic Dataset Creation + +```python +response = litellm.vector_stores.create( + name="basic-dataset", + custom_llm_provider="ragflow" +) +``` + +### Dataset with Chunk Method + +RAGFlow supports various chunk methods for different document types: + + + + +```python +response = litellm.vector_stores.create( + name="general-dataset", + custom_llm_provider="ragflow", + metadata={ + "chunk_method": "naive", + "parser_config": { + "chunk_token_num": 512, + "delimiter": "\n", + "html4excel": False, + "layout_recognize": "DeepDOC" + } + } +) +``` + + + + + +```python +response = litellm.vector_stores.create( + name="book-dataset", + custom_llm_provider="ragflow", + metadata={ + "chunk_method": "book", + "parser_config": { + "raptor": { + "use_raptor": False + } + } + } +) +``` + + + + + +```python +response = litellm.vector_stores.create( + name="qa-dataset", + custom_llm_provider="ragflow", + metadata={ + "chunk_method": "qa", + "parser_config": { + "raptor": { + "use_raptor": False + } + } + } +) +``` + + + + + +```python +response = litellm.vector_stores.create( + name="paper-dataset", + custom_llm_provider="ragflow", + metadata={ + "chunk_method": "paper", + "parser_config": { + "raptor": { + "use_raptor": False + } + } + } +) +``` + + + + +### Dataset with Ingestion Pipeline + +Instead of using a chunk method, you can use an ingestion pipeline: + +```python +response = litellm.vector_stores.create( + name="pipeline-dataset", + 
custom_llm_provider="ragflow", + metadata={ + "parse_type": 2, # Number of parsers in your pipeline + "pipeline_id": "d0bebe30ae2211f0970942010a8e0005" # 32-character hex ID + } +) +``` + +**Note**: `chunk_method` and `pipeline_id` are mutually exclusive. Use one or the other. + +### Advanced Parser Configuration + +```python +response = litellm.vector_stores.create( + name="advanced-dataset", + custom_llm_provider="ragflow", + metadata={ + "chunk_method": "naive", + "description": "Advanced dataset with custom parser config", + "embedding_model": "BAAI/bge-large-zh-v1.5@BAAI", + "permission": "me", # or "team" + "parser_config": { + "chunk_token_num": 1024, + "delimiter": "\n!?;。;!?", + "html4excel": True, + "layout_recognize": "DeepDOC", + "auto_keywords": 5, + "auto_questions": 3, + "task_page_size": 12, + "raptor": { + "use_raptor": True + }, + "graphrag": { + "use_graphrag": False + } + } + } +) +``` + +## Supported Chunk Methods + +RAGFlow supports the following chunk methods: + +- `naive` - General purpose (default) +- `book` - For book documents +- `email` - For email documents +- `laws` - For legal documents +- `manual` - Manual chunking +- `one` - Single chunk +- `paper` - For academic papers +- `picture` - For image documents +- `presentation` - For presentation documents +- `qa` - Q&A format +- `table` - For table documents +- `tag` - Tag-based chunking + +## RAGFlow-Specific Parameters + +All RAGFlow-specific parameters should be passed via the `metadata` field: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `avatar` | string | Base64 encoding of the avatar (max 65535 chars) | +| `description` | string | Brief description of the dataset (max 65535 chars) | +| `embedding_model` | string | Embedding model name (e.g., "BAAI/bge-large-zh-v1.5@BAAI") | +| `permission` | string | Access permission: "me" (default) or "team" | +| `chunk_method` | string | Chunking method (see supported methods above) | +| `parser_config` | object | Parser configuration (varies by chunk_method) | +| `parse_type` | int | Number of parsers in pipeline (required with pipeline_id) | +| `pipeline_id` | string | 32-character hex pipeline ID (required with parse_type) | + +## Error Handling + +RAGFlow returns error responses in the following format: + +```json +{ + "code": 101, + "message": "Dataset name 'my-dataset' already exists" +} +``` + +LiteLLM automatically maps these to appropriate exceptions: + +- `code != 0` → Raises exception with the error message +- Missing required fields → Raises `ValueError` +- Mutually exclusive parameters → Raises `ValueError` + +## Limitations + +- **Search/Retrieval**: RAGFlow vector stores support dataset management only. Search operations are not supported and will raise `NotImplementedError`. +- **List/Update/Delete**: These operations are not yet implemented through the standard vector store API. Use RAGFlow's native API endpoints directly. 
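+
+As a recap of the error-handling behavior above, here is a minimal sketch (the dataset name and credentials are placeholders, and the exact exception class raised for provider-side errors is not specified, so a broad `Exception` is caught) showing how dataset creation failures surface:
+
+```python
+import os
+import litellm
+
+os.environ["RAGFLOW_API_KEY"] = "your-ragflow-api-key"
+os.environ["RAGFLOW_API_BASE"] = "http://localhost:9380"
+
+try:
+    response = litellm.vector_stores.create(
+        name="my-dataset",  # duplicate names are rejected by RAGFlow with a non-zero code
+        custom_llm_provider="ragflow",
+        metadata={"chunk_method": "naive"},
+    )
+    print(f"Created dataset: {response.id}")
+except ValueError as e:
+    # Raised locally, e.g. missing required fields or passing both chunk_method and pipeline_id
+    print(f"Invalid request: {e}")
+except Exception as e:
+    # Non-zero RAGFlow response codes surface as exceptions carrying the provider's message
+    print(f"RAGFlow error: {e}")
+```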
+ +## Further Reading + +Vector Stores: +- [Vector Store Creation](../vector_stores/create.md) +- [Using Vector Stores with Completions](../completion/knowledgebase.md) +- [Vector Store Registry](../completion/knowledgebase.md#vectorstoreregistry) + diff --git a/docs/my-website/docs/providers/runwayml/images.md b/docs/my-website/docs/providers/runwayml/images.md new file mode 100644 index 000000000000..00146d10baa3 --- /dev/null +++ b/docs/my-website/docs/providers/runwayml/images.md @@ -0,0 +1,198 @@ +# RunwayML - Image Generation + +## Overview + +| Property | Details | +|-------|-------| +| Description | RunwayML provides advanced AI-powered image generation with high-quality results | +| Provider Route on LiteLLM | `runwayml/` | +| Supported Operations | [`/images/generations`](#quick-start) | +| Link to Provider Doc | [RunwayML API ↗](https://docs.dev.runwayml.com/) | + +LiteLLM supports RunwayML's Gen-4 image generation API, allowing you to generate high-quality images from text prompts. + +## Quick Start + +```python showLineNumbers title="Basic Image Generation" +from litellm import image_generation +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +response = image_generation( + model="runwayml/gen4_image", + prompt="A serene mountain landscape at sunset", + size="1920x1080" +) + +print(response.data[0].url) +``` + +## Authentication + +Set your RunwayML API key: + +```python showLineNumbers title="Set API Key" +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" +``` + +## Supported Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Model to use (e.g., `runwayml/gen4_image`) | +| `prompt` | string | Yes | Text description for the image | +| `size` | string | No | Image dimensions (default: `1920x1080`) | + +### Supported Sizes + +- `1024x1024` +- `1792x1024` +- `1024x1792` +- `1920x1080` (default) +- `1080x1920` + +## Async Usage + +```python showLineNumbers title="Async Image Generation" +from litellm import aimage_generation +import os +import asyncio + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +async def generate_image(): + response = await aimage_generation( + model="runwayml/gen4_image", + prompt="A futuristic city skyline at night", + size="1920x1080" + ) + + print(response.data[0].url) + +asyncio.run(generate_image()) +``` + +## LiteLLM Proxy Usage + +Add RunwayML to your proxy configuration: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: gen4-image + litellm_params: + model: runwayml/gen4_image + api_key: os.environ/RUNWAYML_API_KEY +``` + +Start the proxy: + +```bash +litellm --config /path/to/config.yaml +``` + +Generate images through the proxy: + +```bash showLineNumbers title="Proxy Request" +curl --location 'http://localhost:4000/v1/images/generations' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + "model": "runwayml/gen4_image", + "prompt": "A serene mountain landscape at sunset", + "size": "1920x1080" +}' +``` + +## Supported Models + +| Model | Description | Default Size | +|-------|-------------|--------------| +| `runwayml/gen4_image` | High-quality image generation | 1920x1080 | + +## Cost Tracking + +LiteLLM automatically tracks RunwayML image generation costs: + +```python showLineNumbers title="Cost Tracking" +from litellm import image_generation, completion_cost + +response = image_generation( + model="runwayml/gen4_image", + prompt="A serene mountain landscape 
at sunset", + size="1920x1080" +) + +cost = completion_cost(completion_response=response) +print(f"Image generation cost: ${cost}") +``` + +## Supported Features + +| Feature | Supported | +|---------|-----------| +| Image Generation | ✅ | +| Cost Tracking | ✅ | +| Logging | ✅ | +| Fallbacks | ✅ | +| Load Balancing | ✅ | + + + +## How It Works + +RunwayML uses an asynchronous task-based API pattern. LiteLLM handles the polling and response transformation automatically. + +### Complete Flow Diagram + +```mermaid +sequenceDiagram + participant Client + box rgb(200, 220, 255) LiteLLM AI Gateway + participant LiteLLM + end + participant RunwayML as RunwayML API + + Client->>LiteLLM: POST /images/generations (OpenAI format) + Note over LiteLLM: Transform to RunwayML format + + LiteLLM->>RunwayML: POST v1/text_to_image + RunwayML-->>LiteLLM: 200 OK + task ID + + Note over LiteLLM: Automatic Polling + loop Every 2 seconds + LiteLLM->>RunwayML: GET v1/tasks/{task_id} + RunwayML-->>LiteLLM: Status: RUNNING + end + + LiteLLM->>RunwayML: GET v1/tasks/{task_id} + RunwayML-->>LiteLLM: Status: SUCCEEDED + image URL + + Note over LiteLLM: Transform to OpenAI format + LiteLLM-->>Client: Image Response (OpenAI format) +``` + +### What LiteLLM Does For You + +When you call `litellm.image_generation()` or `/v1/images/generations`: + +1. **Request Transformation**: Converts OpenAI image generation format → RunwayML format +2. **Submits Task**: Sends transformed request to RunwayML API +3. **Receives Task ID**: Captures the task ID from the initial response +4. **Automatic Polling**: + - Polls the task status endpoint every 2 seconds + - Continues until status is `SUCCEEDED` or `FAILED` + - Default timeout: 10 minutes (configurable via `RUNWAYML_POLLING_TIMEOUT`) +5. **Response Transformation**: Converts RunwayML format → OpenAI format +6. **Returns Result**: Sends unified OpenAI format response to client + +**Polling Configuration:** +- Default timeout: 600 seconds (10 minutes) +- Configurable via `RUNWAYML_POLLING_TIMEOUT` environment variable +- Uses sync (`time.sleep()`) or async (`await asyncio.sleep()`) based on call type + +:::info +**Typical processing time**: 10-30 seconds depending on image size and complexity +::: diff --git a/docs/my-website/docs/providers/runwayml/text-to-speech.md b/docs/my-website/docs/providers/runwayml/text-to-speech.md new file mode 100644 index 000000000000..020269863c6f --- /dev/null +++ b/docs/my-website/docs/providers/runwayml/text-to-speech.md @@ -0,0 +1,244 @@ +# RunwayML - Text-to-Speech + +## Overview + +| Property | Details | +|-------|-------| +| Description | RunwayML provides high-quality AI-powered text-to-speech with natural-sounding voices | +| Provider Route on LiteLLM | `runwayml/` | +| Supported Operations | [`/audio/speech`](#quick-start) | +| Link to Provider Doc | [RunwayML API ↗](https://docs.dev.runwayml.com/) | + +LiteLLM supports RunwayML's text-to-speech API with automatic task polling, allowing you to generate natural-sounding audio from text. + +## Quick Start + +```python showLineNumbers title="Basic Text-to-Speech" +from litellm import speech +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +response = speech( + model="runwayml/eleven_multilingual_v2", + input="Step right up, ladies and gentlemen! 
Have you ever wished for a toaster that's not just a toaster but a marvel of modern ingenuity?", + voice="alloy" +) + +# Save the audio +with open("output.mp3", "wb") as f: + f.write(response.content) +``` + +## Authentication + +Set your RunwayML API key: + +```python showLineNumbers title="Set API Key" +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" +``` + +## Supported Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Model to use (e.g., `runwayml/eleven_multilingual_v2`) | +| `input` | string | Yes | Text to convert to speech | +| `voice` | string or dict | Yes | Voice to use (OpenAI name, RunwayML preset, or voice config) | + +## Voice Options + +### Using OpenAI Voice Names + +OpenAI voice names are automatically mapped to appropriate RunwayML voices: + +```python showLineNumbers title="OpenAI Voice Names" +from litellm import speech + +# These OpenAI voice names work automatically +response = speech( + model="runwayml/eleven_multilingual_v2", + input="Hello, world!", + voice="alloy" # Maya - neutral, balanced female voice +) +``` + +**Voice Mappings:** +- `alloy` → Maya (neutral, balanced female voice) +- `echo` → James (male voice) +- `fable` → Bernard (warm, storytelling voice) +- `onyx` → Vincent (deep male voice) +- `nova` → Serene (warm, expressive female voice) +- `shimmer` → Ella (clear, friendly female voice) + +### Using RunwayML Preset Voices + +You can directly specify any RunwayML preset voice by passing the preset name as a string: + +```python showLineNumbers title="RunwayML Preset Names" +from litellm import speech + +# Pass the RunwayML voice name as a string +response = speech( + model="runwayml/eleven_multilingual_v2", + input="Hello, world!", + voice="Maya" # LiteLLM automatically formats this for RunwayML +) + +# Try different RunwayML voices +response = speech( + model="runwayml/eleven_multilingual_v2", + input="Step right up, ladies and gentlemen!", + voice="Bernard" # Great for storytelling +) +``` + +**Available RunwayML Voices:** + +Maya, Arjun, Serene, Bernard, Billy, Mark, Clint, Mabel, Chad, Leslie, Eleanor, Elias, Elliot, Grungle, Brodie, Sandra, Kirk, Kylie, Lara, Lisa, Malachi, Marlene, Martin, Miriam, Monster, Paula, Pip, Rusty, Ragnar, Xylar, Maggie, Jack, Katie, Noah, James, Rina, Ella, Mariah, Frank, Claudia, Niki, Vincent, Kendrick, Myrna, Tom, Wanda, Benjamin, Kiana, Rachel + +:::tip +Simply pass the voice name as a string - LiteLLM automatically handles the internal RunwayML API format conversion. 
+::: + +## Async Usage + +```python showLineNumbers title="Async Text-to-Speech" +from litellm import aspeech +import os +import asyncio + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +async def generate_speech(): + response = await aspeech( + model="runwayml/eleven_multilingual_v2", + input="This is an asynchronous text-to-speech request.", + voice="nova" + ) + + with open("output.mp3", "wb") as f: + f.write(response.content) + + print("Audio generated successfully!") + +asyncio.run(generate_speech()) +``` + +## LiteLLM Proxy Usage + +Add RunwayML to your proxy configuration: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: runway-tts + litellm_params: + model: runwayml/eleven_multilingual_v2 + api_key: os.environ/RUNWAYML_API_KEY +``` + +Start the proxy: + +```bash +litellm --config /path/to/config.yaml +``` + +Generate speech through the proxy: + +```bash showLineNumbers title="Proxy Request" +curl --location 'http://localhost:4000/v1/audio/speech' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + "model": "runwayml/eleven_multilingual_v2", + "input": "Hello from the LiteLLM proxy!", + "voice": "alloy" +}' +``` + +With RunwayML-specific voice: + +```bash showLineNumbers title="Proxy Request with RunwayML Voice" +curl --location 'http://localhost:4000/v1/audio/speech' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + "model": "runwayml/eleven_multilingual_v2", + "input": "Hello with a custom RunwayML voice!", + "voice": "Bernard" +}' +``` + +## Supported Models + +| Model | Description | +|-------|-------------| +| `runwayml/eleven_multilingual_v2` | High-quality multilingual text-to-speech | + +## Cost Tracking + +LiteLLM automatically tracks RunwayML text-to-speech costs: + +```python showLineNumbers title="Cost Tracking" +from litellm import speech, completion_cost + +response = speech( + model="runwayml/eleven_multilingual_v2", + input="Hello, world!", + voice="alloy" +) + +cost = completion_cost(completion_response=response) +print(f"Text-to-speech cost: ${cost}") +``` + +## Supported Features + +| Feature | Supported | +|---------|-----------| +| Text-to-Speech | ✅ | +| Cost Tracking | ✅ | +| Logging | ✅ | +| Fallbacks | ✅ | +| Load Balancing | ✅ | +| 50+ Voice Presets | ✅ | + +## How It Works + +RunwayML uses an asynchronous task-based API pattern. LiteLLM handles the polling and response transformation automatically. + +### Complete Flow Diagram + +```mermaid +sequenceDiagram + participant Client + box rgb(200, 220, 255) LiteLLM AI Gateway + participant LiteLLM + end + participant RunwayML as RunwayML API + participant Storage as Audio Storage + + Client->>LiteLLM: POST /audio/speech (OpenAI format) + Note over LiteLLM: Transform to RunwayML format
Map voice to preset ID + + LiteLLM->>RunwayML: POST v1/text_to_speech + RunwayML-->>LiteLLM: 200 OK + task ID + + Note over LiteLLM: Automatic Polling + loop Every 2 seconds + LiteLLM->>RunwayML: GET v1/tasks/{task_id} + RunwayML-->>LiteLLM: Status: RUNNING + end + + LiteLLM->>RunwayML: GET v1/tasks/{task_id} + RunwayML-->>LiteLLM: Status: SUCCEEDED + audio URL + + LiteLLM->>Storage: GET audio URL + Storage-->>LiteLLM: Audio data (MP3) + + Note over LiteLLM: Return audio content + LiteLLM-->>Client: Audio Response (binary) +``` + diff --git a/docs/my-website/docs/providers/runwayml/videos.md b/docs/my-website/docs/providers/runwayml/videos.md new file mode 100644 index 000000000000..33621509a31f --- /dev/null +++ b/docs/my-website/docs/providers/runwayml/videos.md @@ -0,0 +1,266 @@ +# RunwayML - Video Generation + +LiteLLM supports RunwayML's Gen-4 video generation API, allowing you to generate videos from text prompts and images. + +## Quick Start + +```python showLineNumbers title="Basic Video Generation" +from litellm import video_generation +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +# Generate video from text and image +response = video_generation( + model="runwayml/gen4_turbo", + prompt="A high quality demo video of litellm ai gateway", + input_reference="https://media.licdn.com/dms/image/v2/D4D0BAQFqOrIAJEgtLw/company-logo_200_200/company-logo_200_200/0/1714076049190/berri_ai_logo?e=2147483647&v=beta&t=7tG_KRZZ4MPGc7Iin79PcFcrpvf5Hu6rBM4ptHGU1DY", + seconds=5, + size="1280x720" +) + +print(f"Video ID: {response.id}") +print(f"Status: {response.status}") +``` + +## Authentication + +Set your RunwayML API key: + +```python showLineNumbers title="Set API Key" +import os + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" +``` + +## Supported Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `model` | string | Yes | Model to use (e.g., `runwayml/gen4_turbo`) | +| `prompt` | string | Yes | Text description for the video | +| `input_reference` | string/file | Yes | URL or file path to reference image | +| `seconds` | int | No | Video duration (5 or 10 seconds) | +| `size` | string | No | Video dimensions (`1280x720` or `720x1280`). Can also use `ratio` format (`1280:720`) | + +## Complete Workflow + +```python showLineNumbers title="Complete Video Generation Workflow" +from litellm import video_generation, video_status, video_content +import os +import time + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +# 1. Generate video +response = video_generation( + model="runwayml/gen4_turbo", + prompt="A high quality demo video of litellm ai gateway", + input_reference="https://media.licdn.com/dms/image/v2/D4D0BAQFqOrIAJEgtLw/company-logo_200_200/company-logo_200_200/0/1714076049190/berri_ai_logo?e=2147483647&v=beta&t=7tG_KRZZ4MPGc7Iin79PcFcrpvf5Hu6rBM4ptHGU1DY", + seconds=5, + size="1280x720" +) + +video_id = response.id +print(f"Video generation started: {video_id}") + +# 2. Check status until completed +while True: + status_response = video_status(video_id=video_id) + print(f"Status: {status_response.status}") + + if status_response.status == "completed": + print("Video generation completed!") + break + elif status_response.status == "failed": + print("Video generation failed") + break + + time.sleep(10) # Wait 10 seconds before checking again + +# 3. Download video content +video_bytes = video_content(video_id=video_id) + +# 4. 
Save to file +with open("generated_video.mp4", "wb") as f: + f.write(video_bytes) + +print("Video saved successfully!") +``` + +## Async Usage + +```python showLineNumbers title="Async Video Generation" +from litellm import avideo_generation, avideo_status, avideo_content +import os +import asyncio + +os.environ["RUNWAYML_API_KEY"] = "your-api-key" + +async def generate_video(): + # Generate video + response = await avideo_generation( + model="runwayml/gen4_turbo", + prompt="A serene lake with mountains in the background", + input_reference="https://example.com/lake.jpg", + seconds=5, + size="1280x720" + ) + + video_id = response.id + print(f"Video generation started: {video_id}") + + # Poll for completion + while True: + status_response = await avideo_status(video_id=video_id) + print(f"Status: {status_response.status}") + + if status_response.status == "completed": + break + elif status_response.status == "failed": + print("Video generation failed") + return + + await asyncio.sleep(10) + + # Download video + video_bytes = await avideo_content(video_id=video_id) + + # Save to file + with open("generated_video.mp4", "wb") as f: + f.write(video_bytes) + + print("Video saved successfully!") + +asyncio.run(generate_video()) +``` + +## LiteLLM Proxy Usage + +Add RunwayML to your proxy configuration: + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: gen4-turbo + litellm_params: + model: runwayml/gen4_turbo + api_key: os.environ/RUNWAYML_API_KEY +``` + +Start the proxy: + +```bash +litellm --config /path/to/config.yaml +``` + +Generate videos through the proxy: + +```bash showLineNumbers title="Proxy Request" +curl --location 'http://localhost:4000/v1/videos' \ +--header 'Content-Type: application/json' \ +--header 'x-litellm-api-key: sk-1234' \ +--data '{ + "model": "runwayml/gen4_turbo", + "prompt": "A high quality demo video of litellm ai gateway", + "input_reference": "https://media.licdn.com/dms/image/v2/D4D0BAQFqOrIAJEgtLw/company-logo_200_200/company-logo_200_200/0/1714076049190/berri_ai_logo?e=2147483647&v=beta&t=7tG_KRZZ4MPGc7Iin79PcFcrpvf5Hu6rBM4ptHGU1DY", + "ratio": "1280:720" +}' +``` + +Check video status: + +```bash showLineNumbers title="Check Status" +curl --location 'http://localhost:4000/v1/videos/{video_id}' \ +--header 'x-litellm-api-key: sk-1234' +``` + +Download video content: + +```bash showLineNumbers title="Download Video" +curl --location 'http://localhost:4000/v1/videos/{video_id}/content' \ +--header 'x-litellm-api-key: sk-1234' \ +--output video.mp4 +``` + +## Supported Models + +| Model | Description | Duration | Aspect Ratios | +|-------|-------------|----------|---------------| +| `runwayml/gen4_turbo` | Fast video generation | 5-10s | 1280x720, 720x1280 | + +## Error Handling + +```python showLineNumbers title="Error Handling" +from litellm import video_generation, video_status +import time + +try: + response = video_generation( + model="runwayml/gen4_turbo", + prompt="A scenic mountain view", + input_reference="https://example.com/mountain.jpg", + seconds=5 + ) + + # Poll for completion + max_attempts = 60 # 10 minutes max + attempts = 0 + + while attempts < max_attempts: + status_response = video_status(video_id=response.id) + + if status_response.status == "completed": + print("Video generation completed!") + break + elif status_response.status == "failed": + error = status_response.error or {} + print(f"Video generation failed: {error.get('message', 'Unknown error')}") + break + + time.sleep(10) + attempts += 1 + + if attempts >= 
max_attempts: + print("Video generation timed out") + +except Exception as e: + print(f"Error: {str(e)}") +``` + +## Cost Tracking + +LiteLLM automatically tracks RunwayML video generation costs: + +```python showLineNumbers title="Cost Tracking" +from litellm import video_generation, completion_cost + +response = video_generation( + model="runwayml/gen4_turbo", + prompt="A high quality demo video of litellm ai gateway", + input_reference="https://media.licdn.com/dms/image/v2/D4D0BAQFqOrIAJEgtLw/company-logo_200_200/company-logo_200_200/0/1714076049190/berri_ai_logo?e=2147483647&v=beta&t=7tG_KRZZ4MPGc7Iin79PcFcrpvf5Hu6rBM4ptHGU1DY", + seconds=5, + size="1280x720" +) + +# Calculate cost +cost = completion_cost(completion_response=response) +print(f"Video generation cost: ${cost}") +``` + +## API Reference + +For complete API details, see the [OpenAI Video Generation API specification](https://platform.openai.com/docs/guides/video-generation) which LiteLLM follows. + +## Supported Features + +| Feature | Supported | +|---------|-----------| +| Video Generation | ✅ | +| Image-to-Video | ✅ | +| Status Checking | ✅ | +| Content Download | ✅ | +| Cost Tracking | ✅ | +| Logging | ✅ | +| Fallbacks | ✅ | +| Load Balancing | ✅ | + diff --git a/docs/my-website/docs/providers/sap.md b/docs/my-website/docs/providers/sap.md new file mode 100644 index 000000000000..16f30a2e99c7 --- /dev/null +++ b/docs/my-website/docs/providers/sap.md @@ -0,0 +1,559 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# SAP Generative AI Hub + +LiteLLM supports SAP Generative AI Hub's Orchestration Service. + +| Property | Details | +|-------|--------------------------------------------------------------------------------------------------------------------------------------------------------| +| Description | SAP's Generative AI Hub provides access to OpenAI, Anthropic, Gemini, Mistral, NVIDIA, Amazon, and SAP LLMs through the AI Core orchestration service. | +| Provider Route on LiteLLM | `sap/` | +| Supported Endpoints | `/chat/completions`, `/embeddings` | +| API Reference | [SAP AI Core Documentation](https://help.sap.com/docs/sap-ai-core) | + +## Prerequisites + +Before you begin, ensure you have: + +1. **SAP BTP Account** with access to SAP AI Core +2. **AI Core Service Instance** provisioned in your subaccount +3. **Service Key** created for your AI Core instance (this contains your credentials) +4. **Resource Group** with deployed AI models (check with your SAP administrator) + +:::tip Where to Find Your Credentials +Your credentials come from the **Service Key** you create in SAP BTP Cockpit: + +1. Navigate to your **Subaccount** → **Instances and Subscriptions** +2. Find your **AI Core** instance and click on it +3. Go to **Service Keys** and create one (or use existing) +4. The JSON contains all values needed below + +The service key JSON looks like this: + +```json +{ + "clientid": "sb-abc123...", + "clientsecret": "xyz789...", + "url": "https://myinstance.authentication.eu10.hana.ondemand.com", + "serviceurls": { + "AI_API_URL": "https://api.ai.prod.eu-central-1.aws.ml.hana.ondemand.com" + } +} +``` + +:::info Resource Group +The resource group is typically configured separately in your AI Core deployment, not in the service key itself. You can set it via the `AICORE_RESOURCE_GROUP` environment variable (defaults to "default"). 
+::: + +## Quick Start + +### Step 1: Install LiteLLM + +```bash +pip install litellm +``` + +### Step 2: Set Your Credentials + +Choose **one** of these authentication methods: + + + + +The simplest approach - paste your entire service key as a single environment variable. The service key must be wrapped in a `credentials` object: + +```bash +export AICORE_SERVICE_KEY='{ + "credentials": { + "clientid": "your-client-id", + "clientsecret": "your-client-secret", + "url": "https://.authentication.sap.hana.ondemand.com", + "serviceurls": { + "AI_API_URL": "https://api.ai..aws.ml.hana.ondemand.com" + } + } +}' +export AICORE_RESOURCE_GROUP="default" +``` + + + + +Alternatively, instead of using the service key above, you could set each credential separately: + +```bash +export AICORE_AUTH_URL="https://.authentication.sap.hana.ondemand.com/oauth/token" +export AICORE_CLIENT_ID="your-client-id" +export AICORE_CLIENT_SECRET="your-client-secret" +export AICORE_RESOURCE_GROUP="default" +export AICORE_BASE_URL="https://api.ai..aws.ml.hana.ondemand.com/v2" +``` + + + + +### Step 3: Make Your First Request + +```python title="test_sap.py" +from litellm import completion + +response = completion( + model="sap/gpt-4o", + messages=[{"role": "user", "content": "Hello from LiteLLM!"}] +) +print(response.choices[0].message.content) +``` + +Run it: + +```bash +python test_sap.py +``` + +**Expected output:** + +```text +Hello! How can I assist you today? +``` + +### Step 4: Verify Your Setup (Optional) + +Test that everything is working with this diagnostic script: + +```python title="verify_sap_setup.py" +import os +import litellm + +# Enable debug logging to see what's happening +import os +os.environ["LITELLM_LOG"] = "DEBUG" + +# Either use AICORE_SERVICE_KEY (contains all credentials including resourcegroup) +# OR use individual variables (all required together) +individual_vars = ["AICORE_AUTH_URL", "AICORE_CLIENT_ID", "AICORE_CLIENT_SECRET", "AICORE_BASE_URL", "AICORE_RESOURCE_GROUP"] + +print("=== SAP Gen AI Hub Setup Verification ===\n") + +# Check for service key method +if os.environ.get("AICORE_SERVICE_KEY"): + print("✓ Using AICORE_SERVICE_KEY authentication (includes resource group)") +else: + # Check individual variables + missing = [v for v in individual_vars if not os.environ.get(v)] + if missing: + print(f"✗ Missing environment variables: {missing}") + else: + print("✓ Using individual variable authentication") + print(f"✓ Resource group: {os.environ.get('AICORE_RESOURCE_GROUP')}") + +# Test API connection +print("\n=== Testing API Connection ===\n") +try: + response = litellm.completion( + model="sap/gpt-4o", + messages=[{"role": "user", "content": "Say 'Connection successful!' and nothing else."}], + max_tokens=20 + ) + print(f"✓ API Response: {response.choices[0].message.content}") + print("\n🎉 Setup complete! You're ready to use SAP Gen AI Hub with LiteLLM.") +except Exception as e: + print(f"✗ API Error: {e}") + print("\nTroubleshooting tips:") + print(" 1. Verify your service key credentials are correct") + print(" 2. Check that 'gpt-4o' is deployed in your resource group") + print(" 3. Ensure your SAP AI Core instance is running") +``` + +Run the verification: + +```bash +python verify_sap_setup.py +``` + +**Expected output on success:** + +```text +=== SAP Gen AI Hub Setup Verification === + +✓ Using AICORE_SERVICE_KEY authentication +✓ Resource group: default + +=== Testing API Connection === + +✓ API Response: Connection successful! + +🎉 Setup complete! 
You're ready to use SAP Gen AI Hub with LiteLLM. +``` + +## Authentication + +SAP Generative AI Hub uses OAuth2 service keys for authentication. See [Quick Start](#quick-start) for setup instructions. + +### Environment Variables Reference + +| Variable | Required | Description | +|----------|----------|-------------| +| `AICORE_SERVICE_KEY` | Yes* | Complete service key JSON (recommended method) | +| `AICORE_RESOURCE_GROUP` | Yes | Your AI Core resource group name | +| `AICORE_AUTH_URL` | Yes* | OAuth token URL (alternative to service key) | +| `AICORE_CLIENT_ID` | Yes* | OAuth client ID (alternative to service key) | +| `AICORE_CLIENT_SECRET` | Yes* | OAuth client secret (alternative to service key) | +| `AICORE_BASE_URL` | Yes* | AI Core API base URL (alternative to service key) | + +*Choose either `AICORE_SERVICE_KEY` OR the individual variables (`AICORE_AUTH_URL`, `AICORE_CLIENT_ID`, `AICORE_CLIENT_SECRET`, `AICORE_BASE_URL`). + +## Model Naming Conventions + +Understanding model naming is crucial for using SAP Gen AI Hub correctly. The naming pattern differs depending on whether you're using the SDK directly or through the proxy. + +### Direct SDK Usage + +When calling LiteLLM's SDK directly, you **must** include the `sap/` prefix in the model name: + +```python +# Correct - includes sap/ prefix +model="sap/gpt-4o" +model="sap/anthropic--claude-4.5-sonnet" +model="sap/gemini-2.5-pro" + +# Incorrect - missing prefix +model="gpt-4o" # ❌ Won't work +``` + +### Proxy Usage + +When using the LiteLLM Proxy, you use the **friendly `model_name`** defined in your configuration. The proxy automatically handles the `sap/` prefix routing. + +```yaml +# In config.yaml, define the mapping +model_list: + - model_name: gpt-4o # ← Use this name in client requests + litellm_params: + model: sap/gpt-4o # ← Proxy handles the sap/ prefix +``` + +```python +# Client request - no sap/ prefix needed +client.chat.completions.create( + model="gpt-4o", # ✓ Correct for proxy usage + messages=[...] +) +``` + +### Anthropic Models Special Syntax + +Anthropic models use a double-dash (`--`) prefix convention: + +| Provider | Model Example | LiteLLM Format | +|----------|---------------|----------------| +| OpenAI | GPT-4o | `sap/gpt-4o` | +| Anthropic | Claude 4.5 Sonnet | `sap/anthropic--claude-4.5-sonnet` | +| Google | Gemini 2.5 Pro | `sap/gemini-2.5-pro` | +| Mistral | Mistral Large | `sap/mistral-large` | + +### Quick Reference Table + +| Usage Type | Model Format | Example | +|------------|--------------|---------| +| Direct SDK | `sap/` | `sap/gpt-4o` | +| Direct SDK (Anthropic) | `sap/anthropic--` | `sap/anthropic--claude-4.5-sonnet` | +| Proxy Client | `` | `gpt-4o` or `claude-sonnet` | + +## Using the Python SDK + +The LiteLLM Python SDK automatically detects your authentication method. Simply set your environment variables and make requests. + +```python showLineNumbers title="Basic Completion" +from litellm import completion + +# Assumes AICORE_AUTH_URL, AICORE_CLIENT_ID, etc. are set +response = completion( + model="sap/anthropic--claude-4.5-sonnet", + messages=[{"role": "user", "content": "Explain quantum computing"}] +) +print(response.choices[0].message.content) +``` + +Both authentication methods (individual variables or service key JSON) work automatically - no code changes required. + +## Using the Proxy Server + +The LiteLLM Proxy provides a unified OpenAI-compatible API for your SAP models. 
+ +### Configuration + +Create a `config.yaml` file in your project directory with your model mappings and credentials: + +```yaml showLineNumbers title="config.yaml" +model_list: + # OpenAI models + - model_name: gpt-5 + litellm_params: + model: sap/gpt-5 + + # Anthropic models (note the double-dash) + - model_name: claude-sonnet + litellm_params: + model: sap/anthropic--claude-4.5-sonnet + + - model_name: claude-opus + litellm_params: + model: sap/anthropic--claude-4.5-opus + + # Embeddings + - model_name: text-embedding-3-small + litellm_params: + model: sap/text-embedding-3-small + +litellm_settings: + drop_params: true + set_verbose: false + request_timeout: 600 + num_retries: 2 + forward_client_headers_to_llm_api: ["anthropic-version"] + +general_settings: + master_key: "sk-1234" # Enter here your desired master key starting with 'sk-'. + + # UI Admin is not required but helpful including the management of keys for your team(s). If you are using a database, these parameters are required: + database_url: "Enter you database URL." + UI_USERNAME: "Your desired UI admin account name" + UI_PASSWORD: "Your desired and strong pwd" + +# Authentication +environment_variables: + AICORE_SERVICE_KEY: '{"credentials": {"clientid": "...", "clientsecret": "...", "url": "...", "serviceurls": {"AI_API_URL": "..."}}}' + AICORE_RESOURCE_GROUP: "default" +``` + +### Starting the Proxy + +```bash showLineNumbers title="Start Proxy" +litellm --config config.yaml +``` + +The proxy will start on `http://localhost:4000` by default. + +### Making Requests + + + + +```bash showLineNumbers title="Test Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gpt-4o", + "messages": [{"role": "user", "content": "Hello"}] + }' +``` + + + + +```python showLineNumbers title="OpenAI SDK" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="sk-1234" +) + +response = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": "Hello"}] +) +print(response.choices[0].message.content) +``` + + + + +```python showLineNumbers title="LiteLLM SDK" +import os +import litellm + +os.environ["LITELLM_PROXY_API_KEY"] = "sk-1234" +litellm.use_litellm_proxy = True + +response = litellm.completion( + model="claude-sonnet", + messages=[{"content": "Hello, how are you?", "role": "user"}], + api_base="http://localhost:4000" +) + +print(response) +``` + + + + +## Features + +### Streaming Responses + +Stream responses in real-time for better user experience: + +```python showLineNumbers title="Streaming Chat Completion" +from litellm import completion + +response = completion( + model="sap/gpt-4o", + messages=[{"role": "user", "content": "Count from 1 to 10"}], + stream=True +) + +for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +``` + +### Structured Output + +#### JSON Schema (Recommended) + +Use JSON Schema for structured output with strict validation: + +```python showLineNumbers title="JSON Schema Response" +from litellm import completion + +response = completion( + model="sap/gpt-4o", + messages=[{ + "role": "user", + "content": "Generate info about Tokyo" + }], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "city_info", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "population": {"type": "number"}, + "country": {"type": "string"} 
+ }, + "required": ["name", "population", "country"], + "additionalProperties": False + }, + "strict": True + } + } +) + +print(response.choices[0].message.content) +# Output: {"name":"Tokyo","population":37000000,"country":"Japan"} +``` + +#### JSON Object Format + +For flexible JSON output without schema validation: + +```python showLineNumbers title="JSON Object Response" +from litellm import completion + +response = completion( + model="sap/gpt-4o", + messages=[{ + "role": "user", + "content": "Generate a person object in JSON format with name and age" + }], + response_format={"type": "json_object"} +) + +print(response.choices[0].message.content) +``` + +:::note SAP Platform Requirement +When using `json_object` type, SAP's orchestration service requires the word "json" to appear in your prompt. This ensures explicit intent for JSON formatting. For schema-validated output without this requirement, use `json_schema` instead (recommended). +::: + +### Multi-turn Conversations + +Maintain conversation context across multiple turns: + +```python showLineNumbers title="Multi-turn Conversation" +from litellm import completion + +response = completion( + model="sap/gpt-4o", + messages=[ + {"role": "user", "content": "My name is Alice"}, + {"role": "assistant", "content": "Hello Alice! Nice to meet you."}, + {"role": "user", "content": "What is my name?"} + ] +) + +print(response.choices[0].message.content) +# Output: Your name is Alice. +``` + +### Embeddings + +Generate vector embeddings for semantic search and retrieval: + +```python showLineNumbers title="Create Embeddings" +from litellm import embedding + +response = embedding( + model="sap/text-embedding-3-small", + input=["Hello world", "Machine learning is fascinating"] +) + +print(response.data[0]["embedding"]) # Vector representation +``` + +## Reference + +### Supported Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | Model identifier (with `sap/` prefix for SDK) | +| `messages` | array | Conversation messages | +| `temperature` | float | Controls randomness (0-2) | +| `max_tokens` | integer | Maximum tokens in response | +| `top_p` | float | Nucleus sampling threshold | +| `stream` | boolean | Enable streaming responses | +| `response_format` | object | Output format (`json_object`, `json_schema`) | +| `tools` | array | Function calling tool definitions | +| `tool_choice` | string/object | Tool selection behavior | + +### Supported Models + +For the complete and up-to-date list of available models provided by SAP Gen AI Hub, please refer to the [SAP AI Core Generative AI Hub documentation](https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/models-and-scenarios-in-generative-ai-hub). + +:::info Model Availability +Model availability varies by SAP deployment region and your subscription. Contact your SAP administrator to confirm which models are available in your environment. +::: + +### Troubleshooting + +**Authentication Errors** + +If you receive authentication errors: + +1. Verify all required environment variables are set correctly +2. Check that your service key hasn't expired +3. Confirm your resource group has access to the desired models +4. Ensure the `AICORE_AUTH_URL` and `AICORE_BASE_URL` match your SAP region + +**Model Not Found** + +If a model returns "not found": + +1. Verify the model is available in your SAP deployment +2. Check you're using the correct model name format (`sap/` prefix for SDK) +3. 
Confirm your resource group has access to that specific model +4. For Anthropic models, ensure you're using the `anthropic--` double-dash prefix + +**Rate Limiting** + +SAP Gen AI Hub enforces rate limits based on your subscription. If you hit limits: + +1. Implement exponential backoff retry logic +2. Consider using the proxy's built-in rate limiting features +3. Contact your SAP administrator to review quota allocations diff --git a/docs/my-website/docs/providers/sarvam.md b/docs/my-website/docs/providers/sarvam.md new file mode 100644 index 000000000000..6a2924567810 --- /dev/null +++ b/docs/my-website/docs/providers/sarvam.md @@ -0,0 +1,92 @@ +# Sarvam.ai + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +LiteLLM supports all the text models from [Sarvam ai](https://docs.sarvam.ai/api-reference-docs/chat/chat-completions) + +## Usage + +```python +import os +from litellm import completion + +# Set your Sarvam API key +os.environ["SARVAM_API_KEY"] = "" + +messages = [{"role": "user", "content": "Hello"}] + +response = completion( + model="sarvam/sarvam-m", + messages=messages, +) +print(response) +``` + +## Usage with LiteLLM Proxy Server + +Here's how to call a Sarvam.ai model with the LiteLLM Proxy Server + +1. **Modify the `config.yaml`:** + + ```yaml + model_list: + - model_name: my-model + litellm_params: + model: sarvam/ # add sarvam/ prefix to route as Sarvam provider + api_key: api-key # api key to send your model + ``` + +2. **Start the proxy:** + + ```bash + $ litellm --config /path/to/config.yaml + ``` + +3. **Send a request to LiteLLM Proxy Server:** + + + + + + ```python + import openai + + client = openai.OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000" # litellm-proxy-base url + ) + + response = client.chat.completions.create( + model="my-model", + messages=[ + { + "role": "user", + "content": "what llm are you" + } + ], + ) + + print(response) + ``` + + + + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ] + }' + ``` + + + diff --git a/docs/my-website/docs/providers/snowflake.md b/docs/my-website/docs/providers/snowflake.md index 40deef878056..483bf939fe64 100644 --- a/docs/my-website/docs/providers/snowflake.md +++ b/docs/my-website/docs/providers/snowflake.md @@ -3,20 +3,15 @@ import TabItem from '@theme/TabItem'; # Snowflake -| Property | Details | -|-------|-------| -| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE function via HTTP POST requests| -| Provider Route on LiteLLM | `snowflake/` | -| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) | -| Base URL | `https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete` | -| Supported OpenAI Endpoints | `/chat/completions`, `/completions` | +| Property | Details | +|----------------------------|-----------------------------------------------------------------------------------------------------------| +| Description | The Snowflake Cortex LLM REST API lets you access the COMPLETE and EMBED functions via HTTP POST requests | +| Provider Route on LiteLLM | `snowflake/` | +| Link to Provider Doc | [Snowflake ↗](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-llm-rest-api) | +| 
Base URLs | `https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:complete`,`https://{account-id}.snowflakecomputing.com/api/v2/cortex/inference:embed`| +| Supported OpenAI Endpoints | `/chat/completions`, `/completions`, `/embeddings` | - -Currently, Snowflake's REST API does not have an endpoint for `snowflake-arctic-embed` embedding models. If you want to use these embedding models with Litellm, you can call them through our Hugging Face provider. - -Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake/arctic-embed-661fd57d50fab5fc314e4c18) on Hugging Face. - ## Supported OpenAI Parameters ``` "temperature", @@ -29,6 +24,9 @@ Find the Arctic Embed models [here](https://huggingface.co/collections/Snowflake Snowflake does have API keys. Instead, you access the Snowflake API with your JWT token and account identifier. +It is also possible to use [programmatic access tokens](https://docs.snowflake.com/en/user-guide/programmatic-access-tokens) (PAT). It can be defined by using 'pat/' prefix + + ```python import os os.environ["SNOWFLAKE_JWT"] = "YOUR JWT" @@ -37,17 +35,38 @@ os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER" ## Usage ```python -from litellm import completion +from litellm import completion, embedding ## set ENV variables -os.environ["SNOWFLAKE_JWT"] = "YOUR JWT" +os.environ["SNOWFLAKE_JWT"] = "JWT_TOKEN" os.environ["SNOWFLAKE_ACCOUNT_ID"] = "YOUR ACCOUNT IDENTIFIER" -# Snowflake call +# Snowflake completion call response = completion( model="snowflake/mistral-7b", messages = [{ "content": "Hello, how are you?","role": "user"}] ) + +# Snowflake embedding call +response = embedding( + model="snowflake/mistral-7b", + input = ["My text"] +) + +# Pass`api_key` and `account_id` as parameters +response = completion( + model="snowflake/mistral-7b", + messages = [{ "content": "Hello, how are you?","role": "user"}], + account_id="AAAA-BBBB", + api_key="JWT_TOKEN" +) + +# using PAT +response = completion( + model="snowflake/mistral-7b", + messages = [{ "content": "Hello, how are you?","role": "user"}], + api_key="pat/PAT_TOKEN" +) ``` ## Usage with LiteLLM Proxy diff --git a/docs/my-website/docs/providers/stability.md b/docs/my-website/docs/providers/stability.md new file mode 100644 index 000000000000..c4bc5376d1f7 --- /dev/null +++ b/docs/my-website/docs/providers/stability.md @@ -0,0 +1,496 @@ +# Stability AI +https://stability.ai/ + +## Overview + +| Property | Details | +|-------|-------| +| Description | Stability AI creates open AI models for image, video, audio, and 3D generation. Known for Stable Diffusion. | +| Provider Route on LiteLLM | `stability/` | +| Link to Provider Doc | [Stability AI API ↗](https://platform.stability.ai/docs/api-reference) | +| Supported Operations | [`/images/generations`](#image-generation), [`/images/edits`](#image-editing) | + +LiteLLM supports Stability AI Image Generation calls via the Stability AI REST API (not via Bedrock). + +## API Key + +```python +# env variable +os.environ['STABILITY_API_KEY'] = "your-api-key" +``` + +Get your API key from the [Stability AI Platform](https://platform.stability.ai/). 
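+
+You can also pass the key per request instead of setting the environment variable - a minimal sketch (`api_key` is LiteLLM's standard per-call override; the model name is illustrative):
+
+```python showLineNumbers
+from litellm import image_generation
+
+response = image_generation(
+    model="stability/sd3.5-large",
+    prompt="A beautiful sunset over a calm ocean",
+    api_key="your-api-key",  # overrides STABILITY_API_KEY for this call
+)
+print(response)
+```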
+ +## Image Generation + +### Usage - LiteLLM Python SDK + +```python showLineNumbers +from litellm import image_generation +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +# Stability AI image generation call +response = image_generation( + model="stability/sd3.5-large", + prompt="A beautiful sunset over a calm ocean", +) +print(response) +``` + +### Usage - LiteLLM Proxy Server + +#### 1. Setup config.yaml + +```yaml showLineNumbers +model_list: + - model_name: sd3 + litellm_params: + model: stability/sd3.5-large + api_key: os.environ/STABILITY_API_KEY + model_info: + mode: image_generation + +general_settings: + master_key: sk-1234 +``` + +#### 2. Start the proxy + +```bash showLineNumbers +litellm --config config.yaml + +# RUNNING on http://0.0.0.0:4000 +``` + +#### 3. Test it + +```bash showLineNumbers +curl --location 'http://0.0.0.0:4000/v1/images/generations' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer sk-1234' \ +--data '{ + "model": "sd3", + "prompt": "A beautiful sunset over a calm ocean" +}' +``` + +### Advanced Usage - With Additional Parameters + +```python showLineNumbers +from litellm import image_generation +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +response = image_generation( + model="stability/sd3.5-large", + prompt="A beautiful sunset over a calm ocean", + size="1792x1024", # Maps to aspect_ratio 16:9 + negative_prompt="blurry, low quality", # Stability-specific + seed=12345, # For reproducibility +) +print(response) +``` + +### Supported Parameters + +Stability AI supports the following OpenAI-compatible parameters: + +| Parameter | Type | Description | Example | +|-----------|------|-------------|---------| +| `size` | string | Image dimensions (mapped to aspect_ratio) | `"1024x1024"` | +| `n` | integer | Number of images (note: Stability returns 1 per request) | `1` | +| `response_format` | string | Format of response (`b64_json` only for Stability) | `"b64_json"` | + +### Size to Aspect Ratio Mapping + +The `size` parameter is automatically mapped to Stability's `aspect_ratio`: + +| OpenAI Size | Stability Aspect Ratio | +|-------------|----------------------| +| `1024x1024` | `1:1` | +| `1792x1024` | `16:9` | +| `1024x1792` | `9:16` | +| `512x512` | `1:1` | +| `256x256` | `1:1` | + +### Using Stability-Specific Parameters + +You can pass parameters that are specific to Stability AI directly in your request: + +```python showLineNumbers +from litellm import image_generation +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +response = image_generation( + model="stability/sd3.5-large", + prompt="A beautiful sunset over a calm ocean", + # Stability-specific parameters + negative_prompt="blurry, watermark, text", + aspect_ratio="16:9", # Use directly instead of size + seed=42, + output_format="png", # png, jpeg, or webp +) +print(response) +``` + +### Supported Image Generation Models + +| Model Name | Function Call | Description | +|------------|---------------|-------------| +| sd3 | `image_generation(model="stability/sd3", ...)` | Stable Diffusion 3 | +| sd3-large | `image_generation(model="stability/sd3-large", ...)` | SD3 Large | +| sd3-large-turbo | `image_generation(model="stability/sd3-large-turbo", ...)` | SD3 Large Turbo (faster) | +| sd3-medium | `image_generation(model="stability/sd3-medium", ...)` | SD3 Medium | +| sd3.5-large | `image_generation(model="stability/sd3.5-large", ...)` | SD 3.5 Large (recommended) | +| sd3.5-large-turbo | 
`image_generation(model="stability/sd3.5-large-turbo", ...)` | SD 3.5 Large Turbo | +| sd3.5-medium | `image_generation(model="stability/sd3.5-medium", ...)` | SD 3.5 Medium | +| stable-image-ultra | `image_generation(model="stability/stable-image-ultra", ...)` | Stable Image Ultra | +| stable-image-core | `image_generation(model="stability/stable-image-core", ...)` | Stable Image Core | + +For more details on available models and features, see: https://platform.stability.ai/docs/api-reference + +## Response Format + +Stability AI returns images in base64 format. The response is OpenAI-compatible: + +```python +{ + "created": 1234567890, + "data": [ + { + "b64_json": "iVBORw0KGgo..." # Base64 encoded image + } + ] +} +``` + +## Image Editing + +Stability AI supports various image editing operations including inpainting, upscaling, outpainting, background removal, and more. + +:::info Optional Parameters +**Important:** Different Stability models have different parameter requirements: +- Some models don't require a `prompt` (e.g., upscaling, background removal) +- The `style-transfer` model uses `init_image` and `style_image` instead of `image` +- The `outpaint` model requires numeric parameters (`left`, `right`, `up`, `down`) +LiteLLM automatically handles these differences for you. +::: + +### Usage - LiteLLM Python SDK + +#### Inpainting (Edit with Mask) + +```python showLineNumbers +from litellm import image_edit +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +# Inpainting - edit specific areas using a mask +response = image_edit( + model="stability/stable-image-inpaint-v1:0", + image=open("original_image.png", "rb"), + mask=open("mask_image.png", "rb"), + prompt="Add a beautiful sunset in the masked area", + size="1024x1024", +) +print(response) +``` + +#### Image Upscaling + +```python showLineNumbers +from litellm import image_edit +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +# Conservative upscaling - preserves details +response = image_edit( + model="stability/stable-conservative-upscale-v1:0", + image=open("low_res_image.png", "rb"), + prompt="Upscale this image while preserving details", +) + +# Creative upscaling - adds creative details +response = image_edit( + model="stability/stable-creative-upscale-v1:0", + image=open("low_res_image.png", "rb"), + prompt="Upscale and enhance with creative details", + creativity=0.3, # 0-0.35, higher = more creative +) + +# Fast upscaling - quick upscaling (no prompt needed) +response = image_edit( + model="stability/stable-fast-upscale-v1:0", + image=open("low_res_image.png", "rb"), + # No prompt required for fast upscale +) +print(response) +``` + +#### Image Outpainting + +```python showLineNumbers +from litellm import image_edit +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +# Extend image beyond its borders +response = image_edit( + model="stability/stable-outpaint-v1:0", + image=open("original_image.png", "rb"), + prompt="Extend this landscape with mountains", + left=100, # Pixels to extend on the left + right=100, # Pixels to extend on the right + up=50, # Pixels to extend on top + down=50, # Pixels to extend on bottom +) +print(response) +``` + +#### Background Removal + +```python showLineNumbers +from litellm import image_edit +import os + +os.environ['STABILITY_API_KEY'] = "your-api-key" + +# Remove background from image +response = image_edit( + model="stability/stable-image-remove-background-v1:0", + image=open("portrait.png", "rb"), + # No prompt required for fast upscale 
+)
+print(response)
+```
+
+#### Search and Replace
+
+```python showLineNumbers
+from litellm import image_edit
+import os
+
+os.environ['STABILITY_API_KEY'] = "your-api-key"
+
+# Search and replace objects in image
+response = image_edit(
+    model="stability/stable-image-search-replace-v1:0",
+    image=open("scene.png", "rb"),
+    prompt="A red sports car",
+    search_prompt="blue sedan",  # What to replace
+)
+
+# Search and recolor
+response = image_edit(
+    model="stability/stable-image-search-recolor-v1:0",
+    image=open("scene.png", "rb"),
+    prompt="Make it golden yellow",
+    select_prompt="the car",  # What to recolor
+)
+print(response)
+```
+
+#### Image Control (Sketch/Structure)
+
+```python showLineNumbers
+from litellm import image_edit
+import os
+
+os.environ['STABILITY_API_KEY'] = "your-api-key"
+
+# Control with sketch
+response = image_edit(
+    model="stability/stable-image-control-sketch-v1:0",
+    image=open("sketch.png", "rb"),
+    prompt="Turn this sketch into a realistic photo",
+    control_strength=0.7,  # 0-1, higher = more control
+)
+
+# Control with structure
+response = image_edit(
+    model="stability/stable-image-control-structure-v1:0",
+    image=open("structure_reference.png", "rb"),
+    prompt="Generate image following this structure",
+    control_strength=0.7,
+)
+print(response)
+```
+
+#### Erase Objects
+
+```python showLineNumbers
+from litellm import image_edit
+import os
+
+os.environ['STABILITY_API_KEY'] = "your-api-key"
+
+# Erase objects from image
+response = image_edit(
+    model="stability/stable-image-erase-object-v1:0",
+    image=open("scene.png", "rb"),
+    mask=open("object_mask.png", "rb"),  # Mask the object to erase
+    # No prompt needed
+)
+print(response)
+```
+
+#### Style Transfer
+
+```python showLineNumbers
+from litellm import image_edit
+import os
+
+os.environ['STABILITY_API_KEY'] = "your-api-key"
+
+# Transfer style from one image to another
+# Note: Uses init_image (via image param) and style_image
+response = image_edit(
+    model="stability/stable-style-transfer-v1:0",
+    image=open("content_image.png", "rb"),  # Maps to init_image
+    style_image=open("style_reference.png", "rb"),  # Style to apply
+    fidelity=0.5,  # 0-1, balance between content and style
+    # No prompt needed
+)
+
+print(response)
+```
+
+### Supported Image Edit Models
+
+| Model Name | Function Call | Description |
+|------------|---------------|-------------|
+| stable-image-inpaint-v1:0 | `image_edit(model="stability/stable-image-inpaint-v1:0", ...)` | Inpainting with mask |
+| stable-conservative-upscale-v1:0 | `image_edit(model="stability/stable-conservative-upscale-v1:0", ...)` | Conservative upscaling |
+| stable-creative-upscale-v1:0 | `image_edit(model="stability/stable-creative-upscale-v1:0", ...)` | Creative upscaling |
+| stable-fast-upscale-v1:0 | `image_edit(model="stability/stable-fast-upscale-v1:0", ...)` | Fast upscaling |
+| stable-outpaint-v1:0 | `image_edit(model="stability/stable-outpaint-v1:0", ...)` | Extend image borders |
+| stable-image-remove-background-v1:0 | `image_edit(model="stability/stable-image-remove-background-v1:0", ...)` | Remove background |
+| stable-image-search-replace-v1:0 | `image_edit(model="stability/stable-image-search-replace-v1:0", ...)` | Search and replace objects |
+| stable-image-search-recolor-v1:0 | `image_edit(model="stability/stable-image-search-recolor-v1:0", ...)` | Search and recolor |
+| stable-image-control-sketch-v1:0 | `image_edit(model="stability/stable-image-control-sketch-v1:0", ...)` | Control with sketch |
+| stable-image-control-structure-v1:0 | `image_edit(model="stability/stable-image-control-structure-v1:0", ...)` | Control with structure |
+| stable-image-erase-object-v1:0 | `image_edit(model="stability/stable-image-erase-object-v1:0", ...)` | Erase objects |
+| stable-image-style-guide-v1:0 | `image_edit(model="stability/stable-image-style-guide-v1:0", ...)` | Apply style guide |
+| stable-style-transfer-v1:0 | `image_edit(model="stability/stable-style-transfer-v1:0", ...)` | Transfer style |
+
+### Usage - LiteLLM Proxy Server
+
+#### 1. Setup config.yaml
+
+```yaml showLineNumbers
+model_list:
+  - model_name: stability-inpaint
+    litellm_params:
+      model: stability/stable-image-inpaint-v1:0
+      api_key: os.environ/STABILITY_API_KEY
+    model_info:
+      mode: image_edit
+
+  - model_name: stability-upscale
+    litellm_params:
+      model: stability/stable-conservative-upscale-v1:0
+      api_key: os.environ/STABILITY_API_KEY
+    model_info:
+      mode: image_edit
+
+general_settings:
+  master_key: sk-1234
+```
+
+#### 2. Start the proxy
+
+```bash showLineNumbers
+litellm --config config.yaml
+
+# RUNNING on http://0.0.0.0:4000
+```
+
+#### 3. Test it
+
+```bash showLineNumbers
+curl -X POST "http://0.0.0.0:4000/v1/images/edits" \
+  -H "Authorization: Bearer sk-1234" \
+  -F "model=stability-inpaint" \
+  -F "image=@original_image.png" \
+  -F "mask=@mask_image.png" \
+  -F "prompt=Add a beautiful garden in the masked area"
+```
+
+## AWS Bedrock (Stability)
+
+LiteLLM also supports Stability AI models via AWS Bedrock. This is useful if you're already using AWS infrastructure.
+
+### Usage - Bedrock Stability
+
+```python showLineNumbers
+from litellm import image_edit
+import os
+
+# Set AWS credentials
+os.environ["AWS_ACCESS_KEY_ID"] = "your-access-key"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "your-secret-key"
+os.environ["AWS_REGION_NAME"] = "us-east-1"
+
+# Bedrock Stability inpainting
+response = image_edit(
+    model="bedrock/us.stability.stable-image-inpaint-v1:0",
+    image=open("original_image.png", "rb"),
+    mask=open("mask_image.png", "rb"),
+    prompt="Add flowers in the masked area",
+)
+print(response)
+
+# Fast upscale without prompt
+response = image_edit(
+    model="bedrock/stability.stable-fast-upscale-v1:0",
+    image=open("low_res_image.png", "rb"),
+)
+
+# Outpaint with numeric parameters
+response = image_edit(
+    model="bedrock/stability.stable-outpaint-v1:0",
+    image=open("original_image.png", "rb"),
+    left=100,  # Automatically converted to int
+    right=100,
+    up=50,
+    down=50,
+)
+
+print(response)
+```
+
+### Supported Bedrock Stability Models
+
+All Stability AI image edit models are available via Bedrock with the `bedrock/` prefix:
+
+| Direct API Model | Bedrock Model | Description |
+|------------------|---------------|-------------|
+| stability/stable-image-inpaint-v1:0 | bedrock/us.stability.stable-image-inpaint-v1:0 | Inpainting |
+| stability/stable-conservative-upscale-v1:0 | bedrock/stability.stable-conservative-upscale-v1:0 | Conservative upscaling |
+| stability/stable-creative-upscale-v1:0 | bedrock/stability.stable-creative-upscale-v1:0 | Creative upscaling |
+| stability/stable-fast-upscale-v1:0 | bedrock/stability.stable-fast-upscale-v1:0 | Fast upscaling |
+| stability/stable-outpaint-v1:0 | bedrock/stability.stable-outpaint-v1:0 | Outpainting |
+| stability/stable-image-remove-background-v1:0 | bedrock/stability.stable-image-remove-background-v1:0 | Remove background |
+| stability/stable-image-search-replace-v1:0 | bedrock/stability.stable-image-search-replace-v1:0 | Search and replace |
+| stability/stable-image-search-recolor-v1:0 | bedrock/stability.stable-image-search-recolor-v1:0 | Search and recolor | +| stability/stable-image-control-sketch-v1:0 | bedrock/stability.stable-image-control-sketch-v1:0 | Control with sketch | +| stability/stable-image-control-structure-v1:0 | bedrock/stability.stable-image-control-structure-v1:0 | Control with structure | +| stability/stable-image-erase-object-v1:0 | bedrock/stability.stable-image-erase-object-v1:0 | Erase objects | + +**Note:** Bedrock model IDs may use `us.stability.*` or `stability.*` prefix depending on the region and model. + +## Comparing Routes + +LiteLLM supports Stability AI models via two routes: + +| Route | Provider | Use Case | Image Generation | Image Editing | +|-------|----------|----------|------------------|---------------| +| `stability/` | Stability AI Direct API | Direct access, all latest models | ✅ | ✅ | +| `bedrock/stability.*` | AWS Bedrock | AWS integration, enterprise features | ✅ | ✅ | + +Use `stability/` for direct API access. Use `bedrock/stability.*` if you're already using AWS Bedrock. diff --git a/docs/my-website/docs/providers/synthetic.md b/docs/my-website/docs/providers/synthetic.md new file mode 100644 index 000000000000..b3ba3d0a9e76 --- /dev/null +++ b/docs/my-website/docs/providers/synthetic.md @@ -0,0 +1,119 @@ +# Synthetic + +## Overview + +| Property | Details | +|-------|-------| +| Description | Synthetic runs open-source AI models in secure datacenters within the US and EU, with a focus on privacy. They never train on your data and auto-delete API data within 14 days. | +| Provider Route on LiteLLM | `synthetic/` | +| Link to Provider Doc | [Synthetic Website ↗](https://synthetic.new) | +| Base URL | `https://api.synthetic.new/openai/v1` | +| Supported Operations | [`/chat/completions`](#sample-usage) | + +
+ +## What is Synthetic? + +Synthetic is a privacy-focused AI platform that provides access to open-source LLMs with the following guarantees: +- **Privacy-First**: Data never used for training +- **Secure Hosting**: Models run in secure datacenters in US and EU +- **Auto-Deletion**: API data automatically deleted within 14 days +- **Open Source**: Runs open-source AI models + +## Required Variables + +```python showLineNumbers title="Environment Variables" +os.environ["SYNTHETIC_API_KEY"] = "" # your Synthetic API key +``` + +Get your Synthetic API key from [synthetic.new](https://synthetic.new). + +## Usage - LiteLLM Python SDK + +### Non-streaming + +```python showLineNumbers title="Synthetic Non-streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["SYNTHETIC_API_KEY"] = "" # your Synthetic API key + +messages = [{"content": "What is the capital of France?", "role": "user"}] + +# Synthetic call +response = completion( + model="synthetic/model-name", # Replace with actual model name + messages=messages +) + +print(response) +``` + +### Streaming + +```python showLineNumbers title="Synthetic Streaming Completion" +import os +import litellm +from litellm import completion + +os.environ["SYNTHETIC_API_KEY"] = "" # your Synthetic API key + +messages = [{"content": "Write a short poem about AI", "role": "user"}] + +# Synthetic call with streaming +response = completion( + model="synthetic/model-name", # Replace with actual model name + messages=messages, + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Usage - LiteLLM Proxy Server + +### 1. Save key in your environment + +```bash +export SYNTHETIC_API_KEY="" +``` + +### 2. Start the proxy + +```yaml +model_list: + - model_name: synthetic-model + litellm_params: + model: synthetic/model-name # Replace with actual model name + api_key: os.environ/SYNTHETIC_API_KEY +``` + +## Supported OpenAI Parameters + +Synthetic supports all standard OpenAI-compatible parameters: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `messages` | array | **Required**. Array of message objects with 'role' and 'content' | +| `model` | string | **Required**. Model ID | +| `stream` | boolean | Optional. Enable streaming responses | +| `temperature` | float | Optional. Sampling temperature | +| `top_p` | float | Optional. Nucleus sampling parameter | +| `max_tokens` | integer | Optional. Maximum tokens to generate | +| `frequency_penalty` | float | Optional. Penalize frequent tokens | +| `presence_penalty` | float | Optional. Penalize tokens based on presence | +| `stop` | string/array | Optional. 
Stop sequences | + +## Privacy & Security + +Synthetic provides enterprise-grade privacy protections: +- Data auto-deleted within 14 days +- No data used for model training +- Secure hosting in US and EU datacenters +- Compliance-friendly architecture + +## Additional Resources + +- [Synthetic Website](https://synthetic.new) diff --git a/docs/my-website/docs/providers/vercel_ai_gateway.md b/docs/my-website/docs/providers/vercel_ai_gateway.md index 91f0a18ea1c9..3ff007171edd 100644 --- a/docs/my-website/docs/providers/vercel_ai_gateway.md +++ b/docs/my-website/docs/providers/vercel_ai_gateway.md @@ -11,7 +11,7 @@ import TabItem from '@theme/TabItem'; | Provider Route on LiteLLM | `vercel_ai_gateway/` | | Link to Provider Doc | [Vercel AI Gateway Documentation ↗](https://vercel.com/docs/ai-gateway) | | Base URL | `https://ai-gateway.vercel.sh/v1` | -| Supported Operations | `/chat/completions`, `/models` | +| Supported Operations | `/chat/completions`, `/embeddings`, `/models` |

@@ -73,7 +73,7 @@ messages = [{"content": "Hello, how are you?", "role": "user"}] # Vercel AI Gateway call with streaming response = completion( - model="vercel_ai_gateway/openai/gpt-4o", + model="vercel_ai_gateway/openai/gpt-4o", messages=messages, stream=True ) @@ -82,6 +82,33 @@ for chunk in response: print(chunk) ``` +### Embeddings + +```python showLineNumbers title="Vercel AI Gateway Embeddings" +import os +from litellm import embedding + +os.environ["VERCEL_AI_GATEWAY_API_KEY"] = "your-api-key" + +# Vercel AI Gateway embedding call +response = embedding( + model="vercel_ai_gateway/openai/text-embedding-3-small", + input="Hello world" +) + +print(response.data[0]["embedding"][:5]) # Print first 5 dimensions +``` + +You can also specify the `dimensions` parameter: + +```python showLineNumbers title="Vercel AI Gateway Embeddings with Dimensions" +response = embedding( + model="vercel_ai_gateway/openai/text-embedding-3-small", + input=["Hello world", "Goodbye world"], + dimensions=768 +) +``` + ## Usage - LiteLLM Proxy Add the following to your LiteLLM Proxy configuration file: @@ -97,6 +124,11 @@ model_list: litellm_params: model: vercel_ai_gateway/anthropic/claude-4-sonnet api_key: os.environ/VERCEL_AI_GATEWAY_API_KEY + + - model_name: text-embedding-3-small-gateway + litellm_params: + model: vercel_ai_gateway/openai/text-embedding-3-small + api_key: os.environ/VERCEL_AI_GATEWAY_API_KEY ``` Start your LiteLLM Proxy server: diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 874b637e4db3..63e4dceec005 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -14,6 +14,17 @@ import TabItem from '@theme/TabItem'; | Base URL | 1. Regional endpoints
`https://{vertex_location}-aiplatform.googleapis.com/`
2. Global endpoints (limited availability)
`https://aiplatform.googleapis.com/`| | Supported Operations | [`/chat/completions`](#sample-usage), `/completions`, [`/embeddings`](#embedding-models), [`/audio/speech`](#text-to-speech-apis), [`/fine_tuning`](#fine-tuning-apis), [`/batches`](#batch-apis), [`/files`](#batch-apis), [`/images`](#image-generation-models), [`/rerank`](#rerank-api) | +:::tip Vertex AI vs Gemini API +| Model Format | Provider | Auth Required | +|-------------|----------|---------------| +| `vertex_ai/gemini-2.0-flash` | Vertex AI | GCP credentials + project | +| `gemini-2.0-flash` (no prefix) | Vertex AI | GCP credentials + project | +| `gemini/gemini-2.0-flash` | Gemini API | `GEMINI_API_KEY` (simple API key) | + +**If you just want to use an API key** (like OpenAI), use the `gemini/` prefix instead. See [Gemini - Google AI Studio](./gemini.md). + +Models without a prefix default to Vertex AI which requires GCP authentication. +:::
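Here's a minimal sketch contrasting the two routes (the project ID, region, and API key below are placeholders):

```python
import os
from litellm import completion

messages = [{"role": "user", "content": "Hello!"}]

# Vertex AI route - requires GCP credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS) plus project/location
response = completion(
    model="vertex_ai/gemini-2.0-flash",
    messages=messages,
    vertex_project="your-gcp-project-id",  # placeholder
    vertex_location="us-central1",
)

# Gemini API route - only an API key is required
os.environ["GEMINI_API_KEY"] = "your-api-key"  # placeholder
response = completion(
    model="gemini/gemini-2.0-flash",
    messages=messages,
)
```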

@@ -1390,6 +1401,77 @@ model_list: +### **Workload Identity Federation** + +LiteLLM supports [Google Cloud Workload Identity Federation (WIF)](https://cloud.google.com/iam/docs/workload-identity-federation), which allows you to grant on-premises or multi-cloud workloads access to Google Cloud resources without using a service account key. This is the recommended approach for workloads running in other cloud environments (AWS, Azure, etc.) or on-premises. + +To use Workload Identity Federation, pass the path to your WIF credentials configuration file via `vertex_credentials`: + + + + +```python +from litellm import completion + +response = completion( + model="vertex_ai/gemini-1.5-pro", + messages=[{"role": "user", "content": "Hello!"}], + vertex_credentials="/path/to/wif-credentials.json", # 👈 WIF credentials file + vertex_project="your-gcp-project-id", + vertex_location="us-central1" +) +``` + + + + +```yaml +model_list: + - model_name: gemini-model + litellm_params: + model: vertex_ai/gemini-1.5-pro + vertex_project: your-gcp-project-id + vertex_location: us-central1 + vertex_credentials: /path/to/wif-credentials.json # 👈 WIF credentials file +``` + +Alternatively, you can create credentials in **LLM Credentials** in the LiteLLM UI and use those to authenticate your models: + +```yaml +model_list: + - model_name: gemini-model + litellm_params: + model: vertex_ai/gemini-1.5-pro + vertex_project: your-gcp-project-id + vertex_location: us-central1 + litellm_credential_name: my-vertex-wif-credential # 👈 Reference credential stored in UI +``` + + + + +**WIF Credentials File Format** + +Your WIF credentials JSON file typically looks like this (for AWS federation): + +```json +{ + "type": "external_account", + "audience": "//iam.googleapis.com/projects/PROJECT_NUMBER/locations/global/workloadIdentityPools/POOL_ID/providers/PROVIDER_ID", + "subject_token_type": "urn:ietf:params:aws:token-type:aws4_request", + "service_account_impersonation_url": "https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/SERVICE_ACCOUNT_EMAIL:generateAccessToken", + "token_url": "https://sts.googleapis.com/v1/token", + "credential_source": { + "environment_id": "aws1", + "region_url": "http://169.254.169.254/latest/meta-data/placement/availability-zone", + "url": "http://169.254.169.254/latest/meta-data/iam/security-credentials", + "regional_cred_verification_url": "https://sts.{region}.amazonaws.com?Action=GetCallerIdentity&Version=2011-06-15" + } +} +``` + +For more details on setting up Workload Identity Federation, see [Google Cloud WIF documentation](https://cloud.google.com/iam/docs/workload-identity-federation). + ### **Environment Variables** You can set: @@ -1604,6 +1686,56 @@ litellm.vertex_location = "us-central1 # Your Location | gemini-2.5-flash-preview-09-2025 | `completion('gemini-2.5-flash-preview-09-2025', messages)`, `completion('vertex_ai/gemini-2.5-flash-preview-09-2025', messages)` | | gemini-2.5-flash-lite-preview-09-2025 | `completion('gemini-2.5-flash-lite-preview-09-2025', messages)`, `completion('vertex_ai/gemini-2.5-flash-lite-preview-09-2025', messages)` | +## Private Service Connect (PSC) Endpoints + +LiteLLM supports Vertex AI models deployed to Private Service Connect (PSC) endpoints, allowing you to use custom `api_base` URLs for private deployments. 
+ +### Usage + +```python +from litellm import completion + +# Use PSC endpoint with custom api_base +response = completion( + model="vertex_ai/1234567890", # Numeric endpoint ID + messages=[{"role": "user", "content": "Hello!"}], + api_base="http://10.96.32.8", # Your PSC endpoint + vertex_project="my-project-id", + vertex_location="us-central1", + use_psc_endpoint_format=True +) +``` + +**Key Features:** +- Supports both numeric endpoint IDs and custom model names +- Works with both completion and embedding endpoints +- Automatically constructs full PSC URL: `{api_base}/v1/projects/{project}/locations/{location}/endpoints/{model}:{endpoint}` +- Compatible with streaming requests + +### Configuration + +Add PSC endpoints to your `config.yaml`: + +```yaml +model_list: + - model_name: psc-gemini + litellm_params: + model: vertex_ai/1234567890 # Numeric endpoint ID + api_base: "http://10.96.32.8" # Your PSC endpoint + vertex_project: "my-project-id" + vertex_location: "us-central1" + vertex_credentials: "/path/to/service_account.json" + use_psc_endpoint_format: True + - model_name: psc-embedding + litellm_params: + model: vertex_ai/text-embedding-004 + api_base: "http://10.96.32.8" # Your PSC endpoint + vertex_project: "my-project-id" + vertex_location: "us-central1" + vertex_credentials: "/path/to/service_account.json" + use_psc_endpoint_format: True +``` + ## Fine-tuned Models You can call fine-tuned Vertex AI Gemini models through LiteLLM @@ -1741,7 +1873,7 @@ response = litellm.completion( { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": "https://awsmp-logos.s3.amazonaws.com/seller-xw5kijmvmzasy/c233c9ade2ccb5491072ae232c814942.png" } } ] @@ -1836,6 +1968,244 @@ assert isinstance( ``` +## Media Resolution Control (Images & Videos) + +For Gemini 3+ models, LiteLLM supports per-part media resolution control using OpenAI's `detail` parameter. This allows you to specify different resolution levels for individual images and videos in your request, whether using `image_url` or `file` content types. 
+ +**Supported `detail` values:** +- `"low"` - Maps to `media_resolution: "low"` (280 tokens for images, 70 tokens per frame for videos) +- `"medium"` - Maps to `media_resolution: "medium"` +- `"high"` - Maps to `media_resolution: "high"` (1120 tokens for images) +- `"ultra_high"` - Maps to `media_resolution: "ultra_high"` +- `"auto"` or `None` - Model decides optimal resolution (no `media_resolution` set) + +**Usage Examples:** + + + + +```python +from litellm import completion + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://example.com/chart.png", + "detail": "high" # High resolution for detailed chart analysis + } + }, + { + "type": "text", + "text": "Analyze this chart" + }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/icon.png", + "detail": "low" # Low resolution for simple icon + } + } + ] + } +] + +response = completion( + model="vertex_ai/gemini-3-pro-preview", + messages=messages, +) +``` + + + + +```python +from litellm import completion + +messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "Analyze this video" + }, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "detail": "high" # High resolution for detailed video analysis + } + } + ] + } +] + +response = completion( + model="vertex_ai/gemini-3-pro-preview", + messages=messages, +) +``` + + + + +:::info +**Per-Part Resolution:** Each image or video in your request can have its own `detail` setting, allowing mixed-resolution requests (e.g., a high-res chart alongside a low-res icon). This feature works with both `image_url` and `file` content types, and is only available for Gemini 3+ models. +::: + +## Video Metadata Control + +For Gemini 3+ models, LiteLLM supports fine-grained video processing control through the `video_metadata` field. This allows you to specify frame extraction rates and time ranges for video analysis. 
+ +**Supported `video_metadata` parameters:** + +| Parameter | Type | Description | Example | +|-----------|------|-------------|---------| +| `fps` | Number | Frame extraction rate (frames per second) | `5` | +| `start_offset` | String | Start time for video clip processing | `"10s"` | +| `end_offset` | String | End time for video clip processing | `"60s"` | + +:::note +**Field Name Conversion:** LiteLLM automatically converts snake_case field names to camelCase for the Gemini API: +- `start_offset` → `startOffset` +- `end_offset` → `endOffset` +- `fps` remains unchanged +::: + +:::warning +- **Gemini 3+ Only:** This feature is only available for Gemini 3.0 and newer models +- **Video Files Recommended:** While `video_metadata` is designed for video files, error handling for other media types is delegated to the Vertex AI API +- **File Formats Supported:** Works with `gs://`, `https://`, and base64-encoded video files +::: + +**Usage Examples:** + + + + +```python +from litellm import completion + +response = completion( + model="vertex_ai/gemini-3-pro-preview", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video clip"}, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "video_metadata": { + "fps": 5, # Extract 5 frames per second + "start_offset": "10s", # Start from 10 seconds + "end_offset": "60s" # End at 60 seconds + } + } + } + ] + } + ] +) + +print(response.choices[0].message.content) +``` + + + + +```python +from litellm import completion + +response = completion( + model="vertex_ai/gemini-3-pro-preview", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "Provide detailed analysis of this video segment"}, + { + "type": "file", + "file": { + "file_id": "https://example.com/presentation.mp4", + "format": "video/mp4", + "detail": "high", # High resolution for detailed analysis + "video_metadata": { + "fps": 10, # Extract 10 frames per second + "start_offset": "30s", # Start from 30 seconds + "end_offset": "90s" # End at 90 seconds + } + } + } + ] + } + ] +) + +print(response.choices[0].message.content) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: gemini-3-pro + litellm_params: + model: vertex_ai/gemini-3-pro-preview + vertex_project: your-project + vertex_location: us-central1 +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Make request + +```bash +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer " \ + -d '{ + "model": "gemini-3-pro", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video clip"}, + { + "type": "file", + "file": { + "file_id": "gs://my-bucket/video.mp4", + "format": "video/mp4", + "detail": "high", + "video_metadata": { + "fps": 5, + "start_offset": "10s", + "end_offset": "60s" + } + } + } + ] + } + ] + }' +``` + + + ## Usage - PDF / Videos / Audio etc. Files @@ -2550,355 +2920,6 @@ print(response) - -## **Gemini TTS (Text-to-Speech) Audio Output** - -:::info - -LiteLLM supports Gemini TTS models on Vertex AI that can generate audio responses using the OpenAI-compatible `audio` parameter format. - -::: - -### Supported Models - -LiteLLM supports Gemini TTS models with audio capabilities on Vertex AI (e.g. `vertex_ai/gemini-2.5-flash-preview-tts` and `vertex_ai/gemini-2.5-pro-preview-tts`). 
For the complete list of available TTS models and voices, see the [official Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/speech-generation). - -### Limitations - -:::warning - -**Important Limitations**: -- Gemini TTS models only support the `pcm16` audio format -- **Streaming support has not been added** to TTS models yet -- The `modalities` parameter must be set to `['audio']` for TTS requests - -::: - -### Quick Start - - - - -```python -from litellm import completion -import json - -## GET CREDENTIALS -file_path = 'path/to/vertex_ai_service_account.json' - -# Load the JSON file -with open(file_path, 'r') as file: - vertex_credentials = json.load(file) - -# Convert to JSON string -vertex_credentials_json = json.dumps(vertex_credentials) - -response = completion( - model="vertex_ai/gemini-2.5-flash-preview-tts", - messages=[{"role": "user", "content": "Say hello in a friendly voice"}], - modalities=["audio"], # Required for TTS models - audio={ - "voice": "Kore", - "format": "pcm16" # Required: must be "pcm16" - }, - vertex_credentials=vertex_credentials_json -) - -print(response) -``` - - - - -1. Setup config.yaml - -```yaml -model_list: - - model_name: gemini-tts-flash - litellm_params: - model: vertex_ai/gemini-2.5-flash-preview-tts - vertex_project: "your-project-id" - vertex_location: "us-central1" - vertex_credentials: "/path/to/service_account.json" - - model_name: gemini-tts-pro - litellm_params: - model: vertex_ai/gemini-2.5-pro-preview-tts - vertex_project: "your-project-id" - vertex_location: "us-central1" - vertex_credentials: "/path/to/service_account.json" -``` - -2. Start proxy - -```bash -litellm --config /path/to/config.yaml -``` - -3. Make TTS request - -```bash -curl http://0.0.0.0:4000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer " \ - -d '{ - "model": "gemini-tts-flash", - "messages": [{"role": "user", "content": "Say hello in a friendly voice"}], - "modalities": ["audio"], - "audio": { - "voice": "Kore", - "format": "pcm16" - } - }' -``` - - - - -### Advanced Usage - -You can combine TTS with other Gemini features: - -```python -response = completion( - model="vertex_ai/gemini-2.5-pro-preview-tts", - messages=[ - {"role": "system", "content": "You are a helpful assistant that speaks clearly."}, - {"role": "user", "content": "Explain quantum computing in simple terms"} - ], - modalities=["audio"], - audio={ - "voice": "Charon", - "format": "pcm16" - }, - temperature=0.7, - max_tokens=150, - vertex_credentials=vertex_credentials_json -) -``` - -For more information about Gemini's TTS capabilities and available voices, see the [official Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/speech-generation). 
- -## **Text to Speech APIs** - -:::info - -LiteLLM supports calling [Vertex AI Text to Speech API](https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech) in the OpenAI text to speech API format - -::: - - - -### Usage - Basic - - - - -Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param - -**Sync Usage** - -```python -speech_file_path = Path(__file__).parent / "speech_vertex.mp3" -response = litellm.speech( - model="vertex_ai/", - input="hello what llm guardrail do you have", -) -response.stream_to_file(speech_file_path) -``` - -**Async Usage** -```python -speech_file_path = Path(__file__).parent / "speech_vertex.mp3" -response = litellm.aspeech( - model="vertex_ai/", - input="hello what llm guardrail do you have", -) -response.stream_to_file(speech_file_path) -``` - - - - -1. Add model to config.yaml -```yaml -model_list: - - model_name: vertex-tts - litellm_params: - model: vertex_ai/ # Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param - vertex_project: "adroit-crow-413218" - vertex_location: "us-central1" - vertex_credentials: adroit-crow-413218-a956eef1a2a8.json - -litellm_settings: - drop_params: True -``` - -2. Start Proxy - -``` -$ litellm --config /path/to/config.yaml -``` - -3. Make Request use OpenAI Python SDK - - -```python -import openai - -client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") - -# see supported values for "voice" on vertex here: -# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech -response = client.audio.speech.create( - model = "vertex-tts", - input="the quick brown fox jumped over the lazy dogs", - voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'} -) -print("response from proxy", response) -``` - - - - - -### Usage - `ssml` as input - -Pass your `ssml` as input to the `input` param, if it contains ``, it will be automatically detected and passed as `ssml` to the Vertex AI API - -If you need to force your `input` to be passed as `ssml`, set `use_ssml=True` - - - - -Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param - - -```python -speech_file_path = Path(__file__).parent / "speech_vertex.mp3" - - -ssml = """ - -

-<speak>
-    <p>Hello, world!</p>
-    <p>This is a test of the text-to-speech API.</p>
-</speak>
-""" - -response = litellm.speech( - input=ssml, - model="vertex_ai/test", - voice={ - "languageCode": "en-UK", - "name": "en-UK-Studio-O", - }, - audioConfig={ - "audioEncoding": "LINEAR22", - "speakingRate": "10", - }, -) -response.stream_to_file(speech_file_path) -``` - -
- - - -```python -import openai - -client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") - -ssml = """ - -

-<speak>
-    <p>Hello, world!</p>
-    <p>This is a test of the text-to-speech API.</p>
-</speak>
-""" - -# see supported values for "voice" on vertex here: -# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech -response = client.audio.speech.create( - model = "vertex-tts", - input=ssml, - voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, -) -print("response from proxy", response) -``` - -
-
- - -### Forcing SSML Usage - -You can force the use of SSML by setting the `use_ssml` parameter to `True`. This is useful when you want to ensure that your input is treated as SSML, even if it doesn't contain the `` tags. - -Here are examples of how to force SSML usage: - - - - - -Vertex AI does not support passing a `model` param - so passing `model=vertex_ai/` is the only required param - - -```python -speech_file_path = Path(__file__).parent / "speech_vertex.mp3" - - -ssml = """ - -

-<speak>
-    <p>Hello, world!</p>
-    <p>This is a test of the text-to-speech API.</p>
-</speak>
-""" - -response = litellm.speech( - input=ssml, - use_ssml=True, - model="vertex_ai/test", - voice={ - "languageCode": "en-UK", - "name": "en-UK-Studio-O", - }, - audioConfig={ - "audioEncoding": "LINEAR22", - "speakingRate": "10", - }, -) -response.stream_to_file(speech_file_path) -``` - -
- - - -```python -import openai - -client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") - -ssml = """ - -

-<speak>
-    <p>Hello, world!</p>
-    <p>This is a test of the text-to-speech API.</p>
-</speak>
-""" - -# see supported values for "voice" on vertex here: -# https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech -response = client.audio.speech.create( - model = "vertex-tts", - input=ssml, # pass as None since OpenAI SDK requires this param - voice={'languageCode': 'en-US', 'name': 'en-US-Studio-O'}, - extra_body={"use_ssml": True}, -) -print("response from proxy", response) -``` - -
-
- ## **Fine Tuning APIs** diff --git a/docs/my-website/docs/providers/vertex_ai/videos.md b/docs/my-website/docs/providers/vertex_ai/videos.md new file mode 100644 index 000000000000..4aaf74354b1e --- /dev/null +++ b/docs/my-website/docs/providers/vertex_ai/videos.md @@ -0,0 +1,268 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Vertex AI Video Generation (Veo) + +LiteLLM supports Vertex AI's Veo video generation models using the unified OpenAI video API surface. + +| Property | Details | +|-------|-------| +| Description | Google Cloud Vertex AI Veo video generation models | +| Provider Route on LiteLLM | `vertex_ai/` | +| Supported Models | `veo-2.0-generate-001`, `veo-3.0-generate-preview`, `veo-3.0-fast-generate-preview`, `veo-3.1-generate-preview`, `veo-3.1-fast-generate-preview` | +| Cost Tracking | ✅ Duration-based pricing | +| Logging Support | ✅ Full request/response logging | +| Proxy Server Support | ✅ Full proxy integration with virtual keys | +| Spend Management | ✅ Budget tracking and rate limiting | +| Link to Provider Doc | [Vertex AI Veo Documentation ↗](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/veo-video-generation) | + +## Quick Start + +### Required Environment Setup + +```python +import json +import os + +os.environ["VERTEXAI_PROJECT"] = "your-gcp-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +# Option 1: Point to a service account file +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service_account.json" + +# Option 2: Store the service account JSON directly +with open("/path/to/service_account.json", "r", encoding="utf-8") as f: + os.environ["VERTEXAI_CREDENTIALS"] = f.read() +``` + +### Basic Usage + +```python +from litellm import video_generation, video_status, video_content +import json +import os +import time + +with open("/path/to/service_account.json", "r", encoding="utf-8") as f: + vertex_credentials = f.read() + +response = video_generation( + model="vertex_ai/veo-3.0-generate-preview", + prompt="A cat playing with a ball of yarn in a sunny garden", + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, + seconds="8", + size="1280x720", +) + +print(f"Video ID: {response.id}") +print(f"Initial Status: {response.status}") + +# Poll for completion +while True: + status = video_status( + video_id=response.id, + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, + ) + + print(f"Current Status: {status.status}") + + if status.status == "completed": + break + if status.status == "failed": + raise RuntimeError("Video generation failed") + + time.sleep(10) + +# Download the rendered video +video_bytes = video_content( + video_id=response.id, + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, +) + +with open("generated_video.mp4", "wb") as f: + f.write(video_bytes) +``` + +## Supported Models + +| Model Name | Description | Max Duration | Status | +|------------|-------------|--------------|--------| +| veo-2.0-generate-001 | Veo 2.0 video generation | 5 seconds | GA | +| veo-3.0-generate-preview | Veo 3.0 high quality | 8 seconds | Preview | +| veo-3.0-fast-generate-preview | Veo 3.0 fast generation | 8 seconds | Preview | +| veo-3.1-generate-preview | Veo 3.1 high quality | 10 seconds | Preview | +| veo-3.1-fast-generate-preview | Veo 3.1 fast | 10 seconds | Preview | + +## Video Generation Parameters 
+ +LiteLLM converts OpenAI-style parameters to Veo's API shape automatically: + +| OpenAI Parameter | Vertex AI Parameter | Description | Example | +|------------------|---------------------|-------------|---------| +| `prompt` | `instances[].prompt` | Text description of the video | "A cat playing" | +| `size` | `parameters.aspectRatio` | Converted to `16:9` or `9:16` | "1280x720" → `16:9` | +| `seconds` | `parameters.durationSeconds` | Clip length in seconds | "8" → `8` | +| `input_reference` | `instances[].image` | Reference image for animation | `open("image.jpg", "rb")` | +| Provider-specific params | `extra_body` | Forwarded to Vertex API | `{"negativePrompt": "blurry"}` | + +### Size to Aspect Ratio Mapping + +- `1280x720`, `1920x1080` → `16:9` +- `720x1280`, `1080x1920` → `9:16` +- Unknown sizes default to `16:9` + +## Async Usage + +```python +from litellm import avideo_generation, avideo_status, avideo_content +import asyncio +import json + +with open("/path/to/service_account.json", "r", encoding="utf-8") as f: + vertex_credentials = f.read() + + +async def workflow(): + response = await avideo_generation( + model="vertex_ai/veo-3.1-generate-preview", + prompt="Slow motion water droplets splashing into a pool", + seconds="10", + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, + ) + + while True: + status = await avideo_status( + video_id=response.id, + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, + ) + + if status.status == "completed": + break + if status.status == "failed": + raise RuntimeError("Video generation failed") + + await asyncio.sleep(10) + + video_bytes = await avideo_content( + video_id=response.id, + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, + ) + + with open("veo_water.mp4", "wb") as f: + f.write(video_bytes) + +asyncio.run(workflow()) +``` + +## LiteLLM Proxy Usage + +Add Veo models to your `config.yaml`: + +```yaml +model_list: + - model_name: veo-3 + litellm_params: + model: vertex_ai/veo-3.0-generate-preview + vertex_project: os.environ/VERTEXAI_PROJECT + vertex_location: os.environ/VERTEXAI_LOCATION + vertex_credentials: os.environ/VERTEXAI_CREDENTIALS +``` + +Start the proxy and make requests: + + + + +```bash +# Step 1: Generate video +curl --location 'http://0.0.0.0:4000/videos' \ +--header 'Content-Type: application/json' \ +--header 'Authorization: Bearer sk-1234' \ +--data '{ + "model": "veo-3", + "prompt": "Aerial shot over a futuristic city at sunrise", + "seconds": "8" +}' + +# Step 2: Poll status +curl --location 'http://localhost:4000/v1/videos/{video_id}' \ +--header 'x-litellm-api-key: sk-1234' + +# Step 3: Download video +curl --location 'http://localhost:4000/v1/videos/{video_id}/content' \ +--header 'x-litellm-api-key: sk-1234' \ +--output video.mp4 +``` + + + + +```python +import litellm + +litellm.api_base = "http://0.0.0.0:4000" +litellm.api_key = "sk-1234" + +response = litellm.video_generation( + model="veo-3", + prompt="Aerial shot over a futuristic city at sunrise", +) + +status = litellm.video_status(video_id=response.id) +while status.status not in ["completed", "failed"]: + status = litellm.video_status(video_id=response.id) + +if status.status == "completed": + content = litellm.video_content(video_id=response.id) + with open("veo_city.mp4", "wb") as f: + f.write(content) +``` + + + + +## Cost Tracking + +LiteLLM records the duration 
returned by Veo so you can apply duration-based pricing. + +```python +with open("/path/to/service_account.json", "r", encoding="utf-8") as f: + vertex_credentials = f.read() + +response = video_generation( + model="vertex_ai/veo-2.0-generate-001", + prompt="Flowers blooming in fast forward", + seconds="5", + vertex_project="your-gcp-project-id", + vertex_location="us-central1", + vertex_credentials=vertex_credentials, +) + +print(response.usage) # {"duration_seconds": 5.0} +``` + +## Troubleshooting + +- **`vertex_project is required`**: set `VERTEXAI_PROJECT` env var or pass `vertex_project` in the request. +- **`Permission denied`**: ensure the service account has the `Vertex AI User` role and the correct region enabled. +- **Video stuck in `processing`**: Veo operations are long-running. Continue polling every 10–15 seconds up to ~10 minutes. + +## See Also + +- [OpenAI Video Generation](../openai/videos.md) +- [Azure Video Generation](../azure/videos.md) +- [Gemini Video Generation](../gemini/videos.md) +- [Video Generation API Reference](/docs/videos) + diff --git a/docs/my-website/docs/providers/vertex_ai_agent_engine.md b/docs/my-website/docs/providers/vertex_ai_agent_engine.md new file mode 100644 index 000000000000..3bd40e986845 --- /dev/null +++ b/docs/my-website/docs/providers/vertex_ai_agent_engine.md @@ -0,0 +1,216 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Vertex AI Agent Engine + +Call Vertex AI Agent Engine (Reasoning Engines) in the OpenAI Request/Response format. + +| Property | Details | +|----------|---------| +| Description | Vertex AI Agent Engine provides hosted agent runtimes that can execute agentic workflows with foundation models, tools, and custom logic. | +| Provider Route on LiteLLM | `vertex_ai/agent_engine/{RESOURCE_NAME}` | +| Supported Endpoints | `/chat/completions`, `/v1/messages`, `/v1/responses`, `/v1/a2a/message/send` | +| Provider Doc | [Vertex AI Agent Engine ↗](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/overview) | + +## Quick Start + +### Model Format + +```shell showLineNumbers title="Model Format" +vertex_ai/agent_engine/{RESOURCE_NAME} +``` + +**Example:** +- `vertex_ai/agent_engine/projects/1060139831167/locations/us-central1/reasoningEngines/8263861224643493888` + +### LiteLLM Python SDK + +```python showLineNumbers title="Basic Agent Completion" +import litellm + +response = litellm.completion( + model="vertex_ai/agent_engine/projects/1060139831167/locations/us-central1/reasoningEngines/8263861224643493888", + messages=[ + {"role": "user", "content": "Explain machine learning in simple terms"} + ], +) + +print(response.choices[0].message.content) +``` + +```python showLineNumbers title="Streaming Agent Responses" +import litellm + +response = await litellm.acompletion( + model="vertex_ai/agent_engine/projects/1060139831167/locations/us-central1/reasoningEngines/8263861224643493888", + messages=[ + {"role": "user", "content": "What are the key principles of software architecture?"} + ], + stream=True, +) + +async for chunk in response: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="") +``` + +### LiteLLM Proxy + +#### 1. 
Configure your model in config.yaml + + + + +```yaml showLineNumbers title="LiteLLM Proxy Configuration" +model_list: + - model_name: vertex-agent-1 + litellm_params: + model: vertex_ai/agent_engine/projects/1060139831167/locations/us-central1/reasoningEngines/8263861224643493888 + vertex_project: your-project-id + vertex_location: us-central1 +``` + + + + +#### 2. Start the LiteLLM Proxy + +```bash showLineNumbers title="Start LiteLLM Proxy" +litellm --config config.yaml +``` + +#### 3. Make requests to your Vertex AI Agent Engine + + + + +```bash showLineNumbers title="Basic Agent Request" +curl http://localhost:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -d '{ + "model": "vertex-agent-1", + "messages": [ + {"role": "user", "content": "Summarize the main benefits of cloud computing"} + ] + }' +``` + + + + + +```python showLineNumbers title="Using OpenAI SDK with LiteLLM Proxy" +from openai import OpenAI + +client = OpenAI( + base_url="http://localhost:4000", + api_key="your-litellm-api-key" +) + +response = client.chat.completions.create( + model="vertex-agent-1", + messages=[ + {"role": "user", "content": "What are best practices for API design?"} + ] +) + +print(response.choices[0].message.content) +``` + + + + +## LiteLLM A2A Gateway + +You can also connect to Vertex AI Agent Engine through LiteLLM's A2A (Agent-to-Agent) Gateway UI. This provides a visual way to register and test agents without writing code. + +### 1. Navigate to Agents + +From the sidebar, click "Agents" to open the agent management page, then click "+ Add New Agent". + +![Click Agents](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/9a979927-ce6b-4168-9fba-e53e28f1c2c4/ascreenshot.jpeg?tl_px=0,14&br_px=1376,783&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=17,277) + +![Add New Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/a311750c-2e85-4589-99cb-2ce7e4021e77/ascreenshot.jpeg?tl_px=0,0&br_px=1376,769&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=195,257) + +### 2. Select Vertex AI Agent Engine Type + +Click "A2A Standard" to see available agent types, then select "Vertex AI Agent Engine". + +![Select A2A Standard](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/5b1acc4c-dc3f-4639-b4a0-e64b35c228fd/ascreenshot.jpeg?tl_px=52,0&br_px=1428,769&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,271) + +![Select Vertex AI Agent Engine](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/2f3bab61-3e02-4db7-84f0-82200a0f4136/ascreenshot.jpeg?tl_px=0,244&br_px=1376,1013&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=477,277) + +### 3. 
Configure the Agent + +Fill in the following fields: + +- **Agent Name** - A friendly name for your agent (e.g., `my-vertex-agent`) +- **Reasoning Engine Resource ID** - The full resource path from Google Cloud Console (e.g., `projects/1060139831167/locations/us-central1/reasoningEngines/8263861224643493888`) +- **Vertex Project** - Your Google Cloud project ID +- **Vertex Location** - The region where your agent is deployed (e.g., `us-central1`) + +![Enter Agent Name](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/695b84c7-9511-4337-bf19-f4505ab2b72b/ascreenshot.jpeg?tl_px=0,90&br_px=1376,859&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=480,276) + +![Enter Resource ID](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/ddce64df-b3a3-4519-ab62-f137887bcea2/ascreenshot.jpeg?tl_px=0,294&br_px=1376,1063&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=440,277) + +You can find the Resource ID in Google Cloud Console under Vertex AI > Agent Engine: + +![Copy Resource ID from Google Cloud Console](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/185d7f17-cbaa-45de-948d-49d2091805ea/ascreenshot.jpeg?tl_px=0,165&br_px=1376,934&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=493,276) + +![Enter Vertex Project](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/a64da441-3e61-4811-a1e3-9f0b12c949ff/ascreenshot.jpeg?tl_px=0,233&br_px=1376,1002&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=501,277) + +You can find the Project ID in Google Cloud Console: + +![Copy Project ID from Google Cloud Console](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/9ecad3bb-a534-42d6-9604-33906014fad6/user_cropped_screenshot.webp?tl_px=0,0&br_px=1728,1028&force_format=jpeg&q=100&width=1120.0) + +![Enter Vertex Location](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/316d1f38-4fb7-4377-86b6-c0fe7ac24383/ascreenshot.jpeg?tl_px=0,330&br_px=1376,1099&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=423,277) + +### 4. Create Agent + +Click "Create Agent" to save your configuration. + +![Create Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/fb04b95d-793f-4eed-acf4-d1b3b5fa65e9/ascreenshot.jpeg?tl_px=352,347&br_px=1728,1117&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=623,498) + +### 5. Test in Playground + +Go to "Playground" in the sidebar to test your agent. 
+ +![Go to Playground](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/9e01369b-6102-4fe3-96a7-90082cadfd6e/ascreenshot.jpeg?tl_px=0,0&br_px=1376,769&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=55,226) + +### 6. Select A2A Endpoint + +Click the endpoint dropdown and select `/v1/a2a/message/send`. + +![Select Endpoint](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/d5aeac35-531b-4cf0-af2d-88f0a71fd736/ascreenshot.jpeg?tl_px=0,146&br_px=1376,915&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=299,277) + +### 7. Select Your Agent and Send a Message + +Pick your Vertex AI Agent Engine from the dropdown and send a test message. + +![Select Agent](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/353431f3-a0ba-4436-865d-ae11595e9cc4/ascreenshot.jpeg?tl_px=0,263&br_px=1376,1032&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=270,277) + +![Send Message](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/fbfce72e-f50b-43e1-b6e5-0d41192d8e2d/ascreenshot.jpeg?tl_px=95,347&br_px=1471,1117&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=524,474) + +![Agent Response](https://ajeuwbhvhr.cloudimg.io/https://colony-recorder.s3.amazonaws.com/files/2025-12-16/892dd826-fbf9-4530-8d82-95270889274a/ascreenshot.jpeg?tl_px=0,82&br_px=1376,851&force_format=jpeg&q=100&width=1120.0&wat=1&wat_opacity=0.7&wat_gravity=northwest&wat_url=https://colony-recorder.s3.us-west-1.amazonaws.com/images/watermarks/FB923C_standard.png&wat_pad=485,277) + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `GOOGLE_APPLICATION_CREDENTIALS` | Path to service account JSON key file | +| `VERTEXAI_PROJECT` | Google Cloud project ID | +| `VERTEXAI_LOCATION` | Google Cloud region (default: `us-central1`) | + +```bash +export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json" +export VERTEXAI_PROJECT="your-project-id" +export VERTEXAI_LOCATION="us-central1" +``` + +## Further Reading + +- [Vertex AI Agent Engine Documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/overview) +- [Create a Reasoning Engine](https://cloud.google.com/vertex-ai/generative-ai/docs/reasoning-engine/create) +- [A2A Agent Gateway](../a2a.md) +- [Vertex AI Provider](./vertex.md) diff --git a/docs/my-website/docs/providers/vertex_embedding.md b/docs/my-website/docs/providers/vertex_embedding.md new file mode 100644 index 000000000000..5656ade337b9 --- /dev/null +++ b/docs/my-website/docs/providers/vertex_embedding.md @@ -0,0 +1,587 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Vertex AI Embedding + +## Usage - Embedding + + + + +```python +import litellm +from litellm import embedding +litellm.vertex_project = "hardy-device-38811" # Your Project ID +litellm.vertex_location = "us-central1" # proj location + +response 
= embedding( + model="vertex_ai/textembedding-gecko", + input=["good morning from litellm"], +) +print(response) +``` + + + + + +1. Add model to config.yaml +```yaml +model_list: + - model_name: snowflake-arctic-embed-m-long-1731622468876 + litellm_params: + model: vertex_ai/ + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. Make Request using OpenAI Python SDK, Langchain Python SDK + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.embeddings.create( + model="snowflake-arctic-embed-m-long-1731622468876", + input = ["good morning from litellm", "this is another item"], +) + +print(response) +``` + + + + + +#### Supported Embedding Models +All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a0249f630a6792d49dffc2c5d9b7/model_prices_and_context_window.json#L835) are supported + +| Model Name | Function Call | +|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| text-embedding-004 | `embedding(model="vertex_ai/text-embedding-004", input)` | +| text-multilingual-embedding-002 | `embedding(model="vertex_ai/text-multilingual-embedding-002", input)` | +| textembedding-gecko | `embedding(model="vertex_ai/textembedding-gecko", input)` | +| textembedding-gecko-multilingual | `embedding(model="vertex_ai/textembedding-gecko-multilingual", input)` | +| textembedding-gecko-multilingual@001 | `embedding(model="vertex_ai/textembedding-gecko-multilingual@001", input)` | +| textembedding-gecko@001 | `embedding(model="vertex_ai/textembedding-gecko@001", input)` | +| textembedding-gecko@003 | `embedding(model="vertex_ai/textembedding-gecko@003", input)` | +| text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` | +| text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` | +| Fine-tuned OR Custom Embedding models | `embedding(model="vertex_ai/", input)` | + +### Supported OpenAI (Unified) Params + +| [param](../embedding/supported_embedding.md#input-params-for-litellmembedding) | type | [vertex equivalent](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api) | +|-------|-------------|--------------------| +| `input` | **string or List[string]** | `instances` | +| `dimensions` | **int** | `output_dimensionality` | +| `input_type` | **Literal["RETRIEVAL_QUERY","RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING", "QUESTION_ANSWERING", "FACT_VERIFICATION"]** | `task_type` | + +#### Usage with OpenAI (Unified) Params + + + + + +```python +response = litellm.embedding( + model="vertex_ai/text-embedding-004", + input=["good morning from litellm", "gm"] + input_type = "RETRIEVAL_DOCUMENT", + dimensions=1, +) +``` + + + + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.embeddings.create( + model="text-embedding-004", + input = ["good morning from litellm", "gm"], + dimensions=1, + extra_body = { + "input_type": "RETRIEVAL_QUERY", + } +) + +print(response) +``` + + + + +### Supported Vertex Specific Params + +| param | type | 
+|-------|-------------| +| `auto_truncate` | **bool** | +| `task_type` | **Literal["RETRIEVAL_QUERY","RETRIEVAL_DOCUMENT", "SEMANTIC_SIMILARITY", "CLASSIFICATION", "CLUSTERING", "QUESTION_ANSWERING", "FACT_VERIFICATION"]** | +| `title` | **str** | + +#### Usage with Vertex Specific Params (Use `task_type` and `title`) + +You can pass any vertex specific params to the embedding model. Just pass them to the embedding function like this: + +[Relevant Vertex AI doc with all embedding params](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text-embeddings-api#request_body) + + + + +```python +response = litellm.embedding( + model="vertex_ai/text-embedding-004", + input=["good morning from litellm", "gm"] + task_type = "RETRIEVAL_DOCUMENT", + title = "test", + dimensions=1, + auto_truncate=True, +) +``` + + + + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.embeddings.create( + model="text-embedding-004", + input = ["good morning from litellm", "gm"], + dimensions=1, + extra_body = { + "task_type": "RETRIEVAL_QUERY", + "auto_truncate": True, + "title": "test", + } +) + +print(response) +``` + + + +## **BGE Embeddings** + +Use BGE (Baidu General Embedding) models deployed on Vertex AI. + +### Usage + + + + +```python showLineNumbers title="Using BGE on Vertex AI" +import litellm + +response = litellm.embedding( + model="vertex_ai/bge/", + input=["Hello", "World"], + vertex_project="your-project-id", + vertex_location="your-location" +) + +print(response) +``` + + + + + +1. Add model to config.yaml +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: bge-embedding + litellm_params: + model: vertex_ai/bge/ + vertex_project: "your-project-id" + vertex_location: "us-central1" + vertex_credentials: your-credentials.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +```bash +$ litellm --config /path/to/config.yaml +``` + +3. Make Request using OpenAI Python SDK + +```python showLineNumbers title="Making requests to BGE" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.embeddings.create( + model="bge-embedding", + input=["good morning from litellm", "this is another item"] +) + +print(response) +``` + +Using a Private Service Connect (PSC) endpoint + +```yaml showLineNumbers title="config.yaml (PSC)" +model_list: + - model_name: bge-small-en-v1.5 + litellm_params: + model: vertex_ai/bge/1234567890 + api_base: http://10.96.32.8 # Your PSC IP + vertex_project: my-project-id #optional + vertex_location: us-central1 #optional +``` + + + + +## **Multi-Modal Embeddings** + + +Known Limitations: +- Only supports 1 image / video / image per request +- Only supports GCS or base64 encoded images / videos + +### Usage + + + + +Using GCS Images + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input="gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" # will be sent as a gcs image +) +``` + +Using base 64 encoded images + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input="data:image/jpeg;base64,..." # will be sent as a base64 encoded image +) +``` + + + + +1. 
Add model to config.yaml +```yaml +model_list: + - model_name: multimodalembedding@001 + litellm_params: + model: vertex_ai/multimodalembedding@001 + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. Make Request use OpenAI Python SDK, Langchain Python SDK + + + + + + +Requests with GCS Image / Video URI + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", +) + +print(response) +``` + +Requests with base64 encoded images + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = "data:image/jpeg;base64,...", +) + +print(response) +``` + + + + + +Requests with GCS Image / Video URI +```python +from langchain_openai import OpenAIEmbeddings + +embeddings_models = "multimodalembedding@001" + +embeddings = OpenAIEmbeddings( + model="multimodalembedding@001", + base_url="http://0.0.0.0:4000", + api_key="sk-1234", # type: ignore +) + + +query_result = embeddings.embed_query( + "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" +) +print(query_result) + +``` + +Requests with base64 encoded images + +```python +from langchain_openai import OpenAIEmbeddings + +embeddings_models = "multimodalembedding@001" + +embeddings = OpenAIEmbeddings( + model="multimodalembedding@001", + base_url="http://0.0.0.0:4000", + api_key="sk-1234", # type: ignore +) + + +query_result = embeddings.embed_query( + "data:image/jpeg;base64,..." +) +print(query_result) + +``` + + + + + + + + + +1. Add model to config.yaml +```yaml +default_vertex_config: + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. 
Make Request use OpenAI Python SDK + +```python +import vertexai + +from vertexai.vision_models import Image, MultiModalEmbeddingModel, Video +from vertexai.vision_models import VideoSegmentConfig +from google.auth.credentials import Credentials + + +LITELLM_PROXY_API_KEY = "sk-1234" +LITELLM_PROXY_BASE = "http://0.0.0.0:4000/vertex-ai" + +import datetime + +class CredentialsWrapper(Credentials): + def __init__(self, token=None): + super().__init__() + self.token = token + self.expiry = None # or set to a future date if needed + + def refresh(self, request): + pass + + def apply(self, headers, token=None): + headers['Authorization'] = f'Bearer {self.token}' + + @property + def expired(self): + return False # Always consider the token as non-expired + + @property + def valid(self): + return True # Always consider the credentials as valid + +credentials = CredentialsWrapper(token=LITELLM_PROXY_API_KEY) + +vertexai.init( + project="adroit-crow-413218", + location="us-central1", + api_endpoint=LITELLM_PROXY_BASE, + credentials = credentials, + api_transport="rest", + +) + +model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding") +image = Image.load_from_file( + "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" +) + +embeddings = model.get_embeddings( + image=image, + contextual_text="Colosseum", + dimension=1408, +) +print(f"Image Embedding: {embeddings.image_embedding}") +print(f"Text Embedding: {embeddings.text_embedding}") +``` + + + + + +### Text + Image + Video Embeddings + + + + +Text + Image + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"] # will be sent as a gcs image +) +``` + +Text + Video + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image +) +``` + +Image + Video + +```python +response = await litellm.aembedding( + model="vertex_ai/multimodalembedding@001", + input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"] # will be sent as a gcs image +) +``` + + + + + +1. Add model to config.yaml +```yaml +model_list: + - model_name: multimodalembedding@001 + litellm_params: + model: vertex_ai/multimodalembedding@001 + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + vertex_credentials: adroit-crow-413218-a956eef1a2a8.json + +litellm_settings: + drop_params: True +``` + +2. Start Proxy + +``` +$ litellm --config /path/to/config.yaml +``` + +3. 
Make Request use OpenAI Python SDK, Langchain Python SDK + + +Text + Image + +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["hey", "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"], +) + +print(response) +``` + +Text + Video +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["hey", "gs://my-bucket/embeddings/supermarket-video.mp4"], +) + +print(response) +``` + +Image + Video +```python +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +# # request sent to model set on litellm proxy, `litellm --model` +response = client.embeddings.create( + model="multimodalembedding@001", + input = ["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png", "gs://my-bucket/embeddings/supermarket-video.mp4"], +) + +print(response) +``` + + + \ No newline at end of file diff --git a/docs/my-website/docs/providers/vertex_image.md b/docs/my-website/docs/providers/vertex_image.md index 27e584cb222d..c4d5d554088b 100644 --- a/docs/my-website/docs/providers/vertex_image.md +++ b/docs/my-website/docs/providers/vertex_image.md @@ -1,18 +1,65 @@ # Vertex AI Image Generation -Vertex AI Image Generation uses Google's Imagen models to generate high-quality images from text descriptions. +Vertex AI supports two types of image generation: + +1. **Gemini Image Generation Models** (Nano Banana 🍌) - Conversational image generation using `generateContent` API +2. **Imagen Models** - Traditional image generation using `predict` API | Property | Details | |----------|---------| -| Description | Vertex AI Image Generation uses Google's Imagen models to generate high-quality images from text descriptions. 
| +| Description | Vertex AI Image Generation supports both Gemini image generation models | | Provider Route on LiteLLM | `vertex_ai/` | | Provider Doc | [Google Cloud Vertex AI Image Generation ↗](https://cloud.google.com/vertex-ai/docs/generative-ai/image/generate-images) | +| Gemini Image Generation Docs | [Gemini Image Generation ↗](https://ai.google.dev/gemini-api/docs/image-generation) | ## Quick Start -### LiteLLM Python SDK +### Gemini Image Generation Models + +Gemini image generation models support conversational image creation with features like: +- Text-to-Image generation +- Image editing (text + image → image) +- Multi-turn image refinement +- High-fidelity text rendering +- Up to 4K resolution (Gemini 3 Pro) -```python showLineNumbers title="Basic Image Generation" +```python showLineNumbers title="Gemini 2.5 Flash Image" +import litellm + +# Generate a single image +response = await litellm.aimage_generation( + prompt="A nano banana dish in a fancy restaurant with a Gemini theme", + model="vertex_ai/gemini-2.5-flash-image", + vertex_ai_project="your-project-id", + vertex_ai_location="us-central1", + n=1, + size="1024x1024", +) + +print(response.data[0].b64_json) # Gemini returns base64 images +``` + +```python showLineNumbers title="Gemini 3 Pro Image Preview (4K output)" +import litellm + +# Generate high-resolution image +response = await litellm.aimage_generation( + prompt="Da Vinci style anatomical sketch of a dissected Monarch butterfly", + model="vertex_ai/gemini-3-pro-image-preview", + vertex_ai_project="your-project-id", + vertex_ai_location="us-central1", + n=1, + size="1024x1024", + # Optional: specify image size for Gemini 3 Pro + # imageSize="4K", # Options: "1K", "2K", "4K" +) + +print(response.data[0].b64_json) +``` + +### Imagen Models + +```python showLineNumbers title="Imagen Image Generation" import litellm # Generate a single image @@ -21,9 +68,11 @@ response = await litellm.aimage_generation( model="vertex_ai/imagen-4.0-generate-001", vertex_ai_project="your-project-id", vertex_ai_location="us-central1", + n=1, + size="1024x1024", ) -print(response.data[0].url) +print(response.data[0].b64_json) # Imagen also returns base64 images ``` ### LiteLLM Proxy @@ -70,6 +119,18 @@ print(response.data[0].url) ## Supported Models +### Gemini Image Generation Models + +- `vertex_ai/gemini-2.5-flash-image` - Fast, efficient image generation (1024px resolution) +- `vertex_ai/gemini-3-pro-image-preview` - Advanced model with 4K output, Google Search grounding, and thinking mode +- `vertex_ai/gemini-2.0-flash-preview-image` - Preview model +- `vertex_ai/gemini-2.5-flash-image-preview` - Preview model + +### Imagen Models + +- `vertex_ai/imagegeneration@006` - Legacy Imagen model +- `vertex_ai/imagen-4.0-generate-001` - Latest Imagen model +- `vertex_ai/imagen-3.0-generate-001` - Imagen 3.0 model :::tip @@ -77,7 +138,5 @@ print(response.data[0].url) ::: -LiteLLM supports all Vertex AI Imagen models available through Google Cloud. 
- For the complete and up-to-date list of supported models, visit: [https://models.litellm.ai/](https://models.litellm.ai/) diff --git a/docs/my-website/docs/providers/vertex_ocr.md b/docs/my-website/docs/providers/vertex_ocr.md new file mode 100644 index 000000000000..9ff22a037754 --- /dev/null +++ b/docs/my-website/docs/providers/vertex_ocr.md @@ -0,0 +1,240 @@ +# Vertex AI OCR + +## Overview + +| Property | Details | +|-------|-------| +| Description | Vertex AI OCR provides document intelligence capabilities powered by Mistral, enabling text extraction from PDFs and images | +| Provider Route on LiteLLM | `vertex_ai/` | +| Supported Operations | `/ocr` | +| Link to Provider Doc | [Vertex AI ↗](https://cloud.google.com/vertex-ai) + +Extract text from documents and images using Vertex AI's OCR models, powered by Mistral. + +## Quick Start + +### **LiteLLM SDK** + +```python showLineNumbers title="SDK Usage" +import litellm +import os + +# Set environment variables +os.environ["VERTEXAI_PROJECT"] = "your-project-id" +os.environ["VERTEXAI_LOCATION"] = "us-central1" + +# OCR with PDF URL +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + } +) + +# Access extracted text +for page in response.pages: + print(page.text) +``` + +### **LiteLLM PROXY** + +```yaml showLineNumbers title="proxy_config.yaml" +model_list: + - model_name: vertex-ocr + litellm_params: + model: vertex_ai/mistral-ocr-2505 + vertex_project: os.environ/VERTEXAI_PROJECT + vertex_location: os.environ/VERTEXAI_LOCATION + vertex_credentials: path/to/service-account.json # Optional + model_info: + mode: ocr +``` + +**Start Proxy** +```bash +litellm --config proxy_config.yaml +``` + +**Call OCR via Proxy** +```bash showLineNumbers title="cURL Request" +curl -X POST http://localhost:4000/ocr \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer your-api-key" \ + -d '{ + "model": "vertex-ocr", + "document": { + "type": "document_url", + "document_url": "https://arxiv.org/pdf/2201.04234" + } + }' +``` + +## Authentication + +Vertex AI OCR supports multiple authentication methods: + +### Service Account JSON + +```python showLineNumbers title="Service Account Auth" +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={"type": "document_url", "document_url": "https://..."}, + vertex_project="your-project-id", + vertex_location="us-central1", + vertex_credentials="path/to/service-account.json" +) +``` + +### Application Default Credentials + +```python showLineNumbers title="Default Credentials" +# Relies on GOOGLE_APPLICATION_CREDENTIALS environment variable +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={"type": "document_url", "document_url": "https://..."}, + vertex_project="your-project-id", + vertex_location="us-central1" +) +``` + +## Document Types + +Vertex AI OCR supports both PDFs and images. 
+ +### PDF Documents + +```python showLineNumbers title="PDF OCR" +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + }, + vertex_project="your-project-id", + vertex_location="us-central1" +) +``` + +### Image Documents + +```python showLineNumbers title="Image OCR" +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={ + "type": "image_url", + "image_url": "https://example.com/image.png" + }, + vertex_project="your-project-id", + vertex_location="us-central1" +) +``` + +### Base64 Encoded Documents + +```python showLineNumbers title="Base64 PDF" +import base64 + +# Read and encode PDF +with open("document.pdf", "rb") as f: + pdf_base64 = base64.b64encode(f.read()).decode() + +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", # This doesn't work for deepseek + document={ + "type": "document_url", + "document_url": f"data:application/pdf;base64,{pdf_base64}" + }, + vertex_project="your-project-id", + vertex_location="us-central1" +) +``` + +## Supported Parameters + +```python showLineNumbers title="All Parameters" +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={ # Required: Document to process + "type": "document_url", + "document_url": "https://..." + }, + vertex_project="your-project-id", # Required: GCP project ID + vertex_location="us-central1", # Optional: Defaults to us-central1 + vertex_credentials="path/to/key.json", # Optional: Service account key + include_image_base64=True, # Optional: Include base64 images + pages=[0, 1, 2], # Optional: Specific pages to process + image_limit=10 # Optional: Limit number of images +) +``` + +## Response Format + +```python showLineNumbers title="Response Structure" +# Response has the following structure +response.pages # List of pages with extracted text +response.model # Model used +response.object # "ocr" +response.usage_info # Token usage information + +# Access page content +for page in response.pages: + print(f"Page {page.page_number}:") + print(page.text) +``` + +## Async Support + +```python showLineNumbers title="Async Usage" +import litellm + +response = await litellm.aocr( + model="vertex_ai/mistral-ocr-2505", + document={ + "type": "document_url", + "document_url": "https://example.com/document.pdf" + }, + vertex_project="your-project-id", + vertex_location="us-central1" +) +``` + +## Cost Tracking + +LiteLLM automatically tracks costs for Vertex AI OCR: + +- **Cost per page**: $0.0005 (based on $1.50 per 1,000 pages) + +```python showLineNumbers title="View Cost" +response = litellm.ocr( + model="vertex_ai/mistral-ocr-2505", + document={"type": "document_url", "document_url": "https://..."}, + vertex_project="your-project-id" +) + +# Access cost information +print(f"Cost: ${response._hidden_params.get('response_cost', 0)}") +``` + +## Important Notes + +:::info URL Conversion +Vertex AI Mistral OCR endpoints don't have internet access. LiteLLM automatically converts public URLs to base64 data URIs before sending requests to Vertex AI. +::: + +:::tip Regional Availability +Mistral OCR is available in multiple regions. Specify `vertex_location` to use a region closer to your data: +- `us-central1` (default) +- `europe-west1` +- `asia-southeast1` + +Deepseek OCR is only available in global region. 
+::: + +## Supported Models + +- `mistral-ocr-2505` - Latest Mistral OCR model on Vertex AI +- `deepseek-ocr-maas` - Lates Deepseek OCR model on Vertex AI + +Use the Vertex AI provider prefix: `vertex_ai/` + diff --git a/docs/my-website/docs/providers/vertex_speech.md b/docs/my-website/docs/providers/vertex_speech.md new file mode 100644 index 000000000000..751782a323c0 --- /dev/null +++ b/docs/my-website/docs/providers/vertex_speech.md @@ -0,0 +1,426 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Vertex AI Text to Speech + +| Property | Details | +|-------|-------| +| Description | Google Cloud Text-to-Speech with Chirp3 HD voices and Gemini TTS | +| Provider Route on LiteLLM | `vertex_ai/chirp` (Chirp), `vertex_ai/gemini-*-tts` (Gemini) | + +## Chirp3 HD Voices + +Google Cloud Text-to-Speech API with high-quality Chirp3 HD voices. + +### Quick Start + +#### LiteLLM Python SDK + +```python showLineNumbers title="Chirp3 Quick Start" +from litellm import speech +from pathlib import Path + +speech_file_path = Path(__file__).parent / "speech.mp3" +response = speech( + model="vertex_ai/chirp", + voice="alloy", # OpenAI voice name - automatically mapped + input="Hello, this is Vertex AI Text to Speech", + vertex_project="your-project-id", + vertex_location="us-central1", +) +response.stream_to_file(speech_file_path) +``` + +#### LiteLLM AI Gateway + +**1. Setup config.yaml** + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: vertex-tts + litellm_params: + model: vertex_ai/chirp + vertex_project: "your-project-id" + vertex_location: "us-central1" + vertex_credentials: "/path/to/service_account.json" +``` + +**2. Start the proxy** + +```bash title="Start LiteLLM Proxy" +litellm --config /path/to/config.yaml +``` + +**3. Make requests** + + + + +```bash showLineNumbers title="Chirp3 Quick Start" +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "vertex-tts", + "voice": "alloy", + "input": "Hello, this is Vertex AI Text to Speech" + }' \ + --output speech.mp3 +``` + + + + +```python showLineNumbers title="Chirp3 Quick Start" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.audio.speech.create( + model="vertex-tts", + voice="alloy", + input="Hello, this is Vertex AI Text to Speech", +) +response.stream_to_file("speech.mp3") +``` + + + + +### Voice Mapping + +LiteLLM maps OpenAI voice names to Google Cloud voices. You can use either OpenAI voices or Google Cloud voices directly. 
+ +| OpenAI Voice | Google Cloud Voice | +|-------------|-------------------| +| `alloy` | en-US-Studio-O | +| `echo` | en-US-Studio-M | +| `fable` | en-GB-Studio-B | +| `onyx` | en-US-Wavenet-D | +| `nova` | en-US-Studio-O | +| `shimmer` | en-US-Wavenet-F | + +### Using Google Cloud Voices Directly + +#### LiteLLM Python SDK + +```python showLineNumbers title="Chirp3 HD Voice" +from litellm import speech + +# Pass Chirp3 HD voice name directly +response = speech( + model="vertex_ai/chirp", + voice="en-US-Chirp3-HD-Charon", + input="Hello with a Chirp3 HD voice", + vertex_project="your-project-id", +) +response.stream_to_file("speech.mp3") +``` + +```python showLineNumbers title="Voice as Dict (Multilingual)" +from litellm import speech + +# Pass as dict for full control over language and voice +response = speech( + model="vertex_ai/chirp", + voice={ + "languageCode": "de-DE", + "name": "de-DE-Chirp3-HD-Charon", + }, + input="Hallo, dies ist ein Test", + vertex_project="your-project-id", +) +response.stream_to_file("speech.mp3") +``` + +#### LiteLLM AI Gateway + + + + +```bash showLineNumbers title="Chirp3 HD Voice" +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "vertex-tts", + "voice": "en-US-Chirp3-HD-Charon", + "input": "Hello with a Chirp3 HD voice" + }' \ + --output speech.mp3 +``` + +```bash showLineNumbers title="Voice as Dict (Multilingual)" +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "vertex-tts", + "voice": {"languageCode": "de-DE", "name": "de-DE-Chirp3-HD-Charon"}, + "input": "Hallo, dies ist ein Test" + }' \ + --output speech.mp3 +``` + + + + +```python showLineNumbers title="Chirp3 HD Voice" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.audio.speech.create( + model="vertex-tts", + voice="en-US-Chirp3-HD-Charon", + input="Hello with a Chirp3 HD voice", +) +response.stream_to_file("speech.mp3") +``` + +```python showLineNumbers title="Voice as Dict (Multilingual)" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.audio.speech.create( + model="vertex-tts", + voice={"languageCode": "de-DE", "name": "de-DE-Chirp3-HD-Charon"}, + input="Hallo, dies ist ein Test", +) +response.stream_to_file("speech.mp3") +``` + + + + +Browse available voices: [Google Cloud Text-to-Speech Console](https://console.cloud.google.com/vertex-ai/generative/speech/text-to-speech) + +### Passing Raw SSML + +LiteLLM auto-detects SSML when your input contains `` tags and passes it through unchanged. + +#### LiteLLM Python SDK + +```python showLineNumbers title="SSML Input" +from litellm import speech + +ssml = """ + +

+<speak>
+  Hello, world!
+  This is a test of the text-to-speech API.
+</speak>
+""" + +response = speech( + model="vertex_ai/chirp", + voice="en-US-Studio-O", + input=ssml, # Auto-detected as SSML + vertex_project="your-project-id", +) +response.stream_to_file("speech.mp3") +``` + +```python showLineNumbers title="Force SSML Mode" +from litellm import speech + +# Force SSML mode with use_ssml=True +response = speech( + model="vertex_ai/chirp", + voice="en-US-Studio-O", + input="Speaking slowly", + use_ssml=True, + vertex_project="your-project-id", +) +response.stream_to_file("speech.mp3") +``` + +#### LiteLLM AI Gateway + + + + +```bash showLineNumbers title="SSML Input" +curl http://0.0.0.0:4000/v1/audio/speech \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "vertex-tts", + "voice": "en-US-Studio-O", + "input": "

<speak>Hello! How are you?</speak>
" + }' \ + --output speech.mp3 +``` + +
+ + +```python showLineNumbers title="SSML Input" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +ssml = """

+<speak>
+  Hello!
+  How are you?
+</speak>
""" + +response = client.audio.speech.create( + model="vertex-tts", + voice="en-US-Studio-O", + input=ssml, +) +response.stream_to_file("speech.mp3") +``` + +
+
+ +### Supported Parameters + +| Parameter | Description | Values | +|-----------|-------------|--------| +| `voice` | Voice selection | OpenAI voice, Google Cloud voice name, or dict | +| `input` | Text to convert | Plain text or SSML | +| `speed` | Speaking rate | 0.25 to 4.0 (default: 1.0) | +| `response_format` | Audio format | `mp3`, `opus`, `wav`, `pcm`, `flac` | +| `use_ssml` | Force SSML mode | `True` / `False` | + +### Async Usage + +```python showLineNumbers title="Async Speech Generation" +import asyncio +from litellm import aspeech + +async def main(): + response = await aspeech( + model="vertex_ai/chirp", + voice="alloy", + input="Hello from async", + vertex_project="your-project-id", + ) + response.stream_to_file("speech.mp3") + +asyncio.run(main()) +``` + +--- + +## Gemini TTS + +Gemini models with audio output capabilities using the chat completions API. + +:::warning +**Limitations:** +- Only supports `pcm16` audio format +- Streaming not yet supported +- Must set `modalities: ["audio"]` +- When using via LiteLLM Proxy, must include `"allowed_openai_params": ["audio", "modalities"]` in the request body to enable audio parameters +::: + +### Quick Start + +#### LiteLLM Python SDK + +```python showLineNumbers title="Gemini TTS Quick Start" +from litellm import completion +import json + +# Load credentials +with open('path/to/service_account.json', 'r') as file: + vertex_credentials = json.dumps(json.load(file)) + +response = completion( + model="vertex_ai/gemini-2.5-flash-preview-tts", + messages=[{"role": "user", "content": "Say hello in a friendly voice"}], + modalities=["audio"], + audio={ + "voice": "Kore", + "format": "pcm16" + }, + vertex_credentials=vertex_credentials +) +print(response) +``` + +#### LiteLLM AI Gateway + +**1. Setup config.yaml** + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: gemini-tts + litellm_params: + model: vertex_ai/gemini-2.5-flash-preview-tts + vertex_project: "your-project-id" + vertex_location: "us-central1" + vertex_credentials: "/path/to/service_account.json" +``` + +**2. Start the proxy** + +```bash title="Start LiteLLM Proxy" +litellm --config /path/to/config.yaml +``` + +**3. Make requests** + + + + +```bash showLineNumbers title="Gemini TTS Request" +curl http://0.0.0.0:4000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-1234" \ + -d '{ + "model": "gemini-tts", + "messages": [{"role": "user", "content": "Say hello in a friendly voice"}], + "modalities": ["audio"], + "audio": {"voice": "Kore", "format": "pcm16"}, + "allowed_openai_params": ["audio", "modalities"] + }' +``` + + + + +```python showLineNumbers title="Gemini TTS Request" +import openai + +client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000") + +response = client.chat.completions.create( + model="gemini-tts", + messages=[{"role": "user", "content": "Say hello in a friendly voice"}], + modalities=["audio"], + audio={"voice": "Kore", "format": "pcm16"}, + extra_body={"allowed_openai_params": ["audio", "modalities"]} +) +print(response) +``` + + + + +### Supported Models + +- `vertex_ai/gemini-2.5-flash-preview-tts` +- `vertex_ai/gemini-2.5-pro-preview-tts` + +See [Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/speech-generation) for available voices. 
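+
+The Gemini TTS responses above contain raw PCM16 audio rather than a playable file. Here's a rough sketch of saving the output as a WAV file, assuming the audio comes back base64-encoded on `response.choices[0].message.audio.data` (the OpenAI-style audio field) at 24 kHz mono, which is typical for Gemini TTS:
+
+```python showLineNumbers title="Save PCM16 output as WAV (sketch)"
+import base64
+import wave
+
+from litellm import completion
+
+# Assumes Vertex AI credentials are configured via the environment
+# (see the Quick Start above for loading a service account file).
+response = completion(
+    model="vertex_ai/gemini-2.5-flash-preview-tts",
+    messages=[{"role": "user", "content": "Say hello in a friendly voice"}],
+    modalities=["audio"],
+    audio={"voice": "Kore", "format": "pcm16"},
+)
+
+# Assumption: base64-encoded PCM16 audio in the OpenAI-style audio field
+pcm_bytes = base64.b64decode(response.choices[0].message.audio.data)
+
+# Wrap the raw samples in a WAV container (assumed 24 kHz, mono, 16-bit)
+with wave.open("speech.wav", "wb") as wav_file:
+    wav_file.setnchannels(1)      # mono
+    wav_file.setsampwidth(2)      # 16-bit samples
+    wav_file.setframerate(24000)  # assumed sample rate
+    wav_file.writeframes(pcm_bytes)
+```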
+ +### Advanced Usage + +```python showLineNumbers title="Gemini TTS with System Prompt" +from litellm import completion + +response = completion( + model="vertex_ai/gemini-2.5-pro-preview-tts", + messages=[ + {"role": "system", "content": "You are a helpful assistant that speaks clearly."}, + {"role": "user", "content": "Explain quantum computing in simple terms"} + ], + modalities=["audio"], + audio={"voice": "Charon", "format": "pcm16"}, + temperature=0.7, + max_tokens=150, + vertex_credentials=vertex_credentials +) +``` diff --git a/docs/my-website/docs/providers/vllm_batches.md b/docs/my-website/docs/providers/vllm_batches.md new file mode 100644 index 000000000000..44c4d9149125 --- /dev/null +++ b/docs/my-website/docs/providers/vllm_batches.md @@ -0,0 +1,178 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# vLLM - Batch + Files API + +LiteLLM supports vLLM's Batch and Files API for processing large volumes of requests asynchronously. + +| Feature | Supported | +|---------|-----------| +| `/v1/files` | ✅ | +| `/v1/batches` | ✅ | +| Cost Tracking | ✅ | + +## Quick Start + +### 1. Setup config.yaml + +Define your vLLM model in `config.yaml`. LiteLLM uses the model name to route batch requests to the correct vLLM server. + +```yaml +model_list: + - model_name: my-vllm-model + litellm_params: + model: hosted_vllm/meta-llama/Llama-2-7b-chat-hf + api_base: http://localhost:8000 # your vLLM server +``` + +### 2. Start LiteLLM Proxy + +```bash +litellm --config /path/to/config.yaml +``` + +### 3. Create Batch File + +Create a JSONL file with your batch requests: + +```jsonl +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "my-vllm-model", "messages": [{"role": "user", "content": "Hello!"}]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "my-vllm-model", "messages": [{"role": "user", "content": "How are you?"}]}} +``` + +### 4. Upload File & Create Batch + +:::tip Model Routing +LiteLLM needs to know which model (and therefore which vLLM server) to use for batch operations. Specify the model using the `x-litellm-model` header when uploading files. LiteLLM will encode this model info into the file ID, so subsequent batch operations automatically route to the correct server. + +See [Multi-Account / Model-Based Routing](../batches#multi-account--model-based-routing) for more details. 
+::: + + + + +**Upload File** + +```bash +curl http://localhost:4000/v1/files \ + -H "Authorization: Bearer sk-1234" \ + -H "x-litellm-model: my-vllm-model" \ + -F purpose="batch" \ + -F file="@batch_requests.jsonl" +``` + +**Create Batch** + +```bash +curl http://localhost:4000/v1/batches \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "input_file_id": "file-abc123", + "endpoint": "/v1/chat/completions", + "completion_window": "24h" + }' +``` + +**Check Batch Status** + +```bash +curl http://localhost:4000/v1/batches/batch_abc123 \ + -H "Authorization: Bearer sk-1234" +``` + + + + +```python +import litellm +import asyncio + +async def run_vllm_batch(): + # Upload file + file_obj = await litellm.acreate_file( + file=open("batch_requests.jsonl", "rb"), + purpose="batch", + custom_llm_provider="hosted_vllm", + ) + print(f"File uploaded: {file_obj.id}") + + # Create batch + batch = await litellm.acreate_batch( + completion_window="24h", + endpoint="/v1/chat/completions", + input_file_id=file_obj.id, + custom_llm_provider="hosted_vllm", + ) + print(f"Batch created: {batch.id}") + + # Poll for completion + while True: + batch_status = await litellm.aretrieve_batch( + batch_id=batch.id, + custom_llm_provider="hosted_vllm", + ) + print(f"Status: {batch_status.status}") + + if batch_status.status == "completed": + break + elif batch_status.status in ["failed", "cancelled"]: + raise Exception(f"Batch failed: {batch_status.status}") + + await asyncio.sleep(5) + + # Get results + if batch_status.output_file_id: + results = await litellm.afile_content( + file_id=batch_status.output_file_id, + custom_llm_provider="hosted_vllm", + ) + print(f"Results: {results}") + +asyncio.run(run_vllm_batch()) +``` + + + + +## Supported Operations + +| Operation | Endpoint | Method | +|-----------|----------|--------| +| Upload file | `/v1/files` | POST | +| List files | `/v1/files` | GET | +| Retrieve file | `/v1/files/{file_id}` | GET | +| Delete file | `/v1/files/{file_id}` | DELETE | +| Get file content | `/v1/files/{file_id}/content` | GET | +| Create batch | `/v1/batches` | POST | +| List batches | `/v1/batches` | GET | +| Retrieve batch | `/v1/batches/{batch_id}` | GET | +| Cancel batch | `/v1/batches/{batch_id}/cancel` | POST | + +## Environment Variables + +```bash +# Set vLLM server endpoint +export HOSTED_VLLM_API_BASE="http://localhost:8000" + +# Optional: API key if your vLLM server requires authentication +export HOSTED_VLLM_API_KEY="your-api-key" +``` + +## How Model Routing Works + +When you upload a file with `x-litellm-model: my-vllm-model`, LiteLLM: + +1. Encodes the model name into the returned file ID +2. Uses this encoded model info to automatically route subsequent batch operations to the correct vLLM server +3. No need to specify the model again when creating batches or retrieving results + +This enables multi-tenant batch processing where different teams can use different vLLM deployments through the same LiteLLM proxy. 
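+
+If you're using the OpenAI Python SDK against the proxy instead of curl, the same routing header can be passed via `extra_headers`. This is a minimal sketch, assuming the proxy runs at `localhost:4000` with the example key and model alias from the config above:
+
+```python
+import openai
+
+client = openai.OpenAI(api_key="sk-1234", base_url="http://localhost:4000")
+
+# Upload the batch input file; the x-litellm-model header tells LiteLLM
+# which vLLM deployment this file (and later batches) should route to.
+file_obj = client.files.create(
+    file=open("batch_requests.jsonl", "rb"),
+    purpose="batch",
+    extra_headers={"x-litellm-model": "my-vllm-model"},
+)
+
+# Later batch operations route automatically via the model encoded in the file ID.
+batch = client.batches.create(
+    input_file_id=file_obj.id,
+    endpoint="/v1/chat/completions",
+    completion_window="24h",
+)
+print(batch.id)
+```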
+ +**Learn more:** [Multi-Account / Model-Based Routing](../batches#multi-account--model-based-routing) + +## Related + +- [vLLM Provider Overview](./vllm) +- [Batch API Overview](../batches) +- [Files API](../files_endpoints) diff --git a/docs/my-website/docs/providers/voyage.md b/docs/my-website/docs/providers/voyage.md index 4b729bc9f58a..43369cd6ab78 100644 --- a/docs/my-website/docs/providers/voyage.md +++ b/docs/my-website/docs/providers/voyage.md @@ -14,12 +14,41 @@ import os os.environ['VOYAGE_API_KEY'] = "" response = embedding( - model="voyage/voyage-3-large", + model="voyage/voyage-3.5", input=["good morning from litellm"], ) print(response) ``` +## Supported Parameters + +VoyageAI embeddings support the following optional parameters: + +- `input_type`: Specifies the type of input for retrieval optimization + - `"query"`: Use for search queries + - `"document"`: Use for documents being indexed +- `dimensions`: Output embedding dimensions (256, 512, 1024, or 2048) +- `encoding_format`: Output format (`"float"`, `"int8"`, `"uint8"`, `"binary"`, `"ubinary"`) +- `truncation`: Whether to truncate inputs exceeding max tokens (default: `True`) + +### Example with Parameters + +```python +from litellm import embedding +import os + +os.environ['VOYAGE_API_KEY'] = "your-api-key" + +# Embedding with custom dimensions and input type +response = embedding( + model="voyage/voyage-3.5", + input=["Your text here"], + dimensions=512, + input_type="document" +) +print(f"Embedding dimensions: {len(response.data[0]['embedding'])}") +``` + ## Supported Models All models listed here https://docs.voyageai.com/embeddings/#models-and-specifics are supported @@ -40,5 +69,188 @@ All models listed here https://docs.voyageai.com/embeddings/#models-and-specific | voyage-2 | `embedding(model="voyage/voyage-2", input)` | | voyage-lite-02-instruct | `embedding(model="voyage/voyage-lite-02-instruct", input)` | | voyage-01 | `embedding(model="voyage/voyage-01", input)` | -| voyage-lite-01 | `embedding(model="voyage/voyage-lite-01", input)` | -| voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` | +| voyage-lite-01 | `embedding(model="voyage/voyage-lite-01", input)` | +| voyage-lite-01-instruct | `embedding(model="voyage/voyage-lite-01-instruct", input)` | + +## Contextual Embeddings (voyage-context-3) + +VoyageAI's `voyage-context-3` model provides contextualized chunk embeddings, where each chunk is embedded with awareness of its surrounding document context. This significantly improves retrieval quality compared to standard context-agnostic embeddings. + +### Key Benefits +- Chunks understand their position and role within the full document +- Improved retrieval accuracy for long documents (outperforms competitors by 7-23%) +- Better handling of ambiguous references and cross-chunk dependencies +- Seamless drop-in replacement for standard embeddings in RAG pipelines + +### Usage + +Contextual embeddings require a **nested input format** where each inner list represents chunks from a single document: + +```python +from litellm import embedding +import os + +os.environ['VOYAGE_API_KEY'] = "your-api-key" + +# Single document with multiple chunks +response = embedding( + model="voyage/voyage-context-3", + input=[ + [ + "Chapter 1: Introduction to AI", + "This chapter covers the basics of artificial intelligence.", + "We will explore machine learning and deep learning." 
+ ] + ] +) +print(f"Number of chunk groups: {len(response.data)}") + +# Multiple documents +response = embedding( + model="voyage/voyage-context-3", + input=[ + ["Paris is the capital of France.", "It is known for the Eiffel Tower."], + ["Tokyo is the capital of Japan.", "It is a major economic hub."] + ] +) +print(f"Processed {len(response.data)} documents") +``` + +### Specifications +- Model: `voyage-context-3` +- Context length: 32,000 tokens per document +- Output dimensions: 256, 512, 1024 (default), or 2048 +- Max inputs: 1,000 per request +- Max total tokens: 120,000 +- Max chunks: 16,000 +- Pricing: $0.18 per million tokens + +### When to Use Contextual Embeddings + +**Use `voyage-context-3` when:** +- Processing long documents split into chunks +- Document structure and flow are important +- References between sections matter +- You need to preserve document hierarchy + +**Use standard models (voyage-3.5, voyage-3-large) when:** +- Embedding independent pieces of text +- Processing short queries +- Document context is not relevant +- You need faster/cheaper processing + +## Model Selection Guide + +| Model | Best For | Context Length | Price/M Tokens | +|-------|----------|----------------|----------------| +| voyage-3.5 | General-purpose, multilingual | 32K | $0.06 | +| voyage-3.5-lite | Latency-sensitive applications | 32K | $0.02 | +| voyage-3-large | Best overall quality | 32K | $0.18 | +| voyage-code-3 | Code retrieval and search | 32K | $0.18 | +| voyage-finance-2 | Financial documents | 32K | $0.12 | +| voyage-law-2 | Legal documents | 16K | $0.12 | +| voyage-context-3 | Contextual document embeddings | 32K | $0.18 | + +## Rerank + +Voyage AI provides reranking models to improve search relevance by reordering documents based on their relevance to a query. + +### Quick Start + +```python +from litellm import rerank +import os + +os.environ["VOYAGE_API_KEY"] = "your-api-key" + +response = rerank( + model="voyage/rerank-2.5", + query="What is the capital of France?", + documents=[ + "Paris is the capital of France.", + "London is the capital of England.", + "Berlin is the capital of Germany.", + ], + top_n=3, +) + +print(response) +``` + +### Async Usage + +```python +from litellm import arerank +import os +import asyncio + +os.environ["VOYAGE_API_KEY"] = "your-api-key" + +async def main(): + response = await arerank( + model="voyage/rerank-2.5-lite", + query="Best programming language for beginners?", + documents=[ + "Python is great for beginners due to simple syntax.", + "JavaScript runs in browsers and is versatile.", + "Rust has a steep learning curve but is very safe.", + ], + top_n=2, + ) + print(response) + +asyncio.run(main()) +``` + +### LiteLLM Proxy Usage + +Add to your `config.yaml`: + +```yaml +model_list: + - model_name: rerank-2.5 + litellm_params: + model: voyage/rerank-2.5 + api_key: os.environ/VOYAGE_API_KEY + - model_name: rerank-2.5-lite + litellm_params: + model: voyage/rerank-2.5-lite + api_key: os.environ/VOYAGE_API_KEY +``` + +Test with curl: + +```bash +curl http://localhost:4000/rerank \ + -H "Authorization: Bearer sk-1234" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "rerank-2.5", + "query": "What is the capital of France?", + "documents": [ + "Paris is the capital of France.", + "London is the capital of England.", + "Berlin is the capital of Germany." 
+ ], + "top_n": 3 + }' +``` + +### Supported Rerank Models + +| Model | Context Length | Description | Price/M Tokens | +|-------|----------------|-------------|----------------| +| rerank-2.5 | 32K | Best quality, multilingual, instruction-following | $0.05 | +| rerank-2.5-lite | 32K | Optimized for latency and cost | $0.02 | +| rerank-2 | 16K | Legacy model | $0.05 | +| rerank-2-lite | 8K | Legacy model, faster | $0.02 | + +### Supported Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | Model name (e.g., `voyage/rerank-2.5`) | +| `query` | string | The search query | +| `documents` | list | List of documents to rerank | +| `top_n` | int | Number of top results to return | +| `return_documents` | bool | Whether to include document text in response | diff --git a/docs/my-website/docs/providers/watsonx.md b/docs/my-website/docs/providers/watsonx.md deleted file mode 100644 index 23d8d259ac0a..000000000000 --- a/docs/my-website/docs/providers/watsonx.md +++ /dev/null @@ -1,287 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -# IBM watsonx.ai - -LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings. - -## Environment Variables -```python -os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance -# (required) either one of the following: -os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key -os.environ["WATSONX_TOKEN"] = "" # IAM auth token -# optional - can also be passed as params to completion() or embedding() -os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance -os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models -os.environ["WATSONX_ZENAPIKEY"] = "" # Zen API key (use for long-term api token) -``` - -See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai. - -## Usage - -
- Open In Colab - - -```python -import os -from litellm import completion - -os.environ["WATSONX_URL"] = "" -os.environ["WATSONX_APIKEY"] = "" - -## Call WATSONX `/text/chat` endpoint - supports function calling -response = completion( - model="watsonx/meta-llama/llama-3-1-8b-instruct", - messages=[{ "content": "what is your favorite colour?","role": "user"}], - project_id="" # or pass with os.environ["WATSONX_PROJECT_ID"] -) - -## Call WATSONX `/text/generation` endpoint - not all models support /chat route. -response = completion( - model="watsonx/ibm/granite-13b-chat-v2", - messages=[{ "content": "what is your favorite colour?","role": "user"}], - project_id="" -) -``` - -## Usage - Streaming -```python -import os -from litellm import completion - -os.environ["WATSONX_URL"] = "" -os.environ["WATSONX_APIKEY"] = "" -os.environ["WATSONX_PROJECT_ID"] = "" - -response = completion( - model="watsonx/meta-llama/llama-3-1-8b-instruct", - messages=[{ "content": "what is your favorite colour?","role": "user"}], - stream=True -) -for chunk in response: - print(chunk) -``` - -#### Example Streaming Output Chunk -```json -{ - "choices": [ - { - "finish_reason": null, - "index": 0, - "delta": { - "content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?" - } - } - ], - "created": null, - "model": "watsonx/ibm/granite-13b-chat-v2", - "usage": { - "prompt_tokens": null, - "completion_tokens": null, - "total_tokens": null - } -} -``` - -## Usage - Models in deployment spaces - -Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/` format (where `` is the ID of the deployed model in your deployment space). - -The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=`. - -```python -import litellm -response = litellm.completion( - model="watsonx/deployment/", - messages=[{"content": "Hello, how are you?", "role": "user"}], - space_id="" -) -``` - -## Usage - Embeddings - -LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion. - -```python -from litellm import embedding - -response = embedding( - model="watsonx/ibm/slate-30m-english-rtrvr", - input=["What is the capital of France?"], - project_id="" -) -print(response) -# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8)) -``` - -## OpenAI Proxy Usage - -Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server - -### 1. Save keys in your environment - -```bash -export WATSONX_URL="" -export WATSONX_APIKEY="" -export WATSONX_PROJECT_ID="" -``` - -### 2. Start the proxy - - - - -```bash -$ litellm --model watsonx/meta-llama/llama-3-8b-instruct - -# Server running on http://0.0.0.0:4000 -``` - - - - -```yaml -model_list: - - model_name: llama-3-8b - litellm_params: - # all params accepted by litellm.completion() - model: watsonx/meta-llama/llama-3-8b-instruct - api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY") -``` - - - -### 3. 
Test it - - - - - -```shell -curl --location 'http://0.0.0.0:4000/chat/completions' \ ---header 'Content-Type: application/json' \ ---data ' { - "model": "llama-3-8b", - "messages": [ - { - "role": "user", - "content": "what is your favorite colour?" - } - ] - } -' -``` - - - -```python -import openai -client = openai.OpenAI( - api_key="anything", - base_url="http://0.0.0.0:4000" -) - -# request sent to model set on litellm proxy, `litellm --model` -response = client.chat.completions.create(model="llama-3-8b", messages=[ - { - "role": "user", - "content": "what is your favorite colour?" - } -]) - -print(response) - -``` - - - -```python -from langchain.chat_models import ChatOpenAI -from langchain.prompts.chat import ( - ChatPromptTemplate, - HumanMessagePromptTemplate, - SystemMessagePromptTemplate, -) -from langchain.schema import HumanMessage, SystemMessage - -chat = ChatOpenAI( - openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy - model = "llama-3-8b", - temperature=0.1 -) - -messages = [ - SystemMessage( - content="You are a helpful assistant that im using to make a test request to." - ), - HumanMessage( - content="test from litellm. tell me why it's amazing in 1 sentence" - ), -] -response = chat(messages) - -print(response) -``` - - - - -## Authentication - -### Passing credentials as parameters - -You can also pass the credentials as parameters to the completion and embedding functions. - -```python -import os -from litellm import completion - -response = completion( - model="watsonx/ibm/granite-13b-chat-v2", - messages=[{ "content": "What is your favorite color?","role": "user"}], - url="", - api_key="", - project_id="" -) -``` - - -## Supported IBM watsonx.ai Models - -Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM: - -| Mode Name | Command | -|------------------------------------|------------------------------------------------------------------------------------------| -| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` | -| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` | -| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` | -| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` | -| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` | -| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` | -| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` | -| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` | -| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` | -| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` | -| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` | -| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` | -| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` | -| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` | -| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` | - - -For a list of all available models in watsonx.ai, see 
[here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp). - - -## Supported IBM watsonx.ai Embedding Models - -| Model Name | Function Call | -|------------|------------------------------------------------------------------------| -| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` | -| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` | - - -For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). \ No newline at end of file diff --git a/docs/my-website/docs/providers/watsonx/audio_transcription.md b/docs/my-website/docs/providers/watsonx/audio_transcription.md new file mode 100644 index 000000000000..37b4bb438a2d --- /dev/null +++ b/docs/my-website/docs/providers/watsonx/audio_transcription.md @@ -0,0 +1,57 @@ +# WatsonX Audio Transcription + +## Overview + +| Property | Details | +|----------|---------| +| Description | WatsonX audio transcription using Whisper models for speech-to-text | +| Provider Route on LiteLLM | `watsonx/` | +| Supported Operations | `/v1/audio/transcriptions` | +| Link to Provider Doc | [IBM WatsonX.ai ↗](https://www.ibm.com/watsonx) | + +## Quick Start + +### **LiteLLM SDK** + +```python showLineNumbers title="transcription.py" +import litellm + +response = litellm.transcription( + model="watsonx/whisper-large-v3-turbo", + file=open("audio.mp3", "rb"), + api_base="https://us-south.ml.cloud.ibm.com", + api_key="your-api-key", + project_id="your-project-id" +) +print(response.text) +``` + +### **LiteLLM Proxy** + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: whisper-large-v3-turbo + litellm_params: + model: watsonx/whisper-large-v3-turbo + api_key: os.environ/WATSONX_APIKEY + api_base: os.environ/WATSONX_URL + project_id: os.environ/WATSONX_PROJECT_ID +``` + +```bash title="Request" +curl http://localhost:4000/v1/audio/transcriptions \ + -H "Authorization: Bearer sk-1234" \ + -F file="@audio.mp3" \ + -F model="whisper-large-v3-turbo" +``` + +## Supported Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `model` | string | Model ID (e.g., `watsonx/whisper-large-v3-turbo`) | +| `file` | file | Audio file to transcribe | +| `language` | string | Language code (e.g., `en`) | +| `prompt` | string | Optional prompt to guide transcription | +| `temperature` | float | Sampling temperature (0-1) | +| `response_format` | string | `json`, `text`, `srt`, `verbose_json`, `vtt` | diff --git a/docs/my-website/docs/providers/watsonx/index.md b/docs/my-website/docs/providers/watsonx/index.md new file mode 100644 index 000000000000..14e0c07c081f --- /dev/null +++ b/docs/my-website/docs/providers/watsonx/index.md @@ -0,0 +1,230 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# IBM watsonx.ai + +LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings. 
+ +## Environment Variables +```python +os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance +# (required) either one of the following: +os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key +os.environ["WATSONX_TOKEN"] = "" # IAM auth token +# optional - can also be passed as params to completion() or embedding() +os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance +os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models +os.environ["WATSONX_ZENAPIKEY"] = "" # Zen API key (use for long-term api token) +``` + +See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai. + +## Usage + + + Open In Colab + + +```python showLineNumbers title="Chat Completion" +import os +from litellm import completion + +os.environ["WATSONX_URL"] = "" +os.environ["WATSONX_APIKEY"] = "" + +response = completion( + model="watsonx/meta-llama/llama-3-1-8b-instruct", + messages=[{ "content": "what is your favorite colour?","role": "user"}], + project_id="" +) +``` + +## Usage - Streaming +```python showLineNumbers title="Streaming" +import os +from litellm import completion + +os.environ["WATSONX_URL"] = "" +os.environ["WATSONX_APIKEY"] = "" +os.environ["WATSONX_PROJECT_ID"] = "" + +response = completion( + model="watsonx/meta-llama/llama-3-1-8b-instruct", + messages=[{ "content": "what is your favorite colour?","role": "user"}], + stream=True +) +for chunk in response: + print(chunk) +``` + +## Usage - Models in deployment spaces + +Models deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/` format. + +```python showLineNumbers title="Deployment Space" +import litellm + +response = litellm.completion( + model="watsonx/deployment/", + messages=[{"content": "Hello, how are you?", "role": "user"}], + space_id="" +) +``` + +## Usage - Embeddings + +```python showLineNumbers title="Embeddings" +from litellm import embedding + +response = embedding( + model="watsonx/ibm/slate-30m-english-rtrvr", + input=["What is the capital of France?"], + project_id="" +) +``` + +## LiteLLM Proxy Usage + +### 1. Save keys in your environment + +```bash +export WATSONX_URL="" +export WATSONX_APIKEY="" +export WATSONX_PROJECT_ID="" +``` + +### 2. Start the proxy + + + + +```bash +$ litellm --model watsonx/meta-llama/llama-3-8b-instruct +``` + + + + +```yaml showLineNumbers title="config.yaml" +model_list: + - model_name: llama-3-8b + litellm_params: + model: watsonx/meta-llama/llama-3-8b-instruct + api_key: "os.environ/WATSONX_API_KEY" +``` + + + +### 3. Test it + + + + + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "model": "llama-3-8b", + "messages": [ + { + "role": "user", + "content": "what is your favorite colour?" 
+ } + ] + }' +``` + + + +```python showLineNumbers +import openai + +client = openai.OpenAI( + api_key="anything", + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="llama-3-8b", + messages=[{"role": "user", "content": "what is your favorite colour?"}] +) +print(response) +``` + + + + +## Supported Models + +| Model Name | Command | +|------------------------------------|------------------------------------------------------------------------------------------| +| Llama 3.1 8B Instruct | `completion(model="watsonx/meta-llama/llama-3-1-8b-instruct", messages=messages)` | +| Llama 2 70B Chat | `completion(model="watsonx/meta-llama/llama-2-70b-chat", messages=messages)` | +| Granite 13B Chat V2 | `completion(model="watsonx/ibm/granite-13b-chat-v2", messages=messages)` | +| Mixtral 8X7B Instruct | `completion(model="watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q", messages=messages)` | + +For all available models, see [watsonx.ai documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx). + +## Supported Embedding Models + +| Model Name | Function Call | +|------------|------------------------------------------------------------------------| +| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` | +| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` | + +For all available embedding models, see [watsonx.ai embedding documentation](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx). + + +## Advanced + +### Using Zen API Key + +You can use a Zen API key for long-term authentication instead of generating IAM tokens. Pass it either as an environment variable or as a parameter: + +```python +import os +from litellm import completion + +# Option 1: Set as environment variable +os.environ["WATSONX_ZENAPIKEY"] = "your-zen-api-key" + +response = completion( + model="watsonx/ibm/granite-13b-chat-v2", + messages=[{"content": "What is your favorite color?", "role": "user"}], + project_id="your-project-id" +) + +# Option 2: Pass as parameter +response = completion( + model="watsonx/ibm/granite-13b-chat-v2", + messages=[{"content": "What is your favorite color?", "role": "user"}], + zen_api_key="your-zen-api-key", + project_id="your-project-id" +) +``` + +**Using with LiteLLM Proxy via OpenAI client:** + +```python +import openai + +client = openai.OpenAI( + api_key="sk-1234", # LiteLLM proxy key + base_url="http://0.0.0.0:4000" +) + +response = client.chat.completions.create( + model="watsonx/ibm/granite-3-3-8b-instruct", + messages=[{"role": "user", "content": "What is your favorite color?"}], + max_tokens=2048, + extra_body={ + "project_id": "your-project-id", + "zen_api_key": "your-zen-api-key" + } +) +``` + +See [IBM documentation](https://www.ibm.com/docs/en/watsonx/w-and-w/2.2.0?topic=keys-generating-zenapikey-authorization-tokens) for more information on generating Zen API keys. + + diff --git a/docs/my-website/docs/providers/xai.md b/docs/my-website/docs/providers/xai.md index 49a3640991d8..afeecc215280 100644 --- a/docs/my-website/docs/providers/xai.md +++ b/docs/my-website/docs/providers/xai.md @@ -11,6 +11,68 @@ https://docs.x.ai/docs ::: +## Supported Models + + + +**Latest Release** - Grok 4.1 Fast: Optimized for high-performance agentic tool calling with 2M context and prompt caching. 
+ +| Model | Context | Features | +|-------|---------|----------| +| `xai/grok-4-1-fast-reasoning` | 2M tokens | **Reasoning**, Function calling, Vision, Audio, Web search, Caching | +| `xai/grok-4-1-fast-non-reasoning` | 2M tokens | Function calling, Vision, Audio, Web search, Caching | + +**When to use:** +- ✅ **Reasoning model**: Complex analysis, planning, multi-step reasoning problems +- ✅ **Non-reasoning model**: Simple queries, faster responses, lower token usage + +**Example:** +```python +from litellm import completion + +# With reasoning +response = completion( + model="xai/grok-4-1-fast-reasoning", + messages=[{"role": "user", "content": "Analyze this problem step by step..."}] +) + +# Without reasoning +response = completion( + model="xai/grok-4-1-fast-non-reasoning", + messages=[{"role": "user", "content": "What's 2+2?"}] +) +``` + +--- + +### All Available Models + +| Model Family | Model | Context | Features | +|--------------|-------|---------|----------| +| **Grok 4.1** | `xai/grok-4-1-fast-reasoning` | 2M | **Reasoning**, Tools, Vision, Audio, Web search, Caching | +| | `xai/grok-4-1-fast-non-reasoning` | 2M | Tools, Vision, Audio, Web search, Caching | +| **Grok 4** | `xai/grok-4` | 256K | Tools, Web search | +| | `xai/grok-4-0709` | 256K | Tools, Web search | +| | `xai/grok-4-fast-reasoning` | 2M | **Reasoning**, Tools, Web search | +| | `xai/grok-4-fast-non-reasoning` | 2M | Tools, Web search | +| **Grok 3** | `xai/grok-3` | 131K | Tools, Web search | +| | `xai/grok-3-mini` | 131K | Tools, Web search | +| | `xai/grok-3-fast-beta` | 131K | Tools, Web search | +| **Grok Code** | `xai/grok-code-fast` | 256K | **Reasoning**, Tools, Code generation, Caching | +| **Grok 2** | `xai/grok-2` | 131K | Tools, **Vision** | +| | `xai/grok-2-vision-latest` | 32K | Tools, **Vision** | + +**Features:** +- **Reasoning** = Chain-of-thought reasoning with reasoning tokens +- **Tools** = Function calling / Tool use +- **Web search** = Live internet search +- **Vision** = Image understanding +- **Audio** = Audio input support +- **Caching** = Prompt caching for cost savings +- **Code generation** = Optimized for code tasks + +**Pricing:** See [xAI's pricing page](https://docs.x.ai/docs/models) for current rates. 
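+
+The Grok 2 vision models accept image input using the standard OpenAI content-parts format. A minimal sketch (the image URL is a placeholder):
+
+```python
+from litellm import completion
+import os
+
+os.environ["XAI_API_KEY"] = "your-api-key"
+
+response = completion(
+    model="xai/grok-2-vision-latest",
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What is in this image?"},
+                # Placeholder URL - point this at a real, publicly reachable image
+                {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
+            ],
+        }
+    ],
+)
+print(response.choices[0].message.content)
+```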
+ ## API Key ```python # env variable diff --git a/docs/my-website/docs/providers/xiaomi_mimo.md b/docs/my-website/docs/providers/xiaomi_mimo.md new file mode 100644 index 000000000000..040f51440152 --- /dev/null +++ b/docs/my-website/docs/providers/xiaomi_mimo.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Xiaomi MiMo +https://platform.xiaomimimo.com/#/docs + +:::tip + +**We support ALL Xiaomi MiMo models, just set `model=xiaomi_mimo/` as a prefix when sending litellm requests** + +::: + +## API Key +```python +# env variable +os.environ['XIAOMI_MIMO_API_KEY'] +``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['XIAOMI_MIMO_API_KEY'] = "" +response = completion( + model="xiaomi_mimo/mimo-v2-flash", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + max_tokens=1024, + temperature=0.3, + top_p=0.95, +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['XIAOMI_MIMO_API_KEY'] = "" +response = completion( + model="xiaomi_mimo/mimo-v2-flash", + messages=[ + { + "role": "user", + "content": "What's the weather like in Boston today in Fahrenheit?", + } + ], + stream=True, + max_tokens=1024, + temperature=0.3, + top_p=0.95, +) + +for chunk in response: + print(chunk) +``` + + +## Usage with LiteLLM Proxy Server + +Here's how to call a Xiaomi MiMo model with the LiteLLM Proxy Server + +1. Modify the config.yaml + + ```yaml + model_list: + - model_name: my-model + litellm_params: + model: xiaomi_mimo/ # add xiaomi_mimo/ prefix to route as Xiaomi MiMo provider + api_key: api-key # api key to send your model + ``` + + +2. Start the proxy + + ```bash + $ litellm --config /path/to/config.yaml + ``` + +3. 
Send Request to LiteLLM Proxy Server + + + + + + ```python + import openai + client = openai.OpenAI( + api_key="sk-1234", # pass litellm proxy key, if you're using virtual keys + base_url="http://0.0.0.0:4000" # litellm-proxy-base url + ) + + response = client.chat.completions.create( + model="my-model", + messages = [ + { + "role": "user", + "content": "what llm are you" + } + ], + ) + + print(response) + ``` + + + + + ```shell + curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "my-model", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + }' + ``` + + + + +## Supported Models + +| Model Name | Usage | +|------------|-------| +| mimo-v2-flash | `completion(model="xiaomi_mimo/mimo-v2-flash", messages)` | diff --git a/docs/my-website/docs/providers/zai.md b/docs/my-website/docs/providers/zai.md new file mode 100644 index 000000000000..937ccd676806 --- /dev/null +++ b/docs/my-website/docs/providers/zai.md @@ -0,0 +1,137 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Z.AI (Zhipu AI) +https://z.ai/ + +**We support Z.AI GLM text/chat models, just set `zai/` as a prefix when sending completion requests** + +## API Key +```python +# env variable +os.environ['ZAI_API_KEY'] +``` + +## Sample Usage +```python +from litellm import completion +import os + +os.environ['ZAI_API_KEY'] = "" +response = completion( + model="zai/glm-4.7", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], +) +print(response) +``` + +## Sample Usage - Streaming +```python +from litellm import completion +import os + +os.environ['ZAI_API_KEY'] = "" +response = completion( + model="zai/glm-4.7", + messages=[ + {"role": "user", "content": "hello from litellm"} + ], + stream=True +) + +for chunk in response: + print(chunk) +``` + +## Supported Models + +We support ALL Z.AI GLM models, just set `zai/` as a prefix when sending completion requests. 
+ +| Model Name | Function Call | Notes | +|------------|---------------|-------| +| glm-4.7 | `completion(model="zai/glm-4.7", messages)` | **Latest flagship**, 200K context, **Reasoning** | +| glm-4.6 | `completion(model="zai/glm-4.6", messages)` | 200K context | +| glm-4.5 | `completion(model="zai/glm-4.5", messages)` | 128K context | +| glm-4.5v | `completion(model="zai/glm-4.5v", messages)` | Vision model | +| glm-4.5-x | `completion(model="zai/glm-4.5-x", messages)` | Premium tier | +| glm-4.5-air | `completion(model="zai/glm-4.5-air", messages)` | Lightweight | +| glm-4.5-airx | `completion(model="zai/glm-4.5-airx", messages)` | Fast lightweight | +| glm-4-32b-0414-128k | `completion(model="zai/glm-4-32b-0414-128k", messages)` | 32B parameter model | +| glm-4.5-flash | `completion(model="zai/glm-4.5-flash", messages)` | **FREE tier** | + +## Model Pricing + +| Model | Input ($/1M tokens) | Output ($/1M tokens) | Cached Input ($/1M tokens) | Context Window | +|-------|---------------------|----------------------|---------------------------|----------------| +| glm-4.7 | $0.60 | $2.20 | $0.11 | 200K | +| glm-4.6 | $0.60 | $2.20 | - | 200K | +| glm-4.5 | $0.60 | $2.20 | - | 128K | +| glm-4.5v | $0.60 | $1.80 | - | 128K | +| glm-4.5-x | $2.20 | $8.90 | - | 128K | +| glm-4.5-air | $0.20 | $1.10 | - | 128K | +| glm-4.5-airx | $1.10 | $4.50 | - | 128K | +| glm-4-32b-0414-128k | $0.10 | $0.10 | - | 128K | +| glm-4.5-flash | **FREE** | **FREE** | - | 128K | + +## Using with LiteLLM Proxy + + + + +```python +from litellm import completion +import os + +os.environ['ZAI_API_KEY'] = "" +response = completion( + model="zai/glm-4.7", + messages=[{"role": "user", "content": "Hello, how are you?"}], +) + +print(response.choices[0].message.content) +``` + + + + +1. Setup config.yaml + +```yaml +model_list: + - model_name: glm-4.7 + litellm_params: + model: zai/glm-4.7 + api_key: os.environ/ZAI_API_KEY + - model_name: glm-4.5-flash # Free tier + litellm_params: + model: zai/glm-4.5-flash + api_key: os.environ/ZAI_API_KEY +``` + +2. Run proxy + +```bash +litellm --config config.yaml +``` + +3. Test it! + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "glm-4.7", + "messages": [ + { + "role": "user", + "content": "Hello, how are you?" + } + ] +}' +``` + + + diff --git a/docs/my-website/docs/proxy/access_control.md b/docs/my-website/docs/proxy/access_control.md index 678032be9a2a..7ada3f8b237b 100644 --- a/docs/my-website/docs/proxy/access_control.md +++ b/docs/my-website/docs/proxy/access_control.md @@ -51,7 +51,7 @@ LiteLLM has two types of roles: | Role Name | Permissions | |-----------|-------------| | `org_admin` | Admin over a specific organization. Can create teams and users within their organization ✨ **Premium Feature** | -| `team_admin` | Admin over a specific team. Can manage team members, update team settings, and create keys for their team. ✨ **Premium Feature** | +| `team_admin` | Admin over a specific team. Can manage team members, update team member permissions, and create keys for their team. ✨ **Premium Feature** | ## What Can Each Role Do? 
diff --git a/docs/my-website/docs/proxy/admin_ui_sso.md b/docs/my-website/docs/proxy/admin_ui_sso.md index ae082848b6bd..7b299429db72 100644 --- a/docs/my-website/docs/proxy/admin_ui_sso.md +++ b/docs/my-website/docs/proxy/admin_ui_sso.md @@ -73,8 +73,21 @@ GOOGLE_CLIENT_SECRET= ```shell MICROSOFT_CLIENT_ID="84583a4d-" MICROSOFT_CLIENT_SECRET="nbk8Q~" -MICROSOFT_TENANT="5a39737 +MICROSOFT_TENANT="5a39737" ``` + +**Optional: Custom Microsoft SSO Endpoints** + +If you need to use custom Microsoft SSO endpoints (e.g., for a custom identity provider, sovereign cloud, or proxy), you can override the default endpoints: + +```shell +MICROSOFT_AUTHORIZATION_ENDPOINT="https://your-custom-url.com/oauth2/v2.0/authorize" +MICROSOFT_TOKEN_ENDPOINT="https://your-custom-url.com/oauth2/v2.0/token" +MICROSOFT_USERINFO_ENDPOINT="https://your-custom-graph-api.com/v1.0/me" +``` + +If these are not set, the default Microsoft endpoints are used based on your tenant. + - Set Redirect URI on your App Registration on https://portal.azure.com/ - Set a redirect url = `/sso/callback` ```shell @@ -98,6 +111,42 @@ To set up app roles: 4. Assign users to these roles in your Enterprise Application 5. When users sign in via SSO, LiteLLM will automatically assign them the corresponding role +**Advanced: Custom User Attribute Mapping** + +For certain Microsoft Entra ID configurations, you may need to override the default user attribute field names. This is useful when your organization uses custom claims or non-standard attribute names in the SSO response. + +**Step 1: Debug SSO Response** + +First, inspect the JWT fields returned by your Microsoft SSO provider using the [SSO Debug Route](#debugging-sso-jwt-fields). + +1. Add `/sso/debug/callback` as a redirect URL in your Azure App Registration +2. Navigate to `https:///sso/debug/login` +3. Complete the SSO flow to see the returned user attributes + +**Step 2: Identify Field Attribute Names** + +From the debug response, identify the field names used for email, display name, user ID, first name, and last name. + +**Step 3: Set Environment Variables** + +Override the default attribute names by setting these environment variables: + +| Environment Variable | Description | Default Value | +|---------------------|-------------|---------------| +| `MICROSOFT_USER_EMAIL_ATTRIBUTE` | Field name for user email | `userPrincipalName` | +| `MICROSOFT_USER_DISPLAY_NAME_ATTRIBUTE` | Field name for display name | `displayName` | +| `MICROSOFT_USER_ID_ATTRIBUTE` | Field name for user ID | `id` | +| `MICROSOFT_USER_FIRST_NAME_ATTRIBUTE` | Field name for first name | `givenName` | +| `MICROSOFT_USER_LAST_NAME_ATTRIBUTE` | Field name for last name | `surname` | + +**Step 4: Restart the Proxy** + +After setting the environment variables, restart the proxy: + +```bash +litellm --config /path/to/config.yaml +``` + @@ -130,6 +179,17 @@ GENERIC_INCLUDE_CLIENT_ID = "false" # some providers enforce that the client_id GENERIC_SCOPE = "openid profile email" # default scope openid is sometimes not enough to retrieve basic user info like first_name and last_name located in profile scope ``` +**Assigning User Roles via SSO** + +Use `GENERIC_USER_ROLE_ATTRIBUTE` to specify which attribute in the SSO token contains the user's role. 
The role value must be one of the following supported LiteLLM roles: + +- `proxy_admin` - Admin over the platform +- `proxy_admin_viewer` - Can login, view all keys, view all spend (read-only) +- `internal_user` - Can login, view/create/delete their own keys, view their spend +- `internal_user_view_only` - Can login, view their own keys, view their own spend + +Nested attribute paths are supported (e.g., `claims.role` or `attributes.litellm_role`). + - Set Redirect URI, if your provider requires it - Set a redirect url = `/sso/callback` ```shell @@ -380,3 +440,54 @@ If you need to inspect the JWT fields received from your SSO provider by LiteLLM Once redirected, you should see a page called "SSO Debug Information". This page displays the JWT fields received from your SSO provider (as shown in the image above) + +## Advanced + +### Manage User Roles via Azure App Roles + +Centralize role management by defining user permissions in Azure Entra ID. LiteLLM will automatically assign roles based on your Azure configuration when users sign in—no need to manually manage roles in LiteLLM. + +#### Step 1: Create App Roles on Azure App Registration + +1. Navigate to your App Registration on https://portal.azure.com/ +2. Go to **App roles** > **Create app role** +3. Configure the app role using one of the [supported LiteLLM roles](./access_control.md#global-proxy-roles): + - **Display name**: Admin Viewer (or your preferred display name) + - **Value**: `proxy_admin_viewer` (must match one of the LiteLLM role values exactly) +4. Click **Apply** to save the role +5. Repeat for each LiteLLM role you want to use + + +**Supported LiteLLM role values** (see [full role documentation](./access_control.md#global-proxy-roles)): +- `proxy_admin` - Full admin access +- `proxy_admin_viewer` - Read-only admin access +- `internal_user` - Can create/view/delete own keys +- `internal_user_viewer` - Can view own keys (read-only) + + + +--- + +#### Step 2: Assign Users to App Roles + +1. Navigate to **Enterprise Applications** on https://portal.azure.com/ +2. Select your LiteLLM application +3. Go to **Users and groups** > **Add user/group** +4. Select the user +5. Under **Select a role**, choose the app role you created (e.g., `proxy_admin_viewer`) +6. Click **Assign** to save + + + +--- + +#### Step 3: Sign in and verify + +1. Sign in to the LiteLLM UI via SSO +2. LiteLLM will automatically extract the app role from the JWT token +3. The user will be assigned the corresponding role (you can verify this in the UI by checking the user profile dropdown) + + + +**Note:** The role from Entra ID will take precedence over any existing role in the LiteLLM database. This ensures your SSO provider is the authoritative source for user roles. + diff --git a/docs/my-website/docs/proxy/ai_hub.md b/docs/my-website/docs/proxy/ai_hub.md new file mode 100644 index 000000000000..613629f27d5a --- /dev/null +++ b/docs/my-website/docs/proxy/ai_hub.md @@ -0,0 +1,341 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# AI Hub + +Share models and agents with your organization. Show developers what's available without needing to rebuild them. + +This feature is **available in v1.74.3-stable and above**. + +## Overview + +Admin can select models/agents to expose on public AI hub → Users go to the public url and see what's available. + + + +## Models + +### How to use + +#### 1. 
Go to the Admin UI + +Navigate to the Model Hub page in the Admin UI (`PROXY_BASE_URL/ui/?login=success&page=model-hub-table`) + + + +#### 2. Select the models you want to expose + +Click on `Select Models to Make Public` and select the models you want to expose. + + + +#### 3. Confirm the changes + + + +#### 4. Success! + +Go to the public url (`PROXY_BASE_URL/ui/model_hub_table`) and see available models. + + + +### API Endpoints + +- `GET /public/model_hub` – returns the list of public model groups. Requires a valid user API key. +- `GET /public/model_hub/info` – returns metadata (docs title, version, useful links) for the public model hub. + +## Agents + +:::info +Agents are only available in v1.79.4-stable and above. +::: + +Share pre-built agents (A2A spec) across your organization. Users can discover and use agents without rebuilding them. + +[**Demo Video**](https://drive.google.com/file/d/1r-_Rtiu04RW5Fwwu3_eshtA1oZtC3_DH/view?usp=sharing) + +### 1. Create an agent + +Create an agent that follows the [A2A spec](https://a2a.dev/). + + + + + + + + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/agents' \ +--header 'Authorization: Bearer ' \ +--header 'Content-Type: application/json' \ +--data '{ + "agent_name": "hello-world-agent", + "agent_card_params": { + "protocolVersion": "1.0", + "name": "Hello World Agent", + "description": "Just a hello world agent", + "url": "http://localhost:9999/", + "version": "1.0.0", + "defaultInputModes": ["text"], + "defaultOutputModes": ["text"], + "capabilities": { + "streaming": true + }, + "skills": [ + { + "id": "hello_world", + "name": "Returns hello world", + "description": "just returns hello world", + "tags": ["hello world"], + "examples": ["hi", "hello world"] + } + ] + } +}' +``` + +**Expected Response** + +```json +{ + "agent_id": "123e4567-e89b-12d3-a456-426614174000", + "agent_name": "hello-world-agent", + "agent_card_params": { + "protocolVersion": "1.0", + "name": "Hello World Agent", + "description": "Just a hello world agent", + "url": "http://localhost:9999/", + "version": "1.0.0", + "defaultInputModes": ["text"], + "defaultOutputModes": ["text"], + "capabilities": { + "streaming": true + }, + "skills": [ + { + "id": "hello_world", + "name": "Returns hello world", + "description": "just returns hello world", + "tags": ["hello world"], + "examples": ["hi", "hello world"] + } + ] + }, + "created_at": "2025-11-15T10:30:00Z", + "created_by": "user123" +} +``` + + + + +### 2. Make agent public + +Make the agent discoverable on the AI Hub. + + + + +Navigate to the Agents Tab on the AI Hub page + + + +Select the agents you want to make public and click on `Make Public` button. + + + + + + +**Option 1: Make single agent public** + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/agents/123e4567-e89b-12d3-a456-426614174000/make_public' \ +--header 'Authorization: Bearer ' \ +--header 'Content-Type: application/json' +``` + +**Option 2: Make multiple agents public** + + +```bash +curl -X POST 'http://0.0.0.0:4000/v1/agents/make_public' \ +--header 'Authorization: Bearer ' \ +--header 'Content-Type: application/json' \ +--data '{ + "agent_ids": [ + "123e4567-e89b-12d3-a456-426614174000", + "123e4567-e89b-12d3-a456-426614174001" + ] +}' +``` + +**Expected Response** + +```json +{ + "message": "Successfully updated public agent groups", + "public_agent_groups": [ + "123e4567-e89b-12d3-a456-426614174000" + ], + "updated_by": "user123" +} +``` + + + + + + + +### 3. View public agents + +Users can now discover the agent via the public endpoint. 
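For example, a minimal Python sketch for listing the public agents (assuming the proxy runs locally on port 4000 and `LITELLM_API_KEY` is a placeholder environment variable holding a valid key; this calls the same `/public/agent_hub` route shown in the cURL example below):

```python
# Sketch: list agents exposed on the public AI Hub.
# Assumes a local proxy at http://0.0.0.0:4000 and LITELLM_API_KEY set to a valid key (placeholder name).
import os
import requests

resp = requests.get(
    "http://0.0.0.0:4000/public/agent_hub",
    headers={"Authorization": f"Bearer {os.environ['LITELLM_API_KEY']}"},
    timeout=10,
)
resp.raise_for_status()

# Each entry is an A2A agent card, as shown in the expected response below.
for agent_card in resp.json():
    print(agent_card["name"], "-", agent_card.get("description", ""))
```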
+ + + + + + + + + +```bash +curl -X GET 'http://0.0.0.0:4000/public/agent_hub' \ +--header 'Authorization: Bearer ' +``` + +**Expected Response** + +```json +[ + { + "protocolVersion": "1.0", + "name": "Hello World Agent", + "description": "Just a hello world agent", + "url": "http://localhost:9999/", + "version": "1.0.0", + "defaultInputModes": ["text"], + "defaultOutputModes": ["text"], + "capabilities": { + "streaming": true + }, + "skills": [ + { + "id": "hello_world", + "name": "Returns hello world", + "description": "just returns hello world", + "tags": ["hello world"], + "examples": ["hi", "hello world"] + } + ] + } +] +``` + + + + + +## MCP Servers + +### How to use + +#### 1. Add MCP Server + +Go here for instructions: [MCP Overview](../mcp#adding-your-mcp) + + +#### 2. Make MCP server public + + + + +Navigate to AI Hub page, and select the MCP tab (`PROXY_BASE_URL/ui/?login=success&page=mcp-server-table`) + + + + + + +```bash +curl -L -X POST 'http://localhost:4000/v1/mcp/make_public' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{"mcp_server_ids":["e856f9a3-abc6-45b1-9d06-62fa49ac293d"]}' +``` + + + + + +#### 3. View public MCP servers + +Users can now discover the MCP server via the public endpoint (`PROXY_BASE_URL/ui/model_hub_table`) + + + + + + + + + +```bash +curl -L -X GET 'http://0.0.0.0:4000/public/mcp_hub' \ +-H 'Authorization: Bearer sk-1234' +``` + +**Expected Response** + +```json +[ + { + "server_id": "e856f9a3-abc6-45b1-9d06-62fa49ac293d", + "name": "deepwiki-mcp", + "alias": null, + "server_name": "deepwiki-mcp", + "url": "https://mcp.deepwiki.com/mcp", + "transport": "http", + "spec_path": null, + "auth_type": "none", + "mcp_info": { + "server_name": "deepwiki-mcp", + "description": "free mcp server " + } + }, + { + "server_id": "a634819f-3f93-4efc-9108-e49c5b83ad84", + "name": "deepwiki_2", + "alias": "deepwiki_2", + "server_name": "deepwiki_2", + "url": "https://mcp.deepwiki.com/mcp", + "transport": "http", + "spec_path": null, + "auth_type": "none", + "mcp_info": { + "server_name": "deepwiki_2", + "mcp_server_cost_info": null + } + }, + { + "server_id": "33f950e4-2edb-41fa-91fc-0b9581269be6", + "name": "edc_mcp_server", + "alias": "edc_mcp_server", + "server_name": "edc_mcp_server", + "url": "http://lelvdckdputildev.itg.ti.com:8085/api/mcp", + "transport": "http", + "spec_path": null, + "auth_type": "none", + "mcp_info": { + "server_name": "edc_mcp_server", + "mcp_server_cost_info": null + } + } +] +``` + + + \ No newline at end of file diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md index 4cbcd0cffce9..38d6d47be445 100644 --- a/docs/my-website/docs/proxy/alerting.md +++ b/docs/my-website/docs/proxy/alerting.md @@ -215,16 +215,16 @@ general_settings: alerting: ["slack"] alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting alert_to_webhook_url: { - "llm_exceptions": "https://hooks.slack.com/services/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", - "llm_too_slow": "https://hooks.slack.com/services/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", - "llm_requests_hanging": "https://hooks.slack.com/services/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", - "budget_alerts": "https://hooks.slack.com/services/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", - "db_exceptions": "https://hooks.slack.com/services/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", - "daily_reports": 
"https://hooks.slack.com/services/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", - "spend_reports": "https://hooks.slack.com/services/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", - "cooldown_deployment": "https://hooks.slack.com/services/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", - "new_model_added": "https://hooks.slack.com/services/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", - "outage_alerts": "https://hooks.slack.com/services/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", + "llm_exceptions": "example-slack-webhook-url", + "llm_too_slow": "example-slack-webhook-url", + "llm_requests_hanging": "example-slack-webhook-url", + "budget_alerts": "example-slack-webhook-url", + "db_exceptions": "example-slack-webhook-url", + "daily_reports": "example-slack-webhook-url", + "spend_reports": "example-slack-webhook-url", + "cooldown_deployment": "example-slack-webhook-url", + "new_model_added": "example-slack-webhook-url", + "outage_alerts": "example-slack-webhook-url", } litellm_settings: @@ -399,7 +399,7 @@ curl -X GET --location 'http://0.0.0.0:4000/health/services?service=webhook' \ { "spend": 1, # the spend for the 'event_group' "max_budget": 0, # the 'max_budget' set for the 'event_group' - "token": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "token": "example-api-key-123", "user_id": "default_user_id", "team_id": null, "user_email": null, diff --git a/docs/my-website/docs/proxy/arize_phoenix_prompts.md b/docs/my-website/docs/proxy/arize_phoenix_prompts.md new file mode 100644 index 000000000000..138074b1bc36 --- /dev/null +++ b/docs/my-website/docs/proxy/arize_phoenix_prompts.md @@ -0,0 +1,134 @@ +# Arize Phoenix Prompt Management + +Use prompt versions from [Arize Phoenix](https://phoenix.arize.com/) with LiteLLM SDK and Proxy. + +## Quick Start + +### SDK + +```python +import litellm + +response = litellm.completion( + model="gpt-4o", + prompt_id="UHJvbXB0VmVyc2lvbjox", + prompt_integration="arize_phoenix", + api_key="your-arize-phoenix-token", + api_base="https://app.phoenix.arize.com/s/your-workspace", + prompt_variables={"question": "What is AI?"}, +) +``` + +### Proxy + +**1. Add prompt to config** + +```yaml +prompts: + - prompt_id: "simple_prompt" + litellm_params: + prompt_id: "UHJvbXB0VmVyc2lvbjox" + prompt_integration: "arize_phoenix" + api_base: https://app.phoenix.arize.com/s/your-workspace + api_key: os.environ/PHOENIX_API_KEY + ignore_prompt_manager_model: true # optional: use model from config instead + ignore_prompt_manager_optional_params: true # optional: ignore temp, max_tokens from prompt +``` + +**2. Make request** + +```bash +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer sk-1234' \ + -d '{ + "model": "gpt-3.5-turbo", + "prompt_id": "simple_prompt", + "prompt_variables": { + "question": "Explain quantum computing" + } + }' +``` + +## Configuration + +### Get Arize Phoenix Credentials + +1. **API Token**: Get from [Arize Phoenix Settings](https://app.phoenix.arize.com/) +2. **Workspace URL**: `https://app.phoenix.arize.com/s/{your-workspace}` +3. 
**Prompt ID**: Found in prompt version URL + +**Set environment variable**: +```bash +export PHOENIX_API_KEY="your-token" +``` + +### SDK + PROXY Options + +| Parameter | Required | Description | +|-----------|----------|-------------| +| `prompt_id` | Yes | Arize Phoenix prompt version ID | +| `prompt_integration` | Yes | Set to `"arize_phoenix"` | +| `api_base` | Yes | Workspace URL | +| `api_key` | Yes | Access token | +| `prompt_variables` | No | Variables for template | + +### Proxy-only Options + +| Parameter | Description | +|-----------|-------------| +| `ignore_prompt_manager_model` | Use config model instead of prompt's model | +| `ignore_prompt_manager_optional_params` | Ignore temperature, max_tokens from prompt | + +## Variable Templates + +Arize Phoenix uses Mustache/Handlebars syntax: + +```python +# Template: "Hello {{name}}, question: {{question}}" +prompt_variables = { + "name": "Alice", + "question": "What is ML?" +} +# Result: "Hello Alice, question: What is ML?" +``` + + +## Combine with Additional Messages + +```python +response = litellm.completion( + model="gpt-4o", + prompt_id="UHJvbXB0VmVyc2lvbjox", + prompt_integration="arize_phoenix", + api_base="https://app.phoenix.arize.com/s/your-workspace", + prompt_variables={"question": "Explain AI"}, + messages=[ + {"role": "user", "content": "Keep it under 50 words"} + ] +) +``` + + +## Error Handling + +```python +try: + response = litellm.completion( + model="gpt-4o", + prompt_id="invalid-id", + prompt_integration="arize_phoenix", + api_base="https://app.phoenix.arize.com/s/workspace" + ) +except Exception as e: + print(f"Error: {e}") + # 404: Prompt not found + # 401: Invalid credentials + # 403: Access denied +``` + +## Support + +- [LiteLLM GitHub Issues](https://github.com/BerriAI/litellm/issues) +- [Arize Phoenix Docs](https://docs.arize.com/phoenix) + diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 6da977c8b05c..3cb9e9f3fe43 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -1,28 +1,29 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# Caching +# Caching -:::note +:::note For OpenAI/Anthropic Prompt Caching, go [here](../completion/prompt_caching.md) ::: -Cache LLM Responses. LiteLLM's caching system stores and reuses LLM responses to save costs and reduce latency. When you make the same request twice, the cached response is returned instead of calling the LLM API again. - - +Cache LLM Responses. LiteLLM's caching system stores and reuses LLM responses to save costs and +reduce latency. When you make the same request twice, the cached response is returned instead of +calling the LLM API again. ### Supported Caches - In Memory Cache - Disk Cache -- Redis Cache +- Redis Cache - Qdrant Semantic Cache - Redis Semantic Cache -- s3 Bucket Cache +- S3 Bucket Cache +- GCS Bucket Cache ## Quick Start + @@ -30,6 +31,7 @@ Cache LLM Responses. 
LiteLLM's caching system stores and reuses LLM responses to Caching can be enabled by adding the `cache` key in the `config.yaml` #### Step 1: Add `cache` to the config.yaml + ```yaml model_list: - model_name: gpt-3.5-turbo @@ -41,18 +43,19 @@ model_list: litellm_settings: set_verbose: True - cache: True # set cache responses to True, litellm defaults to using a redis cache + cache: True # set cache responses to True, litellm defaults to using a redis cache ``` -#### [OPTIONAL] Step 1.5: Add redis namespaces, default ttl +#### [OPTIONAL] Step 1.5: Add redis namespaces, default ttl #### Namespace + If you want to create some folder for your keys, you can set a namespace, like this: ```yaml litellm_settings: - cache: true - cache_params: # set cache params for redis + cache: true + cache_params: # set cache params for redis type: redis namespace: "litellm.caching.caching" ``` @@ -63,7 +66,7 @@ and keys will be stored like: litellm.caching.caching: ``` -#### Redis Cluster +#### Redis Cluster @@ -75,12 +78,11 @@ model_list: litellm_params: model: "*" - litellm_settings: cache: True cache_params: type: redis - redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] + redis_startup_nodes: [{ "host": "127.0.0.1", "port": "7001" }] ``` @@ -121,8 +123,7 @@ print("REDIS_CLUSTER_NODES", os.environ["REDIS_CLUSTER_NODES"]) -#### Redis Sentinel - +#### Redis Sentinel @@ -134,7 +135,6 @@ model_list: litellm_params: model: "*" - litellm_settings: cache: true cache_params: @@ -181,18 +181,17 @@ print("REDIS_SENTINEL_NODES", os.environ["REDIS_SENTINEL_NODES"]) ```yaml litellm_settings: - cache: true - cache_params: # set cache params for redis + cache: true + cache_params: # set cache params for redis type: redis ttl: 600 # will be cached on redis for 600s - # default_in_memory_ttl: Optional[float], default is None. time in seconds. - # default_in_redis_ttl: Optional[float], default is None. time in seconds. + # default_in_memory_ttl: Optional[float], default is None. time in seconds. + # default_in_redis_ttl: Optional[float], default is None. time in seconds. ``` - #### SSL -just set `REDIS_SSL="True"` in your .env, and LiteLLM will pick this up. +just set `REDIS_SSL="True"` in your .env, and LiteLLM will pick this up. ```env REDIS_SSL="True" @@ -204,14 +203,14 @@ For quick testing, you can also use REDIS_URL, eg.: REDIS_URL="rediss://.." ``` -but we **don't** recommend using REDIS_URL in prod. We've noticed a performance difference between using it vs. redis_host, port, etc. +but we **don't** recommend using REDIS_URL in prod. We've noticed a performance difference between +using it vs. redis_host, port, etc. #### GCP IAM Authentication For GCP Memorystore Redis with IAM authentication, install the required dependency: -:::info -IAM authentication for redis is only supported via GCP and only on Redis Clusters for now. +:::info IAM authentication for redis is only supported via GCP and only on Redis Clusters for now. 
::: ```shell @@ -229,7 +228,8 @@ litellm_settings: cache: True cache_params: type: redis - redis_startup_nodes: [{"host": "10.128.0.2", "port": 6379}, {"host": "10.128.0.2", "port": 11008}] + redis_startup_nodes: + [{ "host": "10.128.0.2", "port": 6379 }, { "host": "10.128.0.2", "port": 11008 }] gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" ssl: true ssl_cert_reqs: null @@ -242,7 +242,6 @@ litellm_settings: You can configure GCP IAM Redis authentication in your .env: - For Redis Cluster: ```env @@ -283,24 +282,44 @@ Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable cac ``` **Additional kwargs** -You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +:::info +Use `REDIS_*` environment variables to configure all Redis client library parameters. This is the suggested mechanism for toggling Redis settings as it automatically maps environment variables to Redis client kwargs. +::: + +You can pass in any additional redis.Redis arg, by storing the variable + value in your os +environment, like this: + ```shell REDIS_ = "" -``` +``` + +For example: +```shell +REDIS_SSL = "True" +REDIS_SSL_CERT_REQS = "None" +REDIS_CONNECTION_POOL_KWARGS = '{"max_connections": 20}' +``` + +:::warning +**Note**: For non-string Redis parameters (like integers, booleans, or complex objects), avoid using `REDIS_*` environment variables as they may fail during Redis client initialization. Instead, use `cache_kwargs` in your router configuration for such parameters. +::: [**See how it's read from the environment**](https://github.com/BerriAI/litellm/blob/4d7ff1b33b9991dcf38d821266290631d9bcd2dd/litellm/_redis.py#L40) + #### Step 3: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` - + Caching can be enabled by adding the `cache` key in the `config.yaml` #### Step 1: Add `cache` to the config.yaml + ```yaml model_list: - model_name: fake-openai-endpoint @@ -315,13 +334,13 @@ model_list: litellm_settings: set_verbose: True - cache: True # set cache responses to True, litellm defaults to using a redis cache + cache: True # set cache responses to True, litellm defaults to using a redis cache cache_params: type: qdrant-semantic qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list qdrant_collection_name: test_collection qdrant_quantization_config: binary - similarity_threshold: 0.8 # similarity threshold for semantic cache + similarity_threshold: 0.8 # similarity threshold for semantic cache ``` #### Step 2: Add Qdrant Credentials to your .env @@ -332,11 +351,11 @@ QDRANT_API_BASE = "https://5392d382-45*********.cloud.qdrant.io" ``` #### Step 3: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` - #### Step 4. 
Test it ```shell @@ -351,13 +370,15 @@ curl -i http://localhost:4000/v1/chat/completions \ }' ``` -**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is one** +**Expect to see `x-litellm-semantic-similarity` in the response headers when semantic caching is +one** #### Step 1: Add `cache` to the config.yaml + ```yaml model_list: - model_name: gpt-3.5-turbo @@ -369,28 +390,70 @@ model_list: litellm_settings: set_verbose: True - cache: True # set cache responses to True - cache_params: # set cache params for s3 + cache: True # set cache responses to True + cache_params: # set cache params for s3 type: s3 - s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 - s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets + s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 + s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets ``` #### Step 2: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` + + + +#### Step 1: Add `cache` to the config.yaml + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + - model_name: text-embedding-ada-002 + litellm_params: + model: text-embedding-ada-002 + +litellm_settings: + set_verbose: True + cache: True # set cache responses to True + cache_params: # set cache params for gcs + type: gcs + gcs_bucket_name: cache-bucket-litellm # GCS Bucket Name for caching + gcs_path_service_account: os.environ/GCS_PATH_SERVICE_ACCOUNT # use os.environ/ to pass environment variables. This is the path to your GCS service account JSON file + gcs_path: cache/ # [OPTIONAL] GCS path prefix for cache objects +``` + +#### Step 2: Add GCS Credentials to .env + +Set the GCS environment variables in your .env file: + +```shell +GCS_BUCKET_NAME="your-gcs-bucket-name" +GCS_PATH_SERVICE_ACCOUNT="/path/to/service-account.json" +``` + +#### Step 3: Run proxy with config + +```shell +$ litellm --config /path/to/config.yaml +``` + + Caching can be enabled by adding the `cache` key in the `config.yaml` #### Step 1: Add `cache` to the config.yaml + ```yaml model_list: - model_name: gpt-3.5-turbo @@ -405,40 +468,45 @@ model_list: litellm_settings: set_verbose: True - cache: True # set cache responses to True + cache: True # set cache responses to True cache_params: - type: "redis-semantic" - similarity_threshold: 0.8 # similarity threshold for semantic cache + type: "redis-semantic" + similarity_threshold: 0.8 # similarity threshold for semantic cache redis_semantic_cache_embedding_model: azure-embedding-model # set this to a model_name set in model_list ``` #### Step 2: Add Redis Credentials to .env + Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching. 
- ```shell - REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' - ## OR ## - REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' - REDIS_PORT = "" # REDIS_PORT='18841' - REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' - ``` +```shell +REDIS_URL = "" # REDIS_URL='redis://username:password@hostname:port/database' +## OR ## +REDIS_HOST = "" # REDIS_HOST='redis-18841.c274.us-east-1-3.ec2.cloud.redislabs.com' +REDIS_PORT = "" # REDIS_PORT='18841' +REDIS_PASSWORD = "" # REDIS_PASSWORD='liteLlmIsAmazing' +``` **Additional kwargs** -You can pass in any additional redis.Redis arg, by storing the variable + value in your os environment, like this: +You can pass in any additional redis.Redis arg, by storing the variable + value in your os +environment, like this: + ```shell REDIS_ = "" -``` +``` #### Step 3: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` - + #### Step 1: Add `cache` to the config.yaml + ```yaml litellm_settings: cache: True @@ -447,6 +515,7 @@ litellm_settings: ``` #### Step 2: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` @@ -456,15 +525,17 @@ $ litellm --config /path/to/config.yaml #### Step 1: Add `cache` to the config.yaml + ```yaml litellm_settings: cache: True cache_params: type: disk - disk_cache_dir: /tmp/litellm-cache # OPTIONAL, default to ./.litellm_cache + disk_cache_dir: /tmp/litellm-cache # OPTIONAL, default to ./.litellm_cache ``` #### Step 2: Run proxy with config + ```shell $ litellm --config /path/to/config.yaml ``` @@ -473,7 +544,6 @@ $ litellm --config /path/to/config.yaml - ## Usage ### Basic @@ -482,6 +552,7 @@ $ litellm --config /path/to/config.yaml Send the same request twice: + ```shell curl http://0.0.0.0:4000/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -499,10 +570,12 @@ curl http://0.0.0.0:4000/v1/chat/completions \ "temperature": 0.7 }' ``` + Send the same request twice: + ```shell curl --location 'http://0.0.0.0:4000/embeddings' \ --header 'Content-Type: application/json' \ @@ -518,18 +591,19 @@ curl --location 'http://0.0.0.0:4000/embeddings' \ "input": ["write a litellm poem"] }' ``` + ### Dynamic Cache Controls -| Parameter | Type | Description | -|-----------|------|-------------| -| `ttl` | *Optional(int)* | Will cache the response for the user-defined amount of time (in seconds) | -| `s-maxage` | *Optional(int)* | Will only accept cached responses that are within user-defined range (in seconds) | -| `no-cache` | *Optional(bool)* | Will not store the response in cache. | -| `no-store` | *Optional(bool)* | Will not cache the response | -| `namespace` | *Optional(str)* | Will cache the response under a user-defined namespace | +| Parameter | Type | Description | +| ----------- | ---------------- | --------------------------------------------------------------------------------- | +| `ttl` | _Optional(int)_ | Will cache the response for the user-defined amount of time (in seconds) | +| `s-maxage` | _Optional(int)_ | Will only accept cached responses that are within user-defined range (in seconds) | +| `no-cache` | _Optional(bool)_ | Will not store the response in cache. | +| `no-store` | _Optional(bool)_ | Will not cache the response | +| `namespace` | _Optional(str)_ | Will cache the response under a user-defined namespace | Each cache parameter can be controlled on a per-request basis. 
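For example, with the OpenAI Python SDK pointed at the proxy (a minimal sketch, assuming the proxy is running at `http://0.0.0.0:4000` and `sk-1234` is a valid virtual key), the second call should be served from the cache:

```python
# Sketch: send the identical request twice through the LiteLLM proxy.
# The second call should return the cached response instead of hitting the LLM API.
import openai

client = openai.OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

for _ in range(2):
    chat_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "write a poem about litellm!"}],
        temperature=0.7,
    )
    print(chat_completion.choices[0].message.content)
```

Or send the same request twice with cURL: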
Here are examples for each parameter: @@ -558,6 +632,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -574,6 +649,7 @@ curl http://localhost:4000/v1/chat/completions \ ] }' ``` + @@ -602,6 +678,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -618,10 +695,12 @@ curl http://localhost:4000/v1/chat/completions \ ] }' ``` + ### `no-cache` + Force a fresh response, bypassing the cache. @@ -645,6 +724,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -661,6 +741,7 @@ curl http://localhost:4000/v1/chat/completions \ ] }' ``` + @@ -668,7 +749,6 @@ curl http://localhost:4000/v1/chat/completions \ Will not store the response in cache. - @@ -690,6 +770,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -706,10 +787,12 @@ curl http://localhost:4000/v1/chat/completions \ ] }' ``` + ### `namespace` + Store the response under a specific cache namespace. @@ -733,6 +816,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -749,36 +833,37 @@ curl http://localhost:4000/v1/chat/completions \ ] }' ``` + - - ## Set cache for proxy, but not on the actual llm api call -Use this if you just want to enable features like rate limiting, and loadbalancing across multiple instances. - -Set `supported_call_types: []` to disable caching on the actual api call. +Use this if you just want to enable features like rate limiting, and loadbalancing across multiple +instances. +Set `supported_call_types: []` to disable caching on the actual api call. ```yaml litellm_settings: cache: True cache_params: type: redis - supported_call_types: [] + supported_call_types: [] ``` - ## Debugging Caching - `/cache/ping` + LiteLLM Proxy exposes a `/cache/ping` endpoint to test if the cache is working as expected **Usage** + ```shell curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1234" ``` **Expected Response - when cache healthy** + ```shell { "status": "healthy", @@ -803,7 +888,8 @@ curl --location 'http://0.0.0.0:4000/cache/ping' -H "Authorization: Bearer sk-1 ### Control Call Types Caching is on for - (`/chat/completion`, `/embeddings`, etc.) -By default, caching is on for all call types. You can control which call types caching is on for by setting `supported_call_types` in `cache_params` +By default, caching is on for all call types. You can control which call types caching is on for by +setting `supported_call_types` in `cache_params` **Cache will only be on for the call types specified in `supported_call_types`** @@ -812,10 +898,13 @@ litellm_settings: cache: True cache_params: type: redis - supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] - # /chat/completions, /completions, /embeddings, /audio/transcriptions + supported_call_types: + ["acompletion", "atext_completion", "aembedding", "atranscription"] + # /chat/completions, /completions, /embeddings, /audio/transcriptions ``` + ### Set Cache Params on config.yaml + ```yaml model_list: - model_name: gpt-3.5-turbo @@ -827,22 +916,25 @@ model_list: litellm_settings: set_verbose: True - cache: True # set cache responses to True, litellm defaults to using a redis cache - cache_params: # cache_params are optional - type: "redis" # The type of cache to initialize. Can be "local" or "redis". Defaults to "local". - host: "localhost" # The host address for the Redis cache. Required if type is "redis". - port: 6379 # The port number for the Redis cache. Required if type is "redis". 
- password: "your_password" # The password for the Redis cache. Required if type is "redis". - + cache: True # set cache responses to True, litellm defaults to using a redis cache + cache_params: # cache_params are optional + type: "redis" # The type of cache to initialize. Can be "local", "redis", "s3", or "gcs". Defaults to "local". + host: "localhost" # The host address for the Redis cache. Required if type is "redis". + port: 6379 # The port number for the Redis cache. Required if type is "redis". + password: "your_password" # The password for the Redis cache. Required if type is "redis". + # Optional configurations - supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] - # /chat/completions, /completions, /embeddings, /audio/transcriptions + supported_call_types: + ["acompletion", "atext_completion", "aembedding", "atranscription"] + # /chat/completions, /completions, /embeddings, /audio/transcriptions ``` -### Deleting Cache Keys - `/cache/delete` +### Deleting Cache Keys - `/cache/delete` + In order to delete a cache key, send a request to `/cache/delete` with the `keys` you want to delete -Example +Example + ```shell curl -X POST "http://0.0.0.0:4000/cache/delete" \ -H "Authorization: Bearer sk-1234" \ @@ -854,7 +946,10 @@ curl -X POST "http://0.0.0.0:4000/cache/delete" \ ``` #### Viewing Cache Keys from responses -You can view the cache_key in the response headers, on cache hits the cache key is sent as the `x-litellm-cache-key` response headers + +You can view the cache_key in the response headers, on cache hits the cache key is sent as the +`x-litellm-cache-key` response headers + ```shell curl -i --location 'http://0.0.0.0:4000/chat/completions' \ --header 'Authorization: Bearer sk-1234' \ @@ -871,7 +966,8 @@ curl -i --location 'http://0.0.0.0:4000/chat/completions' \ }' ``` -Response from litellm proxy +Response from litellm proxy + ```json date: Thu, 04 Apr 2024 17:37:21 GMT content-type: application/json @@ -891,7 +987,7 @@ x-litellm-cache-key: 586bf3f3c1bf5aecb55bd9996494d3bbc69eb58397163add6d49537762a ], "created": 1712252235, } - + ``` ### **Set Caching Default Off - Opt in only ** @@ -916,7 +1012,6 @@ litellm_settings: 2. 
**Opting in to cache when cache is default off** - @@ -939,6 +1034,7 @@ chat_completion = client.chat.completions.create( } ) ``` + @@ -977,45 +1073,49 @@ litellm_settings: ```yaml cache_params: - # ttl + # ttl ttl: Optional[float] default_in_memory_ttl: Optional[float] default_in_redis_ttl: Optional[float] max_connections: Optional[Int] - # Type of cache (options: "local", "redis", "s3") + # Type of cache (options: "local", "redis", "s3", "gcs") type: s3 # List of litellm call types to cache for # Options: "completion", "acompletion", "embedding", "aembedding" - supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] - # /chat/completions, /completions, /embeddings, /audio/transcriptions + supported_call_types: + ["acompletion", "atext_completion", "aembedding", "atranscription"] + # /chat/completions, /completions, /embeddings, /audio/transcriptions # Redis cache parameters - host: localhost # Redis server hostname or IP address - port: "6379" # Redis server port (as a string) - password: secret_password # Redis server password + host: localhost # Redis server hostname or IP address + port: "6379" # Redis server port (as a string) + password: secret_password # Redis server password namespace: Optional[str] = None, - + # GCP IAM Authentication for Redis - gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" # GCP service account for IAM authentication - gcp_ssl_ca_certs: "./server-ca.pem" # Path to SSL CA certificate file for GCP Memorystore Redis - ssl: true # Enable SSL for secure connections - ssl_cert_reqs: null # Set to null for self-signed certificates - ssl_check_hostname: false # Set to false for self-signed certificates - + gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" # GCP service account for IAM authentication + gcp_ssl_ca_certs: "./server-ca.pem" # Path to SSL CA certificate file for GCP Memorystore Redis + ssl: true # Enable SSL for secure connections + ssl_cert_reqs: null # Set to null for self-signed certificates + ssl_check_hostname: false # Set to false for self-signed certificates # S3 cache parameters - s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket - s3_region_name: us-west-2 # AWS region of the S3 bucket - s3_api_version: 2006-03-01 # AWS S3 API version - s3_use_ssl: true # Use SSL for S3 connections (options: true, false) - s3_verify: true # SSL certificate verification for S3 connections (options: true, false) - s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL - s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3 - s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3 - s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials - + s3_bucket_name: your_s3_bucket_name # Name of the S3 bucket + s3_region_name: us-west-2 # AWS region of the S3 bucket + s3_api_version: 2006-03-01 # AWS S3 API version + s3_use_ssl: true # Use SSL for S3 connections (options: true, false) + s3_verify: true # SSL certificate verification for S3 connections (options: true, false) + s3_endpoint_url: https://s3.amazonaws.com # S3 endpoint URL + s3_aws_access_key_id: your_access_key # AWS Access Key ID for S3 + s3_aws_secret_access_key: your_secret_key # AWS Secret Access Key for S3 + s3_aws_session_token: your_session_token # AWS Session Token for temporary credentials + + # GCS cache parameters + gcs_bucket_name: your_gcs_bucket_name # Name of the GCS bucket + gcs_path_service_account: 
/path/to/service-account.json # Path to GCS service account JSON file + gcs_path: cache/ # [OPTIONAL] GCS path prefix for cache objects ``` ## Provider-Specific Optional Parameters Caching diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md index aef33f8c7083..17354725fd5e 100644 --- a/docs/my-website/docs/proxy/call_hooks.md +++ b/docs/my-website/docs/proxy/call_hooks.md @@ -10,6 +10,17 @@ import Image from '@theme/IdealImage'; **Understanding Callback Hooks?** Check out our [Callback Management Guide](../observability/callback_management.md) to understand the differences between proxy-specific hooks like `async_pre_call_hook` and general logging hooks like `async_log_success_event`. ::: +## Which Hook Should I Use? + +| Hook | Use Case | When It Runs | +|------|----------|--------------| +| `async_pre_call_hook` | Modify incoming request before it's sent to model | Before the LLM API call is made | +| `async_moderation_hook` | Run checks on input in parallel to LLM API call | In parallel with the LLM API call | +| `async_post_call_success_hook` | Modify outgoing response (non-streaming) | After successful LLM API call, for non-streaming responses | +| `async_post_call_failure_hook` | Transform error responses sent to clients | After failed LLM API call | +| `async_post_call_streaming_hook` | Modify outgoing response (streaming) | After successful LLM API call, for streaming responses | +| `async_post_call_response_headers_hook` | Inject custom HTTP response headers | After LLM API call (both success and failure) | + See a complete example with our [parallel request rate limiter](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/parallel_request_limiter.py) ## Quick Start @@ -51,7 +62,21 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit original_exception: Exception, user_api_key_dict: UserAPIKeyAuth, traceback_str: Optional[str] = None, - ): + ) -> Optional[HTTPException]: + """ + Transform error responses sent to clients. + + Return an HTTPException to replace the original error with a user-friendly message. + Return None to use the original exception. + + Example: + if isinstance(original_exception, litellm.ContextWindowExceededError): + return HTTPException( + status_code=400, + detail="Your prompt is too long. Please reduce the length and try again." + ) + return None # Use original exception + """ pass async def async_post_call_success_hook( @@ -91,6 +116,18 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit async for item in response: yield item + async def async_post_call_response_headers_hook( + self, + data: dict, + user_api_key_dict: UserAPIKeyAuth, + response: Any, + request_headers: Optional[Dict[str, str]] = None, + ) -> Optional[Dict[str, str]]: + """ + Inject custom headers into HTTP response (runs for both success and failure). + """ + return {"x-custom-header": "custom-value"} + proxy_handler_instance = MyCustomHandler() ``` @@ -330,3 +367,66 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ "usage": {} } ``` + +## Advanced - Transform Error Responses + +Transform technical API errors into user-friendly messages using `async_post_call_failure_hook`. Return an `HTTPException` to replace the original error, or `None` to use the original exception. 
+ +```python +from litellm.integrations.custom_logger import CustomLogger +from fastapi import HTTPException +from typing import Optional +import litellm + +class MyErrorTransformer(CustomLogger): + async def async_post_call_failure_hook( + self, + request_data: dict, + original_exception: Exception, + user_api_key_dict: UserAPIKeyAuth, + traceback_str: Optional[str] = None, + ) -> Optional[HTTPException]: + if isinstance(original_exception, litellm.ContextWindowExceededError): + return HTTPException( + status_code=400, + detail="Your prompt is too long. Please reduce the length and try again." + ) + if isinstance(original_exception, litellm.RateLimitError): + return HTTPException( + status_code=429, + detail="Rate limit exceeded. Please try again in a moment." + ) + return None # Use original exception + +proxy_handler_instance = MyErrorTransformer() +``` + +**Result:** Clients receive `"Your prompt is too long..."` instead of `"ContextWindowExceededError: Prompt exceeds context window"`. + +## Advanced - Inject Custom HTTP Response Headers + +Use `async_post_call_response_headers_hook` to inject custom HTTP headers into responses. This hook runs for **both successful and failed** LLM API calls. + +```python +from litellm.integrations.custom_logger import CustomLogger +from litellm.proxy.proxy_server import UserAPIKeyAuth +from typing import Any, Dict, Optional + +class CustomHeaderLogger(CustomLogger): + def __init__(self): + super().__init__() + + async def async_post_call_response_headers_hook( + self, + data: dict, + user_api_key_dict: UserAPIKeyAuth, + response: Any, + request_headers: Optional[Dict[str, str]] = None, + ) -> Optional[Dict[str, str]]: + """ + Inject custom headers into all responses (success and failure). + """ + return {"x-custom-header": "custom-value"} + +proxy_handler_instance = CustomHeaderLogger() +``` diff --git a/docs/my-website/docs/proxy/cli_sso.md b/docs/my-website/docs/proxy/cli_sso.md index f7669d6a25cb..ad0f033f802e 100644 --- a/docs/my-website/docs/proxy/cli_sso.md +++ b/docs/my-website/docs/proxy/cli_sso.md @@ -9,6 +9,57 @@ Use the litellm cli to authenticate to the LiteLLM Gateway. This is great if you ## Usage +### Prerequisites - Start LiteLLM Proxy with Beta Flag + +:::warning[Beta Feature - Required] + +CLI SSO Authentication is currently in beta. You must set this environment variable **when starting up your LiteLLM Proxy**: + +```bash +export EXPERIMENTAL_UI_LOGIN="True" +litellm --config config.yaml +``` + +Or add it to your proxy startup command: + +```bash +EXPERIMENTAL_UI_LOGIN="True" litellm --config config.yaml +``` + +::: + +### Configuration + +#### JWT Token Expiration + +By default, CLI authentication tokens expire after **24 hours**. 
You can customize this expiration time by setting the `LITELLM_CLI_JWT_EXPIRATION_HOURS` environment variable when starting your LiteLLM Proxy: + +```bash +# Set CLI JWT tokens to expire after 48 hours +export LITELLM_CLI_JWT_EXPIRATION_HOURS=48 +export EXPERIMENTAL_UI_LOGIN="True" +litellm --config config.yaml +``` + +Or in a single command: + +```bash +LITELLM_CLI_JWT_EXPIRATION_HOURS=48 EXPERIMENTAL_UI_LOGIN="True" litellm --config config.yaml +``` + +**Examples:** +- `LITELLM_CLI_JWT_EXPIRATION_HOURS=12` - Tokens expire after 12 hours +- `LITELLM_CLI_JWT_EXPIRATION_HOURS=168` - Tokens expire after 7 days (168 hours) +- `LITELLM_CLI_JWT_EXPIRATION_HOURS=720` - Tokens expire after 30 days (720 hours) + +:::tip +You can check your current token's age and expiration status using: +```bash +litellm-proxy whoami +``` +::: + +### Steps 1. **Install the CLI** @@ -33,6 +84,8 @@ Use the litellm cli to authenticate to the LiteLLM Gateway. This is great if you 2. **Set up environment variables** + On your local machine, set the proxy URL: + ```bash export LITELLM_PROXY_URL=http://localhost:4000 ``` diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index caa025cf1e0f..bb2c7e01c805 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -24,73 +24,81 @@ litellm_settings: turn_off_message_logging: boolean # prevent the messages and responses from being logged to on your callbacks, but request metadata will still be logged. Useful for privacy/compliance when handling sensitive data. redact_user_api_key_info: boolean # Redact information about the user api key (hashed token, user_id, team id, etc.), from logs. Currently supported for Langfuse, OpenTelemetry, Logfire, ArizeAI logging. langfuse_default_tags: ["cache_hit", "cache_key", "proxy_base_url", "user_api_key_alias", "user_api_key_user_id", "user_api_key_user_email", "user_api_key_team_alias", "semantic-similarity", "proxy_base_url"] # default tags for Langfuse Logging - # Networking settings - request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout + request_timeout: 10 # (int) llm requesttimeout in seconds. Raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout force_ipv4: boolean # If true, litellm will force ipv4 for all LLM requests. Some users have seen httpx ConnectionError when using ipv6 + Anthropic API - set_verbose: boolean # sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION + # Debugging - see debugging docs for more options + # Use `--debug` or `--detailed_debug` CLI flags, or set LITELLM_LOG env var to "INFO", "DEBUG", or "ERROR" json_logs: boolean # if true, logs will be in json format # Fallbacks, reliability default_fallbacks: ["claude-opus"] # set default_fallbacks, in case a specific model group is misconfigured / bad. 
- content_policy_fallbacks: [{"gpt-3.5-turbo-small": ["claude-opus"]}] # fallbacks for ContentPolicyErrors - context_window_fallbacks: [{"gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"]}] # fallbacks for ContextWindowExceededErrors + content_policy_fallbacks: [{ "gpt-3.5-turbo-small": ["claude-opus"] }] # fallbacks for ContentPolicyErrors + context_window_fallbacks: [{ "gpt-3.5-turbo-small": ["gpt-3.5-turbo-large", "claude-opus"] }] # fallbacks for ContextWindowExceededErrors # MCP Aliases - Map aliases to MCP server names for easier tool access - mcp_aliases: { "github": "github_mcp_server", "zapier": "zapier_mcp_server", "deepwiki": "deepwiki_mcp_server" } # Maps friendly aliases to MCP server names. Only the first alias for each server is used + mcp_aliases: { + "github": "github_mcp_server", + "zapier": "zapier_mcp_server", + "deepwiki": "deepwiki_mcp_server", + } # Maps friendly aliases to MCP server names. Only the first alias for each server is used # Caching settings - cache: true - cache_params: # set cache params for redis - type: redis # type of cache to initialize + cache: true + cache_params: # set cache params for redis + type: redis # type of cache to initialize (options: "local", "redis", "s3", "gcs") # Optional - Redis Settings - host: "localhost" # The host address for the Redis cache. Required if type is "redis". - port: 6379 # The port number for the Redis cache. Required if type is "redis". - password: "your_password" # The password for the Redis cache. Required if type is "redis". + host: "localhost" # The host address for the Redis cache. Required if type is "redis". + port: 6379 # The port number for the Redis cache. Required if type is "redis". + password: "your_password" # The password for the Redis cache. Required if type is "redis". namespace: "litellm.caching.caching" # namespace for redis cache max_connections: 100 # [OPTIONAL] Set Maximum number of Redis connections. Passed directly to redis-py. 
- # Optional - Redis Cluster Settings - redis_startup_nodes: [{"host": "127.0.0.1", "port": "7001"}] + redis_startup_nodes: [{ "host": "127.0.0.1", "port": "7001" }] # Optional - Redis Sentinel Settings service_name: "mymaster" sentinel_nodes: [["localhost", 26379]] # Optional - GCP IAM Authentication for Redis - gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" # GCP service account for IAM authentication - gcp_ssl_ca_certs: "./server-ca.pem" # Path to SSL CA certificate file for GCP Memorystore Redis - ssl: true # Enable SSL for secure connections - ssl_cert_reqs: null # Set to null for self-signed certificates - ssl_check_hostname: false # Set to false for self-signed certificates + gcp_service_account: "projects/-/serviceAccounts/your-sa@project.iam.gserviceaccount.com" # GCP service account for IAM authentication + gcp_ssl_ca_certs: "./server-ca.pem" # Path to SSL CA certificate file for GCP Memorystore Redis + ssl: true # Enable SSL for secure connections + ssl_cert_reqs: null # Set to null for self-signed certificates + ssl_check_hostname: false # Set to false for self-signed certificates # Optional - Qdrant Semantic Cache Settings qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list qdrant_collection_name: test_collection qdrant_quantization_config: binary - similarity_threshold: 0.8 # similarity threshold for semantic cache + similarity_threshold: 0.8 # similarity threshold for semantic cache # Optional - S3 Cache Settings - s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 - s3_region_name: us-west-2 # AWS Region Name for S3 - s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 - s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 - s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket + s3_bucket_name: cache-bucket-litellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 + s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 bucket + + # Optional - GCS Cache Settings + gcs_bucket_name: cache-bucket-litellm # GCS Bucket Name for caching + gcs_path_service_account: os.environ/GCS_PATH_SERVICE_ACCOUNT # Path to GCS service account JSON file + gcs_path: cache/ # [OPTIONAL] GCS path prefix for cache objects # Common Cache settings # Optional - Supported call types for caching - supported_call_types: ["acompletion", "atext_completion", "aembedding", "atranscription"] - # /chat/completions, /completions, /embeddings, /audio/transcriptions + supported_call_types: + ["acompletion", "atext_completion", "aembedding", "atranscription"] + # /chat/completions, /completions, /embeddings, /audio/transcriptions mode: default_off # if default_off, you need to opt in to caching on a per call basis ttl: 600 # ttl for caching - disable_copilot_system_to_assistant: False # If false (default), converts all 'system' role messages to 'assistant' for GitHub Copilot compatibility. Set to true to disable this behavior. 
- + disable_copilot_system_to_assistant: False # If false (default), converts all 'system' role messages to 'assistant' for GitHub Copilot compatibility. Set to true to disable this behavior. callback_settings: otel: - message_logging: boolean # OTEL logging callback specific settings + message_logging: boolean # OTEL logging callback specific settings general_settings: completion_model: string @@ -104,21 +112,23 @@ general_settings: disable_responses_id_security: boolean # turn off response ID security checks that prevent users from accessing other users' responses enable_jwt_auth: boolean # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims enforce_user_param: boolean # requires all openai endpoint requests to have a 'user' param + reject_clientside_metadata_tags: boolean # if true, rejects requests with client-side 'metadata.tags' to prevent users from influencing budgets allowed_routes: ["route1", "route2"] # list of allowed proxy API routes - a user can access. (currently JWT-Auth only) key_management_system: google_kms # either google_kms or azure_kms master_key: string maximum_spend_logs_retention_period: 30d # The maximum time to retain spend logs before deletion. maximum_spend_logs_retention_interval: 1d # interval in which the spend log cleanup task should run in. + user_mcp_management_mode: restricted # or "view_all" # Database Settings database_url: string - database_connection_pool_limit: 0 # default 100 + database_connection_pool_limit: 0 # default 10 database_connection_timeout: 0 # default 60s allow_requests_on_db_unavailable: boolean # if true, will allow requests that can not connect to the DB to verify Virtual Key to still work custom_auth: string - max_parallel_requests: 0 # the max parallel requests allowed per deployment - global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up + max_parallel_requests: 0 # the max parallel requests allowed per deployment + global_max_parallel_requests: 0 # the max parallel requests allowed on the proxy all up infer_model_from_keys: true background_health_checks: true health_check_interval: 300 @@ -136,6 +146,7 @@ router_settings: cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails disable_cooldowns: True # bool - Disable cooldowns for all models enable_tag_filtering: True # bool - Use tag based routing for requests + tag_filtering_match_any: True # bool - Tag matching behavior (only when enable_tag_filtering=true). `true`: match if deployment has ANY requested tag; `false`: match only if deployment has ALL requested tags retry_policy: { # Dict[str, int]: retry policy for different types of exceptions "AuthenticationErrorRetries": 3, "TimeoutErrorRetries": 3, @@ -167,10 +178,11 @@ router_settings: | turn_off_message_logging | boolean | If true, prevents messages and responses from being logged to callbacks, but request metadata will still be logged. Useful for privacy/compliance when handling sensitive data [Proxy Logging](logging) | | modify_params | boolean | If true, allows modifying the parameters of the request before it is sent to the LLM provider | | enable_preview_features | boolean | If true, enables preview features - e.g. 
Azure O1 Models with streaming support.| +| LITELLM_DISABLE_STOP_SEQUENCE_LIMIT | Disable validation for stop sequence limit (default: 4) | | redact_user_api_key_info | boolean | If true, redacts information about the user api key from logs [Proxy Logging](logging#redacting-userapikeyinfo) | | mcp_aliases | object | Maps friendly aliases to MCP server names for easier tool access. Only the first alias for each server is used. [MCP Aliases](../mcp#mcp-aliases) | | langfuse_default_tags | array of strings | Default tags for Langfuse Logging. Use this if you want to control which LiteLLM-specific fields are logged as tags by the LiteLLM proxy. By default LiteLLM Proxy logs no LiteLLM-specific fields as tags. [Further docs](./logging#litellm-specific-tags-on-langfuse---cache_hit-cache_key) | -| set_verbose | boolean | If true, sets litellm.set_verbose=True to view verbose debug logs. DO NOT LEAVE THIS ON IN PRODUCTION | +| set_verbose | boolean | [DEPRECATED - see debugging docs](./debugging) Use `--debug` or `--detailed_debug` CLI flags, or set `LITELLM_LOG` env var to "INFO", "DEBUG", or "ERROR" instead. | | json_logs | boolean | If true, logs will be in json format. If you need to store the logs as JSON, just set the `litellm.json_logs = True`. We currently just log the raw POST request from litellm as a JSON [Further docs](./debugging) | | default_fallbacks | array of strings | List of fallback models to use if a specific model group is misconfigured / bad. [Further docs](./reliability#default-fallbacks) | | request_timeout | integer | The timeout for requests in seconds. If not set, the default value is `6000 seconds`. [For reference OpenAI Python SDK defaults to `600 seconds`.](https://github.com/openai/openai-python/blob/main/src/openai/_constants.py) | @@ -201,6 +213,7 @@ router_settings: | disable_responses_id_security | boolean | If true, disables response ID security checks that prevent users from accessing response IDs from other users. When false (default), response IDs are encrypted with user information to ensure users can only access their own responses. Applies to /v1/responses endpoints | | enable_jwt_auth | boolean | allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims. [Doc on JWT Tokens](token_auth) | | enforce_user_param | boolean | If true, requires all OpenAI endpoint requests to have a 'user' param. [Doc on call hooks](call_hooks)| +| reject_clientside_metadata_tags | boolean | If true, rejects requests that contain client-side 'metadata.tags' to prevent users from influencing budgets by sending different tags. Tags can only be inherited from the API key metadata. | | allowed_routes | array of strings | List of allowed proxy API routes a user can access [Doc on controlling allowed routes](enterprise#control-available-public-private-routes)| | key_management_system | string | Specifies the key management system. [Doc Secret Managers](../secret) | | master_key | string | The master key for the proxy [Set up Virtual Keys](virtual_keys) | @@ -227,12 +240,13 @@ router_settings: | image_generation_model | str | The default model to use for image generation - ignores model set in request | | store_model_in_db | boolean | If true, enables storing model + credential information in the DB. | | supported_db_objects | List[str] | Fine-grained control over which object types to load from the database when `store_model_in_db` is True. 
Available types: `"models"`, `"mcp"`, `"guardrails"`, `"vector_stores"`, `"pass_through_endpoints"`, `"prompts"`, `"model_cost_map"`. If not set, all object types are loaded (default behavior). Example: `supported_db_objects: ["mcp"]` to only load MCP servers from DB. | +| user_mcp_management_mode | string | Controls what non-admins can see on the MCP dashboard. `restricted` (default) only lists MCP servers that the user’s teams are explicitly allowed to access. `view_all` lets every user see the full MCP server list. Tool list/call always respects per-key permissions, so users still cannot run MCP calls without access. | | store_prompts_in_spend_logs | boolean | If true, allows prompts and responses to be stored in the spend logs table. | | max_request_size_mb | int | The maximum size for requests in MB. Requests above this size will be rejected. | | max_response_size_mb | int | The maximum size for responses in MB. LLM Responses above this size will not be sent. | | proxy_budget_rescheduler_min_time | int | The minimum time (in seconds) to wait before checking db for budget resets. **Default is 597 seconds** | | proxy_budget_rescheduler_max_time | int | The maximum time (in seconds) to wait before checking db for budget resets. **Default is 605 seconds** | -| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 30 seconds** | +| proxy_batch_write_at | int | Time (in seconds) to wait before batch writing spend logs to the db. **Default is 10 seconds** | | proxy_batch_polling_interval | int | Time (in seconds) to wait before polling a batch, to check if it's completed. **Default is 6000 seconds (1 hour)** | | alerting_args | dict | Args for Slack Alerting [Doc on Slack Alerting](./alerting.md) | | custom_key_generate | str | Custom function for key generation [Doc on custom key generation](./virtual_keys.md#custom--key-generate) | @@ -261,13 +275,14 @@ router_settings: | forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). | | forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers and `anthropic-beta` headers) to the backend LLM call | | maximum_spend_logs_retention_period | str | Used to set the max retention time for spend logs in the db, after which they will be auto-purged | -| maximum_spend_logs_retention_interval | str | Used to set the interval in which the spend log cleanup task should run in. | +| maximum_spend_logs_retention_interval | str | Used to set the interval in which the spend log cleanup task should run in. | + ### router_settings - Reference :::info -Most values can also be set via `litellm_settings`. If you see overlapping values, settings on `router_settings` will override those on `litellm_settings`. -::: +Most values can also be set via `litellm_settings`. If you see overlapping values, settings on +`router_settings` will override those on `litellm_settings`. ::: ```yaml router_settings: @@ -275,11 +290,12 @@ router_settings: redis_host: # string redis_password: # string redis_port: # string - enable_pre_call_checks: true # bool - Before call is made check if a call is within model context window - allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. + enable_pre_call_checks: true # bool - Before call is made check if a call is within model context window + allowed_fails: 3 # cooldown model if it fails > 1 call in a minute. 
cooldown_time: 30 # (in seconds) how long to cooldown model if fails/min > allowed_fails - disable_cooldowns: True # bool - Disable cooldowns for all models + disable_cooldowns: True # bool - Disable cooldowns for all models enable_tag_filtering: True # bool - Use tag based routing for requests + tag_filtering_match_any: True # bool - Tag matching behavior (only when enable_tag_filtering=true). `true`: match if deployment has ANY requested tag; `false`: match only if deployment has ALL requested tags retry_policy: { # Dict[str, int]: retry policy for different types of exceptions "AuthenticationErrorRetries": 3, "TimeoutErrorRetries": 3, @@ -289,11 +305,11 @@ router_settings: } allowed_fails_policy: { "BadRequestErrorAllowedFails": 1000, # Allow 1000 BadRequestErrors before cooling down a deployment - "AuthenticationErrorAllowedFails": 10, # int - "TimeoutErrorAllowedFails": 12, # int - "RateLimitErrorAllowedFails": 10000, # int - "ContentPolicyViolationErrorAllowedFails": 15, # int - "InternalServerErrorAllowedFails": 20, # int + "AuthenticationErrorAllowedFails": 10, # int + "TimeoutErrorAllowedFails": 12, # int + "RateLimitErrorAllowedFails": 10000, # int + "ContentPolicyViolationErrorAllowedFails": 15, # int + "InternalServerErrorAllowedFails": 20, # int } content_policy_fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for content policy violations fallbacks=[{"claude-2": ["my-fallback-model"]}] # List[Dict[str, List[str]]]: Fallback model for all errors @@ -309,6 +325,7 @@ router_settings: | content_policy_fallbacks | array of objects | Specifies fallback models for content policy violations. [More information here](reliability) | | fallbacks | array of objects | Specifies fallback models for all types of errors. [More information here](reliability) | | enable_tag_filtering | boolean | If true, uses tag based routing for requests [Tag Based Routing](tag_routing) | +| tag_filtering_match_any | boolean | Tag matching behavior (only when enable_tag_filtering=true). `true`: match if deployment has ANY requested tag; `false`: match only if deployment has ALL requested tags | | cooldown_time | integer | The duration (in seconds) to cooldown a model if it exceeds the allowed failures. | | disable_cooldowns | boolean | If true, disables cooldowns for all models. [More information here](reliability) | | retry_policy | object | Specifies the number of retries for different types of exceptions. [More information here](reliability) | @@ -323,7 +340,7 @@ router_settings: | stream_timeout | Optional[float] | The default timeout for a streaming request. If not set, the 'timeout' value is used. | | debug_level | Literal["DEBUG", "INFO"] | The debug level for the logging library in the router. Defaults to "INFO". | | client_ttl | int | Time-to-live for cached clients in seconds. Defaults to 3600. | -| cache_kwargs | dict | Additional keyword arguments for the cache initialization. | +| cache_kwargs | dict | Additional keyword arguments for the cache initialization. Use this for non-string Redis parameters that may fail when set via `REDIS_*` environment variables. | | routing_strategy_args | dict | Additional keyword arguments for the routing strategy - e.g. lowest latency routing default ttl | | model_group_alias | dict | Model group alias mapping. E.g. `{"claude-3-haiku": "claude-3-haiku-20240229"}` | | num_retries | int | Number of retries for a request. Defaults to 3. 
| @@ -331,7 +348,7 @@ router_settings: | caching_groups | Optional[List[tuple]] | List of model groups for caching across model groups. Defaults to None. - e.g. caching_groups=[("openai-gpt-3.5-turbo", "azure-gpt-3.5-turbo")]| | alerting_config | AlertingConfig | [SDK-only arg] Slack alerting configuration. Defaults to None. [Further Docs](../routing.md#alerting-) | | assistants_config | AssistantsConfig | Set on proxy via `assistant_settings`. [Further docs](../assistants.md) | -| set_verbose | boolean | [DEPRECATED PARAM - see debug docs](./debugging.md) If true, sets the logging level to verbose. | +| set_verbose | boolean | [DEPRECATED PARAM - see debug docs](./debugging) If true, sets the logging level to verbose. | | retry_after | int | Time to wait before retrying a request in seconds. Defaults to 0. If `x-retry-after` is received from LLM API, this value is overridden. | | provider_budget_config | ProviderBudgetConfig | Provider budget configuration. Use this to set llm_provider budget limits. example $100/day to OpenAI, $100/day to Azure, etc. Defaults to None. [Further Docs](./provider_budget_routing.md) | | enable_pre_call_checks | boolean | If true, checks if a call is within the model's context window before making the call. [More information here](reliability) | @@ -343,6 +360,7 @@ router_settings: | optional_pre_call_checks | List[str] | List of pre-call checks to add to the router. Currently supported: 'router_budget_limiting', 'prompt_caching' | | ignore_invalid_deployments | boolean | If true, ignores invalid deployments. Default for proxy is True - to prevent invalid models from blocking other models from being loaded. | | search_tools | List[SearchToolTypedDict] | List of search tool configurations for Search API integration. Each tool specifies a search_tool_name and litellm_params with search_provider, api_key, api_base, etc. [Further Docs](../search.md) | +| guardrail_list | List[GuardrailTypedDict] | List of guardrail configurations for guardrail load balancing. Enables load balancing across multiple guardrail deployments with the same guardrail_name. [Further Docs](./guardrails/guardrail_load_balancing.md) | ### environment variables - Reference @@ -357,10 +375,15 @@ router_settings: | AISPEND_ACCOUNT_ID | Account ID for AI Spend | AISPEND_API_KEY | API Key for AI Spend | AIOHTTP_CONNECTOR_LIMIT | Connection limit for aiohttp connector. When set to 0, no limit is applied. **Default is 0** +| AIOHTTP_CONNECTOR_LIMIT_PER_HOST | Connection limit per host for aiohttp connector. When set to 0, no limit is applied. **Default is 0** | AIOHTTP_KEEPALIVE_TIMEOUT | Keep-alive timeout for aiohttp connections in seconds. **Default is 120** | AIOHTTP_TRUST_ENV | Flag to enable aiohttp trust environment. When this is set to True, aiohttp will respect HTTP(S)_PROXY env vars. **Default is False** | AIOHTTP_TTL_DNS_CACHE | DNS cache time-to-live for aiohttp in seconds. **Default is 300** | ALLOWED_EMAIL_DOMAINS | List of email domains allowed for access +| APSCHEDULER_COALESCE | Whether to combine multiple pending executions of a job into one. **Default is False** +| APSCHEDULER_MAX_INSTANCES | Maximum number of concurrent instances of each job. **Default is 1** +| APSCHEDULER_MISFIRE_GRACE_TIME | Grace time in seconds for misfired jobs. **Default is 1** +| APSCHEDULER_REPLACE_EXISTING | Whether to replace existing jobs with the same ID. 
**Default is False** | ARIZE_API_KEY | API key for Arize platform integration | ARIZE_SPACE_KEY | Space key for Arize platform | ARGILLA_BATCH_SIZE | Batch size for Argilla logging @@ -371,8 +394,11 @@ router_settings: | ATHINA_API_KEY | API key for Athina service | ATHINA_BASE_URL | Base URL for Athina service (defaults to `https://log.athina.ai`) | AUTH_STRATEGY | Strategy used for authentication (e.g., OAuth, API key) +| AUTO_REDIRECT_UI_LOGIN_TO_SSO | Flag to enable automatic redirect of UI login page to SSO when SSO is configured. Default is **true** +| AUDIO_SPEECH_CHUNK_SIZE | Chunk size for audio speech processing. Default is 1024 | ANTHROPIC_API_KEY | API key for Anthropic service | ANTHROPIC_API_BASE | Base URL for Anthropic API. Default is https://api.anthropic.com +| ANTHROPIC_TOKEN_COUNTING_BETA_VERSION | Beta version header for Anthropic token counting API. Default is `token-counting-2024-11-01` | AWS_ACCESS_KEY_ID | Access Key ID for AWS services | AWS_BATCH_ROLE_ARN | ARN of the AWS IAM role for batch operations | AWS_DEFAULT_REGION | Default AWS region for service interactions when AWS_REGION is not set @@ -388,20 +414,29 @@ router_settings: | AWS_WEB_IDENTITY_TOKEN | Web identity token for AWS | AWS_WEB_IDENTITY_TOKEN_FILE | Path to file containing web identity token for AWS | AZURE_API_VERSION | Version of the Azure API being used +| AZURE_AI_API_BASE | Base URL for Azure AI services (e.g., Azure AI Anthropic) +| AZURE_AI_API_KEY | API key for Azure AI services (e.g., Azure AI Anthropic) | AZURE_AUTHORITY_HOST | Azure authority host URL | AZURE_CERTIFICATE_PASSWORD | Password for Azure OpenAI certificate | AZURE_CLIENT_ID | Client ID for Azure services | AZURE_CLIENT_SECRET | Client secret for Azure services -| AZURE_CODE_INTERPRETER_COST_PER_SESSION | Cost per session for Azure Code Interpreter service | AZURE_COMPUTER_USE_INPUT_COST_PER_1K_TOKENS | Input cost per 1K tokens for Azure Computer Use service | AZURE_COMPUTER_USE_OUTPUT_COST_PER_1K_TOKENS | Output cost per 1K tokens for Azure Computer Use service | AZURE_DEFAULT_RESPONSES_API_VERSION | Version of the Azure Default Responses API being used. 
Default is "preview" +| AZURE_DOCUMENT_INTELLIGENCE_API_VERSION | API version for Azure Document Intelligence service +| AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI | Default DPI (dots per inch) setting for Azure Document Intelligence service | AZURE_TENANT_ID | Tenant ID for Azure Active Directory | AZURE_USERNAME | Username for Azure services, use in conjunction with AZURE_PASSWORD for azure ad token with basic username/password workflow | AZURE_PASSWORD | Password for Azure services, use in conjunction with AZURE_USERNAME for azure ad token with basic username/password workflow | AZURE_FEDERATED_TOKEN_FILE | File path to Azure federated token | AZURE_FILE_SEARCH_COST_PER_GB_PER_DAY | Cost per GB per day for Azure File Search service | AZURE_SCOPE | For EntraID Auth, Scope for Azure services, defaults to "https://cognitiveservices.azure.com/.default" +| AZURE_SENTINEL_DCR_IMMUTABLE_ID | Immutable ID of the Data Collection Rule for Azure Sentinel logging +| AZURE_SENTINEL_STREAM_NAME | Stream name for Azure Sentinel logging +| AZURE_SENTINEL_CLIENT_SECRET | Client secret for Azure Sentinel authentication +| AZURE_SENTINEL_ENDPOINT | Endpoint for Azure Sentinel logging +| AZURE_SENTINEL_TENANT_ID | Tenant ID for Azure Sentinel authentication +| AZURE_SENTINEL_CLIENT_ID | Client ID for Azure Sentinel authentication | AZURE_KEY_VAULT_URI | URI for Azure Key Vault | AZURE_OPERATION_POLLING_TIMEOUT | Timeout in seconds for Azure operation polling | AZURE_STORAGE_ACCOUNT_KEY | The Azure Storage Account Key to use for Authentication to Azure Blob Storage logging @@ -417,15 +452,32 @@ router_settings: | BERRISPEND_ACCOUNT_ID | Account ID for BerriSpend service | BRAINTRUST_API_KEY | API key for Braintrust integration | BRAINTRUST_API_BASE | Base URL for Braintrust API. Default is https://api.braintrustdata.com/v1 +| BRAINTRUST_MOCK | Enable mock mode for Braintrust integration testing. When set to true, intercepts Braintrust API calls and returns mock responses without making actual network calls. Default is false +| BRAINTRUST_MOCK_LATENCY_MS | Mock latency in milliseconds for Braintrust API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | CACHED_STREAMING_CHUNK_DELAY | Delay in seconds for cached streaming chunks. Default is 0.02 +| CHATGPT_API_BASE | Base URL for ChatGPT API. Default is https://chatgpt.com/backend-api/codex +| CHATGPT_AUTH_FILE | Filename for ChatGPT authentication data. Default is "auth.json" +| CHATGPT_DEFAULT_INSTRUCTIONS | Default system instructions for ChatGPT provider +| CHATGPT_ORIGINATOR | Originator identifier for ChatGPT API requests. Default is "codex_cli_rs" +| CHATGPT_TOKEN_DIR | Directory to store ChatGPT authentication tokens. Default is "~/.config/litellm/chatgpt" +| CHATGPT_USER_AGENT | Custom user agent string for ChatGPT API requests +| CHATGPT_USER_AGENT_SUFFIX | Suffix to append to the ChatGPT user agent string | CIRCLE_OIDC_TOKEN | OpenID Connect token for CircleCI | CIRCLE_OIDC_TOKEN_V2 | Version 2 of the OpenID Connect token for CircleCI +| CLI_JWT_EXPIRATION_HOURS | Expiration time in hours for CLI-generated JWT tokens. Default is 24 hours. 
Can also be set via LITELLM_CLI_JWT_EXPIRATION_HOURS | CLOUDZERO_API_KEY | CloudZero API key for authentication | CLOUDZERO_CONNECTION_ID | CloudZero connection ID for data submission | CLOUDZERO_EXPORT_INTERVAL_MINUTES | Interval in minutes for CloudZero data export operations | CLOUDZERO_MAX_FETCHED_DATA_RECORDS | Maximum number of data records to fetch from CloudZero | CLOUDZERO_TIMEZONE | Timezone for date handling (default: UTC) | CONFIG_FILE_PATH | File path for configuration file +| CYBERARK_ACCOUNT | CyberArk account name for secret management +| CYBERARK_API_BASE | Base URL for CyberArk API +| CYBERARK_API_KEY | API key for CyberArk secret management service +| CYBERARK_CLIENT_CERT | Path to client certificate for CyberArk authentication +| CYBERARK_CLIENT_KEY | Path to client key for CyberArk authentication +| CYBERARK_USERNAME | Username for CyberArk authentication +| CYBERARK_SSL_VERIFY | Flag to enable or disable SSL certificate verification for CyberArk. Default is True | CONFIDENT_API_KEY | API key for DeepEval integration | CUSTOM_TIKTOKEN_CACHE_DIR | Custom directory for Tiktoken cache | CONFIDENT_API_KEY | API key for Confident AI (Deepeval) Logging service @@ -439,6 +491,9 @@ router_settings: | DATABASE_USER | Username for database connection | DATABASE_USERNAME | Alias for database user | DATABRICKS_API_BASE | Base URL for Databricks API +| DATABRICKS_CLIENT_ID | Client ID for Databricks OAuth M2M authentication (Service Principal application ID) +| DATABRICKS_CLIENT_SECRET | Client secret for Databricks OAuth M2M authentication +| DATABRICKS_USER_AGENT | Custom user agent string for Databricks API requests. Used for partner telemetry attribution | DAYS_IN_A_MONTH | Days in a month for calculation purposes. Default is 28 | DAYS_IN_A_WEEK | Days in a week for calculation purposes. Default is 7 | DAYS_IN_A_YEAR | Days in a year for calculation purposes. Default is 365 @@ -449,24 +504,34 @@ router_settings: | DD_BASE_URL | Base URL for Datadog integration | DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration | _DATADOG_BASE_URL | (Alternative to DD_BASE_URL) Base URL for Datadog integration +| DD_AGENT_HOST | Hostname or IP of DataDog agent (e.g., "localhost"). When set, logs are sent to agent instead of direct API +| DD_AGENT_PORT | Port of DataDog agent for log intake. Default is 10518 | DD_API_KEY | API key for Datadog integration +| DD_APP_KEY | Application key for Datadog Cost Management integration. Required along with DD_API_KEY for cost metrics | DD_SITE | Site URL for Datadog (e.g., datadoghq.com) | DD_SOURCE | Source identifier for Datadog logs | DD_TRACER_STREAMING_CHUNK_YIELD_RESOURCE | Resource name for Datadog tracing of streaming chunk yields. Default is "streaming.chunk.yield" | DD_ENV | Environment identifier for Datadog logs. Only supported for `datadog_llm_observability` callback | DD_SERVICE | Service identifier for Datadog logs. Defaults to "litellm-server" | DD_VERSION | Version identifier for Datadog logs. Defaults to "unknown" +| DATADOG_MOCK | Enable mock mode for Datadog integration testing. When set to true, intercepts Datadog API calls and returns mock responses without making actual network calls. Default is false +| DATADOG_MOCK_LATENCY_MS | Mock latency in milliseconds for Datadog API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | DEBUG_OTEL | Enable debug mode for OpenTelemetry | DEFAULT_ALLOWED_FAILS | Maximum failures allowed before cooling down a model. 
Default is 3 +| DEFAULT_A2A_AGENT_TIMEOUT | Default timeout in seconds for A2A (Agent-to-Agent) protocol requests. Default is 6000 | DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS | Default maximum tokens for Anthropic chat completions. Default is 4096 | DEFAULT_BATCH_SIZE | Default batch size for operations. Default is 512 +| DEFAULT_CHUNK_OVERLAP | Default chunk overlap for RAG text splitters. Default is 200 +| DEFAULT_CHUNK_SIZE | Default chunk size for RAG text splitters. Default is 1000 | DEFAULT_CLIENT_DISCONNECT_CHECK_TIMEOUT_SECONDS | Timeout in seconds for checking client disconnection. Default is 1 | DEFAULT_COOLDOWN_TIME_SECONDS | Duration in seconds to cooldown a model after failures. Default is 5 | DEFAULT_CRON_JOB_LOCK_TTL_SECONDS | Time-to-live for cron job locks in seconds. Default is 60 (1 minute) | DEFAULT_DATAFORSEO_LOCATION_CODE | Default location code for DataForSEO search API. Default is 2250 (France) | DEFAULT_FAILURE_THRESHOLD_PERCENT | Threshold percentage of failures to cool down a deployment. Default is 0.5 (50%) +| DEFAULT_FAILURE_THRESHOLD_MINIMUM_REQUESTS | Minimum number of requests before applying error rate cooldown. Prevents cooldown from triggering on first failure. Default is 5 | DEFAULT_FLUSH_INTERVAL_SECONDS | Default interval in seconds for flushing operations. Default is 5 | DEFAULT_HEALTH_CHECK_INTERVAL | Default interval in seconds for health checks. Default is 300 (5 minutes) +| DEFAULT_HEALTH_CHECK_PROMPT | Default prompt used during health checks for non-image models. Default is "test from litellm" | DEFAULT_IMAGE_HEIGHT | Default height for images. Default is 300 | DEFAULT_IMAGE_TOKEN_COUNT | Default token count for images. Default is 250 | DEFAULT_IMAGE_WIDTH | Default width for images. Default is 300 @@ -493,6 +558,7 @@ router_settings: | DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH | Default minimal reasoning effort thinking budget for Gemini 2.5 Flash. Default is 512 | DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_FLASH_LITE | Default minimal reasoning effort thinking budget for Gemini 2.5 Flash Lite. Default is 512 | DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET_GEMINI_2_5_PRO | Default minimal reasoning effort thinking budget for Gemini 2.5 Pro. Default is 512 +| DEFAULT_REDIS_MAJOR_VERSION | Default Redis major version to assume when version cannot be determined. Default is 7 | DEFAULT_REDIS_SYNC_INTERVAL | Default Redis synchronization interval in seconds. Default is 1 | DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND | Default price per second for Replicate GPU. Default is 0.001400 | DEFAULT_REPLICATE_POLLING_DELAY_SECONDS | Default delay in seconds for Replicate polling. Default is 1 @@ -504,6 +570,7 @@ router_settings: | DEFAULT_SLACK_ALERTING_THRESHOLD | Default threshold for Slack alerting. Default is 300 | DEFAULT_SOFT_BUDGET | Default soft budget for LiteLLM proxy keys. Default is 50.0 | DEFAULT_TRIM_RATIO | Default ratio of tokens to trim from prompt end. Default is 0.75 +| DEFAULT_GOOGLE_VIDEO_DURATION_SECONDS | Default duration for video generation in seconds in google. Default is 8 | DIRECT_URL | Direct URL for service endpoint | DISABLE_ADMIN_UI | Toggle to disable the admin UI | DISABLE_AIOHTTP_TRANSPORT | Flag to disable aiohttp transport. When this is set to True, litellm will use httpx instead of aiohttp. **Default is False** @@ -515,10 +582,14 @@ router_settings: | DOCS_TITLE | Title of the documentation pages | DOCS_URL | The path to the Swagger API documentation. 
**By default this is "/"** | EMAIL_LOGO_URL | URL for the logo used in emails +| EMAIL_BUDGET_ALERT_TTL | Time-to-live for email budget alerts in seconds +| EMAIL_BUDGET_ALERT_MAX_SPEND_ALERT_PERCENTAGE | Maximum spend percentage for triggering email budget alerts | EMAIL_SUPPORT_CONTACT | Support contact email address | EMAIL_SIGNATURE | Custom HTML footer/signature for all emails. Can include HTML tags for formatting and links. | EMAIL_SUBJECT_INVITATION | Custom subject template for invitation emails. | EMAIL_SUBJECT_KEY_CREATED | Custom subject template for key creation emails. +| EMAIL_BUDGET_ALERT_MAX_SPEND_ALERT_PERCENTAGE | Percentage of max budget that triggers alerts (as decimal: 0.8 = 80%). Default is 0.8 +| EMAIL_BUDGET_ALERT_TTL | Time-to-live for budget alert deduplication in seconds. Default is 86400 (24 hours) | ENKRYPTAI_API_BASE | Base URL for EnkryptAI Guardrails API. **Default is https://api.enkryptai.com** | ENKRYPTAI_API_KEY | API key for EnkryptAI Guardrails service | EXPERIMENTAL_MULTI_INSTANCE_RATE_LIMITING | Flag to enable new multi-instance rate limiting. **Default is False** @@ -527,6 +598,18 @@ router_settings: | FIREWORKS_AI_56_B_MOE | Size parameter for Fireworks AI 56B MOE model. Default is 56 | FIREWORKS_AI_80_B | Size parameter for Fireworks AI 80B model. Default is 80 | FIREWORKS_AI_176_B_MOE | Size parameter for Fireworks AI 176B MOE model. Default is 176 +| FOCUS_PROVIDER | Destination provider for Focus exports (e.g., `s3`). Defaults to `s3`. +| FOCUS_FORMAT | Output format for Focus exports. Defaults to `parquet`. +| FOCUS_FREQUENCY | Frequency for scheduled Focus exports (`hourly`, `daily`, or `interval`). Defaults to `hourly`. +| FOCUS_CRON_OFFSET | Minute offset used when scheduling hourly/daily Focus exports. Defaults to `5` minutes. +| FOCUS_INTERVAL_SECONDS | Interval (in seconds) for Focus exports when `frequency` is `interval`. +| FOCUS_PREFIX | Object key prefix (or folder) used when uploading Focus export files. Defaults to `focus_exports`. +| FOCUS_S3_BUCKET_NAME | S3 bucket to upload Focus export files when using the S3 destination. +| FOCUS_S3_REGION_NAME | AWS region for the Focus export S3 bucket. +| FOCUS_S3_ENDPOINT_URL | Custom endpoint for the Focus export S3 client (optional; useful for S3-compatible storage). +| FOCUS_S3_ACCESS_KEY | AWS access key ID used by the Focus export S3 client. +| FOCUS_S3_SECRET_KEY | AWS secret access key used by the Focus export S3 client. +| FOCUS_S3_SESSION_TOKEN | AWS session token used by the Focus export S3 client (optional). | FUNCTION_DEFINITION_TOKEN_COUNT | Token count for function definitions. Default is 9 | GALILEO_BASE_URL | Base URL for Galileo platform | GALILEO_PASSWORD | Password for Galileo authentication @@ -534,9 +617,12 @@ router_settings: | GALILEO_USERNAME | Username for Galileo authentication | GOOGLE_SECRET_MANAGER_PROJECT_ID | Project ID for Google Secret Manager | GCS_BUCKET_NAME | Name of the Google Cloud Storage bucket +| GCS_MOCK | Enable mock mode for GCS integration testing. When set to true, intercepts GCS API calls and returns mock responses without making actual network calls. Default is false +| GCS_MOCK_LATENCY_MS | Mock latency in milliseconds for GCS API calls when mock mode is enabled. Simulates network round-trip time. Default is 150ms | GCS_PATH_SERVICE_ACCOUNT | Path to the Google Cloud service account JSON file | GCS_FLUSH_INTERVAL | Flush interval for GCS logging (in seconds). Specify how often you want a log to be sent to GCS. 
**Default is 20 seconds** | GCS_BATCH_SIZE | Batch size for GCS logging. Specify after how many logs you want to flush to GCS. If `BATCH_SIZE` is set to 10, logs are flushed every 10 logs. **Default is 2048** +| GCS_USE_BATCHED_LOGGING | Enable batched logging for GCS. When enabled (default), multiple log payloads are combined into single GCS object uploads (NDJSON format), dramatically reducing API calls. When disabled, sends each log individually as separate GCS objects (legacy behavior). **Default is true** | GCS_PUBSUB_TOPIC_ID | PubSub Topic ID to send LiteLLM SpendLogs to. | GCS_PUBSUB_PROJECT_ID | PubSub Project ID to send LiteLLM SpendLogs to. | GENERIC_AUTHORIZATION_ENDPOINT | Authorization endpoint for generic OAuth providers @@ -556,6 +642,12 @@ router_settings: | GENERIC_USER_PROVIDER_ATTRIBUTE | Attribute specifying the user's provider | GENERIC_USER_ROLE_ATTRIBUTE | Attribute specifying the user's role | GENERIC_USERINFO_ENDPOINT | Endpoint to fetch user information in generic OAuth +| GENERIC_LOGGER_ENDPOINT | Endpoint URL for the Generic Logger callback to send logs to +| GENERIC_LOGGER_HEADERS | JSON string of headers to include in Generic Logger callback requests +| GENERIC_ROLE_MAPPINGS_DEFAULT_ROLE | Default LiteLLM role to assign when no role mapping matches in generic SSO. Used with GENERIC_ROLE_MAPPINGS_ROLES +| GENERIC_ROLE_MAPPINGS_GROUP_CLAIM | The claim/attribute name in the SSO token that contains the user's groups. Used for role mapping +| GENERIC_ROLE_MAPPINGS_ROLES | Python dict string mapping LiteLLM roles to SSO group names. Example: `{"proxy_admin": ["admin-group"], "internal_user": ["users"]}` +| GENERIC_USER_ROLE_MAPPINGS | Alternative to GENERIC_ROLE_MAPPINGS_ROLES for configuring user role mappings from SSO | GEMINI_API_BASE | Base URL for Gemini API. Default is https://generativelanguage.googleapis.com | GALILEO_BASE_URL | Base URL for Galileo platform | GALILEO_PASSWORD | Password for Galileo authentication @@ -568,6 +660,8 @@ router_settings: | GREENSCALE_ENDPOINT | Endpoint URL for Greenscale service | GRAYSWAN_API_BASE | Base URL for GraySwan API. Default is https://api.grayswan.ai | GRAYSWAN_API_KEY | API key for GraySwan Cygnal service +| GRAYSWAN_REASONING_MODE | Reasoning mode for GraySwan guardrail +| GRAYSWAN_VIOLATION_THRESHOLD | Violation threshold for GraySwan guardrail | GOOGLE_APPLICATION_CREDENTIALS | Path to Google Cloud credentials JSON file | GOOGLE_CLIENT_ID | Client ID for Google OAuth | GOOGLE_CLIENT_SECRET | Client secret for Google OAuth @@ -578,15 +672,26 @@ router_settings: | HEROKU_API_KEY | API key for Heroku services | HF_API_BASE | Base URL for Hugging Face API | HCP_VAULT_ADDR | Address for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) +| HCP_VAULT_APPROLE_MOUNT_PATH | Mount path for AppRole authentication in [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault). 
Default is "approle" +| HCP_VAULT_APPROLE_ROLE_ID | Role ID for AppRole authentication in [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) +| HCP_VAULT_APPROLE_SECRET_ID | Secret ID for AppRole authentication in [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) | HCP_VAULT_CLIENT_CERT | Path to client certificate for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) | HCP_VAULT_CLIENT_KEY | Path to client key for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) +| HCP_VAULT_MOUNT_NAME | Mount name for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) | HCP_VAULT_NAMESPACE | Namespace for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) +| HCP_VAULT_PATH_PREFIX | Path prefix for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) | HCP_VAULT_TOKEN | Token for [Hashicorp Vault Secret Manager](../secret.md#hashicorp-vault) | HCP_VAULT_CERT_ROLE | Role for [Hashicorp Vault Secret Manager Auth](../secret.md#hashicorp-vault) | HELICONE_API_KEY | API key for Helicone service | HELICONE_API_BASE | Base URL for Helicone service, defaults to `https://api.helicone.ai` +| HELICONE_MOCK | Enable mock mode for Helicone integration testing. When set to true, intercepts Helicone API calls and returns mock responses without making actual network calls. Default is false +| HELICONE_MOCK_LATENCY_MS | Mock latency in milliseconds for Helicone API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | HOSTNAME | Hostname for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) | HOURS_IN_A_DAY | Hours in a day for calculation purposes. Default is 24 +| HIDDENLAYER_API_BASE | Base URL for HiddenLayer API. Defaults to `https://api.hiddenlayer.ai` +| HIDDENLAYER_AUTH_URL | Authentication URL for HiddenLayer. Defaults to `https://auth.hiddenlayer.ai` +| HIDDENLAYER_CLIENT_ID | Client ID for HiddenLayer SaaS authentication +| HIDDENLAYER_CLIENT_SECRET | Client secret for HiddenLayer SaaS authentication | HUGGINGFACE_API_BASE | Base URL for Hugging Face API | HUGGINGFACE_API_KEY | API key for Hugging Face API | HUMANLOOP_PROMPT_CACHE_TTL_SECONDS | Time-to-live in seconds for cached prompts in Humanloop. Default is 60 @@ -606,15 +711,21 @@ router_settings: | LANGFUSE_FLUSH_INTERVAL | Interval for flushing Langfuse logs | LANGFUSE_TRACING_ENVIRONMENT | Environment for Langfuse tracing | LANGFUSE_HOST | Host URL for Langfuse service +| LANGFUSE_MOCK | Enable mock mode for Langfuse integration testing. When set to true, intercepts Langfuse API calls and returns mock responses without making actual network calls. Default is false +| LANGFUSE_MOCK_LATENCY_MS | Mock latency in milliseconds for Langfuse API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | LANGFUSE_PUBLIC_KEY | Public key for Langfuse authentication | LANGFUSE_RELEASE | Release version of Langfuse integration | LANGFUSE_SECRET_KEY | Secret key for Langfuse authentication +| LANGFUSE_PROPAGATE_TRACE_ID | Flag to enable propagating trace ID to Langfuse. 
Default is False | LANGSMITH_API_KEY | API key for Langsmith platform | LANGSMITH_BASE_URL | Base URL for Langsmith service | LANGSMITH_BATCH_SIZE | Batch size for operations in Langsmith | LANGSMITH_DEFAULT_RUN_NAME | Default name for Langsmith run | LANGSMITH_PROJECT | Project name for Langsmith integration | LANGSMITH_SAMPLING_RATE | Sampling rate for Langsmith logging +| LANGSMITH_TENANT_ID | Tenant ID for Langsmith multi-tenant deployments +| LANGSMITH_MOCK | Enable mock mode for Langsmith integration testing. When set to true, intercepts Langsmith API calls and returns mock responses without making actual network calls. Default is false +| LANGSMITH_MOCK_LATENCY_MS | Mock latency in milliseconds for Langsmith API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | LANGTRACE_API_KEY | API key for Langtrace service | LASSO_API_BASE | Base URL for Lasso API | LASSO_API_KEY | API key for Lasso service @@ -626,20 +737,27 @@ router_settings: | LITERAL_API_URL | API URL for Literal service | LITERAL_BATCH_SIZE | Batch size for Literal operations | LITELLM_ANTHROPIC_DISABLE_URL_SUFFIX | Disable automatic URL suffix appending for Anthropic API base URLs. When set to `true`, prevents LiteLLM from automatically adding `/v1/messages` or `/v1/complete` to custom Anthropic API endpoints +| LITELLM_CLI_JWT_EXPIRATION_HOURS | Expiration time in hours for CLI-generated JWT tokens. Default is 24 hours +| LITELLM_DD_AGENT_HOST | Hostname or IP of DataDog agent for LiteLLM-specific logging. When set, logs are sent to agent instead of direct API +| LITELLM_DD_AGENT_PORT | Port of DataDog agent for LiteLLM-specific log intake. Default is 10518 +| LITELLM_DD_LLM_OBS_PORT | Port for Datadog LLM Observability agent. Default is 8126 | LITELLM_DONT_SHOW_FEEDBACK_BOX | Flag to hide feedback box in LiteLLM UI | LITELLM_DROP_PARAMS | Parameters to drop in LiteLLM requests | LITELLM_MODIFY_PARAMS | Parameters to modify in LiteLLM requests | LITELLM_EMAIL | Email associated with LiteLLM account | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES | Maximum retries for parallel requests in LiteLLM | LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRY_TIMEOUT | Timeout for retries of parallel requests in LiteLLM +| LITELLM_DISABLE_LAZY_LOADING | When set to "1", "true", "yes", or "on", disables lazy loading of attributes (currently only affects encoding/tiktoken). This ensures encoding is initialized before VCR starts recording HTTP requests, fixing VCR cassette creation issues. See [issue #18659](https://github.com/BerriAI/litellm/issues/18659) | LITELLM_MIGRATION_DIR | Custom migrations directory for prisma migrations, used for baselining db in read-only file systems. | LITELLM_HOSTED_UI | URL of the hosted UI for LiteLLM +| LITELLM_UI_API_DOC_BASE_URL | Optional override for the API Reference base URL (used in sample code/docs) when the admin UI runs on a different host than the proxy. Defaults to `PROXY_BASE_URL` when unset. | LITELM_ENVIRONMENT | Environment of LiteLLM Instance, used by logging services. Currently only used by DeepEval. | LITELLM_KEY_ROTATION_ENABLED | Enable auto-key rotation for LiteLLM (boolean). Default is false. | LITELLM_KEY_ROTATION_CHECK_INTERVAL_SECONDS | Interval in seconds for how often to run job that auto-rotates keys. Default is 86400 (24 hours). 
| LITELLM_LICENSE | License key for LiteLLM usage | LITELLM_LOCAL_MODEL_COST_MAP | Local configuration for model cost mapping in LiteLLM | LITELLM_LOG | Enable detailed logging for LiteLLM +| LITELLM_MODEL_COST_MAP_URL | URL for fetching model cost map data. Default is https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json | LITELLM_LOG_FILE | File path to write LiteLLM logs to. When set, logs will be written to both console and the specified file | LITELLM_LOGGER_NAME | Name for OTEL logger | LITELLM_METER_NAME | Name for OTEL Meter @@ -647,17 +765,29 @@ router_settings: | LITELLM_OTEL_INTEGRATION_ENABLE_METRICS | Optionally enable emantic metrics for OTEL | LITELLM_MASTER_KEY | Master key for proxy authentication | LITELLM_MODE | Operating mode for LiteLLM (e.g., production, development) +| LITELLM_NON_ROOT | Flag to run LiteLLM in non-root mode for enhanced security in Docker containers | LITELLM_RATE_LIMIT_WINDOW_SIZE | Rate limit window size for LiteLLM. Default is 60 +| LITELLM_REASONING_AUTO_SUMMARY | If set to "true", automatically enables detailed reasoning summaries for reasoning models (e.g., o1, o3-mini, deepseek-reasoner). When enabled, adds `summary: "detailed"` to reasoning effort configurations. Default is "false" | LITELLM_SALT_KEY | Salt key for encryption in LiteLLM | LITELLM_SSL_CIPHERS | SSL/TLS cipher configuration for faster handshakes. Controls cipher suite preferences for OpenSSL connections. | LITELLM_SECRET_AWS_KMS_LITELLM_LICENSE | AWS KMS encrypted license for LiteLLM | LITELLM_TOKEN | Access token for LiteLLM integration +| LITELLM_USER_AGENT | Custom user agent string for LiteLLM API requests. Used for partner telemetry attribution | LITELLM_PRINT_STANDARD_LOGGING_PAYLOAD | If true, prints the standard logging payload to the console - useful for debugging | LITELM_ENVIRONMENT | Environment for LiteLLM Instance. This is currently only logged to DeepEval to determine the environment for DeepEval integration. | LOGFIRE_TOKEN | Token for Logfire logging service +| LOGFIRE_BASE_URL | Base URL for Logfire logging service (useful for self hosted deployments) +| LOGGING_WORKER_CONCURRENCY | Maximum number of concurrent coroutine slots for the logging worker on the asyncio event loop. Default is 100. Setting too high will flood the event loop with logging tasks which will lower the overall latency of the requests. +| LOGGING_WORKER_MAX_QUEUE_SIZE | Maximum size of the logging worker queue. When the queue is full, the worker aggressively clears tasks to make room instead of dropping logs. Default is 50,000 +| LOGGING_WORKER_MAX_TIME_PER_COROUTINE | Maximum time in seconds allowed for each coroutine in the logging worker before timing out. Default is 20.0 +| LOGGING_WORKER_CLEAR_PERCENTAGE | Percentage of the queue to extract when clearing. Default is 50% | MAX_EXCEPTION_MESSAGE_LENGTH | Maximum length for exception messages. Default is 2000 +| MAX_ITERATIONS_TO_CLEAR_QUEUE | Maximum number of iterations to attempt when clearing the logging worker queue during shutdown. Default is 200 +| MAX_TIME_TO_CLEAR_QUEUE | Maximum time in seconds to spend clearing the logging worker queue during shutdown. Default is 5.0 +| LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS | Cooldown time in seconds before allowing another aggressive clear operation when the queue is full. Default is 0.5 | MAX_STRING_LENGTH_PROMPT_IN_DB | Maximum length for strings in spend logs when sanitizing request bodies. Strings longer than this will be truncated. 
Default is 1000 | MAX_IN_MEMORY_QUEUE_FLUSH_COUNT | Maximum count for in-memory queue flush operations. Default is 1000 +| MAX_IMAGE_URL_DOWNLOAD_SIZE_MB | Maximum size in MB for downloading images from URLs. Prevents memory issues from downloading very large images. Images exceeding this limit will be rejected before download. Set to 0 to completely disable image URL handling (all image_url requests will be blocked). Default is 50MB (matching [OpenAI's limit](https://platform.openai.com/docs/guides/images-vision?api-mode=chat#image-input-requirements)) | MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES | Maximum length for the long side of high-resolution images. Default is 2000 | MAX_REDIS_BUFFER_DEQUEUE_COUNT | Maximum count for Redis buffer dequeue operations. Default is 100 | MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES | Maximum length for the short side of high-resolution images. Default is 768 @@ -670,15 +800,23 @@ router_settings: | MAX_TOKEN_TRIMMING_ATTEMPTS | Maximum number of attempts to trim a token message. Default is 10 | MAXIMUM_TRACEBACK_LINES_TO_LOG | Maximum number of lines to log in traceback in LiteLLM Logs UI. Default is 100 | MAX_RETRY_DELAY | Maximum delay in seconds for retrying requests. Default is 8.0 -| MAX_LANGFUSE_INITIALIZED_CLIENTS | Maximum number of Langfuse clients to initialize on proxy. Default is 20. This is set since langfuse initializes 1 thread everytime a client is initialized. We've had an incident in the past where we reached 100% cpu utilization because Langfuse was initialized several times. +| MAX_LANGFUSE_INITIALIZED_CLIENTS | Maximum number of Langfuse clients to initialize on proxy. Default is 50. This is set since langfuse initializes 1 thread everytime a client is initialized. We've had an incident in the past where we reached 100% cpu utilization because Langfuse was initialized several times. | MIN_NON_ZERO_TEMPERATURE | Minimum non-zero temperature value. Default is 0.0001 | MINIMUM_PROMPT_CACHE_TOKEN_COUNT | Minimum token count for caching a prompt. Default is 1024 | MISTRAL_API_BASE | Base URL for Mistral API. Default is https://api.mistral.ai | MISTRAL_API_KEY | API key for Mistral API +| MICROSOFT_AUTHORIZATION_ENDPOINT | Custom authorization endpoint URL for Microsoft SSO (overrides default Microsoft OAuth authorization endpoint) | MICROSOFT_CLIENT_ID | Client ID for Microsoft services | MICROSOFT_CLIENT_SECRET | Client secret for Microsoft services -| MICROSOFT_TENANT | Tenant ID for Microsoft Azure | MICROSOFT_SERVICE_PRINCIPAL_ID | Service Principal ID for Microsoft Enterprise Application. (This is an advanced feature if you want litellm to auto-assign members to Litellm Teams based on their Microsoft Entra ID Groups) +| MICROSOFT_TENANT | Tenant ID for Microsoft Azure +| MICROSOFT_TOKEN_ENDPOINT | Custom token endpoint URL for Microsoft SSO (overrides default Microsoft OAuth token endpoint) +| MICROSOFT_USER_DISPLAY_NAME_ATTRIBUTE | Field name for user display name in Microsoft SSO response. Default is `displayName` +| MICROSOFT_USER_EMAIL_ATTRIBUTE | Field name for user email in Microsoft SSO response. Default is `userPrincipalName` +| MICROSOFT_USER_FIRST_NAME_ATTRIBUTE | Field name for user first name in Microsoft SSO response. Default is `givenName` +| MICROSOFT_USER_ID_ATTRIBUTE | Field name for user ID in Microsoft SSO response. Default is `id` +| MICROSOFT_USER_LAST_NAME_ATTRIBUTE | Field name for user last name in Microsoft SSO response. 
Default is `surname` +| MICROSOFT_USERINFO_ENDPOINT | Custom userinfo endpoint URL for Microsoft SSO (overrides default Microsoft Graph userinfo endpoint) | NO_DOCS | Flag to disable Swagger UI documentation | NO_REDOC | Flag to disable Redoc documentation | NO_PROXY | List of addresses to bypass proxy @@ -687,6 +825,7 @@ router_settings: | OPENAI_BASE_URL | Base URL for OpenAI API | OPENAI_API_BASE | Base URL for OpenAI API. Default is https://api.openai.com/ | OPENAI_API_KEY | API key for OpenAI services +| OPENAI_CHATGPT_API_BASE | Alternative to CHATGPT_API_BASE. Base URL for ChatGPT API | OPENAI_FILE_SEARCH_COST_PER_1K_CALLS | Cost per 1000 calls for OpenAI file search. Default is 0.0025 | OPENAI_ORGANIZATION | Organization identifier for OpenAI | OPENID_BASE_URL | Base URL for OpenID Connect services @@ -695,6 +834,9 @@ router_settings: | OPENMETER_API_ENDPOINT | API endpoint for OpenMeter integration | OPENMETER_API_KEY | API key for OpenMeter services | OPENMETER_EVENT_TYPE | Type of events sent to OpenMeter +| ONYX_API_BASE | Base URL for Onyx Security AI Guard service (defaults to https://ai-guard.onyx.security) +| ONYX_API_KEY | API key for Onyx Security AI Guard service +| ONYX_TIMEOUT | Timeout in seconds for Onyx Guard server requests. Default is 10 | OTEL_ENDPOINT | OpenTelemetry endpoint for traces | OTEL_EXPORTER_OTLP_ENDPOINT | OpenTelemetry endpoint for traces | OTEL_ENVIRONMENT_NAME | Environment name for OpenTelemetry @@ -705,6 +847,7 @@ router_settings: | OTEL_EXPORTER_OTLP_HEADERS | Headers for OpenTelemetry requests | OTEL_SERVICE_NAME | Service name identifier for OpenTelemetry | OTEL_TRACER_NAME | Tracer name for OpenTelemetry tracing +| OTEL_LOGS_EXPORTER | Exporter type for OpenTelemetry logs (e.g., console) | PAGERDUTY_API_KEY | API key for PagerDuty Alerting | PANW_PRISMA_AIRS_API_KEY | API key for PANW Prisma AIRS service | PANW_PRISMA_AIRS_API_BASE | Base URL for PANW Prisma AIRS service @@ -717,6 +860,8 @@ router_settings: | POD_NAME | Pod name for the server, this will be [emitted to `datadog` logs](https://docs.litellm.ai/docs/proxy/logging#datadog) as `POD_NAME` | POSTHOG_API_KEY | API key for PostHog analytics integration | POSTHOG_API_URL | Base URL for PostHog API (defaults to https://us.i.posthog.com) +| POSTHOG_MOCK | Enable mock mode for PostHog integration testing. When set to true, intercepts PostHog API calls and returns mock responses without making actual network calls. Default is false +| POSTHOG_MOCK_LATENCY_MS | Mock latency in milliseconds for PostHog API calls when mock mode is enabled. Simulates network round-trip time. Default is 100ms | PREDIBASE_API_BASE | Base URL for Predibase API | PRESIDIO_ANALYZER_API_BASE | Base URL for Presidio Analyzer service | PRESIDIO_ANONYMIZER_API_BASE | Base URL for Presidio Anonymizer service @@ -726,7 +871,7 @@ router_settings: | PROMPTLAYER_API_KEY | API key for PromptLayer integration | PROXY_ADMIN_ID | Admin identifier for proxy server | PROXY_BASE_URL | Base URL for proxy service -| PROXY_BATCH_WRITE_AT | Time in seconds to wait before batch writing spend logs to the database. Default is 30 +| PROXY_BATCH_WRITE_AT | Time in seconds to wait before batch writing spend logs to the database. Default is 10 | PROXY_BATCH_POLLING_INTERVAL | Time in seconds to wait before polling a batch, to check if it's completed. Default is 6000s (1 hour) | PROXY_BUDGET_RESCHEDULER_MAX_TIME | Maximum time in seconds to wait before checking database for budget resets. 
Default is 605 | PROXY_BUDGET_RESCHEDULER_MIN_TIME | Minimum time in seconds to wait before checking database for budget resets. Default is 597 @@ -750,12 +895,21 @@ router_settings: | REPLICATE_MODEL_NAME_WITH_ID_LENGTH | Length of Replicate model names with ID. Default is 64 | REPLICATE_POLLING_DELAY_SECONDS | Delay in seconds for Replicate polling operations. Default is 0.5 | REQUEST_TIMEOUT | Timeout in seconds for requests. Default is 6000 +| ROOT_REDIRECT_URL | URL to redirect root path (/) to when DOCS_URL is set to something other than "/" (DOCS_URL is "/" by default) | ROUTER_MAX_FALLBACKS | Maximum number of fallbacks for router. Default is 5 +| RUNWAYML_DEFAULT_API_VERSION | Default API version for RunwayML service. Default is "2024-11-06" +| RUNWAYML_POLLING_TIMEOUT | Timeout in seconds for RunwayML image generation polling. Default is 600 (10 minutes) +| S3_VECTORS_DEFAULT_DIMENSION | Default vector dimension for S3 Vectors RAG ingestion. Default is 1024 +| S3_VECTORS_DEFAULT_DISTANCE_METRIC | Default distance metric for S3 Vectors RAG ingestion. Options: "cosine", "euclidean". Default is "cosine" | SECRET_MANAGER_REFRESH_INTERVAL | Refresh interval in seconds for secret manager. Default is 86400 (24 hours) | SEPARATE_HEALTH_APP | If set to '1', runs health endpoints on a separate ASGI app and port. Default: '0'. | SEPARATE_HEALTH_PORT | Port for the separate health endpoints app. Only used if SEPARATE_HEALTH_APP=1. Default: 4001. +| SUPERVISORD_STOPWAITSECS | Upper bound timeout in seconds for graceful shutdown when SEPARATE_HEALTH_APP=1. Default: 3600 (1 hour). | SERVER_ROOT_PATH | Root path for the server application -| SET_VERBOSE | Flag to enable verbose logging +| SEND_USER_API_KEY_ALIAS | Flag to send user API key alias to Zscaler AI Guard. Default is False +| SEND_USER_API_KEY_TEAM_ID | Flag to send user API key team ID to Zscaler AI Guard. Default is False +| SEND_USER_API_KEY_USER_ID | Flag to send user API key user ID to Zscaler AI Guard. Default is False +| SET_VERBOSE | [DEPRECATED] Use `LITELLM_LOG` instead with values "INFO", "DEBUG", or "ERROR". See [debugging docs](./debugging) | SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD | Minimum number of requests to consider "reasonable traffic" for single-deployment cooldown logic. Default is 1000 | SLACK_DAILY_REPORT_FREQUENCY | Frequency of daily Slack reports (e.g., daily, weekly) | SLACK_WEBHOOK_URL | Webhook URL for Slack integration @@ -766,6 +920,9 @@ router_settings: | SMTP_SENDER_LOGO | Logo used in emails sent via SMTP | SMTP_TLS | Flag to enable or disable TLS for SMTP connections | SMTP_USERNAME | Username for SMTP authentication (do not set if SMTP does not require auth) +| SENDGRID_API_KEY | API key for SendGrid email service +| RESEND_API_KEY | API key for Resend email service +| SENDGRID_SENDER_EMAIL | Email address used as the sender in SendGrid email transactions | SPEND_LOGS_URL | URL for retrieving spend logs | SPEND_LOG_CLEANUP_BATCH_SIZE | Number of logs deleted per batch during cleanup. Default is 1000 | SSL_CERTIFICATE | Path to the SSL certificate file @@ -797,9 +954,17 @@ router_settings: | UPSTREAM_LANGFUSE_SECRET_KEY | Secret key for upstream Langfuse authentication | USE_AWS_KMS | Flag to enable AWS Key Management Service for encryption | USE_PRISMA_MIGRATE | Flag to use prisma migrate instead of prisma db push. Recommended for production environments. 
+| WANDB_API_KEY | API key for Weights & Biases (W&B) logging integration +| WANDB_HOST | Host URL for Weights & Biases (W&B) service +| WANDB_PROJECT_ID | Project ID for Weights & Biases (W&B) logging integration | WEBHOOK_URL | URL for receiving webhooks from external services | SPEND_LOG_RUN_LOOPS | Constant for setting how many runs of 1000 batch deletes should spend_log_cleanup task run | SPEND_LOG_CLEANUP_BATCH_SIZE | Number of logs deleted per batch during cleanup. Default is 1000 +| SPEND_LOG_QUEUE_POLL_INTERVAL | Polling interval in seconds for spend log queue. Default is 2.0 +| SPEND_LOG_QUEUE_SIZE_THRESHOLD | Threshold for spend log queue size before processing. Default is 100 | COROUTINE_CHECKER_MAX_SIZE_IN_MEMORY | Maximum size for CoroutineChecker in-memory cache. Default is 1000 | DEFAULT_SHARED_HEALTH_CHECK_TTL | Time-to-live in seconds for cached health check results in shared health check mode. Default is 300 (5 minutes) -| DEFAULT_SHARED_HEALTH_CHECK_LOCK_TTL | Time-to-live in seconds for health check lock in shared health check mode. Default is 60 (1 minute) \ No newline at end of file +| DEFAULT_SHARED_HEALTH_CHECK_LOCK_TTL | Time-to-live in seconds for health check lock in shared health check mode. Default is 60 (1 minute) +| ZSCALER_AI_GUARD_API_KEY | API key for Zscaler AI Guard service +| ZSCALER_AI_GUARD_POLICY_ID | Policy ID for Zscaler AI Guard guardrails +| ZSCALER_AI_GUARD_URL | Base URL for Zscaler AI Guard API. Default is https://api.us1.zseclipse.net/v1/detection/execute-policy diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md index 18177b7c4d20..a5674bf2bc5e 100644 --- a/docs/my-website/docs/proxy/configs.md +++ b/docs/my-website/docs/proxy/configs.md @@ -116,7 +116,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \ "role": "user", "content": "what llm are you" } - ], + ] } ' ``` @@ -576,10 +576,31 @@ custom_tokenizer: ```yaml general_settings: - database_connection_pool_limit: 100 # sets connection pool for prisma client to postgres db at 100 + database_connection_pool_limit: 10 # sets connection pool per worker for prisma client to postgres db (default: 10, recommended: 10-20) database_connection_timeout: 60 # sets a 60s timeout for any connection call to the db ``` +**How to calculate the right value:** + +The connection limit is applied **per worker process**, not per instance. This means if you have multiple workers, each worker will create its own connection pool. + +**Formula:** +``` +database_connection_pool_limit = MAX_DB_CONNECTIONS ÷ (number_of_instances × number_of_workers_per_instance) +``` + +**Example:** +- Your database allows a maximum of **100 connections** +- You're running **1 instance** of LiteLLM +- Each instance has **8 workers** (set via `--num_workers 8`) + +Calculation: `100 ÷ (1 × 8) = 12.5` + +Since you shouldn't use 12.5, round down to **10** to leave a safety buffer. 
This means: +- Each of the 8 workers will have a connection pool limit of 10 +- Total maximum connections: 8 workers × 10 connections = 80 connections +- This stays safely under your database's 100 connection limit + ## Extras @@ -655,7 +676,7 @@ docker run --name litellm-proxy \ -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="> \ -e LITELLM_CONFIG_BUCKET_TYPE="gcs" \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-latest --detailed_debug + docker.litellm.ai/berriai/litellm-database:main-latest --detailed_debug ``` @@ -676,7 +697,7 @@ docker run --name litellm-proxy \ -e LITELLM_CONFIG_BUCKET_NAME= \ -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="> \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-latest + docker.litellm.ai/berriai/litellm-database:main-latest ``` diff --git a/docs/my-website/docs/proxy/control_plane_and_data_plane.md b/docs/my-website/docs/proxy/control_plane_and_data_plane.md index db0b7884c929..b0fe2b71ee2d 100644 --- a/docs/my-website/docs/proxy/control_plane_and_data_plane.md +++ b/docs/my-website/docs/proxy/control_plane_and_data_plane.md @@ -163,6 +163,10 @@ DISABLE_LLM_API_ENDPOINTS=true - `/config/*` - Configuration updates - All other administrative endpoints +### `LITELLM_UI_API_DOC_BASE_URL` + +Optional override for the API Reference base URL (used in sample code/docs) when the admin UI runs on a different host than the proxy. + ## Usage Patterns diff --git a/docs/my-website/docs/proxy/cost_tracking.md b/docs/my-website/docs/proxy/cost_tracking.md index da8b6f5c5252..26a4920c093f 100644 --- a/docs/my-website/docs/proxy/cost_tracking.md +++ b/docs/my-website/docs/proxy/cost_tracking.md @@ -9,7 +9,7 @@ Track spend for keys, users, and teams across 100+ LLMs. LiteLLM automatically tracks spend for all known models. See our [model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) :::tip Keep Pricing Data Updated -[Sync model pricing data from GitHub](../sync_models_github.md) to ensure accurate cost tracking. +[Sync model pricing data from GitHub](./sync_models_github.md) to ensure accurate cost tracking. ::: ### How to Track Spend with LiteLLM @@ -722,7 +722,7 @@ curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end ```shell [ { - "api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "api_key": "example-api-key-123", "total_cost": 0.3201286305151999, "total_input_tokens": 36.0, "total_output_tokens": 1593.0, @@ -766,7 +766,7 @@ curl -X GET 'http://localhost:4000/global/spend/report?start_date=2024-04-01&end ```shell [ { - "api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "api_key": "example-api-key-123", "total_cost": 0.00013132, "total_input_tokens": 105.0, "total_output_tokens": 872.0, @@ -1151,7 +1151,7 @@ curl -X GET "http://0.0.0.0:4000/spend/logs?request_id= UserAPIKeyAuth: @@ -114,6 +115,29 @@ UserAPIKeyAuth( ) ``` +### Object Permission Example (MCP, agents, etc.) 
+ +```python +from litellm.proxy._experimental.mcp_server.mcp_server_manager import ( + global_mcp_server_manager, +) + +def _server_id(name: str) -> str: + server = global_mcp_server_manager.get_mcp_server_by_name(name) + if not server: + raise ValueError(f"Unknown MCP server '{name}'") + return server.server_id + +object_permission = LiteLLM_ObjectPermissionTable( + mcp_servers=[_server_id("deepwiki"), _server_id("everything")], # MCP servers this key is allowed to use + mcp_tool_permissions={"deepwiki": ["search", "read_doc"]}, # optional per-server tool allow-list +) + +UserAPIKeyAuth( + object_permission=object_permission, +) +``` + ### Advanced Configuration ```python UserAPIKeyAuth( @@ -139,6 +163,7 @@ UserAPIKeyAuth( ### Complete Example ```python +from fastapi import Request from datetime import datetime, timedelta from litellm.proxy._types import UserAPIKeyAuth, LitellmUserRoles @@ -333,4 +358,4 @@ async def user_api_key_auth( except Exception: raise Exception("Invalid API key") -``` \ No newline at end of file +``` diff --git a/docs/my-website/docs/proxy/custom_pricing.md b/docs/my-website/docs/proxy/custom_pricing.md index 4698889786b9..8f4a4c450f52 100644 --- a/docs/my-website/docs/proxy/custom_pricing.md +++ b/docs/my-website/docs/proxy/custom_pricing.md @@ -9,7 +9,8 @@ LiteLLM provides flexible cost tracking and pricing customization for all LLM pr - **Custom Pricing** - Override default model costs or set pricing for custom models - **Cost Per Token** - Track costs based on input/output tokens (most common) - **Cost Per Second** - Track costs based on runtime (e.g., Sagemaker) -- **Provider Discounts** - Apply percentage-based discounts to specific providers +- **[Provider Discounts](./provider_discounts.md)** - Apply percentage-based discounts to specific providers +- **[Provider Margins](./provider_margins.md)** - Add fees/margins to LLM costs for internal billing - **Base Model Mapping** - Ensure accurate cost tracking for Azure deployments By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async). [**Learn More**](../observability/custom_callback.md) @@ -66,58 +67,6 @@ model_list: output_cost_per_token: 0.000520 # 👈 ONLY to track cost per token ``` -## Provider-Specific Cost Discounts - -Apply percentage-based discounts to specific providers (e.g., negotiated enterprise pricing). - -#### Usage with LiteLLM Proxy Server - -**Step 1: Add discount config to config.yaml** - -```yaml -# Apply 5% discount to all Vertex AI and Gemini costs -cost_discount_config: - vertex_ai: 0.05 # 5% discount - gemini: 0.05 # 5% discount - openrouter: 0.05 # 5% discount - # openai: 0.10 # 10% discount (example) -``` - -**Step 2: Start proxy** - -```bash -litellm /path/to/config.yaml -``` - -The discount will be automatically applied to all cost calculations for the configured providers. - - -#### How Discounts Work - -- Discounts are applied **after** all other cost calculations (tokens, caching, tools, etc.) -- The discount is a percentage (0.05 = 5%, 0.10 = 10%, etc.) 
-- Discounts only apply to the configured providers -- Original cost, discount amount, and final cost are tracked in cost breakdown logs -- Discount information is returned in response headers: - - `x-litellm-response-cost` - Final cost after discount - - `x-litellm-response-cost-original` - Cost before discount - - `x-litellm-response-cost-discount-amount` - Discount amount in USD - -#### Supported Providers - -You can apply discounts to all LiteLLM supported providers. Common examples: - -- `vertex_ai` - Google Vertex AI -- `gemini` - Google Gemini -- `openai` - OpenAI -- `anthropic` - Anthropic -- `azure` - Azure OpenAI -- `bedrock` - AWS Bedrock -- `cohere` - Cohere -- `openrouter` - OpenRouter - -See the full list of providers in the [LlmProviders](https://github.com/BerriAI/litellm/blob/main/litellm/types/utils.py) enum. - ## Override Model Cost Map You can override [our model cost map](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) with your own custom pricing for a mapped model. @@ -178,6 +127,28 @@ model_list: base_model: azure/gpt-4-1106-preview ``` +### OpenAI Models with Dated Versions + +`base_model` is also useful when OpenAI returns a dated model name in the response that differs from your configured model name. + +**Example**: You configure custom pricing for `gpt-4o-mini-audio-preview`, but OpenAI returns `gpt-4o-mini-audio-preview-2024-12-17` in the response. Since LiteLLM uses the response model name for pricing lookup, your custom pricing won't be applied. + +**Solution** ✅: Set `base_model` to the key you want LiteLLM to use for pricing lookup. + +```yaml +model_list: + - model_name: my-audio-model + litellm_params: + model: openai/gpt-4o-mini-audio-preview + api_key: os.environ/OPENAI_API_KEY + model_info: + base_model: gpt-4o-mini-audio-preview # 👈 Used for pricing lookup + input_cost_per_token: 0.0000006 + output_cost_per_token: 0.0000024 + input_cost_per_audio_token: 0.00001 + output_cost_per_audio_token: 0.00002 +``` + ## Debugging diff --git a/docs/my-website/docs/proxy/custom_prompt_management.md b/docs/my-website/docs/proxy/custom_prompt_management.md index 98e5228af365..f82e7fb68cb1 100644 --- a/docs/my-website/docs/proxy/custom_prompt_management.md +++ b/docs/my-website/docs/proxy/custom_prompt_management.md @@ -173,6 +173,28 @@ curl -X POST http://0.0.0.0:4000/v1/chat/completions \ +### Using the LiteLLM SDK Directly + +If you call `litellm.completion()` from a Python script (without going through the proxy), register your custom prompt manager before making the request: + +```python + +import litellm +from custom_prompt import prompt_management + +litellm.callbacks = [prompt_management] +litellm.use_litellm_proxy = True + +response = litellm.completion( + model="gpt-4", + messages=[{"role": "user", "content": "hi"}], + prompt_id="1234", + prompt_variables={"user_message": "hi"}, +) +``` + +> **Note:** `litellm.callbacks = [prompt_management]` (or equivalently `litellm.logging_callback_manager.add_litellm_callback(prompt_management)`) is required in SDK scripts. The proxy reads `callbacks` from `config.yaml` automatically, but standalone scripts do not. 
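+
+The same registration works for async calls. A minimal sketch, assuming the same `custom_prompt` module and registered `prompt_management` callback as in the example above:
+
+```python
+import asyncio
+
+import litellm
+from custom_prompt import prompt_management
+
+# Register the custom prompt manager before making any SDK calls
+litellm.callbacks = [prompt_management]
+litellm.use_litellm_proxy = True  # route the SDK call through your LiteLLM proxy, as in the sync example above
+
+
+async def main():
+    response = await litellm.acompletion(
+        model="gpt-4",
+        messages=[{"role": "user", "content": "hi"}],
+        prompt_id="1234",
+        prompt_variables={"user_message": "hi"},
+    )
+    print(response.choices[0].message.content)
+
+
+asyncio.run(main())
+```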
+ The request will be transformed from: ```json { diff --git a/docs/my-website/docs/proxy/customer_usage.md b/docs/my-website/docs/proxy/customer_usage.md new file mode 100644 index 000000000000..5a6c06fdc811 --- /dev/null +++ b/docs/my-website/docs/proxy/customer_usage.md @@ -0,0 +1,155 @@ +import Image from '@theme/IdealImage'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Customer Usage + +Track and visualize end-user spend directly in the dashboard. Monitor customer-level usage analytics, spend logs, and activity metrics to understand how your customers are using your LLM services. + +This feature is **available in v1.80.8-stable and above**. + +## Overview + +Customer Usage enables you to track spend and usage for individual customers (end users) by passing an ID in your API requests. This allows you to: + +- Track spend per customer automatically +- View customer-level usage analytics in the Admin UI +- Filter spend logs and activity metrics by customer ID +- Set budgets and rate limits per customer +- Monitor customer usage patterns and trends + + + +## How to Track Spend + +Track customer spend by including a `user` field in your API requests or by passing a customer ID header. The customer ID will be automatically tracked and associated with all spend from that request. + + + + +### Using Request Body + +Make a `/chat/completions` call with the `user` field containing your customer ID: + +```bash showLineNumbers title="Track spend with customer ID in body" +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --data '{ + "model": "gpt-3.5-turbo", + "user": "customer-123", + "messages": [ + { + "role": "user", + "content": "What is the capital of France?" + } + ] + }' +``` + + + + +### Using Request Headers + +You can also pass the customer ID via HTTP headers. This is useful for tools that support custom headers but don't allow modifying the request body (like Claude Code with `ANTHROPIC_CUSTOM_HEADERS`). + +LiteLLM automatically recognizes these standard headers (no configuration required): +- `x-litellm-customer-id` +- `x-litellm-end-user-id` + +```bash showLineNumbers title="Track spend with customer ID in header" +curl -X POST 'http://0.0.0.0:4000/chat/completions' \ + --header 'Content-Type: application/json' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'x-litellm-customer-id: customer-123' \ + --data '{ + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "What is the capital of France?" + } + ] + }' +``` + +#### Using with Claude Code + +Claude Code supports custom headers via the `ANTHROPIC_CUSTOM_HEADERS` environment variable. Set it to pass your customer ID: + +```bash title="Configure Claude Code with customer tracking" +export ANTHROPIC_BASE_URL="http://0.0.0.0:4000/v1/messages" +export ANTHROPIC_API_KEY="sk-1234" +export ANTHROPIC_CUSTOM_HEADERS="x-litellm-customer-id: my-customer-id" +``` + +Now all requests from Claude Code will automatically track spend under `my-customer-id`. + + + + +The customer ID will be automatically upserted into the database with the new spend. If the customer ID already exists, spend will be incremented. + +### Example using OpenWebUI + +See the [Open WebUI tutorial](../tutorials/openweb_ui.md) for detailed instructions on connecting Open WebUI to LiteLLM and tracking customer usage. 
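+
+### Example using the OpenAI Python SDK
+
+If you call the proxy from Python with the OpenAI SDK, either tracking option can be set directly in code. A minimal sketch (the base URL and key below are placeholders for your own proxy):
+
+```python showLineNumbers title="Track customer spend from Python"
+from openai import OpenAI
+
+# Placeholder proxy URL and virtual key
+client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+# Option 1: customer ID in the request body via the `user` field
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    user="customer-123",
+)
+
+# Option 2: customer ID via one of the recognized headers
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "What is the capital of France?"}],
+    extra_headers={"x-litellm-customer-id": "customer-123"},
+)
+```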
+ +## How to View Spend + +### View Spend in Admin UI + +Navigate to the Customer Usage tab in the Admin UI to view customer-level spend analytics: + +#### 1. Access Customer Usage + +Go to the Usage page in the Admin UI (`PROXY_BASE_URL/ui/?login=success&page=new_usage`) and click on the **Customer Usage** tab. + + + +#### 2. View Customer Analytics + +The Customer Usage dashboard provides: + +- **Total spend per customer**: View aggregated spend across all customers +- **Daily spend trends**: See how customer spend changes over time +- **Model usage breakdown**: Understand which models each customer uses +- **Activity metrics**: Track requests, tokens, and success rates per customer + + + +#### 3. Filter by Customer + +Use the customer filter dropdown to view spend for specific customers: + +- Select one or more customer IDs from the dropdown +- View filtered analytics, spend logs, and activity metrics +- Compare spend across different customers + + + +## Use Cases + +### Customer Billing + +Track spend per customer to accurately bill your end users: + +- Monitor individual customer usage +- Generate invoices based on actual spend +- Set spending limits per customer + +### Usage Analytics + +Understand how different customers use your service: + +- Identify high-value customers +- Analyze usage patterns +- Optimize resource allocation + +--- + +## Related Features + +- [Customers / End-User Budgets](./customers.md) - Set budgets and rate limits for customers +- [Cost Tracking](./cost_tracking.md) - Comprehensive cost tracking and analytics +- [Billing](./billing.md) - Bill customers based on their usage diff --git a/docs/my-website/docs/proxy/customers.md b/docs/my-website/docs/proxy/customers.md index ac160d265428..1101884c36b6 100644 --- a/docs/my-website/docs/proxy/customers.md +++ b/docs/my-website/docs/proxy/customers.md @@ -12,7 +12,7 @@ Track spend, set budgets for your customers. Make a /chat/completions call, pass 'user' - First call Works -```bash +```bash showLineNumbers title="Make request with customer ID" curl -X POST 'http://0.0.0.0:4000/chat/completions' \ --header 'Content-Type: application/json' \ --header 'Authorization: Bearer sk-1234' \ # 👈 YOUR PROXY KEY @@ -39,14 +39,14 @@ If the customer_id already exists, spend will be incremented. Call `/customer/info` to get a customer's all up spend -```bash +```bash showLineNumbers title="Get customer spend" curl -X GET 'http://0.0.0.0:4000/customer/info?end_user_id=ishaan3' \ # 👈 CUSTOMER ID -H 'Authorization: Bearer sk-1234' \ # 👈 YOUR PROXY KEY ``` Expected Response: -``` +```json showLineNumbers title="Response" { "user_id": "ishaan3", "blocked": false, @@ -67,20 +67,20 @@ E.g. if your server is `https://webhook.site` and your listening on `6ab090e8-c5 1. Add webhook url to your proxy environment: -```bash +```bash showLineNumbers title="Set webhook URL" export WEBHOOK_URL="https://webhook.site/6ab090e8-c55f-4a23-b075-3209f5c57906" ``` 2. Add 'webhook' to config.yaml -```yaml +```yaml showLineNumbers title="config.yaml" general_settings: alerting: ["webhook"] # 👈 KEY CHANGE ``` 3. Test it! 
-```bash +```bash showLineNumbers title="Test webhook" curl -X POST 'http://localhost:4000/chat/completions' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ @@ -99,11 +99,11 @@ curl -X POST 'http://localhost:4000/chat/completions' \ Expected Response -```json +```json showLineNumbers title="Webhook event payload" { "spend": 0.0011120000000000001, # 👈 SPEND "max_budget": null, - "token": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b", + "token": "example-api-key-123", "customer_id": "krrish12", # 👈 CUSTOMER ID "user_id": null, "team_id": null, @@ -127,12 +127,51 @@ Expected Response Set customer budgets (e.g. monthly budgets, tpm/rpm limits) on LiteLLM Proxy +### Default Budget for All Customers + +Apply budget limits to all customers without explicit budgets. This is useful for rate limiting and spending controls across all end users. + +**Step 1: Create a default budget** + +```bash showLineNumbers title="Create default budget" +curl -X POST 'http://localhost:4000/budget/new' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "max_budget": 10, + "rpm_limit": 2, + "tpm_limit": 1000 +}' +``` + +**Step 2: Configure the default budget ID** + +```yaml showLineNumbers title="config.yaml" +litellm_settings: + max_end_user_budget_id: "budget_id_from_step_1" +``` + +**Step 3: Test it** + +```bash showLineNumbers title="Make request with customer ID" +curl -X POST 'http://localhost:4000/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer sk-1234' \ +-d '{ + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello"}], + "user": "my-customer-id" +}' +``` + +The customer will be subject to the default budget limits (RPM, TPM, and $ budget). Customers with explicit budgets are unaffected. + ### Quick Start Create / Update a customer with budget **Create New Customer w/ budget** -```bash +```bash showLineNumbers title="Create customer with budget" curl -X POST 'http://0.0.0.0:4000/customer/new' -H 'Authorization: Bearer sk-1234' -H 'Content-Type: application/json' @@ -144,7 +183,7 @@ curl -X POST 'http://0.0.0.0:4000/customer/new' **Test it!** -```bash +```bash showLineNumbers title="Test customer budget" curl -X POST 'http://localhost:4000/chat/completions' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ @@ -180,7 +219,7 @@ Create and assign customers to pricing tiers. Use the `/budget/new` endpoint for creating a new budget. [API Reference](https://litellm-api.up.railway.app/#/budget%20management/new_budget_budget_new_post) -```bash +```bash showLineNumbers title="Create budget via API" curl -X POST 'http://localhost:4000/budget/new' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ @@ -200,7 +239,7 @@ In your application code, assign budget when creating a new customer. Just use the `budget_id` used when creating the budget. In our example, this is `my-free-tier`. 
-```bash +```bash showLineNumbers title="Assign budget to customer" curl -X POST 'http://localhost:4000/customer/new' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ @@ -215,7 +254,7 @@ curl -X POST 'http://localhost:4000/customer/new' \ -```bash +```bash showLineNumbers title="Test with curl" curl -X POST 'http://localhost:4000/customer/new' \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer sk-1234' \ @@ -228,7 +267,7 @@ curl -X POST 'http://localhost:4000/customer/new' \ -```python +```python showLineNumbers title="Test with OpenAI SDK" from openai import OpenAI client = OpenAI( base_url="", diff --git a/docs/my-website/docs/proxy/db_deadlocks.md b/docs/my-website/docs/proxy/db_deadlocks.md index ef9d31d62324..fd02ce50e83b 100644 --- a/docs/my-website/docs/proxy/db_deadlocks.md +++ b/docs/my-website/docs/proxy/db_deadlocks.md @@ -4,6 +4,12 @@ import TabItem from '@theme/TabItem'; # High Availability Setup (Resolve DB Deadlocks) +:::tip Essential for Production + +This configuration is **required** for production deployments handling 1000+ requests per second. Without Redis configured, you may experience PostgreSQL connection exhaustion (`FATAL: sorry, too many clients already`). + +::: + Resolve any Database Deadlocks you see in high traffic by using this setup ## What causes the problem? diff --git a/docs/my-website/docs/proxy/db_info.md b/docs/my-website/docs/proxy/db_info.md index 946089bf1474..5ef9fa55043a 100644 --- a/docs/my-website/docs/proxy/db_info.md +++ b/docs/my-website/docs/proxy/db_info.md @@ -46,8 +46,8 @@ You can see the full DB Schema [here](https://github.com/BerriAI/litellm/blob/ma | Table Name | Description | Row Insert Frequency | |------------|-------------|---------------------| -| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **High - every LLM API request - Success or Failure** | -| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - when enabled** | +| LiteLLM_SpendLogs | Detailed logs of all API requests. Records token usage, spend, and timing information. Tracks which models and keys were used. | **Medium - this is a batch process that runs on an interval.** | +| LiteLLM_AuditLog | Tracks changes to system configuration. Records who made changes and what was modified. Maintains history of updates to teams, users, and models. | **Off by default**, **High - Runs on every change to an entity** | ## Disable `LiteLLM_SpendLogs` diff --git a/docs/my-website/docs/proxy/deleted_keys_teams.md b/docs/my-website/docs/proxy/deleted_keys_teams.md new file mode 100644 index 000000000000..a4736ed5ed26 --- /dev/null +++ b/docs/my-website/docs/proxy/deleted_keys_teams.md @@ -0,0 +1,106 @@ +import Image from '@theme/IdealImage'; + +# Deleted Keys & Teams Audit Logs + + + +View deleted API keys and teams along with their spend and budget information at the time of deletion for auditing and compliance purposes. + +## Overview + +The Deleted Keys & Teams feature provides a comprehensive audit trail for deleted entities in your LiteLLM proxy. This feature was implemented to easily allow audits of which key or team was deleted along with the spend/budget at the time of deletion. 
+ +When a key or team is deleted, LiteLLM automatically captures: + +- **Deletion timestamp** - When the entity was deleted +- **Deleted by** - Who performed the deletion action +- **Spend at deletion** - The total spend accumulated at the time of deletion +- **Original budget** - The budget that was set for the entity before deletion +- **Entity details** - Key or team identification information + +This information is preserved even after deletion, allowing you to maintain accurate financial records and audit trails for compliance purposes. + +## Viewing Deleted Keys + +### Step 1: Navigate to API Keys Page + +Navigate to the API Keys page in the LiteLLM UI: + +``` +http://localhost:4000/ui/?login=success&page=api-keys +``` + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/73b97ba9-0ab5-4140-aee2-05fa90463461/ascreenshot_5e6d9f05d452405c83d7a368349d087d_text_export.jpeg) + +### Step 2: Access Logs Section + +Click on the "Logs" menu item in the navigation. + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/73b97ba9-0ab5-4140-aee2-05fa90463461/ascreenshot_8ebab354b1e542e59e1082e519927edd_text_export.jpeg) + +### Step 3: View Deleted Keys + +Click on "Deleted Keys" to view the table of all deleted API keys. + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/00668558-9326-4a6f-8e87-159d54b17a72/ascreenshot_d0e50e49e9aa43d4a22ada6f12a78b12_text_export.jpeg) + +### Step 4: Review Deletion Information + +The Deleted Keys table includes comprehensive information about each deleted key: + +- **When** the key was deleted (timestamp) +- **Who** deleted the key (user/admin information) +- **Key identification** details + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/8538f7c4-634e-44c8-8d7d-fafbd6da0b02/ascreenshot_6b73f9c6a52d4e40a2368ef441cf6c8f_text_export.jpeg) + +### Step 5: View Financial Information + +The table also displays financial information captured at the time of deletion: + +- **Spend at deletion** - Total spend accumulated when the key was deleted +- **Original budget** - The budget limit that was set for the key + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/f8b03850-b17c-490c-a507-c3b0b6c050ab/ascreenshot_070b139f111844bba38fbed8835b097b_text_export.jpeg) + +## Viewing Deleted Teams + +### Step 1: Access Deleted Teams + +From the Logs section, click on "Deleted Teams" to view all deleted teams. 
+ +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/716ce26f-09af-4a6d-99c5-921d6b6a8555/ascreenshot_d36c16f1cf894340aa8bc20ada5922ac_text_export.jpeg) + +### Step 2: Review Team Deletion Information + +The Deleted Teams table provides detailed information about each deleted team: + +- **When** the team was deleted (timestamp) +- **Who** deleted the team (user/admin information) +- **Team identification** details + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/0a3f2d3f-179a-4ad7-916e-b77a13dca01d/ascreenshot_ded5970762d54528ae656421148116c4_text_export.jpeg) + +### Step 3: View Team Financial Information + +Similar to deleted keys, the Deleted Teams table shows financial information: + +- **Spend at deletion** - Total spend accumulated when the team was deleted +- **Original budget** - The budget limit that was set for the team + +![](https://colony-recorder.s3.amazonaws.com/files/2026-01-17/5b24871f-b57e-404d-8fbe-a4b27cb2a6a0/ascreenshot_3121fbafbd6b4abf90993ce6c03c608d_text_export.jpeg) + +## Use Cases + +This feature is particularly useful for: + +- **Financial Auditing** - Track spend and budgets for deleted entities +- **Compliance** - Maintain records of who deleted what and when +- **Cost Analysis** - Understand spending patterns before deletion +- **Accountability** - Identify which admin or user performed deletions +- **Historical Records** - Preserve financial data even after entity deletion + +## Related Features + +- [Audit Logs](./multiple_admins.md) - View comprehensive audit logs for all entity changes +- [UI Logs](./ui_logs.md) - View request logs and spend tracking diff --git a/docs/my-website/docs/proxy/demo.md b/docs/my-website/docs/proxy/demo.md deleted file mode 100644 index c4b8671aab90..000000000000 --- a/docs/my-website/docs/proxy/demo.md +++ /dev/null @@ -1,9 +0,0 @@ -# Demo App - -Here is a demo of the proxy. To log in pass in: - -- Username: admin -- Password: sk-1234 - - -[Demo UI](https://demo.litellm.ai/ui) diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md index 7d2389383d15..0761e0e9fa8a 100644 --- a/docs/my-website/docs/proxy/deploy.md +++ b/docs/my-website/docs/proxy/deploy.md @@ -2,16 +2,50 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import Image from '@theme/IdealImage'; -# Docker, Deployment +# Docker, Helm, Terraform + +:::info No Limits on LiteLLM OSS +There are **no limits** on the number of users, keys, or teams you can create on LiteLLM OSS. +::: You can find the Dockerfile to build litellm proxy [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile) +> Note: Production requires at least 4 CPU cores and 8 GB RAM. + ## Quick Start +:::info +Facing issues with pulling the docker image? Email us at support@berri.ai. +::: + To start using Litellm, run the following commands in a shell: + + + + +``` +docker pull docker.litellm.ai/berriai/litellm:main-latest +``` + +[**See all docker images**](https://github.com/orgs/BerriAI/packages) + + + + + +```shell +$ pip install 'litellm[proxy]' +``` + + + + + +Use this docker compose to spin up the proxy with a postgres database running locally. 
+ ```bash -# Get the code +# Get the docker compose file curl -O https://raw.githubusercontent.com/BerriAI/litellm/main/docker-compose.yml curl -O https://raw.githubusercontent.com/BerriAI/litellm/main/prometheus.yml @@ -24,12 +58,12 @@ echo 'LITELLM_MASTER_KEY="sk-1234"' > .env # password generator to get a random hash for litellm salt key echo 'LITELLM_SALT_KEY="sk-1234"' >> .env -source .env - # Start docker compose up ``` + + ### Docker Run @@ -57,7 +91,7 @@ docker run \ -e AZURE_API_KEY=d6*********** \ -e AZURE_API_BASE=https://openai-***********/ \ -p 4000:4000 \ - ghcr.io/berriai/litellm:main-stable \ + docker.litellm.ai/berriai/litellm:main-stable \ --config /app/config.yaml --detailed_debug ``` @@ -87,12 +121,12 @@ See all supported CLI args [here](https://docs.litellm.ai/docs/proxy/cli): Here's how you can run the docker image and pass your config to `litellm` ```shell -docker run ghcr.io/berriai/litellm:main-stable --config your_config.yaml +docker run docker.litellm.ai/berriai/litellm:main-stable --config your_config.yaml ``` Here's how you can run the docker image and start litellm on port 8002 with `num_workers=8` ```shell -docker run ghcr.io/berriai/litellm:main-stable --port 8002 --num_workers 8 +docker run docker.litellm.ai/berriai/litellm:main-stable --port 8002 --num_workers 8 ``` @@ -100,7 +134,7 @@ docker run ghcr.io/berriai/litellm:main-stable --port 8002 --num_workers 8 ```shell # Use the provided base image -FROM ghcr.io/berriai/litellm:main-stable +FROM docker.litellm.ai/berriai/litellm:main-stable # Set the working directory to /app WORKDIR /app @@ -166,6 +200,7 @@ Example `requirements.txt` ```shell litellm[proxy]==1.57.3 # Specify the litellm version you want to use +litellm-enterprise prometheus_client langfuse prisma @@ -195,7 +230,7 @@ docker run \ s/o [Nicholas Cecere](https://www.linkedin.com/in/nicholas-cecere-24243549/) for his LiteLLM User Management Terraform -👉 [Go here for Terraform](https://github.com/ncecere/terraform-litellm-user-mgmt) +👉 [Go here for Terraform](https://github.com/BerriAI/terraform-provider-litellm) ### Kubernetes @@ -242,7 +277,7 @@ spec: spec: containers: - name: litellm - image: ghcr.io/berriai/litellm:main-stable # it is recommended to fix a version generally + image: docker.litellm.ai/berriai/litellm:main-stable # it is recommended to fix a version generally args: - "--config" - "/app/proxy_server_config.yaml" @@ -279,9 +314,9 @@ Use this when you want to use litellm helm chart as a dependency for other chart #### Step 1. Pull the litellm helm chart ```bash -helm pull oci://ghcr.io/berriai/litellm-helm +helm pull oci://docker.litellm.ai/berriai/litellm-helm -# Pulled: ghcr.io/berriai/litellm-helm:0.1.2 +# Pulled: docker.litellm.ai/berriai/litellm-helm:0.1.2 # Digest: sha256:7d3ded1c99c1597f9ad4dc49d84327cf1db6e0faa0eeea0c614be5526ae94e2a ``` @@ -329,6 +364,26 @@ LiteLLM is compatible with several SDKs - including OpenAI SDK, Anthropic SDK, M ### Deploy with Database ##### Docker, Kubernetes, Helm Chart +:::warning High Traffic Deployments (1000+ RPS) + +If you expect high traffic (1000+ requests per second), **Redis is required** to prevent database connection exhaustion and deadlocks. + +Add this to your config: +```yaml +general_settings: + use_redis_transaction_buffer: true + +litellm_settings: + cache: true + cache_params: + type: redis + host: your-redis-host +``` + +See [Resolve DB Deadlocks](/docs/proxy/db_deadlocks) for details. + +::: + Requirements: - Need a postgres database (e.g. 
[Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc) Set `DATABASE_URL=postgresql://:@:/` in your env - Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`) @@ -340,7 +395,7 @@ Requirements: We maintain a [separate Dockerfile](https://github.com/BerriAI/litellm/pkgs/container/litellm-database) for reducing build time when running LiteLLM proxy with a connected Postgres Database ```shell -docker pull ghcr.io/berriai/litellm-database:main-stable +docker pull docker.litellm.ai/berriai/litellm-database:main-stable ``` ```shell @@ -351,7 +406,7 @@ docker run \ -e AZURE_API_KEY=d6*********** \ -e AZURE_API_BASE=https://openai-***********/ \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-stable \ + docker.litellm.ai/berriai/litellm-database:main-stable \ --config /app/config.yaml --detailed_debug ``` @@ -379,7 +434,7 @@ spec: spec: containers: - name: litellm-container - image: ghcr.io/berriai/litellm:main-stable + image: docker.litellm.ai/berriai/litellm:main-stable imagePullPolicy: Always env: - name: AZURE_API_KEY @@ -516,9 +571,9 @@ Use this when you want to use litellm helm chart as a dependency for other chart #### Step 1. Pull the litellm helm chart ```bash -helm pull oci://ghcr.io/berriai/litellm-helm +helm pull oci://docker.litellm.ai/berriai/litellm-helm -# Pulled: ghcr.io/berriai/litellm-helm:0.1.2 +# Pulled: docker.litellm.ai/berriai/litellm-helm:0.1.2 # Digest: sha256:7d3ded1c99c1597f9ad4dc49d84327cf1db6e0faa0eeea0c614be5526ae94e2a ``` @@ -575,7 +630,7 @@ router_settings: Start docker container with config ```shell -docker run ghcr.io/berriai/litellm:main-stable --config your_config.yaml +docker run docker.litellm.ai/berriai/litellm:main-stable --config your_config.yaml ``` ### Deploy with Database + Redis @@ -610,7 +665,7 @@ Start `litellm-database`docker container with config docker run --name litellm-proxy \ -e DATABASE_URL=postgresql://:@:/ \ -p 4000:4000 \ -ghcr.io/berriai/litellm-database:main-stable --config your_config.yaml +docker.litellm.ai/berriai/litellm-database:main-stable --config your_config.yaml ``` ### (Non Root) - without Internet Connection @@ -620,7 +675,7 @@ By default `prisma generate` downloads [prisma's engine binaries](https://www.pr Use this docker image to deploy litellm with pre-generated prisma binaries. ```bash -docker pull ghcr.io/berriai/litellm-non_root:main-stable +docker pull docker.litellm.ai/berriai/litellm-non_root:main-stable ``` [Published Docker Image link](https://github.com/BerriAI/litellm/pkgs/container/litellm-non_root) @@ -639,7 +694,7 @@ Use this, If you need to set ssl certificates for your on prem litellm proxy Pass `ssl_keyfile_path` (Path to the SSL keyfile) and `ssl_certfile_path` (Path to the SSL certfile) when starting litellm proxy ```shell -docker run ghcr.io/berriai/litellm:main-stable \ +docker run docker.litellm.ai/berriai/litellm:main-stable \ --ssl_keyfile_path ssl_test/keyfile.key \ --ssl_certfile_path ssl_test/certfile.crt ``` @@ -654,7 +709,7 @@ Step 1. Build your custom docker image with hypercorn ```shell # Use the provided base image -FROM ghcr.io/berriai/litellm:main-stable +FROM docker.litellm.ai/berriai/litellm:main-stable # Set the working directory to /app WORKDIR /app @@ -702,7 +757,7 @@ Usage Example: In this example, we set the keepalive timeout to 75 seconds. 
```shell showLineNumbers title="docker run" -docker run ghcr.io/berriai/litellm:main-stable \ +docker run docker.litellm.ai/berriai/litellm:main-stable \ --keepalive_timeout 75 ``` @@ -711,7 +766,7 @@ In this example, we set the keepalive timeout to 75 seconds. ```shell showLineNumbers title="Environment Variable" export KEEPALIVE_TIMEOUT=75 -docker run ghcr.io/berriai/litellm:main-stable +docker run docker.litellm.ai/berriai/litellm:main-stable ``` @@ -722,7 +777,7 @@ Use this to mitigate memory growth by recycling workers after a fixed number of Usage Examples: ```shell showLineNumbers title="docker run (CLI flag)" -docker run ghcr.io/berriai/litellm:main-stable \ +docker run docker.litellm.ai/berriai/litellm:main-stable \ --max_requests_before_restart 10000 ``` @@ -730,7 +785,7 @@ Or set via environment variable: ```shell showLineNumbers title="Environment Variable" export MAX_REQUESTS_BEFORE_RESTART=10000 -docker run ghcr.io/berriai/litellm:main-stable +docker run docker.litellm.ai/berriai/litellm:main-stable ``` @@ -759,7 +814,7 @@ docker run --name litellm-proxy \ -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="> \ -e LITELLM_CONFIG_BUCKET_TYPE="gcs" \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-stable --detailed_debug + docker.litellm.ai/berriai/litellm-database:main-stable --detailed_debug ``` @@ -780,11 +835,21 @@ docker run --name litellm-proxy \ -e LITELLM_CONFIG_BUCKET_NAME= \ -e LITELLM_CONFIG_BUCKET_OBJECT_KEY="> \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-stable + docker.litellm.ai/berriai/litellm-database:main-stable ``` +### 6. Disable pulling live model prices + +Disable pulling the model prices from LiteLLM's [hosted model prices file](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), if you're seeing long cold start times or network security issues. + +```env +export LITELLM_LOCAL_MODEL_COST_MAP="True" +``` + +This will use the local model prices file instead. + ## Platform-specific Guide @@ -897,7 +962,7 @@ Run the following command, replacing `` with the value you copied docker run --name litellm-proxy \ -e DATABASE_URL= \ -p 4000:4000 \ - ghcr.io/berriai/litellm-database:main-stable + docker.litellm.ai/berriai/litellm-database:main-stable ``` #### 4. Access the Application: @@ -976,7 +1041,7 @@ services: context: . args: target: runtime - image: ghcr.io/berriai/litellm:main-stable + image: docker.litellm.ai/berriai/litellm:main-stable ports: - "4000:4000" # Map the container port to the host, change the host port if necessary volumes: @@ -1060,4 +1125,4 @@ A: We explored MySQL but that was hard to maintain and led to bugs for customers **Q: If there is Postgres downtime, how does LiteLLM react? Does it fail-open or is there API downtime?** -A: You can gracefully handle DB unavailability if it's on your VPC. See our production guide for more details: [Gracefully Handle DB Unavailability](https://docs.litellm.ai/docs/proxy/prod#6-if-running-litellm-on-vpc-gracefully-handle-db-unavailability) \ No newline at end of file +A: You can gracefully handle DB unavailability if it's on your VPC. 
See our production guide for more details: [Gracefully Handle DB Unavailability](https://docs.litellm.ai/docs/proxy/prod#6-if-running-litellm-on-vpc-gracefully-handle-db-unavailability) diff --git a/docs/my-website/docs/proxy/docker_quick_start.md b/docs/my-website/docs/proxy/docker_quick_start.md index f3da18065ec4..efdc73de43e4 100644 --- a/docs/my-website/docs/proxy/docker_quick_start.md +++ b/docs/my-website/docs/proxy/docker_quick_start.md @@ -2,7 +2,7 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -# E2E Tutorial +# Getting Started Tutorial End-to-End tutorial for LiteLLM Proxy to: - Add an Azure OpenAI model @@ -20,7 +20,7 @@ End-to-End tutorial for LiteLLM Proxy to: ``` -docker pull ghcr.io/berriai/litellm:main-latest +docker pull docker.litellm.ai/berriai/litellm:main-latest ``` [**See all docker images**](https://github.com/orgs/BerriAI/packages) @@ -52,8 +52,6 @@ echo 'LITELLM_MASTER_KEY="sk-1234"' > .env # password generator to get a random hash for litellm salt key echo 'LITELLM_SALT_KEY="sk-1234"' >> .env -source .env - # Start docker compose up ``` @@ -82,6 +80,8 @@ model_list: ### Model List Specification +You can read more about how model resolution works in the [Model Configuration](#understanding-model-configuration) section. + - **`model_name`** (`str`) - This field should contain the name of the model as received. - **`litellm_params`** (`dict`) [See All LiteLLM Params](https://github.com/BerriAI/litellm/blob/559a6ad826b5daef41565f54f06c739c8c068b28/litellm/types/router.py#L222) - **`model`** (`str`) - Specifies the model name to be sent to `litellm.acompletion` / `litellm.aembedding`, etc. This is the identifier used by LiteLLM to route to the correct model + provider logic on the backend. @@ -89,6 +89,10 @@ model_list: - **`api_base`** (`str`) - The API base for your azure deployment. - **`api_version`** (`str`) - The API Version to use when calling Azure's OpenAI API. Get the latest Inference API version [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation?source=recommendations#latest-preview-api-releases). +--- + + +--- ### Useful Links - [**All Supported LLM API Providers (OpenAI/Bedrock/Vertex/etc.)**](../providers/) @@ -115,7 +119,7 @@ docker run \ -e AZURE_API_KEY=d6*********** \ -e AZURE_API_BASE=https://openai-***********/ \ -p 4000:4000 \ - ghcr.io/berriai/litellm:main-latest \ + docker.litellm.ai/berriai/litellm:main-latest \ --config /app/config.yaml --detailed_debug # RUNNING on http://0.0.0.0:4000 @@ -298,7 +302,7 @@ docker run \ -e AZURE_API_KEY=d6*********** \ -e AZURE_API_BASE=https://openai-***********/ \ -p 4000:4000 \ - ghcr.io/berriai/litellm:main-latest \ + docker.litellm.ai/berriai/litellm:main-latest \ --config /app/config.yaml --detailed_debug ``` @@ -407,6 +411,138 @@ curl -X POST 'http://0.0.0.0:4000/chat/completions' \ - [Set Budgets / Rate Limits per key/user/teams](./users.md) - [Dynamic TPM/RPM Limits for keys](./team_budgets.md#dynamic-tpmrpm-allocation) +## Key Concepts + +This section explains key concepts on LiteLLM AI Gateway. 
+ +### Understanding Model Configuration + +For this config.yaml example: + +```yaml +model_list: + - model_name: gpt-4o + litellm_params: + model: azure/my_azure_deployment + api_base: os.environ/AZURE_API_BASE + api_key: "os.environ/AZURE_API_KEY" + api_version: "2025-01-01-preview" # [OPTIONAL] litellm uses the latest azure api_version by default +``` + +**How Model Resolution Works:** + +``` +Client Request LiteLLM Proxy Provider API +────────────── ──────────────── ───────────── + +POST /chat/completions +{ 1. Looks up model_name + "model": "gpt-4o" ──────────▶ in config.yaml + ... +} 2. Finds matching entry: + model_name: gpt-4o + + 3. Extracts litellm_params: + model: azure/my_azure_deployment + api_base: https://... + api_key: sk-... + + 4. Routes to provider ──▶ Azure OpenAI API + POST /deployments/my_azure_deployment/... +``` + +**Breaking Down the `model` Parameter under `litellm_params`:** + +```yaml +model_list: + - model_name: gpt-4o # What the client calls + litellm_params: + model: azure/my_azure_deployment # / + ───── ─────────────────── + │ │ + │ └─────▶ Model name sent to the provider API + │ + └─────────────────▶ Provider that LiteLLM routes to +``` + +**Visual Breakdown:** + +``` +model: azure/my_azure_deployment + └─┬─┘ └─────────┬─────────┘ + │ │ + │ └────▶ The actual model identifier that gets sent to Azure + │ (e.g., your deployment name, or the model name) + │ + └──────────────────▶ Tells LiteLLM which provider to use + (azure, openai, anthropic, bedrock, etc.) +``` + +**Key Concepts:** + +- **`model_name`**: The alias your client uses to call the model. This is what you send in your API requests (e.g., `gpt-4o`). + +- **`model` (in litellm_params)**: Format is `/` + - **Provider** (before `/`): Routes to the correct LLM provider (e.g., `azure`, `openai`, `anthropic`, `bedrock`) + - **Model identifier** (after `/`): The actual model/deployment name sent to that provider's API + +**Advanced Configuration Examples:** + +For custom OpenAI-compatible endpoints (e.g., vLLM, Ollama, custom deployments): + +```yaml +model_list: + - model_name: my-custom-model + litellm_params: + model: openai/nvidia/llama-3.2-nv-embedqa-1b-v2 + api_base: http://my-service.svc.cluster.local:8000/v1 + api_key: "sk-1234" +``` + +**Breaking down complex model paths:** + +``` +model: openai/nvidia/llama-3.2-nv-embedqa-1b-v2 + └─┬──┘ └────────────┬────────────────┘ + │ │ + │ └────▶ Full model string sent to the provider API + │ (in this case: "nvidia/llama-3.2-nv-embedqa-1b-v2") + │ + └──────────────────────▶ Provider (openai = OpenAI-compatible API) +``` + +The key point: Everything after the first `/` is passed as-is to the provider's API. 
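+
+If you have the `litellm` Python package installed, you can sanity-check this split locally with the `litellm.get_llm_provider` utility. This is shown only as an illustration of the parsing rule above; it is not required for running the proxy:
+
+```python
+import litellm
+
+# "azure/my_azure_deployment" -> provider "azure", model "my_azure_deployment"
+model, provider, _, _ = litellm.get_llm_provider("azure/my_azure_deployment")
+print(provider, model)
+
+# Everything after the first "/" is passed through unchanged to the provider
+model, provider, _, _ = litellm.get_llm_provider("openai/nvidia/llama-3.2-nv-embedqa-1b-v2")
+print(provider, model)  # openai nvidia/llama-3.2-nv-embedqa-1b-v2
+```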
+ +**Common Patterns:** + +```yaml +model_list: + # Azure deployment + - model_name: gpt-4 + litellm_params: + model: azure/gpt-4-deployment + api_base: https://my-azure.openai.azure.com + + # OpenAI + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: os.environ/OPENAI_API_KEY + + # Custom OpenAI-compatible endpoint + - model_name: my-llama-model + litellm_params: + model: openai/meta/llama-3-8b + api_base: http://my-vllm-server:8000/v1 + api_key: "optional-key" + + # Bedrock + - model_name: claude-3 + litellm_params: + model: bedrock/anthropic.claude-3-sonnet-20240229-v1:0 + aws_region_name: us-east-1 +``` + ## Troubleshooting @@ -504,7 +640,7 @@ LiteLLM Proxy uses the [LiteLLM Python SDK](https://docs.litellm.ai/docs/routing - [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) - [Community Discord 💭](https://discord.gg/wuPM9dRgDw) -- [Community Slack 💭](https://join.slack.com/share/enQtOTE0ODczMzk2Nzk4NC01YjUxNjY2YjBlYTFmNDRiZTM3NDFiYTM3MzVkODFiMDVjOGRjMmNmZTZkZTMzOWQzZGQyZWIwYjQ0MWExYmE3) +- [Community Slack 💭](https://www.litellm.ai/support) - Our emails ✉️ ishaan@berri.ai / krrish@berri.ai diff --git a/docs/my-website/docs/proxy/dynamic_logging.md b/docs/my-website/docs/proxy/dynamic_logging.md index 3bc9f72b0332..42df221bb844 100644 --- a/docs/my-website/docs/proxy/dynamic_logging.md +++ b/docs/my-website/docs/proxy/dynamic_logging.md @@ -211,4 +211,64 @@ x-litellm-disable-callbacks: LANGFUSE,datadog,PROMETHEUS x-litellm-disable-callbacks: langfuse,DATADOG,prometheus ``` +--- + +## Disabling Dynamic Callback Management (Enterprise) + +Some organizations have compliance requirements where **all requests must be logged under all circumstances**. For these cases, you can disable dynamic callback management entirely to ensure users cannot disable any logging callbacks. + +### Use Case + +This is designed for enterprise scenarios where: +- **Compliance requirements** mandate that all API requests must be logged +- **Audit trails** must be complete with no gaps +- **Security policies** require all traffic to be monitored +- **No exceptions** can be made for callback disabling + +### How to Disable + +Set `allow_dynamic_callback_disabling` to `false` in your config.yaml: + +```yaml showLineNumbers title="config.yaml" +litellm_settings: + allow_dynamic_callback_disabling: false +``` + +### Effect + +When disabled: +- The `x-litellm-disable-callbacks` header will be **ignored** +- All configured callbacks will **always execute** for every request +- Users cannot bypass logging through headers or request metadata +- All requests are guaranteed to be logged per your proxy configuration + +### Example: Compliance Logging Setup + +Here's a complete example for an organization requiring guaranteed logging: + +```yaml showLineNumbers title="config.yaml" +# config.yaml +model_list: + - model_name: gpt-4 + litellm_params: + model: openai/gpt-4 + api_key: os.environ/OPENAI_API_KEY + +litellm_settings: + callbacks: ["langfuse", "datadog", "s3"] + # Disable dynamic callback disabling for compliance + allow_dynamic_callback_disabling: false +``` + +With this configuration: +- All requests will be logged to Langfuse, Datadog, and S3 +- Users cannot disable any of these callbacks via headers +- Complete audit trail is guaranteed for compliance requirements + +:::info + +**Default Behavior**: Dynamic callback disabling is **enabled by default** (`allow_dynamic_callback_disabling: true`). 
You must explicitly set it to `false` to enforce guaranteed logging. + +::: + diff --git a/docs/my-website/docs/proxy/dynamic_rate_limit.md b/docs/my-website/docs/proxy/dynamic_rate_limit.md index 9c875a51eba1..3c3500f8a6c0 100644 --- a/docs/my-website/docs/proxy/dynamic_rate_limit.md +++ b/docs/my-website/docs/proxy/dynamic_rate_limit.md @@ -149,6 +149,7 @@ litellm_settings: priority_reservation_settings: default_priority: 0 # Weight (0%) assigned to keys without explicit priority metadata saturation_threshold: 0.50 # A model is saturated if it has hit 50% of its RPM limit + saturation_check_cache_ttl: 60 # How long (seconds) saturation values are cached locally general_settings: master_key: sk-1234 # OR set `LITELLM_MASTER_KEY=".."` in your .env @@ -168,6 +169,8 @@ general_settings: - **default_priority (float)**: Weight/percentage (0.0 to 1.0) assigned to API keys that have no priority metadata set (defaults to 0.5) - **saturation_threshold (float)**: Saturation level (0.0 to 1.0) at which strict priority enforcement begins for a model. Saturation is calculated as `max(current_rpm/max_rpm, current_tpm/max_tpm)`. Below this threshold, generous mode allows priority borrowing from unused capacity. Above this threshold, strict mode enforces normalized priority limits. - Example: When model usage is low, keys can use more than their allocated share. When model usage is high, keys are strictly limited to their allocated share. +- **saturation_check_cache_ttl (int)**: TTL in seconds for local cache when reading saturation values from Redis (defaults to 60). In multi-node deployments, this controls how quickly nodes converge on the same saturation state. Lower values mean faster convergence but more Redis reads. + - Example: Set to `5` for faster multi-node consistency, or `0` to always read directly from Redis. **Start Proxy** @@ -175,7 +178,37 @@ general_settings: litellm --config /path/to/config.yaml ``` -#### 2. Create Keys with Priority Levels +### Set priority on either a team or a key + +Priority can be set at either the **team level** or **key level**. Team-level priority takes precedence over key-level priority. + +**Option A: Set Priority on Team (Recommended)** + +All keys within a team will inherit the team's priority. This is useful when you want all keys for a specific environment or project to have the same priority. + +```bash +curl -X POST 'http://0.0.0.0:4000/team/new' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{ + "team_alias": "production-team", + "metadata": {"priority": "prod"} +}' +``` + +Create a key for this team: +```bash +curl -X POST 'http://0.0.0.0:4000/key/generate' \ +-H 'Authorization: Bearer sk-1234' \ +-H 'Content-Type: application/json' \ +-d '{ + "team_id": "team-id-from-previous-response" +}' +``` + +**Option B: Set Priority on Individual Keys** + +Set priority directly on the key. This is useful when you need fine-grained control per key. **Production Key:** ```bash @@ -205,7 +238,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ -d '{}' ``` -**Expected Response for both:** +**Expected Response:** ```json { "key": "sk-...", @@ -214,6 +247,11 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \ } ``` +**Priority Resolution Order:** +1. If key belongs to a team with `metadata.priority` set → use team priority +2. Else if key has `metadata.priority` set → use key priority +3. Else → use `default_priority` from config + #### 3. 
Test Priority Allocation **Test Production Key (should get 9 RPM):** diff --git a/docs/my-website/docs/proxy/email.md b/docs/my-website/docs/proxy/email.md index 1ee67e823083..ad158cb34291 100644 --- a/docs/my-website/docs/proxy/email.md +++ b/docs/my-website/docs/proxy/email.md @@ -18,7 +18,7 @@ Send LiteLLM Proxy users emails for specific events. | Category | Details | |----------|---------| -| Supported Events | • User added as a user on LiteLLM Proxy
• Proxy API Key created for user | +| Supported Events | • User added as a user on LiteLLM Proxy
• Proxy API Key created for user
• Proxy API Key rotated for user | | Supported Email Integrations | • Resend API
• SMTP | ## Usage @@ -68,6 +68,23 @@ litellm_settings: callbacks: ["resend_email"] ``` +
+ + +Add `sendgrid_email` to your proxy config.yaml under `litellm_settings` + +set the following env variables + +```shell showLineNumbers +SENDGRID_API_KEY="SG.1234" +SENDGRID_SENDER_EMAIL="notifications@your-domain.com" +``` + +```yaml showLineNumbers title="proxy_config.yaml" +litellm_settings: + callbacks: ["sendgrid_email"] +``` +
@@ -77,6 +94,35 @@ On the LiteLLM Proxy UI, go to users > create a new user. After creating a new user, they will receive an email invite a the email you specified when creating the user. +### 3. Configure Budget Alerts (Optional) + +Enable budget alert emails by adding "email" to the `alerts` list in your proxy configuration: + +```yaml showLineNumbers title="proxy_config.yaml" +general_settings: + alerts: ["email"] +``` + +#### Budget Alert Types + +**Soft Budget Alerts**: Automatically triggered when a key exceeds its soft budget limit. These alerts help you monitor spending before reaching critical thresholds. + +**Max Budget Alerts**: Automatically triggered when a key reaches a specified percentage of its maximum budget (default: 80%). These alerts warn you when you're approaching budget exhaustion. + +Both alert types send a maximum of one email per 24-hour period to prevent spam. + +#### Configuration Options + +Customize budget alert behavior using these environment variables: + +```yaml showLineNumbers title=".env" +# Percentage of max budget that triggers alerts (as decimal: 0.8 = 80%) +EMAIL_BUDGET_ALERT_MAX_SPEND_ALERT_PERCENTAGE=0.8 + +# Time-to-live for alert deduplication in seconds (default: 24 hours) +EMAIL_BUDGET_ALERT_TTL=86400 +``` + ## Email Templates @@ -123,6 +169,35 @@ On the Create Key Modal, Select Advanced Settings > Set Send Email to True. style={{width: '70%', display: 'block', margin: '0 0 2rem 0'}} /> +### 3. Proxy API Key Rotated for User + +This email is sent when you rotate an API key for a user on LiteLLM Proxy. + + + +**How to trigger this event** + +On the LiteLLM Proxy UI, go to Virtual Keys > Click on a key > Click "Regenerate Key" + +:::info + +Ensure there is a `user_id` attached to the key. This would have been set when creating the key. + +::: + + + +After regenerating the key, the user will receive an email notification with: +- Security-focused messaging about the rotation +- The new API key (or a placeholder if `EMAIL_INCLUDE_API_KEY=false`) +- Instructions to update their applications +- Security best practices ## Email Customization @@ -141,6 +216,8 @@ LiteLLM allows you to customize various aspects of your email notifications. Bel | Email Signature | `EMAIL_SIGNATURE` | string (HTML) | Standard LiteLLM footer | `"

Best regards,
Your Team

Visit us

"` | HTML-formatted footer for all emails | | Invitation Subject | `EMAIL_SUBJECT_INVITATION` | string | "LiteLLM: New User Invitation" | `"Welcome to Your Company!"` | Subject line for invitation emails | | Key Creation Subject | `EMAIL_SUBJECT_KEY_CREATED` | string | "LiteLLM: API Key Created" | `"Your New API Key is Ready"` | Subject line for key creation emails | +| Key Rotation Subject | `EMAIL_SUBJECT_KEY_ROTATED` | string | "LiteLLM: API Key Rotated" | `"Your API Key Has Been Rotated"` | Subject line for key rotation emails | +| Include API Key | `EMAIL_INCLUDE_API_KEY` | boolean | true | `"false"` | Whether to include the actual API key in emails (set to false for enhanced security) | | Proxy Base URL | `PROXY_BASE_URL` | string | http://0.0.0.0:4000 | `"https://proxy.your-company.com"` | Base URL for the LiteLLM Proxy (used in email links) | @@ -181,11 +258,44 @@ EMAIL_SIGNATURE="

Best regards,
Your Company Team

+ + +""" + diff --git a/litellm/integrations/email_templates/templates.py b/litellm/integrations/email_templates/templates.py index 7029e8ce12ad..5de23db0f24b 100644 --- a/litellm/integrations/email_templates/templates.py +++ b/litellm/integrations/email_templates/templates.py @@ -60,3 +60,51 @@ Best,
The LiteLLM team
""" + +SOFT_BUDGET_ALERT_EMAIL_TEMPLATE = """ + LiteLLM Logo + +

Hi {recipient_email},
+ + Your LiteLLM API key has crossed its soft budget limit of {soft_budget}.

+ + Current Spend: {spend}
+ Soft Budget: {soft_budget}
+ {max_budget_info} + +

+ ⚠️ Note: Your API requests will continue to work, but you should monitor your usage closely. + If you reach your maximum budget, requests will be rejected. +

+ + You can view your usage and manage your budget in the LiteLLM Dashboard.

+ + If you have any questions, please send an email to {email_support_contact}

+ + Best,
+ The LiteLLM team
+""" + +MAX_BUDGET_ALERT_EMAIL_TEMPLATE = """ + LiteLLM Logo + +

Hi {recipient_email},
+ + Your LiteLLM API key has reached {percentage}% of its maximum budget.

+ + Current Spend: {spend}
+ Maximum Budget: {max_budget}
+ Alert Threshold: {alert_threshold} ({percentage}%)
+ +

+ ⚠️ Warning: You are approaching your maximum budget limit. + Once you reach your maximum budget of {max_budget}, all API requests will be rejected. +

+ + You can view your usage and manage your budget in the LiteLLM Dashboard.

+ + If you have any questions, please send an email to {email_support_contact}

+ + Best,
+ The LiteLLM team
+""" \ No newline at end of file diff --git a/litellm/integrations/focus/__init__.py b/litellm/integrations/focus/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/litellm/integrations/focus/database.py b/litellm/integrations/focus/database.py new file mode 100644 index 000000000000..298254670eb3 --- /dev/null +++ b/litellm/integrations/focus/database.py @@ -0,0 +1,113 @@ +"""Database access helpers for Focus export.""" + +from __future__ import annotations + +from datetime import datetime +from typing import Any, Dict, Optional + +import polars as pl + + +class FocusLiteLLMDatabase: + """Retrieves LiteLLM usage data for Focus export workflows.""" + + def _ensure_prisma_client(self): + from litellm.proxy.proxy_server import prisma_client + + if prisma_client is None: + raise RuntimeError( + "Database not connected. Connect a database to your proxy - " + "https://docs.litellm.ai/docs/simple_proxy#managing-auth---virtual-keys" + ) + return prisma_client + + async def get_usage_data( + self, + *, + limit: Optional[int] = None, + start_time_utc: Optional[datetime] = None, + end_time_utc: Optional[datetime] = None, + ) -> pl.DataFrame: + """Return usage data for the requested window.""" + client = self._ensure_prisma_client() + + where_clauses: list[str] = [] + query_params: list[Any] = [] + placeholder_index = 1 + if start_time_utc: + where_clauses.append(f"dus.updated_at >= ${placeholder_index}::timestamptz") + query_params.append(start_time_utc) + placeholder_index += 1 + if end_time_utc: + where_clauses.append(f"dus.updated_at <= ${placeholder_index}::timestamptz") + query_params.append(end_time_utc) + placeholder_index += 1 + + where_clause = "" + if where_clauses: + where_clause = "WHERE " + " AND ".join(where_clauses) + + limit_clause = "" + if limit is not None: + try: + limit_value = int(limit) + except (TypeError, ValueError) as exc: # pragma: no cover - defensive guard + raise ValueError("limit must be an integer") from exc + if limit_value < 0: + raise ValueError("limit must be non-negative") + limit_clause = f" LIMIT ${placeholder_index}" + query_params.append(limit_value) + + query = f""" + SELECT + dus.id, + dus.date, + dus.user_id, + dus.api_key, + dus.model, + dus.model_group, + dus.custom_llm_provider, + dus.prompt_tokens, + dus.completion_tokens, + dus.spend, + dus.api_requests, + dus.successful_requests, + dus.failed_requests, + dus.cache_creation_input_tokens, + dus.cache_read_input_tokens, + dus.created_at, + dus.updated_at, + vt.team_id, + vt.key_alias as api_key_alias, + tt.team_alias, + ut.user_email as user_email + FROM "LiteLLM_DailyUserSpend" dus + LEFT JOIN "LiteLLM_VerificationToken" vt ON dus.api_key = vt.token + LEFT JOIN "LiteLLM_TeamTable" tt ON vt.team_id = tt.team_id + LEFT JOIN "LiteLLM_UserTable" ut ON dus.user_id = ut.user_id + {where_clause} + ORDER BY dus.date DESC, dus.created_at DESC + {limit_clause} + """ + + try: + db_response = await client.db.query_raw(query, *query_params) + return pl.DataFrame(db_response, infer_schema_length=None) + except Exception as exc: + raise RuntimeError(f"Error retrieving usage data: {exc}") from exc + + async def get_table_info(self) -> Dict[str, Any]: + """Return metadata about the spend table for diagnostics.""" + client = self._ensure_prisma_client() + + info_query = """ + SELECT column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_name = 'LiteLLM_DailyUserSpend' + ORDER BY ordinal_position; + """ + try: + columns_response = await 
client.db.query_raw(info_query) + return {"columns": columns_response, "table_name": "LiteLLM_DailyUserSpend"} + except Exception as exc: + raise RuntimeError(f"Error getting table info: {exc}") from exc diff --git a/litellm/integrations/focus/destinations/__init__.py b/litellm/integrations/focus/destinations/__init__.py new file mode 100644 index 000000000000..233f1da0c9b5 --- /dev/null +++ b/litellm/integrations/focus/destinations/__init__.py @@ -0,0 +1,12 @@ +"""Destination implementations for Focus export.""" + +from .base import FocusDestination, FocusTimeWindow +from .factory import FocusDestinationFactory +from .s3_destination import FocusS3Destination + +__all__ = [ + "FocusDestination", + "FocusDestinationFactory", + "FocusTimeWindow", + "FocusS3Destination", +] diff --git a/litellm/integrations/focus/destinations/base.py b/litellm/integrations/focus/destinations/base.py new file mode 100644 index 000000000000..8042a7e23b9a --- /dev/null +++ b/litellm/integrations/focus/destinations/base.py @@ -0,0 +1,30 @@ +"""Abstract destination interfaces for Focus export.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Protocol + + +@dataclass(frozen=True) +class FocusTimeWindow: + """Represents the span of data exported in a single batch.""" + + start_time: datetime + end_time: datetime + frequency: str + + +class FocusDestination(Protocol): + """Protocol for anything that can receive Focus export files.""" + + async def deliver( + self, + *, + content: bytes, + time_window: FocusTimeWindow, + filename: str, + ) -> None: + """Persist the serialized export for the provided time window.""" + ... diff --git a/litellm/integrations/focus/destinations/factory.py b/litellm/integrations/focus/destinations/factory.py new file mode 100644 index 000000000000..cb7696a11deb --- /dev/null +++ b/litellm/integrations/focus/destinations/factory.py @@ -0,0 +1,59 @@ +"""Factory helpers for Focus export destinations.""" + +from __future__ import annotations + +import os +from typing import Any, Dict, Optional + +from .base import FocusDestination +from .s3_destination import FocusS3Destination + + +class FocusDestinationFactory: + """Builds destination instances based on provider/config settings.""" + + @staticmethod + def create( + *, + provider: str, + prefix: str, + config: Optional[Dict[str, Any]] = None, + ) -> FocusDestination: + """Return a destination implementation for the requested provider.""" + provider_lower = provider.lower() + normalized_config = FocusDestinationFactory._resolve_config( + provider=provider_lower, overrides=config or {} + ) + if provider_lower == "s3": + return FocusS3Destination(prefix=prefix, config=normalized_config) + raise NotImplementedError( + f"Provider '{provider}' not supported for Focus export" + ) + + @staticmethod + def _resolve_config( + *, + provider: str, + overrides: Dict[str, Any], + ) -> Dict[str, Any]: + if provider == "s3": + resolved = { + "bucket_name": overrides.get("bucket_name") + or os.getenv("FOCUS_S3_BUCKET_NAME"), + "region_name": overrides.get("region_name") + or os.getenv("FOCUS_S3_REGION_NAME"), + "endpoint_url": overrides.get("endpoint_url") + or os.getenv("FOCUS_S3_ENDPOINT_URL"), + "aws_access_key_id": overrides.get("aws_access_key_id") + or os.getenv("FOCUS_S3_ACCESS_KEY"), + "aws_secret_access_key": overrides.get("aws_secret_access_key") + or os.getenv("FOCUS_S3_SECRET_KEY"), + "aws_session_token": overrides.get("aws_session_token") + or 
os.getenv("FOCUS_S3_SESSION_TOKEN"), + } + if not resolved.get("bucket_name"): + raise ValueError("FOCUS_S3_BUCKET_NAME must be provided for S3 exports") + return {k: v for k, v in resolved.items() if v is not None} + raise NotImplementedError( + f"Provider '{provider}' not supported for Focus export configuration" + ) diff --git a/litellm/integrations/focus/destinations/s3_destination.py b/litellm/integrations/focus/destinations/s3_destination.py new file mode 100644 index 000000000000..c6d5554b438e --- /dev/null +++ b/litellm/integrations/focus/destinations/s3_destination.py @@ -0,0 +1,74 @@ +"""S3 destination implementation for Focus export.""" + +from __future__ import annotations + +import asyncio +from datetime import timezone +from typing import Any, Optional + +import boto3 + +from .base import FocusDestination, FocusTimeWindow + + +class FocusS3Destination(FocusDestination): + """Handles uploading serialized exports to S3 buckets.""" + + def __init__( + self, + *, + prefix: str, + config: Optional[dict[str, Any]] = None, + ) -> None: + config = config or {} + bucket_name = config.get("bucket_name") + if not bucket_name: + raise ValueError("bucket_name must be provided for S3 destination") + self.bucket_name = bucket_name + self.prefix = prefix.rstrip("/") + self.config = config + + async def deliver( + self, + *, + content: bytes, + time_window: FocusTimeWindow, + filename: str, + ) -> None: + object_key = self._build_object_key(time_window=time_window, filename=filename) + await asyncio.to_thread(self._upload, content, object_key) + + def _build_object_key(self, *, time_window: FocusTimeWindow, filename: str) -> str: + start_utc = time_window.start_time.astimezone(timezone.utc) + date_component = f"date={start_utc.strftime('%Y-%m-%d')}" + parts = [self.prefix, date_component] + if time_window.frequency == "hourly": + parts.append(f"hour={start_utc.strftime('%H')}") + key_prefix = "/".join(filter(None, parts)) + return f"{key_prefix}/{filename}" if key_prefix else filename + + def _upload(self, content: bytes, object_key: str) -> None: + client_kwargs: dict[str, Any] = {} + region_name = self.config.get("region_name") + if region_name: + client_kwargs["region_name"] = region_name + endpoint_url = self.config.get("endpoint_url") + if endpoint_url: + client_kwargs["endpoint_url"] = endpoint_url + + session_kwargs: dict[str, Any] = {} + for key in ( + "aws_access_key_id", + "aws_secret_access_key", + "aws_session_token", + ): + if self.config.get(key): + session_kwargs[key] = self.config[key] + + s3_client = boto3.client("s3", **client_kwargs, **session_kwargs) + s3_client.put_object( + Bucket=self.bucket_name, + Key=object_key, + Body=content, + ContentType="application/octet-stream", + ) diff --git a/litellm/integrations/focus/export_engine.py b/litellm/integrations/focus/export_engine.py new file mode 100644 index 000000000000..22ebce2a168d --- /dev/null +++ b/litellm/integrations/focus/export_engine.py @@ -0,0 +1,124 @@ +"""Core export engine for Focus integrations (heavy dependencies).""" + +from __future__ import annotations + +from typing import Any, Dict, Optional + +import polars as pl + +from litellm._logging import verbose_logger + +from .database import FocusLiteLLMDatabase +from .destinations import FocusDestinationFactory, FocusTimeWindow +from .serializers import FocusParquetSerializer, FocusSerializer +from .transformer import FocusTransformer + + +class FocusExportEngine: + """Engine that fetches, normalizes, and uploads Focus exports.""" + + def __init__( + self, + 
*, + provider: str, + export_format: str, + prefix: str, + destination_config: Optional[dict[str, Any]] = None, + ) -> None: + self.provider = provider + self.export_format = export_format + self.prefix = prefix + self._destination = FocusDestinationFactory.create( + provider=self.provider, + prefix=self.prefix, + config=destination_config, + ) + self._serializer = self._init_serializer() + self._transformer = FocusTransformer() + self._database = FocusLiteLLMDatabase() + + def _init_serializer(self) -> FocusSerializer: + if self.export_format != "parquet": + raise NotImplementedError("Only parquet export supported currently") + return FocusParquetSerializer() + + async def dry_run_export_usage_data(self, limit: Optional[int]) -> Dict[str, Any]: + data = await self._database.get_usage_data(limit=limit) + normalized = self._transformer.transform(data) + + usage_sample = data.head(min(50, len(data))).to_dicts() + normalized_sample = normalized.head(min(50, len(normalized))).to_dicts() + + summary = { + "total_records": len(normalized), + "total_spend": self._sum_column(normalized, "spend"), + "total_tokens": self._sum_column(normalized, "total_tokens"), + "unique_teams": self._count_unique(normalized, "team_id"), + "unique_models": self._count_unique(normalized, "model"), + } + + return { + "usage_data": usage_sample, + "normalized_data": normalized_sample, + "summary": summary, + } + + async def export_window( + self, + *, + window: FocusTimeWindow, + limit: Optional[int], + ) -> None: + data = await self._database.get_usage_data( + limit=limit, + start_time_utc=window.start_time, + end_time_utc=window.end_time, + ) + if data.is_empty(): + verbose_logger.debug("Focus export: no usage data for window %s", window) + return + + normalized = self._transformer.transform(data) + if normalized.is_empty(): + verbose_logger.debug( + "Focus export: normalized data empty for window %s", window + ) + return + + await self._serialize_and_upload(normalized, window) + + async def _serialize_and_upload( + self, frame: pl.DataFrame, window: FocusTimeWindow + ) -> None: + payload = self._serializer.serialize(frame) + if not payload: + verbose_logger.debug("Focus export: serializer returned empty payload") + return + await self._destination.deliver( + content=payload, + time_window=window, + filename=self._build_filename(), + ) + + def _build_filename(self) -> str: + if not self._serializer.extension: + raise ValueError("Serializer must declare a file extension") + return f"usage.{self._serializer.extension}" + + @staticmethod + def _sum_column(frame: pl.DataFrame, column: str) -> float: + if frame.is_empty() or column not in frame.columns: + return 0.0 + value = frame.select(pl.col(column).sum().alias("sum")).row(0)[0] + if value is None: + return 0.0 + return float(value) + + @staticmethod + def _count_unique(frame: pl.DataFrame, column: str) -> int: + if frame.is_empty() or column not in frame.columns: + return 0 + value = frame.select(pl.col(column).n_unique().alias("unique")).row(0)[0] + if value is None: + return 0 + return int(value) diff --git a/litellm/integrations/focus/focus_logger.py b/litellm/integrations/focus/focus_logger.py new file mode 100644 index 000000000000..ade1cf861b1a --- /dev/null +++ b/litellm/integrations/focus/focus_logger.py @@ -0,0 +1,211 @@ +"""Focus export logger orchestrating DB pull/transform/upload.""" + +from __future__ import annotations + +import os +from datetime import datetime, timedelta, timezone +from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast + 
+import litellm +from litellm._logging import verbose_logger +from litellm.integrations.custom_logger import CustomLogger + +from .destinations import FocusTimeWindow + +if TYPE_CHECKING: + from apscheduler.schedulers.asyncio import AsyncIOScheduler + from .export_engine import FocusExportEngine +else: + AsyncIOScheduler = Any + +FOCUS_USAGE_DATA_JOB_NAME = "focus_export_usage_data" +DEFAULT_DRY_RUN_LIMIT = 500 + + +class FocusLogger(CustomLogger): + """Coordinates Focus export jobs across transformer/serializer/destination layers.""" + + def __init__( + self, + *, + provider: Optional[str] = None, + export_format: Optional[str] = None, + frequency: Optional[str] = None, + cron_offset_minute: Optional[int] = None, + interval_seconds: Optional[int] = None, + prefix: Optional[str] = None, + destination_config: Optional[dict[str, Any]] = None, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.provider = (provider or os.getenv("FOCUS_PROVIDER") or "s3").lower() + self.export_format = ( + export_format or os.getenv("FOCUS_FORMAT") or "parquet" + ).lower() + self.frequency = (frequency or os.getenv("FOCUS_FREQUENCY") or "hourly").lower() + self.cron_offset_minute = ( + cron_offset_minute + if cron_offset_minute is not None + else int(os.getenv("FOCUS_CRON_OFFSET", "5")) + ) + raw_interval = ( + interval_seconds + if interval_seconds is not None + else os.getenv("FOCUS_INTERVAL_SECONDS") + ) + self.interval_seconds = int(raw_interval) if raw_interval is not None else None + env_prefix = os.getenv("FOCUS_PREFIX") + self.prefix: str = ( + prefix if prefix is not None else (env_prefix if env_prefix else "focus_exports") + ) + + self._destination_config = destination_config + self._engine: Optional["FocusExportEngine"] = None + + def _ensure_engine(self) -> "FocusExportEngine": + """Instantiate the heavy export engine lazily.""" + if self._engine is None: + from .export_engine import FocusExportEngine + + self._engine = FocusExportEngine( + provider=self.provider, + export_format=self.export_format, + prefix=self.prefix, + destination_config=self._destination_config, + ) + return self._engine + + async def export_usage_data( + self, + *, + limit: Optional[int] = None, + start_time_utc: Optional[datetime] = None, + end_time_utc: Optional[datetime] = None, + ) -> None: + """Public hook to trigger export immediately.""" + if bool(start_time_utc) ^ bool(end_time_utc): + raise ValueError( + "start_time_utc and end_time_utc must be provided together" + ) + + if start_time_utc and end_time_utc: + window = FocusTimeWindow( + start_time=start_time_utc, + end_time=end_time_utc, + frequency=self.frequency, + ) + else: + window = self._compute_time_window(datetime.now(timezone.utc)) + await self._export_window(window=window, limit=limit) + + async def dry_run_export_usage_data( + self, limit: Optional[int] = DEFAULT_DRY_RUN_LIMIT + ) -> dict[str, Any]: + """Return transformed data without uploading.""" + engine = self._ensure_engine() + return await engine.dry_run_export_usage_data(limit=limit) + + async def initialize_focus_export_job(self) -> None: + """Entry point for scheduler jobs to run export cycle with locking.""" + from litellm.proxy.proxy_server import proxy_logging_obj + + pod_lock_manager = None + if proxy_logging_obj is not None: + writer = getattr(proxy_logging_obj, "db_spend_update_writer", None) + if writer is not None: + pod_lock_manager = getattr(writer, "pod_lock_manager", None) + + if pod_lock_manager and pod_lock_manager.redis_cache: + acquired = await 
pod_lock_manager.acquire_lock( + cronjob_id=FOCUS_USAGE_DATA_JOB_NAME + ) + if not acquired: + verbose_logger.debug("Focus export: unable to acquire pod lock") + return + try: + await self._run_scheduled_export() + finally: + await pod_lock_manager.release_lock( + cronjob_id=FOCUS_USAGE_DATA_JOB_NAME + ) + else: + await self._run_scheduled_export() + + @staticmethod + async def init_focus_export_background_job( + scheduler: AsyncIOScheduler, + ) -> None: + """Register the export cron/interval job with the provided scheduler.""" + + focus_loggers: List[ + CustomLogger + ] = litellm.logging_callback_manager.get_custom_loggers_for_type( + callback_type=FocusLogger + ) + if not focus_loggers: + verbose_logger.debug( + "No Focus export logger registered; skipping scheduler" + ) + return + + focus_logger = cast(FocusLogger, focus_loggers[0]) + trigger_kwargs = focus_logger._build_scheduler_trigger() + scheduler.add_job( + focus_logger.initialize_focus_export_job, + **trigger_kwargs, + ) + + def _build_scheduler_trigger(self) -> Dict[str, Any]: + """Return scheduler configuration for the selected frequency.""" + if self.frequency == "interval": + seconds = self.interval_seconds or 60 + return {"trigger": "interval", "seconds": seconds} + + if self.frequency == "hourly": + minute = max(0, min(59, self.cron_offset_minute)) + return {"trigger": "cron", "minute": minute, "second": 0} + + if self.frequency == "daily": + total_minutes = max(0, self.cron_offset_minute) + hour = min(23, total_minutes // 60) + minute = min(59, total_minutes % 60) + return {"trigger": "cron", "hour": hour, "minute": minute, "second": 0} + + raise ValueError(f"Unsupported frequency: {self.frequency}") + + async def _run_scheduled_export(self) -> None: + """Execute the scheduled export for the configured window.""" + window = self._compute_time_window(datetime.now(timezone.utc)) + await self._export_window(window=window, limit=None) + + async def _export_window( + self, + *, + window: FocusTimeWindow, + limit: Optional[int], + ) -> None: + engine = self._ensure_engine() + await engine.export_window(window=window, limit=limit) + + def _compute_time_window(self, now: datetime) -> FocusTimeWindow: + """Derive the time window to export based on configured frequency.""" + now_utc = now.astimezone(timezone.utc) + if self.frequency == "hourly": + end_time = now_utc.replace(minute=0, second=0, microsecond=0) + start_time = end_time - timedelta(hours=1) + elif self.frequency == "daily": + end_time = now_utc.replace(hour=0, minute=0, second=0, microsecond=0) + start_time = end_time - timedelta(days=1) + elif self.frequency == "interval": + interval = timedelta(seconds=self.interval_seconds or 60) + end_time = now_utc + start_time = end_time - interval + else: + raise ValueError(f"Unsupported frequency: {self.frequency}") + return FocusTimeWindow( + start_time=start_time, + end_time=end_time, + frequency=self.frequency, + ) + +__all__ = ["FocusLogger"] diff --git a/litellm/integrations/focus/schema.py b/litellm/integrations/focus/schema.py new file mode 100644 index 000000000000..ac2f33dad0aa --- /dev/null +++ b/litellm/integrations/focus/schema.py @@ -0,0 +1,50 @@ +"""Schema definitions for Focus export data.""" + +from __future__ import annotations + +import polars as pl + +# see: https://focus.finops.org/focus-specification/v1-2/ +FOCUS_NORMALIZED_SCHEMA = pl.Schema( + [ + ("BilledCost", pl.Decimal(18, 6)), + ("BillingAccountId", pl.String), + ("BillingAccountName", pl.String), + ("BillingCurrency", pl.String), + 
("BillingPeriodStart", pl.Datetime(time_unit="us")), + ("BillingPeriodEnd", pl.Datetime(time_unit="us")), + ("ChargeCategory", pl.String), + ("ChargeClass", pl.String), + ("ChargeDescription", pl.String), + ("ChargeFrequency", pl.String), + ("ChargePeriodStart", pl.Datetime(time_unit="us")), + ("ChargePeriodEnd", pl.Datetime(time_unit="us")), + ("ConsumedQuantity", pl.Decimal(18, 6)), + ("ConsumedUnit", pl.String), + ("ContractedCost", pl.Decimal(18, 6)), + ("ContractedUnitPrice", pl.Decimal(18, 6)), + ("EffectiveCost", pl.Decimal(18, 6)), + ("InvoiceIssuerName", pl.String), + ("ListCost", pl.Decimal(18, 6)), + ("ListUnitPrice", pl.Decimal(18, 6)), + ("PricingCategory", pl.String), + ("PricingQuantity", pl.Decimal(18, 6)), + ("PricingUnit", pl.String), + ("ProviderName", pl.String), + ("PublisherName", pl.String), + ("RegionId", pl.String), + ("RegionName", pl.String), + ("ResourceId", pl.String), + ("ResourceName", pl.String), + ("ResourceType", pl.String), + ("ServiceCategory", pl.String), + ("ServiceSubcategory", pl.String), + ("ServiceName", pl.String), + ("SubAccountId", pl.String), + ("SubAccountName", pl.String), + ("SubAccountType", pl.String), + ("Tags", pl.Object), + ] +) + +__all__ = ["FOCUS_NORMALIZED_SCHEMA"] diff --git a/litellm/integrations/focus/serializers/__init__.py b/litellm/integrations/focus/serializers/__init__.py new file mode 100644 index 000000000000..18187bf73e58 --- /dev/null +++ b/litellm/integrations/focus/serializers/__init__.py @@ -0,0 +1,6 @@ +"""Serializer package exports for Focus integration.""" + +from .base import FocusSerializer +from .parquet import FocusParquetSerializer + +__all__ = ["FocusSerializer", "FocusParquetSerializer"] diff --git a/litellm/integrations/focus/serializers/base.py b/litellm/integrations/focus/serializers/base.py new file mode 100644 index 000000000000..6da080dae819 --- /dev/null +++ b/litellm/integrations/focus/serializers/base.py @@ -0,0 +1,18 @@ +"""Serializer abstractions for Focus export.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod + +import polars as pl + + +class FocusSerializer(ABC): + """Base serializer turning Focus frames into bytes.""" + + extension: str = "" + + @abstractmethod + def serialize(self, frame: pl.DataFrame) -> bytes: + """Convert the normalized Focus frame into the chosen format.""" + raise NotImplementedError diff --git a/litellm/integrations/focus/serializers/parquet.py b/litellm/integrations/focus/serializers/parquet.py new file mode 100644 index 000000000000..6b3dde5903d9 --- /dev/null +++ b/litellm/integrations/focus/serializers/parquet.py @@ -0,0 +1,22 @@ +"""Parquet serializer for Focus export.""" + +from __future__ import annotations + +import io + +import polars as pl + +from .base import FocusSerializer + + +class FocusParquetSerializer(FocusSerializer): + """Serialize normalized Focus frames to Parquet bytes.""" + + extension = "parquet" + + def serialize(self, frame: pl.DataFrame) -> bytes: + """Encode the provided frame as a parquet payload.""" + target = frame if not frame.is_empty() else pl.DataFrame(schema=frame.schema) + buffer = io.BytesIO() + target.write_parquet(buffer, compression="snappy") + return buffer.getvalue() diff --git a/litellm/integrations/focus/transformer.py b/litellm/integrations/focus/transformer.py new file mode 100644 index 000000000000..cac12b7be146 --- /dev/null +++ b/litellm/integrations/focus/transformer.py @@ -0,0 +1,90 @@ +"""Focus export data transformer.""" + +from __future__ import annotations + +from datetime import 
timedelta + +import polars as pl + +from .schema import FOCUS_NORMALIZED_SCHEMA + + +class FocusTransformer: + """Transforms LiteLLM DB rows into Focus-compatible schema.""" + + schema = FOCUS_NORMALIZED_SCHEMA + + def transform(self, frame: pl.DataFrame) -> pl.DataFrame: + """Return a normalized frame expected by downstream serializers.""" + if frame.is_empty(): + return pl.DataFrame(schema=self.schema) + + # derive period start/end from usage date + frame = frame.with_columns( + pl.col("date") + .cast(pl.Utf8) + .str.strptime(pl.Datetime(time_unit="us"), format="%Y-%m-%d", strict=False) + .alias("usage_date"), + ) + frame = frame.with_columns( + pl.col("usage_date").alias("ChargePeriodStart"), + (pl.col("usage_date") + timedelta(days=1)).alias("ChargePeriodEnd"), + ) + + def fmt(col): + return col.dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + DEC = pl.Decimal(18, 6) + + def dec(col): + return col.cast(DEC) + + none_str = pl.lit(None, dtype=pl.Utf8) + none_dec = pl.lit(None, dtype=pl.Decimal(18, 6)) + + return frame.select( + dec(pl.col("spend").fill_null(0.0)).alias("BilledCost"), + pl.col("api_key").cast(pl.String).alias("BillingAccountId"), + pl.col("api_key_alias").cast(pl.String).alias("BillingAccountName"), + pl.lit("API Key").alias("BillingAccountType"), + pl.lit("USD").alias("BillingCurrency"), + fmt(pl.col("ChargePeriodEnd")).alias("BillingPeriodEnd"), + fmt(pl.col("ChargePeriodStart")).alias("BillingPeriodStart"), + pl.lit("Usage").alias("ChargeCategory"), + none_str.alias("ChargeClass"), + pl.col("model").cast(pl.String).alias("ChargeDescription"), + pl.lit("Usage-Based").alias("ChargeFrequency"), + fmt(pl.col("ChargePeriodEnd")).alias("ChargePeriodEnd"), + fmt(pl.col("ChargePeriodStart")).alias("ChargePeriodStart"), + dec(pl.lit(1.0)).alias("ConsumedQuantity"), + pl.lit("Requests").alias("ConsumedUnit"), + dec(pl.col("spend").fill_null(0.0)).alias("ContractedCost"), + none_str.alias("ContractedUnitPrice"), + dec(pl.col("spend").fill_null(0.0)).alias("EffectiveCost"), + pl.col("custom_llm_provider").cast(pl.String).alias("InvoiceIssuerName"), + none_str.alias("InvoiceId"), + dec(pl.col("spend").fill_null(0.0)).alias("ListCost"), + none_dec.alias("ListUnitPrice"), + none_str.alias("AvailabilityZone"), + pl.lit("USD").alias("PricingCurrency"), + none_str.alias("PricingCategory"), + dec(pl.lit(1.0)).alias("PricingQuantity"), + none_dec.alias("PricingCurrencyContractedUnitPrice"), + dec(pl.col("spend").fill_null(0.0)).alias("PricingCurrencyEffectiveCost"), + none_dec.alias("PricingCurrencyListUnitPrice"), + pl.lit("Requests").alias("PricingUnit"), + pl.col("custom_llm_provider").cast(pl.String).alias("ProviderName"), + pl.col("custom_llm_provider").cast(pl.String).alias("PublisherName"), + none_str.alias("RegionId"), + none_str.alias("RegionName"), + pl.col("model").cast(pl.String).alias("ResourceId"), + pl.col("model").cast(pl.String).alias("ResourceName"), + pl.col("model").cast(pl.String).alias("ResourceType"), + pl.lit("AI and Machine Learning").alias("ServiceCategory"), + pl.lit("Generative AI").alias("ServiceSubcategory"), + pl.col("model_group").cast(pl.String).alias("ServiceName"), + pl.col("team_id").cast(pl.String).alias("SubAccountId"), + pl.col("team_alias").cast(pl.String).alias("SubAccountName"), + none_str.alias("SubAccountType"), + none_str.alias("Tags"), + ) diff --git a/litellm/integrations/gcs_bucket/Readme.md b/litellm/integrations/gcs_bucket/Readme.md index 2ab0b23353b8..6808823c9256 100644 --- a/litellm/integrations/gcs_bucket/Readme.md +++ 
b/litellm/integrations/gcs_bucket/Readme.md @@ -8,5 +8,5 @@ This folder contains the GCS Bucket Logging integration for LiteLLM Gateway. - `gcs_bucket_base.py`: This file contains the GCSBucketBase class which handles Authentication for GCS Buckets ## Further Reading -- [Doc setting up GCS Bucket Logging on LiteLLM Proxy (Gateway)](https://docs.litellm.ai/docs/proxy/bucket) +- [Doc setting up GCS Bucket Logging on LiteLLM Proxy (Gateway)](https://docs.litellm.ai/docs/observability/gcs_bucket_integration) - [Doc on Key / Team Based logging with GCS](https://docs.litellm.ai/docs/proxy/team_logging) \ No newline at end of file diff --git a/litellm/integrations/gcs_bucket/gcs_bucket.py b/litellm/integrations/gcs_bucket/gcs_bucket.py index 9190f921d509..3cb629055318 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket.py @@ -1,9 +1,11 @@ import asyncio +import hashlib import json import os +import time from litellm._uuid import uuid from datetime import datetime, timedelta, timezone -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from urllib.parse import quote from litellm._logging import verbose_logger @@ -26,19 +28,21 @@ def __init__(self, bucket_name: Optional[str] = None) -> None: super().__init__(bucket_name=bucket_name) - # Init Batch logging settings - self.log_queue: List[GCSLogQueueItem] = [] self.batch_size = int(os.getenv("GCS_BATCH_SIZE", GCS_DEFAULT_BATCH_SIZE)) self.flush_interval = int( os.getenv("GCS_FLUSH_INTERVAL", GCS_DEFAULT_FLUSH_INTERVAL_SECONDS) ) - asyncio.create_task(self.periodic_flush()) + self.use_batched_logging = ( + os.getenv("GCS_USE_BATCHED_LOGGING", str(GCS_DEFAULT_USE_BATCHED_LOGGING).lower()).lower() == "true" + ) self.flush_lock = asyncio.Lock() super().__init__( flush_lock=self.flush_lock, batch_size=self.batch_size, flush_interval=self.flush_interval, ) + self.log_queue: asyncio.Queue[GCSLogQueueItem] = asyncio.Queue() # type: ignore[assignment] + asyncio.create_task(self.periodic_flush()) AdditionalLoggingUtils.__init__(self) if premium_user is not True: @@ -65,8 +69,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti ) if logging_payload is None: raise ValueError("standard_logging_object not found in kwargs") - # Add to logging queue - this will be flushed periodically - self.log_queue.append( + await self.log_queue.put( GCSLogQueueItem( payload=logging_payload, kwargs=kwargs, response_obj=response_obj ) @@ -89,7 +92,9 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti if logging_payload is None: raise ValueError("standard_logging_object not found in kwargs") # Add to logging queue - this will be flushed periodically - self.log_queue.append( + # Use asyncio.Queue.put() for thread-safe concurrent access + # If queue is full, this will block until space is available (backpressure) + await self.log_queue.put( GCSLogQueueItem( payload=logging_payload, kwargs=kwargs, response_obj=response_obj ) @@ -98,28 +103,145 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti except Exception as e: verbose_logger.exception(f"GCS Bucket logging error: {str(e)}") - async def async_send_batch(self): + def _drain_queue_batch(self) -> List[GCSLogQueueItem]: + """ + Drain items from the queue (non-blocking), respecting batch_size limit. + + This prevents unbounded queue growth when processing is slower than log accumulation. 
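For readers unfamiliar with the drain pattern used here, the snippet below is a minimal, standalone sketch of bounding a batch read from an `asyncio.Queue` with `get_nowait()`. The queue contents and names are illustrative placeholders, not the integration's `GCSLogQueueItem` objects.

```python
import asyncio
from typing import List


async def drain_batch(queue: "asyncio.Queue[str]", batch_size: int) -> List[str]:
    """Pull at most `batch_size` items without blocking; stop as soon as the queue is empty."""
    items: List[str] = []
    while len(items) < batch_size:
        try:
            items.append(queue.get_nowait())
        except asyncio.QueueEmpty:
            break
    return items


async def main() -> None:
    q: "asyncio.Queue[str]" = asyncio.Queue()
    for i in range(7):
        await q.put(f"log-{i}")
    print(await drain_batch(q, batch_size=5))  # ['log-0', ..., 'log-4']
    print(await drain_batch(q, batch_size=5))  # ['log-5', 'log-6']


if __name__ == "__main__":
    asyncio.run(main())
```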
+ + Returns: + List of items to process, up to batch_size items """ - Process queued logs in batch - sends logs to GCS Bucket + items_to_process: List[GCSLogQueueItem] = [] + while len(items_to_process) < self.batch_size: + try: + items_to_process.append(self.log_queue.get_nowait()) + except asyncio.QueueEmpty: + break + return items_to_process + def _generate_batch_object_name(self, date_str: str, batch_id: str) -> str: + """ + Generate object name for a batched log file. + Format: {date}/batch-{batch_id}.ndjson + """ + return f"{date_str}/batch-{batch_id}.ndjson" - GCS Bucket does not have a Batch endpoint to batch upload logs + def _get_config_key(self, kwargs: Dict[str, Any]) -> str: + """ + Extract a synchronous grouping key from kwargs to group items by GCS config. + This allows us to batch items with the same bucket/credentials together. + + Returns a string key that uniquely identifies the GCS config combination. + This key may contain sensitive information (bucket names, paths) - use _sanitize_config_key() + for logging purposes. + """ + standard_callback_dynamic_params = kwargs.get("standard_callback_dynamic_params", None) or {} + + bucket_name = standard_callback_dynamic_params.get("gcs_bucket_name", None) or self.BUCKET_NAME or "default" + path_service_account = standard_callback_dynamic_params.get("gcs_path_service_account", None) or self.path_service_account_json or "default" + + return f"{bucket_name}|{path_service_account}" + + def _sanitize_config_key(self, config_key: str) -> str: + """ + Create a sanitized version of the config key for logging. + Uses a hash to avoid exposing sensitive bucket names or service account paths. + + Returns a short hash prefix for safe logging. + """ + hash_obj = hashlib.sha256(config_key.encode('utf-8')) + return f"config-{hash_obj.hexdigest()[:8]}" + + def _group_items_by_config(self, items: List[GCSLogQueueItem]) -> Dict[str, List[GCSLogQueueItem]]: + """ + Group items by their GCS config (bucket + credentials). + This ensures items with different configs are processed separately. + + Returns a dict mapping config_key -> list of items with that config. + """ + grouped: Dict[str, List[GCSLogQueueItem]] = {} + for item in items: + config_key = self._get_config_key(item["kwargs"]) + if config_key not in grouped: + grouped[config_key] = [] + grouped[config_key].append(item) + return grouped + + def _combine_payloads_to_ndjson(self, items: List[GCSLogQueueItem]) -> str: + """ + Combine multiple log payloads into newline-delimited JSON (NDJSON) format. + Each line is a valid JSON object representing one log entry. + """ + lines = [] + for item in items: + logging_payload = item["payload"] + json_line = json.dumps(logging_payload, default=str, ensure_ascii=False) + lines.append(json_line) + return "\n".join(lines) + + async def _send_grouped_batch(self, items: List[GCSLogQueueItem], config_key: str) -> Tuple[int, int]: + """ + Send a batch of items that share the same GCS config. 
+ + Returns: + (success_count, error_count) + """ + if not items: + return (0, 0) + + first_kwargs = items[0]["kwargs"] + + try: + gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config( + first_kwargs + ) - Instead, we - - collect the logs to flush every `GCS_FLUSH_INTERVAL` seconds - - during async_send_batch, we make 1 POST request per log to GCS Bucket + headers = await self.construct_request_headers( + vertex_instance=gcs_logging_config["vertex_instance"], + service_account_json=gcs_logging_config["path_service_account"], + ) + bucket_name = gcs_logging_config["bucket_name"] + + current_date = self._get_object_date_from_datetime(datetime.now(timezone.utc)) + batch_id = f"{int(time.time() * 1000)}-{uuid.uuid4().hex[:8]}" + object_name = self._generate_batch_object_name(current_date, batch_id) + combined_payload = self._combine_payloads_to_ndjson(items) + + await self._log_json_data_on_gcs( + headers=headers, + bucket_name=bucket_name, + object_name=object_name, + logging_payload=combined_payload, + ) + + success_count = len(items) + error_count = 0 + return (success_count, error_count) + + except Exception as e: + success_count = 0 + error_count = len(items) + verbose_logger.exception( + f"GCS Bucket error logging batch payload to GCS bucket: {str(e)}" + ) + return (success_count, error_count) + async def _send_individual_logs(self, items: List[GCSLogQueueItem]) -> None: """ - if not self.log_queue: - return - - for log_item in self.log_queue: - logging_payload = log_item["payload"] - kwargs = log_item["kwargs"] - response_obj = log_item.get("response_obj", None) or {} + Send each log individually as separate GCS objects (legacy behavior). + This is used when GCS_USE_BATCHED_LOGGING is disabled. + """ + for item in items: + await self._send_single_log_item(item) + async def _send_single_log_item(self, item: GCSLogQueueItem) -> None: + """ + Send a single log item to GCS as an individual object. + """ + try: gcs_logging_config: GCSLoggingConfig = await self.get_gcs_logging_config( - kwargs + item["kwargs"] ) headers = await self.construct_request_headers( @@ -127,24 +249,45 @@ async def async_send_batch(self): service_account_json=gcs_logging_config["path_service_account"], ) bucket_name = gcs_logging_config["bucket_name"] - object_name = self._get_object_name(kwargs, logging_payload, response_obj) + + object_name = self._get_object_name( + kwargs=item["kwargs"], + logging_payload=item["payload"], + response_obj=item["response_obj"], + ) + + await self._log_json_data_on_gcs( + headers=headers, + bucket_name=bucket_name, + object_name=object_name, + logging_payload=item["payload"], + ) + except Exception as e: + verbose_logger.exception( + f"GCS Bucket error logging individual payload to GCS bucket: {str(e)}" + ) - try: - await self._log_json_data_on_gcs( - headers=headers, - bucket_name=bucket_name, - object_name=object_name, - logging_payload=logging_payload, - ) - except Exception as e: - # don't let one log item fail the entire batch - verbose_logger.exception( - f"GCS Bucket error logging payload to GCS bucket: {str(e)}" - ) - pass + async def async_send_batch(self): + """ + Process queued logs - sends logs to GCS Bucket. + + If `GCS_USE_BATCHED_LOGGING` is enabled (default), batches multiple log payloads + into single GCS object uploads (NDJSON format), dramatically reducing API calls. + + If disabled, sends each log individually as separate GCS objects (legacy behavior). 
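To make the batched path described above concrete, here is a self-contained sketch of grouping queued payloads by a destination config key and collapsing each group into one newline-delimited JSON body. The item shape and field names are simplified assumptions for illustration, not the actual `GCSLogQueueItem` type or the integration's upload code.

```python
import json
from collections import defaultdict
from typing import Dict, List

# Simplified stand-in for a queued log item: a dict with a config key and a payload.
Item = Dict[str, object]


def group_by_config(items: List[Item]) -> Dict[str, List[Item]]:
    """Group items that target the same bucket/credentials so each group becomes one upload."""
    grouped: Dict[str, List[Item]] = defaultdict(list)
    for item in items:
        grouped[str(item["config_key"])].append(item)
    return dict(grouped)


def to_ndjson(items: List[Item]) -> str:
    """One JSON object per line (NDJSON); the whole group is uploaded as a single object."""
    return "\n".join(
        json.dumps(item["payload"], default=str, ensure_ascii=False) for item in items
    )


if __name__ == "__main__":
    queue = [
        {"config_key": "bucket-a|sa.json", "payload": {"id": 1, "spend": 0.002}},
        {"config_key": "bucket-a|sa.json", "payload": {"id": 2, "spend": 0.004}},
        {"config_key": "bucket-b|sa.json", "payload": {"id": 3, "spend": 0.001}},
    ]
    for config_key, group in group_by_config(queue).items():
        print(f"-- single upload for {config_key} --")
        print(to_ndjson(group))
```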
+ """ + items_to_process = self._drain_queue_batch() - # Clear the queue after processing - self.log_queue.clear() + if not items_to_process: + return + + if self.use_batched_logging: + grouped_items = self._group_items_by_config(items_to_process) + + for config_key, group_items in grouped_items.items(): + await self._send_grouped_batch(group_items, config_key) + else: + await self._send_individual_logs(items_to_process) def _get_object_name( self, kwargs: Dict, logging_payload: StandardLoggingPayload, response_obj: Any @@ -186,7 +329,6 @@ async def get_request_response_payload( "start_time_utc is required for getting a payload from GCS Bucket" ) - # Try current day, next day, and previous day dates_to_try = [ start_time_utc, start_time_utc + timedelta(days=1), @@ -230,5 +372,23 @@ def _generate_failure_object_name( def _get_object_date_from_datetime(self, datetime_obj: datetime) -> str: return datetime_obj.strftime("%Y-%m-%d") + async def flush_queue(self): + """ + Override flush_queue to work with asyncio.Queue. + """ + await self.async_send_batch() + self.last_flush_time = time.time() + + async def periodic_flush(self): + """ + Override periodic_flush to work with asyncio.Queue. + """ + while True: + await asyncio.sleep(self.flush_interval) + verbose_logger.debug( + f"GCS Bucket periodic flush after {self.flush_interval} seconds" + ) + await self.flush_queue() + async def async_health_check(self) -> IntegrationHealthCheckStatus: raise NotImplementedError("GCS Bucket does not support health check") diff --git a/litellm/integrations/gcs_bucket/gcs_bucket_base.py b/litellm/integrations/gcs_bucket/gcs_bucket_base.py index 2612face0503..b1db9ec9588b 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket_base.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket_base.py @@ -2,6 +2,13 @@ import os from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union +from litellm.integrations.gcs_bucket.gcs_bucket_mock_client import ( + should_use_gcs_mock, + create_mock_gcs_client, + mock_vertex_auth_methods, +) + + from litellm._logging import verbose_logger from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.llms.custom_httpx.http_handler import ( @@ -20,6 +27,12 @@ class GCSBucketBase(CustomBatchLogger): def __init__(self, bucket_name: Optional[str] = None, **kwargs) -> None: + self.is_mock_mode = should_use_gcs_mock() + + if self.is_mock_mode: + mock_vertex_auth_methods() + create_mock_gcs_client() + self.async_httpx_client = get_async_httpx_client( llm_provider=httpxSpecialProvider.LoggingCallback ) diff --git a/litellm/integrations/gcs_bucket/gcs_bucket_mock_client.py b/litellm/integrations/gcs_bucket/gcs_bucket_mock_client.py new file mode 100644 index 000000000000..2d14f5eb9623 --- /dev/null +++ b/litellm/integrations/gcs_bucket/gcs_bucket_mock_client.py @@ -0,0 +1,192 @@ +""" +Mock client for GCS Bucket integration testing. + +This module intercepts GCS API calls and Vertex AI auth calls, returning successful +mock responses, allowing full code execution without making actual network calls. + +Usage: + Set GCS_MOCK=true in environment variables or config to enable mock mode. 
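The interception approach described above can be illustrated with a toy async client. The class and method names below are hypothetical stand-ins, not LiteLLM's `AsyncHTTPHandler`; the snippet only sketches the idea of URL-matched mocking with simulated latency and idempotent patching.

```python
import asyncio


class ToyAsyncClient:
    """Stand-in for an async HTTP handler (not LiteLLM's AsyncHTTPHandler)."""

    async def get(self, url: str) -> str:
        await asyncio.sleep(0.2)  # pretend network round-trip
        return f"real response from {url}"


_original_get = None


def patch_client(latency_seconds: float = 0.05) -> None:
    """Idempotently replace ToyAsyncClient.get with a mock that skips the network."""
    global _original_get
    if _original_get is not None:  # already patched once
        return
    _original_get = ToyAsyncClient.get

    async def _mock_get(self: ToyAsyncClient, url: str) -> str:
        if "storage.googleapis.com" in url:
            await asyncio.sleep(latency_seconds)  # simulated latency
            return "mock GCS response"
        return await _original_get(self, url)  # pass through non-GCS calls

    ToyAsyncClient.get = _mock_get  # type: ignore[method-assign]


async def main() -> None:
    patch_client()
    client = ToyAsyncClient()
    print(await client.get("https://storage.googleapis.com/b/o"))  # mocked
    print(await client.get("https://example.com"))                 # real path


if __name__ == "__main__":
    asyncio.run(main())
```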
+""" + +import asyncio + +from litellm._logging import verbose_logger +from litellm.integrations.mock_client_factory import MockClientConfig, create_mock_client_factory, MockResponse + +# Use factory for POST handler +_config = MockClientConfig( + name="GCS", + env_var="GCS_MOCK", + default_latency_ms=150, + default_status_code=200, + default_json_data={"kind": "storage#object", "name": "mock-object"}, + url_matchers=["storage.googleapis.com"], + patch_async_handler=True, + patch_sync_client=False, +) + +_create_mock_gcs_post, should_use_gcs_mock = create_mock_client_factory(_config) + +# Store original methods for GET/DELETE (GCS-specific) +_original_async_handler_get = None +_original_async_handler_delete = None +_mocks_initialized = False + +# Default mock latency in seconds (simulates network round-trip) +# Typical GCS API calls take 100-300ms for uploads, 50-150ms for GET/DELETE +_MOCK_LATENCY_SECONDS = float(__import__("os").getenv("GCS_MOCK_LATENCY_MS", "150")) / 1000.0 + + +async def _mock_async_handler_get(self, url, params=None, headers=None, follow_redirects=None): + """Monkey-patched AsyncHTTPHandler.get that intercepts GCS calls.""" + # Only mock GCS API calls + if isinstance(url, str) and "storage.googleapis.com" in url: + verbose_logger.info(f"[GCS MOCK] GET to {url}") + await asyncio.sleep(_MOCK_LATENCY_SECONDS) + # Return a minimal but valid StandardLoggingPayload JSON string as bytes + # This matches what GCS returns when downloading with ?alt=media + mock_payload = { + "id": "mock-request-id", + "trace_id": "mock-trace-id", + "call_type": "completion", + "stream": False, + "response_cost": 0.0, + "status": "success", + "status_fields": {"llm_api_status": "success"}, + "custom_llm_provider": "mock", + "total_tokens": 0, + "prompt_tokens": 0, + "completion_tokens": 0, + "startTime": 0.0, + "endTime": 0.0, + "completionStartTime": 0.0, + "response_time": 0.0, + "model_map_information": {"model": "mock-model"}, + "model": "mock-model", + "model_id": None, + "model_group": None, + "api_base": "https://api.mock.com", + "metadata": {}, + "cache_hit": None, + "cache_key": None, + "saved_cache_cost": 0.0, + "request_tags": [], + "end_user": None, + "requester_ip_address": None, + "messages": None, + "response": None, + "error_str": None, + "error_information": None, + "model_parameters": {}, + "hidden_params": {}, + "guardrail_information": None, + "standard_built_in_tools_params": None, + } + return MockResponse( + status_code=200, + json_data=mock_payload, + url=url, + elapsed_seconds=_MOCK_LATENCY_SECONDS + ) + if _original_async_handler_get is not None: + return await _original_async_handler_get(self, url=url, params=params, headers=headers, follow_redirects=follow_redirects) + raise RuntimeError("Original AsyncHTTPHandler.get not available") + + +async def _mock_async_handler_delete(self, url, data=None, json=None, params=None, headers=None, timeout=None, stream=False, content=None): + """Monkey-patched AsyncHTTPHandler.delete that intercepts GCS calls.""" + # Only mock GCS API calls + if isinstance(url, str) and "storage.googleapis.com" in url: + verbose_logger.info(f"[GCS MOCK] DELETE to {url}") + await asyncio.sleep(_MOCK_LATENCY_SECONDS) + # DELETE returns 204 No Content with empty body (not JSON) + return MockResponse( + status_code=204, + json_data=None, # Empty body for DELETE + url=url, + elapsed_seconds=_MOCK_LATENCY_SECONDS + ) + if _original_async_handler_delete is not None: + return await _original_async_handler_delete(self, url=url, data=data, json=json, 
params=params, headers=headers, timeout=timeout, stream=stream, content=content) + raise RuntimeError("Original AsyncHTTPHandler.delete not available") + + +def create_mock_gcs_client(): + """ + Monkey-patch AsyncHTTPHandler methods to intercept GCS calls. + + AsyncHTTPHandler is used by LiteLLM's get_async_httpx_client() which is what + GCSBucketBase uses for making API calls. + + This function is idempotent - it only initializes mocks once, even if called multiple times. + """ + global _original_async_handler_get, _original_async_handler_delete, _mocks_initialized + + # Use factory for POST handler + _create_mock_gcs_post() + + # If already initialized, skip GET/DELETE patching + if _mocks_initialized: + return + + verbose_logger.debug("[GCS MOCK] Initializing GCS GET/DELETE handlers...") + + # Patch GET and DELETE handlers (GCS-specific) + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler + + if _original_async_handler_get is None: + _original_async_handler_get = AsyncHTTPHandler.get + AsyncHTTPHandler.get = _mock_async_handler_get # type: ignore + verbose_logger.debug("[GCS MOCK] Patched AsyncHTTPHandler.get") + + if _original_async_handler_delete is None: + _original_async_handler_delete = AsyncHTTPHandler.delete + AsyncHTTPHandler.delete = _mock_async_handler_delete # type: ignore + verbose_logger.debug("[GCS MOCK] Patched AsyncHTTPHandler.delete") + + verbose_logger.debug(f"[GCS MOCK] Mock latency set to {_MOCK_LATENCY_SECONDS*1000:.0f}ms") + verbose_logger.debug("[GCS MOCK] GCS mock client initialization complete") + + _mocks_initialized = True + + +def mock_vertex_auth_methods(): + """ + Monkey-patch Vertex AI auth methods to return fake tokens. + This prevents auth failures when GCS_MOCK is enabled. + + This function is idempotent - it only patches once, even if called multiple times. 
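The module patches the auth methods manually so the patch persists for the process lifetime; in a one-off test, a similar effect can be sketched with `unittest.mock.patch.object`, as below. `ToyAuth` and its method are hypothetical stand-ins, not LiteLLM's `VertexBase`.

```python
from unittest import mock


class ToyAuth:
    """Stand-in for a Vertex-style auth helper (not LiteLLM's VertexBase)."""

    def ensure_access_token(self, credentials: str, project_id: str) -> tuple:
        raise RuntimeError("would hit Google auth in a real run")


def fetch_with_auth(auth: ToyAuth) -> str:
    token, project = auth.ensure_access_token("creds.json", "my-project")
    return f"authorized as {project} with {token}"


if __name__ == "__main__":
    # Patch the auth method for the duration of the block; it is restored automatically.
    with mock.patch.object(
        ToyAuth, "ensure_access_token", return_value=("mock-token", "mock-project-id")
    ):
        print(fetch_with_auth(ToyAuth()))  # -> authorized as mock-project-id with mock-token
```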
+ """ + from litellm.llms.vertex_ai.vertex_llm_base import VertexBase + + # Store original methods if not already stored + if not hasattr(VertexBase, '_original_ensure_access_token_async'): + setattr(VertexBase, '_original_ensure_access_token_async', VertexBase._ensure_access_token_async) + setattr(VertexBase, '_original_ensure_access_token', VertexBase._ensure_access_token) + setattr(VertexBase, '_original_get_token_and_url', VertexBase._get_token_and_url) + + async def _mock_ensure_access_token_async(self, credentials, project_id, custom_llm_provider): + """Mock async auth method - returns fake token.""" + verbose_logger.debug("[GCS MOCK] Vertex AI auth: _ensure_access_token_async called") + return ("mock-gcs-token", "mock-project-id") + + def _mock_ensure_access_token(self, credentials, project_id, custom_llm_provider): + """Mock sync auth method - returns fake token.""" + verbose_logger.debug("[GCS MOCK] Vertex AI auth: _ensure_access_token called") + return ("mock-gcs-token", "mock-project-id") + + def _mock_get_token_and_url(self, model, auth_header, vertex_credentials, vertex_project, + vertex_location, gemini_api_key, stream, custom_llm_provider, api_base): + """Mock get_token_and_url - returns fake token.""" + verbose_logger.debug("[GCS MOCK] Vertex AI auth: _get_token_and_url called") + return ("mock-gcs-token", "https://storage.googleapis.com") + + # Patch the methods + VertexBase._ensure_access_token_async = _mock_ensure_access_token_async # type: ignore + VertexBase._ensure_access_token = _mock_ensure_access_token # type: ignore + VertexBase._get_token_and_url = _mock_get_token_and_url # type: ignore + + verbose_logger.debug("[GCS MOCK] Patched Vertex AI auth methods") + + +# should_use_gcs_mock is already created by the factory diff --git a/enterprise/litellm_enterprise/enterprise_callbacks/generic_api_callback.py b/litellm/integrations/generic_api/generic_api_callback.py similarity index 53% rename from enterprise/litellm_enterprise/enterprise_callbacks/generic_api_callback.py rename to litellm/integrations/generic_api/generic_api_callback.py index 7e259d4e19d1..1c62ce9fcc3e 100644 --- a/enterprise/litellm_enterprise/enterprise_callbacks/generic_api_callback.py +++ b/litellm/integrations/generic_api/generic_api_callback.py @@ -7,13 +7,15 @@ """ import asyncio +import json import os +import re import traceback -from litellm._uuid import uuid -from typing import Dict, List, Optional, Union +from typing import Dict, List, Literal, Optional, Union import litellm from litellm._logging import verbose_logger +from litellm._uuid import uuid from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.litellm_core_utils.safe_json_dumps import safe_dumps from litellm.llms.custom_httpx.http_handler import ( @@ -22,12 +24,85 @@ ) from litellm.types.utils import StandardLoggingPayload +API_EVENT_TYPES = Literal["llm_api_success", "llm_api_failure"] +LOG_FORMAT_TYPES = Literal["json_array", "ndjson", "single"] + + +def load_compatible_callbacks() -> Dict: + """ + Load the generic_api_compatible_callbacks.json file + + Returns: + Dict: Dictionary of compatible callbacks configuration + """ + try: + json_path = os.path.join( + os.path.dirname(__file__), "generic_api_compatible_callbacks.json" + ) + with open(json_path, "r") as f: + return json.load(f) + except Exception as e: + verbose_logger.warning( + f"Error loading generic_api_compatible_callbacks.json: {str(e)}" + ) + return {} + + +def is_callback_compatible(callback_name: str) -> bool: + """ + Check if a 
callback_name exists in the compatible callbacks list + + Args: + callback_name: Name of the callback to check + + Returns: + bool: True if callback_name exists in the compatible callbacks, False otherwise + """ + compatible_callbacks = load_compatible_callbacks() + return callback_name in compatible_callbacks + + +def get_callback_config(callback_name: str) -> Optional[Dict]: + """ + Get the configuration for a specific callback + + Args: + callback_name: Name of the callback to get config for + + Returns: + Optional[Dict]: Configuration dict for the callback, or None if not found + """ + compatible_callbacks = load_compatible_callbacks() + return compatible_callbacks.get(callback_name) + + +def substitute_env_variables(value: str) -> str: + """ + Replace {{environment_variables.VAR_NAME}} patterns with actual environment variable values + + Args: + value: String that may contain {{environment_variables.VAR_NAME}} patterns + + Returns: + str: String with environment variables substituted + """ + pattern = r"\{\{environment_variables\.([A-Z_]+)\}\}" + + def replace_env_var(match): + env_var_name = match.group(1) + return os.getenv(env_var_name, "") + + return re.sub(pattern, replace_env_var, value) + class GenericAPILogger(CustomBatchLogger): def __init__( self, endpoint: Optional[str] = None, headers: Optional[dict] = None, + event_types: Optional[List[API_EVENT_TYPES]] = None, + callback_name: Optional[str] = None, + log_format: Optional[LOG_FORMAT_TYPES] = None, **kwargs, ): """ @@ -36,7 +111,41 @@ def __init__( Args: endpoint: Optional[str] = None, headers: Optional[dict] = None, + event_types: Optional[List[API_EVENT_TYPES]] = None, + callback_name: Optional[str] = None - If provided, loads config from generic_api_compatible_callbacks.json + log_format: Optional[LOG_FORMAT_TYPES] = None - Format for log output: "json_array" (default), "ndjson", or "single" """ + ######################################################### + # Check if callback_name is provided and load config + ######################################################### + if callback_name: + if is_callback_compatible(callback_name): + verbose_logger.debug( + f"Loading configuration for callback: {callback_name}" + ) + callback_config = get_callback_config(callback_name) + + # Use config from JSON if not explicitly provided + if callback_config: + if endpoint is None and "endpoint" in callback_config: + endpoint = substitute_env_variables(callback_config["endpoint"]) + + if "headers" in callback_config: + headers = headers or {} + for key, value in callback_config["headers"].items(): + if key not in headers: + headers[key] = substitute_env_variables(value) + + if event_types is None and "event_types" in callback_config: + event_types = callback_config["event_types"] + + if log_format is None and "log_format" in callback_config: + log_format = callback_config["log_format"] + else: + verbose_logger.warning( + f"callback_name '{callback_name}' not found in generic_api_compatible_callbacks.json" + ) + ######################################################### # Init httpx client ######################################################### @@ -51,8 +160,18 @@ def __init__( self.headers: Dict = self._get_headers(headers) self.endpoint: str = endpoint + self.event_types: Optional[List[API_EVENT_TYPES]] = event_types + self.callback_name: Optional[str] = callback_name + + # Validate and store log_format + if log_format is not None and log_format not in ["json_array", "ndjson", "single"]: + raise ValueError( + f"Invalid log_format: 
{log_format}. Must be one of: 'json_array', 'ndjson', 'single'" + ) + self.log_format: LOG_FORMAT_TYPES = log_format or "json_array" + verbose_logger.debug( - f"in init GenericAPILogger, endpoint {self.endpoint}, headers {self.headers}" + f"in init GenericAPILogger, callback_name: {self.callback_name}, endpoint {self.endpoint}, headers {self.headers}, event_types: {self.event_types}, log_format: {self.log_format}" ) ######################################################### @@ -114,9 +233,9 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti Raises: Raises a NON Blocking verbose_logger.exception if an error occurs """ - from litellm.proxy.utils import _premium_user_check - _premium_user_check() + if self.event_types is not None and "llm_api_success" not in self.event_types: + return try: verbose_logger.debug( @@ -153,9 +272,8 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti - Creates a StandardLoggingPayload - Adds to batch queue """ - from litellm.proxy.utils import _premium_user_check - - _premium_user_check() + if self.event_types is not None and "llm_api_failure" not in self.event_types: + return try: verbose_logger.debug( @@ -185,25 +303,65 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti async def async_send_batch(self): """ Sends the batch of messages to Generic API Endpoint + + Supports three formats: + - json_array: Sends all logs as a JSON array (default) + - ndjson: Sends logs as newline-delimited JSON + - single: Sends each log as individual HTTP request in parallel """ try: if not self.log_queue: return verbose_logger.debug( - f"Generic API Logger - about to flush {len(self.log_queue)} events" + f"Generic API Logger - about to flush {len(self.log_queue)} events in '{self.log_format}' format" ) - # make POST request to Generic API Endpoint - response = await self.async_httpx_client.post( - url=self.endpoint, - headers=self.headers, - data=safe_dumps(self.log_queue), - ) + if self.log_format == "single": + # Send each log as individual HTTP request in parallel + tasks = [] + for log_entry in self.log_queue: + task = self.async_httpx_client.post( + url=self.endpoint, + headers=self.headers, + data=safe_dumps(log_entry), + ) + tasks.append(task) + + # Execute all requests in parallel + responses = await asyncio.gather(*tasks, return_exceptions=True) + + # Log results + for idx, result in enumerate(responses): + if isinstance(result, Exception): + verbose_logger.exception( + f"Generic API Logger - Error sending log {idx}: {result}" + ) + else: + # result is a Response object + verbose_logger.debug( + f"Generic API Logger - sent log {idx}, status: {result.status_code}" # type: ignore + ) + else: + # Format the payload based on log_format + if self.log_format == "json_array": + data = safe_dumps(self.log_queue) + elif self.log_format == "ndjson": + data = "\n".join(safe_dumps(log) for log in self.log_queue) + else: + raise ValueError(f"Unknown log_format: {self.log_format}") + + # Make POST request + response = await self.async_httpx_client.post( + url=self.endpoint, + headers=self.headers, + data=data, + ) - verbose_logger.debug( - f"Generic API Logger - sent batch to {self.endpoint}, status code {response.status_code}" - ) + verbose_logger.debug( + f"Generic API Logger - sent batch to {self.endpoint}, " + f"status: {response.status_code}, format: {self.log_format}" + ) except Exception as e: verbose_logger.exception( diff --git 
a/litellm/integrations/generic_api/generic_api_compatible_callbacks.json b/litellm/integrations/generic_api/generic_api_compatible_callbacks.json new file mode 100644 index 000000000000..13fe79ae671e --- /dev/null +++ b/litellm/integrations/generic_api/generic_api_compatible_callbacks.json @@ -0,0 +1,37 @@ +{ + "sample_callback": { + "event_types": ["llm_api_success", "llm_api_failure"], + "endpoint": "{{environment_variables.SAMPLE_CALLBACK_URL}}", + "headers": { + "Content-Type": "application/json", + "Authorization": "Bearer {{environment_variables.SAMPLE_CALLBACK_API_KEY}}" + }, + "environment_variables": ["SAMPLE_CALLBACK_URL", "SAMPLE_CALLBACK_API_KEY"] + }, + "rubrik": { + "event_types": ["llm_api_success"], + "endpoint": "{{environment_variables.RUBRIK_WEBHOOK_URL}}", + "headers": { + "Content-Type": "application/json", + "Authorization": "Bearer {{environment_variables.RUBRIK_API_KEY}}" + }, + "environment_variables": ["RUBRIK_API_KEY", "RUBRIK_WEBHOOK_URL"] + }, + "sumologic": { + "endpoint": "{{environment_variables.SUMOLOGIC_WEBHOOK_URL}}", + "headers": { + "Content-Type": "application/json" + }, + "environment_variables": ["SUMOLOGIC_WEBHOOK_URL"], + "log_format": "ndjson" + }, + "qualifire_eval": { + "event_types": ["llm_api_success"], + "endpoint": "{{environment_variables.QUALIFIRE_WEBHOOK_URL}}", + "headers": { + "Content-Type": "application/json", + "X-Qualifire-API-Key": "{{environment_variables.QUALIFIRE_API_KEY}}" + }, + "environment_variables": ["QUALIFIRE_API_KEY", "QUALIFIRE_WEBHOOK_URL"] + } +} diff --git a/litellm/integrations/generic_prompt_management/__init__.py b/litellm/integrations/generic_prompt_management/__init__.py new file mode 100644 index 000000000000..7466dc9c68de --- /dev/null +++ b/litellm/integrations/generic_prompt_management/__init__.py @@ -0,0 +1,80 @@ +"""Generic prompt management integration for LiteLLM.""" + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + from .generic_prompt_manager import GenericPromptManager + from litellm.types.prompts.init_prompts import PromptLiteLLMParams, PromptSpec + from litellm.integrations.custom_prompt_management import CustomPromptManagement + +from litellm.types.prompts.init_prompts import SupportedPromptIntegrations + +from .generic_prompt_manager import GenericPromptManager + +# Global instances +global_generic_prompt_config: Optional[dict] = None + + +def set_global_generic_prompt_config(config: dict) -> None: + """ + Set the global generic prompt configuration. + + Args: + config: Dictionary containing generic prompt configuration + - api_base: Base URL for the API + - api_key: Optional API key for authentication + - timeout: Request timeout in seconds (default: 30) + """ + import litellm + + litellm.global_generic_prompt_config = config # type: ignore + + +def prompt_initializer( + litellm_params: "PromptLiteLLMParams", prompt_spec: "PromptSpec" +) -> "CustomPromptManagement": + """ + Initialize a prompt from a generic prompt management API. 
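Circling back to the `generic_api_compatible_callbacks.json` entries added above: the `{{environment_variables.VAR_NAME}}` placeholders are resolved against the process environment at load time. The snippet below is a standalone sketch of that convention; it mirrors the regex idea in `substitute_env_variables` but is not the module's own function, and the URL value is a made-up example.

```python
import os
import re

_PATTERN = re.compile(r"\{\{environment_variables\.([A-Z_]+)\}\}")


def resolve_placeholders(value: str) -> str:
    """Replace {{environment_variables.VAR_NAME}} with the environment value (or '' if unset)."""
    return _PATTERN.sub(lambda m: os.getenv(m.group(1), ""), value)


if __name__ == "__main__":
    # Hypothetical endpoint value, set only for this demonstration.
    os.environ["SUMOLOGIC_WEBHOOK_URL"] = "https://collectors.example.com/receiver/v1/http/abc"
    endpoint_template = "{{environment_variables.SUMOLOGIC_WEBHOOK_URL}}"
    print(resolve_placeholders(endpoint_template))
    # -> https://collectors.example.com/receiver/v1/http/abc
```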
+ """ + prompt_id = getattr(litellm_params, "prompt_id", None) + + api_base = litellm_params.api_base + api_key = litellm_params.api_key + if not api_base: + raise ValueError("api_base is required in generic_prompt_config") + + provider_specific_query_params = litellm_params.provider_specific_query_params + + try: + generic_prompt_manager = GenericPromptManager( + api_base=api_base, + api_key=api_key, + prompt_id=prompt_id, + additional_provider_specific_query_params=provider_specific_query_params, + **litellm_params.model_dump( + exclude_none=True, + exclude={ + "prompt_id", + "api_key", + "provider_specific_query_params", + "api_base", + }, + ), + ) + + return generic_prompt_manager + except Exception as e: + raise e + + +prompt_initializer_registry = { + SupportedPromptIntegrations.GENERIC_PROMPT_MANAGEMENT.value: prompt_initializer, +} + +# Export public API +__all__ = [ + "GenericPromptManager", + "set_global_generic_prompt_config", + "global_generic_prompt_config", + "prompt_initializer_registry", +] diff --git a/litellm/integrations/generic_prompt_management/generic_prompt_manager.py b/litellm/integrations/generic_prompt_management/generic_prompt_manager.py new file mode 100644 index 000000000000..9490d9fde1c1 --- /dev/null +++ b/litellm/integrations/generic_prompt_management/generic_prompt_manager.py @@ -0,0 +1,501 @@ +""" +Generic prompt manager that integrates with LiteLLM's prompt management system. +Fetches prompts from any API that implements the /beta/litellm_prompt_management endpoint. +""" + +import json +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +import httpx + +from litellm.integrations.custom_prompt_management import CustomPromptManagement +from litellm.integrations.prompt_management_base import ( + PromptManagementBase, + PromptManagementClient, +) +from litellm.llms.custom_httpx.http_handler import ( + _get_httpx_client, + get_async_httpx_client, +) +from litellm.types.llms.custom_http import httpxSpecialProvider +from litellm.types.llms.openai import AllMessageValues +from litellm.types.prompts.init_prompts import PromptSpec +from litellm.types.utils import StandardCallbackDynamicParams + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj + + +class GenericPromptManager(CustomPromptManagement): + """ + Generic prompt manager that integrates with LiteLLM's prompt management system. + + This class enables using prompts from any API that implements the + /beta/litellm_prompt_management endpoint. + + Usage: + # Configure API access + generic_config = { + "api_base": "https://your-api.com", + "api_key": "your-api-key", # optional + "timeout": 30, # optional, defaults to 30 + } + + # Use with completion + response = litellm.completion( + model="generic_prompt/gpt-4", + prompt_id="my_prompt_id", + prompt_variables={"variable": "value"}, + generic_prompt_config=generic_config, + messages=[{"role": "user", "content": "Additional message"}] + ) + """ + + def __init__( + self, + api_base: str, + api_key: Optional[str] = None, + timeout: int = 30, + prompt_id: Optional[str] = None, + additional_provider_specific_query_params: Optional[Dict[str, Any]] = None, + **kwargs, + ): + """ + Initialize the Generic Prompt Manager. 
+ + Args: + api_base: Base URL for the API (e.g., "https://your-api.com") + api_key: Optional API key for authentication + timeout: Request timeout in seconds (default: 30) + prompt_id: Optional prompt ID to pre-load + """ + super().__init__(**kwargs) + self.api_base = api_base.rstrip("/") + self.api_key = api_key + self.timeout = timeout + self.prompt_id = prompt_id + self.additional_provider_specific_query_params = ( + additional_provider_specific_query_params + ) + self._prompt_cache: Dict[str, PromptManagementClient] = {} + + @property + def integration_name(self) -> str: + """Integration name used in model names like 'generic_prompt/gpt-4'.""" + return "generic_prompt" + + def _get_headers(self) -> Dict[str, str]: + """Get HTTP headers for API requests.""" + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + if self.api_key: + headers["Authorization"] = f"Bearer {self.api_key}" + return headers + + def _fetch_prompt_from_api( + self, prompt_id: Optional[str], prompt_spec: Optional[PromptSpec] + ) -> Dict[str, Any]: + """ + Fetch a prompt from the API. + + Args: + prompt_id: The ID of the prompt to fetch + + Returns: + The prompt data from the API + + Raises: + Exception: If the API request fails + """ + if prompt_id is None and prompt_spec is None: + raise ValueError("prompt_id or prompt_spec is required") + + url = f"{self.api_base}/beta/litellm_prompt_management" + params = { + "prompt_id": prompt_id, + **(self.additional_provider_specific_query_params or {}), + } + http_client = _get_httpx_client() + + try: + + response = http_client.get( + url, + params=params, + headers=self._get_headers(), + ) + + response.raise_for_status() + return response.json() + except httpx.HTTPError as e: + raise Exception(f"Failed to fetch prompt '{prompt_id}' from API: {e}") + except json.JSONDecodeError as e: + raise Exception(f"Failed to parse prompt response for '{prompt_id}': {e}") + + async def async_fetch_prompt_from_api( + self, prompt_id: Optional[str], prompt_spec: Optional[PromptSpec] + ) -> Dict[str, Any]: + """ + Fetch a prompt from the API asynchronously. + """ + if prompt_id is None and prompt_spec is None: + raise ValueError("prompt_id or prompt_spec is required") + + url = f"{self.api_base}/beta/litellm_prompt_management" + params = { + "prompt_id": prompt_id, + **( + prompt_spec.litellm_params.provider_specific_query_params + if prompt_spec + and prompt_spec.litellm_params.provider_specific_query_params + else {} + ), + } + + http_client = get_async_httpx_client( + llm_provider=httpxSpecialProvider.PromptManagement, + ) + + try: + response = await http_client.get( + url, + params=params, + headers=self._get_headers(), + ) + response.raise_for_status() + return response.json() + except httpx.HTTPError as e: + raise Exception(f"Failed to fetch prompt '{prompt_id}' from API: {e}") + except json.JSONDecodeError as e: + raise Exception(f"Failed to parse prompt response for '{prompt_id}': {e}") + + def _parse_api_response( + self, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], + api_response: Dict[str, Any], + ) -> PromptManagementClient: + """ + Parse the API response into a PromptManagementClient structure. 
+ + Expected API response format: + { + "prompt_id": "string", + "prompt_template": [ + {"role": "system", "content": "..."}, + {"role": "user", "content": "..."} + ], + "prompt_template_model": "gpt-4", # optional + "prompt_template_optional_params": { # optional + "temperature": 0.7, + "max_tokens": 100 + } + } + + Args: + prompt_id: The ID of the prompt + api_response: The response from the API + + Returns: + PromptManagementClient structure + """ + return PromptManagementClient( + prompt_id=prompt_id, + prompt_template=api_response.get("prompt_template", []), + prompt_template_model=api_response.get("prompt_template_model"), + prompt_template_optional_params=api_response.get( + "prompt_template_optional_params" + ), + completed_messages=None, + ) + + def should_run_prompt_management( + self, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], + dynamic_callback_params: StandardCallbackDynamicParams, + ) -> bool: + """ + Determine if prompt management should run based on the prompt_id. + + For Generic Prompt Manager, we always return True and handle the prompt loading + in the _compile_prompt_helper method. + """ + if prompt_id is not None or ( + prompt_spec is not None + and prompt_spec.litellm_params.provider_specific_query_params is not None + ): + return True + return False + + def _get_cache_key( + self, + prompt_id: Optional[str], + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> str: + return f"{prompt_id}:{prompt_label}:{prompt_version}" + + def _common_caching_logic( + self, + prompt_id: Optional[str], + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + prompt_variables: Optional[dict] = None, + ) -> Optional[PromptManagementClient]: + """ + Common caching logic for the prompt manager. + """ + # Check cache first + cache_key = self._get_cache_key(prompt_id, prompt_label, prompt_version) + if cache_key in self._prompt_cache: + cached_prompt = self._prompt_cache[cache_key] + # Return a copy with variables applied if needed + if prompt_variables: + return self._apply_variables(cached_prompt, prompt_variables) + return cached_prompt + return None + + def _compile_prompt_helper( + self, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + """ + Compile a prompt template into a PromptManagementClient structure. + + This method: + 1. Fetches the prompt from the API (with caching) + 2. Applies any prompt variables (if the API supports it) + 3. 
Returns the structured prompt data + + Args: + prompt_id: The ID of the prompt + prompt_variables: Variables to substitute in the template (optional) + dynamic_callback_params: Dynamic callback parameters + prompt_label: Optional label for the prompt version + prompt_version: Optional specific version number + + Returns: + PromptManagementClient structure + """ + cached_prompt = self._common_caching_logic( + prompt_id=prompt_id, + prompt_label=prompt_label, + prompt_version=prompt_version, + prompt_variables=prompt_variables, + ) + if cached_prompt: + return cached_prompt + + cache_key = self._get_cache_key(prompt_id, prompt_label, prompt_version) + try: + # Fetch from API + api_response = self._fetch_prompt_from_api(prompt_id, prompt_spec) + + # Parse the response + prompt_client = self._parse_api_response( + prompt_id, prompt_spec, api_response + ) + + # Cache the result + self._prompt_cache[cache_key] = prompt_client + + # Apply variables if provided + if prompt_variables: + prompt_client = self._apply_variables(prompt_client, prompt_variables) + + return prompt_client + + except Exception as e: + raise ValueError(f"Error compiling prompt '{prompt_id}': {e}") + + async def async_compile_prompt_helper( + self, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + + # Check cache first + cached_prompt = self._common_caching_logic( + prompt_id=prompt_id, + prompt_label=prompt_label, + prompt_version=prompt_version, + prompt_variables=prompt_variables, + ) + if cached_prompt: + return cached_prompt + + cache_key = self._get_cache_key(prompt_id, prompt_label, prompt_version) + + try: + # Fetch from API + + api_response = await self.async_fetch_prompt_from_api( + prompt_id=prompt_id, prompt_spec=prompt_spec + ) + + # Parse the response + prompt_client = self._parse_api_response( + prompt_id, prompt_spec, api_response + ) + + # Cache the result + self._prompt_cache[cache_key] = prompt_client + + # Apply variables if provided + if prompt_variables: + prompt_client = self._apply_variables(prompt_client, prompt_variables) + + return prompt_client + + except Exception as e: + raise ValueError( + f"Error compiling prompt '{prompt_id}': {e}, prompt_spec: {prompt_spec}" + ) + + def _apply_variables( + self, + prompt_client: PromptManagementClient, + variables: Dict[str, Any], + ) -> PromptManagementClient: + """ + Apply variables to the prompt template. + + This performs simple string substitution using {variable_name} syntax. 
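A sketch of a response body matching the shape documented for `_parse_api_response` above; `prompt_template_model` and `prompt_template_optional_params` are optional, and the values here are illustrative only.

example_api_response = {
    "prompt_id": "my_prompt_id",
    "prompt_template": [
        {"role": "system", "content": "You are a billing assistant."},
        {"role": "user", "content": "Summarize invoice {invoice_id}."},
    ],
    "prompt_template_model": "gpt-4",                                              # optional
    "prompt_template_optional_params": {"temperature": 0.7, "max_tokens": 100},    # optional
}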
+ + Args: + prompt_client: The prompt client structure + variables: Variables to substitute + + Returns: + Updated PromptManagementClient with variables applied + """ + # Create a copy of the prompt template with variables applied + updated_messages: List[AllMessageValues] = [] + for message in prompt_client["prompt_template"]: + updated_message = dict(message) # type: ignore + if "content" in updated_message and isinstance( + updated_message["content"], str + ): + content = updated_message["content"] + for key, value in variables.items(): + content = content.replace(f"{{{key}}}", str(value)) + content = content.replace( + f"{{{{{key}}}}}", str(value) + ) # Also support {{key}} + updated_message["content"] = content + updated_messages.append(updated_message) # type: ignore + + return PromptManagementClient( + prompt_id=prompt_client["prompt_id"], + prompt_template=updated_messages, + prompt_template_model=prompt_client["prompt_template_model"], + prompt_template_optional_params=prompt_client[ + "prompt_template_optional_params" + ], + completed_messages=None, + ) + + async def async_get_chat_completion_prompt( + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + litellm_logging_obj: "LiteLLMLoggingObj", + prompt_spec: Optional[PromptSpec] = None, + tools: Optional[List[Dict]] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ) -> Tuple[str, List[AllMessageValues], dict]: + """ + Get chat completion prompt and return processed model, messages, and parameters. + """ + + return await PromptManagementBase.async_get_chat_completion_prompt( + self, + model, + messages, + non_default_params, + prompt_id=prompt_id, + prompt_variables=prompt_variables, + litellm_logging_obj=litellm_logging_obj, + dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, + tools=tools, + prompt_label=prompt_label, + prompt_version=prompt_version, + ignore_prompt_manager_model=( + ignore_prompt_manager_model + or prompt_spec.litellm_params.ignore_prompt_manager_model + if prompt_spec + else False + ), + ignore_prompt_manager_optional_params=( + ignore_prompt_manager_optional_params + or prompt_spec.litellm_params.ignore_prompt_manager_optional_params + if prompt_spec + else False + ), + ) + + def get_chat_completion_prompt( + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ) -> Tuple[str, List[AllMessageValues], dict]: + """ + Get chat completion prompt and return processed model, messages, and parameters. 
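A small, self-contained sketch of the `{variable_name}` substitution performed by `_apply_variables` above (the method additionally attempts a `{{key}}` replacement after the single-brace pass); the template text and variable names are made up.

template = "Summarize invoice {invoice_id} for {customer}."
variables = {"invoice_id": "INV-42", "customer": "Acme"}

content = template
for key, value in variables.items():
    content = content.replace(f"{{{key}}}", str(value))

print(content)  # -> "Summarize invoice INV-42 for Acme."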
+ """ + return PromptManagementBase.get_chat_completion_prompt( + self, + model, + messages, + non_default_params, + prompt_id=prompt_id, + prompt_variables=prompt_variables, + dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, + prompt_label=prompt_label, + prompt_version=prompt_version, + ignore_prompt_manager_model=( + ignore_prompt_manager_model + or prompt_spec.litellm_params.ignore_prompt_manager_model + if prompt_spec + else False + ), + ignore_prompt_manager_optional_params=( + ignore_prompt_manager_optional_params + or prompt_spec.litellm_params.ignore_prompt_manager_optional_params + if prompt_spec + else False + ), + ) + + def clear_cache(self) -> None: + """Clear the prompt cache.""" + self._prompt_cache.clear() diff --git a/litellm/integrations/gitlab/gitlab_prompt_manager.py b/litellm/integrations/gitlab/gitlab_prompt_manager.py index 37013273cb0b..b073948d7685 100644 --- a/litellm/integrations/gitlab/gitlab_prompt_manager.py +++ b/litellm/integrations/gitlab/gitlab_prompt_manager.py @@ -2,41 +2,49 @@ GitLab prompt manager with configurable prompts folder. """ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + from jinja2 import DictLoader, Environment, select_autoescape from litellm.integrations.custom_prompt_management import CustomPromptManagement + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any +from litellm.integrations.gitlab.gitlab_client import GitLabClient from litellm.integrations.prompt_management_base import ( PromptManagementBase, PromptManagementClient, ) from litellm.types.llms.openai import AllMessageValues +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.utils import StandardCallbackDynamicParams -from litellm.integrations.gitlab.gitlab_client import GitLabClient - GITLAB_PREFIX = "gitlab::" + def encode_prompt_id(raw_id: str) -> str: """Convert GitLab path IDs like 'invoice/extract' → 'gitlab::invoice::extract'""" if raw_id.startswith(GITLAB_PREFIX): return raw_id # already encoded return f"{GITLAB_PREFIX}{raw_id.replace('/', '::')}" + def decode_prompt_id(encoded_id: str) -> str: """Convert 'gitlab::invoice::extract' → 'invoice/extract'""" if not encoded_id.startswith(GITLAB_PREFIX): return encoded_id - return encoded_id[len(GITLAB_PREFIX):].replace("::", "/") + return encoded_id[len(GITLAB_PREFIX) :].replace("::", "/") class GitLabPromptTemplate: def __init__( - self, - template_id: str, - content: str, - metadata: Dict[str, Any], - model: Optional[str] = None, + self, + template_id: str, + content: str, + metadata: Dict[str, Any], + model: Optional[str] = None, ): self.template_id = template_id self.content = content @@ -60,13 +68,12 @@ class GitLabTemplateManager: New: supports `prompts_path` (or `folder`) in gitlab_config to scope where prompts live. 
""" - def __init__( - self, - gitlab_config: Dict[str, Any], - prompt_id: Optional[str] = None, - ref: Optional[str] = None, - gitlab_client: Optional[GitLabClient] = None + self, + gitlab_config: Dict[str, Any], + prompt_id: Optional[str] = None, + ref: Optional[str] = None, + gitlab_client: Optional[GitLabClient] = None, ): self.gitlab_config = dict(gitlab_config) self.prompt_id = prompt_id @@ -78,9 +85,9 @@ def __init__( # Folder inside repo to look for prompts (e.g., "prompts" or "prompts/chat") self.prompts_path: str = ( - self.gitlab_config.get("prompts_path") - or self.gitlab_config.get("folder") - or "" + self.gitlab_config.get("prompts_path") + or self.gitlab_config.get("folder") + or "" ).strip("/") self.jinja_env = Environment( @@ -120,7 +127,9 @@ def _repo_path_to_id(self, repo_path: str) -> str: # ---------- loading ---------- - def _load_prompt_from_gitlab(self, prompt_id: str, *, ref: Optional[str] = None) -> None: + def _load_prompt_from_gitlab( + self, prompt_id: str, *, ref: Optional[str] = None + ) -> None: """Load a specific .prompt file from GitLab (scoped under prompts_path if set).""" try: # prompt_id = decode_prompt_id(prompt_id) @@ -130,7 +139,9 @@ def _load_prompt_from_gitlab(self, prompt_id: str, *, ref: Optional[str] = None) template = self._parse_prompt_file(prompt_content, prompt_id) self.prompts[prompt_id] = template except Exception as e: - raise Exception(f"Failed to load prompt '{encode_prompt_id(prompt_id)}' from GitLab: {e}") + raise Exception( + f"Failed to load prompt '{encode_prompt_id(prompt_id)}' from GitLab: {e}" + ) def load_all_prompts(self, *, recursive: bool = True) -> List[str]: """ @@ -146,9 +157,7 @@ def load_all_prompts(self, *, recursive: bool = True) -> List[str]: # ---------- parsing & rendering ---------- - def _parse_prompt_file( - self, content: str, prompt_id: str - ) -> GitLabPromptTemplate: + def _parse_prompt_file(self, content: str, prompt_id: str) -> GitLabPromptTemplate: if content.startswith("---"): parts = content.split("---", 2) if len(parts) >= 3: @@ -165,6 +174,7 @@ def _parse_prompt_file( if frontmatter_str: try: import yaml + metadata = yaml.safe_load(frontmatter_str) or {} except ImportError: metadata = self._parse_yaml_basic(frontmatter_str) @@ -199,7 +209,7 @@ def _parse_yaml_basic(self, yaml_str: str) -> Dict[str, Any]: return result def render_template( - self, template_id: str, variables: Optional[Dict[str, Any]] = None + self, template_id: str, variables: Optional[Dict[str, Any]] = None ) -> str: if template_id not in self.prompts: raise ValueError(f"Template '{template_id}' not found") @@ -244,9 +254,14 @@ def list_templates(self, *, recursive: bool = True) -> List[str]: ) # Classic returns GitLab tree entries; filter *.prompt blobs files = [] - for f in (raw or []): - if isinstance(f, dict) and f.get("type") == "blob" and str(f.get("path", "")).endswith(".prompt") and 'path' in f: - files.append(f['path']) + for f in raw or []: + if ( + isinstance(f, dict) + and f.get("type") == "blob" + and str(f.get("path", "")).endswith(".prompt") + and "path" in f + ): + files.append(f["path"]) # type: ignore return [self._repo_path_to_id(p) for p in files] @@ -266,11 +281,11 @@ class GitLabPromptManager(CustomPromptManagement): """ def __init__( - self, - gitlab_config: Dict[str, Any], - prompt_id: Optional[str] = None, - ref: Optional[str] = None, # tag/branch/SHA override - gitlab_client: Optional[GitLabClient] = None + self, + gitlab_config: Dict[str, Any], + prompt_id: Optional[str] = None, + ref: Optional[str] = None, 
# tag/branch/SHA override + gitlab_client: Optional[GitLabClient] = None, ): self.gitlab_config = gitlab_config self.prompt_id = prompt_id @@ -295,16 +310,16 @@ def prompt_manager(self) -> GitLabTemplateManager: gitlab_config=self.gitlab_config, prompt_id=self.prompt_id, ref=self._ref_override, - gitlab_client=self._injected_gitlab_client + gitlab_client=self._injected_gitlab_client, ) return self._prompt_manager def get_prompt_template( - self, - prompt_id: str, - prompt_variables: Optional[Dict[str, Any]] = None, - *, - ref: Optional[str] = None, + self, + prompt_id: str, + prompt_variables: Optional[Dict[str, Any]] = None, + *, + ref: Optional[str] = None, ) -> Tuple[str, Dict[str, Any]]: if prompt_id not in self.prompt_manager.prompts: self.prompt_manager._load_prompt_from_gitlab(prompt_id, ref=ref) @@ -326,15 +341,15 @@ def get_prompt_template( return rendered_prompt, metadata def pre_call_hook( - self, - user_id: Optional[str], - messages: List[AllMessageValues], - function_call: Optional[Union[Dict[str, Any], str]] = None, - litellm_params: Optional[Dict[str, Any]] = None, - prompt_id: Optional[str] = None, - prompt_variables: Optional[Dict[str, Any]] = None, - prompt_version: Optional[str] = None, - **kwargs, + self, + user_id: Optional[str], + messages: List[AllMessageValues], + function_call: Optional[Union[Dict[str, Any], str]] = None, + litellm_params: Optional[Dict[str, Any]] = None, + prompt_id: Optional[str] = None, + prompt_variables: Optional[Dict[str, Any]] = None, + prompt_version: Optional[str] = None, + **kwargs, ) -> Tuple[List[AllMessageValues], Optional[Dict[str, Any]]]: if not prompt_id: return messages, litellm_params @@ -358,16 +373,24 @@ def pre_call_hook( if prompt_metadata.get("model"): litellm_params["model"] = prompt_metadata["model"] - for param in ["temperature", "max_tokens", "top_p", "frequency_penalty", "presence_penalty"]: + for param in [ + "temperature", + "max_tokens", + "top_p", + "frequency_penalty", + "presence_penalty", + ]: if param in prompt_metadata: litellm_params[param] = prompt_metadata[param] return final_messages, litellm_params except Exception as e: import litellm - litellm._logging.verbose_proxy_logger.error(f"Error in GitLab prompt pre_call_hook: {e}") - return messages, litellm_params + litellm._logging.verbose_proxy_logger.error( + f"Error in GitLab prompt pre_call_hook: {e}" + ) + return messages, litellm_params def _parse_prompt_to_messages(self, prompt_content: str) -> List[AllMessageValues]: messages: List[AllMessageValues] = [] @@ -405,15 +428,15 @@ def _parse_prompt_to_messages(self, prompt_content: str) -> List[AllMessageValue return messages def post_call_hook( - self, - user_id: Optional[str], - response: Any, - input_messages: List[AllMessageValues], - function_call: Optional[Union[Dict[str, Any], str]] = None, - litellm_params: Optional[Dict[str, Any]] = None, - prompt_id: Optional[str] = None, - prompt_variables: Optional[Dict[str, Any]] = None, - **kwargs, + self, + user_id: Optional[str], + response: Any, + input_messages: List[AllMessageValues], + function_call: Optional[Union[Dict[str, Any], str]] = None, + litellm_params: Optional[Dict[str, Any]] = None, + prompt_id: Optional[str] = None, + prompt_variables: Optional[Dict[str, Any]] = None, + **kwargs, ) -> Any: return response @@ -436,27 +459,35 @@ def reload_prompts(self) -> None: _ = self.prompt_manager # trigger re-init/load def should_run_prompt_management( - self, - prompt_id: str, - dynamic_callback_params: StandardCallbackDynamicParams, + self, + prompt_id: 
Optional[str], + prompt_spec: Optional[PromptSpec], + dynamic_callback_params: StandardCallbackDynamicParams, ) -> bool: - return True + return prompt_id is not None def _compile_prompt_helper( - self, - prompt_id: str, - prompt_variables: Optional[dict], - dynamic_callback_params: StandardCallbackDynamicParams, - prompt_label: Optional[str] = None, - prompt_version: Optional[int] = None, + self, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, ) -> PromptManagementClient: + if prompt_id is None: + raise ValueError("prompt_id is required for GitLab prompt manager") + try: decoded_id = decode_prompt_id(prompt_id) if decoded_id not in self.prompt_manager.prompts: - git_ref = getattr(dynamic_callback_params, "extra", {}).get("git_ref") if hasattr(dynamic_callback_params, "extra") else None + git_ref = ( + getattr(dynamic_callback_params, "extra", {}).get("git_ref") + if hasattr(dynamic_callback_params, "extra") + else None + ) self.prompt_manager._load_prompt_from_gitlab(decoded_id, ref=git_ref) - rendered_prompt, prompt_metadata = self.get_prompt_template( prompt_id, prompt_variables ) @@ -465,7 +496,13 @@ def _compile_prompt_helper( template_model = prompt_metadata.get("model") optional_params: Dict[str, Any] = {} - for param in ["temperature", "max_tokens", "top_p", "frequency_penalty", "presence_penalty"]: + for param in [ + "temperature", + "max_tokens", + "top_p", + "frequency_penalty", + "presence_penalty", + ]: if param in prompt_metadata: optional_params[param] = prompt_metadata[param] @@ -479,16 +516,44 @@ def _compile_prompt_helper( except Exception as e: raise ValueError(f"Error compiling prompt '{prompt_id}': {e}") + async def async_compile_prompt_helper( + self, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + """ + Async version of compile prompt helper. Since GitLab operations use sync client, + this simply delegates to the sync version. 
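A sketch of a `.prompt` file as `_parse_prompt_file` expects it: YAML frontmatter between `---` markers carrying `model` and the sampling parameters read by `_compile_prompt_helper` above, followed by a Jinja2 template body. The field values and body text are illustrative assumptions.

EXAMPLE_PROMPT_FILE = """---
model: gpt-4
temperature: 0.2
max_tokens: 256
---
Summarize the invoice for {{ customer_name }}.
"""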
+ """ + if prompt_id is None: + raise ValueError("prompt_id is required for GitLab prompt manager") + + return self._compile_prompt_helper( + prompt_id=prompt_id, + prompt_spec=prompt_spec, + prompt_variables=prompt_variables, + dynamic_callback_params=dynamic_callback_params, + prompt_label=prompt_label, + prompt_version=prompt_version, + ) + def get_chat_completion_prompt( - self, - model: str, - messages: List[AllMessageValues], - non_default_params: dict, - prompt_id: Optional[str], - prompt_variables: Optional[dict], - dynamic_callback_params: StandardCallbackDynamicParams, - prompt_label: Optional[str] = None, - prompt_version: Optional[int] = None, + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, ) -> Tuple[str, List[AllMessageValues], dict]: return PromptManagementBase.get_chat_completion_prompt( self, @@ -498,8 +563,45 @@ def get_chat_completion_prompt( prompt_id, prompt_variables, dynamic_callback_params, - prompt_label, - prompt_version, + prompt_spec=prompt_spec, + prompt_label=prompt_label, + prompt_version=prompt_version, + ) + + async def async_get_chat_completion_prompt( + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + litellm_logging_obj: LiteLLMLoggingObj, + prompt_spec: Optional[PromptSpec] = None, + tools: Optional[List[Dict]] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ) -> Tuple[str, List[AllMessageValues], dict]: + """ + Async version - delegates to PromptManagementBase async implementation. 
+ """ + return await PromptManagementBase.async_get_chat_completion_prompt( + self, + model, + messages, + non_default_params, + prompt_id=prompt_id, + prompt_variables=prompt_variables, + litellm_logging_obj=litellm_logging_obj, + dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, + tools=tools, + prompt_label=prompt_label, + prompt_version=prompt_version, + ignore_prompt_manager_model=ignore_prompt_manager_model, + ignore_prompt_manager_optional_params=ignore_prompt_manager_optional_params, ) @@ -537,11 +639,11 @@ class GitLabPromptCache: """ def __init__( - self, - gitlab_config: Dict[str, Any], - *, - ref: Optional[str] = None, - gitlab_client: Optional[GitLabClient] = None, + self, + gitlab_config: Dict[str, Any], + *, + ref: Optional[str] = None, + gitlab_client: Optional[GitLabClient] = None, ) -> None: # Build a PromptManager (which internally builds TemplateManager + Client) self.prompt_manager = GitLabPromptManager( @@ -550,7 +652,9 @@ def __init__( ref=ref, gitlab_client=gitlab_client, ) - self.template_manager: GitLabTemplateManager = self.prompt_manager.prompt_manager + self.template_manager: GitLabTemplateManager = ( + self.prompt_manager.prompt_manager + ) # In-memory stores self._by_file: Dict[str, Dict[str, Any]] = {} @@ -565,7 +669,9 @@ def load_all(self, *, recursive: bool = True) -> Dict[str, Dict[str, Any]]: Scan GitLab for all .prompt files under prompts_path, load and parse each, and return the mapping of repo file path -> JSON-like dict. """ - ids = self.template_manager.list_templates(recursive=recursive) # IDs relative to prompts_path + ids = self.template_manager.list_templates( + recursive=recursive + ) # IDs relative to prompts_path for pid in ids: # Ensure template is loaded into TemplateManager if pid not in self.template_manager.prompts: @@ -579,7 +685,9 @@ def load_all(self, *, recursive: bool = True) -> Dict[str, Dict[str, Any]]: if tmpl is None: continue - file_path = self.template_manager._id_to_repo_path(pid) # "prompts/chat/..../file.prompt" + file_path = self.template_manager._id_to_repo_path( + pid + ) # "prompts/chat/..../file.prompt" entry = self._template_to_json(pid, tmpl) self._by_file[file_path] = entry @@ -623,7 +731,9 @@ def get_by_id(self, prompt_id: str) -> Optional[Dict[str, Any]]: # Internals # ------------------------- - def _template_to_json(self, prompt_id: str, tmpl: GitLabPromptTemplate) -> Dict[str, Any]: + def _template_to_json( + self, prompt_id: str, tmpl: GitLabPromptTemplate + ) -> Dict[str, Any]: """ Normalize a GitLabPromptTemplate into a JSON-like dict that is easy to serialize. """ @@ -637,12 +747,14 @@ def _template_to_json(self, prompt_id: str, tmpl: GitLabPromptTemplate) -> Dict[ optional_params = dict(tmpl.optional_params or {}) return { - "id": prompt_id, # e.g. "greet/hi" - "path": self.template_manager._id_to_repo_path(prompt_id), # e.g. "prompts/chat/greet/hi.prompt" - "content": tmpl.content, # rendered content (without frontmatter) - "metadata": md, # parsed frontmatter + "id": prompt_id, # e.g. "greet/hi" + "path": self.template_manager._id_to_repo_path( + prompt_id + ), # e.g. 
"prompts/chat/greet/hi.prompt" + "content": tmpl.content, # rendered content (without frontmatter) + "metadata": md, # parsed frontmatter "model": model, "temperature": temperature, "max_tokens": max_tokens, "optional_params": optional_params, - } \ No newline at end of file + } diff --git a/litellm/integrations/helicone.py b/litellm/integrations/helicone.py index 198cbaf40581..b996813b4e7c 100644 --- a/litellm/integrations/helicone.py +++ b/litellm/integrations/helicone.py @@ -4,6 +4,11 @@ import traceback import litellm +from litellm._logging import verbose_logger +from litellm.integrations.helicone_mock_client import ( + should_use_helicone_mock, + create_mock_helicone_client, +) class HeliconeLogger: @@ -22,6 +27,11 @@ class HeliconeLogger: def __init__(self): # Instance variables + self.is_mock_mode = should_use_helicone_mock() + if self.is_mock_mode: + create_mock_helicone_client() + verbose_logger.info("[HELICONE MOCK] Helicone logger initialized in mock mode") + self.provider_url = "https://api.openai.com/v1" self.key = os.getenv("HELICONE_API_KEY") self.api_base = os.getenv("HELICONE_API_BASE") or "https://api.hconeai.com" @@ -185,7 +195,10 @@ def log_success( } response = litellm.module_level_client.post(url, headers=headers, json=data) if response.status_code == 200: - print_verbose("Helicone Logging - Success!") + if self.is_mock_mode: + print_verbose("[HELICONE MOCK] Helicone Logging - Successfully mocked!") + else: + print_verbose("Helicone Logging - Success!") else: print_verbose( f"Helicone Logging - Error Request was not successful. Status Code: {response.status_code}" diff --git a/litellm/integrations/helicone_mock_client.py b/litellm/integrations/helicone_mock_client.py new file mode 100644 index 000000000000..0f4670a1d2cb --- /dev/null +++ b/litellm/integrations/helicone_mock_client.py @@ -0,0 +1,32 @@ +""" +Mock HTTP client for Helicone integration testing. + +This module intercepts Helicone API calls and returns successful mock responses, +allowing full code execution without making actual network calls. + +Usage: + Set HELICONE_MOCK=true in environment variables or config to enable mock mode. 
+""" + +from litellm.integrations.mock_client_factory import MockClientConfig, create_mock_client_factory + +# Create mock client using factory +# Helicone uses HTTPHandler which internally uses httpx.Client.send(), not httpx.Client.post() +_config = MockClientConfig( + name="HELICONE", + env_var="HELICONE_MOCK", + default_latency_ms=100, + default_status_code=200, + default_json_data={"status": "success"}, + url_matchers=[ + ".hconeai.com", + "hconeai.com", + ".helicone.ai", + "helicone.ai", + ], + patch_async_handler=False, + patch_sync_client=False, # HTTPHandler uses self.client.send(), not self.client.post() + patch_http_handler=True, # Patch HTTPHandler.post directly +) + +create_mock_helicone_client, should_use_helicone_mock = create_mock_client_factory(_config) diff --git a/litellm/integrations/humanloop.py b/litellm/integrations/humanloop.py index 8e60d3736e02..369df5ee0bd8 100644 --- a/litellm/integrations/humanloop.py +++ b/litellm/integrations/humanloop.py @@ -14,6 +14,7 @@ from litellm.llms.custom_httpx.http_handler import _get_httpx_client from litellm.secret_managers.main import get_secret_str from litellm.types.llms.openai import AllMessageValues +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.utils import StandardCallbackDynamicParams from .custom_logger import CustomLogger @@ -156,8 +157,11 @@ def get_chat_completion_prompt( prompt_id: Optional[str], prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, ) -> Tuple[ str, List[AllMessageValues], @@ -178,6 +182,7 @@ def get_chat_completion_prompt( prompt_id=prompt_id, prompt_variables=prompt_variables, dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, ) prompt_template = prompt_manager._get_prompt_from_id( diff --git a/litellm/integrations/langfuse/langfuse.py b/litellm/integrations/langfuse/langfuse.py index 7f807bb8b0c9..7bf97665fd2a 100644 --- a/litellm/integrations/langfuse/langfuse.py +++ b/litellm/integrations/langfuse/langfuse.py @@ -3,15 +3,33 @@ import os import traceback from datetime import datetime -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Union, + cast, +) from packaging.version import Version import litellm from litellm._logging import verbose_logger from litellm.constants import MAX_LANGFUSE_INITIALIZED_CLIENTS -from litellm.litellm_core_utils.core_helpers import safe_deep_copy +from litellm.litellm_core_utils.core_helpers import ( + safe_deep_copy, + reconstruct_model_name, + filter_exceptions_from_params, +) from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info +from litellm.integrations.langfuse.langfuse_mock_client import ( + create_mock_langfuse_client, + should_use_langfuse_mock, +) from litellm.llms.custom_httpx.http_handler import _get_httpx_client from litellm.secret_managers.main import str_to_bool from litellm.types.integrations.langfuse import * @@ -37,6 +55,41 @@ Langfuse = Any +def _extract_cache_read_input_tokens(usage_obj) -> int: + """ + Extract cache_read_input_tokens from usage object. + + Checks both: + 1. Top-level cache_read_input_tokens (Anthropic format) + 2. 
prompt_tokens_details.cached_tokens (Gemini, OpenAI format) + + See: https://github.com/BerriAI/litellm/issues/18520 + + Args: + usage_obj: Usage object from LLM response + + Returns: + int: Number of cached tokens read, defaults to 0 + """ + cache_read_input_tokens = usage_obj.get("cache_read_input_tokens") or 0 + + # Check prompt_tokens_details.cached_tokens (used by Gemini and other providers) + if hasattr(usage_obj, "prompt_tokens_details"): + prompt_tokens_details = getattr(usage_obj, "prompt_tokens_details", None) + if prompt_tokens_details is not None and hasattr( + prompt_tokens_details, "cached_tokens" + ): + cached_tokens = getattr(prompt_tokens_details, "cached_tokens", None) + if ( + cached_tokens is not None + and isinstance(cached_tokens, (int, float)) + and cached_tokens > 0 + ): + cache_read_input_tokens = cached_tokens + + return cache_read_input_tokens + + class LangFuseLogger: # Class variables or attributes def __init__( @@ -70,8 +123,14 @@ def __init__( self.langfuse_flush_interval = LangFuseLogger._get_langfuse_flush_interval( flush_interval ) - http_client = _get_httpx_client() - self.langfuse_client = http_client.client + + if should_use_langfuse_mock(): + self.langfuse_client = create_mock_langfuse_client() + self.is_mock_mode = True + else: + http_client = _get_httpx_client() + self.langfuse_client = http_client.client + self.is_mock_mode = False parameters = { "public_key": self.public_key, @@ -90,11 +149,15 @@ def __init__( # set the current langfuse project id in the environ # this is used by Alerting to link to the correct project - try: - project_id = self.Langfuse.client.projects.get().data[0].id - os.environ["LANGFUSE_PROJECT_ID"] = project_id - except Exception: - project_id = None + if self.is_mock_mode: + os.environ["LANGFUSE_PROJECT_ID"] = "mock-project-id" + verbose_logger.debug("Langfuse Mock: Using mock project ID") + else: + try: + project_id = self.Langfuse.client.projects.get().data[0].id + os.environ["LANGFUSE_PROJECT_ID"] = project_id + except Exception: + project_id = None if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None: upstream_langfuse_debug = ( @@ -228,6 +291,8 @@ def log_event_on_langfuse( functions = optional_params.pop("functions", None) tools = optional_params.pop("tools", None) + # Remove secret_fields to prevent leaking sensitive data (e.g., authorization headers) + optional_params.pop("secret_fields", None) if functions is not None: prompt["functions"] = functions if tools is not None: @@ -435,12 +500,17 @@ def _log_langfuse_v1( ) ) + custom_llm_provider = cast(Optional[str], kwargs.get("custom_llm_provider")) + model_name = reconstruct_model_name( + kwargs.get("model", ""), custom_llm_provider, metadata + ) + trace.generation( CreateGeneration( name=metadata.get("generation_name", "litellm-completion"), startTime=start_time, endTime=end_time, - model=kwargs["model"], + model=model_name, modelParameters=optional_params, prompt=input, completion=output, @@ -470,7 +540,6 @@ def _log_langfuse_v2( # noqa: PLR0915 verbose_logger.debug("Langfuse Layer Logging - logging to langfuse v2") try: - metadata = metadata or {} standard_logging_object: Optional[StandardLoggingPayload] = cast( Optional[StandardLoggingPayload], kwargs.get("standard_logging_object", None), @@ -534,12 +603,35 @@ def _log_langfuse_v2( # noqa: PLR0915 session_id = clean_metadata.pop("session_id", None) trace_name = cast(Optional[str], clean_metadata.pop("trace_name", None)) - trace_id = clean_metadata.pop("trace_id", litellm_call_id) + trace_id = 
clean_metadata.pop("trace_id", None) + # Use standard_logging_object.trace_id if available (when trace_id from metadata is None) + # This allows standard trace_id to be used when provided in standard_logging_object + if trace_id is None and standard_logging_object is not None: + trace_id = cast( + Optional[str], standard_logging_object.get("trace_id") + ) + # Fallback to litellm_call_id if no trace_id found + if trace_id is None: + trace_id = litellm_call_id existing_trace_id = clean_metadata.pop("existing_trace_id", None) + # If existing_trace_id is provided, use it as the trace_id to return + # This allows continuing an existing trace while still returning the correct trace_id + if existing_trace_id is not None: + trace_id = existing_trace_id update_trace_keys = cast(list, clean_metadata.pop("update_trace_keys", [])) debug = clean_metadata.pop("debug_langfuse", None) mask_input = clean_metadata.pop("mask_input", False) mask_output = clean_metadata.pop("mask_output", False) + # Look for masking function in the dedicated location first (set by scrub_sensitive_keys_in_metadata) + # Fall back to metadata for backwards compatibility + masking_function = litellm_params.get( + "_langfuse_masking_function" + ) or clean_metadata.pop("langfuse_masking_function", None) + + # Apply custom masking function if provided + if masking_function is not None and callable(masking_function): + input = self._apply_masking_function(input, masking_function) + output = self._apply_masking_function(output, masking_function) clean_metadata = redact_user_api_key_info(metadata=clean_metadata) @@ -613,9 +705,10 @@ def _log_langfuse_v2( # noqa: PLR0915 clean_metadata["litellm_response_cost"] = cost if standard_logging_object is not None: - clean_metadata["hidden_params"] = standard_logging_object[ - "hidden_params" - ] + hidden_params = standard_logging_object.get("hidden_params", {}) + clean_metadata["hidden_params"] = filter_exceptions_from_params( + hidden_params + ) if ( litellm.langfuse_default_tags is not None @@ -683,16 +776,35 @@ def _log_langfuse_v2( # noqa: PLR0915 _usage_obj = getattr(response_obj, "usage", None) if _usage_obj: + # Safely get usage values, defaulting None to 0 for Langfuse compatibility. + # Some providers may return null for token counts. 
+ prompt_tokens = getattr(_usage_obj, "prompt_tokens", None) or 0 + completion_tokens = ( + getattr(_usage_obj, "completion_tokens", None) or 0 + ) + total_tokens = getattr(_usage_obj, "total_tokens", None) or 0 + + cache_creation_input_tokens = ( + _usage_obj.get("cache_creation_input_tokens") or 0 + ) + cache_read_input_tokens = _extract_cache_read_input_tokens( + _usage_obj + ) + usage = { - "prompt_tokens": _usage_obj.prompt_tokens, - "completion_tokens": _usage_obj.completion_tokens, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, "total_cost": cost if self._supports_costs() else None, } - usage_details = LangfuseUsageDetails(input=_usage_obj.prompt_tokens, - output=_usage_obj.completion_tokens, - total=_usage_obj.total_tokens, - cache_creation_input_tokens=_usage_obj.get('cache_creation_input_tokens', 0), - cache_read_input_tokens=_usage_obj.get('cache_read_input_tokens', 0)) + # According to langfuse documentation: "the input value must be reduced by the number of cache_read_input_tokens" + input_tokens = prompt_tokens - cache_read_input_tokens + usage_details = LangfuseUsageDetails( + input=input_tokens, + output=completion_tokens, + total=total_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) generation_name = clean_metadata.pop("generation_name", None) if generation_name is None: @@ -716,12 +828,17 @@ def _log_langfuse_v2( # noqa: PLR0915 if system_fingerprint is not None: optional_params["system_fingerprint"] = system_fingerprint + custom_llm_provider = cast(Optional[str], kwargs.get("custom_llm_provider")) + model_name = reconstruct_model_name( + kwargs.get("model", ""), custom_llm_provider, metadata + ) + generation_params = { "name": generation_name, "id": clean_metadata.pop("generation_id", generation_id), "start_time": start_time, "end_time": end_time, - "model": kwargs["model"], + "model": model_name, "model_parameters": optional_params, "input": input if not mask_input else "redacted-by-litellm", "output": output if not mask_output else "redacted-by-litellm", @@ -753,7 +870,17 @@ def _log_langfuse_v2( # noqa: PLR0915 generation_client = trace.generation(**generation_params) - return generation_client.trace_id, generation_id + # Return the trace_id we set (which should be litellm_call_id when no explicit trace_id provided) + # We explicitly set trace_id in trace_params["id"], so langfuse should use it + # Verify langfuse accepted our trace_id; if it differs, log a warning but still return our intended value + # to match expected test behavior + if hasattr(generation_client, "trace_id") and generation_client.trace_id: + if generation_client.trace_id != trace_id: + verbose_logger.warning( + f"Langfuse trace_id mismatch: set {trace_id}, but langfuse returned {generation_client.trace_id}. " + "Using our intended trace_id for consistency." 
+ ) + return trace_id, generation_id except Exception: verbose_logger.error(f"Langfuse Layer Error - {traceback.format_exc()}") return None, None @@ -790,7 +917,7 @@ def _get_responses_api_content_for_langfuse( """ Get the responses API content for Langfuse logging """ - if hasattr(response_obj, 'output') and response_obj.output: + if hasattr(response_obj, "output") and response_obj.output: # ResponsesAPIResponse.output is a list of strings return response_obj.output else: @@ -847,6 +974,47 @@ def _supports_completion_start_time(self): """Check if current langfuse version supports completion start time""" return Version(self.langfuse_sdk_version) >= Version("2.7.3") + @staticmethod + def _apply_masking_function( + data: Any, masking_function: Callable[[Any], Any] + ) -> Any: + """ + Apply a masking function to data, handling different data types. + + Args: + data: The data to mask (can be str, dict, list, or None) + masking_function: A callable that takes data and returns masked data + + Returns: + The masked data + """ + if data is None: + return None + + try: + if isinstance(data, str): + return masking_function(data) + elif isinstance(data, dict): + masked_dict = {} + for key, value in data.items(): + masked_dict[key] = LangFuseLogger._apply_masking_function( + value, masking_function + ) + return masked_dict + elif isinstance(data, list): + return [ + LangFuseLogger._apply_masking_function(item, masking_function) + for item in data + ] + else: + # For other types, try to apply the function directly + return masking_function(data) + except Exception as e: + verbose_logger.warning( + f"Failed to apply masking function: {e}. Returning original data." + ) + return data + @staticmethod def _get_langfuse_flush_interval(flush_interval: int) -> int: """ @@ -880,29 +1048,44 @@ def _log_guardrail_information_as_span( guardrail_information = standard_logging_object.get( "guardrail_information", None ) - if guardrail_information is None: + if not guardrail_information: verbose_logger.debug( - "Not logging guardrail information as span because guardrail_information is None" + "Not logging guardrail information as span because guardrail_information is empty" ) return - span = trace.span( - name="guardrail", - input=guardrail_information.get("guardrail_request", None), - output=guardrail_information.get("guardrail_response", None), - metadata={ - "guardrail_name": guardrail_information.get("guardrail_name", None), - "guardrail_mode": guardrail_information.get("guardrail_mode", None), - "guardrail_masked_entity_count": guardrail_information.get( - "masked_entity_count", None - ), - }, - start_time=guardrail_information.get("start_time", None), # type: ignore - end_time=guardrail_information.get("end_time", None), # type: ignore - ) + if not isinstance(guardrail_information, list): + verbose_logger.debug( + "Not logging guardrail information as span because guardrail_information is not a list: %s", + type(guardrail_information), + ) + return + + for guardrail_entry in guardrail_information: + if not isinstance(guardrail_entry, dict): + verbose_logger.debug( + "Skipping guardrail entry with unexpected type: %s", + type(guardrail_entry), + ) + continue + + span = trace.span( + name="guardrail", + input=guardrail_entry.get("guardrail_request", None), + output=guardrail_entry.get("guardrail_response", None), + metadata={ + "guardrail_name": guardrail_entry.get("guardrail_name", None), + "guardrail_mode": guardrail_entry.get("guardrail_mode", None), + "guardrail_masked_entity_count": guardrail_entry.get( + 
"masked_entity_count", None + ), + }, + start_time=guardrail_entry.get("start_time", None), # type: ignore + end_time=guardrail_entry.get("end_time", None), # type: ignore + ) - verbose_logger.debug(f"Logged guardrail information as span: {span}") - span.end() + verbose_logger.debug(f"Logged guardrail information as span: {span}") + span.end() def _add_prompt_to_generation_params( diff --git a/litellm/integrations/langfuse/langfuse_mock_client.py b/litellm/integrations/langfuse/langfuse_mock_client.py new file mode 100644 index 000000000000..8ed6cff8d47f --- /dev/null +++ b/litellm/integrations/langfuse/langfuse_mock_client.py @@ -0,0 +1,35 @@ +""" +Mock httpx client for Langfuse integration testing. + +This module intercepts Langfuse API calls and returns successful mock responses, +allowing full code execution without making actual network calls. + +Usage: + Set LANGFUSE_MOCK=true in environment variables or config to enable mock mode. +""" + +import httpx +from litellm.integrations.mock_client_factory import MockClientConfig, create_mock_client_factory + +# Create mock client using factory +_config = MockClientConfig( + name="LANGFUSE", + env_var="LANGFUSE_MOCK", + default_latency_ms=100, + default_status_code=200, + default_json_data={"status": "success"}, + url_matchers=[ + ".langfuse.com", + "langfuse.com", + ], + patch_async_handler=False, + patch_sync_client=True, +) + +_create_mock_langfuse_client_internal, should_use_langfuse_mock = create_mock_client_factory(_config) + +# Langfuse needs to return an httpx.Client instance +def create_mock_langfuse_client(): + """Create and return an httpx.Client instance - the monkey-patch intercepts all calls.""" + _create_mock_langfuse_client_internal() + return httpx.Client() diff --git a/litellm/integrations/langfuse/langfuse_otel.py b/litellm/integrations/langfuse/langfuse_otel.py index 43d16b5e4cb5..08493a0e8ecf 100644 --- a/litellm/integrations/langfuse/langfuse_otel.py +++ b/litellm/integrations/langfuse/langfuse_otel.py @@ -5,6 +5,9 @@ from litellm._logging import verbose_logger from litellm.integrations.arize import _utils +from litellm.integrations.langfuse.langfuse_otel_attributes import ( + LangfuseLLMObsOTELAttributes, +) from litellm.integrations.opentelemetry import OpenTelemetry from litellm.types.integrations.langfuse_otel import ( LangfuseOtelConfig, @@ -33,27 +36,24 @@ LANGFUSE_CLOUD_US_ENDPOINT = "https://us.cloud.langfuse.com/api/public/otel" - class LangfuseOtelLogger(OpenTelemetry): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - @staticmethod def set_langfuse_otel_attributes(span: Span, kwargs, response_obj): """ Sets OpenTelemetry span attributes for Langfuse observability. Uses the same attribute setting logic as Arize Phoenix for consistency. """ - _utils.set_attributes(span, kwargs, response_obj) + + _utils.set_attributes(span, kwargs, response_obj, LangfuseLLMObsOTELAttributes) ######################################################### # Set Langfuse specific attributes ######################################################### LangfuseOtelLogger._set_langfuse_specific_attributes( - span=span, - kwargs=kwargs, - response_obj=response_obj + span=span, kwargs=kwargs, response_obj=response_obj ) return @@ -87,31 +87,10 @@ def _extract_langfuse_metadata(kwargs: dict) -> dict: return metadata @staticmethod - def _set_langfuse_specific_attributes(span: Span, kwargs, response_obj): - """ - Sets Langfuse specific metadata attributes onto the OTEL span. 
- - All keys supported by the vanilla Langfuse integration are mapped to - OTEL-safe attribute names defined in LangfuseSpanAttributes. Complex - values (lists/dicts) are serialised to JSON strings for OTEL - compatibility. - """ + def _set_metadata_attributes(span: Span, metadata: dict): + """Helper to set metadata attributes from mapping.""" from litellm.integrations.arize._utils import safe_set_attribute - from litellm.litellm_core_utils.safe_json_dumps import safe_dumps - - # 1) Environment variable override - langfuse_environment = os.environ.get("LANGFUSE_TRACING_ENVIRONMENT") - if langfuse_environment: - safe_set_attribute( - span, - LangfuseSpanAttributes.LANGFUSE_ENVIRONMENT.value, - langfuse_environment, - ) - # 2) Dynamic metadata from kwargs / headers - metadata = LangfuseOtelLogger._extract_langfuse_metadata(kwargs) - - # Mapping from metadata key -> OTEL attribute enum mapping = { "generation_name": LangfuseSpanAttributes.GENERATION_NAME, "generation_id": LangfuseSpanAttributes.GENERATION_ID, @@ -135,7 +114,6 @@ def _set_langfuse_specific_attributes(span: Span, kwargs, response_obj): for key, enum_attr in mapping.items(): if key in metadata and metadata[key] is not None: value = metadata[key] - # Lists / dicts must be stringified for OTEL if isinstance(value, (list, dict)): try: value = json.dumps(value) @@ -143,74 +121,138 @@ def _set_langfuse_specific_attributes(span: Span, kwargs, response_obj): value = str(value) safe_set_attribute(span, enum_attr.value, value) - # 3) Set observation input/output for better UI display - # - # These Langfuse-specific attributes provide better UI display, - # especially for tool calls and function calling. - # Set observation input (messages) - messages = kwargs.get("messages") - if messages: - safe_set_attribute( - span, - LangfuseSpanAttributes.OBSERVATION_INPUT.value, - safe_dumps(messages), - ) + @staticmethod + def _set_observation_output(span: Span, response_obj): + """Helper to set observation output attributes.""" + from litellm.integrations.arize._utils import safe_set_attribute + from litellm.litellm_core_utils.safe_json_dumps import safe_dumps - # Set observation output (response with tool_calls if present) - if response_obj and hasattr(response_obj, "get"): - choices = response_obj.get("choices", []) - if choices: - # Extract the first choice's message - first_choice = choices[0] - message = first_choice.get("message", {}) - - # Check if there are tool_calls - tool_calls = message.get("tool_calls") - if tool_calls: - # Transform tool_calls to Langfuse-expected format - transformed_tool_calls = [] - for tool_call in tool_calls: - function = tool_call.get("function", {}) - arguments_str = function.get("arguments", "{}") - - # Parse arguments from JSON string to object - try: - arguments_obj = json.loads(arguments_str) if isinstance(arguments_str, str) else arguments_str - except json.JSONDecodeError: - arguments_obj = {} - - # Create Langfuse-compatible tool call object + if not response_obj or not hasattr(response_obj, "get"): + return + + choices = response_obj.get("choices", []) + if choices: + first_choice = choices[0] + message = first_choice.get("message", {}) + tool_calls = message.get("tool_calls") + if tool_calls: + transformed_tool_calls = [] + for tool_call in tool_calls: + function = tool_call.get("function", {}) + arguments_str = function.get("arguments", "{}") + try: + arguments_obj = ( + json.loads(arguments_str) + if isinstance(arguments_str, str) + else arguments_str + ) + except json.JSONDecodeError: + 
arguments_obj = {} + langfuse_tool_call = { + "id": response_obj.get("id", ""), + "name": function.get("name", ""), + "call_id": tool_call.get("id", ""), + "type": "function_call", + "arguments": arguments_obj, + } + transformed_tool_calls.append(langfuse_tool_call) + safe_set_attribute( + span, + LangfuseSpanAttributes.OBSERVATION_OUTPUT.value, + safe_dumps(transformed_tool_calls), + ) + else: + output_data = {} + if message.get("role"): + output_data["role"] = message.get("role") + if message.get("content") is not None: + output_data["content"] = message.get("content") + if output_data: + safe_set_attribute( + span, + LangfuseSpanAttributes.OBSERVATION_OUTPUT.value, + safe_dumps(output_data), + ) + + output = response_obj.get("output", []) + if output: + output_items_data: list[dict] = [] + for item in output: + if hasattr(item, "type"): + item_type = item.type + if item_type == "reasoning" and hasattr(item, "summary"): + for summary in item.summary: + if hasattr(summary, "text"): + output_items_data.append( + { + "role": "reasoning_summary", + "content": summary.text, + } + ) + elif item_type == "message": + output_items_data.append( + { + "role": getattr(item, "role", "assistant"), + "content": getattr( + getattr(item, "content", [{}])[0], "text", "" + ), + } + ) + elif item_type == "function_call": + arguments_str = getattr(item, "arguments", "{}") + arguments_obj = ( + json.loads(arguments_str) + if isinstance(arguments_str, str) + else arguments_str + ) langfuse_tool_call = { - "id": response_obj.get("id", ""), - "name": function.get("name", ""), - "call_id": tool_call.get("id", ""), + "id": getattr(item, "id", ""), + "name": getattr(item, "name", ""), + "call_id": getattr(item, "call_id", ""), "type": "function_call", "arguments": arguments_obj, } - transformed_tool_calls.append(langfuse_tool_call) + output_items_data.append(langfuse_tool_call) + if output_items_data: + safe_set_attribute( + span, + LangfuseSpanAttributes.OBSERVATION_OUTPUT.value, + safe_dumps(output_items_data), + ) - # Set the observation output with transformed tool_calls - safe_set_attribute( - span, - LangfuseSpanAttributes.OBSERVATION_OUTPUT.value, - safe_dumps(transformed_tool_calls), - ) - else: - # No tool_calls, use regular content-based output - output_data = {} + @staticmethod + def _set_langfuse_specific_attributes(span: Span, kwargs, response_obj): + """ + Sets Langfuse specific metadata attributes onto the OTEL span. - if message.get("role"): - output_data["role"] = message.get("role") + All keys supported by the vanilla Langfuse integration are mapped to + OTEL-safe attribute names defined in LangfuseSpanAttributes. Complex + values (lists/dicts) are serialised to JSON strings for OTEL + compatibility. 
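An illustration of the reshaping `_set_observation_output` applies to tool calls before writing `langfuse.observation.output`; the IDs, function name, and arguments are invented.

openai_style_tool_call = {
    "id": "call_abc123",
    "function": {"name": "get_weather", "arguments": '{"city": "Paris"}'},
}

# becomes the Langfuse-shaped entry written onto the span:
langfuse_shaped_tool_call = {
    "id": "chatcmpl-789",            # the response object's id
    "name": "get_weather",
    "call_id": "call_abc123",        # the original tool call id
    "type": "function_call",
    "arguments": {"city": "Paris"},  # JSON string parsed into an object
}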
+ """ + from litellm.integrations.arize._utils import safe_set_attribute + from litellm.litellm_core_utils.safe_json_dumps import safe_dumps - if message.get("content") is not None: - output_data["content"] = message.get("content") + langfuse_environment = os.environ.get("LANGFUSE_TRACING_ENVIRONMENT") + if langfuse_environment: + safe_set_attribute( + span, + LangfuseSpanAttributes.LANGFUSE_ENVIRONMENT.value, + langfuse_environment, + ) - if output_data: - safe_set_attribute( - span, - LangfuseSpanAttributes.OBSERVATION_OUTPUT.value, - safe_dumps(output_data), - ) + metadata = LangfuseOtelLogger._extract_langfuse_metadata(kwargs) + LangfuseOtelLogger._set_metadata_attributes(span=span, metadata=metadata) + + messages = kwargs.get("messages") + if messages: + safe_set_attribute( + span, + LangfuseSpanAttributes.OBSERVATION_INPUT.value, + safe_dumps(messages), + ) + + LangfuseOtelLogger._set_observation_output(span=span, response_obj=response_obj) @staticmethod def _get_langfuse_otel_host() -> Optional[str]: @@ -262,8 +304,7 @@ def get_langfuse_otel_config() -> LangfuseOtelConfig: verbose_logger.debug(f"Using Langfuse US cloud endpoint: {endpoint}") auth_header = LangfuseOtelLogger._get_langfuse_authorization_header( - public_key=public_key, - secret_key=secret_key + public_key=public_key, secret_key=secret_key ) otlp_auth_headers = f"Authorization={auth_header}" @@ -274,7 +315,7 @@ def get_langfuse_otel_config() -> LangfuseOtelConfig: return LangfuseOtelConfig( otlp_auth_headers=otlp_auth_headers, protocol="otlp_http" ) - + @staticmethod def _get_langfuse_authorization_header(public_key: str, secret_key: str) -> str: """ @@ -282,11 +323,10 @@ def _get_langfuse_authorization_header(public_key: str, secret_key: str) -> str: """ auth_string = f"{public_key}:{secret_key}" auth_header = base64.b64encode(auth_string.encode()).decode() - return f'Basic {auth_header}' - + return f"Basic {auth_header}" + def construct_dynamic_otel_headers( - self, - standard_callback_dynamic_params: StandardCallbackDynamicParams + self, standard_callback_dynamic_params: StandardCallbackDynamicParams ) -> Optional[dict]: """ Construct dynamic Langfuse headers from standard callback dynamic params @@ -298,13 +338,29 @@ def construct_dynamic_otel_headers( """ dynamic_headers = {} - dynamic_langfuse_public_key = standard_callback_dynamic_params.get("langfuse_public_key") - dynamic_langfuse_secret_key = standard_callback_dynamic_params.get("langfuse_secret_key") + dynamic_langfuse_public_key = standard_callback_dynamic_params.get( + "langfuse_public_key" + ) + dynamic_langfuse_secret_key = standard_callback_dynamic_params.get( + "langfuse_secret_key" + ) if dynamic_langfuse_public_key and dynamic_langfuse_secret_key: auth_header = LangfuseOtelLogger._get_langfuse_authorization_header( public_key=dynamic_langfuse_public_key, - secret_key=dynamic_langfuse_secret_key + secret_key=dynamic_langfuse_secret_key, ) dynamic_headers["Authorization"] = auth_header - + return dynamic_headers + + async def async_service_success_hook(self, *args, **kwargs): + """ + Langfuse should not receive service success logs. + """ + pass + + async def async_service_failure_hook(self, *args, **kwargs): + """ + Langfuse should not receive service failure logs. 
+ """ + pass diff --git a/litellm/integrations/langfuse/langfuse_otel_attributes.py b/litellm/integrations/langfuse/langfuse_otel_attributes.py new file mode 100644 index 000000000000..fb4a0a6a36ce --- /dev/null +++ b/litellm/integrations/langfuse/langfuse_otel_attributes.py @@ -0,0 +1,108 @@ +""" +If the LLM Obs has any specific attributes to log request or response, we can add them here. + +Relevant Issue: https://github.com/BerriAI/litellm/issues/13764 +""" + +import json +from typing import TYPE_CHECKING, Any, Dict, Optional, Union + +from pydantic import BaseModel +from typing_extensions import override + +from litellm.integrations.opentelemetry_utils.base_otel_llm_obs_attributes import ( + BaseLLMObsOTELAttributes, + safe_set_attribute, +) +from litellm.types.llms.openai import HttpxBinaryResponseContent, ResponsesAPIResponse +from litellm.types.utils import ( + EmbeddingResponse, + ImageResponse, + ModelResponse, + RerankResponse, + TextCompletionResponse, + TranscriptionResponse, +) + +if TYPE_CHECKING: + from opentelemetry.trace import Span + + +def get_output_content_by_type( + response_obj: Union[ + None, + dict, + EmbeddingResponse, + ModelResponse, + TextCompletionResponse, + ImageResponse, + TranscriptionResponse, + RerankResponse, + HttpxBinaryResponseContent, + ResponsesAPIResponse, + list, + ], + kwargs: Optional[Dict[str, Any]] = None, +) -> str: + """ + Extract output content from response objects based on their type. + + This utility function handles the type-specific logic for converting + various response objects into appropriate output formats for Langfuse logging. + + Args: + response_obj: The response object returned by the function + kwargs: Optional keyword arguments containing call_type and other metadata + + Returns: + The formatted output content suitable for Langfuse logging, or None + """ + if response_obj is None: + return "" + + kwargs = kwargs or {} + call_type = kwargs.get("call_type", None) + + # Embedding responses - no output content + if call_type == "embedding" or isinstance(response_obj, EmbeddingResponse): + return "embedding-output" + + # Binary/Speech responses + if isinstance(response_obj, HttpxBinaryResponseContent): + return "speech-output" + + if isinstance(response_obj, BaseModel): + return response_obj.model_dump_json() + + if response_obj and ( + isinstance(response_obj, dict) or isinstance(response_obj, list) + ): + return json.dumps(response_obj) + else: + return "" + + +class LangfuseLLMObsOTELAttributes(BaseLLMObsOTELAttributes): + @staticmethod + @override + def set_messages(span: "Span", kwargs: Dict[str, Any]): + prompt = {"messages": kwargs.get("messages")} + optional_params = kwargs.get("optional_params", {}) + functions = optional_params.get("functions") + tools = optional_params.get("tools") + if functions is not None: + prompt["functions"] = functions + if tools is not None: + prompt["tools"] = tools + + input = prompt + safe_set_attribute(span, "langfuse.observation.input", json.dumps(input)) + + @staticmethod + @override + def set_response_output_messages(span: "Span", response_obj): + safe_set_attribute( + span, + "langfuse.observation.output", + get_output_content_by_type(response_obj), + ) diff --git a/litellm/integrations/langfuse/langfuse_prompt_management.py b/litellm/integrations/langfuse/langfuse_prompt_management.py index 58698ef35a5f..3986fc6a6efc 100644 --- a/litellm/integrations/langfuse/langfuse_prompt_management.py +++ b/litellm/integrations/langfuse/langfuse_prompt_management.py @@ -13,6 +13,7 @@ from 
litellm.integrations.prompt_management_base import PromptManagementClient from litellm.litellm_core_utils.asyncify import run_async_function from litellm.types.llms.openai import AllMessageValues, ChatCompletionSystemMessage +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.utils import StandardCallbackDynamicParams, StandardLoggingPayload from ...litellm_core_utils.specialty_caches.dynamic_logging_cache import ( @@ -136,7 +137,6 @@ def _get_prompt_from_id( prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, ) -> PROMPT_CLIENT: - prompt_client = langfuse_client.get_prompt( langfuse_prompt_id, label=prompt_label, version=prompt_version ) @@ -184,14 +184,13 @@ async def async_get_chat_completion_prompt( prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, litellm_logging_obj: LiteLLMLoggingObj, + prompt_spec: Optional[PromptSpec] = None, tools: Optional[List[Dict]] = None, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, - ) -> Tuple[ - str, - List[AllMessageValues], - dict, - ]: + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ) -> Tuple[str, List[AllMessageValues], dict,]: return self.get_chat_completion_prompt( model, messages, @@ -199,15 +198,21 @@ async def async_get_chat_completion_prompt( prompt_id, prompt_variables, dynamic_callback_params, + prompt_spec=prompt_spec, prompt_label=prompt_label, prompt_version=prompt_version, + ignore_prompt_manager_model=ignore_prompt_manager_model, + ignore_prompt_manager_optional_params=ignore_prompt_manager_optional_params, ) def should_run_prompt_management( self, - prompt_id: str, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], dynamic_callback_params: StandardCallbackDynamicParams, ) -> bool: + if prompt_id is None: + return False langfuse_client = langfuse_client_init( langfuse_public_key=dynamic_callback_params.get("langfuse_public_key"), langfuse_secret=dynamic_callback_params.get("langfuse_secret"), @@ -222,12 +227,16 @@ def should_run_prompt_management( def _compile_prompt_helper( self, - prompt_id: str, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, ) -> PromptManagementClient: + if prompt_id is None: + raise ValueError("prompt_id is required for Langfuse prompt management") + langfuse_client = langfuse_client_init( langfuse_public_key=dynamic_callback_params.get("langfuse_public_key"), langfuse_secret=dynamic_callback_params.get("langfuse_secret"), @@ -262,49 +271,88 @@ def _compile_prompt_helper( completed_messages=None, ) + async def async_compile_prompt_helper( + self, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + return self._compile_prompt_helper( + prompt_id=prompt_id, + prompt_variables=prompt_variables, + dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, + prompt_label=prompt_label, + prompt_version=prompt_version, + ) + def log_success_event(self, kwargs, response_obj, start_time, end_time): return run_async_function( self.async_log_success_event, kwargs, response_obj, start_time, end_time ) - async 
def async_log_success_event(self, kwargs, response_obj, start_time, end_time): - standard_callback_dynamic_params = kwargs.get( - "standard_callback_dynamic_params" - ) - langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request( - globalLangfuseLogger=self, - standard_callback_dynamic_params=standard_callback_dynamic_params, - in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache, - ) - langfuse_logger_to_use.log_event_on_langfuse( - kwargs=kwargs, - response_obj=response_obj, - start_time=start_time, - end_time=end_time, - user_id=kwargs.get("user", None), + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + return run_async_function( + self.async_log_failure_event, kwargs, response_obj, start_time, end_time ) + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + try: + standard_callback_dynamic_params = kwargs.get( + "standard_callback_dynamic_params" + ) + langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request( + globalLangfuseLogger=self, + standard_callback_dynamic_params=standard_callback_dynamic_params, + in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache, + ) + langfuse_logger_to_use.log_event_on_langfuse( + kwargs=kwargs, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + user_id=kwargs.get("user", None), + ) + except Exception as e: + from litellm._logging import verbose_logger + + verbose_logger.exception( + f"Langfuse Layer Error - Exception occurred while logging success event: {str(e)}" + ) + self.handle_callback_failure(callback_name="langfuse") + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - standard_callback_dynamic_params = kwargs.get( - "standard_callback_dynamic_params" - ) - langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request( - globalLangfuseLogger=self, - standard_callback_dynamic_params=standard_callback_dynamic_params, - in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache, - ) - standard_logging_object = cast( - Optional[StandardLoggingPayload], - kwargs.get("standard_logging_object", None), - ) - if standard_logging_object is None: - return - langfuse_logger_to_use.log_event_on_langfuse( - start_time=start_time, - end_time=end_time, - response_obj=None, - user_id=kwargs.get("user", None), - status_message=standard_logging_object["error_str"], - level="ERROR", - kwargs=kwargs, - ) + try: + standard_callback_dynamic_params = kwargs.get( + "standard_callback_dynamic_params" + ) + langfuse_logger_to_use = LangFuseHandler.get_langfuse_logger_for_request( + globalLangfuseLogger=self, + standard_callback_dynamic_params=standard_callback_dynamic_params, + in_memory_dynamic_logger_cache=in_memory_dynamic_logger_cache, + ) + standard_logging_object = cast( + Optional[StandardLoggingPayload], + kwargs.get("standard_logging_object", None), + ) + if standard_logging_object is None: + return + langfuse_logger_to_use.log_event_on_langfuse( + start_time=start_time, + end_time=end_time, + response_obj=None, + user_id=kwargs.get("user", None), + status_message=standard_logging_object["error_str"], + level="ERROR", + kwargs=kwargs, + ) + except Exception as e: + from litellm._logging import verbose_logger + + verbose_logger.exception( + f"Langfuse Layer Error - Exception occurred while logging failure event: {str(e)}" + ) + self.handle_callback_failure(callback_name="langfuse") diff --git a/litellm/integrations/langsmith.py b/litellm/integrations/langsmith.py index 
cc9b361b69df..ebd005f8804e 100644 --- a/litellm/integrations/langsmith.py +++ b/litellm/integrations/langsmith.py @@ -15,6 +15,10 @@ import litellm from litellm._logging import verbose_logger from litellm.integrations.custom_batch_logger import CustomBatchLogger +from litellm.integrations.langsmith_mock_client import ( + should_use_langsmith_mock, + create_mock_langsmith_client, +) from litellm.llms.custom_httpx.http_handler import ( get_async_httpx_client, httpxSpecialProvider, @@ -40,14 +44,22 @@ def __init__( langsmith_project: Optional[str] = None, langsmith_base_url: Optional[str] = None, langsmith_sampling_rate: Optional[float] = None, + langsmith_tenant_id: Optional[str] = None, **kwargs, ): self.flush_lock = asyncio.Lock() super().__init__(**kwargs, flush_lock=self.flush_lock) + self.is_mock_mode = should_use_langsmith_mock() + + if self.is_mock_mode: + create_mock_langsmith_client() + verbose_logger.debug("[LANGSMITH MOCK] LangSmith logger initialized in mock mode") + self.default_credentials = self.get_credentials_from_env( langsmith_api_key=langsmith_api_key, langsmith_project=langsmith_project, langsmith_base_url=langsmith_base_url, + langsmith_tenant_id=langsmith_tenant_id, ) self.sampling_rate: float = ( langsmith_sampling_rate @@ -76,6 +88,7 @@ def get_credentials_from_env( langsmith_api_key: Optional[str] = None, langsmith_project: Optional[str] = None, langsmith_base_url: Optional[str] = None, + langsmith_tenant_id: Optional[str] = None, ) -> LangsmithCredentialsObject: _credentials_api_key = langsmith_api_key or os.getenv("LANGSMITH_API_KEY") _credentials_project = ( @@ -86,11 +99,13 @@ def get_credentials_from_env( or os.getenv("LANGSMITH_BASE_URL") or "https://api.smith.langchain.com" ) + _credentials_tenant_id = langsmith_tenant_id or os.getenv("LANGSMITH_TENANT_ID") return LangsmithCredentialsObject( LANGSMITH_API_KEY=_credentials_api_key, LANGSMITH_BASE_URL=_credentials_base_url, LANGSMITH_PROJECT=_credentials_project, + LANGSMITH_TENANT_ID=_credentials_tenant_id, ) def _prepare_log_data( @@ -129,6 +144,13 @@ def _prepare_log_data( "metadata" ] # ensure logged metadata is json serializable + extra_metadata = dict(metadata) + requester_metadata = extra_metadata.get("requester_metadata") + if requester_metadata and isinstance(requester_metadata, dict): + for key in ("session_id", "thread_id", "conversation_id"): + if key in requester_metadata and key not in extra_metadata: + extra_metadata[key] = requester_metadata[key] + data = { "name": run_name, "run_type": "llm", # this should always be llm, since litellm always logs llm calls. 
Langsmith allow us to log "chain" @@ -138,7 +160,7 @@ def _prepare_log_data( "start_time": payload["startTime"], "end_time": payload["endTime"], "tags": payload["request_tags"], - "extra": metadata, + "extra": extra_metadata, } if payload["error_str"] is not None and payload["status"] == "failure": @@ -365,14 +387,19 @@ async def _log_batch_on_langsmith( """ langsmith_api_base = credentials["LANGSMITH_BASE_URL"] langsmith_api_key = credentials["LANGSMITH_API_KEY"] + langsmith_tenant_id = credentials.get("LANGSMITH_TENANT_ID") url = self._add_endpoint_to_url(langsmith_api_base, "runs/batch") headers = {"x-api-key": langsmith_api_key} + if langsmith_tenant_id: + headers["x-tenant-id"] = langsmith_tenant_id elements_to_log = [queue_object["data"] for queue_object in queue_objects] try: verbose_logger.debug( "Sending batch of %s runs to Langsmith", len(elements_to_log) ) + if self.is_mock_mode: + verbose_logger.debug("[LANGSMITH MOCK] Mock mode enabled - API calls will be intercepted") response = await self.async_httpx_client.post( url=url, json={"post": elements_to_log}, @@ -385,9 +412,14 @@ async def _log_batch_on_langsmith( f"Langsmith Error: {response.status_code} - {response.text}" ) else: - verbose_logger.debug( - f"Batch of {len(self.log_queue)} runs successfully created" - ) + if self.is_mock_mode: + verbose_logger.debug( + f"[LANGSMITH MOCK] Batch of {len(elements_to_log)} runs successfully mocked" + ) + else: + verbose_logger.debug( + f"Batch of {len(self.log_queue)} runs successfully created" + ) except httpx.HTTPStatusError as e: verbose_logger.exception( f"Langsmith HTTP Error: {e.response.status_code} - {e.response.text}" @@ -418,6 +450,7 @@ def _group_batches_by_credentials(self) -> Dict[CredentialsKey, BatchGroup]: api_key=credentials["LANGSMITH_API_KEY"], project=credentials["LANGSMITH_PROJECT"], base_url=credentials["LANGSMITH_BASE_URL"], + tenant_id=credentials.get("LANGSMITH_TENANT_ID"), ) if key not in log_queue_by_credentials: @@ -430,9 +463,9 @@ def _group_batches_by_credentials(self) -> Dict[CredentialsKey, BatchGroup]: return log_queue_by_credentials def _get_sampling_rate_to_use_for_request(self, kwargs: Dict[str, Any]) -> float: - standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = ( - kwargs.get("standard_callback_dynamic_params", None) - ) + standard_callback_dynamic_params: Optional[ + StandardCallbackDynamicParams + ] = kwargs.get("standard_callback_dynamic_params", None) sampling_rate: float = self.sampling_rate if standard_callback_dynamic_params is not None: _sampling_rate = standard_callback_dynamic_params.get( @@ -452,9 +485,9 @@ def _get_credentials_to_use_for_request( Otherwise, use the default credentials. 
""" - standard_callback_dynamic_params: Optional[StandardCallbackDynamicParams] = ( - kwargs.get("standard_callback_dynamic_params", None) - ) + standard_callback_dynamic_params: Optional[ + StandardCallbackDynamicParams + ] = kwargs.get("standard_callback_dynamic_params", None) if standard_callback_dynamic_params is not None: credentials = self.get_credentials_from_env( langsmith_api_key=standard_callback_dynamic_params.get( @@ -466,6 +499,9 @@ def _get_credentials_to_use_for_request( langsmith_base_url=standard_callback_dynamic_params.get( "langsmith_base_url", None ), + langsmith_tenant_id=standard_callback_dynamic_params.get( + "langsmith_tenant_id", None + ), ) else: credentials = self.default_credentials @@ -491,13 +527,16 @@ def _send_batch(self): def get_run_by_id(self, run_id): langsmith_api_key = self.default_credentials["LANGSMITH_API_KEY"] - langsmith_api_base = self.default_credentials["LANGSMITH_BASE_URL"] + langsmith_tenant_id = self.default_credentials.get("LANGSMITH_TENANT_ID") url = f"{langsmith_api_base}/runs/{run_id}" + headers = {"x-api-key": langsmith_api_key} + if langsmith_tenant_id: + headers["x-tenant-id"] = langsmith_tenant_id response = litellm.module_level_client.get( url=url, - headers={"x-api-key": langsmith_api_key}, + headers=headers, ) return response.json() diff --git a/litellm/integrations/langsmith_mock_client.py b/litellm/integrations/langsmith_mock_client.py new file mode 100644 index 000000000000..ef6029082317 --- /dev/null +++ b/litellm/integrations/langsmith_mock_client.py @@ -0,0 +1,29 @@ +""" +Mock client for LangSmith integration testing. + +This module intercepts LangSmith API calls and returns successful mock responses, +allowing full code execution without making actual network calls. + +Usage: + Set LANGSMITH_MOCK=true in environment variables or config to enable mock mode. +""" + +from litellm.integrations.mock_client_factory import MockClientConfig, create_mock_client_factory + +# Create mock client using factory +_config = MockClientConfig( + name="LANGSMITH", + env_var="LANGSMITH_MOCK", + default_latency_ms=100, + default_status_code=200, + default_json_data={"status": "success", "ids": ["mock-run-id"]}, + url_matchers=[ + ".smith.langchain.com", + "api.smith.langchain.com", + "smith.langchain.com", + ], + patch_async_handler=True, + patch_sync_client=False, +) + +create_mock_langsmith_client, should_use_langsmith_mock = create_mock_client_factory(_config) diff --git a/litellm/integrations/levo/README.md b/litellm/integrations/levo/README.md new file mode 100644 index 000000000000..cb18b1dbfb07 --- /dev/null +++ b/litellm/integrations/levo/README.md @@ -0,0 +1,125 @@ +# Levo AI Integration + +This integration enables sending LLM observability data to Levo AI using OpenTelemetry (OTLP) protocol. + +## Overview + +The Levo integration extends LiteLLM's OpenTelemetry support to automatically send traces to Levo's collector endpoint with proper authentication and routing headers. + +## Features + +- **Automatic OTLP Export**: Sends OpenTelemetry traces to Levo collector +- **Levo-Specific Headers**: Automatically includes `x-levo-organization-id` and `x-levo-workspace-id` for routing +- **Simple Configuration**: Just use `callbacks: ["levo"]` in your LiteLLM config +- **Environment-Based Setup**: Configure via environment variables + +## Quick Start + +### 1. Install Dependencies + +```bash +pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp-proto-http opentelemetry-exporter-otlp-proto-grpc +``` + +### 2. 
Configure LiteLLM + +Add to your `litellm_config.yaml`: + +```yaml +litellm_settings: + callbacks: ["levo"] +``` + +### 3. Set Environment Variables + +```bash +export LEVOAI_API_KEY="" +export LEVOAI_ORG_ID="" +export LEVOAI_WORKSPACE_ID="" +export LEVOAI_COLLECTOR_URL="" +``` + +### 4. Start LiteLLM + +```bash +litellm --config config.yaml +``` + +All LLM requests will now automatically be sent to Levo! + +## Configuration + +### Required Environment Variables + +| Variable | Description | +|----------|-------------| +| `LEVOAI_API_KEY` | Your Levo API key for authentication | +| `LEVOAI_ORG_ID` | Your Levo organization ID for routing | +| `LEVOAI_WORKSPACE_ID` | Your Levo workspace ID for routing | +| `LEVOAI_COLLECTOR_URL` | Full collector endpoint URL from Levo support | + +### Optional Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `LEVOAI_ENV_NAME` | Environment name for tagging traces | `None` | + +**Important**: The `LEVOAI_COLLECTOR_URL` is used exactly as provided. No path manipulation is performed. + +## How It Works + +1. **LevoLogger** extends LiteLLM's `OpenTelemetry` class +2. **Configuration** is read from environment variables via `get_levo_config()` +3. **OTLP Headers** are automatically set: + - `Authorization: Bearer {LEVOAI_API_KEY}` + - `x-levo-organization-id: {LEVOAI_ORG_ID}` + - `x-levo-workspace-id: {LEVOAI_WORKSPACE_ID}` +4. **Traces** are sent to the collector endpoint in OTLP format + +## Code Structure + +``` +litellm/integrations/levo/ +├── __init__.py # Exports LevoLogger +├── levo.py # LevoLogger implementation +└── README.md # This file +``` + +### Key Classes + +- **LevoLogger**: Extends `OpenTelemetry`, handles Levo-specific configuration +- **LevoConfig**: Pydantic model for Levo configuration (defined in `levo.py`) + +## Testing + +See the test files in `tests/test_litellm/integrations/levo/`: +- `test_levo.py`: Unit tests for configuration +- `test_levo_integration.py`: Integration tests for callback registration + +## Error Handling + +The integration validates all required environment variables at initialization: +- Missing `LEVOAI_API_KEY`: Raises `ValueError` with clear message +- Missing `LEVOAI_ORG_ID`: Raises `ValueError` with clear message +- Missing `LEVOAI_WORKSPACE_ID`: Raises `ValueError` with clear message +- Missing `LEVOAI_COLLECTOR_URL`: Raises `ValueError` with clear message + +## Integration with LiteLLM + +The Levo callback is registered in: +- `litellm/litellm_core_utils/custom_logger_registry.py`: Maps `"levo"` to `LevoLogger` +- `litellm/litellm_core_utils/litellm_logging.py`: Instantiates `LevoLogger` when `callbacks: ["levo"]` is used +- `litellm/__init__.py`: Added to `_custom_logger_compatible_callbacks_literal` + +## Documentation + +For detailed documentation, see: +- [LiteLLM Levo Integration Docs](../../../../docs/my-website/docs/observability/levo_integration.md) +- [Levo Documentation](https://docs.levo.ai) + +## Support + +For issues or questions: +- LiteLLM Issues: https://github.com/BerriAI/litellm/issues +- Levo Support: support@levo.ai + diff --git a/litellm/integrations/levo/__init__.py b/litellm/integrations/levo/__init__.py new file mode 100644 index 000000000000..7f4f84437d41 --- /dev/null +++ b/litellm/integrations/levo/__init__.py @@ -0,0 +1,3 @@ +from litellm.integrations.levo.levo import LevoLogger + +__all__ = ["LevoLogger"] diff --git a/litellm/integrations/levo/levo.py b/litellm/integrations/levo/levo.py new file mode 100644 index 
000000000000..562f2fd90684 --- /dev/null +++ b/litellm/integrations/levo/levo.py @@ -0,0 +1,117 @@ +import os +from typing import TYPE_CHECKING, Any, Optional, Union + +from litellm.integrations.opentelemetry import OpenTelemetry + +if TYPE_CHECKING: + from opentelemetry.trace import Span as _Span + + from litellm.integrations.opentelemetry import OpenTelemetryConfig as _OpenTelemetryConfig + from litellm.types.integrations.arize import Protocol as _Protocol + + Protocol = _Protocol + OpenTelemetryConfig = _OpenTelemetryConfig + Span = Union[_Span, Any] +else: + Protocol = Any + OpenTelemetryConfig = Any + Span = Any + + +class LevoConfig: + """Configuration for Levo OTLP integration.""" + + def __init__( + self, + otlp_auth_headers: Optional[str], + protocol: Protocol, + endpoint: str, + ): + self.otlp_auth_headers = otlp_auth_headers + self.protocol = protocol + self.endpoint = endpoint + + +class LevoLogger(OpenTelemetry): + """Levo Logger that extends OpenTelemetry for OTLP integration.""" + + @staticmethod + def get_levo_config() -> LevoConfig: + """ + Retrieves the Levo configuration based on environment variables. + + Returns: + LevoConfig: Configuration object containing Levo OTLP settings. + + Raises: + ValueError: If required environment variables are missing. + """ + # Required environment variables + api_key = os.environ.get("LEVOAI_API_KEY", None) + org_id = os.environ.get("LEVOAI_ORG_ID", None) + workspace_id = os.environ.get("LEVOAI_WORKSPACE_ID", None) + collector_url = os.environ.get("LEVOAI_COLLECTOR_URL", None) + + # Validate required env vars + if not api_key: + raise ValueError( + "LEVOAI_API_KEY environment variable is required for Levo integration." + ) + if not org_id: + raise ValueError( + "LEVOAI_ORG_ID environment variable is required for Levo integration." + ) + if not workspace_id: + raise ValueError( + "LEVOAI_WORKSPACE_ID environment variable is required for Levo integration." + ) + if not collector_url: + raise ValueError( + "LEVOAI_COLLECTOR_URL environment variable is required for Levo integration. " + "Please contact Levo support to get your collector URL." + ) + + # Use collector URL exactly as provided by the user + endpoint = collector_url + protocol: Protocol = "otlp_http" + + # Build OTLP headers string + # Format: Authorization=Bearer {api_key},x-levo-organization-id={org_id},x-levo-workspace-id={workspace_id} + headers_parts = [f"Authorization=Bearer {api_key}"] + headers_parts.append(f"x-levo-organization-id={org_id}") + headers_parts.append(f"x-levo-workspace-id={workspace_id}") + + otlp_auth_headers = ",".join(headers_parts) + + return LevoConfig( + otlp_auth_headers=otlp_auth_headers, + protocol=protocol, + endpoint=endpoint, + ) + + async def async_health_check(self): + """ + Health check for Levo integration. + + Returns: + dict: Health status with status and message/error_message keys. 
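+
+ Example (illustrative; mirrors the return values in the code below):
+ {"status": "healthy", "message": "Levo credentials are configured properly"}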
+ """ + try: + config = self.get_levo_config() + + if not config.otlp_auth_headers: + return { + "status": "unhealthy", + "error_message": "LEVOAI_API_KEY environment variable not set", + } + + return { + "status": "healthy", + "message": "Levo credentials are configured properly", + } + except ValueError as e: + return { + "status": "unhealthy", + "error_message": str(e), + } + diff --git a/litellm/integrations/mlflow.py b/litellm/integrations/mlflow.py index b348737868d6..6378e55f7e1d 100644 --- a/litellm/integrations/mlflow.py +++ b/litellm/integrations/mlflow.py @@ -129,8 +129,11 @@ def _handle_stream_event(self, kwargs, response_obj, start_time, end_time): self._add_chunk_events(span, response_obj) # If this is the final chunk, end the span. The final chunk - # has complete_streaming_response that gathers the full response. - if final_response := kwargs.get("complete_streaming_response"): + # has the assembled streaming response (key differs between sync/async paths). + final_response = kwargs.get("complete_streaming_response") or kwargs.get( + "async_complete_streaming_response" + ) + if final_response: end_time_ns = int(end_time.timestamp() * 1e9) self._extract_and_set_chat_attributes(span, kwargs, final_response) @@ -153,7 +156,9 @@ def _add_chunk_events(self, span, response_obj): span.add_event( SpanEvent( name="streaming_chunk", - attributes={"delta": json.dumps(choice.delta.model_dump())}, + attributes={ + "delta": json.dumps(choice.delta.model_dump, default=str) + }, ) ) except Exception: diff --git a/litellm/integrations/mock_client_factory.py b/litellm/integrations/mock_client_factory.py new file mode 100644 index 000000000000..2f04fae9f769 --- /dev/null +++ b/litellm/integrations/mock_client_factory.py @@ -0,0 +1,216 @@ +""" +Factory for creating mock HTTP clients for integration testing. + +This module provides a simple factory pattern to create mock clients that intercept +API calls and return successful mock responses, allowing full code execution without +making actual network calls. 
+""" + +import httpx +import json +import asyncio +from datetime import timedelta +from typing import Dict, Optional, List, cast +from dataclasses import dataclass + +from litellm._logging import verbose_logger + + +@dataclass +class MockClientConfig: + """Configuration for creating a mock client.""" + name: str # e.g., "GCS", "LANGFUSE", "LANGSMITH", "DATADOG" + env_var: str # e.g., "GCS_MOCK", "LANGFUSE_MOCK" + default_latency_ms: int = 100 # Default mock latency in milliseconds + default_status_code: int = 200 # Default HTTP status code + default_json_data: Optional[Dict] = None # Default JSON response data + url_matchers: Optional[List[str]] = None # List of strings to match in URLs (e.g., ["storage.googleapis.com"]) + patch_async_handler: bool = True # Whether to patch AsyncHTTPHandler.post + patch_sync_client: bool = False # Whether to patch httpx.Client.post + patch_http_handler: bool = False # Whether to patch HTTPHandler.post (for sync calls that use HTTPHandler) + + def __post_init__(self): + """Ensure url_matchers is a list.""" + if self.url_matchers is None: + self.url_matchers = [] + + +class MockResponse: + """Generic mock httpx.Response that satisfies API requirements.""" + + def __init__(self, status_code: int = 200, json_data: Optional[Dict] = None, url: Optional[str] = None, elapsed_seconds: float = 0.0): + self.status_code = status_code + self._json_data = json_data or {"status": "success"} + self.headers = httpx.Headers({}) + self.is_success = status_code < 400 + self.is_error = status_code >= 400 + self.is_redirect = 300 <= status_code < 400 + self.url = httpx.URL(url) if url else httpx.URL("") + self.elapsed = timedelta(seconds=elapsed_seconds) + self._text = json.dumps(self._json_data) if json_data else "" + self._content = self._text.encode("utf-8") + + @property + def text(self) -> str: + """Return response text.""" + return self._text + + @property + def content(self) -> bytes: + """Return response content.""" + return self._content + + def json(self) -> Dict: + """Return JSON response data.""" + return self._json_data + + def read(self) -> bytes: + """Read response content.""" + return self._content + + def raise_for_status(self): + """Raise exception for error status codes.""" + if self.status_code >= 400: + raise Exception(f"HTTP {self.status_code}") + + +def _is_url_match(url, matchers: List[str]) -> bool: + """Check if URL matches any of the provided matchers.""" + try: + parsed_url = httpx.URL(url) if isinstance(url, str) else url + url_str = str(parsed_url).lower() + hostname = parsed_url.host or "" + + for matcher in matchers: + if matcher.lower() in url_str or matcher.lower() in hostname.lower(): + return True + + # Also check for localhost with matcher in path + if hostname in ("localhost", "127.0.0.1"): + for matcher in matchers: + if matcher.lower() in url_str: + return True + + return False + except Exception: + return False + + +def create_mock_client_factory(config: MockClientConfig): # noqa: PLR0915 + """ + Factory function that creates mock client functions based on configuration. 
+ + Returns: + tuple: (create_mock_client_func, should_use_mock_func) + """ + # Store original methods for restoration + _original_async_handler_post = None + _original_sync_client_post = None + _original_http_handler_post = None + _mocks_initialized = False + + # Calculate mock latency + import os + latency_env = f"{config.name.upper()}_MOCK_LATENCY_MS" + _MOCK_LATENCY_SECONDS = float(os.getenv(latency_env, str(config.default_latency_ms))) / 1000.0 + + # Create URL matcher function + def _is_mock_url(url) -> bool: + # url_matchers is guaranteed to be a list after __post_init__ + return _is_url_match(url, cast(List[str], config.url_matchers)) + + # Create async handler mock + async def _mock_async_handler_post(self, url, data=None, json=None, params=None, headers=None, timeout=None, stream=False, logging_obj=None, files=None, content=None): + """Monkey-patched AsyncHTTPHandler.post that intercepts API calls.""" + if isinstance(url, str) and _is_mock_url(url): + verbose_logger.info(f"[{config.name} MOCK] POST to {url}") + await asyncio.sleep(_MOCK_LATENCY_SECONDS) + return MockResponse( + status_code=config.default_status_code, + json_data=config.default_json_data, + url=url, + elapsed_seconds=_MOCK_LATENCY_SECONDS + ) + if _original_async_handler_post is not None: + return await _original_async_handler_post(self, url=url, data=data, json=json, params=params, headers=headers, timeout=timeout, stream=stream, logging_obj=logging_obj, files=files, content=content) + raise RuntimeError("Original AsyncHTTPHandler.post not available") + + # Create sync client mock + def _mock_sync_client_post(self, url, **kwargs): + """Monkey-patched httpx.Client.post that intercepts API calls.""" + if _is_mock_url(url): + verbose_logger.info(f"[{config.name} MOCK] POST to {url} (sync)") + return MockResponse( + status_code=config.default_status_code, + json_data=config.default_json_data, + url=url, + elapsed_seconds=_MOCK_LATENCY_SECONDS + ) + if _original_sync_client_post is not None: + return _original_sync_client_post(self, url, **kwargs) + + # Create HTTPHandler mock (for sync calls that use HTTPHandler.post) + def _mock_http_handler_post(self, url, data=None, json=None, params=None, headers=None, timeout=None, stream=False, files=None, content=None, logging_obj=None): + """Monkey-patched HTTPHandler.post that intercepts API calls.""" + if isinstance(url, str) and _is_mock_url(url): + verbose_logger.info(f"[{config.name} MOCK] POST to {url}") + import time + time.sleep(_MOCK_LATENCY_SECONDS) + return MockResponse( + status_code=config.default_status_code, + json_data=config.default_json_data, + url=url, + elapsed_seconds=_MOCK_LATENCY_SECONDS + ) + if _original_http_handler_post is not None: + return _original_http_handler_post(self, url=url, data=data, json=json, params=params, headers=headers, timeout=timeout, stream=stream, files=files, content=content, logging_obj=logging_obj) + raise RuntimeError("Original HTTPHandler.post not available") + + # Create mock client initialization function + def create_mock_client(): + """Initialize the mock client by patching HTTP handlers.""" + nonlocal _original_async_handler_post, _original_sync_client_post, _original_http_handler_post, _mocks_initialized + + if _mocks_initialized: + return + + verbose_logger.debug(f"[{config.name} MOCK] Initializing {config.name} mock client...") + + if config.patch_async_handler and _original_async_handler_post is None: + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler + _original_async_handler_post = 
AsyncHTTPHandler.post + AsyncHTTPHandler.post = _mock_async_handler_post # type: ignore + verbose_logger.debug(f"[{config.name} MOCK] Patched AsyncHTTPHandler.post") + + if config.patch_sync_client and _original_sync_client_post is None: + _original_sync_client_post = httpx.Client.post + httpx.Client.post = _mock_sync_client_post # type: ignore + verbose_logger.debug(f"[{config.name} MOCK] Patched httpx.Client.post") + + if config.patch_http_handler and _original_http_handler_post is None: + from litellm.llms.custom_httpx.http_handler import HTTPHandler + _original_http_handler_post = HTTPHandler.post + HTTPHandler.post = _mock_http_handler_post # type: ignore + verbose_logger.debug(f"[{config.name} MOCK] Patched HTTPHandler.post") + + verbose_logger.debug(f"[{config.name} MOCK] Mock latency set to {_MOCK_LATENCY_SECONDS*1000:.0f}ms") + verbose_logger.debug(f"[{config.name} MOCK] {config.name} mock client initialization complete") + + _mocks_initialized = True + + # Create should_use_mock function + def should_use_mock() -> bool: + """Determine if mock mode should be enabled.""" + import os + from litellm.secret_managers.main import str_to_bool + + mock_mode = os.getenv(config.env_var, "false") + result = str_to_bool(mock_mode) + result = bool(result) if result is not None else False + + if result: + verbose_logger.info(f"{config.name} Mock Mode: ENABLED - API calls will be mocked") + + return result + + return create_mock_client, should_use_mock diff --git a/litellm/integrations/opentelemetry.py b/litellm/integrations/opentelemetry.py index 9315384ad964..18898be7dced 100644 --- a/litellm/integrations/opentelemetry.py +++ b/litellm/integrations/opentelemetry.py @@ -7,13 +7,20 @@ from litellm._logging import verbose_logger from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.safe_json_dumps import safe_dumps +from litellm.secret_managers.main import get_secret_bool from litellm.types.services import ServiceLoggerPayload from litellm.types.utils import ( ChatCompletionMessageToolCall, + CostBreakdown, Function, + LLMResponseTypes, StandardCallbackDynamicParams, StandardLoggingPayload, ) +from litellm.integrations._types.open_inference import ( + OpenInferenceSpanKindValues, + SpanAttributes, +) # OpenTelemetry imports moved to individual functions to avoid import errors when not installed @@ -45,43 +52,12 @@ LITELLM_TRACER_NAME = os.getenv("OTEL_TRACER_NAME", "litellm") LITELLM_METER_NAME = os.getenv("LITELLM_METER_NAME", "litellm") LITELLM_LOGGER_NAME = os.getenv("LITELLM_LOGGER_NAME", "litellm") +LITELLM_PROXY_REQUEST_SPAN_NAME = "Received Proxy Server Request" # Remove the hardcoded LITELLM_RESOURCE dictionary - we'll create it properly later RAW_REQUEST_SPAN_NAME = "raw_gen_ai_request" LITELLM_REQUEST_SPAN_NAME = "litellm_request" -def _get_litellm_resource(): - """ - Create a proper OpenTelemetry Resource that respects OTEL_RESOURCE_ATTRIBUTES - while maintaining backward compatibility with LiteLLM-specific environment variables. 
- """ - from opentelemetry.sdk.resources import OTELResourceDetector, Resource - - # Create base resource attributes with LiteLLM-specific defaults - # These will be overridden by OTEL_RESOURCE_ATTRIBUTES if present - base_attributes: Dict[str, Optional[str]] = { - "service.name": os.getenv("OTEL_SERVICE_NAME", "litellm"), - "deployment.environment": os.getenv("OTEL_ENVIRONMENT_NAME", "production"), - # Fix the model_id to use proper environment variable or default to service name - "model_id": os.getenv( - "OTEL_MODEL_ID", os.getenv("OTEL_SERVICE_NAME", "litellm") - ), - } - - # Create base resource with LiteLLM-specific defaults - base_resource = Resource.create(base_attributes) # type: ignore - - # Create resource from OTEL_RESOURCE_ATTRIBUTES using the detector - otel_resource_detector = OTELResourceDetector() - env_resource = otel_resource_detector.detect() - - # Merge the resources: env_resource takes precedence over base_resource - # This ensures OTEL_RESOURCE_ATTRIBUTES overrides LiteLLM defaults - merged_resource = base_resource.merge(env_resource) - - return merged_resource - - @dataclass class OpenTelemetryConfig: exporter: Union[str, SpanExporter] = "console" @@ -89,6 +65,19 @@ class OpenTelemetryConfig: headers: Optional[str] = None enable_metrics: bool = False enable_events: bool = False + service_name: Optional[str] = None + deployment_environment: Optional[str] = None + model_id: Optional[str] = None + + def __post_init__(self) -> None: + if not self.service_name: + self.service_name = os.getenv("OTEL_SERVICE_NAME", "litellm") + if not self.deployment_environment: + self.deployment_environment = os.getenv( + "OTEL_ENVIRONMENT_NAME", "production" + ) + if not self.model_id: + self.model_id = os.getenv("OTEL_MODEL_ID", self.service_name) @classmethod def from_env(cls): @@ -118,6 +107,9 @@ def from_env(cls): os.getenv("LITELLM_OTEL_INTEGRATION_ENABLE_EVENTS", "false").lower() == "true" ) + service_name = os.getenv("OTEL_SERVICE_NAME", "litellm") + deployment_environment = os.getenv("OTEL_ENVIRONMENT_NAME", "production") + model_id = os.getenv("OTEL_MODEL_ID", service_name) if exporter == "in_memory": return cls(exporter=InMemorySpanExporter()) @@ -127,6 +119,9 @@ def from_env(cls): headers=headers, # example: OTEL_HEADERS=x-honeycomb-team=B85YgLm96***" enable_metrics=enable_metrics, enable_events=enable_events, + service_name=service_name, + deployment_environment=deployment_environment, + model_id=model_id, ) @@ -141,7 +136,6 @@ def __init__( meter_provider: Optional[Any] = None, **kwargs, ): - if config is None: config = OpenTelemetryConfig.from_env() @@ -150,6 +144,7 @@ def __init__( self.OTEL_EXPORTER = self.config.exporter self.OTEL_ENDPOINT = self.config.endpoint self.OTEL_HEADERS = self.config.headers + self._tracer_provider_cache: Dict[str, Any] = {} self._init_tracing(tracer_provider) _debug_otel = str(os.getenv("DEBUG_OTEL", "False")).lower() @@ -171,6 +166,22 @@ def __init__( self._init_logs(logger_provider) self._init_otel_logger_on_litellm_proxy() + @staticmethod + def _get_litellm_resource(config: OpenTelemetryConfig): + """Create an OpenTelemetry Resource using config-driven defaults.""" + from opentelemetry.sdk.resources import OTELResourceDetector, Resource + + base_attributes: Dict[str, Optional[str]] = { + "service.name": config.service_name, + "deployment.environment": config.deployment_environment, + "model_id": config.model_id or config.service_name, + } + + base_resource = Resource.create(base_attributes) # type: ignore[arg-type] + otel_resource_detector 
= OTELResourceDetector() + env_resource = otel_resource_detector.detect() + return base_resource.merge(env_resource) + def _init_otel_logger_on_litellm_proxy(self): """ Initializes OpenTelemetry for litellm proxy server @@ -193,51 +204,92 @@ def _init_otel_logger_on_litellm_proxy(self): litellm.service_callback.append(self) setattr(proxy_server, "open_telemetry_logger", self) - def _init_tracing(self, tracer_provider): - from opentelemetry import trace - from opentelemetry.sdk.trace import TracerProvider - from opentelemetry.trace import SpanKind + def _get_or_create_provider( + self, + provider, + provider_name: str, + get_existing_provider_fn, + sdk_provider_class, + create_new_provider_fn, + set_provider_fn, + ): + """ + Generic helper to get or create an OpenTelemetry provider (Tracer, Meter, or Logger). - # use provided tracer or create a new one - if tracer_provider is None: - # Check if a TracerProvider is already set globally (e.g., by Langfuse SDK) - try: - from opentelemetry.trace import ProxyTracerProvider - existing_provider = trace.get_tracer_provider() + Args: + provider: The provider instance passed to the init function (can be None) + provider_name: Name for logging (e.g., "TracerProvider") + get_existing_provider_fn: Function to get the existing global provider + sdk_provider_class: The SDK provider class to check for (e.g., TracerProvider from SDK) + create_new_provider_fn: Function to create a new provider instance + set_provider_fn: Function to set the provider globally - # If an actual provider exists (not the default proxy), use it - if not isinstance(existing_provider, ProxyTracerProvider): - verbose_logger.debug( - "OpenTelemetry: Using existing TracerProvider: %s", - type(existing_provider).__name__ - ) - tracer_provider = existing_provider - # Don't call set_tracer_provider to preserve existing context - else: - # No real provider exists yet, create our own - verbose_logger.debug("OpenTelemetry: Creating new TracerProvider") - tracer_provider = TracerProvider(resource=_get_litellm_resource()) - tracer_provider.add_span_processor(self._get_span_processor()) - trace.set_tracer_provider(tracer_provider) - except Exception as e: - # Fallback: create a new provider if something goes wrong + Returns: + The provider to use (either existing, new, or explicitly provided) + """ + if provider is not None: + # Provider explicitly provided (e.g., for testing) + # Do NOT call set_provider_fn - the caller is responsible for managing global state + # If they want it to be global, they've already set it before passing it to us + verbose_logger.debug( + "OpenTelemetry: Using provided TracerProvider: %s", + type(provider).__name__, + ) + return provider + + # Check if a provider is already set globally + try: + existing_provider = get_existing_provider_fn() + + # If a real SDK provider exists (set by another SDK like Langfuse), use it + # This uses a positive check for SDK providers instead of a negative check for proxy providers + if isinstance(existing_provider, sdk_provider_class): verbose_logger.debug( - "OpenTelemetry: Exception checking existing provider, creating new one: %s", - str(e) + "OpenTelemetry: Using existing %s: %s", + provider_name, + type(existing_provider).__name__, ) - tracer_provider = TracerProvider(resource=_get_litellm_resource()) - tracer_provider.add_span_processor(self._get_span_processor()) - trace.set_tracer_provider(tracer_provider) - else: - # Tracer provider explicitly provided (e.g., for testing) + provider = existing_provider + # Don't call 
set_provider to preserve existing context + else: + # Default proxy provider or unknown type, create our own + verbose_logger.debug("OpenTelemetry: Creating new %s", provider_name) + provider = create_new_provider_fn() + set_provider_fn(provider) + except Exception as e: + # Fallback: create a new provider if something goes wrong verbose_logger.debug( - "OpenTelemetry: Using provided TracerProvider: %s", - type(tracer_provider).__name__ + "OpenTelemetry: Exception checking existing %s, creating new one: %s", + provider_name, + str(e), ) - trace.set_tracer_provider(tracer_provider) + provider = create_new_provider_fn() + set_provider_fn(provider) + + return provider - # grab our tracer - self.tracer = trace.get_tracer(LITELLM_TRACER_NAME) + def _init_tracing(self, tracer_provider): + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.trace import SpanKind + + def create_tracer_provider(): + provider = TracerProvider(resource=self._get_litellm_resource(self.config)) + provider.add_span_processor(self._get_span_processor()) + return provider + + tracer_provider = self._get_or_create_provider( + provider=tracer_provider, + provider_name="TracerProvider", + get_existing_provider_fn=trace.get_tracer_provider, + sdk_provider_class=TracerProvider, + create_new_provider_fn=create_tracer_provider, + set_provider_fn=trace.set_tracer_provider, + ) + + # Grab our tracer from the TracerProvider (not from global context) + # This ensures we use the provided TracerProvider (e.g., for testing) + self.tracer = tracer_provider.get_tracer(LITELLM_TRACER_NAME) self.span_kind = SpanKind def _init_metrics(self, meter_provider): @@ -245,42 +297,31 @@ def _init_metrics(self, meter_provider): self._operation_duration_histogram = None self._token_usage_histogram = None self._cost_histogram = None + self._time_to_first_token_histogram = None + self._time_per_output_token_histogram = None + self._response_duration_histogram = None return from opentelemetry import metrics - from opentelemetry.sdk.metrics import Histogram, MeterProvider + from opentelemetry.sdk.metrics import MeterProvider - # Only create OTLP infrastructure if no custom meter provider is provided - if meter_provider is None: - from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( - OTLPMetricExporter, - ) - from opentelemetry.sdk.metrics.export import ( - AggregationTemporality, - PeriodicExportingMetricReader, + def create_meter_provider(): + metric_reader = self._get_metric_reader() + return MeterProvider( + metric_readers=[metric_reader], + resource=self._get_litellm_resource(self.config), ) - normalized_endpoint = self._normalize_otel_endpoint( - self.config.endpoint, "metrics" - ) - _metric_exporter = OTLPMetricExporter( - endpoint=normalized_endpoint, - headers=OpenTelemetry._get_headers_dictionary(self.config.headers), - preferred_temporality={Histogram: AggregationTemporality.DELTA}, - ) - _metric_reader = PeriodicExportingMetricReader( - _metric_exporter, export_interval_millis=10000 - ) - - meter_provider = MeterProvider( - metric_readers=[_metric_reader], resource=_get_litellm_resource() - ) - meter = meter_provider.get_meter(__name__) - else: - # Use the provided meter provider as-is, without creating additional OTLP infrastructure - meter = meter_provider.get_meter(__name__) + meter_provider = self._get_or_create_provider( + provider=meter_provider, + provider_name="MeterProvider", + get_existing_provider_fn=metrics.get_meter_provider, + sdk_provider_class=MeterProvider, 
+ create_new_provider_fn=create_meter_provider, + set_provider_fn=metrics.set_meter_provider, + ) - metrics.set_meter_provider(meter_provider) + meter = meter_provider.get_meter(__name__) self._operation_duration_histogram = meter.create_histogram( name="gen_ai.client.operation.duration", # Replace with semconv constant in otel 1.38 @@ -297,28 +338,49 @@ def _init_metrics(self, meter_provider): description="GenAI request cost", unit="USD", ) + self._time_to_first_token_histogram = meter.create_histogram( + name="gen_ai.client.response.time_to_first_token", + description="Time to first token for streaming requests", + unit="s", + ) + self._time_per_output_token_histogram = meter.create_histogram( + name="gen_ai.client.response.time_per_output_token", + description="Average time per output token (generation time / completion tokens)", + unit="s", + ) + self._response_duration_histogram = meter.create_histogram( + name="gen_ai.client.response.duration", + description="Total LLM API generation time (excludes LiteLLM overhead)", + unit="s", + ) def _init_logs(self, logger_provider): # nothing to do if events disabled if not self.config.enable_events: return - from opentelemetry._logs import set_logger_provider + from opentelemetry._logs import get_logger_provider, set_logger_provider from opentelemetry.sdk._logs import LoggerProvider as OTLoggerProvider from opentelemetry.sdk._logs.export import BatchLogRecordProcessor - # set up log pipeline - if logger_provider is None: - litellm_resource = _get_litellm_resource() - logger_provider = OTLoggerProvider(resource=litellm_resource) - # Only add OTLP exporter if we created the logger provider ourselves + def create_logger_provider(): + provider = OTLoggerProvider( + resource=self._get_litellm_resource(self.config) + ) log_exporter = self._get_log_exporter() - if log_exporter: - logger_provider.add_log_record_processor( - BatchLogRecordProcessor(log_exporter) # type: ignore[arg-type] - ) - - set_logger_provider(logger_provider) + provider.add_log_record_processor( + BatchLogRecordProcessor(log_exporter) # type: ignore[arg-type] + ) + return provider + + self._get_or_create_provider( + provider=logger_provider, + provider_name="LoggerProvider", + get_existing_provider_fn=get_logger_provider, + sdk_provider_class=OTLoggerProvider, + create_new_provider_fn=create_logger_provider, + set_provider_fn=set_logger_provider, + ) def log_success_event(self, kwargs, response_obj, start_time, end_time): self._handle_success(kwargs, response_obj, start_time, end_time) @@ -486,6 +548,29 @@ async def async_post_call_failure_hook( # End Parent OTEL Sspan parent_otel_span.end(end_time=self._to_ns(datetime.now())) + async def async_post_call_success_hook( + self, + data: dict, + user_api_key_dict: UserAPIKeyAuth, + response: LLMResponseTypes, + ): + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging + + litellm_logging_obj = data.get("litellm_logging_obj") + + if litellm_logging_obj is not None and isinstance( + litellm_logging_obj, LiteLLMLogging + ): + kwargs = litellm_logging_obj.model_call_details + parent_span = user_api_key_dict.parent_otel_span + + ctx, _ = self._get_span_context(kwargs, default_span=parent_span) + + # 3. 
Guardrail span + self._create_guardrail_span(kwargs=kwargs, context=ctx) + + return response + ######################################################### # Team/Key Based Logging Control Flow ######################################################### @@ -531,12 +616,20 @@ def _get_tracer_with_dynamic_headers(self, dynamic_headers: dict): """Create a temporary tracer with dynamic headers for this request only.""" from opentelemetry.sdk.trace import TracerProvider + # Prevents thread exhaustion by reusing providers for the same credential sets (e.g. per-team keys) + cache_key = str(sorted(dynamic_headers.items())) + if cache_key in self._tracer_provider_cache: + return self._tracer_provider_cache[cache_key].get_tracer(LITELLM_TRACER_NAME) + # Create a temporary tracer provider with dynamic headers - temp_provider = TracerProvider(resource=_get_litellm_resource()) + temp_provider = TracerProvider(resource=self._get_litellm_resource(self.config)) temp_provider.add_span_processor( self._get_span_processor(dynamic_headers=dynamic_headers) ) + # Store in cache for reuse + self._tracer_provider_cache[cache_key] = temp_provider + return temp_provider.get_tracer(LITELLM_TRACER_NAME) def construct_dynamic_otel_headers( @@ -564,11 +657,38 @@ def _handle_success(self, kwargs, response_obj, start_time, end_time): ) ctx, parent_span = self._get_span_context(kwargs) - # 1. Primary span - span = self._start_primary_span(kwargs, response_obj, start_time, end_time, ctx) + # Decide whether to create a primary span + # Always create if no parent span exists (backward compatibility) + # OR if USE_OTEL_LITELLM_REQUEST_SPAN is explicitly enabled + should_create_primary_span = parent_span is None or get_secret_bool( + "USE_OTEL_LITELLM_REQUEST_SPAN" + ) - # 2. Raw‐request sub-span (if enabled) - self._maybe_log_raw_request(kwargs, response_obj, start_time, end_time, span) + if should_create_primary_span: + # Create a new litellm_request span + span = self._start_primary_span( + kwargs, response_obj, start_time, end_time, ctx + ) + # Raw-request sub-span (if enabled) - child of litellm_request span + self._maybe_log_raw_request( + kwargs, response_obj, start_time, end_time, span + ) + # Ensure proxy-request parent span is annotated with the actual operation kind + if parent_span is not None and parent_span.name == LITELLM_PROXY_REQUEST_SPAN_NAME: + self.set_attributes(parent_span, kwargs, response_obj) + else: + # Do not create primary span (keep hierarchy shallow when parent exists) + from opentelemetry.trace import Status, StatusCode + + span = None + # Only set attributes if the span is still recording (not closed) + # Note: parent_span is guaranteed to be not None here + parent_span.set_status(Status(StatusCode.OK)) + self.set_attributes(parent_span, kwargs, response_obj) + # Raw-request as direct child of parent_span + self._maybe_log_raw_request( + kwargs, response_obj, start_time, end_time, parent_span + ) # 3. Guardrail span self._create_guardrail_span(kwargs=kwargs, context=ctx) @@ -578,21 +698,39 @@ def _handle_success(self, kwargs, response_obj, start_time, end_time): # 5. Semantic logs. if self.config.enable_events: - self._emit_semantic_logs(kwargs, response_obj, span) - - # 6. End parent span - if parent_span is not None: - parent_span.end(end_time=self._to_ns(datetime.now())) + log_span = span if span is not None else parent_span + if log_span is not None: + self._emit_semantic_logs(kwargs, response_obj, log_span) + + # 6. 
Do NOT end parent span - it should be managed by its creator + # External spans (from Langfuse, user code, HTTP headers, global context) must not be closed by LiteLLM + # However, proxy-created spans should be closed here + if ( + parent_span is not None + and parent_span.name == LITELLM_PROXY_REQUEST_SPAN_NAME + ): + parent_span.end(end_time=self._to_ns(end_time)) - def _start_primary_span(self, kwargs, response_obj, start_time, end_time, context): + def _start_primary_span( + self, + kwargs, + response_obj, + start_time, + end_time, + context, + ): from opentelemetry.trace import Status, StatusCode otel_tracer: Tracer = self.get_tracer_to_use_for_request(kwargs) + + # Always create a new span + # The parent relationship is preserved through the context parameter span = otel_tracer.start_span( name=self._get_span_name(kwargs), start_time=self._to_ns(start_time), context=context, ) + span.set_status(Status(StatusCode.OK)) self.set_attributes(span, kwargs, response_obj) span.end(end_time=self._to_ns(end_time)) @@ -675,7 +813,7 @@ def _record_metrics(self, kwargs, response_obj, start_time, end_time): and self._token_usage_histogram ): in_attrs = {**common_attrs, "gen_ai.token.type": "input"} - out_attrs = {**common_attrs, "gen_ai.token.type": "completion"} + out_attrs = {**common_attrs, "gen_ai.token.type": "output"} self._token_usage_histogram.record( usage.get("prompt_tokens", 0), attributes=in_attrs ) @@ -687,20 +825,200 @@ def _record_metrics(self, kwargs, response_obj, start_time, end_time): if self._cost_histogram and cost: self._cost_histogram.record(cost, attributes=common_attrs) + # Record latency metrics (TTFT, TPOT, and Total Generation Time) + self._record_time_to_first_token_metric(kwargs, common_attrs) + self._record_time_per_output_token_metric( + kwargs, response_obj, end_time, duration_s, common_attrs + ) + self._record_response_duration_metric(kwargs, end_time, common_attrs) + + @staticmethod + def _to_timestamp(val: Optional[Union[datetime, float, str]]) -> Optional[float]: + """Convert datetime/float/string to timestamp.""" + if val is None: + return None + if isinstance(val, datetime): + return val.timestamp() + if isinstance(val, (int, float)): + return float(val) + # isinstance(val, str) - parse datetime string (with or without microseconds) + try: + return datetime.strptime(val, "%Y-%m-%d %H:%M:%S.%f").timestamp() + except ValueError: + try: + return datetime.strptime(val, "%Y-%m-%d %H:%M:%S").timestamp() + except ValueError: + return None + + def _record_time_to_first_token_metric(self, kwargs: dict, common_attrs: dict): + """Record Time to First Token (TTFT) metric for streaming requests.""" + optional_params = kwargs.get("optional_params", {}) + is_streaming = optional_params.get("stream", False) + + if not (self._time_to_first_token_histogram and is_streaming): + return + + # Use api_call_start_time for precision (matches Prometheus implementation) + # This excludes LiteLLM overhead and measures pure LLM API latency + api_call_start_time = kwargs.get("api_call_start_time", None) + completion_start_time = kwargs.get("completion_start_time", None) + + if api_call_start_time is not None and completion_start_time is not None: + # Convert to timestamps if needed (handles datetime, float, and string) + api_call_start_ts = self._to_timestamp(api_call_start_time) + completion_start_ts = self._to_timestamp(completion_start_time) + + if api_call_start_ts is None or completion_start_ts is None: + return # Skip recording if conversion failed + + time_to_first_token_seconds = 
completion_start_ts - api_call_start_ts + self._time_to_first_token_histogram.record( + time_to_first_token_seconds, attributes=common_attrs + ) + + def _record_time_per_output_token_metric( + self, + kwargs: dict, + response_obj: Optional[Any], + end_time: datetime, + duration_s: float, + common_attrs: dict, + ): + """Record Time Per Output Token (TPOT) metric. + + Calculated as: generation_time / completion_tokens + - For streaming: uses end_time - completion_start_time (time to generate all tokens after first) + - For non-streaming: uses end_time - api_call_start_time (total generation time) + """ + if not self._time_per_output_token_histogram: + return + + # Get completion tokens from response_obj + completion_tokens = None + if response_obj and (usage := response_obj.get("usage")): + completion_tokens = usage.get("completion_tokens") + + if completion_tokens is None or completion_tokens <= 0: + return + + # Calculate generation time + completion_start_time = kwargs.get("completion_start_time", None) + api_call_start_time = kwargs.get("api_call_start_time", None) + + # Convert end_time to timestamp (handles datetime, float, and string) + end_time_ts = self._to_timestamp(end_time) + if end_time_ts is None: + # Fallback to duration_s if conversion failed + generation_time_seconds = duration_s + if generation_time_seconds > 0: + time_per_output_token_seconds = ( + generation_time_seconds / completion_tokens + ) + self._time_per_output_token_histogram.record( + time_per_output_token_seconds, attributes=common_attrs + ) + return + + if completion_start_time is not None: + # Streaming: use completion_start_time (when first token arrived) + # This measures time to generate all tokens after the first one + completion_start_ts = self._to_timestamp(completion_start_time) + if completion_start_ts is None: + # Fallback to duration_s if conversion failed + generation_time_seconds = duration_s + else: + generation_time_seconds = end_time_ts - completion_start_ts + elif api_call_start_time is not None: + # Non-streaming: use api_call_start_time (total generation time) + api_call_start_ts = self._to_timestamp(api_call_start_time) + if api_call_start_ts is None: + # Fallback to duration_s if conversion failed + generation_time_seconds = duration_s + else: + generation_time_seconds = end_time_ts - api_call_start_ts + else: + # Fallback: use duration_s (already calculated as (end_time - start_time).total_seconds()) + generation_time_seconds = duration_s + + if generation_time_seconds > 0: + time_per_output_token_seconds = generation_time_seconds / completion_tokens + self._time_per_output_token_histogram.record( + time_per_output_token_seconds, attributes=common_attrs + ) + + def _record_response_duration_metric( + self, + kwargs: dict, + end_time: Union[datetime, float], + common_attrs: dict, + ): + """Record Total Generation Time (response duration) metric. + + Measures pure LLM API generation time: end_time - api_call_start_time + This excludes LiteLLM overhead and measures only the LLM provider's response time. + Works for both streaming and non-streaming requests. + + Mirrors Prometheus's litellm_llm_api_latency_metric. + Uses kwargs.get("end_time") with fallback to parameter for consistency with Prometheus. 
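+
+        Illustrative arithmetic, using hypothetical epoch-second timestamps:
+            api_call_start_time = 1700000000.0   # provider call issued
+            end_time            = 1700000002.4   # final chunk / full response received
+            response_duration_seconds = 2.4      # value recorded on the histogram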
+ """ + if not self._response_duration_histogram: + return + + api_call_start_time = kwargs.get("api_call_start_time", None) + if api_call_start_time is None: + return + + # Use end_time from kwargs if available (matches Prometheus), otherwise use parameter + # For streaming: end_time is when the stream completes (final chunk received) + # For non-streaming: end_time is when the response is received + _end_time = kwargs.get("end_time") or end_time + if _end_time is None: + _end_time = datetime.now() + + # Convert to timestamps if needed (handles datetime, float, and string) + api_call_start_ts = self._to_timestamp(api_call_start_time) + end_time_ts = self._to_timestamp(_end_time) + + if api_call_start_ts is None or end_time_ts is None: + return # Skip recording if conversion failed + + response_duration_seconds = end_time_ts - api_call_start_ts + + if response_duration_seconds > 0: + self._response_duration_histogram.record( + response_duration_seconds, attributes=common_attrs + ) + def _emit_semantic_logs(self, kwargs, response_obj, span: Span): if not self.config.enable_events: return + # NOTE: Semantic logs (gen_ai.content.prompt/completion events) have compatibility issues + # with OTEL SDK >= 1.39.0 due to breaking changes in PR #4676: + # - LogRecord moved from opentelemetry.sdk._logs to opentelemetry.sdk._logs._internal + # - LogRecord constructor no longer accepts 'resource' parameter (now inherited from LoggerProvider) + # - LogData class was removed entirely + # These logs work correctly in OTEL SDK < 1.39.0 but may fail in >= 1.39.0. + # See: https://github.com/open-telemetry/opentelemetry-python/pull/4676 + # TODO: Refactor to use the proper OTEL Logs API instead of directly creating SDK LogRecords + from opentelemetry._logs import SeverityNumber, get_logger, get_logger_provider - from opentelemetry.sdk._logs import LogRecord as SdkLogRecord + try: + from opentelemetry.sdk._logs import ( + LogRecord as SdkLogRecord, # type: ignore[attr-defined] # OTEL < 1.39.0 + ) + except ImportError: + from opentelemetry.sdk._logs._internal import ( + LogRecord as SdkLogRecord, # OTEL >= 1.39.0 + ) otel_logger = get_logger(LITELLM_LOGGER_NAME) # Get the resource from the logger provider logger_provider = get_logger_provider() - resource = ( - getattr(logger_provider, "_resource", None) or _get_litellm_resource() - ) + resource = getattr( + logger_provider, "_resource", None + ) or self._get_litellm_resource(self.config) parent_ctx = span.get_span_context() provider = (kwargs.get("litellm_params") or {}).get( @@ -775,52 +1093,70 @@ def _create_guardrail_span( if standard_logging_payload is None: return - guardrail_information = standard_logging_payload.get("guardrail_information") - if guardrail_information is None: + guardrail_information_data = standard_logging_payload.get( + "guardrail_information" + ) + + if not guardrail_information_data: return - start_time_float = guardrail_information.get("start_time") - end_time_float = guardrail_information.get("end_time") - start_time_datetime = datetime.now() - if start_time_float is not None: - start_time_datetime = datetime.fromtimestamp(start_time_float) - end_time_datetime = datetime.now() - if end_time_float is not None: - end_time_datetime = datetime.fromtimestamp(end_time_float) + guardrail_information_list = [ + information + for information in guardrail_information_data + if isinstance(information, dict) + ] + + if not guardrail_information_list: + return otel_tracer: Tracer = self.get_tracer_to_use_for_request(kwargs) - guardrail_span = 
otel_tracer.start_span( - name="guardrail", - start_time=self._to_ns(start_time_datetime), - context=context, - ) + for guardrail_information in guardrail_information_list: + start_time_float = guardrail_information.get("start_time") + end_time_float = guardrail_information.get("end_time") + start_time_datetime = datetime.now() + if start_time_float is not None: + start_time_datetime = datetime.fromtimestamp(start_time_float) + end_time_datetime = datetime.now() + if end_time_float is not None: + end_time_datetime = datetime.fromtimestamp(end_time_float) + + guardrail_span = otel_tracer.start_span( + name="guardrail", + start_time=self._to_ns(start_time_datetime), + context=context, + ) - self.safe_set_attribute( - span=guardrail_span, - key="guardrail_name", - value=guardrail_information.get("guardrail_name"), - ) + self.safe_set_attribute( + span=guardrail_span, + key=SpanAttributes.OPENINFERENCE_SPAN_KIND, + value=OpenInferenceSpanKindValues.GUARDRAIL.value, + ) - self.safe_set_attribute( - span=guardrail_span, - key="guardrail_mode", - value=guardrail_information.get("guardrail_mode"), - ) + self.safe_set_attribute( + span=guardrail_span, + key="guardrail_name", + value=guardrail_information.get("guardrail_name"), + ) - # Set masked_entity_count directly without conversion - masked_entity_count = guardrail_information.get("masked_entity_count") - if masked_entity_count is not None: - guardrail_span.set_attribute( - "masked_entity_count", safe_dumps(masked_entity_count) + self.safe_set_attribute( + span=guardrail_span, + key="guardrail_mode", + value=guardrail_information.get("guardrail_mode"), ) - self.safe_set_attribute( - span=guardrail_span, - key="guardrail_response", - value=guardrail_information.get("guardrail_response"), - ) + masked_entity_count = guardrail_information.get("masked_entity_count") + if masked_entity_count is not None: + guardrail_span.set_attribute( + "masked_entity_count", safe_dumps(masked_entity_count) + ) - guardrail_span.end(end_time=self._to_ns(end_time_datetime)) + self.safe_set_attribute( + span=guardrail_span, + key="guardrail_response", + value=guardrail_information.get("guardrail_response"), + ) + + guardrail_span.end(end_time=self._to_ns(end_time_datetime)) def _handle_failure(self, kwargs, response_obj, start_time, end_time): from opentelemetry.trace import Status, StatusCode @@ -832,31 +1168,54 @@ def _handle_failure(self, kwargs, response_obj, start_time, end_time): ) _parent_context, parent_otel_span = self._get_span_context(kwargs) - # Span 1: Requst sent to litellm SDK - otel_tracer: Tracer = self.get_tracer_to_use_for_request(kwargs) - span = otel_tracer.start_span( - name=self._get_span_name(kwargs), - start_time=self._to_ns(start_time), - context=_parent_context, + # Decide whether to create a primary span + # Always create if no parent span exists (backward compatibility) + # OR if USE_OTEL_LITELLM_REQUEST_SPAN is explicitly enabled + should_create_primary_span = parent_otel_span is None or get_secret_bool( + "USE_OTEL_LITELLM_REQUEST_SPAN" ) - span.set_status(Status(StatusCode.ERROR)) - self.set_attributes(span, kwargs, response_obj) - - # Record exception information using OTEL standard method - self._record_exception_on_span(span=span, kwargs=kwargs) - - span.end(end_time=self._to_ns(end_time)) + + if should_create_primary_span: + # Span 1: Request sent to litellm SDK + otel_tracer: Tracer = self.get_tracer_to_use_for_request(kwargs) + span = otel_tracer.start_span( + name=self._get_span_name(kwargs), + start_time=self._to_ns(start_time), 
+ context=_parent_context, + ) + span.set_status(Status(StatusCode.ERROR)) + self.set_attributes(span, kwargs, response_obj) + + # Record exception information using OTEL standard method + self._record_exception_on_span(span=span, kwargs=kwargs) + + span.end(end_time=self._to_ns(end_time)) + else: + # When parent span exists and USE_OTEL_LITELLM_REQUEST_SPAN=false, + # record error on parent span (keeps hierarchy shallow) + # Only set attributes if the span is still recording (not closed) + # Note: parent_otel_span is guaranteed to be not None here + if parent_otel_span.is_recording(): + parent_otel_span.set_status(Status(StatusCode.ERROR)) + self.set_attributes(parent_otel_span, kwargs, response_obj) + self._record_exception_on_span(span=parent_otel_span, kwargs=kwargs) # Create span for guardrail information self._create_guardrail_span(kwargs=kwargs, context=_parent_context) - if parent_otel_span is not None: - parent_otel_span.end(end_time=self._to_ns(datetime.now())) + # Do NOT end parent span - it should be managed by its creator + # External spans (from Langfuse, user code, HTTP headers, global context) must not be closed by LiteLLM + # However, proxy-created spans should be closed here + if ( + parent_otel_span is not None + and parent_otel_span.name == LITELLM_PROXY_REQUEST_SPAN_NAME + ): + parent_otel_span.end(end_time=self._to_ns(end_time)) def _record_exception_on_span(self, span: Span, kwargs: dict): """ Record exception information on the span using OTEL standard methods. - + This extracts error information from StandardLoggingPayload and: 1. Uses span.record_exception() for the actual exception object (OTEL standard) 2. Sets structured error attributes from StandardLoggingPayloadErrorInformation @@ -866,22 +1225,22 @@ def _record_exception_on_span(self, span: Span, kwargs: dict): # Get the exception object if available exception = kwargs.get("exception") - + # Record the exception using OTEL's standard method if exception is not None: span.record_exception(exception) - + # Get StandardLoggingPayload for structured error information standard_logging_payload: Optional[StandardLoggingPayload] = kwargs.get( "standard_logging_object" ) - + if standard_logging_payload is None: return - + # Extract error_information from StandardLoggingPayload error_information = standard_logging_payload.get("error_information") - + if error_information is None: # Fallback to error_str if error_information is not available error_str = standard_logging_payload.get("error_str") @@ -892,7 +1251,7 @@ def _record_exception_on_span(self, span: Span, kwargs: dict): value=error_str, ) return - + # Set structured error attributes from StandardLoggingPayloadErrorInformation if error_information.get("error_code"): self.safe_set_attribute( @@ -900,35 +1259,35 @@ def _record_exception_on_span(self, span: Span, kwargs: dict): key=ErrorAttributes.ERROR_CODE, value=error_information["error_code"], ) - + if error_information.get("error_class"): self.safe_set_attribute( span=span, key=ErrorAttributes.ERROR_TYPE, value=error_information["error_class"], ) - + if error_information.get("error_message"): self.safe_set_attribute( span=span, key=ErrorAttributes.ERROR_MESSAGE, value=error_information["error_message"], ) - + if error_information.get("llm_provider"): self.safe_set_attribute( span=span, key=ErrorAttributes.ERROR_LLM_PROVIDER, value=error_information["llm_provider"], ) - + if error_information.get("traceback"): self.safe_set_attribute( span=span, key=ErrorAttributes.ERROR_STACK_TRACE, 
value=error_information["traceback"], ) - + except Exception as e: verbose_logger.exception( "OpenTelemetry: Error recording exception on span: %s", str(e) @@ -1013,14 +1372,7 @@ def set_attributes( # noqa: PLR0915 self, span: Span, kwargs, response_obj: Optional[Any] ): try: - if self.callback_name == "arize_phoenix": - from litellm.integrations.arize.arize_phoenix import ArizePhoenixLogger - - ArizePhoenixLogger.set_arize_phoenix_attributes( - span, kwargs, response_obj - ) - return - elif self.callback_name == "langtrace": + if self.callback_name == "langtrace": from litellm.integrations.langtrace import LangtraceAttributes LangtraceAttributes().set_langtrace_attributes( @@ -1036,6 +1388,13 @@ def set_attributes( # noqa: PLR0915 span, kwargs, response_obj ) return + elif self.callback_name == "weave_otel": + from litellm.integrations.weave.weave_otel import ( + set_weave_otel_attributes, + ) + + set_weave_otel_attributes(span, kwargs, response_obj) + return from litellm.proxy._types import SpanAttributes optional_params = kwargs.get("optional_params", {}) @@ -1065,6 +1424,18 @@ def set_attributes( # noqa: PLR0915 self.safe_set_attribute( span=span, key="hidden_params", value=safe_dumps(hidden_params) ) + # Cost breakdown tracking + cost_breakdown: Optional[CostBreakdown] = standard_logging_payload.get( + "cost_breakdown" + ) + if cost_breakdown: + for key, value in cost_breakdown.items(): + if value is not None: + self.safe_set_attribute( + span=span, + key=f"gen_ai.cost.{key}", + value=value, + ) ############################################# ########## LLM Request Attributes ########### ############################################# @@ -1146,25 +1517,25 @@ def set_attributes( # noqa: PLR0915 if usage: self.safe_set_attribute( span=span, - key=SpanAttributes.LLM_USAGE_TOTAL_TOKENS.value, + key=SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS.value, value=usage.get("total_tokens"), ) # The number of tokens used in the LLM response (completion). self.safe_set_attribute( span=span, - key=SpanAttributes.LLM_USAGE_COMPLETION_TOKENS.value, + key=SpanAttributes.GEN_AI_USAGE_OUTPUT_TOKENS.value, value=usage.get("completion_tokens"), ) # The number of tokens used in the LLM prompt. 
self.safe_set_attribute( span=span, - key=SpanAttributes.LLM_USAGE_PROMPT_TOKENS.value, + key=SpanAttributes.GEN_AI_USAGE_INPUT_TOKENS.value, value=usage.get("prompt_tokens"), ) - ######################################################################## + ######################################################################## ########## LLM Request Medssages / tools / content Attributes ########### ######################################################################### @@ -1178,53 +1549,75 @@ def set_attributes( # noqa: PLR0915 self.set_tools_attributes(span, tools) if kwargs.get("messages"): - for idx, prompt in enumerate(kwargs.get("messages")): - if prompt.get("role"): - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_PROMPTS.value}.{idx}.role", - value=prompt.get("role"), - ) + transformed_messages = ( + self._transform_messages_to_otel_semantic_conventions( + kwargs.get("messages") + ) + ) + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_INPUT_MESSAGES.value, + value=safe_dumps(transformed_messages), + ) - if prompt.get("content"): - if not isinstance(prompt.get("content"), str): - prompt["content"] = str(prompt.get("content")) - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_PROMPTS.value}.{idx}.content", - value=prompt.get("content"), - ) + if kwargs.get("system_instructions"): + transformed_system_instructions = ( + self._transform_messages_to_otel_semantic_conventions( + kwargs.get("system_instructions") + ) + ) + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_SYSTEM_INSTRUCTIONS.value, + value=safe_dumps(transformed_system_instructions), + ) + + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_OPERATION_NAME.value, + value=( + "chat" + if standard_logging_payload.get("call_type") == "completion" + else standard_logging_payload.get("call_type") or "chat" + ), + ) + + if standard_logging_payload.get("request_id"): + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_REQUEST_ID.value, + value=standard_logging_payload.get("request_id"), + ) ############################################# ########## LLM Response Attributes ########## ############################################# if response_obj is not None: if response_obj.get("choices"): + transformed_choices = ( + self._transform_choices_to_otel_semantic_conventions( + response_obj.get("choices") + ) + ) + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_OUTPUT_MESSAGES.value, + value=safe_dumps(transformed_choices), + ) + + finish_reasons = [] + for idx, choice in enumerate(response_obj.get("choices")): + if choice.get("finish_reason"): + finish_reasons.append(choice.get("finish_reason")) + + if finish_reasons: + self.safe_set_attribute( + span=span, + key=SpanAttributes.GEN_AI_RESPONSE_FINISH_REASONS.value, + value=safe_dumps(finish_reasons), + ) + for idx, choice in enumerate(response_obj.get("choices")): if choice.get("finish_reason"): - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_COMPLETIONS.value}.{idx}.finish_reason", - value=choice.get("finish_reason"), - ) - if choice.get("message"): - if choice.get("message").get("role"): - self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_COMPLETIONS.value}.{idx}.role", - value=choice.get("message").get("role"), - ) - if choice.get("message").get("content"): - if not isinstance( - choice.get("message").get("content"), str - ): - choice["message"]["content"] = str( - choice.get("message").get("content") - ) - 
self.safe_set_attribute( - span=span, - key=f"{SpanAttributes.LLM_COMPLETIONS.value}.{idx}.content", - value=choice.get("message").get("content"), - ) message = choice.get("message") tool_calls = message.get("tool_calls") @@ -1238,6 +1631,7 @@ def set_attributes( # noqa: PLR0915 ) except Exception as e: + self.handle_callback_failure(callback_name=self.callback_name or "opentelemetry") verbose_logger.exception( "OpenTelemetry logging error in set_attributes %s", str(e) ) @@ -1266,6 +1660,66 @@ def safe_set_attribute(self, span: Span, key: str, value: Any): primitive_value = self._cast_as_primitive_value_type(value) span.set_attribute(key, primitive_value) + def _transform_messages_to_otel_semantic_conventions( + self, messages: Union[List[dict], str] + ) -> List[dict]: + """ + Transforms LiteLLM/OpenAI style messages into OTEL GenAI 1.38 compliant format. + OTEL expects a 'parts' array instead of a single 'content' string. + """ + if isinstance(messages, str): + # Handle system_instructions passed as a string + return [ + {"role": "system", "parts": [{"type": "text", "content": messages}]} + ] + + transformed = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + parts = [] + + if isinstance(content, str): + parts.append({"type": "text", "content": content}) + elif isinstance(content, list): + # Handle multi-modal content if necessary + for part in content: + if isinstance(part, dict): + parts.append(part) + else: + parts.append({"type": "text", "content": str(part)}) + + transformed_msg = {"role": role, "parts": parts} + if "id" in msg: + transformed_msg["id"] = msg["id"] + if "tool_calls" in msg: + transformed_msg["tool_calls"] = msg["tool_calls"] + if "tool_call_id" in msg: + transformed_msg["tool_call_id"] = msg["tool_call_id"] + transformed.append(transformed_msg) + + return transformed + + def _transform_choices_to_otel_semantic_conventions( + self, choices: List[dict] + ) -> List[dict]: + """ + Transforms choices into OTEL GenAI 1.38 compliant format for output.messages. 
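+
+        Illustrative shape of the transformation (field values are hypothetical):
+            input choice:  {"message": {"role": "assistant", "content": "Hi"}, "finish_reason": "stop"}
+            output entry:  {"role": "assistant",
+                            "parts": [{"type": "text", "content": "Hi"}],
+                            "finish_reason": "stop"}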
+ """ + transformed = [] + for choice in choices: + message = choice.get("message") or {} + finish_reason = choice.get("finish_reason") + + transformed_msg = self._transform_messages_to_otel_semantic_conventions( + [message] + )[0] + if finish_reason: + transformed_msg["finish_reason"] = finish_reason + + transformed.append(transformed_msg) + return transformed + def set_raw_request_attributes(self, span: Span, kwargs, response_obj): try: kwargs.get("optional_params", {}) @@ -1348,7 +1802,7 @@ def get_traceparent_from_header(self, headers): return _parent_context - def _get_span_context(self, kwargs): + def _get_span_context(self, kwargs, default_span: Optional[Span] = None): from opentelemetry import context, trace from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator, @@ -1363,12 +1817,16 @@ def _get_span_context(self, kwargs): # Priority 1: Explicit parent span from metadata if parent_otel_span is not None: - verbose_logger.debug("OpenTelemetry: Using explicit parent span from metadata") + verbose_logger.debug( + "OpenTelemetry: Using explicit parent span from metadata" + ) return trace.set_span_in_context(parent_otel_span), parent_otel_span # Priority 2: HTTP traceparent header if traceparent is not None: - verbose_logger.debug("OpenTelemetry: Using traceparent header for context propagation") + verbose_logger.debug( + "OpenTelemetry: Using traceparent header for context propagation" + ) carrier = {"traceparent": traceparent} return TraceContextTextMapPropagator().extract(carrier=carrier), None @@ -1381,25 +1839,23 @@ def _get_span_context(self, kwargs): verbose_logger.debug( "OpenTelemetry: Using active span from global context: %s (trace_id=%s, span_id=%s, is_recording=%s)", current_span, - format(span_context.trace_id, '032x'), - format(span_context.span_id, '016x'), - current_span.is_recording() + format(span_context.trace_id, "032x"), + format(span_context.span_id, "016x"), + current_span.is_recording(), ) return context.get_current(), current_span except Exception as e: - verbose_logger.debug("OpenTelemetry: Error getting current span: %s", str(e)) + verbose_logger.debug( + "OpenTelemetry: Error getting current span: %s", str(e) + ) # Priority 4: No parent context - verbose_logger.debug("OpenTelemetry: No parent context found, creating root span") + verbose_logger.debug( + "OpenTelemetry: No parent context found, creating root span" + ) return None, None def _get_span_processor(self, dynamic_headers: Optional[dict] = None): - from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( - OTLPSpanExporter as OTLPSpanExporterGRPC, - ) - from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( - OTLPSpanExporter as OTLPSpanExporterHTTP, - ) from opentelemetry.sdk.trace.export import ( BatchSpanProcessor, ConsoleSpanExporter, @@ -1437,6 +1893,16 @@ def _get_span_processor(self, dynamic_headers: Optional[dict] = None): or self.OTEL_EXPORTER == "http/protobuf" or self.OTEL_EXPORTER == "http/json" ): + try: + from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( + OTLPSpanExporter as OTLPSpanExporterHTTP, + ) + except ImportError as exc: + raise ImportError( + "OpenTelemetry OTLP HTTP exporter is not available. Install " + "`opentelemetry-exporter-otlp` to enable OTLP HTTP." + ) from exc + verbose_logger.debug( "OpenTelemetry: intiializing http exporter. 
Value of OTEL_EXPORTER: %s", self.OTEL_EXPORTER, @@ -1450,6 +1916,16 @@ def _get_span_processor(self, dynamic_headers: Optional[dict] = None): ), ) elif self.OTEL_EXPORTER == "otlp_grpc" or self.OTEL_EXPORTER == "grpc": + try: + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import ( + OTLPSpanExporter as OTLPSpanExporterGRPC, + ) + except ImportError as exc: + raise ImportError( + "OpenTelemetry OTLP gRPC exporter is not available. Install " + "`opentelemetry-exporter-otlp` and `grpcio` (or `litellm[grpc]`)." + ) from exc + verbose_logger.debug( "OpenTelemetry: intiializing grpc exporter. Value of OTEL_EXPORTER: %s", self.OTEL_EXPORTER, @@ -1499,7 +1975,8 @@ def _get_log_exporter(self): ) return self.OTEL_EXPORTER - if self.OTEL_EXPORTER == "console": + otel_logs_exporter = os.getenv("OTEL_LOGS_EXPORTER") + if self.OTEL_EXPORTER == "console" or otel_logs_exporter == "console": from opentelemetry.sdk._logs.export import ConsoleLogExporter verbose_logger.debug( @@ -1525,9 +2002,15 @@ def _get_log_exporter(self): endpoint=normalized_endpoint, headers=_split_otel_headers ) elif self.OTEL_EXPORTER == "otlp_grpc" or self.OTEL_EXPORTER == "grpc": - from opentelemetry.exporter.otlp.proto.grpc._log_exporter import ( - OTLPLogExporter, - ) + try: + from opentelemetry.exporter.otlp.proto.grpc._log_exporter import ( + OTLPLogExporter, + ) + except ImportError as exc: + raise ImportError( + "OpenTelemetry OTLP gRPC log exporter is not available. Install " + "`opentelemetry-exporter-otlp` and `grpcio` (or `litellm[grpc]`)." + ) from exc verbose_logger.debug( "OpenTelemetry: Using gRPC log exporter. Value of OTEL_EXPORTER: %s, endpoint: %s", @@ -1546,6 +2029,75 @@ def _get_log_exporter(self): return ConsoleLogExporter() + def _get_metric_reader(self): + """ + Get the appropriate metric reader based on the configuration. + """ + from opentelemetry.sdk.metrics import Histogram + from opentelemetry.sdk.metrics.export import ( + AggregationTemporality, + ConsoleMetricExporter, + PeriodicExportingMetricReader, + ) + + verbose_logger.debug( + "OpenTelemetry Logger, initializing metric reader\nself.OTEL_EXPORTER: %s\nself.OTEL_ENDPOINT: %s\nself.OTEL_HEADERS: %s", + self.OTEL_EXPORTER, + self.OTEL_ENDPOINT, + self.OTEL_HEADERS, + ) + + _split_otel_headers = OpenTelemetry._get_headers_dictionary(self.OTEL_HEADERS) + normalized_endpoint = self._normalize_otel_endpoint( + self.OTEL_ENDPOINT, "metrics" + ) + + if self.OTEL_EXPORTER == "console": + exporter = ConsoleMetricExporter() + return PeriodicExportingMetricReader(exporter, export_interval_millis=5000) + + elif ( + self.OTEL_EXPORTER == "otlp_http" + or self.OTEL_EXPORTER == "http/protobuf" + or self.OTEL_EXPORTER == "http/json" + ): + from opentelemetry.exporter.otlp.proto.http.metric_exporter import ( + OTLPMetricExporter, + ) + + exporter = OTLPMetricExporter( + endpoint=normalized_endpoint, + headers=_split_otel_headers, + preferred_temporality={Histogram: AggregationTemporality.DELTA}, + ) + return PeriodicExportingMetricReader(exporter, export_interval_millis=5000) + + elif self.OTEL_EXPORTER == "otlp_grpc" or self.OTEL_EXPORTER == "grpc": + try: + from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import ( + OTLPMetricExporter, + ) + except ImportError as exc: + raise ImportError( + "OpenTelemetry OTLP gRPC metric exporter is not available. Install " + "`opentelemetry-exporter-otlp` and `grpcio` (or `litellm[grpc]`)." 
+ ) from exc + + exporter = OTLPMetricExporter( + endpoint=normalized_endpoint, + headers=_split_otel_headers, + preferred_temporality={Histogram: AggregationTemporality.DELTA}, + ) + return PeriodicExportingMetricReader(exporter, export_interval_millis=5000) + + else: + verbose_logger.warning( + "OpenTelemetry: Unknown metric exporter '%s', defaulting to console. Supported: console, otlp_http, otlp_grpc", + self.OTEL_EXPORTER, + ) + exporter = ConsoleMetricExporter() + return PeriodicExportingMetricReader(exporter, export_interval_millis=5000) + def _normalize_otel_endpoint( self, endpoint: Optional[str], signal_type: str ) -> Optional[str]: @@ -1743,8 +2295,9 @@ def create_litellm_proxy_request_started_span( """ Create a span for the received proxy server request. """ + return self.tracer.start_span( - name="Received Proxy Server Request", + name=LITELLM_PROXY_REQUEST_SPAN_NAME, start_time=self._to_ns(start_time), context=self.get_traceparent_from_header(headers=headers), kind=self.span_kind.SERVER, diff --git a/litellm/integrations/opentelemetry_utils/base_otel_llm_obs_attributes.py b/litellm/integrations/opentelemetry_utils/base_otel_llm_obs_attributes.py new file mode 100644 index 000000000000..f74da8231f3b --- /dev/null +++ b/litellm/integrations/opentelemetry_utils/base_otel_llm_obs_attributes.py @@ -0,0 +1,37 @@ +from abc import ABC +from typing import TYPE_CHECKING, Any, Dict, Union + +if TYPE_CHECKING: + from opentelemetry.trace import Span + + +class BaseLLMObsOTELAttributes(ABC): + @staticmethod + def set_messages(span: "Span", kwargs: Dict[str, Any]): + pass + + @staticmethod + def set_response_output_messages(span: "Span", response_obj): + pass + + +def cast_as_primitive_value_type(value) -> Union[str, bool, int, float]: + """ + Converts a value to an OTEL-supported primitive for Arize/Phoenix observability. + """ + if value is None: + return "" + if isinstance(value, (str, bool, int, float)): + return value + try: + return str(value) + except Exception: + return "" + + +def safe_set_attribute(span: "Span", key: str, value: Any): + """ + Sets a span attribute safely with OTEL-compliant primitive typing for Arize/Phoenix. + """ + primitive_value = cast_as_primitive_value_type(value) + span.set_attribute(key, primitive_value) diff --git a/litellm/integrations/opik/opik.py b/litellm/integrations/opik/opik.py index c28aa14a11ee..7b687d34d1cf 100644 --- a/litellm/integrations/opik/opik.py +++ b/litellm/integrations/opik/opik.py @@ -3,10 +3,9 @@ """ import asyncio -from datetime import timezone -import json import traceback -from typing import Dict, List +from datetime import datetime +from typing import Any, Dict, Optional from litellm._logging import verbose_logger from litellm.integrations.custom_batch_logger import CustomBatchLogger @@ -16,12 +15,22 @@ httpxSpecialProvider, ) -from .utils import ( - create_usage_object, - create_uuid7, - get_opik_config_variable, - get_traces_and_spans_from_payload, -) +from . 
import opik_payload_builder, utils + +try: + from opik.api_objects import opik_client +except Exception: + opik_client = None + + +def _should_skip_event(kwargs: Dict[str, Any]) -> bool: + """Check if event should be skipped due to missing standard_logging_object.""" + if kwargs.get("standard_logging_object") is None: + verbose_logger.debug( + "OpikLogger skipping event; no standard_logging_object found" + ) + return True + return False class OpikLogger(CustomBatchLogger): @@ -29,76 +38,140 @@ class OpikLogger(CustomBatchLogger): Opik Logger for logging events to an Opik Server """ - def __init__(self, **kwargs): + def __init__(self, **kwargs: Any) -> None: self.async_httpx_client = get_async_httpx_client( llm_provider=httpxSpecialProvider.LoggingCallback ) self.sync_httpx_client = _get_httpx_client() - self.opik_project_name = get_opik_config_variable( - "project_name", - user_value=kwargs.get("project_name", None), - default_value="Default Project", + self.opik_project_name: str = ( + utils.get_opik_config_variable( + "project_name", + user_value=kwargs.get("project_name", None), + default_value="Default Project", + ) + or "Default Project" ) - opik_base_url = get_opik_config_variable( - "url_override", - user_value=kwargs.get("url", None), - default_value="https://www.comet.com/opik/api", + opik_base_url: str = ( + utils.get_opik_config_variable( + "url_override", + user_value=kwargs.get("url", None), + default_value="https://www.comet.com/opik/api", + ) + or "https://www.comet.com/opik/api" ) - opik_api_key = get_opik_config_variable( + opik_api_key: Optional[str] = utils.get_opik_config_variable( "api_key", user_value=kwargs.get("api_key", None), default_value=None ) - opik_workspace = get_opik_config_variable( + opik_workspace: Optional[str] = utils.get_opik_config_variable( "workspace", user_value=kwargs.get("workspace", None), default_value=None ) - self.trace_url = f"{opik_base_url}/v1/private/traces/batch" - self.span_url = f"{opik_base_url}/v1/private/spans/batch" + self.trace_url: str = f"{opik_base_url}/v1/private/traces/batch" + self.span_url: str = f"{opik_base_url}/v1/private/spans/batch" - self.headers = {} + self.headers: Dict[str, str] = {} if opik_workspace: self.headers["Comet-Workspace"] = opik_workspace if opik_api_key: self.headers["authorization"] = opik_api_key - self.opik_workspace = opik_workspace - self.opik_api_key = opik_api_key + self.opik_workspace: Optional[str] = opik_workspace + self.opik_api_key: Optional[str] = opik_api_key try: asyncio.create_task(self.periodic_flush()) - self.flush_lock = asyncio.Lock() + self.flush_lock: Optional[asyncio.Lock] = asyncio.Lock() except Exception as e: verbose_logger.exception( f"OpikLogger - Asynchronous processing not initialized as we are not running in an async context {str(e)}" ) self.flush_lock = None + # Initialize _opik_client attribute + if opik_client is not None: + self._opik_client = opik_client.get_client_cached() + else: + self._opik_client = None + super().__init__(**kwargs, flush_lock=self.flush_lock) - async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): + async def async_log_success_event( + self, + kwargs: Dict[str, Any], + response_obj: Any, + start_time: datetime, + end_time: datetime, + ) -> None: try: - opik_payload = self._create_opik_payload( + if _should_skip_event(kwargs): + return + + # Build payload using the payload builder + trace_payload, span_payload = opik_payload_builder.build_opik_payload( kwargs=kwargs, response_obj=response_obj, 
start_time=start_time, end_time=end_time, + project_name=self.opik_project_name, ) - self.log_queue.extend(opik_payload) - verbose_logger.debug( - f"OpikLogger added event to log_queue - Will flush in {self.flush_interval} seconds..." - ) + if self._opik_client is not None: + # Opik native client is available, use it to send data + if trace_payload is not None: + self._opik_client.trace( + id=trace_payload.id, + name=trace_payload.name, + start_time=datetime.fromisoformat(trace_payload.start_time), + end_time=datetime.fromisoformat(trace_payload.end_time), + input=trace_payload.input, + output=trace_payload.output, + metadata=trace_payload.metadata, + tags=trace_payload.tags, + thread_id=trace_payload.thread_id, + project_name=trace_payload.project_name, + ) + + self._opik_client.span( + id=span_payload.id, + trace_id=span_payload.trace_id, + parent_span_id=span_payload.parent_span_id, + name=span_payload.name, + type=span_payload.type, + model=span_payload.model, + start_time=datetime.fromisoformat(span_payload.start_time), + end_time=datetime.fromisoformat(span_payload.end_time), + input=span_payload.input, + output=span_payload.output, + metadata=span_payload.metadata, + tags=span_payload.tags, + usage=span_payload.usage, + project_name=span_payload.project_name, + provider=span_payload.provider, + total_cost=span_payload.total_cost, + ) + else: + # Add payloads to LiteLLM queue + if trace_payload is not None: + self.log_queue.append(trace_payload.__dict__) + self.log_queue.append(span_payload.__dict__) - if len(self.log_queue) >= self.batch_size: - verbose_logger.debug("OpikLogger - Flushing batch") - await self.flush_queue() + verbose_logger.debug( + f"OpikLogger added event to log_queue - Will flush in {self.flush_interval} seconds..." + ) + + if len(self.log_queue) >= self.batch_size: + verbose_logger.debug("OpikLogger - Flushing batch") + await self.flush_queue() except Exception as e: verbose_logger.exception( f"OpikLogger failed to log success event - {str(e)}\n{traceback.format_exc()}" ) - def _sync_send(self, url: str, headers: Dict[str, str], batch: Dict): + def _sync_send( + self, url: str, headers: Dict[str, str], batch: Dict[str, Any] + ) -> None: try: response = self.sync_httpx_client.post( url=url, headers=headers, json=batch # type: ignore @@ -113,30 +186,82 @@ def _sync_send(self, url: str, headers: Dict[str, str], batch: Dict): f"OpikLogger failed to send batch - {str(e)}\n{traceback.format_exc()}" ) - def log_success_event(self, kwargs, response_obj, start_time, end_time): + def log_success_event( + self, + kwargs: Dict[str, Any], + response_obj: Any, + start_time: datetime, + end_time: datetime, + ) -> None: try: - opik_payload = self._create_opik_payload( + if _should_skip_event(kwargs): + return + + # Build payload using the payload builder + trace_payload, span_payload = opik_payload_builder.build_opik_payload( kwargs=kwargs, response_obj=response_obj, start_time=start_time, end_time=end_time, + project_name=self.opik_project_name, ) - - traces, spans = get_traces_and_spans_from_payload(opik_payload) - if len(traces) > 0: - self._sync_send( - url=self.trace_url, headers=self.headers, batch={"traces": traces} + if self._opik_client is not None: + # Opik native client is available, use it to send data + if trace_payload is not None: + self._opik_client.trace( + id=trace_payload.id, + name=trace_payload.name, + start_time=datetime.fromisoformat(trace_payload.start_time), + end_time=datetime.fromisoformat(trace_payload.end_time), + input=trace_payload.input, + 
output=trace_payload.output, + metadata=trace_payload.metadata, + tags=trace_payload.tags, + thread_id=trace_payload.thread_id, + project_name=trace_payload.project_name, + ) + + self._opik_client.span( + id=span_payload.id, + trace_id=span_payload.trace_id, + parent_span_id=span_payload.parent_span_id, + name=span_payload.name, + type=span_payload.type, + model=span_payload.model, + start_time=datetime.fromisoformat(span_payload.start_time), + end_time=datetime.fromisoformat(span_payload.end_time), + input=span_payload.input, + output=span_payload.output, + metadata=span_payload.metadata, + tags=span_payload.tags, + usage=span_payload.usage, + project_name=span_payload.project_name, + provider=span_payload.provider, + total_cost=span_payload.total_cost, ) - if len(spans) > 0: + else: + # Opik native client is not available, use LiteLLM queue to send data + if trace_payload is not None: + self._sync_send( + url=self.trace_url, + headers=self.headers, + batch={"traces": [trace_payload.__dict__]}, + ) + + # Always send span self._sync_send( - url=self.span_url, headers=self.headers, batch={"spans": spans} + url=self.span_url, + headers=self.headers, + batch={"spans": [span_payload.__dict__]}, ) except Exception as e: verbose_logger.exception( f"OpikLogger failed to log success event - {str(e)}\n{traceback.format_exc()}" ) - async def _submit_batch(self, url: str, headers: Dict[str, str], batch: Dict): + async def _submit_batch( + self, url: str, headers: Dict[str, str], batch: Dict[str, Any] + ) -> None: try: response = await self.async_httpx_client.post( url=url, headers=headers, json=batch # type: ignore @@ -154,8 +279,8 @@ async def _submit_batch(self, url: str, headers: Dict[str, str], batch: Dict): except Exception as e: verbose_logger.exception(f"OpikLogger failed to send batch - {str(e)}") - def _create_opik_headers(self): - headers = {} + def _create_opik_headers(self) -> Dict[str, str]: + headers: Dict[str, str] = {} if self.opik_workspace: headers["Comet-Workspace"] = self.opik_workspace @@ -163,13 +288,13 @@ def _create_opik_headers(self): headers["authorization"] = self.opik_api_key return headers - async def async_send_batch(self): + async def async_send_batch(self) -> None: verbose_logger.info("Calling async_send_batch") if not self.log_queue: return # Split the log_queue into traces and spans - traces, spans = get_traces_and_spans_from_payload(self.log_queue) + traces, spans = utils.get_traces_and_spans_from_payload(self.log_queue) # Send trace batch if len(traces) > 0: @@ -182,183 +307,3 @@ async def async_send_batch(self): url=self.span_url, headers=self.headers, batch={"spans": spans} ) verbose_logger.info(f"Sent {len(spans)} spans") - - def _create_opik_payload( # noqa: PLR0915 - self, kwargs, response_obj, start_time, end_time - ) -> List[Dict]: - # Get metadata - _litellm_params = kwargs.get("litellm_params", {}) or {} - litellm_params_metadata = _litellm_params.get("metadata", {}) or {} - - # Extract opik metadata - litellm_opik_metadata = litellm_params_metadata.get("opik", {}) - - # Use standard_logging_object to create metadata and input/output data - standard_logging_object = kwargs.get("standard_logging_object", None) - if standard_logging_object is None: - verbose_logger.debug( - "OpikLogger skipping event; no standard_logging_object found" - ) - return [] - - # Update litellm_opik_metadata with opik metadata from requester - standard_logging_metadata = standard_logging_object.get("metadata", {}) or {} - requester_metadata = 
standard_logging_metadata.get("requester_metadata", {}) or {} - - # If requester_metadata is empty, try to get it from user_api_key_auth_metadata saved in api key - if not requester_metadata: - requester_metadata = standard_logging_metadata.get( - "user_api_key_auth_metadata", {} - ) or {} - - requester_opik_metadata = requester_metadata.get("opik", {}) or {} - litellm_opik_metadata.update(requester_opik_metadata) - - verbose_logger.debug( - f"litellm_opik_metadata - {json.dumps(litellm_opik_metadata, default=str)}" - ) - - project_name = litellm_opik_metadata.get("project_name", self.opik_project_name) - - # Extract trace_id and parent_span_id - current_span_data = litellm_opik_metadata.get("current_span_data", None) - if isinstance(current_span_data, dict): - trace_id = current_span_data.get("trace_id", None) - parent_span_id = current_span_data.get("id", None) - elif current_span_data: - trace_id = current_span_data.trace_id - parent_span_id = current_span_data.id - else: - trace_id = None - parent_span_id = None - - # Create Opik tags - opik_tags = litellm_opik_metadata.get("tags", []) - if kwargs.get("custom_llm_provider"): - opik_tags.append(kwargs["custom_llm_provider"]) - - # Get thread_id if present - thread_id = litellm_opik_metadata.get("thread_id", None) - - # Override with any opik_ headers from proxy request - proxy_server_request = _litellm_params.get("proxy_server_request", {}) or {} - proxy_headers = proxy_server_request.get("headers", {}) or {} - for key, value in proxy_headers.items(): - if key.startswith("opik_"): - param_key = key.replace("opik_", "", 1) - if param_key == "project_name" and value: - project_name = value - elif param_key == "thread_id" and value: - thread_id = value - elif param_key == "tags" and value: - try: - parsed_tags = json.loads(value) - if isinstance(parsed_tags, list): - opik_tags.extend(parsed_tags) - except (json.JSONDecodeError, TypeError): - pass - - # Create input and output data - input_data = standard_logging_object.get("messages", {}) - output_data = standard_logging_object.get("response", {}) - - # Create usage object - usage = create_usage_object(response_obj["usage"]) - - # Define span and trace names - span_name = "%s_%s_%s" % ( - response_obj.get("model", "unknown-model"), - response_obj.get("object", "unknown-object"), - response_obj.get("created", 0), - ) - trace_name = response_obj.get("object", "unknown type") - - # Create metadata object, we add the opik metadata first and then - # update it with the standard_logging_object metadata - metadata = litellm_opik_metadata - if "current_span_data" in metadata: - del metadata["current_span_data"] - metadata["created_from"] = "litellm" - - metadata.update(standard_logging_metadata) - if "call_type" in standard_logging_object: - metadata["type"] = standard_logging_object["call_type"] - if "status" in standard_logging_object: - metadata["status"] = standard_logging_object["status"] - if "response_cost" in kwargs: - metadata["cost"] = { - "total_tokens": kwargs["response_cost"], - "currency": "USD", - } - if "response_cost_failure_debug_info" in kwargs: - metadata["response_cost_failure_debug_info"] = kwargs[ - "response_cost_failure_debug_info" - ] - if "model_map_information" in standard_logging_object: - metadata["model_map_information"] = standard_logging_object[ - "model_map_information" - ] - if "model" in standard_logging_object: - metadata["model"] = standard_logging_object["model"] - if "model_id" in standard_logging_object: - metadata["model_id"] = 
standard_logging_object["model_id"] - if "model_group" in standard_logging_object: - metadata["model_group"] = standard_logging_object["model_group"] - if "api_base" in standard_logging_object: - metadata["api_base"] = standard_logging_object["api_base"] - if "cache_hit" in standard_logging_object: - metadata["cache_hit"] = standard_logging_object["cache_hit"] - if "saved_cache_cost" in standard_logging_object: - metadata["saved_cache_cost"] = standard_logging_object["saved_cache_cost"] - if "error_str" in standard_logging_object: - metadata["error_str"] = standard_logging_object["error_str"] - if "model_parameters" in standard_logging_object: - metadata["model_parameters"] = standard_logging_object["model_parameters"] - if "hidden_params" in standard_logging_object: - metadata["hidden_params"] = standard_logging_object["hidden_params"] - - payload = [] - if trace_id is None: - trace_id = create_uuid7() - verbose_logger.debug( - f"OpikLogger creating payload for trace with id {trace_id}" - ) - payload.append( - { - "project_name": project_name, - "id": trace_id, - "name": trace_name, - "start_time": start_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), - "end_time": end_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), - "input": input_data, - "output": output_data, - "metadata": metadata, - "tags": opik_tags, - "thread_id": thread_id, - } - ) - - span_id = create_uuid7() - verbose_logger.debug( - f"OpikLogger creating payload for trace with id {trace_id} and span with id {span_id}" - ) - payload.append( - { - "id": span_id, - "project_name": project_name, - "trace_id": trace_id, - "parent_span_id": parent_span_id, - "name": span_name, - "type": "llm", - "start_time": start_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), - "end_time": end_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), - "input": input_data, - "output": output_data, - "metadata": metadata, - "tags": opik_tags, - "thread_id": thread_id, - "usage": usage, - } - ) - verbose_logger.debug(f"Payload: {payload}") - return payload diff --git a/litellm/integrations/opik/opik_payload_builder/__init__.py b/litellm/integrations/opik/opik_payload_builder/__init__.py new file mode 100644 index 000000000000..c57fceaa110d --- /dev/null +++ b/litellm/integrations/opik/opik_payload_builder/__init__.py @@ -0,0 +1,10 @@ +""" +Opik payload builder namespace. + +Public API: + build_opik_payload - Main function to create Opik trace and span payloads +""" + +from .api import build_opik_payload + +__all__ = ["build_opik_payload"] diff --git a/litellm/integrations/opik/opik_payload_builder/api.py b/litellm/integrations/opik/opik_payload_builder/api.py new file mode 100644 index 000000000000..99dbea165e92 --- /dev/null +++ b/litellm/integrations/opik/opik_payload_builder/api.py @@ -0,0 +1,121 @@ +"""Public API for Opik payload building.""" + +from datetime import datetime +from typing import Any, Dict, Optional, Tuple + +from litellm.integrations.opik import utils + +from . import extractors, payload_builders, types + + +def build_opik_payload( + kwargs: Dict[str, Any], + response_obj: Dict[str, Any], + start_time: datetime, + end_time: datetime, + project_name: str, +) -> Tuple[Optional[types.TracePayload], types.SpanPayload]: + """ + Build Opik trace and span payloads from LiteLLM completion data. + + This is the main public API for creating Opik payloads. It: + 1. Extracts all necessary data from LiteLLM kwargs and response + 2. 
Decides whether to create a new trace or attach to existing + 3. Builds trace payload (if new trace) + 4. Builds span payload (always) + + Args: + kwargs: LiteLLM kwargs containing request metadata and logging data + response_obj: LiteLLM response object containing model response + start_time: Request start time + end_time: Request end time + project_name: Default Opik project name + + Returns: + Tuple of (optional trace payload, span payload) + - First element is TracePayload if creating a new trace, None if attaching to existing + - Second element is always SpanPayload + """ + standard_logging_object = kwargs["standard_logging_object"] + + # Extract litellm params and metadata + litellm_params = kwargs.get("litellm_params", {}) or {} + litellm_metadata = litellm_params.get("metadata", {}) or {} + standard_logging_metadata = standard_logging_object.get("metadata", {}) or {} + + # Extract and merge Opik metadata + opik_metadata = extractors.extract_opik_metadata( + litellm_metadata, standard_logging_metadata + ) + + # Extract project name + current_project_name = opik_metadata.get("project_name", project_name) + + # Extract trace identifiers + current_span_data = opik_metadata.get("current_span_data") + trace_id, parent_span_id = extractors.extract_span_identifiers(current_span_data) + + # Extract tags and thread_id + tags = extractors.extract_tags(opik_metadata, kwargs.get("custom_llm_provider")) + thread_id = opik_metadata.get("thread_id") + + # Apply proxy header overrides + proxy_request = litellm_params.get("proxy_server_request", {}) or {} + proxy_headers = proxy_request.get("headers", {}) or {} + current_project_name, tags, thread_id = extractors.apply_proxy_header_overrides( + current_project_name, tags, thread_id, proxy_headers + ) + + # Build shared metadata + metadata = extractors.extract_and_build_metadata( + opik_metadata=opik_metadata, + standard_logging_metadata=standard_logging_metadata, + standard_logging_object=standard_logging_object, + litellm_kwargs=kwargs, + ) + + # Get input/output data + input_data = standard_logging_object.get("messages", {}) + output_data = standard_logging_object.get("response", {}) + + # Decide whether to create a new trace or attach to existing + trace_payload: Optional[types.TracePayload] = None + if trace_id is None: + trace_id = utils.create_uuid7() + trace_payload = payload_builders.build_trace_payload( + project_name=current_project_name, + trace_id=trace_id, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + input_data=input_data, + output_data=output_data, + metadata=metadata, + tags=tags, + thread_id=thread_id, + ) + + # Always create a span + usage = utils.create_usage_object(response_obj["usage"]) + + # Extract provider and cost + provider = extractors.normalize_provider_name(kwargs.get("custom_llm_provider")) + cost = kwargs.get("response_cost") + + span_payload = payload_builders.build_span_payload( + project_name=current_project_name, + trace_id=trace_id, + parent_span_id=parent_span_id, + response_obj=response_obj, + start_time=start_time, + end_time=end_time, + input_data=input_data, + output_data=output_data, + metadata=metadata, + tags=tags, + usage=usage, + provider=provider, + cost=cost, + ) + + return trace_payload, span_payload diff --git a/litellm/integrations/opik/opik_payload_builder/extractors.py b/litellm/integrations/opik/opik_payload_builder/extractors.py new file mode 100644 index 000000000000..e4ff021778a9 --- /dev/null +++ b/litellm/integrations/opik/opik_payload_builder/extractors.py @@ 
-0,0 +1,221 @@ +"""Data extraction functions for Opik payload building.""" + +import json +from typing import Any, Dict, List, Optional, Tuple + +from litellm import _logging + + +def normalize_provider_name(provider: Optional[str]) -> Optional[str]: + """ + Normalize LiteLLM provider names to standardized string names. + + Args: + provider: LiteLLM internal provider name + + Returns: + Normalized provider name or the original if no mapping exists + """ + if provider is None: + return None + + # Provider mapping to names used in Opik + provider_mapping = { + "openai": "openai", + "vertex_ai-language-models": "google_vertexai", + "gemini": "google_ai", + "anthropic": "anthropic", + "vertex_ai-anthropic_models": "anthropic_vertexai", + "bedrock": "bedrock", + "bedrock_converse": "bedrock", + "groq": "groq", + } + + return provider_mapping.get(provider, provider) + + +def extract_opik_metadata( + litellm_metadata: Dict[str, Any], + standard_logging_metadata: Dict[str, Any], +) -> Dict[str, Any]: + """ + Extract and merge Opik metadata from request and requester. + + Args: + litellm_metadata: Metadata from litellm_params + standard_logging_metadata: Metadata from standard_logging_object + + Returns: + Merged Opik metadata dictionary + """ + opik_meta = litellm_metadata.get("opik", {}).copy() + + requester_metadata = standard_logging_metadata.get("requester_metadata", {}) or {} + requester_opik = requester_metadata.get("opik", {}) or {} + opik_meta.update(requester_opik) + + _logging.verbose_logger.debug( + f"litellm_opik_metadata - {json.dumps(opik_meta, default=str)}" + ) + + return opik_meta + + +def extract_span_identifiers( + current_span_data: Any, +) -> Tuple[Optional[str], Optional[str]]: + """ + Extract trace_id and parent_span_id from current_span_data. + + Args: + current_span_data: Either dict with trace_id/id keys or Opik object + + Returns: + Tuple of (trace_id, parent_span_id), both optional + """ + if current_span_data is None: + return None, None + + if isinstance(current_span_data, dict): + return (current_span_data.get("trace_id"), current_span_data.get("id")) + + try: + return current_span_data.trace_id, current_span_data.id + except AttributeError: + _logging.verbose_logger.warning( + f"Unexpected current_span_data format: {type(current_span_data)}" + ) + return None, None + + +def extract_tags( + opik_metadata: Dict[str, Any], + custom_llm_provider: Optional[str], +) -> List[str]: + """ + Extract and build list of tags. + + Args: + opik_metadata: Opik metadata dictionary + custom_llm_provider: LLM provider name to add as tag + + Returns: + List of tags + """ + tags = list(opik_metadata.get("tags", [])) + + if custom_llm_provider: + tags.append(custom_llm_provider) + + return tags + + +def apply_proxy_header_overrides( + project_name: str, + tags: List[str], + thread_id: Optional[str], + proxy_headers: Dict[str, Any], +) -> Tuple[str, List[str], Optional[str]]: + """ + Apply overrides from proxy request headers (opik_* prefix). 
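+
+    Illustrative header overrides (hypothetical values):
+        opik_project_name: my-project      -> replaces project_name
+        opik_thread_id: thread-42          -> replaces thread_id
+        opik_tags: ["prod", "canary"]      -> JSON-encoded list, extends tags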
+ + Args: + project_name: Current project name + tags: Current tags list + thread_id: Current thread ID + proxy_headers: HTTP headers from proxy request + + Returns: + Tuple of (project_name, tags, thread_id) with overrides applied + """ + for key, value in proxy_headers.items(): + if not key.startswith("opik_") or not value: + continue + + param_key = key.replace("opik_", "", 1) + + if param_key == "project_name": + project_name = value + elif param_key == "thread_id": + thread_id = value + elif param_key == "tags": + try: + parsed_tags = json.loads(value) + if isinstance(parsed_tags, list): + tags.extend(parsed_tags) + except (json.JSONDecodeError, TypeError): + _logging.verbose_logger.warning( + f"Failed to parse tags from header: {value}" + ) + + return project_name, tags, thread_id + + +def extract_and_build_metadata( + opik_metadata: Dict[str, Any], + standard_logging_metadata: Dict[str, Any], + standard_logging_object: Dict[str, Any], + litellm_kwargs: Dict[str, Any], +) -> Dict[str, Any]: + """ + Build the complete metadata dictionary from all available sources. + + This combines: + - Opik-specific metadata (tags, etc.) + - Standard logging metadata + - Fields from standard_logging_object (model info, status, etc.) + - Cost information from litellm_kwargs (calculated after completion) + + Args: + opik_metadata: Opik-specific metadata from request + standard_logging_metadata: Standard logging metadata + standard_logging_object: Full standard logging object with call details + litellm_kwargs: Original LiteLLM kwargs (includes response_cost) + + Returns: + Complete metadata dictionary for trace/span + """ + # Start with opik metadata (excluding current_span_data which is used for trace linking) + metadata = {k: v for k, v in opik_metadata.items() if k != "current_span_data"} + metadata["created_from"] = "litellm" + + # Merge with standard logging metadata + metadata.update(standard_logging_metadata) + + # Add fields from standard_logging_object + # These come from the LiteLLM logging infrastructure + field_mappings = { + "call_type": "type", + "status": "status", + "model": "model", + "model_id": "model_id", + "model_group": "model_group", + "api_base": "api_base", + "cache_hit": "cache_hit", + "saved_cache_cost": "saved_cache_cost", + "error_str": "error_str", + "model_parameters": "model_parameters", + "hidden_params": "hidden_params", + "model_map_information": "model_map_information", + } + + for source_key, dest_key in field_mappings.items(): + if source_key in standard_logging_object: + metadata[dest_key] = standard_logging_object[source_key] + + # Add cost information + # response_cost is calculated by LiteLLM after completion and added to kwargs + # See: litellm/litellm_core_utils/llm_response_utils/response_metadata.py + if "response_cost" in litellm_kwargs: + metadata["cost"] = { + "total_tokens": litellm_kwargs["response_cost"], + "currency": "USD", + } + + # Add debug info if cost calculation failed + if "response_cost_failure_debug_info" in litellm_kwargs: + metadata["response_cost_failure_debug_info"] = litellm_kwargs[ + "response_cost_failure_debug_info" + ] + + return metadata diff --git a/litellm/integrations/opik/opik_payload_builder/payload_builders.py b/litellm/integrations/opik/opik_payload_builder/payload_builders.py new file mode 100644 index 000000000000..4656924fdb5a --- /dev/null +++ b/litellm/integrations/opik/opik_payload_builder/payload_builders.py @@ -0,0 +1,89 @@ +"""Payload builders for Opik traces and spans.""" + +from datetime import datetime, 
timezone +from typing import Any, Dict, List, Optional + +from litellm import _logging +from litellm.integrations.opik import utils + +from . import types + + +def build_trace_payload( + project_name: str, + trace_id: str, + response_obj: Dict[str, Any], + start_time: datetime, + end_time: datetime, + input_data: Any, + output_data: Any, + metadata: Dict[str, Any], + tags: List[str], + thread_id: Optional[str], +) -> types.TracePayload: + """Build a complete trace payload.""" + trace_name = response_obj.get("object", "unknown type") + + return types.TracePayload( + project_name=project_name, + id=trace_id, + name=trace_name, + start_time=( + start_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z") + ), + end_time=end_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), + input=input_data, + output=output_data, + metadata=metadata, + tags=tags, + thread_id=thread_id, + ) + + +def build_span_payload( + project_name: str, + trace_id: str, + parent_span_id: Optional[str], + response_obj: Dict[str, Any], + start_time: datetime, + end_time: datetime, + input_data: Any, + output_data: Any, + metadata: Dict[str, Any], + tags: List[str], + usage: Dict[str, int], + provider: Optional[str] = None, + cost: Optional[float] = None, +) -> types.SpanPayload: + """Build a complete span payload.""" + span_id = utils.create_uuid7() + + model = response_obj.get("model", "unknown-model") + obj_type = response_obj.get("object", "unknown-object") + created = response_obj.get("created", 0) + span_name = f"{model}_{obj_type}_{created}" + + _logging.verbose_logger.debug( + f"OpikLogger creating span with id {span_id} for trace {trace_id}" + ) + + return types.SpanPayload( + id=span_id, + project_name=project_name, + trace_id=trace_id, + parent_span_id=parent_span_id, + name=span_name, + type="llm", + model=model, + start_time=( + start_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z") + ), + end_time=end_time.astimezone(timezone.utc).isoformat().replace("+00:00", "Z"), + input=input_data, + output=output_data, + metadata=metadata, + tags=tags, + usage=usage, + provider=provider, + total_cost=cost, + ) diff --git a/litellm/integrations/opik/opik_payload_builder/types.py b/litellm/integrations/opik/opik_payload_builder/types.py new file mode 100644 index 000000000000..070cb11489ae --- /dev/null +++ b/litellm/integrations/opik/opik_payload_builder/types.py @@ -0,0 +1,46 @@ +"""Type definitions for Opik payload building.""" + +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + + +@dataclass +class TracePayload: + """Opik trace payload structure""" + + project_name: str + id: str + name: str + start_time: str + end_time: str + input: Any + output: Any + metadata: Dict[str, Any] + tags: List[str] + thread_id: Optional[str] = None + + +@dataclass +class SpanPayload: + """Opik span payload structure""" + + id: str + project_name: str + trace_id: str + name: str + type: Literal["llm"] + model: str + start_time: str + end_time: str + input: Any + output: Any + metadata: Dict[str, Any] + tags: List[str] + usage: Dict[str, int] + parent_span_id: Optional[str] = None + provider: Optional[str] = None + total_cost: Optional[float] = None + + +PayloadItem = Union[TracePayload, SpanPayload] +TraceSpanPayloadTuple = Tuple[Optional[TracePayload], SpanPayload] diff --git a/litellm/integrations/opik/utils.py b/litellm/integrations/opik/utils.py index 7b3b64dcf381..b0ab5991c913 100644 --- a/litellm/integrations/opik/utils.py +++ 
b/litellm/integrations/opik/utils.py @@ -1,7 +1,7 @@ import configparser import os import time -from typing import Dict, Final, List, Optional +from typing import Any, Dict, Final, List, Optional, Tuple CONFIG_FILE_PATH_DEFAULT: Final[str] = "~/.opik.config" @@ -99,12 +99,26 @@ def create_usage_object(usage): return usage_dict -def _remove_nulls(x): - x_ = {k: v for k, v in x.items() if v is not None} - return x_ +def _remove_nulls(x: Dict[str, Any]) -> Dict[str, Any]: + """Remove None values from dict.""" + return {k: v for k, v in x.items() if v is not None} -def get_traces_and_spans_from_payload(payload: List): +def get_traces_and_spans_from_payload( + payload: List[Dict[str, Any]] +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Separate traces and spans from payload. + + Traces are identified by not having a "type" field. + Spans are identified by having a "type" field. + + Args: + payload: List of dicts containing trace and span data + + Returns: + Tuple of (traces, spans) where both are lists of dicts with null values removed + """ traces = [_remove_nulls(x) for x in payload if "type" not in x] spans = [_remove_nulls(x) for x in payload if "type" in x] return traces, spans diff --git a/litellm/integrations/posthog.py b/litellm/integrations/posthog.py index c609d30ccffc..dd7c3627b874 100644 --- a/litellm/integrations/posthog.py +++ b/litellm/integrations/posthog.py @@ -10,12 +10,17 @@ """ import asyncio +import atexit import os from typing import Any, Dict, Optional, Tuple from litellm._logging import verbose_logger from litellm._uuid import uuid from litellm.integrations.custom_batch_logger import CustomBatchLogger +from litellm.integrations.posthog_mock_client import ( + should_use_posthog_mock, + create_mock_posthog_client, +) from litellm.llms.custom_httpx.http_handler import ( _get_httpx_client, get_async_httpx_client, @@ -39,6 +44,12 @@ def __init__(self, **kwargs): """ try: verbose_logger.debug("PostHog: in init posthog logger") + + self.is_mock_mode = should_use_posthog_mock() + if self.is_mock_mode: + create_mock_posthog_client() + verbose_logger.debug("[POSTHOG MOCK] PostHog logger initialized in mock mode") + if os.getenv("POSTHOG_API_KEY", None) is None: raise Exception("POSTHOG_API_KEY is not set, set 'POSTHOG_API_KEY=<>'") @@ -55,7 +66,10 @@ def __init__(self, **kwargs): self._async_initialized = False self.flush_lock = None self.log_queue = [] - + + # Register cleanup handler to flush internal queue on exit + atexit.register(self._flush_on_exit) + super().__init__( **kwargs, flush_lock=None, batch_size=POSTHOG_MAX_BATCH_SIZE ) @@ -96,7 +110,10 @@ def log_success_event(self, kwargs, response_obj, start_time, end_time): f"Response from PostHog API status_code: {response.status_code}, text: {response.text}" ) - verbose_logger.debug("PostHog: Sync event successfully sent") + if self.is_mock_mode: + verbose_logger.debug("[POSTHOG MOCK] Sync event successfully mocked") + else: + verbose_logger.debug("PostHog: Sync event successfully sent") except Exception as e: verbose_logger.exception(f"PostHog Sync Layer Error - {str(e)}") @@ -316,6 +333,9 @@ async def async_send_batch(self): verbose_logger.debug( f"PostHog: Sending batch of {len(self.log_queue)} events" ) + + if self.is_mock_mode: + verbose_logger.debug("[POSTHOG MOCK] Mock mode enabled - API calls will be intercepted") # Group events by credentials for batch sending batches_by_credentials: Dict[tuple[str, str], list] = {} @@ -346,9 +366,12 @@ async def async_send_batch(self): f"Response from PostHog API 
status_code: {response.status_code}, text: {response.text}" ) - verbose_logger.debug( - f"PostHog: Batch of {len(self.log_queue)} events successfully sent" - ) + if self.is_mock_mode: + verbose_logger.debug(f"[POSTHOG MOCK] Batch of {len(self.log_queue)} events successfully mocked") + else: + verbose_logger.debug( + f"PostHog: Batch of {len(self.log_queue)} events successfully sent" + ) except Exception as e: verbose_logger.exception(f"PostHog Error sending batch API - {str(e)}") @@ -377,3 +400,63 @@ def _safe_get(self, obj: Any, key: str, default: Any = None) -> Any: if obj is None or not hasattr(obj, 'get'): return default return obj.get(key, default) + + def _flush_on_exit(self): + """ + Flush remaining events from internal log_queue before process exit. + Called automatically via atexit handler. + + This works in conjunction with GLOBAL_LOGGING_WORKER's atexit handler: + 1. GLOBAL_LOGGING_WORKER atexit invokes pending callbacks + 2. Callbacks add events to this logger's internal log_queue + 3. This atexit handler flushes the internal queue to PostHog + """ + if not self.log_queue: + return + + verbose_logger.debug( + f"PostHog: Flushing {len(self.log_queue)} remaining events on exit" + ) + + try: + # Group events by credentials (same logic as async_send_batch) + batches_by_credentials: Dict[Tuple[str, str], list] = {} + for item in self.log_queue: + key = (item["api_key"], item["api_url"]) + if key not in batches_by_credentials: + batches_by_credentials[key] = [] + batches_by_credentials[key].append(item["event"]) + + # Send each batch synchronously using sync_client + for (api_key, api_url), events in batches_by_credentials.items(): + headers = { + "Content-Type": "application/json", + } + + payload = self._create_posthog_payload(events, api_key) + capture_url = f"{api_url.rstrip('/')}/batch/" + + response = self.sync_client.post( + url=capture_url, + json=payload, + headers=headers, + ) + response.raise_for_status() + + if response.status_code != 200: + verbose_logger.error( + f"PostHog: Failed to flush on exit - status {response.status_code}" + ) + + if self.is_mock_mode: + verbose_logger.debug( + f"[POSTHOG MOCK] Successfully flushed {len(self.log_queue)} events on exit" + ) + else: + verbose_logger.debug( + f"PostHog: Successfully flushed {len(self.log_queue)} events on exit" + ) + self.log_queue.clear() + + except Exception as e: + verbose_logger.error(f"PostHog: Error flushing events on exit: {str(e)}") diff --git a/litellm/integrations/posthog_mock_client.py b/litellm/integrations/posthog_mock_client.py new file mode 100644 index 000000000000..b713587ed6f1 --- /dev/null +++ b/litellm/integrations/posthog_mock_client.py @@ -0,0 +1,30 @@ +""" +Mock httpx client for PostHog integration testing. + +This module intercepts PostHog API calls and returns successful mock responses, +allowing full code execution without making actual network calls. + +Usage: + Set POSTHOG_MOCK=true in environment variables or config to enable mock mode. 
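A minimal sketch of enabling mock mode (assumes should_use_posthog_mock(), generated by the factory below, simply reads the POSTHOG_MOCK environment variable, and that litellm.integrations.mock_client_factory from this change set is importable):

import os

os.environ["POSTHOG_MOCK"] = "true"          # hypothetical test setup
os.environ["POSTHOG_API_KEY"] = "phc_dummy"  # still required by PostHogLogger.__init__

from litellm.integrations.posthog_mock_client import should_use_posthog_mock

if should_use_posthog_mock():
    # PostHogLogger patches its HTTP clients in this mode, so no real PostHog API calls are made
    print("PostHog mock mode enabled")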
+""" + +from litellm.integrations.mock_client_factory import MockClientConfig, create_mock_client_factory + +# Create mock client using factory +_config = MockClientConfig( + name="POSTHOG", + env_var="POSTHOG_MOCK", + default_latency_ms=100, + default_status_code=200, + default_json_data={"status": "success"}, + url_matchers=[ + ".posthog.com", + "posthog.com", + "us.i.posthog.com", + "app.posthog.com", + ], + patch_async_handler=True, + patch_sync_client=True, +) + +create_mock_posthog_client, should_use_posthog_mock = create_mock_client_factory(_config) diff --git a/enterprise/litellm_enterprise/integrations/prometheus.py b/litellm/integrations/prometheus.py similarity index 75% rename from enterprise/litellm_enterprise/integrations/prometheus.py rename to litellm/integrations/prometheus.py index d3b599edff9c..2c897cb0692e 100644 --- a/enterprise/litellm_enterprise/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -14,48 +14,59 @@ Literal, Optional, Tuple, + Union, cast, ) import litellm from litellm._logging import print_verbose, verbose_logger from litellm.integrations.custom_logger import CustomLogger -from litellm.proxy._types import LiteLLM_TeamTable, UserAPIKeyAuth +from litellm.proxy._types import ( + LiteLLM_DeletedVerificationToken, + LiteLLM_TeamTable, + LiteLLM_UserTable, + UserAPIKeyAuth, +) from litellm.types.integrations.prometheus import * from litellm.types.integrations.prometheus import _sanitize_prometheus_label_name from litellm.types.utils import StandardLoggingPayload -from litellm.utils import get_end_user_id_for_cost_tracking if TYPE_CHECKING: from apscheduler.schedulers.asyncio import AsyncIOScheduler else: AsyncIOScheduler = Any +# Cached lazy import for get_end_user_id_for_cost_tracking +# Module-level cache to avoid repeated imports while preserving memory benefits +_get_end_user_id_for_cost_tracking = None + + +def _get_cached_end_user_id_for_cost_tracking(): + """ + Get cached get_end_user_id_for_cost_tracking function. + Lazy imports on first call to avoid loading utils.py at import time (60MB saved). + Subsequent calls use cached function for better performance. + """ + global _get_end_user_id_for_cost_tracking + if _get_end_user_id_for_cost_tracking is None: + from litellm.utils import get_end_user_id_for_cost_tracking + + _get_end_user_id_for_cost_tracking = get_end_user_id_for_cost_tracking + return _get_end_user_id_for_cost_tracking + class PrometheusLogger(CustomLogger): # Class variables or attributes - def __init__( + def __init__( # noqa: PLR0915 self, **kwargs, ): try: from prometheus_client import Counter, Gauge, Histogram - from litellm.proxy.proxy_server import CommonProxyErrors, premium_user - # Always initialize label_filters, even for non-premium users self.label_filters = self._parse_prometheus_config() - if premium_user is not True: - verbose_logger.warning( - f"🚨🚨🚨 Prometheus Metrics is on LiteLLM Enterprise\n🚨 {CommonProxyErrors.not_premium_user.value}" - ) - self.litellm_not_a_premium_user_metric = Counter( - name="litellm_not_a_premium_user_metric", - documentation=f"🚨🚨🚨 Prometheus Metrics is on LiteLLM Enterprise. 
🚨 {CommonProxyErrors.not_premium_user.value}", - ) - return - # Create metric factory functions self._counter_factory = self._create_metric_factory(Counter) self._gauge_factory = self._create_metric_factory(Gauge) @@ -187,6 +198,30 @@ def __init__( ), ) + # Remaining Budget for User + self.litellm_remaining_user_budget_metric = self._gauge_factory( + "litellm_remaining_user_budget_metric", + "Remaining budget for user", + labelnames=self.get_labels_for_metric( + "litellm_remaining_user_budget_metric" + ), + ) + + # Max Budget for User + self.litellm_user_max_budget_metric = self._gauge_factory( + "litellm_user_max_budget_metric", + "Maximum budget set for user", + labelnames=self.get_labels_for_metric("litellm_user_max_budget_metric"), + ) + + self.litellm_user_budget_remaining_hours_metric = self._gauge_factory( + "litellm_user_budget_remaining_hours_metric", + "Remaining hours for user budget to be reset", + labelnames=self.get_labels_for_metric( + "litellm_user_budget_remaining_hours_metric" + ), + ) + ######################################## # LiteLLM Virtual API KEY metrics ######################################## @@ -194,14 +229,18 @@ def __init__( self.litellm_remaining_api_key_requests_for_model = self._gauge_factory( "litellm_remaining_api_key_requests_for_model", "Remaining Requests API Key can make for model (model based rpm limit on key)", - labelnames=["hashed_api_key", "api_key_alias", "model"], + labelnames=self.get_labels_for_metric( + "litellm_remaining_api_key_requests_for_model" + ), ) # Remaining MODEL TPM limit for API Key self.litellm_remaining_api_key_tokens_for_model = self._gauge_factory( "litellm_remaining_api_key_tokens_for_model", "Remaining Tokens API Key can make for model (model based tpm limit on key)", - labelnames=["hashed_api_key", "api_key_alias", "model"], + labelnames=self.get_labels_for_metric( + "litellm_remaining_api_key_tokens_for_model" + ), ) ######################################## @@ -210,7 +249,7 @@ def __init__( # Remaining Rate Limit for model self.litellm_remaining_requests_metric = self._gauge_factory( - "litellm_remaining_requests", + "litellm_remaining_requests_metric", "LLM Deployment Analytics - remaining requests for model, returned from LLM API Provider", labelnames=self.get_labels_for_metric( "litellm_remaining_requests_metric" @@ -218,7 +257,7 @@ def __init__( ) self.litellm_remaining_tokens_metric = self._gauge_factory( - "litellm_remaining_tokens", + "litellm_remaining_tokens_metric", "remaining tokens for model, returned from LLM API Provider", labelnames=self.get_labels_for_metric( "litellm_remaining_tokens_metric" @@ -233,6 +272,36 @@ def __init__( ), buckets=LATENCY_BUCKETS, ) + + # Request queue time metric + self.litellm_request_queue_time_metric = self._histogram_factory( + "litellm_request_queue_time_seconds", + "Time spent in request queue before processing starts (seconds)", + labelnames=self.get_labels_for_metric( + "litellm_request_queue_time_seconds" + ), + buckets=LATENCY_BUCKETS, + ) + + # Guardrail metrics + self.litellm_guardrail_latency_metric = self._histogram_factory( + "litellm_guardrail_latency_seconds", + "Latency (seconds) for guardrail execution", + labelnames=["guardrail_name", "status", "error_type", "hook_type"], + buckets=LATENCY_BUCKETS, + ) + + self.litellm_guardrail_errors_total = self._counter_factory( + "litellm_guardrail_errors_total", + "Total number of errors encountered during guardrail execution", + labelnames=["guardrail_name", "error_type", "hook_type"], + ) + + 
self.litellm_guardrail_requests_total = self._counter_factory( + "litellm_guardrail_requests_total", + "Total number of guardrail invocations", + labelnames=["guardrail_name", "status", "hook_type"], + ) # llm api provider budget metrics self.litellm_provider_remaining_budget_metric = self._gauge_factory( "litellm_provider_remaining_budget_metric", @@ -247,6 +316,18 @@ def __init__( labelnames=self.get_labels_for_metric("litellm_deployment_state"), ) + self.litellm_deployment_tpm_limit = self._gauge_factory( + "litellm_deployment_tpm_limit", + "Deployment TPM limit found in config", + labelnames=self.get_labels_for_metric("litellm_deployment_tpm_limit"), + ) + + self.litellm_deployment_rpm_limit = self._gauge_factory( + "litellm_deployment_rpm_limit", + "Deployment RPM limit found in config", + labelnames=self.get_labels_for_metric("litellm_deployment_rpm_limit"), + ) + self.litellm_deployment_cooled_down = self._counter_factory( "litellm_deployment_cooled_down", "LLM Deployment Analytics - Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down", @@ -298,18 +379,19 @@ def __init__( self.get_labels_for_metric("litellm_deployment_failed_fallbacks"), ) + # Callback Logging Failure Metrics + self.litellm_callback_logging_failures_metric = self._counter_factory( + name="litellm_callback_logging_failures_metric", + documentation="Total number of failures when emitting logs to callbacks (e.g. s3_v2, langfuse, etc)", + labelnames=["callback_name"], + ) + self.litellm_llm_api_failed_requests_metric = self._counter_factory( name="litellm_llm_api_failed_requests_metric", documentation="deprecated - use litellm_proxy_failed_requests_metric", - labelnames=[ - "end_user", - "hashed_api_key", - "api_key_alias", - "model", - "team", - "team_alias", - "user", - ], + labelnames=self.get_labels_for_metric( + "litellm_llm_api_failed_requests_metric" + ), ) self.litellm_requests_metric = self._counter_factory( @@ -318,6 +400,38 @@ def __init__( labelnames=self.get_labels_for_metric("litellm_requests_metric"), ) + # Cache metrics + self.litellm_cache_hits_metric = self._counter_factory( + name="litellm_cache_hits_metric", + documentation="Total number of LiteLLM cache hits", + labelnames=self.get_labels_for_metric("litellm_cache_hits_metric"), + ) + + self.litellm_cache_misses_metric = self._counter_factory( + name="litellm_cache_misses_metric", + documentation="Total number of LiteLLM cache misses", + labelnames=self.get_labels_for_metric("litellm_cache_misses_metric"), + ) + + self.litellm_cached_tokens_metric = self._counter_factory( + name="litellm_cached_tokens_metric", + documentation="Total tokens served from LiteLLM cache", + labelnames=self.get_labels_for_metric("litellm_cached_tokens_metric"), + ) + + # User and Team count metrics + self.litellm_total_users_metric = self._gauge_factory( + "litellm_total_users", + "Total number of users in LiteLLM", + labelnames=[], + ) + + self.litellm_teams_count_metric = self._gauge_factory( + "litellm_teams_count", + "Total number of teams in LiteLLM", + labelnames=[], + ) + except Exception as e: print_verbose(f"Got exception on init prometheus client {str(e)}") raise e @@ -780,9 +894,16 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti f"standard_logging_object is required, got={standard_logging_payload}" ) + if self._should_skip_metrics_for_invalid_key( + kwargs=kwargs, 
standard_logging_payload=standard_logging_payload + ): + return + model = kwargs.get("model", "") litellm_params = kwargs.get("litellm_params", {}) or {} - _metadata = litellm_params.get("metadata", {}) + _metadata = litellm_params.get("metadata") or {} + get_end_user_id_for_cost_tracking = _get_cached_end_user_id_for_cost_tracking() + end_user_id = get_end_user_id_for_cost_tracking( litellm_params, service_type="prometheus" ) @@ -802,6 +923,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti user_api_key_auth_metadata: Optional[dict] = standard_logging_payload[ "metadata" ].get("user_api_key_auth_metadata") + combined_metadata: Dict[str, Any] = { **(_requester_metadata if _requester_metadata else {}), **(user_api_key_auth_metadata if user_api_key_auth_metadata else {}), @@ -842,6 +964,8 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti route=standard_logging_payload["metadata"].get( "user_api_key_request_route" ), + client_ip=standard_logging_payload["metadata"].get("requester_ip_address"), + user_agent=standard_logging_payload["metadata"].get("user_agent"), ) if ( @@ -890,6 +1014,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti user_api_key_alias=user_api_key_alias, litellm_params=litellm_params, response_cost=response_cost, + user_id=user_id, ) # set proxy virtual key rpm/tpm metrics @@ -898,6 +1023,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti user_api_key_alias=user_api_key_alias, kwargs=kwargs, metadata=_metadata, + model_id=enum_values.model_id, ) # set latency metrics @@ -919,6 +1045,12 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti kwargs, start_time, end_time, enum_values, output_tokens ) + # cache metrics + self._increment_cache_metrics( + standard_logging_payload=standard_logging_payload, # type: ignore + enum_values=enum_values, + ) + if ( standard_logging_payload["stream"] is True ): # log successful streaming requests from logging event hook. @@ -988,6 +1120,54 @@ def _increment_token_metrics( standard_logging_payload["completion_tokens"] ) + def _increment_cache_metrics( + self, + standard_logging_payload: StandardLoggingPayload, + enum_values: UserAPIKeyLabelValues, + ): + """ + Increment cache-related Prometheus metrics based on cache hit/miss status. 
+ + Args: + standard_logging_payload: Contains cache_hit field (True/False/None) + enum_values: Label values for Prometheus metrics + """ + cache_hit = standard_logging_payload.get("cache_hit") + + # Only track if cache_hit has a definite value (True or False) + if cache_hit is None: + return + + if cache_hit is True: + # Increment cache hits counter + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_cache_hits_metric" + ), + enum_values=enum_values, + ) + self.litellm_cache_hits_metric.labels(**_labels).inc() + + # Increment cached tokens counter + total_tokens = standard_logging_payload.get("total_tokens", 0) + if total_tokens > 0: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_cached_tokens_metric" + ), + enum_values=enum_values, + ) + self.litellm_cached_tokens_metric.labels(**_labels).inc(total_tokens) + else: + # cache_hit is False - increment cache misses counter + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_cache_misses_metric" + ), + enum_values=enum_values, + ) + self.litellm_cache_misses_metric.labels(**_labels).inc() + async def _increment_remaining_budget_metrics( self, user_api_team: Optional[str], @@ -996,20 +1176,18 @@ async def _increment_remaining_budget_metrics( user_api_key_alias: Optional[str], litellm_params: dict, response_cost: float, + user_id: Optional[str] = None, ): - _team_spend = litellm_params.get("metadata", {}).get( - "user_api_key_team_spend", None - ) - _team_max_budget = litellm_params.get("metadata", {}).get( - "user_api_key_team_max_budget", None - ) + _metadata = litellm_params.get("metadata") or {} + _team_spend = _metadata.get("user_api_key_team_spend", None) + _team_max_budget = _metadata.get("user_api_key_team_max_budget", None) + + _api_key_spend = _metadata.get("user_api_key_spend", None) + _api_key_max_budget = _metadata.get("user_api_key_max_budget", None) + + _user_spend = _metadata.get("user_api_key_user_spend", None) + _user_max_budget = _metadata.get("user_api_key_user_max_budget", None) - _api_key_spend = litellm_params.get("metadata", {}).get( - "user_api_key_spend", None - ) - _api_key_max_budget = litellm_params.get("metadata", {}).get( - "user_api_key_max_budget", None - ) await self._set_api_key_budget_metrics_after_api_request( user_api_key=user_api_key, user_api_key_alias=user_api_key_alias, @@ -1026,6 +1204,13 @@ async def _increment_remaining_budget_metrics( response_cost=response_cost, ) + await self._set_user_budget_metrics_after_api_request( + user_id=user_id, + user_spend=_user_spend, + user_max_budget=_user_max_budget, + response_cost=response_cost, + ) + def _increment_top_level_request_and_spend_metrics( self, end_user_id: Optional[str], @@ -1062,6 +1247,7 @@ def _set_virtual_key_rate_limit_metrics( user_api_key_alias: Optional[str], kwargs: dict, metadata: dict, + model_id: Optional[str] = None, ): from litellm.proxy.common_utils.callback_utils import ( get_model_group_from_litellm_kwargs, @@ -1083,11 +1269,11 @@ def _set_virtual_key_rate_limit_metrics( ) self.litellm_remaining_api_key_requests_for_model.labels( - user_api_key, user_api_key_alias, model_group + user_api_key, user_api_key_alias, model_group, model_id ).set(remaining_requests) self.litellm_remaining_api_key_tokens_for_model.labels( - user_api_key, user_api_key_alias, model_group + user_api_key, user_api_key_alias, model_group, model_id ).set(remaining_tokens) def 
_set_latency_metrics( @@ -1113,12 +1299,14 @@ def _set_latency_metrics( time_to_first_token_seconds is not None and kwargs.get("stream", False) is True # only emit for streaming requests ): + _ttft_labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_llm_api_time_to_first_token_metric" + ), + enum_values=enum_values, + ) self.litellm_llm_api_time_to_first_token_metric.labels( - model, - user_api_key, - user_api_key_alias, - user_api_team, - user_api_team_alias, + **_ttft_labels ).observe(time_to_first_token_seconds) else: verbose_logger.debug( @@ -1156,6 +1344,22 @@ def _set_latency_metrics( total_time_seconds ) + # request queue time (time from arrival to processing start) + _litellm_params = kwargs.get("litellm_params", {}) or {} + queue_time_seconds = (_litellm_params.get("metadata") or {}).get( + "queue_time_seconds" + ) + if queue_time_seconds is not None and queue_time_seconds >= 0: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_request_queue_time_seconds" + ), + enum_values=enum_values, + ) + self.litellm_request_queue_time_metric.labels(**_labels).observe( + queue_time_seconds + ) + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): from litellm.types.utils import StandardLoggingPayload @@ -1163,12 +1367,20 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti f"prometheus Logging - Enters failure logging function for kwargs {kwargs}" ) - # unpack kwargs - model = kwargs.get("model", "") standard_logging_payload: StandardLoggingPayload = kwargs.get( "standard_logging_object", {} ) + + if self._should_skip_metrics_for_invalid_key( + kwargs=kwargs, standard_logging_payload=standard_logging_payload + ): + return + + model = kwargs.get("model", "") + litellm_params = kwargs.get("litellm_params", {}) or {} + get_end_user_id_for_cost_tracking = _get_cached_end_user_id_for_cost_tracking() + end_user_id = get_end_user_id_for_cost_tracking( litellm_params, service_type="prometheus" ) @@ -1179,7 +1391,6 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti user_api_team_alias = standard_logging_payload["metadata"][ "user_api_key_team_alias" ] - kwargs.get("exception", None) try: self.litellm_llm_api_failed_requests_metric.labels( @@ -1190,6 +1401,7 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti user_api_team, user_api_team_alias, user_id, + standard_logging_payload.get("model_id", ""), ).inc() self.set_llm_deployment_failure_metrics(kwargs) except Exception as e: @@ -1199,6 +1411,147 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti pass pass + def _extract_status_code( + self, + kwargs: Optional[dict] = None, + enum_values: Optional[Any] = None, + exception: Optional[Exception] = None, + ) -> Optional[int]: + """ + Extract HTTP status code from various input formats for validation. + + This is a centralized helper to extract status code from different + callback function signatures. Handles both ProxyException (uses 'code') + and standard exceptions (uses 'status_code'). 
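The fallback order described above can be exercised in isolation; a standalone sketch with made-up exception classes (not LiteLLM types), mirroring the getattr chain and int coercion used in _extract_status_code:

class FakeProxyException(Exception):
    """Stand-in for a ProxyException-style error that carries 'code'."""
    def __init__(self, code):
        super().__init__("proxy error")
        self.code = code

class FakeAPIError(Exception):
    """Stand-in for a standard error that carries 'status_code'."""
    def __init__(self, status_code):
        super().__init__("api error")
        self.status_code = status_code

def pick_status(exc):
    # try status_code first, then code, then coerce to int
    raw = getattr(exc, "status_code", None) or getattr(exc, "code", None)
    try:
        return int(raw) if raw is not None else None
    except (ValueError, TypeError):
        return None

assert pick_status(FakeAPIError(429)) == 429
assert pick_status(FakeProxyException("401")) == 401   # ProxyException-style 'code'
assert pick_status(Exception("boom")) is None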
+ + Args: + kwargs: Dictionary potentially containing 'exception' key + enum_values: Object with 'status_code' attribute + exception: Exception object to extract status code from directly + + Returns: + Status code as integer if found, None otherwise + """ + status_code = None + + # Try from enum_values first (most common in our callbacks) + if ( + enum_values + and hasattr(enum_values, "status_code") + and enum_values.status_code + ): + try: + status_code = int(enum_values.status_code) + except (ValueError, TypeError): + pass + + if not status_code and exception: + # ProxyException uses 'code' attribute, other exceptions may use 'status_code' + status_code = getattr(exception, "status_code", None) or getattr( + exception, "code", None + ) + if status_code is not None: + try: + status_code = int(status_code) + except (ValueError, TypeError): + status_code = None + + if not status_code and kwargs: + exception_in_kwargs = kwargs.get("exception") + if exception_in_kwargs: + status_code = getattr( + exception_in_kwargs, "status_code", None + ) or getattr(exception_in_kwargs, "code", None) + if status_code is not None: + try: + status_code = int(status_code) + except (ValueError, TypeError): + status_code = None + + return status_code + + def _is_invalid_api_key_request( + self, + status_code: Optional[int], + exception: Optional[Exception] = None, + ) -> bool: + """ + Determine if a request has an invalid API key based on status code and exception. + + This method prevents invalid authentication attempts from being recorded in + Prometheus metrics. A 401 status code is the definitive indicator of authentication + failure. Additionally, we check exception messages for authentication error patterns + to catch cases where the exception hasn't been converted to a ProxyException yet. + + Args: + status_code: HTTP status code (401 indicates authentication error) + exception: Exception object to check for auth-related error messages + + Returns: + True if the request has an invalid API key and metrics should be skipped, + False otherwise + """ + if status_code == 401: + return True + + # Handle cases where AssertionError is raised before conversion to ProxyException + if exception is not None: + exception_str = str(exception).lower() + auth_error_patterns = [ + "virtual key expected", + "expected to start with 'sk-'", + "authentication error", + "invalid api key", + "api key not valid", + ] + if any(pattern in exception_str for pattern in auth_error_patterns): + return True + + return False + + def _should_skip_metrics_for_invalid_key( + self, + kwargs: Optional[dict] = None, + user_api_key_dict: Optional[Any] = None, + enum_values: Optional[Any] = None, + standard_logging_payload: Optional[Union[dict, StandardLoggingPayload]] = None, + exception: Optional[Exception] = None, + ) -> bool: + """ + Determine if Prometheus metrics should be skipped for invalid API key requests. + + This is a centralized validation method that extracts status code and exception + information from various callback function signatures and determines if the request + represents an invalid API key attempt that should be filtered from metrics. 
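The string matching used by _is_invalid_api_key_request above can be checked on its own; an illustrative re-implementation (patterns copied from that method, helper name hypothetical):

AUTH_ERROR_PATTERNS = [
    "virtual key expected",
    "expected to start with 'sk-'",
    "authentication error",
    "invalid api key",
    "api key not valid",
]

def looks_like_auth_error(exc: Exception) -> bool:
    # case-insensitive substring match against the exception message
    text = str(exc).lower()
    return any(pattern in text for pattern in AUTH_ERROR_PATTERNS)

assert looks_like_auth_error(AssertionError("Expected to start with 'sk-'"))
assert not looks_like_auth_error(Exception("Rate limit exceeded"))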
+ + Args: + kwargs: Dictionary potentially containing exception and other data + user_api_key_dict: User API key authentication object (currently unused) + enum_values: Object with status_code attribute + standard_logging_payload: Standard logging payload dictionary + exception: Exception object to check directly + + Returns: + True if metrics should be skipped (invalid key detected), False otherwise + """ + status_code = self._extract_status_code( + kwargs=kwargs, + enum_values=enum_values, + exception=exception, + ) + + if exception is None and kwargs: + exception = kwargs.get("exception") + + if self._is_invalid_api_key_request(status_code, exception=exception): + verbose_logger.debug( + "Skipping Prometheus metrics for invalid API key request: " + f"status_code={status_code}, exception={type(exception).__name__ if exception else None}" + ) + return True + + return False + async def async_post_call_failure_hook( self, request_data: dict, @@ -1224,11 +1577,23 @@ async def async_post_call_failure_hook( StandardLoggingPayloadSetup, ) + if self._should_skip_metrics_for_invalid_key( + user_api_key_dict=user_api_key_dict, + exception=original_exception, + ): + return + + status_code = self._extract_status_code(exception=original_exception) + try: _tags = StandardLoggingPayloadSetup._get_request_tags( litellm_params=request_data, proxy_server_request=request_data.get("proxy_server_request", {}), ) + _metadata = request_data.get("metadata", {}) or {} + model_id = _metadata.get("model_info", {}).get("id") or request_data.get( + "model_info", {} + ).get("id") enum_values = UserAPIKeyLabelValues( end_user=user_api_key_dict.end_user_id, user=user_api_key_dict.user_id, @@ -1238,11 +1603,14 @@ async def async_post_call_failure_hook( team=user_api_key_dict.team_id, team_alias=user_api_key_dict.team_alias, requested_model=request_data.get("model", ""), - status_code=str(getattr(original_exception, "status_code", None)), - exception_status=str(getattr(original_exception, "status_code", None)), + status_code=str(status_code), + exception_status=str(status_code), exception_class=self._get_exception_class_name(original_exception), tags=_tags, route=user_api_key_dict.request_route, + client_ip=_metadata.get("requester_ip_address"), + user_agent=_metadata.get("user_agent"), + model_id=model_id, ) _labels = prometheus_label_factory( supported_enum_labels=self.get_labels_for_metric( @@ -1277,6 +1645,12 @@ async def async_post_call_success_hook( StandardLoggingPayloadSetup, ) + if self._should_skip_metrics_for_invalid_key( + user_api_key_dict=user_api_key_dict + ): + return + + _metadata = data.get("metadata", {}) or {} enum_values = UserAPIKeyLabelValues( end_user=user_api_key_dict.end_user_id, hashed_api_key=user_api_key_dict.api_key, @@ -1292,6 +1666,8 @@ async def async_post_call_success_hook( litellm_params=data, proxy_server_request=data.get("proxy_server_request", {}), ), + client_ip=_metadata.get("requester_ip_address"), + user_agent=_metadata.get("user_agent"), ) _labels = prometheus_label_factory( supported_enum_labels=self.get_labels_for_metric( @@ -1333,6 +1709,15 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): llm_provider = _litellm_params.get("custom_llm_provider", None) + if self._should_skip_metrics_for_invalid_key( + kwargs=request_kwargs, + standard_logging_payload=standard_logging_payload, + ): + return + hashed_api_key = standard_logging_payload.get("metadata", {}).get( + "user_api_key_hash" + ) + # Create enum_values for the label factory (always create for use in 
different metrics) enum_values = UserAPIKeyLabelValues( litellm_model_name=litellm_model_name, @@ -1346,9 +1731,7 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): self._get_exception_class_name(exception) if exception else None ), requested_model=model_group, - hashed_api_key=standard_logging_payload["metadata"][ - "user_api_key_hash" - ], + hashed_api_key=hashed_api_key, api_key_alias=standard_logging_payload["metadata"][ "user_api_key_alias" ], @@ -1357,6 +1740,10 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): "user_api_key_team_alias" ], tags=standard_logging_payload.get("request_tags", []), + client_ip=standard_logging_payload["metadata"].get( + "requester_ip_address" + ), + user_agent=standard_logging_payload["metadata"].get("user_agent"), ) """ @@ -1370,7 +1757,6 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): api_provider=llm_provider or "", ) if exception is not None: - _labels = prometheus_label_factory( supported_enum_labels=self.get_labels_for_metric( metric_name="litellm_deployment_failure_responses" @@ -1395,6 +1781,49 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): ) ) + def _set_deployment_tpm_rpm_limit_metrics( + self, + model_info: dict, + litellm_params: dict, + litellm_model_name: Optional[str], + model_id: Optional[str], + api_base: Optional[str], + llm_provider: Optional[str], + ): + """ + Set the deployment TPM and RPM limits metrics + """ + tpm = model_info.get("tpm") or litellm_params.get("tpm") + rpm = model_info.get("rpm") or litellm_params.get("rpm") + + if tpm is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_deployment_tpm_limit" + ), + enum_values=UserAPIKeyLabelValues( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ), + ) + self.litellm_deployment_tpm_limit.labels(**_labels).set(tpm) + + if rpm is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_deployment_rpm_limit" + ), + enum_values=UserAPIKeyLabelValues( + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + api_provider=llm_provider, + ), + ) + self.litellm_deployment_rpm_limit.labels(**_labels).set(rpm) + def set_llm_deployment_success_metrics( self, request_kwargs: dict, @@ -1403,16 +1832,23 @@ def set_llm_deployment_success_metrics( enum_values: UserAPIKeyLabelValues, output_tokens: float = 1.0, ): - try: verbose_logger.debug("setting remaining tokens requests metric") - standard_logging_payload: Optional[StandardLoggingPayload] = ( - request_kwargs.get("standard_logging_object") - ) + standard_logging_payload: Optional[ + StandardLoggingPayload + ] = request_kwargs.get("standard_logging_object") if standard_logging_payload is None: return + # Skip recording metrics for invalid API key requests + if self._should_skip_metrics_for_invalid_key( + kwargs=request_kwargs, + enum_values=enum_values, + standard_logging_payload=standard_logging_payload, + ): + return + api_base = standard_logging_payload["api_base"] _litellm_params = request_kwargs.get("litellm_params", {}) or {} _metadata = _litellm_params.get("metadata", {}) @@ -1421,6 +1857,16 @@ def set_llm_deployment_success_metrics( _model_info = _metadata.get("model_info") or {} model_id = _model_info.get("id", None) + if _model_info or _litellm_params: + self._set_deployment_tpm_rpm_limit_metrics( + 
model_info=_model_info, + litellm_params=_litellm_params, + litellm_model_name=litellm_model_name, + model_id=model_id, + api_base=api_base, + llm_provider=llm_provider, + ) + remaining_requests: Optional[int] = None remaining_tokens: Optional[int] = None if additional_headers := standard_logging_payload["hidden_params"][ @@ -1543,6 +1989,50 @@ def set_llm_deployment_success_metrics( ) return + def _record_guardrail_metrics( + self, + guardrail_name: str, + latency_seconds: float, + status: str, + error_type: Optional[str], + hook_type: str, + ): + """ + Record guardrail metrics for prometheus. + + Args: + guardrail_name: Name of the guardrail + latency_seconds: Execution latency in seconds + status: "success" or "error" + error_type: Type of error if any, None otherwise + hook_type: "pre_call", "during_call", or "post_call" + """ + try: + # Record latency + self.litellm_guardrail_latency_metric.labels( + guardrail_name=guardrail_name, + status=status, + error_type=error_type or "none", + hook_type=hook_type, + ).observe(latency_seconds) + + # Record request count + self.litellm_guardrail_requests_total.labels( + guardrail_name=guardrail_name, + status=status, + hook_type=hook_type, + ).inc() + + # Record error count if there was an error + if status == "error" and error_type: + self.litellm_guardrail_errors_total.labels( + guardrail_name=guardrail_name, + error_type=error_type, + hook_type=hook_type, + ).inc() + except Exception as e: + verbose_logger.debug(f"Error recording guardrail metrics: {str(e)}") + @staticmethod def _get_exception_class_name(exception: Exception) -> str: exception_class_name = "" @@ -1723,6 +2213,17 @@ def increment_deployment_cooled_down( litellm_model_name, model_id, api_base, api_provider, exception_status ).inc() + def increment_callback_logging_failure( + self, + callback_name: str, + ): + """ + Increment metric when logging to a callback fails (e.g., s3_v2, langfuse, etc.) + """ + self.litellm_callback_logging_failures_metric.labels( + callback_name=callback_name + ).inc() + def track_provider_remaining_budget( self, provider: str, spend: float, budget_limit: float ): @@ -1751,7 +2252,7 @@ async def _initialize_budget_metrics( self, data_fetch_function: Callable[..., Awaitable[Tuple[List[Any], Optional[int]]]], set_metrics_function: Callable[[List[Any]], Awaitable[None]], - data_type: Literal["teams", "keys"], + data_type: Literal["teams", "keys", "users"], ): """ Generic method to initialize budget metrics for teams or API keys. @@ -1843,7 +2344,10 @@ async def _initialize_api_key_budget_metrics(self): async def fetch_keys( page_size: int, page: int - ) -> Tuple[List[Union[str, UserAPIKeyAuth]], Optional[int]]: + ) -> Tuple[ + List[Union[str, UserAPIKeyAuth, LiteLLM_DeletedVerificationToken]], + Optional[int], + ]: key_list_response = await _list_key_helper( prisma_client=prisma_client, page=page, @@ -1868,6 +2372,37 @@ async def fetch_keys( data_type="keys", ) + async def _initialize_user_budget_metrics(self): + """ + Initialize user budget metrics by reusing the generic pagination logic. 
+ """ + from litellm.proxy._types import LiteLLM_UserTable + from litellm.proxy.proxy_server import prisma_client + + if prisma_client is None: + verbose_logger.debug( + "Prometheus: skipping user metrics initialization, DB not initialized" + ) + return + + async def fetch_users( + page_size: int, page: int + ) -> Tuple[List[LiteLLM_UserTable], Optional[int]]: + skip = (page - 1) * page_size + users = await prisma_client.db.litellm_usertable.find_many( + skip=skip, + take=page_size, + order={"created_at": "desc"}, + ) + total_count = await prisma_client.db.litellm_usertable.count() + return users, total_count + + await self._initialize_budget_metrics( + data_fetch_function=fetch_users, + set_metrics_function=self._set_user_list_budget_metrics, + data_type="users", + ) + async def initialize_remaining_budget_metrics(self): """ Handler for initializing remaining budget metrics for all teams to avoid metric discrepancies. @@ -1900,11 +2435,48 @@ async def initialize_remaining_budget_metrics(self): async def _initialize_remaining_budget_metrics(self): """ - Helper to initialize remaining budget metrics for all teams and API keys. + Helper to initialize remaining budget metrics for all teams, API keys, and users. """ - verbose_logger.debug("Emitting key, team budget metrics....") + verbose_logger.debug("Emitting key, team, user budget metrics....") await self._initialize_team_budget_metrics() await self._initialize_api_key_budget_metrics() + await self._initialize_user_budget_metrics() + await self._initialize_user_and_team_count_metrics() + + async def _initialize_user_and_team_count_metrics(self): + """ + Initialize user and team count metrics by querying the database. + + Updates: + - litellm_total_users: Total count of users in the database + - litellm_teams_count: Total count of teams in the database + """ + from litellm.proxy.proxy_server import prisma_client + + if prisma_client is None: + verbose_logger.debug( + "Prometheus: skipping user/team count metrics initialization, DB not initialized" + ) + return + + try: + # Get total user count + total_users = await prisma_client.db.litellm_usertable.count() + self.litellm_total_users_metric.set(total_users) + verbose_logger.debug( + f"Prometheus: set litellm_total_users to {total_users}" + ) + + # Get total team count + total_teams = await prisma_client.db.litellm_teamtable.count() + self.litellm_teams_count_metric.set(total_teams) + verbose_logger.debug( + f"Prometheus: set litellm_teams_count to {total_teams}" + ) + except Exception as e: + verbose_logger.exception( + f"Error initializing user/team count metrics: {str(e)}" + ) async def _set_key_list_budget_metrics( self, keys: List[Union[str, UserAPIKeyAuth]] @@ -1919,12 +2491,17 @@ async def _set_team_list_budget_metrics(self, teams: List[LiteLLM_TeamTable]): for team in teams: self._set_team_budget_metrics(team) + async def _set_user_list_budget_metrics(self, users: List[LiteLLM_UserTable]): + """Helper function to set budget metrics for a list of users""" + for user in users: + self._set_user_budget_metrics(user) + async def _set_team_budget_metrics_after_api_request( self, user_api_team: Optional[str], user_api_team_alias: Optional[str], - team_spend: float, - team_max_budget: float, + team_spend: Optional[float], + team_max_budget: Optional[float], response_cost: float, ): """ @@ -2086,7 +2663,7 @@ async def _set_api_key_budget_metrics_after_api_request( user_api_key: Optional[str], user_api_key_alias: Optional[str], response_cost: float, - key_max_budget: float, + key_max_budget: 
Optional[float], key_spend: Optional[float], ): if user_api_key: @@ -2103,7 +2680,7 @@ async def _assemble_key_object( self, user_api_key: str, user_api_key_alias: str, - key_max_budget: float, + key_max_budget: Optional[float], key_spend: Optional[float], response_cost: float, ) -> UserAPIKeyAuth: @@ -2136,6 +2713,122 @@ async def _assemble_key_object( return user_api_key_dict + async def _set_user_budget_metrics_after_api_request( + self, + user_id: Optional[str], + user_spend: Optional[float], + user_max_budget: Optional[float], + response_cost: float, + ): + """ + Set user budget metrics after an LLM API request + + - Assemble a LiteLLM_UserTable object + - looks up user info from db if not available in metadata + - Set user budget metrics + """ + if user_id: + user_object = await self._assemble_user_object( + user_id=user_id, + spend=user_spend, + max_budget=user_max_budget, + response_cost=response_cost, + ) + + self._set_user_budget_metrics(user_object) + + async def _assemble_user_object( + self, + user_id: str, + spend: Optional[float], + max_budget: Optional[float], + response_cost: float, + ) -> LiteLLM_UserTable: + """ + Assemble a LiteLLM_UserTable object + + for fields not available in metadata, we fetch from db + Fields not available in metadata: + - `budget_reset_at` + """ + from litellm.proxy.auth.auth_checks import get_user_object + from litellm.proxy.proxy_server import prisma_client, user_api_key_cache + + _total_user_spend = (spend or 0) + response_cost + user_object = LiteLLM_UserTable( + user_id=user_id, + spend=_total_user_spend, + max_budget=max_budget, + ) + try: + user_info = await get_user_object( + user_id=user_id, + prisma_client=prisma_client, + user_api_key_cache=user_api_key_cache, + user_id_upsert=False, + check_db_only=True, + ) + except Exception as e: + verbose_logger.debug( + f"[Non-Blocking] Prometheus: Error getting user info: {str(e)}" + ) + return user_object + + if user_info: + user_object.budget_reset_at = user_info.budget_reset_at + + return user_object + + def _set_user_budget_metrics( + self, + user: LiteLLM_UserTable, + ): + """ + Set user budget metrics for a single user + + - Remaining Budget + - Max Budget + - Budget Reset At + """ + enum_values = UserAPIKeyLabelValues( + user=user.user_id, + ) + + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_remaining_user_budget_metric" + ), + enum_values=enum_values, + ) + self.litellm_remaining_user_budget_metric.labels(**_labels).set( + self._safe_get_remaining_budget( + max_budget=user.max_budget, + spend=user.spend, + ) + ) + + if user.max_budget is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_user_max_budget_metric" + ), + enum_values=enum_values, + ) + self.litellm_user_max_budget_metric.labels(**_labels).set(user.max_budget) + + if user.budget_reset_at is not None: + _labels = prometheus_label_factory( + supported_enum_labels=self.get_labels_for_metric( + metric_name="litellm_user_budget_remaining_hours_metric" + ), + enum_values=enum_values, + ) + self.litellm_user_budget_remaining_hours_metric.labels(**_labels).set( + self._get_remaining_hours_for_budget_reset( + budget_reset_at=user.budget_reset_at + ) + ) + def _get_remaining_hours_for_budget_reset(self, budget_reset_at: datetime) -> float: """ Get remaining hours for budget reset @@ -2166,16 +2859,13 @@ def initialize_budget_metrics_cron_job(scheduler: AsyncIOScheduler): It emits the current 
remaining budget metrics for all Keys and Teams. """ - from enterprise.litellm_enterprise.integrations.prometheus import ( - PrometheusLogger, - ) from litellm.constants import PROMETHEUS_BUDGET_METRICS_REFRESH_INTERVAL_MINUTES from litellm.integrations.custom_logger import CustomLogger - prometheus_loggers: List[CustomLogger] = ( - litellm.logging_callback_manager.get_custom_loggers_for_type( - callback_type=PrometheusLogger - ) + prometheus_loggers: List[ + CustomLogger + ] = litellm.logging_callback_manager.get_custom_loggers_for_type( + callback_type=PrometheusLogger ) # we need to get the initialized prometheus logger instance(s) and call logger.initialize_remaining_budget_metrics() on them verbose_logger.debug("found %s prometheus loggers", len(prometheus_loggers)) @@ -2195,26 +2885,19 @@ def initialize_budget_metrics_cron_job(scheduler: AsyncIOScheduler): ) @staticmethod - def _mount_metrics_endpoint(premium_user: bool): + def _mount_metrics_endpoint(): """ Mount the Prometheus metrics endpoint with optional authentication. Args: - premium_user (bool): Whether the user is a premium user require_auth (bool, optional): Whether to require authentication for the metrics endpoint. Defaults to False. """ from prometheus_client import make_asgi_app from litellm._logging import verbose_proxy_logger - from litellm.proxy._types import CommonProxyErrors from litellm.proxy.proxy_server import app - if premium_user is not True: - verbose_proxy_logger.warning( - f"Prometheus metrics are only available for premium users. {CommonProxyErrors.not_premium_user.value}" - ) - # Create metrics ASGI app if "PROMETHEUS_MULTIPROC_DIR" in os.environ: from prometheus_client import CollectorRegistry, multiprocess @@ -2253,6 +2936,8 @@ def prometheus_label_factory( } if UserAPIKeyLabelNames.END_USER.value in filtered_labels: + get_end_user_id_for_cost_tracking = _get_cached_end_user_id_for_cost_tracking() + filtered_labels["end_user"] = get_end_user_id_for_cost_tracking( litellm_params={"user_api_key_end_user_id": enum_values.end_user}, service_type="prometheus", diff --git a/litellm/integrations/prometheus_services.py b/litellm/integrations/prometheus_services.py index a5f2f0b5c723..55ce758ece6b 100644 --- a/litellm/integrations/prometheus_services.py +++ b/litellm/integrations/prometheus_services.py @@ -105,6 +105,11 @@ def _get_service_metrics_initialize( return metrics def is_metric_registered(self, metric_name) -> bool: + # Use _names_to_collectors (O(1)) instead of REGISTRY.collect() (O(n)) to avoid + # perf regression when a new Router is created per request (e.g. router_settings in DB). 
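The O(1) lookup relies on prometheus_client's private CollectorRegistry._names_to_collectors mapping (hence the getattr fallback below); a quick standalone check with a hypothetical metric name:

from prometheus_client import REGISTRY, Counter

Counter("litellm_demo_requests", "illustrative counter")   # hypothetical metric, registers with REGISTRY
names = getattr(REGISTRY, "_names_to_collectors", {})
print("litellm_demo_requests" in names)                    # True: dict membership, no full collect() pass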
+ names_to_collectors = getattr(self.REGISTRY, "_names_to_collectors", None) + if names_to_collectors is not None: + return metric_name in names_to_collectors for metric in self.REGISTRY.collect(): if metric_name == metric.name: return True diff --git a/litellm/integrations/prompt_management_base.py b/litellm/integrations/prompt_management_base.py index 7754ca435caf..b32f78c0deaf 100644 --- a/litellm/integrations/prompt_management_base.py +++ b/litellm/integrations/prompt_management_base.py @@ -1,14 +1,18 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple -from typing_extensions import TypedDict +from typing_extensions import TYPE_CHECKING, TypedDict from litellm.types.llms.openai import AllMessageValues +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.utils import StandardCallbackDynamicParams +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj + class PromptManagementClient(TypedDict): - prompt_id: str + prompt_id: Optional[str] prompt_template: List[AllMessageValues] prompt_template_model: Optional[str] prompt_template_optional_params: Optional[Dict[str, Any]] @@ -24,7 +28,8 @@ def integration_name(self) -> str: @abstractmethod def should_run_prompt_management( self, - prompt_id: str, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], dynamic_callback_params: StandardCallbackDynamicParams, ) -> bool: pass @@ -32,7 +37,8 @@ def should_run_prompt_management( @abstractmethod def _compile_prompt_helper( self, - prompt_id: str, + prompt_id: Optional[str], + prompt_spec: Optional[PromptSpec], prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, prompt_label: Optional[str] = None, @@ -40,6 +46,18 @@ def _compile_prompt_helper( ) -> PromptManagementClient: pass + @abstractmethod + async def async_compile_prompt_helper( + self, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + pass + def merge_messages( self, prompt_template: List[AllMessageValues], @@ -55,10 +73,41 @@ def compile_prompt( dynamic_callback_params: StandardCallbackDynamicParams, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, + prompt_spec: Optional[PromptSpec] = None, ) -> PromptManagementClient: compiled_prompt_client = self._compile_prompt_helper( prompt_id=prompt_id, + prompt_spec=prompt_spec, + prompt_variables=prompt_variables, + dynamic_callback_params=dynamic_callback_params, + prompt_label=prompt_label, + prompt_version=prompt_version, + ) + + try: + messages = compiled_prompt_client["prompt_template"] + client_messages + except Exception as e: + raise ValueError( + f"Error compiling prompt: {e}. 
Prompt id={prompt_id}, prompt_variables={prompt_variables}, client_messages={client_messages}, dynamic_callback_params={dynamic_callback_params}" + ) + + compiled_prompt_client["completed_messages"] = messages + return compiled_prompt_client + + async def async_compile_prompt( + self, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + client_messages: List[AllMessageValues], + dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ) -> PromptManagementClient: + compiled_prompt_client = await self.async_compile_prompt_helper( + prompt_id=prompt_id, + prompt_spec=prompt_spec, prompt_variables=prompt_variables, dynamic_callback_params=dynamic_callback_params, prompt_label=prompt_label, @@ -83,6 +132,39 @@ def _get_model_from_prompt( else: return model.replace("{}/".format(self.integration_name), "") + def post_compile_prompt_processing( + self, + prompt_template: PromptManagementClient, + messages: List[AllMessageValues], + non_default_params: dict, + model: str, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ): + completed_messages = prompt_template["completed_messages"] or messages + + prompt_template_optional_params = ( + prompt_template["prompt_template_optional_params"] or {} + ) + + updated_non_default_params = { + **non_default_params, + **( + prompt_template_optional_params + if not ignore_prompt_manager_optional_params + else {} + ), + } + + if not ignore_prompt_manager_model: + model = self._get_model_from_prompt( + prompt_management_client=prompt_template, model=model + ) + else: + model = model + + return model, completed_messages, updated_non_default_params + def get_chat_completion_prompt( self, model: str, @@ -91,14 +173,19 @@ def get_chat_completion_prompt( prompt_id: Optional[str], prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, + prompt_spec: Optional[PromptSpec] = None, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, ) -> Tuple[str, List[AllMessageValues], dict]: if prompt_id is None: raise ValueError("prompt_id is required for Prompt Management Base class") if not self.should_run_prompt_management( - prompt_id=prompt_id, dynamic_callback_params=dynamic_callback_params + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=dynamic_callback_params, ): return model, messages, non_default_params @@ -111,19 +198,53 @@ def get_chat_completion_prompt( prompt_version=prompt_version, ) - completed_messages = prompt_template["completed_messages"] or messages - - prompt_template_optional_params = ( - prompt_template["prompt_template_optional_params"] or {} + return self.post_compile_prompt_processing( + prompt_template=prompt_template, + messages=messages, + non_default_params=non_default_params, + model=model, + ignore_prompt_manager_model=ignore_prompt_manager_model, + ignore_prompt_manager_optional_params=ignore_prompt_manager_optional_params, ) - updated_non_default_params = { - **non_default_params, - **prompt_template_optional_params, - } + async def async_get_chat_completion_prompt( + self, + model: str, + messages: List[AllMessageValues], + non_default_params: dict, + prompt_id: Optional[str], + prompt_variables: Optional[dict], + dynamic_callback_params: 
StandardCallbackDynamicParams, + litellm_logging_obj: "LiteLLMLoggingObj", + prompt_spec: Optional[PromptSpec] = None, + tools: Optional[List[Dict]] = None, + prompt_label: Optional[str] = None, + prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, + ) -> Tuple[str, List[AllMessageValues], dict]: + if not self.should_run_prompt_management( + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=dynamic_callback_params, + ): + return model, messages, non_default_params - model = self._get_model_from_prompt( - prompt_management_client=prompt_template, model=model + prompt_template = await self.async_compile_prompt( + prompt_id=prompt_id, + prompt_variables=prompt_variables, + client_messages=messages, + dynamic_callback_params=dynamic_callback_params, + prompt_spec=prompt_spec, + prompt_label=prompt_label, + prompt_version=prompt_version, ) - return model, completed_messages, updated_non_default_params + return self.post_compile_prompt_processing( + prompt_template=prompt_template, + messages=messages, + non_default_params=non_default_params, + model=model, + ignore_prompt_manager_model=ignore_prompt_manager_model, + ignore_prompt_manager_optional_params=ignore_prompt_manager_optional_params, + ) diff --git a/litellm/integrations/s3.py b/litellm/integrations/s3.py index 53caeb0d198f..2e70b1d65197 100644 --- a/litellm/integrations/s3.py +++ b/litellm/integrations/s3.py @@ -181,13 +181,13 @@ def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose): def get_s3_object_key( s3_path: str, - team_alias_prefix: str, + prefix: str, start_time: datetime, s3_file_name: str, ) -> str: s3_object_key = ( (s3_path.rstrip("/") + "/" if s3_path else "") - + team_alias_prefix + + prefix + start_time.strftime("%Y-%m-%d") + "/" + s3_file_name diff --git a/litellm/integrations/s3_v2.py b/litellm/integrations/s3_v2.py index a65500c80dcc..534b85e47523 100644 --- a/litellm/integrations/s3_v2.py +++ b/litellm/integrations/s3_v2.py @@ -49,6 +49,8 @@ def __init__( s3_batch_size: Optional[int] = DEFAULT_S3_BATCH_SIZE, s3_config=None, s3_use_team_prefix: bool = False, + s3_strip_base64_files: bool = False, + s3_use_key_prefix: bool = False, **kwargs, ): try: @@ -56,12 +58,7 @@ def __init__( f"in init s3 logger - s3_callback_params {litellm.s3_callback_params}" ) - # IMPORTANT: We use a concurrent limit of 1 to upload to s3 - # Files should get uploaded BUT they should not impact latency of LLM calling logic - self.async_httpx_client = get_async_httpx_client( - llm_provider=httpxSpecialProvider.LoggingCallback, - ) - + # Initialize S3 params first to get the correct s3_verify value self._init_s3_params( s3_bucket_name=s3_bucket_name, s3_region_name=s3_region_name, @@ -80,9 +77,21 @@ def __init__( s3_config=s3_config, s3_path=s3_path, s3_use_team_prefix=s3_use_team_prefix, + s3_strip_base64_files=s3_strip_base64_files, + s3_use_key_prefix=s3_use_key_prefix ) verbose_logger.debug(f"s3 logger using endpoint url {s3_endpoint_url}") + # IMPORTANT + # Create httpx client AFTER _init_s3_params so we have the correct s3_verify value + verbose_logger.debug( + f"s3_v2 logger creating async httpx client with s3_verify={self.s3_verify}" + ) + self.async_httpx_client = get_async_httpx_client( + llm_provider=httpxSpecialProvider.LoggingCallback, + params={"ssl_verify": self.s3_verify} + ) + asyncio.create_task(self.periodic_flush()) self.flush_lock = asyncio.Lock() @@ -124,6 +133,8 @@ def 
_init_s3_params( s3_config=None, s3_path: Optional[str] = None, s3_use_team_prefix: bool = False, + s3_strip_base64_files: bool = False, + s3_use_key_prefix: bool = False, ): """ Initialize the s3 params for this logging callback @@ -144,9 +155,11 @@ def _init_s3_params( litellm.s3_callback_params.get("s3_api_version") or s3_api_version ) self.s3_use_ssl = ( - litellm.s3_callback_params.get("s3_use_ssl", True) or s3_use_ssl + litellm.s3_callback_params.get("s3_use_ssl", True) if litellm.s3_callback_params.get("s3_use_ssl") is not None else s3_use_ssl + ) + self.s3_verify = ( + litellm.s3_callback_params.get("s3_verify") if litellm.s3_callback_params.get("s3_verify") is not None else s3_verify ) - self.s3_verify = litellm.s3_callback_params.get("s3_verify") or s3_verify self.s3_endpoint_url = ( litellm.s3_callback_params.get("s3_endpoint_url") or s3_endpoint_url ) @@ -194,6 +207,16 @@ def _init_s3_params( or s3_use_team_prefix ) + self.s3_use_key_prefix = ( + bool(litellm.s3_callback_params.get("s3_use_key_prefix", False)) + or s3_use_key_prefix + ) + + self.s3_strip_base64_files = ( + bool(litellm.s3_callback_params.get("s3_strip_base64_files", False)) + or s3_strip_base64_files + ) + return async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): @@ -239,7 +262,7 @@ async def _async_log_event_base(self, kwargs, response_obj, start_time, end_time ) except Exception as e: verbose_logger.exception(f"s3 Layer Error - {str(e)}") - pass + self.handle_callback_failure(callback_name="S3Logger") async def async_upload_data_to_s3( self, batch_logging_element: s3BatchLoggingElement @@ -271,6 +294,9 @@ async def async_upload_data_to_s3( verbose_logger.debug( f"s3_v2 logger - uploading data to s3 - {batch_logging_element.s3_object_key}" ) + verbose_logger.debug( + f"s3_v2 logger - s3_verify setting: {self.s3_verify}" + ) # Prepare the URL url = f"https://{self.s3_bucket_name}.s3.{self.s3_region_name}.amazonaws.com/{batch_logging_element.s3_object_key}" @@ -323,6 +349,7 @@ async def async_upload_data_to_s3( response.raise_for_status() except Exception as e: verbose_logger.exception(f"Error uploading to s3: {str(e)}") + self.handle_callback_failure(callback_name="S3Logger") async def async_send_batch(self): """ @@ -364,33 +391,37 @@ def create_s3_batch_logging_element( if standard_logging_payload is None: return None - team_alias = standard_logging_payload["metadata"].get("user_api_key_team_alias") + if self.s3_strip_base64_files: + standard_logging_payload = self._strip_base64_from_messages_sync(standard_logging_payload) - team_alias_prefix = "" - if ( - litellm.enable_preview_features - and self.s3_use_team_prefix - and team_alias is not None - ): - team_alias_prefix = f"{team_alias}/" + # Base prefix (default empty) + prefix_components = [] + if self.s3_use_team_prefix: + team_alias = standard_logging_payload.get("metadata", {}).get("user_api_key_team_alias", None) + if team_alias: + prefix_components.append(team_alias) + if self.s3_use_key_prefix: + user_api_key_alias = standard_logging_payload.get("metadata", {}).get("user_api_key_alias", None) + if user_api_key_alias: + prefix_components.append(user_api_key_alias) + + + # Construct full prefix path + prefix_path = "/".join(prefix_components) + if prefix_path: + prefix_path += "/" s3_file_name = ( litellm.utils.get_logging_id(start_time, standard_logging_payload) or "" ) + verbose_logger.debug(f"Creating s3 file with prefix_components={prefix_components},prefix_path={prefix_path} and {s3_file_name}") s3_object_key = 
get_s3_object_key( s3_path=cast(Optional[str], self.s3_path) or "", - team_alias_prefix=team_alias_prefix, + prefix=prefix_path, start_time=start_time, s3_file_name=s3_file_name, ) - - s3_object_download_filename = ( - "time-" - + start_time.strftime("%Y-%m-%dT%H-%M-%S-%f") - + "_" - + standard_logging_payload["id"] - + ".json" - ) + verbose_logger.debug(f"s3_object_key={s3_object_key}") s3_object_download_filename = f"time-{start_time.strftime('%Y-%m-%dT%H-%M-%S-%f')}_{standard_logging_payload['id']}.json" @@ -465,12 +496,15 @@ def upload_data_to_s3(self, batch_logging_element: s3BatchLoggingElement): # Prepare the signed headers signed_headers = dict(aws_request.headers.items()) - httpx_client = _get_httpx_client() + httpx_client = _get_httpx_client( + params={"ssl_verify": self.s3_verify} if self.s3_verify is not None else None + ) # Make the request response = httpx_client.put(url, data=json_string, headers=signed_headers) response.raise_for_status() except Exception as e: verbose_logger.exception(f"Error uploading to s3: {str(e)}") + self.handle_callback_failure(callback_name="S3Logger") async def _download_object_from_s3(self, s3_object_key: str) -> Optional[dict]: """ diff --git a/litellm/integrations/sqs.py b/litellm/integrations/sqs.py index 545aebbec6d1..97a4c5723d8f 100644 --- a/litellm/integrations/sqs.py +++ b/litellm/integrations/sqs.py @@ -30,6 +30,7 @@ from litellm.types.utils import StandardLoggingPayload from .custom_batch_logger import CustomBatchLogger +from litellm.types.integrations.base_health_check import IntegrationHealthCheckStatus _BASE64_INLINE_PATTERN = re.compile( r"data:(?:application|image|audio|video)/[a-zA-Z0-9.+-]+;base64,[A-Za-z0-9+/=\s]+", @@ -256,6 +257,8 @@ async def async_log_failure_event(self, kwargs, response_obj, start_time, end_ti standard_logging_payload = kwargs.get("standard_logging_object") if standard_logging_payload is None: raise ValueError("standard_logging_payload is None") + if self.sqs_strip_base64_files: + standard_logging_payload = await self._strip_base64_from_messages(standard_logging_payload) self.log_queue.append(standard_logging_payload) verbose_logger.debug( @@ -352,3 +355,19 @@ async def async_send_message(self, payload: StandardLoggingPayload) -> None: response.raise_for_status() except Exception as e: verbose_logger.exception(f"Error sending to SQS: {str(e)}") + + async def async_health_check(self) -> IntegrationHealthCheckStatus: + """ + Health check for SQS by sending a small test message to the configured queue. 
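+        Returns IntegrationHealthCheckStatus(status="healthy") when the test message is
+        accepted, or status="unhealthy" with the error message otherwise.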
+ """ + try: + from litellm.litellm_core_utils.litellm_logging import ( + create_dummy_standard_logging_payload, + ) + # Create a minimal standard logging payload + standard_logging_object: StandardLoggingPayload = create_dummy_standard_logging_payload() + # Attempt to send a single message + await self.async_send_message(standard_logging_object) + return IntegrationHealthCheckStatus(status="healthy", error_message=None) + except Exception as e: + return IntegrationHealthCheckStatus(status="unhealthy", error_message=str(e)) diff --git a/litellm/integrations/vector_store_integrations/vector_store_pre_call_hook.py b/litellm/integrations/vector_store_integrations/vector_store_pre_call_hook.py index 236935778d61..c94b925ea21a 100644 --- a/litellm/integrations/vector_store_integrations/vector_store_pre_call_hook.py +++ b/litellm/integrations/vector_store_integrations/vector_store_pre_call_hook.py @@ -12,6 +12,7 @@ from litellm._logging import verbose_logger from litellm.integrations.custom_logger import CustomLogger from litellm.types.llms.openai import AllMessageValues, ChatCompletionUserMessage +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.utils import StandardCallbackDynamicParams from litellm.types.vector_stores import ( LiteLLM_ManagedVectorStore, @@ -23,7 +24,7 @@ if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj else: - LiteLLMLoggingObj = None + LiteLLMLoggingObj = Any class VectorStorePreCallHook(CustomLogger): @@ -49,9 +50,12 @@ async def async_get_chat_completion_prompt( prompt_variables: Optional[dict], dynamic_callback_params: StandardCallbackDynamicParams, litellm_logging_obj: LiteLLMLoggingObj, + prompt_spec: Optional[PromptSpec] = None, tools: Optional[List[Dict]] = None, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, + ignore_prompt_manager_model: Optional[bool] = False, + ignore_prompt_manager_optional_params: Optional[bool] = False, ) -> Tuple[str, List[AllMessageValues], dict]: """ Perform vector store search and append results as context to messages. @@ -74,9 +78,20 @@ async def async_get_chat_completion_prompt( if litellm.vector_store_registry is None: return model, messages, non_default_params + # Get prisma_client for database fallback + prisma_client = None + try: + from litellm.proxy.proxy_server import prisma_client as _prisma_client + prisma_client = _prisma_client + except ImportError: + pass + + # Use database fallback to ensure synchronization across instances vector_stores_to_run: List[LiteLLM_ManagedVectorStore] = ( - litellm.vector_store_registry.pop_vector_stores_to_run( - non_default_params=non_default_params, tools=tools + await litellm.vector_store_registry.pop_vector_stores_to_run_with_db_fallback( + non_default_params=non_default_params, + tools=tools, + prisma_client=prisma_client ) ) diff --git a/litellm/integrations/weave/__init__.py b/litellm/integrations/weave/__init__.py new file mode 100644 index 000000000000..49af77b55e80 --- /dev/null +++ b/litellm/integrations/weave/__init__.py @@ -0,0 +1,7 @@ +""" +Weave (W&B) integration for LiteLLM via OpenTelemetry. 
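+Exposes WeaveOtelLogger, which sends LLM traces to Weave via the OpenTelemetry protocol (OTLP).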
+""" + +from litellm.integrations.weave.weave_otel import WeaveOtelLogger + +__all__ = ["WeaveOtelLogger"] diff --git a/litellm/integrations/weave/weave_otel.py b/litellm/integrations/weave/weave_otel.py new file mode 100644 index 000000000000..167deaf2cdc6 --- /dev/null +++ b/litellm/integrations/weave/weave_otel.py @@ -0,0 +1,329 @@ +from __future__ import annotations + +import base64 +import json +import os +from typing import TYPE_CHECKING, Any, Optional + +from opentelemetry.trace import Status, StatusCode +from typing_extensions import override + +from litellm._logging import verbose_logger +from litellm.integrations._types.open_inference import SpanAttributes as OpenInferenceSpanAttributes +from litellm.integrations.arize import _utils +from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig +from litellm.integrations.opentelemetry_utils.base_otel_llm_obs_attributes import ( + BaseLLMObsOTELAttributes, + safe_set_attribute, +) +from litellm.litellm_core_utils.safe_json_dumps import safe_dumps +from litellm.types.integrations.weave_otel import WeaveOtelConfig, WeaveSpanAttributes +from litellm.types.utils import StandardCallbackDynamicParams + +if TYPE_CHECKING: + from opentelemetry.trace import Span + + +# Weave OTEL endpoint +# Multi-tenant cloud: https://trace.wandb.ai/otel/v1/traces +# Dedicated cloud: https://.wandb.io/traces/otel/v1/traces +WEAVE_BASE_URL = "https://trace.wandb.ai" +WEAVE_OTEL_ENDPOINT = "/otel/v1/traces" + + +class WeaveLLMObsOTELAttributes(BaseLLMObsOTELAttributes): + """ + Weave-specific LLM observability OTEL attributes. + + Weave automatically maps attributes from multiple frameworks including + GenAI, OpenInference, Langfuse, and others. + """ + + @staticmethod + @override + def set_messages(span: "Span", kwargs: dict[str, Any]): + """Set input messages as span attributes using OpenInference conventions.""" + + messages = kwargs.get("messages") or [] + optional_params = kwargs.get("optional_params") or {} + + prompt = {"messages": messages} + functions = optional_params.get("functions") + tools = optional_params.get("tools") + if functions is not None: + prompt["functions"] = functions + if tools is not None: + prompt["tools"] = tools + safe_set_attribute(span, OpenInferenceSpanAttributes.INPUT_VALUE, json.dumps(prompt)) + + +def _set_weave_specific_attributes(span: Span, kwargs: dict[str, Any], response_obj: Any): + """ + Sets Weave-specific metadata attributes onto the OTEL span. + + Based on Weave's OTEL attribute mappings from: + https://github.com/wandb/weave/blob/master/weave/trace_server/opentelemetry/constants.py + """ + + # Extract all needed data upfront + litellm_params = kwargs.get("litellm_params") or {} + # optional_params = kwargs.get("optional_params") or {} + metadata = kwargs.get("metadata") or {} + model = kwargs.get("model") or "" + custom_llm_provider = litellm_params.get("custom_llm_provider") or "" + + # Weave supports a custom display name and will default to the model name if not provided. + display_name = metadata.get("display_name") + if not display_name and model: + if custom_llm_provider: + display_name = f"{custom_llm_provider}/{model}" + else: + display_name = model + if display_name: + display_name = display_name.replace("/", "__") + safe_set_attribute(span, WeaveSpanAttributes.DISPLAY_NAME.value, display_name) + + # Weave threads are OpenInference sessions. 
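+    # If the caller supplies metadata["session_id"], it is forwarded as the Weave
+    # thread id and the span is marked as a turn in that thread.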
+ if (session_id := metadata.get("session_id")) is not None: + if isinstance(session_id, (list, dict)): + session_id = safe_dumps(session_id) + safe_set_attribute(span, WeaveSpanAttributes.THREAD_ID.value, session_id) + safe_set_attribute(span, WeaveSpanAttributes.IS_TURN.value, True) + + # Response attributes are already set by _utils.set_attributes, + # but we override them here to better match Weave's expectations + if response_obj: + output_dict = None + if hasattr(response_obj, "model_dump"): + output_dict = response_obj.model_dump() + elif hasattr(response_obj, "get"): + output_dict = response_obj + + if output_dict: + safe_set_attribute(span, OpenInferenceSpanAttributes.OUTPUT_VALUE, safe_dumps(output_dict)) + + +def _get_weave_authorization_header(api_key: str) -> str: + """ + Get the authorization header for Weave OpenTelemetry. + + Weave uses Basic auth with format: api: + """ + auth_string = f"api:{api_key}" + auth_header = base64.b64encode(auth_string.encode()).decode() + return f"Basic {auth_header}" + + +def get_weave_otel_config() -> WeaveOtelConfig: + """ + Retrieves the Weave OpenTelemetry configuration based on environment variables. + + Environment Variables: + WANDB_API_KEY: Required. W&B API key for authentication. + WANDB_PROJECT_ID: Required. Project ID in format /. + WANDB_HOST: Optional. Custom Weave host URL. Defaults to cloud endpoint. + + Returns: + WeaveOtelConfig: A Pydantic model containing Weave OTEL configuration. + + Raises: + ValueError: If required environment variables are missing. + """ + api_key = os.getenv("WANDB_API_KEY") + project_id = os.getenv("WANDB_PROJECT_ID") + host = os.getenv("WANDB_HOST") + + if not api_key: + raise ValueError("WANDB_API_KEY must be set for Weave OpenTelemetry integration.") + + if not project_id: + raise ValueError( + "WANDB_PROJECT_ID must be set for Weave OpenTelemetry integration. Format: /" + ) + + if host: + if not host.startswith("http"): + host = "https://" + host + # Self-managed instances use a different path + endpoint = host.rstrip("/") + WEAVE_OTEL_ENDPOINT + verbose_logger.debug(f"Using Weave OTEL endpoint from host: {endpoint}") + else: + endpoint = WEAVE_BASE_URL + WEAVE_OTEL_ENDPOINT + verbose_logger.debug(f"Using Weave cloud endpoint: {endpoint}") + + # Weave uses Basic auth with format: api: + auth_header = _get_weave_authorization_header(api_key=api_key) + otlp_auth_headers = f"Authorization={auth_header},project_id={project_id}" + + # Set standard OTEL environment variables + os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = endpoint + os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = otlp_auth_headers + + return WeaveOtelConfig( + otlp_auth_headers=otlp_auth_headers, + endpoint=endpoint, + project_id=project_id, + protocol="otlp_http", + ) + + +def set_weave_otel_attributes(span: Span, kwargs: dict[str, Any], response_obj: Any): + """ + Sets OpenTelemetry span attributes for Weave observability. + Uses the same attribute setting logic as other OTEL integrations for consistency. + """ + _utils.set_attributes(span, kwargs, response_obj, WeaveLLMObsOTELAttributes) + _set_weave_specific_attributes(span=span, kwargs=kwargs, response_obj=response_obj) + + +class WeaveOtelLogger(OpenTelemetry): + """ + Weave (W&B) OpenTelemetry Logger for LiteLLM. + + Sends LLM traces to Weave via the OpenTelemetry Protocol (OTLP). + + Environment Variables: + WANDB_API_KEY: Required. Weights & Biases API key for authentication. + WANDB_PROJECT_ID: Required. Project ID in format /. + WANDB_HOST: Optional. Custom Weave host URL. 
Defaults to cloud endpoint. + + Usage: + litellm.callbacks = ["weave_otel"] + + Or manually: + from litellm.integrations.weave.weave_otel import WeaveOtelLogger + weave_logger = WeaveOtelLogger(callback_name="weave_otel") + litellm.callbacks = [weave_logger] + + Reference: + https://docs.wandb.ai/weave/guides/tracking/otel + """ + + def __init__( + self, + config: Optional[OpenTelemetryConfig] = None, + callback_name: Optional[str] = "weave_otel", + **kwargs, + ): + """ + Initialize WeaveOtelLogger. + + If config is not provided, automatically configures from environment variables + (WANDB_API_KEY, WANDB_PROJECT_ID, WANDB_HOST) via get_weave_otel_config(). + """ + if config is None: + # Auto-configure from Weave environment variables + weave_config = get_weave_otel_config() + + config = OpenTelemetryConfig( + exporter=weave_config.protocol, + endpoint=weave_config.endpoint, + headers=weave_config.otlp_auth_headers, + ) + + super().__init__(config=config, callback_name=callback_name, **kwargs) + + def _maybe_log_raw_request(self, kwargs, response_obj, start_time, end_time, parent_span): + """ + Override to skip creating the raw_gen_ai_request child span. + + For Weave, we only want a single span per LLM call. The parent span + already contains all the necessary attributes, so the child span + is redundant. + """ + pass + + def _start_primary_span( + self, + kwargs, + response_obj, + start_time, + end_time, + context, + parent_span=None, + ): + """ + Override to always create a child span instead of reusing the parent span. + + This ensures that wrapper spans (like "B", "C", "D", "E") remain separate + from the LiteLLM LLM call spans, creating proper nesting in Weave. + """ + + otel_tracer = self.get_tracer_to_use_for_request(kwargs) + # Always create a new child span, even if parent_span is provided + # This ensures wrapper spans remain separate from LLM call spans + span = otel_tracer.start_span( + name=self._get_span_name(kwargs), + start_time=self._to_ns(start_time), + context=context, + ) + span.set_status(Status(StatusCode.OK)) + self.set_attributes(span, kwargs, response_obj) + span.end(end_time=self._to_ns(end_time)) + return span + + def _handle_success(self, kwargs, response_obj, start_time, end_time): + """ + Override to prevent ending externally created parent spans. + + When wrapper spans (like "B", "C", "D", "E") are provided as parent spans, + they should be managed by the user code, not ended by LiteLLM. + """ + + verbose_logger.debug( + "Weave OpenTelemetry Logger: Logging kwargs: %s, OTEL config settings=%s", + kwargs, + self.config, + ) + ctx, parent_span = self._get_span_context(kwargs) + + # Always create a child span (handled by _start_primary_span override) + primary_span_parent = None + + # 1. Primary span + span = self._start_primary_span(kwargs, response_obj, start_time, end_time, ctx, primary_span_parent) + + # 2. Raw-request sub-span (skipped for Weave via _maybe_log_raw_request override) + self._maybe_log_raw_request(kwargs, response_obj, start_time, end_time, span) + + # 3. Guardrail span + self._create_guardrail_span(kwargs=kwargs, context=ctx) + + # 4. Metrics & cost recording + self._record_metrics(kwargs, response_obj, start_time, end_time) + + # 5. Semantic logs. + if self.config.enable_events: + self._emit_semantic_logs(kwargs, response_obj, span) + + # 6. Don't end parent span - it's managed by user code + # Since we always create a child span (never reuse parent), the parent span + # lifecycle is owned by the user. 
This prevents double-ending of wrapper spans + # like "B", "C", "D", "E" that users create and manage themselves. + + def construct_dynamic_otel_headers( + self, standard_callback_dynamic_params: StandardCallbackDynamicParams + ) -> dict | None: + """ + Construct dynamic Weave headers from standard callback dynamic params. + + This is used for team/key based logging. + + Returns: + dict: A dictionary of dynamic Weave headers + """ + dynamic_headers = {} + + dynamic_wandb_api_key = standard_callback_dynamic_params.get("wandb_api_key") + dynamic_weave_project_id = standard_callback_dynamic_params.get("weave_project_id") + + if dynamic_wandb_api_key: + auth_header = _get_weave_authorization_header( + api_key=dynamic_wandb_api_key, + ) + dynamic_headers["Authorization"] = auth_header + + if dynamic_weave_project_id: + dynamic_headers["project_id"] = dynamic_weave_project_id + + return dynamic_headers if dynamic_headers else None diff --git a/litellm/integrations/websearch_interception/ARCHITECTURE.md b/litellm/integrations/websearch_interception/ARCHITECTURE.md new file mode 100644 index 000000000000..3aa0a1558d74 --- /dev/null +++ b/litellm/integrations/websearch_interception/ARCHITECTURE.md @@ -0,0 +1,292 @@ +# WebSearch Interception Architecture + +Server-side WebSearch tool execution for models that don't natively support it (e.g., Bedrock/Claude). + +## How It Works + +User makes **ONE** `litellm.messages.acreate()` call → Gets final answer with search results. +The agentic loop happens transparently on the server. + +## LiteLLM Standard Web Search Tool + +LiteLLM defines a standard web search tool format (`litellm_web_search`) that all native provider tools are converted to. This enables consistent interception across providers. + +**Standard Tool Definition** (defined in `tools.py`): +```python +{ + "name": "litellm_web_search", + "description": "Search the web for information...", + "input_schema": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "The search query"} + }, + "required": ["query"] + } +} +``` + +**Tool Name Constant**: `LITELLM_WEB_SEARCH_TOOL_NAME = "litellm_web_search"` (defined in `litellm/constants.py`) + +### Supported Tool Formats + +The interception system automatically detects and handles: + +| Tool Format | Example | Provider | Detection Method | Future-Proof | +|-------------|---------|----------|------------------|-------------| +| **LiteLLM Standard** | `name="litellm_web_search"` | Any | Direct name match | N/A | +| **Anthropic Native** | `type="web_search_20250305"` | Bedrock, Claude API | Type prefix: `startswith("web_search_")` | ✅ Yes (web_search_2026, etc.) | +| **Claude Code CLI** | `name="web_search"`, `type="web_search_20250305"` | Claude Code | Name + type check | ✅ Yes (version-agnostic) | +| **Legacy** | `name="WebSearch"` | Custom | Name match | N/A (backwards compat) | + +**Future Compatibility**: The `startswith("web_search_")` check in `tools.py` automatically supports future Anthropic web search versions. + +### Claude Code CLI Integration + +Claude Code (Anthropic's official CLI) sends web search requests using Anthropic's native tool format: + +```python +{ + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 8 +} +``` + +**What Happens:** +1. Claude Code sends native `web_search_20250305` tool to LiteLLM proxy +2. LiteLLM intercepts and converts to `litellm_web_search` standard format +3. Bedrock receives converted tool (NOT native format) +4. 
Model returns `tool_use` block for `litellm_web_search` (not `server_tool_use`) +5. LiteLLM's agentic loop intercepts the `tool_use` +6. Executes `litellm.asearch()` using configured provider (Perplexity, Tavily, etc.) +7. Returns final answer to Claude Code user + +**Without Interception**: Bedrock would receive native tool → try to execute natively → return `web_search_tool_result_error` with `invalid_tool_input` + +**With Interception**: LiteLLM converts → Bedrock returns tool_use → LiteLLM executes search → Returns final answer ✅ + +### Native Tool Conversion + +Native tools are converted to LiteLLM standard format **before** sending to the provider: + +1. **Conversion Point** (`litellm/llms/anthropic/experimental_pass_through/messages/handler.py`): + - In `anthropic_messages()` function (lines 60-127) + - Runs BEFORE the API request is made + - Detects native web search tools using `is_web_search_tool()` + - Converts to `litellm_web_search` format using `get_litellm_web_search_tool()` + - Prevents provider from executing search natively (avoids `web_search_tool_result_error`) + +2. **Response Detection** (`transformation.py`): + - Detects `tool_use` blocks with any web search tool name + - Handles: `litellm_web_search`, `WebSearch`, `web_search` + - Extracts search queries for execution + +**Example Conversion**: +```python +# Input (Claude Code's native tool) +{ + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 8 +} + +# Output (LiteLLM standard) +{ + "name": "litellm_web_search", + "description": "Search the web for information...", + "input_schema": {...} +} +``` + +--- + +## Request Flow + +### Without Interception (Client-Side) +User manually handles tool execution: +1. User calls `litellm.messages.acreate()` → Gets `tool_use` response +2. User executes `litellm.asearch()` +3. User calls `litellm.messages.acreate()` again with results +4. 
User gets final answer + +**Result**: 2 API calls, manual tool execution + +### With Interception (Server-Side) +Server handles tool execution automatically: + +```mermaid +sequenceDiagram + participant User + participant Messages as litellm.messages.acreate() + participant Handler as llm_http_handler.py + participant Logger as WebSearchInterceptionLogger + participant Router as proxy_server.llm_router + participant Search as litellm.asearch() + participant Provider as Bedrock API + + User->>Messages: acreate(tools=[WebSearch]) + Messages->>Handler: async_anthropic_messages_handler() + Handler->>Provider: Request + Provider-->>Handler: Response (tool_use) + Handler->>Logger: async_should_run_agentic_loop() + Logger->>Logger: Detect WebSearch tool_use + Logger-->>Handler: (True, tools) + Handler->>Logger: async_run_agentic_loop(tools) + Logger->>Router: Get search_provider from search_tools + Router-->>Logger: search_provider + Logger->>Search: asearch(query, provider) + Search-->>Logger: Search results + Logger->>Logger: Build tool_result message + Logger->>Messages: acreate() with results + Messages->>Provider: Request with search results + Provider-->>Messages: Final answer + Messages-->>Logger: Final response + Logger-->>Handler: Final response + Handler-->>User: Final answer (with search results) +``` + +**Result**: 1 API call from user, server handles agentic loop + +--- + +## Key Components + +| Component | File | Purpose | +|-----------|------|---------| +| **WebSearchInterceptionLogger** | `handler.py` | CustomLogger that implements agentic loop hooks | +| **Tool Standardization** | `tools.py` | Standard tool definition, detection, and utilities | +| **Tool Name Constant** | `constants.py` | `LITELLM_WEB_SEARCH_TOOL_NAME = "litellm_web_search"` | +| **Tool Conversion** | `anthropic/.../ handler.py` | Converts native tools to LiteLLM standard before API call | +| **Transformation Logic** | `transformation.py` | Detect tool_use, build tool_result messages, format search responses | +| **Agentic Loop Hooks** | `integrations/custom_logger.py` | Base hooks: `async_should_run_agentic_loop()`, `async_run_agentic_loop()` | +| **Hook Orchestration** | `llms/custom_httpx/llm_http_handler.py` | `_call_agentic_completion_hooks()` - calls hooks after response | +| **Router Search Tools** | `proxy/proxy_server.py` | `llm_router.search_tools` - configured search providers | +| **Search Endpoints** | `proxy/search_endpoints/endpoints.py` | Router logic for selecting search provider | + +--- + +## Configuration + +```python +from litellm.integrations.websearch_interception import ( + WebSearchInterceptionLogger, + get_litellm_web_search_tool, +) +from litellm.types.utils import LlmProviders + +# Enable for Bedrock with specific search tool +litellm.callbacks = [ + WebSearchInterceptionLogger( + enabled_providers=[LlmProviders.BEDROCK], + search_tool_name="my-perplexity-tool" # Optional: uses router's first tool if None + ) +] + +# Make request with LiteLLM standard tool (recommended) +response = await litellm.messages.acreate( + model="bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0", + messages=[{"role": "user", "content": "What is LiteLLM?"}], + tools=[get_litellm_web_search_tool()], # LiteLLM standard + max_tokens=1024, + stream=True # Auto-converted to non-streaming +) + +# OR send native tools - they're auto-converted to LiteLLM standard +response = await litellm.messages.acreate( + model="bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0", + messages=[{"role": "user", "content": "What 
is LiteLLM?"}], + tools=[{ + "type": "web_search_20250305", # Native Anthropic format + "name": "web_search", + "max_uses": 8 + }], + max_tokens=1024, +) +``` + +--- + +## Streaming Support + +WebSearch interception works transparently with both streaming and non-streaming requests. + +**How streaming is handled:** +1. User makes request with `stream=True` and WebSearch tool +2. Before API call, `anthropic_messages()` detects WebSearch + interception enabled +3. Converts `stream=True` → `stream=False` internally +4. Agentic loop executes with non-streaming responses +5. Final response returned to user (non-streaming) + +**Why this approach:** +- Server-side agentic loops require consuming full responses to detect tool_use +- User opts into this behavior by enabling WebSearch interception +- Provides seamless experience without client changes + +**Testing:** +- **Non-streaming**: `test_websearch_interception_e2e.py` +- **Streaming**: `test_websearch_interception_streaming_e2e.py` + +--- + +## Search Provider Selection + +1. If `search_tool_name` specified → Look up in `llm_router.search_tools` +2. If not found or None → Use first available search tool +3. If no router or no tools → Fallback to `perplexity` + +Example router config: +```yaml +search_tools: + - search_tool_name: "my-perplexity-tool" + litellm_params: + search_provider: "perplexity" + - search_tool_name: "my-tavily-tool" + litellm_params: + search_provider: "tavily" +``` + +--- + +## Message Flow + +### Initial Request +```python +messages = [{"role": "user", "content": "What is LiteLLM?"}] +tools = [{"name": "WebSearch", ...}] +``` + +### First API Call (Internal) +**Response**: `tool_use` with `name="WebSearch"`, `input={"query": "what is litellm"}` + +### Server Processing +1. Logger detects WebSearch tool_use +2. Looks up search provider from router +3. Executes `litellm.asearch(query="what is litellm", search_provider="perplexity")` +4. Gets results: `"Title: LiteLLM Docs\nURL: docs.litellm.ai\n..."` + +### Follow-Up Request (Internal) +```python +messages = [ + {"role": "user", "content": "What is LiteLLM?"}, + {"role": "assistant", "content": [{"type": "tool_use", ...}]}, + {"role": "user", "content": [{"type": "tool_result", "content": "search results..."}]} +] +``` + +### User Receives +```python +response.content[0].text +# "Based on the search results, LiteLLM is a unified interface..." +``` + +--- + +## Testing + +**E2E Tests**: +- `test_websearch_interception_e2e.py` - Non-streaming real API calls to Bedrock +- `test_websearch_interception_streaming_e2e.py` - Streaming real API calls to Bedrock + +**Unit Tests**: `test_websearch_interception.py` +Mocked tests for tool detection, provider filtering, edge cases. diff --git a/litellm/integrations/websearch_interception/__init__.py b/litellm/integrations/websearch_interception/__init__.py new file mode 100644 index 000000000000..f5b1963c1cff --- /dev/null +++ b/litellm/integrations/websearch_interception/__init__.py @@ -0,0 +1,20 @@ +""" +WebSearch Interception Module + +Provides server-side WebSearch tool execution for models that don't natively +support server-side tool calling (e.g., Bedrock/Claude). 
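+Exports WebSearchInterceptionLogger together with the tool helpers
+get_litellm_web_search_tool() and is_web_search_tool().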
+""" + +from litellm.integrations.websearch_interception.handler import ( + WebSearchInterceptionLogger, +) +from litellm.integrations.websearch_interception.tools import ( + get_litellm_web_search_tool, + is_web_search_tool, +) + +__all__ = [ + "WebSearchInterceptionLogger", + "get_litellm_web_search_tool", + "is_web_search_tool", +] diff --git a/litellm/integrations/websearch_interception/handler.py b/litellm/integrations/websearch_interception/handler.py new file mode 100644 index 000000000000..5d36b760afb9 --- /dev/null +++ b/litellm/integrations/websearch_interception/handler.py @@ -0,0 +1,560 @@ +""" +WebSearch Interception Handler + +CustomLogger that intercepts WebSearch tool calls for models that don't +natively support web search (e.g., Bedrock/Claude) and executes them +server-side using litellm router's search tools. +""" + +import asyncio +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import litellm +from litellm._logging import verbose_logger +from litellm.anthropic_interface import messages as anthropic_messages +from litellm.constants import LITELLM_WEB_SEARCH_TOOL_NAME +from litellm.integrations.custom_logger import CustomLogger +from litellm.integrations.websearch_interception.tools import ( + get_litellm_web_search_tool, + is_web_search_tool, +) +from litellm.integrations.websearch_interception.transformation import ( + WebSearchTransformation, +) +from litellm.types.integrations.websearch_interception import ( + WebSearchInterceptionConfig, +) +from litellm.types.utils import LlmProviders + + +class WebSearchInterceptionLogger(CustomLogger): + """ + CustomLogger that intercepts WebSearch tool calls for models that don't + natively support web search. + + Implements agentic loop: + 1. Detects WebSearch tool_use in model response + 2. Executes litellm.asearch() for each query using router's search tools + 3. Makes follow-up request with search results + 4. Returns final response + """ + + def __init__( + self, + enabled_providers: Optional[List[Union[LlmProviders, str]]] = None, + search_tool_name: Optional[str] = None, + ): + """ + Args: + enabled_providers: List of LLM providers to enable interception for. + Use LlmProviders enum values (e.g., [LlmProviders.BEDROCK]) + Default: [LlmProviders.BEDROCK] + search_tool_name: Name of search tool configured in router's search_tools. + If None, will attempt to use first available search tool. + """ + super().__init__() + # Convert enum values to strings for comparison + if enabled_providers is None: + self.enabled_providers = [LlmProviders.BEDROCK.value] + else: + self.enabled_providers = [ + p.value if isinstance(p, LlmProviders) else p + for p in enabled_providers + ] + self.search_tool_name = search_tool_name + self._request_has_websearch = False # Track if current request has web search + + async def async_pre_call_deployment_hook( + self, kwargs: Dict[str, Any], call_type: Optional[Any] + ) -> Optional[dict]: + """ + Pre-call hook to convert native Anthropic web_search tools to regular tools. + + This prevents Bedrock from trying to execute web search server-side (which fails). + Instead, we convert it to a regular tool so the model returns tool_use blocks + that we can intercept and execute ourselves. 
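+
+        Returns a dict of updated request kwargs ({"tools": converted_tools}) when a
+        web search tool is present for an enabled provider, otherwise None.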
+ """ + # Check if this is for an enabled provider + custom_llm_provider = kwargs.get("litellm_params", {}).get("custom_llm_provider", "") + if custom_llm_provider not in self.enabled_providers: + return None + + # Check if request has tools with native web_search + tools = kwargs.get("tools") + if not tools: + return None + + # Check if any tool is a web search tool (native or already LiteLLM standard) + has_websearch = any(is_web_search_tool(t) for t in tools) + + if not has_websearch: + return None + + verbose_logger.debug( + "WebSearchInterception: Converting native web_search tools to LiteLLM standard" + ) + + # Convert native/custom web_search tools to LiteLLM standard + converted_tools = [] + for tool in tools: + if is_web_search_tool(tool): + # Convert to LiteLLM standard web search tool + converted_tool = get_litellm_web_search_tool() + converted_tools.append(converted_tool) + verbose_logger.debug( + f"WebSearchInterception: Converted {tool.get('name', 'unknown')} " + f"(type={tool.get('type', 'none')}) to {LITELLM_WEB_SEARCH_TOOL_NAME}" + ) + else: + # Keep other tools as-is + converted_tools.append(tool) + + # Return modified kwargs with converted tools + return {"tools": converted_tools} + + @classmethod + def from_config_yaml( + cls, config: WebSearchInterceptionConfig + ) -> "WebSearchInterceptionLogger": + """ + Initialize WebSearchInterceptionLogger from proxy config.yaml parameters. + + Args: + config: Configuration dictionary from litellm_settings.websearch_interception_params + + Returns: + Configured WebSearchInterceptionLogger instance + + Example: + From proxy_config.yaml: + litellm_settings: + websearch_interception_params: + enabled_providers: ["bedrock"] + search_tool_name: "my-perplexity-search" + + Usage: + config = litellm_settings.get("websearch_interception_params", {}) + logger = WebSearchInterceptionLogger.from_config_yaml(config) + """ + # Extract parameters from config + enabled_providers_str = config.get("enabled_providers", None) + search_tool_name = config.get("search_tool_name", None) + + # Convert string provider names to LlmProviders enum values + enabled_providers: Optional[List[Union[LlmProviders, str]]] = None + if enabled_providers_str is not None: + enabled_providers = [] + for provider in enabled_providers_str: + try: + # Try to convert string to LlmProviders enum + provider_enum = LlmProviders(provider) + enabled_providers.append(provider_enum) + except ValueError: + # If conversion fails, keep as string + enabled_providers.append(provider) + + return cls( + enabled_providers=enabled_providers, + search_tool_name=search_tool_name, + ) + + async def async_pre_request_hook( + self, model: str, messages: List[Dict], kwargs: Dict + ) -> Optional[Dict]: + """ + Pre-request hook to convert native web search tools to LiteLLM standard. + + This hook is called before the API request is made, allowing us to: + 1. Detect native web search tools (web_search_20250305, etc.) + 2. Convert them to LiteLLM standard format (litellm_web_search) + 3. Convert stream=True to stream=False for interception + + This prevents providers like Bedrock from trying to execute web search + natively (which fails), and ensures our agentic loop can intercept tool_use. 
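+        Non-search tools in the request are passed through unchanged.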
+ + Returns: + Modified kwargs dict with converted tools, or None if no modifications needed + """ + # Check if this request is for an enabled provider + custom_llm_provider = kwargs.get("litellm_params", {}).get( + "custom_llm_provider", "" + ) + + verbose_logger.debug( + f"WebSearchInterception: Pre-request hook called" + f" - custom_llm_provider={custom_llm_provider}" + f" - enabled_providers={self.enabled_providers}" + ) + + if custom_llm_provider not in self.enabled_providers: + verbose_logger.debug( + f"WebSearchInterception: Skipping - provider {custom_llm_provider} not in {self.enabled_providers}" + ) + return None + + # Check if request has tools + tools = kwargs.get("tools") + if not tools: + return None + + # Check if any tool is a web search tool + has_websearch = any(is_web_search_tool(t) for t in tools) + if not has_websearch: + return None + + verbose_logger.debug( + f"WebSearchInterception: Pre-request hook triggered for provider={custom_llm_provider}" + ) + + # Convert native web search tools to LiteLLM standard + converted_tools = [] + for tool in tools: + if is_web_search_tool(tool): + standard_tool = get_litellm_web_search_tool() + converted_tools.append(standard_tool) + verbose_logger.debug( + f"WebSearchInterception: Converted {tool.get('name', 'unknown')} " + f"(type={tool.get('type', 'none')}) to {LITELLM_WEB_SEARCH_TOOL_NAME}" + ) + else: + converted_tools.append(tool) + + # Update kwargs with converted tools + kwargs["tools"] = converted_tools + verbose_logger.debug( + f"WebSearchInterception: Tools after conversion: {[t.get('name') for t in converted_tools]}" + ) + + # Convert stream=True to stream=False for WebSearch interception + if kwargs.get("stream"): + verbose_logger.debug( + "WebSearchInterception: Converting stream=True to stream=False" + ) + kwargs["stream"] = False + kwargs["_websearch_interception_converted_stream"] = True + + return kwargs + + async def async_should_run_agentic_loop( + self, + response: Any, + model: str, + messages: List[Dict], + tools: Optional[List[Dict]], + stream: bool, + custom_llm_provider: str, + kwargs: Dict, + ) -> Tuple[bool, Dict]: + """Check if WebSearch tool interception is needed""" + + verbose_logger.debug(f"WebSearchInterception: Hook called! provider={custom_llm_provider}, stream={stream}") + verbose_logger.debug(f"WebSearchInterception: Response type: {type(response)}") + + # Check if provider should be intercepted + # Note: custom_llm_provider is already normalized by get_llm_provider() + # (e.g., "bedrock/invoke/..." 
-> "bedrock") + if custom_llm_provider not in self.enabled_providers: + verbose_logger.debug( + f"WebSearchInterception: Skipping provider {custom_llm_provider} (not in enabled list: {self.enabled_providers})" + ) + return False, {} + + # Check if tools include any web search tool (LiteLLM standard or native) + has_websearch_tool = any(is_web_search_tool(t) for t in (tools or [])) + if not has_websearch_tool: + verbose_logger.debug( + "WebSearchInterception: No web search tool in request" + ) + return False, {} + + # Detect WebSearch tool_use in response + should_intercept, tool_calls = WebSearchTransformation.transform_request( + response=response, + stream=stream, + ) + + if not should_intercept: + verbose_logger.debug( + "WebSearchInterception: No WebSearch tool_use detected in response" + ) + return False, {} + + verbose_logger.debug( + f"WebSearchInterception: Detected {len(tool_calls)} WebSearch tool call(s), executing agentic loop" + ) + + # Return tools dict with tool calls + tools_dict = { + "tool_calls": tool_calls, + "tool_type": "websearch", + "provider": custom_llm_provider, + } + return True, tools_dict + + async def async_run_agentic_loop( + self, + tools: Dict, + model: str, + messages: List[Dict], + response: Any, + anthropic_messages_provider_config: Any, + anthropic_messages_optional_request_params: Dict, + logging_obj: Any, + stream: bool, + kwargs: Dict, + ) -> Any: + """Execute agentic loop with WebSearch execution""" + + tool_calls = tools["tool_calls"] + + verbose_logger.debug( + f"WebSearchInterception: Executing agentic loop for {len(tool_calls)} search(es)" + ) + + return await self._execute_agentic_loop( + model=model, + messages=messages, + tool_calls=tool_calls, + anthropic_messages_optional_request_params=anthropic_messages_optional_request_params, + logging_obj=logging_obj, + stream=stream, + kwargs=kwargs, + ) + + async def _execute_agentic_loop( + self, + model: str, + messages: List[Dict], + tool_calls: List[Dict], + anthropic_messages_optional_request_params: Dict, + logging_obj: Any, + stream: bool, + kwargs: Dict, + ) -> Any: + """Execute litellm.search() and make follow-up request""" + + # Extract search queries from tool_use blocks + search_tasks = [] + for tool_call in tool_calls: + query = tool_call["input"].get("query") + if query: + verbose_logger.debug( + f"WebSearchInterception: Queuing search for query='{query}'" + ) + search_tasks.append(self._execute_search(query)) + else: + verbose_logger.warning( + f"WebSearchInterception: Tool call {tool_call['id']} has no query" + ) + # Add empty result for tools without query + search_tasks.append(self._create_empty_search_result()) + + # Execute searches in parallel + verbose_logger.debug( + f"WebSearchInterception: Executing {len(search_tasks)} search(es) in parallel" + ) + search_results = await asyncio.gather(*search_tasks, return_exceptions=True) + + # Handle any exceptions in search results + final_search_results: List[str] = [] + for i, result in enumerate(search_results): + if isinstance(result, Exception): + verbose_logger.error( + f"WebSearchInterception: Search {i} failed with error: {str(result)}" + ) + final_search_results.append( + f"Search failed: {str(result)}" + ) + elif isinstance(result, str): + # Explicitly cast to str for type checker + final_search_results.append(cast(str, result)) + else: + # Should never happen, but handle for type safety + verbose_logger.warning( + f"WebSearchInterception: Unexpected result type {type(result)} at index {i}" + ) + 
final_search_results.append(str(result)) + + # Build assistant and user messages using transformation + assistant_message, user_message = WebSearchTransformation.transform_response( + tool_calls=tool_calls, + search_results=final_search_results, + ) + + # Make follow-up request with search results + follow_up_messages = messages + [assistant_message, user_message] + + verbose_logger.debug( + "WebSearchInterception: Making follow-up request with search results" + ) + verbose_logger.debug( + f"WebSearchInterception: Follow-up messages count: {len(follow_up_messages)}" + ) + verbose_logger.debug( + f"WebSearchInterception: Last message (tool_result): {user_message}" + ) + + # Use anthropic_messages.acreate for follow-up request + try: + # Extract max_tokens from optional params or kwargs + # max_tokens is a required parameter for anthropic_messages.acreate() + max_tokens = anthropic_messages_optional_request_params.get( + "max_tokens", + kwargs.get("max_tokens", 1024) # Default to 1024 if not found + ) + + verbose_logger.debug( + f"WebSearchInterception: Using max_tokens={max_tokens} for follow-up request" + ) + + # Create a copy of optional params without max_tokens (since we pass it explicitly) + optional_params_without_max_tokens = { + k: v for k, v in anthropic_messages_optional_request_params.items() + if k != 'max_tokens' + } + + # Remove internal websearch interception flags from kwargs before follow-up request + # These flags are used internally and should not be passed to the LLM provider + kwargs_for_followup = { + k: v for k, v in kwargs.items() + if not k.startswith('_websearch_interception') + } + + # Get model from logging_obj.model_call_details["agentic_loop_params"] + # This preserves the full model name with provider prefix (e.g., "bedrock/invoke/...") + full_model_name = model + if logging_obj is not None: + agentic_params = logging_obj.model_call_details.get("agentic_loop_params", {}) + full_model_name = agentic_params.get("model", model) + verbose_logger.debug( + f"WebSearchInterception: Using model name: {full_model_name}" + ) + + final_response = await anthropic_messages.acreate( + max_tokens=max_tokens, + messages=follow_up_messages, + model=full_model_name, + **optional_params_without_max_tokens, + **kwargs_for_followup, + ) + verbose_logger.debug( + f"WebSearchInterception: Follow-up request completed, response type: {type(final_response)}" + ) + verbose_logger.debug( + f"WebSearchInterception: Final response: {final_response}" + ) + return final_response + except Exception as e: + verbose_logger.exception( + f"WebSearchInterception: Follow-up request failed: {str(e)}" + ) + raise + + async def _execute_search(self, query: str) -> str: + """Execute a single web search using router's search tools""" + try: + # Import router from proxy_server + try: + from litellm.proxy.proxy_server import llm_router + except ImportError: + verbose_logger.warning( + "WebSearchInterception: Could not import llm_router from proxy_server, " + "falling back to direct litellm.asearch() with perplexity" + ) + llm_router = None + + # Determine search provider from router's search_tools + search_provider: Optional[str] = None + if llm_router is not None and hasattr(llm_router, "search_tools"): + if self.search_tool_name: + # Find specific search tool by name + matching_tools = [ + tool for tool in llm_router.search_tools + if tool.get("search_tool_name") == self.search_tool_name + ] + if matching_tools: + search_tool = matching_tools[0] + search_provider = search_tool.get("litellm_params", 
{}).get("search_provider") + verbose_logger.debug( + f"WebSearchInterception: Found search tool '{self.search_tool_name}' " + f"with provider '{search_provider}'" + ) + else: + verbose_logger.warning( + f"WebSearchInterception: Search tool '{self.search_tool_name}' not found in router, " + "falling back to first available or perplexity" + ) + + # If no specific tool or not found, use first available + if not search_provider and llm_router.search_tools: + first_tool = llm_router.search_tools[0] + search_provider = first_tool.get("litellm_params", {}).get("search_provider") + verbose_logger.debug( + f"WebSearchInterception: Using first available search tool with provider '{search_provider}'" + ) + + # Fallback to perplexity if no router or no search tools configured + if not search_provider: + search_provider = "perplexity" + verbose_logger.debug( + "WebSearchInterception: No search tools configured in router, " + f"using default provider '{search_provider}'" + ) + + verbose_logger.debug( + f"WebSearchInterception: Executing search for '{query}' using provider '{search_provider}'" + ) + result = await litellm.asearch( + query=query, search_provider=search_provider + ) + + # Format using transformation function + search_result_text = WebSearchTransformation.format_search_response(result) + + verbose_logger.debug( + f"WebSearchInterception: Search completed for '{query}', got {len(search_result_text)} chars" + ) + return search_result_text + except Exception as e: + verbose_logger.error( + f"WebSearchInterception: Search failed for '{query}': {str(e)}" + ) + raise + + async def _create_empty_search_result(self) -> str: + """Create an empty search result for tool calls without queries""" + return "No search query provided" + + @staticmethod + def initialize_from_proxy_config( + litellm_settings: Dict[str, Any], + callback_specific_params: Dict[str, Any], + ) -> "WebSearchInterceptionLogger": + """ + Static method to initialize WebSearchInterceptionLogger from proxy config. + + Used in callback_utils.py to simplify initialization logic. + + Args: + litellm_settings: Dictionary containing litellm_settings from proxy_config.yaml + callback_specific_params: Dictionary containing callback-specific parameters + + Returns: + Configured WebSearchInterceptionLogger instance + + Example: + From callback_utils.py: + websearch_obj = WebSearchInterceptionLogger.initialize_from_proxy_config( + litellm_settings=litellm_settings, + callback_specific_params=callback_specific_params + ) + """ + # Get websearch_interception_params from litellm_settings or callback_specific_params + websearch_params: WebSearchInterceptionConfig = {} + if "websearch_interception_params" in litellm_settings: + websearch_params = litellm_settings["websearch_interception_params"] + elif "websearch_interception" in callback_specific_params: + websearch_params = callback_specific_params["websearch_interception"] + + # Use classmethod to initialize from config + return WebSearchInterceptionLogger.from_config_yaml(websearch_params) diff --git a/litellm/integrations/websearch_interception/tools.py b/litellm/integrations/websearch_interception/tools.py new file mode 100644 index 000000000000..4f8b7372fe35 --- /dev/null +++ b/litellm/integrations/websearch_interception/tools.py @@ -0,0 +1,95 @@ +""" +LiteLLM Web Search Tool Definition + +This module defines the standard web search tool used across LiteLLM. +Native provider tools (like Anthropic's web_search_20250305) are converted +to this format for consistent interception and execution. 
+""" + +from typing import Any, Dict + +from litellm.constants import LITELLM_WEB_SEARCH_TOOL_NAME + + +def get_litellm_web_search_tool() -> Dict[str, Any]: + """ + Get the standard LiteLLM web search tool definition. + + This is the canonical tool definition that all native web search tools + (like Anthropic's web_search_20250305, Claude Code's web_search, etc.) + are converted to for interception. + + Returns: + Dict containing the Anthropic-style tool definition with: + - name: Tool name + - description: What the tool does + - input_schema: JSON schema for tool parameters + + Example: + >>> tool = get_litellm_web_search_tool() + >>> tool['name'] + 'litellm_web_search' + """ + return { + "name": LITELLM_WEB_SEARCH_TOOL_NAME, + "description": ( + "Search the web for information. Use this when you need current " + "information or answers to questions that require up-to-date data." + ), + "input_schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query to execute" + } + }, + "required": ["query"] + } + } + + +def is_web_search_tool(tool: Dict[str, Any]) -> bool: + """ + Check if a tool is a web search tool (native or LiteLLM standard). + + Detects: + - LiteLLM standard: name == "litellm_web_search" + - Anthropic native: type starts with "web_search_" (e.g., "web_search_20250305") + - Claude Code: name == "web_search" with a type field + - Custom: name == "WebSearch" (legacy format) + + Args: + tool: Tool dictionary to check + + Returns: + True if tool is a web search tool + + Example: + >>> is_web_search_tool({"name": "litellm_web_search"}) + True + >>> is_web_search_tool({"type": "web_search_20250305", "name": "web_search"}) + True + >>> is_web_search_tool({"name": "calculator"}) + False + """ + tool_name = tool.get("name", "") + tool_type = tool.get("type", "") + + # Check for LiteLLM standard tool + if tool_name == LITELLM_WEB_SEARCH_TOOL_NAME: + return True + + # Check for native Anthropic web_search_* types + if tool_type.startswith("web_search_"): + return True + + # Check for Claude Code's web_search with a type field + if tool_name == "web_search" and tool_type: + return True + + # Check for legacy WebSearch format + if tool_name == "WebSearch": + return True + + return False diff --git a/litellm/integrations/websearch_interception/transformation.py b/litellm/integrations/websearch_interception/transformation.py new file mode 100644 index 000000000000..313358822a50 --- /dev/null +++ b/litellm/integrations/websearch_interception/transformation.py @@ -0,0 +1,189 @@ +""" +WebSearch Tool Transformation + +Transforms between Anthropic tool_use format and LiteLLM search format. +""" + +from typing import Any, Dict, List, Tuple + +from litellm._logging import verbose_logger +from litellm.constants import LITELLM_WEB_SEARCH_TOOL_NAME +from litellm.llms.base_llm.search.transformation import SearchResponse + + +class WebSearchTransformation: + """ + Transformation class for WebSearch tool interception. + + Handles transformation between: + - Anthropic tool_use format → LiteLLM search requests + - LiteLLM SearchResponse → Anthropic tool_result format + """ + + @staticmethod + def transform_request( + response: Any, + stream: bool, + ) -> Tuple[bool, List[Dict]]: + """ + Transform Anthropic response to extract WebSearch tool calls. + + Detects if response contains WebSearch tool_use blocks and extracts + the search queries for execution. 
+ + Args: + response: Model response (dict or AnthropicMessagesResponse) + stream: Whether response is streaming + + Returns: + (has_websearch, tool_calls): + has_websearch: True if WebSearch tool_use found + tool_calls: List of tool_use dicts with id, name, input + + Note: + Streaming requests are handled by converting stream=True to stream=False + in the WebSearchInterceptionLogger.async_log_pre_api_call hook before + the API request is made. This means by the time this method is called, + streaming requests have already been converted to non-streaming. + """ + if stream: + # This should not happen in practice since we convert streaming to non-streaming + # in async_log_pre_api_call, but keep this check for safety + verbose_logger.warning( + "WebSearchInterception: Unexpected streaming response, skipping interception" + ) + return False, [] + + # Parse non-streaming response + return WebSearchTransformation._detect_from_non_streaming_response(response) + + @staticmethod + def _detect_from_non_streaming_response( + response: Any, + ) -> Tuple[bool, List[Dict]]: + """Parse non-streaming response for WebSearch tool_use""" + + # Handle both dict and object responses + if isinstance(response, dict): + content = response.get("content", []) + else: + if not hasattr(response, "content"): + verbose_logger.debug( + "WebSearchInterception: Response has no content attribute" + ) + return False, [] + content = response.content or [] + + if not content: + verbose_logger.debug( + "WebSearchInterception: Response has empty content" + ) + return False, [] + + # Find all WebSearch tool_use blocks + tool_calls = [] + for block in content: + # Handle both dict and object blocks + if isinstance(block, dict): + block_type = block.get("type") + block_name = block.get("name") + block_id = block.get("id") + block_input = block.get("input", {}) + else: + block_type = getattr(block, "type", None) + block_name = getattr(block, "name", None) + block_id = getattr(block, "id", None) + block_input = getattr(block, "input", {}) + + # Check for LiteLLM standard or legacy web search tools + # Handles: litellm_web_search, WebSearch, web_search + if block_type == "tool_use" and block_name in ( + LITELLM_WEB_SEARCH_TOOL_NAME, "WebSearch", "web_search" + ): + # Convert to dict for easier handling + tool_call = { + "id": block_id, + "type": "tool_use", + "name": block_name, # Preserve original name + "input": block_input, + } + tool_calls.append(tool_call) + verbose_logger.debug( + f"WebSearchInterception: Found {block_name} tool_use with id={tool_call['id']}" + ) + + return len(tool_calls) > 0, tool_calls + + @staticmethod + def transform_response( + tool_calls: List[Dict], + search_results: List[str], + ) -> Tuple[Dict, Dict]: + """ + Transform LiteLLM search results to Anthropic tool_result format. + + Builds the assistant and user messages needed for the agentic loop + follow-up request. 
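+
+        Example (illustrative sketch, one tool call and one formatted result string):
+            >>> asst, user = WebSearchTransformation.transform_response(
+            ...     tool_calls=[{"id": "tu_1", "type": "tool_use",
+            ...                  "name": "litellm_web_search",
+            ...                  "input": {"query": "litellm docs"}}],
+            ...     search_results=["Title: LiteLLM Docs - URL: https://docs.litellm.ai"],
+            ... )
+            >>> user["content"][0]["tool_use_id"]
+            'tu_1'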
+ + Args: + tool_calls: List of tool_use dicts from transform_request + search_results: List of search result strings (one per tool_call) + + Returns: + (assistant_message, user_message): + assistant_message: Message with tool_use blocks + user_message: Message with tool_result blocks + """ + # Build assistant message with tool_use blocks + assistant_message = { + "role": "assistant", + "content": [ + { + "type": "tool_use", + "id": tc["id"], + "name": tc["name"], + "input": tc["input"], + } + for tc in tool_calls + ], + } + + # Build user message with tool_result blocks + user_message = { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_calls[i]["id"], + "content": search_results[i], + } + for i in range(len(tool_calls)) + ], + } + + return assistant_message, user_message + + @staticmethod + def format_search_response(result: SearchResponse) -> str: + """ + Format SearchResponse as text for tool_result content. + + Args: + result: SearchResponse from litellm.asearch() + + Returns: + Formatted text with Title, URL, Snippet for each result + """ + # Convert SearchResponse to string + if hasattr(result, "results") and result.results: + # Format results as text + search_result_text = "\n\n".join( + [ + f"Title: {r.title}\nURL: {r.url}\nSnippet: {r.snippet}" + for r in result.results + ] + ) + else: + search_result_text = str(result) + + return search_result_text diff --git a/litellm/interactions/__init__.py b/litellm/interactions/__init__.py new file mode 100644 index 000000000000..e1125b649a63 --- /dev/null +++ b/litellm/interactions/__init__.py @@ -0,0 +1,68 @@ +""" +LiteLLM Interactions API + +This module provides SDK methods for Google's Interactions API. + +Usage: + import litellm + + # Create an interaction with a model + response = litellm.interactions.create( + model="gemini-2.5-flash", + input="Hello, how are you?" + ) + + # Create an interaction with an agent + response = litellm.interactions.create( + agent="deep-research-pro-preview-12-2025", + input="Research the current state of cancer research" + ) + + # Async version + response = await litellm.interactions.acreate(...) + + # Get an interaction + response = litellm.interactions.get(interaction_id="...") + + # Delete an interaction + result = litellm.interactions.delete(interaction_id="...") + + # Cancel an interaction + result = litellm.interactions.cancel(interaction_id="...") + +Methods: +- create(): Sync create interaction +- acreate(): Async create interaction +- get(): Sync get interaction +- aget(): Async get interaction +- delete(): Sync delete interaction +- adelete(): Async delete interaction +- cancel(): Sync cancel interaction +- acancel(): Async cancel interaction +""" + +from litellm.interactions.main import ( + acancel, + acreate, + adelete, + aget, + cancel, + create, + delete, + get, +) + +__all__ = [ + # Create + "create", + "acreate", + # Get + "get", + "aget", + # Delete + "delete", + "adelete", + # Cancel + "cancel", + "acancel", +] diff --git a/litellm/interactions/http_handler.py b/litellm/interactions/http_handler.py new file mode 100644 index 000000000000..4b4ed9be4db2 --- /dev/null +++ b/litellm/interactions/http_handler.py @@ -0,0 +1,690 @@ +""" +HTTP Handler for Interactions API requests. + +This module handles the HTTP communication for the Google Interactions API. 
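+
+Illustrative usage (a minimal sketch; in practice these methods are invoked from
+litellm.interactions.main rather than called directly, and the provider config, params,
+and logging object below are placeholders):
+
+    from litellm.interactions.http_handler import interactions_http_handler
+
+    response = interactions_http_handler.create_interaction(
+        interactions_api_config=provider_config,   # BaseInteractionsAPIConfig for the provider
+        optional_params=optional_params,           # InteractionsAPIOptionalRequestParams
+        custom_llm_provider="gemini",
+        litellm_params=litellm_params,             # GenericLiteLLMParams
+        logging_obj=logging_obj,
+        model="gemini-2.5-flash",
+        input="Hello, how are you?",
+    )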
+""" + +from typing import ( + Any, + AsyncIterator, + Coroutine, + Dict, + Iterator, + Optional, + Union, +) + +import httpx + +import litellm +from litellm.constants import request_timeout +from litellm.interactions.streaming_iterator import ( + InteractionsAPIStreamingIterator, + SyncInteractionsAPIStreamingIterator, +) +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.llms.base_llm.interactions.transformation import BaseInteractionsAPIConfig +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + _get_httpx_client, + get_async_httpx_client, +) +from litellm.types.interactions import ( + CancelInteractionResult, + DeleteInteractionResult, + InteractionInput, + InteractionsAPIOptionalRequestParams, + InteractionsAPIResponse, + InteractionsAPIStreamingResponse, +) +from litellm.types.router import GenericLiteLLMParams + + +class InteractionsHTTPHandler: + """ + HTTP handler for Interactions API requests. + """ + + def _handle_error( + self, + e: Exception, + provider_config: BaseInteractionsAPIConfig, + ) -> Exception: + """Handle errors from HTTP requests.""" + if isinstance(e, httpx.HTTPStatusError): + error_message = e.response.text + status_code = e.response.status_code + headers = dict(e.response.headers) + return provider_config.get_error_class( + error_message=error_message, + status_code=status_code, + headers=headers, + ) + return e + + # ========================================================= + # CREATE INTERACTION + # ========================================================= + + def create_interaction( + self, + interactions_api_config: BaseInteractionsAPIConfig, + optional_params: InteractionsAPIOptionalRequestParams, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + model: Optional[str] = None, + agent: Optional[str] = None, + input: Optional[InteractionInput] = None, + extra_headers: Optional[Dict[str, Any]] = None, + extra_body: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + _is_async: bool = False, + stream: Optional[bool] = None, + ) -> Union[ + InteractionsAPIResponse, + Iterator[InteractionsAPIStreamingResponse], + Coroutine[Any, Any, Union[InteractionsAPIResponse, AsyncIterator[InteractionsAPIStreamingResponse]]], + ]: + """ + Create a new interaction (synchronous or async based on _is_async flag). 
+ + Per Google's OpenAPI spec, the endpoint is POST /{api_version}/interactions + """ + if _is_async: + return self.async_create_interaction( + model=model, + agent=agent, + input=input, + interactions_api_config=interactions_api_config, + optional_params=optional_params, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=logging_obj, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + stream=stream, + ) + + if client is None: + sync_httpx_client = _get_httpx_client( + params={"ssl_verify": litellm_params.get("ssl_verify", None)} + ) + else: + sync_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model=model or "", + litellm_params=litellm_params, + ) + + api_base = interactions_api_config.get_complete_url( + api_base=litellm_params.api_base or "", + model=model, + agent=agent, + litellm_params=dict(litellm_params), + stream=stream, + ) + + data = interactions_api_config.transform_request( + model=model, + agent=agent, + input=input, + optional_params=optional_params, + litellm_params=litellm_params, + headers=headers, + ) + + if extra_body: + data.update(extra_body) + + # Logging + logging_obj.pre_call( + input=input, + api_key="", + additional_args={ + "complete_input_dict": data, + "api_base": api_base, + "headers": headers, + }, + ) + + try: + if stream: + response = sync_httpx_client.post( + url=api_base, + headers=headers, + json=data, + timeout=timeout or request_timeout, + stream=True, + ) + return self._create_sync_streaming_iterator( + response=response, + model=model, + logging_obj=logging_obj, + interactions_api_config=interactions_api_config, + ) + else: + response = sync_httpx_client.post( + url=api_base, + headers=headers, + json=data, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_response( + model=model, + raw_response=response, + logging_obj=logging_obj, + ) + + async def async_create_interaction( + self, + interactions_api_config: BaseInteractionsAPIConfig, + optional_params: InteractionsAPIOptionalRequestParams, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + model: Optional[str] = None, + agent: Optional[str] = None, + input: Optional[InteractionInput] = None, + extra_headers: Optional[Dict[str, Any]] = None, + extra_body: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + stream: Optional[bool] = None, + ) -> Union[InteractionsAPIResponse, AsyncIterator[InteractionsAPIStreamingResponse]]: + """ + Create a new interaction (async version). 
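+
+        When stream=True, the awaited result is an InteractionsAPIStreamingIterator
+        wrapping the provider's SSE stream; otherwise the raw httpx response is passed
+        through interactions_api_config.transform_response().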
+ """ + if client is None: + async_httpx_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders(custom_llm_provider), + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + else: + async_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model=model or "", + litellm_params=litellm_params, + ) + + api_base = interactions_api_config.get_complete_url( + api_base=litellm_params.api_base or "", + model=model, + agent=agent, + litellm_params=dict(litellm_params), + stream=stream, + ) + + data = interactions_api_config.transform_request( + model=model, + agent=agent, + input=input, + optional_params=optional_params, + litellm_params=litellm_params, + headers=headers, + ) + + if extra_body: + data.update(extra_body) + + # Logging + logging_obj.pre_call( + input=input, + api_key="", + additional_args={ + "complete_input_dict": data, + "api_base": api_base, + "headers": headers, + }, + ) + + try: + if stream: + response = await async_httpx_client.post( + url=api_base, + headers=headers, + json=data, + timeout=timeout or request_timeout, + stream=True, + ) + return self._create_async_streaming_iterator( + response=response, + model=model, + logging_obj=logging_obj, + interactions_api_config=interactions_api_config, + ) + else: + response = await async_httpx_client.post( + url=api_base, + headers=headers, + json=data, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_response( + model=model, + raw_response=response, + logging_obj=logging_obj, + ) + + def _create_sync_streaming_iterator( + self, + response: httpx.Response, + model: Optional[str], + logging_obj: LiteLLMLoggingObj, + interactions_api_config: BaseInteractionsAPIConfig, + ) -> SyncInteractionsAPIStreamingIterator: + """Create a synchronous streaming iterator. + + Google AI's streaming format uses SSE (Server-Sent Events). + Returns a proper streaming iterator that yields chunks as they arrive. + """ + return SyncInteractionsAPIStreamingIterator( + response=response, + model=model, + interactions_api_config=interactions_api_config, + logging_obj=logging_obj, + ) + + def _create_async_streaming_iterator( + self, + response: httpx.Response, + model: Optional[str], + logging_obj: LiteLLMLoggingObj, + interactions_api_config: BaseInteractionsAPIConfig, + ) -> InteractionsAPIStreamingIterator: + """Create an asynchronous streaming iterator. + + Google AI's streaming format uses SSE (Server-Sent Events). + Returns a proper streaming iterator that yields chunks as they arrive. 
+ """ + return InteractionsAPIStreamingIterator( + response=response, + model=model, + interactions_api_config=interactions_api_config, + logging_obj=logging_obj, + ) + + # ========================================================= + # GET INTERACTION + # ========================================================= + + def get_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + _is_async: bool = False, + ) -> Union[InteractionsAPIResponse, Coroutine[Any, Any, InteractionsAPIResponse]]: + """Get an interaction by ID.""" + if _is_async: + return self.async_get_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=logging_obj, + extra_headers=extra_headers, + timeout=timeout, + ) + + if client is None: + sync_httpx_client = _get_httpx_client( + params={"ssl_verify": litellm_params.get("ssl_verify", None)} + ) + else: + sync_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, params = interactions_api_config.transform_get_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = sync_httpx_client.get( + url=url, + headers=headers, + params=params, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_get_interaction_response( + raw_response=response, + logging_obj=logging_obj, + ) + + async def async_get_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> InteractionsAPIResponse: + """Get an interaction by ID (async version).""" + if client is None: + async_httpx_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders(custom_llm_provider), + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + else: + async_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, params = interactions_api_config.transform_get_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = await async_httpx_client.get( + url=url, + headers=headers, + params=params, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_get_interaction_response( + raw_response=response, + 
logging_obj=logging_obj, + ) + + # ========================================================= + # DELETE INTERACTION + # ========================================================= + + def delete_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + _is_async: bool = False, + ) -> Union[DeleteInteractionResult, Coroutine[Any, Any, DeleteInteractionResult]]: + """Delete an interaction by ID.""" + if _is_async: + return self.async_delete_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=logging_obj, + extra_headers=extra_headers, + timeout=timeout, + ) + + if client is None: + sync_httpx_client = _get_httpx_client( + params={"ssl_verify": litellm_params.get("ssl_verify", None)} + ) + else: + sync_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, data = interactions_api_config.transform_delete_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = sync_httpx_client.delete( + url=url, + headers=headers, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_delete_interaction_response( + raw_response=response, + logging_obj=logging_obj, + interaction_id=interaction_id, + ) + + async def async_delete_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> DeleteInteractionResult: + """Delete an interaction by ID (async version).""" + if client is None: + async_httpx_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders(custom_llm_provider), + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + else: + async_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, data = interactions_api_config.transform_delete_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = await async_httpx_client.delete( + url=url, + headers=headers, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_delete_interaction_response( + raw_response=response, + logging_obj=logging_obj, + 
interaction_id=interaction_id, + ) + + # ========================================================= + # CANCEL INTERACTION + # ========================================================= + + def cancel_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[HTTPHandler] = None, + _is_async: bool = False, + ) -> Union[CancelInteractionResult, Coroutine[Any, Any, CancelInteractionResult]]: + """Cancel an interaction by ID.""" + if _is_async: + return self.async_cancel_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=logging_obj, + extra_headers=extra_headers, + timeout=timeout, + ) + + if client is None: + sync_httpx_client = _get_httpx_client( + params={"ssl_verify": litellm_params.get("ssl_verify", None)} + ) + else: + sync_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, data = interactions_api_config.transform_cancel_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = sync_httpx_client.post( + url=url, + headers=headers, + json=data, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_cancel_interaction_response( + raw_response=response, + logging_obj=logging_obj, + ) + + async def async_cancel_interaction( + self, + interaction_id: str, + interactions_api_config: BaseInteractionsAPIConfig, + custom_llm_provider: str, + litellm_params: GenericLiteLLMParams, + logging_obj: LiteLLMLoggingObj, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + client: Optional[AsyncHTTPHandler] = None, + ) -> CancelInteractionResult: + """Cancel an interaction by ID (async version).""" + if client is None: + async_httpx_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders(custom_llm_provider), + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + else: + async_httpx_client = client + + headers = interactions_api_config.validate_environment( + headers=extra_headers or {}, + model="", + litellm_params=litellm_params, + ) + + url, data = interactions_api_config.transform_cancel_interaction_request( + interaction_id=interaction_id, + api_base=litellm_params.api_base or "", + litellm_params=litellm_params, + headers=headers, + ) + + logging_obj.pre_call( + input=interaction_id, + api_key="", + additional_args={"api_base": url, "headers": headers}, + ) + + try: + response = await async_httpx_client.post( + url=url, + headers=headers, + json=data, + timeout=timeout or request_timeout, + ) + except Exception as e: + raise self._handle_error(e=e, provider_config=interactions_api_config) + + return interactions_api_config.transform_cancel_interaction_response( + raw_response=response, + logging_obj=logging_obj, + ) + + +# Initialize the 
HTTP handler singleton +interactions_http_handler = InteractionsHTTPHandler() + diff --git a/litellm/interactions/litellm_responses_transformation/__init__.py b/litellm/interactions/litellm_responses_transformation/__init__.py new file mode 100644 index 000000000000..2450a9f3d203 --- /dev/null +++ b/litellm/interactions/litellm_responses_transformation/__init__.py @@ -0,0 +1,16 @@ +""" +Bridge module for connecting Interactions API to Responses API via litellm.responses(). +""" + +from litellm.interactions.litellm_responses_transformation.handler import ( + LiteLLMResponsesInteractionsHandler, +) +from litellm.interactions.litellm_responses_transformation.transformation import ( + LiteLLMResponsesInteractionsConfig, +) + +__all__ = [ + "LiteLLMResponsesInteractionsHandler", + "LiteLLMResponsesInteractionsConfig", # Transformation config class (not BaseInteractionsAPIConfig) +] + diff --git a/litellm/interactions/litellm_responses_transformation/handler.py b/litellm/interactions/litellm_responses_transformation/handler.py new file mode 100644 index 000000000000..c2df8f96eff7 --- /dev/null +++ b/litellm/interactions/litellm_responses_transformation/handler.py @@ -0,0 +1,156 @@ +""" +Handler for transforming interactions API requests to litellm.responses requests. +""" + +from typing import ( + Any, + AsyncIterator, + Coroutine, + Dict, + Iterator, + Optional, + Union, + cast, +) + +import litellm +from litellm.interactions.litellm_responses_transformation.streaming_iterator import ( + LiteLLMResponsesInteractionsStreamingIterator, +) +from litellm.interactions.litellm_responses_transformation.transformation import ( + LiteLLMResponsesInteractionsConfig, +) +from litellm.responses.streaming_iterator import BaseResponsesAPIStreamingIterator +from litellm.types.interactions import ( + InteractionInput, + InteractionsAPIOptionalRequestParams, + InteractionsAPIResponse, + InteractionsAPIStreamingResponse, +) +from litellm.types.llms.openai import ResponsesAPIResponse + + +class LiteLLMResponsesInteractionsHandler: + """Handler for bridging Interactions API to Responses API via litellm.responses().""" + + def interactions_api_handler( + self, + model: str, + input: Optional[InteractionInput], + optional_params: InteractionsAPIOptionalRequestParams, + custom_llm_provider: Optional[str] = None, + _is_async: bool = False, + stream: Optional[bool] = None, + **kwargs, + ) -> Union[ + InteractionsAPIResponse, + Iterator[InteractionsAPIStreamingResponse], + Coroutine[ + Any, + Any, + Union[ + InteractionsAPIResponse, + AsyncIterator[InteractionsAPIStreamingResponse], + ], + ], + ]: + """ + Handle Interactions API request by calling litellm.responses(). 
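+
+        The flow is: transform the Interactions request into a Responses API request,
+        call litellm.responses() (or litellm.aresponses() when _is_async=True), then
+        either wrap a streaming result in LiteLLMResponsesInteractionsStreamingIterator
+        or convert the ResponsesAPIResponse back into an InteractionsAPIResponse.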
+ + Args: + model: The model to use + input: The input content + optional_params: Optional parameters for the request + custom_llm_provider: Override LLM provider + _is_async: Whether this is an async call + stream: Whether to stream the response + **kwargs: Additional parameters + + Returns: + InteractionsAPIResponse or streaming iterator + """ + # Transform interactions request to responses request + responses_request = ( + LiteLLMResponsesInteractionsConfig.transform_interactions_request_to_responses_request( + model=model, + input=input, + optional_params=optional_params, + custom_llm_provider=custom_llm_provider, + stream=stream, + **kwargs, + ) + ) + + if _is_async: + return self.async_interactions_api_handler( + responses_request=responses_request, + model=model, + input=input, + optional_params=optional_params, + **kwargs, + ) + + # Call litellm.responses() + # Note: litellm.responses() returns Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator] + # but the type checker may see it as a coroutine in some contexts + responses_response = litellm.responses( + **responses_request, + ) + + # Handle streaming response + if isinstance(responses_response, BaseResponsesAPIStreamingIterator): + return LiteLLMResponsesInteractionsStreamingIterator( + model=model, + litellm_custom_stream_wrapper=responses_response, + request_input=input, + optional_params=optional_params, + custom_llm_provider=custom_llm_provider, + litellm_metadata=kwargs.get("litellm_metadata", {}), + ) + + # At this point, responses_response must be ResponsesAPIResponse (not streaming) + # Cast to satisfy type checker since we've already checked it's not a streaming iterator + responses_api_response = cast(ResponsesAPIResponse, responses_response) + + # Transform responses response to interactions response + return LiteLLMResponsesInteractionsConfig.transform_responses_response_to_interactions_response( + responses_response=responses_api_response, + model=model, + ) + + async def async_interactions_api_handler( + self, + responses_request: Dict[str, Any], + model: str, + input: Optional[InteractionInput], + optional_params: InteractionsAPIOptionalRequestParams, + **kwargs, + ) -> Union[InteractionsAPIResponse, AsyncIterator[InteractionsAPIStreamingResponse]]: + """Async handler for interactions API requests.""" + # Call litellm.aresponses() + # Note: litellm.aresponses() returns Union[ResponsesAPIResponse, BaseResponsesAPIStreamingIterator] + responses_response = await litellm.aresponses( + **responses_request, + ) + + # Handle streaming response + if isinstance(responses_response, BaseResponsesAPIStreamingIterator): + return LiteLLMResponsesInteractionsStreamingIterator( + model=model, + litellm_custom_stream_wrapper=responses_response, + request_input=input, + optional_params=optional_params, + custom_llm_provider=responses_request.get("custom_llm_provider"), + litellm_metadata=kwargs.get("litellm_metadata", {}), + ) + + # At this point, responses_response must be ResponsesAPIResponse (not streaming) + # Cast to satisfy type checker since we've already checked it's not a streaming iterator + responses_api_response = cast(ResponsesAPIResponse, responses_response) + + # Transform responses response to interactions response + return LiteLLMResponsesInteractionsConfig.transform_responses_response_to_interactions_response( + responses_response=responses_api_response, + model=model, + ) + diff --git a/litellm/interactions/litellm_responses_transformation/streaming_iterator.py 
b/litellm/interactions/litellm_responses_transformation/streaming_iterator.py new file mode 100644 index 000000000000..511b69e83b22 --- /dev/null +++ b/litellm/interactions/litellm_responses_transformation/streaming_iterator.py @@ -0,0 +1,260 @@ +""" +Streaming iterator for transforming Responses API stream to Interactions API stream. +""" + +from typing import Any, AsyncIterator, Dict, Iterator, Optional, cast + +from litellm.responses.streaming_iterator import ( + BaseResponsesAPIStreamingIterator, + ResponsesAPIStreamingIterator, + SyncResponsesAPIStreamingIterator, +) +from litellm.types.interactions import ( + InteractionInput, + InteractionsAPIOptionalRequestParams, + InteractionsAPIStreamingResponse, +) +from litellm.types.llms.openai import ( + OutputTextDeltaEvent, + ResponseCompletedEvent, + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponsesAPIStreamingResponse, +) + + +class LiteLLMResponsesInteractionsStreamingIterator: + """ + Iterator that wraps Responses API streaming and transforms chunks to Interactions API format. + + This class handles both sync and async iteration, transforming Responses API + streaming events (output.text.delta, response.completed, etc.) to Interactions + API streaming events (content.delta, interaction.complete, etc.). + """ + + def __init__( + self, + model: str, + litellm_custom_stream_wrapper: BaseResponsesAPIStreamingIterator, + request_input: Optional[InteractionInput], + optional_params: InteractionsAPIOptionalRequestParams, + custom_llm_provider: Optional[str] = None, + litellm_metadata: Optional[Dict[str, Any]] = None, + ): + self.model = model + self.responses_stream_iterator = litellm_custom_stream_wrapper + self.request_input = request_input + self.optional_params = optional_params + self.custom_llm_provider = custom_llm_provider + self.litellm_metadata = litellm_metadata or {} + self.finished = False + self.collected_text = "" + self.sent_interaction_start = False + self.sent_content_start = False + + def _transform_responses_chunk_to_interactions_chunk( + self, + responses_chunk: ResponsesAPIStreamingResponse, + ) -> Optional[InteractionsAPIStreamingResponse]: + """ + Transform a Responses API streaming chunk to an Interactions API streaming chunk. 
+ + Responses API events: + - output.text.delta -> content.delta + - response.completed -> interaction.complete + + Interactions API events: + - interaction.start + - content.start + - content.delta + - content.stop + - interaction.complete + """ + if not responses_chunk: + return None + + # Handle OutputTextDeltaEvent -> content.delta + if isinstance(responses_chunk, OutputTextDeltaEvent): + delta_text = responses_chunk.delta if isinstance(responses_chunk.delta, str) else "" + self.collected_text += delta_text + + # Send interaction.start if not sent + if not self.sent_interaction_start: + self.sent_interaction_start = True + return InteractionsAPIStreamingResponse( + event_type="interaction.start", + id=getattr(responses_chunk, "item_id", None) or f"interaction_{id(self)}", + object="interaction", + status="in_progress", + model=self.model, + ) + + # Send content.start if not sent + if not self.sent_content_start: + self.sent_content_start = True + return InteractionsAPIStreamingResponse( + event_type="content.start", + id=getattr(responses_chunk, "item_id", None), + object="content", + delta={"type": "text", "text": ""}, + ) + + # Send content.delta + return InteractionsAPIStreamingResponse( + event_type="content.delta", + id=getattr(responses_chunk, "item_id", None), + object="content", + delta={"text": delta_text}, + ) + + # Handle ResponseCreatedEvent or ResponseInProgressEvent -> interaction.start + if isinstance(responses_chunk, (ResponseCreatedEvent, ResponseInProgressEvent)): + if not self.sent_interaction_start: + self.sent_interaction_start = True + response_id = getattr(responses_chunk.response, "id", None) if hasattr(responses_chunk, "response") else None + return InteractionsAPIStreamingResponse( + event_type="interaction.start", + id=response_id or f"interaction_{id(self)}", + object="interaction", + status="in_progress", + model=self.model, + ) + + # Handle ResponseCompletedEvent -> interaction.complete + if isinstance(responses_chunk, ResponseCompletedEvent): + self.finished = True + response = responses_chunk.response + + # Send content.stop first if content was started + if self.sent_content_start: + # Note: We'll send this in the iterator, not here + pass + + # Send interaction.complete + return InteractionsAPIStreamingResponse( + event_type="interaction.complete", + id=getattr(response, "id", None) or f"interaction_{id(self)}", + object="interaction", + status="completed", + model=self.model, + outputs=[ + { + "type": "text", + "text": self.collected_text, + } + ], + ) + + # For other event types, return None (skip) + return None + + def __iter__(self) -> Iterator[InteractionsAPIStreamingResponse]: + """Sync iterator implementation.""" + return self + + def __next__(self) -> InteractionsAPIStreamingResponse: + """Get next chunk in sync mode.""" + if self.finished: + raise StopIteration + + # Check if we have a pending interaction.complete to send + if hasattr(self, "_pending_interaction_complete"): + pending: InteractionsAPIStreamingResponse = getattr(self, "_pending_interaction_complete") + delattr(self, "_pending_interaction_complete") + return pending + + # Use a loop instead of recursion to avoid stack overflow + sync_iterator = cast(SyncResponsesAPIStreamingIterator, self.responses_stream_iterator) + while True: + try: + # Get next chunk from responses API stream + chunk = next(sync_iterator) + + # Transform chunk (chunk is already a ResponsesAPIStreamingResponse) + transformed = self._transform_responses_chunk_to_interactions_chunk(chunk) + + if transformed: + # 
If we finished and content was started, send content.stop before interaction.complete + if self.finished and self.sent_content_start and transformed.event_type == "interaction.complete": + # Send content.stop first + content_stop = InteractionsAPIStreamingResponse( + event_type="content.stop", + id=transformed.id, + object="content", + delta={"type": "text", "text": self.collected_text}, + ) + # Store the interaction.complete to send next + self._pending_interaction_complete = transformed + return content_stop + return transformed + + # If no transformation, continue to next chunk (loop continues) + + except StopIteration: + self.finished = True + + # Send final events if needed + if self.sent_content_start: + return InteractionsAPIStreamingResponse( + event_type="content.stop", + object="content", + delta={"type": "text", "text": self.collected_text}, + ) + + raise StopIteration + + def __aiter__(self) -> AsyncIterator[InteractionsAPIStreamingResponse]: + """Async iterator implementation.""" + return self + + async def __anext__(self) -> InteractionsAPIStreamingResponse: + """Get next chunk in async mode.""" + if self.finished: + raise StopAsyncIteration + + # Check if we have a pending interaction.complete to send + if hasattr(self, "_pending_interaction_complete"): + pending: InteractionsAPIStreamingResponse = getattr(self, "_pending_interaction_complete") + delattr(self, "_pending_interaction_complete") + return pending + + # Use a loop instead of recursion to avoid stack overflow + async_iterator = cast(ResponsesAPIStreamingIterator, self.responses_stream_iterator) + while True: + try: + # Get next chunk from responses API stream + chunk = await async_iterator.__anext__() + + # Transform chunk (chunk is already a ResponsesAPIStreamingResponse) + transformed = self._transform_responses_chunk_to_interactions_chunk(chunk) + + if transformed: + # If we finished and content was started, send content.stop before interaction.complete + if self.finished and self.sent_content_start and transformed.event_type == "interaction.complete": + # Send content.stop first + content_stop = InteractionsAPIStreamingResponse( + event_type="content.stop", + id=transformed.id, + object="content", + delta={"type": "text", "text": self.collected_text}, + ) + # Store the interaction.complete to send next + self._pending_interaction_complete = transformed + return content_stop + return transformed + + # If no transformation, continue to next chunk (loop continues) + + except StopAsyncIteration: + self.finished = True + + # Send final events if needed + if self.sent_content_start: + return InteractionsAPIStreamingResponse( + event_type="content.stop", + object="content", + delta={"type": "text", "text": self.collected_text}, + ) + + raise StopAsyncIteration + diff --git a/litellm/interactions/litellm_responses_transformation/transformation.py b/litellm/interactions/litellm_responses_transformation/transformation.py new file mode 100644 index 000000000000..24b2c5dbde79 --- /dev/null +++ b/litellm/interactions/litellm_responses_transformation/transformation.py @@ -0,0 +1,277 @@ +""" +Transformation utilities for bridging Interactions API to Responses API. + +This module handles transforming between: +- Interactions API format (Google's format with Turn[], system_instruction, etc.) +- Responses API format (OpenAI's format with input[], instructions, etc.) 
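+
+Illustrative request-side mapping (a sketch; values are examples only):
+
+    {"model": "gemini-2.5-flash",
+     "input": [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}],
+     "system_instruction": "Be brief",
+     "generation_config": {"temperature": 0.2, "max_output_tokens": 128}}
+
+    maps to
+
+    {"model": "gemini-2.5-flash",
+     "input": [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}],
+     "instructions": "Be brief",
+     "temperature": 0.2,
+     "max_output_tokens": 128}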
+""" + +from typing import Any, Dict, List, Optional, cast + +from litellm.types.interactions import ( + InteractionInput, + InteractionsAPIOptionalRequestParams, + InteractionsAPIResponse, + Turn, +) +from litellm.types.llms.openai import ( + ResponseInputParam, + ResponsesAPIResponse, +) + + +class LiteLLMResponsesInteractionsConfig: + """Configuration class for transforming between Interactions API and Responses API.""" + + @staticmethod + def transform_interactions_request_to_responses_request( + model: str, + input: Optional[InteractionInput], + optional_params: InteractionsAPIOptionalRequestParams, + **kwargs, + ) -> Dict[str, Any]: + """ + Transform an Interactions API request to a Responses API request. + + Key transformations: + - system_instruction -> instructions + - input (string | Turn[]) -> input (ResponseInputParam) + - tools -> tools (similar format) + - generation_config -> temperature, top_p, etc. + """ + responses_request: Dict[str, Any] = { + "model": model, + } + + # Transform input + if input is not None: + responses_request["input"] = ( + LiteLLMResponsesInteractionsConfig._transform_interactions_input_to_responses_input( + input + ) + ) + + # Transform system_instruction -> instructions + if optional_params.get("system_instruction"): + responses_request["instructions"] = optional_params["system_instruction"] + + # Transform tools (similar format, pass through for now) + if optional_params.get("tools"): + responses_request["tools"] = optional_params["tools"] + + # Transform generation_config to temperature, top_p, etc. + generation_config = optional_params.get("generation_config") + if generation_config: + if isinstance(generation_config, dict): + if "temperature" in generation_config: + responses_request["temperature"] = generation_config["temperature"] + if "top_p" in generation_config: + responses_request["top_p"] = generation_config["top_p"] + if "top_k" in generation_config: + # Responses API doesn't have top_k, skip it + pass + if "max_output_tokens" in generation_config: + responses_request["max_output_tokens"] = generation_config["max_output_tokens"] + + # Pass through other optional params that match + passthrough_params = ["stream", "store", "metadata", "user"] + for param in passthrough_params: + if param in optional_params and optional_params[param] is not None: + responses_request[param] = optional_params[param] + + # Add any extra kwargs + responses_request.update(kwargs) + + return responses_request + + @staticmethod + def _transform_interactions_input_to_responses_input( + input: InteractionInput, + ) -> ResponseInputParam: + """ + Transform Interactions API input to Responses API input format. 
+ + Interactions API input can be: + - string: "Hello" + - Turn[]: [{"role": "user", "content": [...]}] + - Content object + + Responses API input is: + - string: "Hello" + - Message[]: [{"role": "user", "content": [...]}] + """ + if isinstance(input, str): + # ResponseInputParam accepts str + return cast(ResponseInputParam, input) + + if isinstance(input, list): + # Turn[] format - convert to Responses API Message[] format + messages = [] + for turn in input: + if isinstance(turn, dict): + role = turn.get("role", "user") + content = turn.get("content", []) + + # Transform content array + transformed_content = ( + LiteLLMResponsesInteractionsConfig._transform_content_array(content) + ) + + messages.append({ + "role": role, + "content": transformed_content, + }) + elif isinstance(turn, Turn): + # Pydantic model + role = turn.role if hasattr(turn, "role") else "user" + content = turn.content if hasattr(turn, "content") else [] + + # Ensure content is a list for _transform_content_array + # Cast to List[Any] to handle various content types + if isinstance(content, list): + content_list: List[Any] = list(content) + elif content is not None: + content_list = [content] + else: + content_list = [] + + transformed_content = ( + LiteLLMResponsesInteractionsConfig._transform_content_array(content_list) + ) + + messages.append({ + "role": role, + "content": transformed_content, + }) + + return cast(ResponseInputParam, messages) + + # Single content object - wrap in message + if isinstance(input, dict): + return cast(ResponseInputParam, [{ + "role": "user", + "content": LiteLLMResponsesInteractionsConfig._transform_content_array( + input.get("content", []) if isinstance(input.get("content"), list) else [input] + ), + }]) + + # Fallback: convert to string + return cast(ResponseInputParam, str(input)) + + @staticmethod + def _transform_content_array(content: List[Any]) -> List[Dict[str, Any]]: + """Transform Interactions API content array to Responses API format.""" + if not isinstance(content, list): + # Single content item - wrap in array + content = [content] + + transformed: List[Dict[str, Any]] = [] + for item in content: + if isinstance(item, dict): + # Already in dict format, pass through + transformed.append(item) + elif isinstance(item, str): + # Plain string - wrap in text format + transformed.append({"type": "text", "text": item}) + else: + # Pydantic model or other - convert to dict + if hasattr(item, "model_dump"): + dumped = item.model_dump() + if isinstance(dumped, dict): + transformed.append(dumped) + else: + # Fallback: wrap in text format + transformed.append({"type": "text", "text": str(dumped)}) + elif hasattr(item, "dict"): + dumped = item.dict() + if isinstance(dumped, dict): + transformed.append(dumped) + else: + # Fallback: wrap in text format + transformed.append({"type": "text", "text": str(dumped)}) + else: + # Fallback: wrap in text format + transformed.append({"type": "text", "text": str(item)}) + + return transformed + + @staticmethod + def transform_responses_response_to_interactions_response( + responses_response: ResponsesAPIResponse, + model: Optional[str] = None, + ) -> InteractionsAPIResponse: + """ + Transform a Responses API response to an Interactions API response. 
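+
+        For example (illustrative), a completed Responses API response whose single
+        output item carries the text "Hi there" becomes roughly:
+
+            {"id": "<response id>", "object": "interaction", "status": "completed",
+             "outputs": [{"type": "text", "text": "Hi there"}],
+             "role": "model", "model": "<model name>",
+             "usage": {"total_input_tokens": 10, "total_output_tokens": 3}}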
+ + Key transformations: + - Extract text from output[].content[].text + - Convert created_at (int) to created (ISO string) + - Map status + - Extract usage + """ + # Extract text from outputs + outputs = [] + if hasattr(responses_response, "output") and responses_response.output: + for output_item in responses_response.output: + # Use getattr with None default to safely access content + content = getattr(output_item, "content", None) + if content is not None: + content_items = content if isinstance(content, list) else [content] + for content_item in content_items: + # Check if content_item has text attribute + text = getattr(content_item, "text", None) + if text is not None: + outputs.append({ + "type": "text", + "text": text, + }) + elif isinstance(content_item, dict) and content_item.get("type") == "text": + outputs.append(content_item) + + # Convert created_at to ISO string + created_at = getattr(responses_response, "created_at", None) + if isinstance(created_at, int): + from datetime import datetime + created = datetime.fromtimestamp(created_at).isoformat() + elif created_at is not None and hasattr(created_at, "isoformat"): + created = created_at.isoformat() + else: + created = None + + # Map status + status = getattr(responses_response, "status", "completed") + if status == "completed": + interactions_status = "completed" + elif status == "in_progress": + interactions_status = "in_progress" + else: + interactions_status = status + + # Build interactions response + interactions_response_dict: Dict[str, Any] = { + "id": getattr(responses_response, "id", ""), + "object": "interaction", + "status": interactions_status, + "outputs": outputs, + "model": model or getattr(responses_response, "model", ""), + "created": created, + } + + # Add usage if available + # Map Responses API usage (input_tokens, output_tokens) to Interactions API spec format + # (total_input_tokens, total_output_tokens) + usage = getattr(responses_response, "usage", None) + if usage: + interactions_response_dict["usage"] = { + "total_input_tokens": getattr(usage, "input_tokens", 0), + "total_output_tokens": getattr(usage, "output_tokens", 0), + } + + # Add role + interactions_response_dict["role"] = "model" + + # Add updated (same as created for now) + interactions_response_dict["updated"] = created + + return InteractionsAPIResponse(**interactions_response_dict) + diff --git a/litellm/interactions/main.py b/litellm/interactions/main.py new file mode 100644 index 000000000000..fb811b25b2f5 --- /dev/null +++ b/litellm/interactions/main.py @@ -0,0 +1,633 @@ +""" +LiteLLM Interactions API - Main Module + +Per OpenAPI spec (https://ai.google.dev/static/api/interactions.openapi.json): +- Create interaction: POST /{api_version}/interactions +- Get interaction: GET /{api_version}/interactions/{interaction_id} +- Delete interaction: DELETE /{api_version}/interactions/{interaction_id} + +Usage: + import litellm + + # Create an interaction with a model + response = litellm.interactions.create( + model="gemini-2.5-flash", + input="Hello, how are you?" + ) + + # Create an interaction with an agent + response = litellm.interactions.create( + agent="deep-research-pro-preview-12-2025", + input="Research the current state of cancer research" + ) + + # Async version + response = await litellm.interactions.acreate(...) 
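+
+    # Stream an interaction (illustrative; with stream=True the call yields
+    # InteractionsAPIStreamingResponse chunks instead of a single response)
+    for chunk in litellm.interactions.create(
+        model="gemini-2.5-flash",
+        input="Hello, how are you?",
+        stream=True,
+    ):
+        print(chunk)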
+ + # Get an interaction + response = litellm.interactions.get(interaction_id="...") + + # Delete an interaction + result = litellm.interactions.delete(interaction_id="...") +""" + +import asyncio +import contextvars +from functools import partial +from typing import ( + Any, + AsyncIterator, + Coroutine, + Dict, + Iterator, + List, + Optional, + Union, +) + +import httpx + +import litellm +from litellm.interactions.http_handler import interactions_http_handler +from litellm.interactions.utils import ( + InteractionsAPIRequestUtils, + get_provider_interactions_api_config, +) +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.types.interactions import ( + CancelInteractionResult, + DeleteInteractionResult, + InteractionInput, + InteractionsAPIResponse, + InteractionsAPIStreamingResponse, + InteractionTool, +) +from litellm.types.router import GenericLiteLLMParams +from litellm.utils import client + +# ============================================================ +# SDK Methods - CREATE INTERACTION +# ============================================================ + + +@client +async def acreate( + # Model or Agent (one required per OpenAPI spec) + model: Optional[str] = None, + agent: Optional[str] = None, + # Input (required) + input: Optional[InteractionInput] = None, + # Tools (for model interactions) + tools: Optional[List[InteractionTool]] = None, + # System instruction + system_instruction: Optional[str] = None, + # Generation config + generation_config: Optional[Dict[str, Any]] = None, + # Streaming + stream: Optional[bool] = None, + # Storage + store: Optional[bool] = None, + # Background execution + background: Optional[bool] = None, + # Response format + response_modalities: Optional[List[str]] = None, + response_format: Optional[Dict[str, Any]] = None, + response_mime_type: Optional[str] = None, + # Continuation + previous_interaction_id: Optional[str] = None, + # Extra params + extra_headers: Optional[Dict[str, Any]] = None, + extra_body: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + # LiteLLM params + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> Union[InteractionsAPIResponse, AsyncIterator[InteractionsAPIStreamingResponse]]: + """ + Async: Create a new interaction using Google's Interactions API. + + Per OpenAPI spec, provide either `model` or `agent`. 
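+
+    When stream=True, the awaited result is an async iterator of
+    InteractionsAPIStreamingResponse chunks rather than a single InteractionsAPIResponse.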
+ + Args: + model: The model to use (e.g., "gemini-2.5-flash") + agent: The agent to use (e.g., "deep-research-pro-preview-12-2025") + input: The input content (string, content object, or list) + tools: Tools available for the model + system_instruction: System instruction for the interaction + generation_config: Generation configuration + stream: Whether to stream the response + store: Whether to store the response for later retrieval + background: Whether to run in background + response_modalities: Requested response modalities (TEXT, IMAGE, AUDIO) + response_format: JSON schema for response format + response_mime_type: MIME type of the response + previous_interaction_id: ID of previous interaction for continuation + extra_headers: Additional headers + extra_body: Additional body parameters + timeout: Request timeout + custom_llm_provider: Override the LLM provider + + Returns: + InteractionsAPIResponse or async iterator for streaming + """ + local_vars = locals() + try: + loop = asyncio.get_event_loop() + kwargs["acreate_interaction"] = True + + if custom_llm_provider is None and model: + _, custom_llm_provider, _, _ = litellm.get_llm_provider( + model=model, api_base=kwargs.get("api_base", None) + ) + elif custom_llm_provider is None: + custom_llm_provider = "gemini" + + func = partial( + create, + model=model, + agent=agent, + input=input, + tools=tools, + system_instruction=system_instruction, + generation_config=generation_config, + stream=stream, + store=store, + background=background, + response_modalities=response_modalities, + response_format=response_format, + response_mime_type=response_mime_type, + previous_interaction_id=previous_interaction_id, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + custom_llm_provider=custom_llm_provider, + **kwargs, + ) + + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) + init_response = await loop.run_in_executor(None, func_with_context) + + if asyncio.iscoroutine(init_response): + response = await init_response + else: + response = init_response + + return response # type: ignore + except Exception as e: + raise litellm.exception_type( + model=model, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +@client +def create( + # Model or Agent (one required per OpenAPI spec) + model: Optional[str] = None, + agent: Optional[str] = None, + # Input (required) + input: Optional[InteractionInput] = None, + # Tools (for model interactions) + tools: Optional[List[InteractionTool]] = None, + # System instruction + system_instruction: Optional[str] = None, + # Generation config + generation_config: Optional[Dict[str, Any]] = None, + # Streaming + stream: Optional[bool] = None, + # Storage + store: Optional[bool] = None, + # Background execution + background: Optional[bool] = None, + # Response format + response_modalities: Optional[List[str]] = None, + response_format: Optional[Dict[str, Any]] = None, + response_mime_type: Optional[str] = None, + # Continuation + previous_interaction_id: Optional[str] = None, + # Extra params + extra_headers: Optional[Dict[str, Any]] = None, + extra_body: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + # LiteLLM params + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> Union[ + InteractionsAPIResponse, + Iterator[InteractionsAPIStreamingResponse], + Coroutine[Any, Any, Union[InteractionsAPIResponse, 
AsyncIterator[InteractionsAPIStreamingResponse]]], +]: + """ + Sync: Create a new interaction using Google's Interactions API. + + Per OpenAPI spec, provide either `model` or `agent`. + + Args: + model: The model to use (e.g., "gemini-2.5-flash") + agent: The agent to use (e.g., "deep-research-pro-preview-12-2025") + input: The input content (string, content object, or list) + tools: Tools available for the model + system_instruction: System instruction for the interaction + generation_config: Generation configuration + stream: Whether to stream the response + store: Whether to store the response for later retrieval + background: Whether to run in background + response_modalities: Requested response modalities (TEXT, IMAGE, AUDIO) + response_format: JSON schema for response format + response_mime_type: MIME type of the response + previous_interaction_id: ID of previous interaction for continuation + extra_headers: Additional headers + extra_body: Additional body parameters + timeout: Request timeout + custom_llm_provider: Override the LLM provider + + Returns: + InteractionsAPIResponse or iterator for streaming + """ + local_vars = locals() + + try: + litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj") # type: ignore + litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) + _is_async = kwargs.pop("acreate_interaction", False) is True + + litellm_params = GenericLiteLLMParams(**kwargs) + + if model: + model, custom_llm_provider, _, _ = litellm.get_llm_provider( + model=model, + custom_llm_provider=custom_llm_provider, + api_base=litellm_params.api_base, + api_key=litellm_params.api_key, + ) + else: + custom_llm_provider = custom_llm_provider or "gemini" + + interactions_api_config = get_provider_interactions_api_config( + provider=custom_llm_provider, + model=model, + ) + + # Get optional params using utility (similar to responses API pattern) + local_vars.update(kwargs) + optional_params = InteractionsAPIRequestUtils.get_requested_interactions_api_optional_params( + local_vars + ) + + # Check if this is a bridge provider (litellm_responses) - similar to responses API + # Either provider is explicitly "litellm_responses" or no config found (bridge to responses) + if custom_llm_provider == "litellm_responses" or interactions_api_config is None: + # Bridge to litellm.responses() for non-native providers + from litellm.interactions.litellm_responses_transformation.handler import ( + LiteLLMResponsesInteractionsHandler, + ) + handler = LiteLLMResponsesInteractionsHandler() + return handler.interactions_api_handler( + model=model or "", + input=input, + optional_params=optional_params, + custom_llm_provider=custom_llm_provider, + _is_async=_is_async, + stream=stream, + **kwargs, + ) + + litellm_logging_obj.update_environment_variables( + model=model, + optional_params=dict(optional_params), + litellm_params={"litellm_call_id": litellm_call_id}, + custom_llm_provider=custom_llm_provider, + ) + + response = interactions_http_handler.create_interaction( + model=model, + agent=agent, + input=input, + interactions_api_config=interactions_api_config, + optional_params=optional_params, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=litellm_logging_obj, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + _is_async=_is_async, + stream=stream, + ) + + return response + except Exception as e: + raise litellm.exception_type( + model=model, + custom_llm_provider=custom_llm_provider, + 
original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +# ============================================================ +# SDK Methods - GET INTERACTION +# ============================================================ + + +@client +async def aget( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> InteractionsAPIResponse: + """Async: Get an interaction by its ID.""" + local_vars = locals() + try: + loop = asyncio.get_event_loop() + kwargs["aget_interaction"] = True + + func = partial( + get, + interaction_id=interaction_id, + extra_headers=extra_headers, + timeout=timeout, + custom_llm_provider=custom_llm_provider or "gemini", + **kwargs, + ) + + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) + init_response = await loop.run_in_executor(None, func_with_context) + + if asyncio.iscoroutine(init_response): + response = await init_response + else: + response = init_response + + return response # type: ignore + except Exception as e: + raise litellm.exception_type( + model=None, + custom_llm_provider=custom_llm_provider or "gemini", + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +@client +def get( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> Union[InteractionsAPIResponse, Coroutine[Any, Any, InteractionsAPIResponse]]: + """Sync: Get an interaction by its ID.""" + local_vars = locals() + custom_llm_provider = custom_llm_provider or "gemini" + + try: + litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj") # type: ignore + litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) + _is_async = kwargs.pop("aget_interaction", False) is True + + litellm_params = GenericLiteLLMParams(**kwargs) + + interactions_api_config = get_provider_interactions_api_config( + provider=custom_llm_provider, + ) + + if interactions_api_config is None: + raise ValueError(f"Interactions API not supported for: {custom_llm_provider}") + + litellm_logging_obj.update_environment_variables( + model=None, + optional_params={"interaction_id": interaction_id}, + litellm_params={"litellm_call_id": litellm_call_id}, + custom_llm_provider=custom_llm_provider, + ) + + return interactions_http_handler.get_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=litellm_logging_obj, + extra_headers=extra_headers, + timeout=timeout, + _is_async=_is_async, + ) + except Exception as e: + raise litellm.exception_type( + model=None, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +# ============================================================ +# SDK Methods - DELETE INTERACTION +# ============================================================ + + +@client +async def adelete( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> DeleteInteractionResult: + """Async: Delete an interaction by its ID.""" + local_vars = locals() + try: + loop = asyncio.get_event_loop() + 
kwargs["adelete_interaction"] = True + + func = partial( + delete, + interaction_id=interaction_id, + extra_headers=extra_headers, + timeout=timeout, + custom_llm_provider=custom_llm_provider or "gemini", + **kwargs, + ) + + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) + init_response = await loop.run_in_executor(None, func_with_context) + + if asyncio.iscoroutine(init_response): + response = await init_response + else: + response = init_response + + return response # type: ignore + except Exception as e: + raise litellm.exception_type( + model=None, + custom_llm_provider=custom_llm_provider or "gemini", + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +@client +def delete( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> Union[DeleteInteractionResult, Coroutine[Any, Any, DeleteInteractionResult]]: + """Sync: Delete an interaction by its ID.""" + local_vars = locals() + custom_llm_provider = custom_llm_provider or "gemini" + + try: + litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj") # type: ignore + litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) + _is_async = kwargs.pop("adelete_interaction", False) is True + + litellm_params = GenericLiteLLMParams(**kwargs) + + interactions_api_config = get_provider_interactions_api_config( + provider=custom_llm_provider, + ) + + if interactions_api_config is None: + raise ValueError(f"Interactions API not supported for: {custom_llm_provider}") + + litellm_logging_obj.update_environment_variables( + model=None, + optional_params={"interaction_id": interaction_id}, + litellm_params={"litellm_call_id": litellm_call_id}, + custom_llm_provider=custom_llm_provider, + ) + + return interactions_http_handler.delete_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=litellm_logging_obj, + extra_headers=extra_headers, + timeout=timeout, + _is_async=_is_async, + ) + except Exception as e: + raise litellm.exception_type( + model=None, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +# ============================================================ +# SDK Methods - CANCEL INTERACTION +# ============================================================ + + +@client +async def acancel( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> CancelInteractionResult: + """Async: Cancel an interaction by its ID.""" + local_vars = locals() + try: + loop = asyncio.get_event_loop() + kwargs["acancel_interaction"] = True + + func = partial( + cancel, + interaction_id=interaction_id, + extra_headers=extra_headers, + timeout=timeout, + custom_llm_provider=custom_llm_provider or "gemini", + **kwargs, + ) + + ctx = contextvars.copy_context() + func_with_context = partial(ctx.run, func) + init_response = await loop.run_in_executor(None, func_with_context) + + if asyncio.iscoroutine(init_response): + response = await init_response + else: + response = init_response + + return response # type: ignore + except Exception as e: + raise litellm.exception_type( + model=None, + 
custom_llm_provider=custom_llm_provider or "gemini", + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) + + +@client +def cancel( + interaction_id: str, + extra_headers: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + custom_llm_provider: Optional[str] = None, + **kwargs, +) -> Union[CancelInteractionResult, Coroutine[Any, Any, CancelInteractionResult]]: + """Sync: Cancel an interaction by its ID.""" + local_vars = locals() + custom_llm_provider = custom_llm_provider or "gemini" + + try: + litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj") # type: ignore + litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) + _is_async = kwargs.pop("acancel_interaction", False) is True + + litellm_params = GenericLiteLLMParams(**kwargs) + + interactions_api_config = get_provider_interactions_api_config( + provider=custom_llm_provider, + ) + + if interactions_api_config is None: + raise ValueError(f"Interactions API not supported for: {custom_llm_provider}") + + litellm_logging_obj.update_environment_variables( + model=None, + optional_params={"interaction_id": interaction_id}, + litellm_params={"litellm_call_id": litellm_call_id}, + custom_llm_provider=custom_llm_provider, + ) + + return interactions_http_handler.cancel_interaction( + interaction_id=interaction_id, + interactions_api_config=interactions_api_config, + custom_llm_provider=custom_llm_provider, + litellm_params=litellm_params, + logging_obj=litellm_logging_obj, + extra_headers=extra_headers, + timeout=timeout, + _is_async=_is_async, + ) + except Exception as e: + raise litellm.exception_type( + model=None, + custom_llm_provider=custom_llm_provider, + original_exception=e, + completion_kwargs=local_vars, + extra_kwargs=kwargs, + ) diff --git a/litellm/interactions/streaming_iterator.py b/litellm/interactions/streaming_iterator.py new file mode 100644 index 000000000000..f65d08d3ca95 --- /dev/null +++ b/litellm/interactions/streaming_iterator.py @@ -0,0 +1,264 @@ +""" +Streaming iterators for the Interactions API. + +This module provides streaming iterators that properly stream SSE responses +from the Google Interactions API, similar to the responses API streaming iterator. +""" + +import asyncio +import json +from datetime import datetime +from typing import Any, Dict, Optional + +import httpx + +from litellm._logging import verbose_logger +from litellm.constants import STREAM_SSE_DONE_STRING +from litellm.litellm_core_utils.asyncify import run_async_function +from litellm.litellm_core_utils.core_helpers import process_response_headers +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.litellm_core_utils.llm_response_utils.get_api_base import get_api_base +from litellm.litellm_core_utils.thread_pool_executor import executor +from litellm.llms.base_llm.interactions.transformation import BaseInteractionsAPIConfig +from litellm.types.interactions import ( + InteractionsAPIStreamingResponse, +) +from litellm.utils import CustomStreamWrapper + + +class BaseInteractionsAPIStreamingIterator: + """ + Base class for streaming iterators that process responses from the Interactions API. + + This class contains shared logic for both synchronous and asynchronous iterators. 
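+
+    A rough sketch of how _process_chunk treats individual SSE lines (payloads
+    abbreviated, statuses illustrative):
+
+        data: {"status": "in_progress", ...}  -> transformed via the provider
+                                                 config and yielded to the caller
+        data: {"status": "completed", ...}    -> yielded and also recorded as the
+                                                 completed response for logging
+        data: [DONE] (STREAM_SSE_DONE_STRING) -> marks the stream as finished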
+ """ + + def __init__( + self, + response: httpx.Response, + model: Optional[str], + interactions_api_config: BaseInteractionsAPIConfig, + logging_obj: LiteLLMLoggingObj, + litellm_metadata: Optional[Dict[str, Any]] = None, + custom_llm_provider: Optional[str] = None, + ): + self.response = response + self.model = model + self.logging_obj = logging_obj + self.finished = False + self.interactions_api_config = interactions_api_config + self.completed_response: Optional[InteractionsAPIStreamingResponse] = None + self.start_time = datetime.now() + + # set request kwargs + self.litellm_metadata = litellm_metadata + self.custom_llm_provider = custom_llm_provider + + # set hidden params for response headers + _api_base = get_api_base( + model=model or "", + optional_params=self.logging_obj.model_call_details.get( + "litellm_params", {} + ), + ) + _model_info: Dict = litellm_metadata.get("model_info", {}) if litellm_metadata else {} + self._hidden_params = { + "model_id": _model_info.get("id", None), + "api_base": _api_base, + } + self._hidden_params["additional_headers"] = process_response_headers( + self.response.headers or {} + ) + + def _process_chunk(self, chunk: str) -> Optional[InteractionsAPIStreamingResponse]: + """Process a single chunk of data from the stream.""" + if not chunk: + return None + + # Handle SSE format (data: {...}) + stripped_chunk = CustomStreamWrapper._strip_sse_data_from_chunk(chunk) + if stripped_chunk is None: + return None + + # Handle "[DONE]" marker + if stripped_chunk == STREAM_SSE_DONE_STRING: + self.finished = True + return None + + try: + # Parse the JSON chunk + parsed_chunk = json.loads(stripped_chunk) + + # Format as InteractionsAPIStreamingResponse + if isinstance(parsed_chunk, dict): + streaming_response = self.interactions_api_config.transform_streaming_response( + model=self.model, + parsed_chunk=parsed_chunk, + logging_obj=self.logging_obj, + ) + + # Store the completed response (check for status=completed) + if ( + streaming_response + and getattr(streaming_response, "status", None) == "completed" + ): + self.completed_response = streaming_response + self._handle_logging_completed_response() + + return streaming_response + + return None + except json.JSONDecodeError: + # If we can't parse the chunk, continue + verbose_logger.debug(f"Failed to parse streaming chunk: {stripped_chunk[:200]}...") + return None + + def _handle_logging_completed_response(self): + """Base implementation - should be overridden by subclasses.""" + pass + + +class InteractionsAPIStreamingIterator(BaseInteractionsAPIStreamingIterator): + """ + Async iterator for processing streaming responses from the Interactions API. 
+ """ + + def __init__( + self, + response: httpx.Response, + model: Optional[str], + interactions_api_config: BaseInteractionsAPIConfig, + logging_obj: LiteLLMLoggingObj, + litellm_metadata: Optional[Dict[str, Any]] = None, + custom_llm_provider: Optional[str] = None, + ): + super().__init__( + response=response, + model=model, + interactions_api_config=interactions_api_config, + logging_obj=logging_obj, + litellm_metadata=litellm_metadata, + custom_llm_provider=custom_llm_provider, + ) + self.stream_iterator = response.aiter_lines() + + def __aiter__(self): + return self + + async def __anext__(self) -> InteractionsAPIStreamingResponse: + try: + while True: + # Get the next chunk from the stream + try: + chunk = await self.stream_iterator.__anext__() + except StopAsyncIteration: + self.finished = True + raise StopAsyncIteration + + result = self._process_chunk(chunk) + + if self.finished: + raise StopAsyncIteration + elif result is not None: + return result + # If result is None, continue the loop to get the next chunk + + except httpx.HTTPError as e: + # Handle HTTP errors + self.finished = True + raise e + + def _handle_logging_completed_response(self): + """Handle logging for completed responses in async context.""" + import copy + logging_response = copy.deepcopy(self.completed_response) + + asyncio.create_task( + self.logging_obj.async_success_handler( + result=logging_response, + start_time=self.start_time, + end_time=datetime.now(), + cache_hit=None, + ) + ) + + executor.submit( + self.logging_obj.success_handler, + result=logging_response, + cache_hit=None, + start_time=self.start_time, + end_time=datetime.now(), + ) + + +class SyncInteractionsAPIStreamingIterator(BaseInteractionsAPIStreamingIterator): + """ + Synchronous iterator for processing streaming responses from the Interactions API. 
+ """ + + def __init__( + self, + response: httpx.Response, + model: Optional[str], + interactions_api_config: BaseInteractionsAPIConfig, + logging_obj: LiteLLMLoggingObj, + litellm_metadata: Optional[Dict[str, Any]] = None, + custom_llm_provider: Optional[str] = None, + ): + super().__init__( + response=response, + model=model, + interactions_api_config=interactions_api_config, + logging_obj=logging_obj, + litellm_metadata=litellm_metadata, + custom_llm_provider=custom_llm_provider, + ) + self.stream_iterator = response.iter_lines() + + def __iter__(self): + return self + + def __next__(self) -> InteractionsAPIStreamingResponse: + try: + while True: + # Get the next chunk from the stream + try: + chunk = next(self.stream_iterator) + except StopIteration: + self.finished = True + raise StopIteration + + result = self._process_chunk(chunk) + + if self.finished: + raise StopIteration + elif result is not None: + return result + # If result is None, continue the loop to get the next chunk + + except httpx.HTTPError as e: + # Handle HTTP errors + self.finished = True + raise e + + def _handle_logging_completed_response(self): + """Handle logging for completed responses in sync context.""" + import copy + logging_response = copy.deepcopy(self.completed_response) + + run_async_function( + async_function=self.logging_obj.async_success_handler, + result=logging_response, + start_time=self.start_time, + end_time=datetime.now(), + cache_hit=None, + ) + + executor.submit( + self.logging_obj.success_handler, + result=logging_response, + cache_hit=None, + start_time=self.start_time, + end_time=datetime.now(), + ) + diff --git a/litellm/interactions/utils.py b/litellm/interactions/utils.py new file mode 100644 index 000000000000..4fc40916e52f --- /dev/null +++ b/litellm/interactions/utils.py @@ -0,0 +1,84 @@ +""" +Utility functions for Interactions API. +""" + +from typing import Any, Dict, Optional, cast + +from litellm.llms.base_llm.interactions.transformation import BaseInteractionsAPIConfig +from litellm.types.interactions import InteractionsAPIOptionalRequestParams + +# Valid optional parameter keys per OpenAPI spec +INTERACTIONS_API_OPTIONAL_PARAMS = { + "tools", + "system_instruction", + "generation_config", + "stream", + "store", + "background", + "response_modalities", + "response_format", + "response_mime_type", + "previous_interaction_id", + "agent_config", +} + + +def get_provider_interactions_api_config( + provider: str, + model: Optional[str] = None, +) -> Optional[BaseInteractionsAPIConfig]: + """ + Get the interactions API config for the given provider. + + Args: + provider: The LLM provider name + model: Optional model name + + Returns: + The provider-specific interactions API config, or None if not supported + """ + from litellm.types.utils import LlmProviders + + if provider == LlmProviders.GEMINI.value or provider == "gemini": + from litellm.llms.gemini.interactions.transformation import ( + GoogleAIStudioInteractionsConfig, + ) + return GoogleAIStudioInteractionsConfig() + + return None + + +class InteractionsAPIRequestUtils: + """Helper utils for constructing Interactions API requests.""" + + @staticmethod + def get_requested_interactions_api_optional_params( + params: Dict[str, Any], + ) -> InteractionsAPIOptionalRequestParams: + """ + Filter parameters to only include valid optional params per OpenAPI spec. 
+ + Args: + params: Dictionary of parameters to filter (typically from locals()) + + Returns: + Dict with only the valid optional parameters + """ + from litellm.utils import PreProcessNonDefaultParams + + custom_llm_provider = params.pop("custom_llm_provider", None) + special_params = params.pop("kwargs", {}) + additional_drop_params = params.pop("additional_drop_params", None) + + non_default_params = ( + PreProcessNonDefaultParams.base_pre_process_non_default_params( + passed_params=params, + special_params=special_params, + custom_llm_provider=custom_llm_provider, + additional_drop_params=additional_drop_params, + default_param_values={k: None for k in INTERACTIONS_API_OPTIONAL_PARAMS}, + additional_endpoint_specific_params=["input", "model", "agent"], + ) + ) + + return cast(InteractionsAPIOptionalRequestParams, non_default_params) diff --git a/litellm/litellm_core_utils/README.md b/litellm/litellm_core_utils/README.md index 6494041291b9..b61c8982762e 100644 --- a/litellm/litellm_core_utils/README.md +++ b/litellm/litellm_core_utils/README.md @@ -9,4 +9,5 @@ Core files: - `default_encoding.py`: code for loading the default encoding (tiktoken) - `get_llm_provider_logic.py`: code for inferring the LLM provider from a given model name. - `duration_parser.py`: code for parsing durations - e.g. "1d", "1mo", "10s" +- `api_route_to_call_types.py`: mapping of API routes to their corresponding CallTypes (e.g., `/chat/completions` -> [acompletion, completion]) diff --git a/litellm/litellm_core_utils/api_route_to_call_types.py b/litellm/litellm_core_utils/api_route_to_call_types.py new file mode 100644 index 000000000000..4146ff6d6a62 --- /dev/null +++ b/litellm/litellm_core_utils/api_route_to_call_types.py @@ -0,0 +1,40 @@ +""" +Dictionary mapping API routes to their corresponding CallTypes in LiteLLM. + +This dictionary maps each API endpoint to the CallTypes that can be used for that route. +Each route can have both async (prefixed with 'a') and sync call types. +""" + +from typing import List, Optional + +from litellm.types.utils import API_ROUTE_TO_CALL_TYPES, CallTypes + + +def get_call_types_for_route(route: str) -> Optional[List[CallTypes]]: + """ + Get the list of CallTypes for a given API route. + + Args: + route: API route path (e.g., "/chat/completions") + + Returns: + List of CallTypes for that route, or None if route not found + """ + return API_ROUTE_TO_CALL_TYPES.get(route, None) + + +def get_routes_for_call_type(call_type: CallTypes) -> list: + """ + Get all routes that use a specific CallType. + + Args: + call_type: The CallType to search for + + Returns: + List of routes that use this CallType + """ + routes = [] + for route, types in API_ROUTE_TO_CALL_TYPES.items(): + if call_type in types: + routes.append(route) + return routes diff --git a/litellm/litellm_core_utils/audio_utils/utils.py b/litellm/litellm_core_utils/audio_utils/utils.py index fc0c8aca8423..a7d12841e582 100644 --- a/litellm/litellm_core_utils/audio_utils/utils.py +++ b/litellm/litellm_core_utils/audio_utils/utils.py @@ -2,8 +2,10 @@ Utils used for litellm.transcription() and litellm.atranscription() """ +import hashlib import os from dataclasses import dataclass +from typing import Optional from litellm.types.files import get_file_mime_type_from_extension from litellm.types.utils import FileTypes @@ -13,12 +15,13 @@ class ProcessedAudioFile: """ Processed audio file data. 
- + Attributes: file_content: The binary content of the audio file filename: The filename (extracted or generated) content_type: The MIME type of the audio file """ + file_content: bytes filename: str content_type: str @@ -27,61 +30,63 @@ class ProcessedAudioFile: def process_audio_file(audio_file: FileTypes) -> ProcessedAudioFile: """ Common utility function to process audio files for audio transcription APIs. - + Handles various input types: - File paths (str, os.PathLike) - Raw bytes/bytearray - Tuples (filename, content, optional content_type) - File-like objects with read() method - + Args: audio_file: The audio file input in various formats - + Returns: ProcessedAudioFile: Structured data with file content, filename, and content type - + Raises: ValueError: If audio_file type is unsupported or content cannot be extracted """ file_content = None filename = None - + if isinstance(audio_file, (bytes, bytearray)): # Raw bytes - filename = 'audio.wav' + filename = "audio.wav" file_content = bytes(audio_file) elif isinstance(audio_file, (str, os.PathLike)): # File path or PathLike file_path = str(audio_file) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: file_content = f.read() - filename = file_path.split('/')[-1] + filename = file_path.split("/")[-1] elif isinstance(audio_file, tuple): # Tuple format: (filename, content, content_type) or (filename, content) if len(audio_file) >= 2: - filename = audio_file[0] or 'audio.wav' + filename = audio_file[0] or "audio.wav" content = audio_file[1] if isinstance(content, (bytes, bytearray)): file_content = bytes(content) elif isinstance(content, (str, os.PathLike)): # File path or PathLike - with open(str(content), 'rb') as f: + with open(str(content), "rb") as f: file_content = f.read() - elif hasattr(content, 'read'): + elif hasattr(content, "read"): # File-like object file_content = content.read() - if hasattr(content, 'seek'): + if hasattr(content, "seek"): content.seek(0) else: raise ValueError(f"Unsupported content type in tuple: {type(content)}") else: raise ValueError("Tuple must have at least 2 elements: (filename, content)") - elif hasattr(audio_file, 'read') and not isinstance(audio_file, (str, bytes, bytearray, tuple, os.PathLike)): + elif hasattr(audio_file, "read") and not isinstance( + audio_file, (str, bytes, bytearray, tuple, os.PathLike) + ): # File-like object (IO) - check this after all other types - filename = getattr(audio_file, 'name', 'audio.wav') + filename = getattr(audio_file, "name", "audio.wav") file_content = audio_file.read() # type: ignore # Reset file pointer if possible - if hasattr(audio_file, 'seek'): + if hasattr(audio_file, "seek"): audio_file.seek(0) # type: ignore else: raise ValueError(f"Unsupported audio_file type: {type(audio_file)}") @@ -90,20 +95,18 @@ def process_audio_file(audio_file: FileTypes) -> ProcessedAudioFile: raise ValueError("Could not extract file content from audio_file") # Determine content type using LiteLLM's file type utilities - content_type = 'audio/wav' # Default fallback + content_type = "audio/wav" # Default fallback if filename: try: # Extract extension from filename - extension = filename.split('.')[-1].lower() if '.' in filename else 'wav' + extension = filename.split(".")[-1].lower() if "." 
in filename else "wav" content_type = get_file_mime_type_from_extension(extension) except ValueError: # If extension is not recognized, fallback to audio/wav - content_type = 'audio/wav' - + content_type = "audio/wav" + return ProcessedAudioFile( - file_content=file_content, - filename=filename, - content_type=content_type + file_content=file_content, filename=filename, content_type=content_type ) @@ -125,6 +128,67 @@ def get_audio_file_name(file_obj: FileTypes) -> str: return repr(file_obj) +def get_audio_file_content_hash(file_obj: FileTypes) -> str: + """ + Compute SHA-256 hash of audio file content for cache keys. + Falls back to filename hash if content extraction fails. + """ + file_content: Optional[bytes] = None + fallback_filename: Optional[str] = None + + if isinstance(file_obj, tuple): + if len(file_obj) < 2: + fallback_filename = str(file_obj[0]) if len(file_obj) > 0 else None + else: + fallback_filename = str(file_obj[0]) if file_obj[0] is not None else None + file_content_obj = file_obj[1] + else: + file_content_obj = file_obj + fallback_filename = get_audio_file_name(file_obj) + + try: + if isinstance(file_content_obj, (bytes, bytearray)): + file_content = bytes(file_content_obj) + elif isinstance(file_content_obj, (str, os.PathLike)): + try: + with open(str(file_content_obj), "rb") as f: + file_content = f.read() + if fallback_filename is None: + fallback_filename = str(file_content_obj) + except (OSError, IOError): + fallback_filename = str(file_content_obj) + file_content = None + elif hasattr(file_content_obj, "read"): + try: + current_position = file_content_obj.tell() if hasattr(file_content_obj, "tell") else None + if hasattr(file_content_obj, "seek"): + file_content_obj.seek(0) + file_content = file_content_obj.read() # type: ignore + if current_position is not None and hasattr(file_content_obj, "seek"): + file_content_obj.seek(current_position) # type: ignore + except (OSError, IOError, AttributeError): + file_content = None + else: + file_content = None + except Exception: + file_content = None + + if file_content is not None and isinstance(file_content, bytes): + try: + hash_object = hashlib.sha256(file_content) + return hash_object.hexdigest() + except Exception: + pass + + if fallback_filename: + hash_object = hashlib.sha256(fallback_filename.encode('utf-8')) + return hash_object.hexdigest() + + file_obj_str = str(file_obj) + hash_object = hashlib.sha256(file_obj_str.encode('utf-8')) + return hash_object.hexdigest() + + def get_audio_file_for_health_check() -> FileTypes: """ Get an audio file for health check @@ -134,3 +198,74 @@ def get_audio_file_for_health_check() -> FileTypes: pwd = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(pwd, "audio_health_check.wav") return open(file_path, "rb") + + +def calculate_request_duration(file: FileTypes) -> Optional[float]: + """ + Calculate audio duration from file content. 
+ + Args: + file: The audio file (can be file path, bytes, or file-like object) + + Returns: + Duration in seconds, or None if extraction fails or soundfile is not available + """ + try: + import soundfile as sf + except ImportError: + # soundfile not available, cannot extract duration + return None + + try: + import io + + # Handle different file input types + file_content: Optional[bytes] = None + + if isinstance(file, (bytes, bytearray)): + # Raw bytes + file_content = bytes(file) + elif isinstance(file, (str, os.PathLike)): + # File path + with open(str(file), "rb") as f: + file_content = f.read() + elif isinstance(file, tuple): + # Tuple format: (filename, content, optional content_type) + if len(file) >= 2: + content = file[1] + if isinstance(content, bytes): + file_content = content + elif hasattr(content, "read") and not isinstance( + content, (str, os.PathLike) + ): + # File-like object in tuple + current_pos = getattr(content, "tell", lambda: None)() + # Seek to start to ensure we read the entire content + if hasattr(content, "seek"): + content.seek(0) + file_content = content.read() + if current_pos is not None and hasattr(content, "seek"): + content.seek(current_pos) + elif hasattr(file, "read") and not isinstance(file, tuple): + # File-like object (including BytesIO) + current_position = file.tell() if hasattr(file, "tell") else None + # Seek to start to ensure we read the entire content + if hasattr(file, "seek"): + file.seek(0) + file_content = file.read() + # Reset file position if possible + if current_position is not None and hasattr(file, "seek"): + file.seek(current_position) + + if file_content is None or not isinstance(file_content, bytes): + return None + + # Extract duration using soundfile + file_object = io.BytesIO(file_content) + with sf.SoundFile(file_object) as audio: + duration = len(audio) / audio.samplerate + return duration + + except Exception: + # Silently fail if duration extraction fails + return None diff --git a/litellm/litellm_core_utils/core_helpers.py b/litellm/litellm_core_utils/core_helpers.py index 8b9f53cec152..00695cbfb5bd 100644 --- a/litellm/litellm_core_utils/core_helpers.py +++ b/litellm/litellm_core_utils/core_helpers.py @@ -38,18 +38,18 @@ def safe_divide_seconds( def safe_divide( - numerator: Union[int, float], - denominator: Union[int, float], - default: Union[int, float] = 0 + numerator: Union[int, float], + denominator: Union[int, float], + default: Union[int, float] = 0, ) -> Union[int, float]: """ Safely divide two numbers, returning a default value if denominator is zero. 
- + Args: numerator: The number to divide denominator: The number to divide by default: Value to return if denominator is zero (defaults to 0) - + Returns: The result of numerator/denominator, or default if denominator is zero """ @@ -79,9 +79,11 @@ def map_finish_reason( elif finish_reason == "eos_token" or finish_reason == "stop_sequence": return "stop" elif ( - finish_reason == "FINISH_REASON_UNSPECIFIED" or finish_reason == "STOP" + finish_reason == "FINISH_REASON_UNSPECIFIED" ): # vertex ai - got from running `print(dir(response_obj.candidates[0].finish_reason))`: ['FINISH_REASON_UNSPECIFIED', 'MAX_TOKENS', 'OTHER', 'RECITATION', 'SAFETY', 'STOP',] - return "stop" + return "finish_reason_unspecified" + elif finish_reason == "MALFORMED_FUNCTION_CALL": + return "malformed_function_call" elif finish_reason == "SAFETY" or finish_reason == "RECITATION": # vertex ai return "content_filter" elif finish_reason == "STOP": # vertex ai @@ -153,7 +155,8 @@ def get_metadata_variable_name_from_kwargs( - LiteLLM is now moving to using `litellm_metadata` for our metadata """ return "litellm_metadata" if "litellm_metadata" in kwargs else "metadata" - + + def get_litellm_metadata_from_kwargs(kwargs: dict): """ Helper to get litellm metadata from all litellm request kwargs @@ -176,6 +179,25 @@ def get_litellm_metadata_from_kwargs(kwargs: dict): return {} +def reconstruct_model_name( + model_name: str, + custom_llm_provider: Optional[str], + metadata: dict, +) -> str: + """Reconstruct full model name with provider prefix for logging.""" + # Check if deployment model name from router metadata is available (has original prefix) + deployment_model_name = metadata.get("deployment") + if deployment_model_name and "/" in deployment_model_name: + # Use the deployment model name which preserves the original provider prefix + return deployment_model_name + elif custom_llm_provider and model_name and "/" not in model_name: + # Only add prefix for Bedrock (not for direct Anthropic API) + # This ensures Bedrock models get the prefix while direct Anthropic models don't + if custom_llm_provider == "bedrock": + return f"{custom_llm_provider}/{model_name}" + return model_name + + # Helper functions used for OTEL logging def _get_parent_otel_span_from_kwargs( kwargs: Optional[dict] = None, @@ -234,7 +256,8 @@ def preserve_upstream_non_openai_attributes( """ Preserve non-OpenAI attributes from the original chunk. """ - expected_keys = set(model_response.model_fields.keys()).union({"usage"}) + # Access model_fields on the class, not the instance, to avoid Pydantic 2.11+ deprecation warnings + expected_keys = set(type(model_response).model_fields.keys()).union({"usage"}) for key, value in original_chunk.model_dump().items(): if key not in expected_keys: setattr(model_response, key, value) @@ -245,8 +268,8 @@ def safe_deep_copy(data): Safe Deep Copy The LiteLLM request may contain objects that cannot be pickled/deep-copied - (e.g., tracing spans, locks, clients). - + (e.g., tracing spans, locks, clients). + This helper deep-copies each top-level key independently; on failure keeps original ref """ @@ -299,4 +322,103 @@ def safe_deep_copy(data): data["litellm_metadata"][ "litellm_parent_otel_span" ] = litellm_parent_otel_span - return new_data \ No newline at end of file + return new_data + + +def filter_exceptions_from_params(data: Any, max_depth: int = 20) -> Any: + """ + Recursively filter out Exception objects and callable objects from dicts/lists. 
+ + This is a defensive utility to prevent deepcopy failures when exception objects + are accidentally stored in parameter dictionaries (e.g., optional_params). + Also filters callable objects (functions) to prevent JSON serialization errors. + Exceptions and callables should not be stored in params - this function removes them. + + Args: + data: The data structure to filter (dict, list, or any other type) + max_depth: Maximum recursion depth to prevent infinite loops + + Returns: + Filtered data structure with Exception and callable objects removed, or None if the + entire input was an Exception or callable + """ + if max_depth <= 0: + return data + + # Skip exception objects + if isinstance(data, Exception): + return None + # Skip callable objects (functions, methods, lambdas) but not classes (type objects) + if callable(data) and not isinstance(data, type): + return None + # Skip known non-serializable object types (Logging, Router, etc.) + obj_type_name = type(data).__name__ + if obj_type_name in ["Logging", "LiteLLMLoggingObj", "Router"]: + return None + + if isinstance(data, dict): + result: dict[str, Any] = {} + for k, v in data.items(): + # Skip exception and callable values + if isinstance(v, Exception) or (callable(v) and not isinstance(v, type)): + continue + try: + filtered = filter_exceptions_from_params(v, max_depth - 1) + if filtered is not None: + result[k] = filtered + except Exception: + # Skip values that cause errors during filtering + continue + return result + elif isinstance(data, list): + result_list: list[Any] = [] + for item in data: + # Skip exception and callable items + if isinstance(item, Exception) or ( + callable(item) and not isinstance(item, type) + ): + continue + try: + filtered = filter_exceptions_from_params(item, max_depth - 1) + if filtered is not None: + result_list.append(filtered) + except Exception: + # Skip items that cause errors during filtering + continue + return result_list + else: + return data + + +def filter_internal_params( + data: dict, additional_internal_params: Optional[set] = None +) -> dict: + """ + Filter out LiteLLM internal parameters that shouldn't be sent to provider APIs. + + This removes internal/MCP-related parameters that are used by LiteLLM internally + but should not be included in API requests to providers. 
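+
+    For example (illustrative):
+
+        filter_internal_params({"model": "gpt-4o", "skip_mcp_handler": True})
+        # -> {"model": "gpt-4o"}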
+ + Args: + data: Dictionary of parameters to filter + additional_internal_params: Optional set of additional internal parameter names to filter + + Returns: + Filtered dictionary with internal parameters removed + """ + if not isinstance(data, dict): + return data + + # Known internal parameters that should never be sent to provider APIs + internal_params = { + "skip_mcp_handler", + "mcp_handler_context", + "_skip_mcp_handler", + } + + # Add any additional internal params if provided + if additional_internal_params: + internal_params.update(additional_internal_params) + + # Filter out internal parameters + return {k: v for k, v in data.items() if k not in internal_params} diff --git a/litellm/litellm_core_utils/custom_logger_registry.py b/litellm/litellm_core_utils/custom_logger_registry.py index 09794bf2677b..a3c25ab65e9d 100644 --- a/litellm/litellm_core_utils/custom_logger_registry.py +++ b/litellm/litellm_core_utils/custom_logger_registry.py @@ -16,14 +16,17 @@ from litellm.integrations.argilla import ArgillaLogger from litellm.integrations.azure_storage.azure_storage import AzureBlobStorageLogger from litellm.integrations.bitbucket import BitBucketPromptManager -from litellm.integrations.gitlab import GitLabPromptManager from litellm.integrations.braintrust_logging import BraintrustLogger +from litellm.integrations.cloudzero.cloudzero import CloudZeroLogger +from litellm.integrations.focus.focus_logger import FocusLogger from litellm.integrations.datadog.datadog import DataDogLogger from litellm.integrations.datadog.datadog_llm_obs import DataDogLLMObsLogger from litellm.integrations.deepeval import DeepEvalLogger +from litellm.integrations.dotprompt import DotpromptManager from litellm.integrations.galileo import GalileoObserve from litellm.integrations.gcs_bucket.gcs_bucket import GCSBucketLogger from litellm.integrations.gcs_pubsub.pub_sub import GcsPubSubLogger +from litellm.integrations.gitlab import GitLabPromptManager from litellm.integrations.humanloop import HumanloopLogger from litellm.integrations.lago import LagoLogger from litellm.integrations.langfuse.langfuse_prompt_management import ( @@ -36,13 +39,7 @@ from litellm.integrations.opentelemetry import OpenTelemetry from litellm.integrations.opik.opik import OpikLogger from litellm.integrations.posthog import PostHogLogger - -try: - from litellm_enterprise.integrations.prometheus import PrometheusLogger -except Exception: - PrometheusLogger = None -from litellm.integrations.cloudzero.cloudzero import CloudZeroLogger -from litellm.integrations.dotprompt import DotpromptManager +from litellm.integrations.prometheus import PrometheusLogger from litellm.integrations.s3_v2 import S3Logger from litellm.integrations.sqs import SQSLogger from litellm.integrations.vector_store_integrations.vector_store_pre_call_hook import ( @@ -79,6 +76,8 @@ class CustomLoggerRegistry: "langfuse_otel": OpenTelemetry, "arize_phoenix": OpenTelemetry, "langtrace": OpenTelemetry, + "weave_otel": OpenTelemetry, + "levo": OpenTelemetry, "mlflow": MlflowLogger, "langfuse": LangfusePromptManagement, "otel": OpenTelemetry, @@ -95,27 +94,33 @@ class CustomLoggerRegistry: "bitbucket": BitBucketPromptManager, "gitlab": GitLabPromptManager, "cloudzero": CloudZeroLogger, + "focus": FocusLogger, "posthog": PostHogLogger, } try: - from litellm_enterprise.enterprise_callbacks.generic_api_callback import ( - GenericAPILogger, - ) from litellm_enterprise.enterprise_callbacks.pagerduty.pagerduty import ( PagerDutyAlerting, ) from 
litellm_enterprise.enterprise_callbacks.send_emails.resend_email import ( ResendEmailLogger, ) + from litellm_enterprise.enterprise_callbacks.send_emails.sendgrid_email import ( + SendGridEmailLogger, + ) from litellm_enterprise.enterprise_callbacks.send_emails.smtp_email import ( SMTPEmailLogger, ) + from litellm.integrations.generic_api.generic_api_callback import ( + GenericAPILogger, + ) + enterprise_loggers = { "pagerduty": PagerDutyAlerting, "generic_api": GenericAPILogger, "resend_email": ResendEmailLogger, + "sendgrid_email": SendGridEmailLogger, "smtp_email": SMTPEmailLogger, } CALLBACK_CLASS_STR_TO_CLASS_TYPE.update(enterprise_loggers) diff --git a/litellm/litellm_core_utils/default_encoding.py b/litellm/litellm_core_utils/default_encoding.py index 93b3132912cb..1771efba410d 100644 --- a/litellm/litellm_core_utils/default_encoding.py +++ b/litellm/litellm_core_utils/default_encoding.py @@ -15,9 +15,33 @@ __name__, "litellm_core_utils/tokenizers" ) +# Check if the directory is writable. If not, use /tmp as a fallback. +# This is especially important for non-root Docker environments where the package directory is read-only. +is_non_root = os.getenv("LITELLM_NON_ROOT", "").lower() == "true" +if not os.access(filename, os.W_OK) and is_non_root: + filename = "/tmp/tiktoken_cache" + os.makedirs(filename, exist_ok=True) + os.environ["TIKTOKEN_CACHE_DIR"] = os.getenv( "CUSTOM_TIKTOKEN_CACHE_DIR", filename ) # use local copy of tiktoken b/c of - https://github.com/BerriAI/litellm/issues/1071 import tiktoken +import time +import random + +# Retry logic to handle race conditions when multiple processes try to create +# the tiktoken cache file simultaneously (common in parallel test execution on Windows) +_max_retries = 5 +_retry_delay = 0.1 # Start with 100ms -encoding = tiktoken.get_encoding("cl100k_base") +for attempt in range(_max_retries): + try: + encoding = tiktoken.get_encoding("cl100k_base") + break + except (FileExistsError, OSError): + if attempt == _max_retries - 1: + # Last attempt, re-raise the exception + raise + # Exponential backoff with jitter to reduce collision probability + delay = _retry_delay * (2**attempt) + random.uniform(0, 0.1) + time.sleep(delay) diff --git a/litellm/litellm_core_utils/dot_notation_indexing.py b/litellm/litellm_core_utils/dot_notation_indexing.py index fda37f65007d..1e835004e942 100644 --- a/litellm/litellm_core_utils/dot_notation_indexing.py +++ b/litellm/litellm_core_utils/dot_notation_indexing.py @@ -1,10 +1,29 @@ """ -This file contains the logic for dot notation indexing. +Path-based navigation utilities for nested dictionaries. -Used by JWT Auth to get the user role from the token. +This module provides utilities for reading and deleting values in nested +dictionaries using dot notation and JSONPath-like array syntax. + +Custom implementation with zero external dependencies. + +Supported syntax: +- "field" - top-level field +- "parent.child" - nested field +- "parent\\.with\\.dots.child" - keys containing dots (escape with backslash) +- "array[*]" - all array elements (wildcard) +- "array[0]" - specific array element (index) +- "array[*].field" - field in all array elements + +Examples: + >>> data = {"tools": [{"name": "t1", "input_examples": ["ex"]}]} + >>> delete_nested_value(data, "tools[*].input_examples") + {"tools": [{"name": "t1"}]} + +Used by JWT Auth to get the user role from the token, and by +additional_drop_params to remove nested fields from optional parameters. 
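+
+A further illustrative sketch, using the array-index form (output written
+informally, in the same style as the example above):
+
+    >>> delete_nested_value({"arr": [{"a": 1, "b": 2}, {"a": 3}]}, "arr[0].b")
+    {"arr": [{"a": 1}, {"a": 3}]}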
""" -from typing import Any, Dict, Optional, TypeVar +from typing import Any, Dict, List, Optional, TypeVar, Union T = TypeVar("T") @@ -29,6 +48,9 @@ def get_nested_value( 'value' >>> get_nested_value(data, "a.b.d", "default") 'default' + >>> data = {"kubernetes.io": {"namespace": "default"}} + >>> get_nested_value(data, "kubernetes\\.io.namespace") + 'default' """ if not key_path: return default @@ -40,8 +62,11 @@ def get_nested_value( else key_path ) - # Split the key path into parts - parts = key_path.split(".") + # Split the key path into parts, respecting escaped dots (\.) + # Use a temporary placeholder, split on unescaped dots, then restore + placeholder = "\x00" + parts = key_path.replace("\\.", placeholder).split(".") + parts = [p.replace(placeholder, ".") for p in parts] # Traverse through the dictionary current: Any = data @@ -57,3 +82,164 @@ def get_nested_value( # Otherwise, ensure the type matches the default return current if isinstance(current, type(default)) else default + + +def _parse_path_segments(path: str) -> list: + """ + Parse a JSONPath-like string into segments using regex. + + Handles: + - Dot notation: "a.b.c" → ["a", "b", "c"] + - Array wildcards: "a[*].b" → ["a", "[*]", "b"] + - Array indices: "a[0].b" → ["a", "[0]", "b"] + + Args: + path: JSONPath-like path string + + Returns: + List of path segments + + Example: + >>> _parse_path_segments("tools[*].arr[0].field") + ["tools", "[*]", "arr", "[0]", "field"] + """ + import re + + # Match field names OR bracket expressions + # Pattern: field_name (anything except . or [) | [anything_in_brackets] + pattern = r'[^\.\[]+|\[[^\]]*\]' + segments = re.findall(pattern, path) + return segments + + +def _delete_nested_value_custom( + data: Union[Dict[str, Any], List[Any]], + segments: list, + segment_index: int = 0, +) -> None: + """ + Recursively delete a field from nested data using parsed segments. + + Modifies data in-place (caller must deep copy first). + + Args: + data: Dictionary or list to modify + segments: Parsed path segments + segment_index: Current position in segments list + """ + if segment_index >= len(segments): + return + + segment = segments[segment_index] + is_last = segment_index == len(segments) - 1 + + # Handle array wildcard: [*] + if segment == "[*]": + if isinstance(data, list): + for item in data: + if is_last: + # Can't delete array elements themselves, skip + pass + else: + # Only recurse if item is a dict or list (nested structure) + if isinstance(item, (dict, list)): + _delete_nested_value_custom(item, segments, segment_index + 1) + return + + # Handle array index: [0], [1], [2], etc. 
+ if segment.startswith("[") and segment.endswith("]"): + try: + index = int(segment[1:-1]) + if isinstance(data, list) and 0 <= index < len(data): + if is_last: + # Can't delete array elements themselves, skip + pass + else: + # Only recurse if element is a dict or list (nested structure) + element = data[index] + if isinstance(element, (dict, list)): + _delete_nested_value_custom(element, segments, segment_index + 1) + except (ValueError, IndexError): + # Invalid index, skip + pass + return + + # Handle regular field navigation + if isinstance(data, dict): + if is_last: + # Delete the field + data.pop(segment, None) + else: + # Navigate deeper + if segment in data: + next_segment = segments[segment_index + 1] if segment_index + 1 < len(segments) else None + + # If next segment is array notation, current field should be list + if next_segment and (next_segment.startswith("[")): + if isinstance(data[segment], list): + _delete_nested_value_custom(data[segment], segments, segment_index + 1) + # Otherwise navigate into dict + elif isinstance(data[segment], dict): + _delete_nested_value_custom(data[segment], segments, segment_index + 1) + + +def delete_nested_value( + data: Dict[str, Any], + path: str, + depth: int = 0, + max_depth: int = 20, +) -> Dict[str, Any]: + """ + Delete a field from nested data using JSONPath notation. + + Custom implementation - no external dependencies. + + Supports: + - "field" - top-level field + - "parent.child" - nested field + - "array[*]" - all array elements (wildcard) + - "array[0]" - specific array element (index) + - "array[*].field" - field in all array elements + + Args: + data: Dictionary to modify (creates deep copy) + path: JSONPath-like path string + depth: Current recursion depth (kept for API compatibility) + max_depth: Maximum recursion depth (kept for API compatibility) + + Returns: + New dictionary with field removed at path + + Example: + >>> data = {"tools": [{"name": "t1", "input_examples": ["ex"]}]} + >>> delete_nested_value(data, "tools[*].input_examples") + {"tools": [{"name": "t1"}]} + """ + import copy + + result = copy.deepcopy(data) + + try: + # Parse path into segments + segments = _parse_path_segments(path) + + if not segments: + return result + + # Delete using custom recursive implementation + _delete_nested_value_custom(result, segments, 0) + + except Exception: + # Invalid path or parsing error - silently skip + pass + + return result + + +def is_nested_path(path: str) -> bool: + """ + Check if path requires nested handling. + + Returns True if path contains '.' or '[' (array notation). + """ + return "." 
in path or "[" in path diff --git a/litellm/litellm_core_utils/exception_mapping_utils.py b/litellm/litellm_core_utils/exception_mapping_utils.py index 61551b042368..3ddcae69315a 100644 --- a/litellm/litellm_core_utils/exception_mapping_utils.py +++ b/litellm/litellm_core_utils/exception_mapping_utils.py @@ -3,6 +3,7 @@ from typing import Any, Optional import httpx +import re import litellm from litellm._logging import verbose_logger @@ -12,6 +13,7 @@ APIConnectionError, APIError, AuthenticationError, + BadGatewayError, BadRequestError, ContentPolicyViolationError, ContextWindowExceededError, @@ -43,16 +45,23 @@ def is_error_str_rate_limit(error_str: str) -> bool: """ if not isinstance(error_str, str): return False - - if "429" in error_str or "rate limit" in error_str.lower(): + + # Only treat 429 as a rate limit signal when it appears as a standalone token + if re.search(r"\b429\b", error_str): + return True + + _error_str_lower = error_str.lower() + + # Match "rate limit" (including variations like rate-limit / rate_limit) + if re.search(r"rate[\s_\-]*limit", _error_str_lower): return True - + ####################################### # Mistral API returns this error string ######################################### - if "service tier capacity exceeded" in error_str.lower(): + if "service tier capacity exceeded" in _error_str_lower: return True - + return False @staticmethod @@ -68,10 +77,38 @@ def is_error_str_context_window_exceeded(error_str: str) -> bool: "model's maximum context limit", "is longer than the model's context length", "input tokens exceed the configured limit", + "`inputs` tokens + `max_new_tokens` must be", + "exceeds the maximum number of tokens allowed", # Gemini ] for substring in known_exception_substrings: if substring in _error_str_lowercase: return True + + # Cerebras pattern: "Current length is X while limit is Y" + if ( + "current length is" in _error_str_lowercase + and "while limit is" in _error_str_lowercase + ): + return True + + return False + + @staticmethod + def is_azure_content_policy_violation_error(error_str: str) -> bool: + """ + Check if an error string indicates a content policy violation error. 
+ """ + known_exception_substrings = [ + "invalid_request_error", + "content_policy_violation", + "the response was filtered due to the prompt triggering azure openai's content management", + "your task failed as a result of our safety system", + "the model produced invalid content", + "content_filter_policy", + ] + for substring in known_exception_substrings: + if substring in error_str.lower(): + return True return False @@ -105,7 +142,14 @@ def get_error_message(error_obj) -> Optional[str]: if hasattr(error_obj, "body"): _error_obj_body = getattr(error_obj, "body") if isinstance(_error_obj_body, dict): - return _error_obj_body.get("message") + # OpenAI-style: {"message": "...", "type": "...", ...} + if _error_obj_body.get("message"): + return _error_obj_body.get("message") + + # Azure-style: {"error": {"message": "...", ...}} + nested_error = _error_obj_body.get("error") + if isinstance(nested_error, dict): + return nested_error.get("message") # If all else fails, return None return None @@ -136,9 +180,6 @@ def _get_response_headers(original_exception: Exception) -> Optional[httpx.Heade return _response_headers -import re - - def extract_and_raise_litellm_exception( response: Optional[Any], error_str: str, @@ -163,12 +204,22 @@ def extract_and_raise_litellm_exception( exception_name = exception_name.strip().replace("litellm.", "") raised_exception_obj = getattr(litellm, exception_name, None) if raised_exception_obj: - raise raised_exception_obj( - message=error_str, - llm_provider=custom_llm_provider, - model=model, - response=response, - ) + # Try with response parameter first, fall back to without it + # Some exceptions (e.g., APIConnectionError) don't accept response param + try: + raise raised_exception_obj( + message=error_str, + llm_provider=custom_llm_provider, + model=model, + response=response, + ) + except TypeError: + # Exception doesn't accept response parameter + raise raised_exception_obj( + message=error_str, + llm_provider=custom_llm_provider, + model=model, + ) def exception_type( # type: ignore # noqa: PLR0915 @@ -507,6 +558,15 @@ def exception_type( # type: ignore # noqa: PLR0915 response=getattr(original_exception, "response", None), litellm_debug_info=extra_information, ) + elif original_exception.status_code == 502: + exception_mapping_worked = True + raise BadGatewayError( + message=f"BadGatewayError: {exception_provider} - {message}", + model=model, + llm_provider=custom_llm_provider, + response=getattr(original_exception, "response", None), + litellm_debug_info=extra_information, + ) elif original_exception.status_code == 503: exception_mapping_worked = True raise ServiceUnavailableError( @@ -637,6 +697,15 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AnthropicException - {error_str}. Handle with `litellm.InternalServerError`.", llm_provider="anthropic", model=model, + response=getattr(original_exception, "response", None), + ) + elif original_exception.status_code == 502: + exception_mapping_worked = True + raise BadGatewayError( + message=f"AnthropicException BadGatewayError - {error_str}", + llm_provider="anthropic", + model=model, + response=getattr(original_exception, "response", None), ) elif original_exception.status_code == 503: exception_mapping_worked = True @@ -644,6 +713,15 @@ def exception_type( # type: ignore # noqa: PLR0915 message=f"AnthropicException - {error_str}. 
Handle with `litellm.ServiceUnavailableError`.", llm_provider="anthropic", model=model, + response=getattr(original_exception, "response", None), + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"AnthropicException Timeout - {error_str}", + model=model, + llm_provider="anthropic", + exception_status_code=original_exception.status_code, ) elif custom_llm_provider == "replicate": if "Incorrect authentication token" in error_str: @@ -1199,6 +1277,14 @@ def exception_type( # type: ignore # noqa: PLR0915 model=model, llm_provider=custom_llm_provider, ) + elif ExceptionCheckers.is_error_str_context_window_exceeded(error_str): + exception_mapping_worked = True + raise ContextWindowExceededError( + message=f"ContextWindowExceededError: {custom_llm_provider.capitalize()}Exception - {error_str}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) elif ( "None Unknown Error." in error_str or "Content has no parts." in error_str @@ -1260,6 +1346,7 @@ def exception_type( # type: ignore # noqa: PLR0915 elif ( "429 Quota exceeded" in error_str or "Quota exceeded for" in error_str + or "Resource exhausted" in error_str or "IndexError: list index out of range" in error_str or "429 Unable to submit request because the service is temporarily out of capacity." in error_str @@ -1964,6 +2051,20 @@ def exception_type( # type: ignore # noqa: PLR0915 else: message = str(original_exception) + # Azure OpenAI (especially Images) often nests error details under + # body["error"]. Detect content policy violations using the structured + # payload in addition to string matching. + azure_error_code: Optional[str] = None + try: + body_dict = getattr(original_exception, "body", None) or {} + if isinstance(body_dict, dict): + if isinstance(body_dict.get("error"), dict): + azure_error_code = body_dict["error"].get("code") # type: ignore[index] + else: + azure_error_code = body_dict.get("code") + except Exception: + azure_error_code = None + if "Internal server error" in error_str: exception_mapping_worked = True raise litellm.InternalServerError( @@ -1992,26 +2093,20 @@ def exception_type( # type: ignore # noqa: PLR0915 response=getattr(original_exception, "response", None), ) elif ( - ( - "invalid_request_error" in error_str - and "content_policy_violation" in error_str - ) - or ( - "The response was filtered due to the prompt triggering Azure OpenAI's content management" - in error_str - ) - or "Your task failed as a result of our safety system" in error_str - or "The model produced invalid content" in error_str - or "content_filter_policy" in error_str + azure_error_code == "content_policy_violation" + or ExceptionCheckers.is_azure_content_policy_violation_error(error_str) ): exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"litellm.ContentPolicyViolationError: AzureException - {message}", - llm_provider="azure", + from litellm.llms.azure.exception_mapping import ( + AzureOpenAIExceptionMapping, + ) + raise AzureOpenAIExceptionMapping.create_content_policy_violation_error( + message=message, model=model, - litellm_debug_info=extra_information, - response=getattr(original_exception, "response", None), + extra_information=extra_information, + original_exception=original_exception, ) + elif "invalid_request_error" in error_str: exception_mapping_worked = True raise BadRequestError( @@ -2089,6 +2184,15 @@ def exception_type( # type: ignore # noqa: PLR0915 
litellm_debug_info=extra_information, response=getattr(original_exception, "response", None), ) + elif original_exception.status_code == 502: + exception_mapping_worked = True + raise BadGatewayError( + message=f"AzureException BadGatewayError - {message}", + model=model, + llm_provider="azure", + litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), + ) elif original_exception.status_code == 503: exception_mapping_worked = True raise ServiceUnavailableError( diff --git a/litellm/litellm_core_utils/fallback_utils.py b/litellm/litellm_core_utils/fallback_utils.py index 7ce53862089d..aa5bdd927130 100644 --- a/litellm/litellm_core_utils/fallback_utils.py +++ b/litellm/litellm_core_utils/fallback_utils.py @@ -3,7 +3,7 @@ import litellm from litellm._logging import verbose_logger -from litellm.litellm_core_utils.core_helpers import safe_deep_copy +from litellm.litellm_core_utils.core_helpers import safe_deep_copy, filter_internal_params from .asyncify import run_async_function @@ -49,6 +49,9 @@ async def async_completion_with_fallbacks(**kwargs): else: model = fallback + # Filter out internal parameters that shouldn't be sent to provider APIs + completion_kwargs = filter_internal_params(completion_kwargs) + response = await litellm.acompletion( **completion_kwargs, model=model, diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py index c167c202e5d2..060e98fd49fc 100644 --- a/litellm/litellm_core_utils/get_litellm_params.py +++ b/litellm/litellm_core_utils/get_litellm_params.py @@ -42,6 +42,7 @@ def get_litellm_params( input_cost_per_token=None, output_cost_per_token=None, output_cost_per_second=None, + cost_per_query=None, cooldown_time=None, text_completion=None, azure_ad_token_provider=None, @@ -87,12 +88,17 @@ def get_litellm_params( "input_cost_per_second": input_cost_per_second, "output_cost_per_token": output_cost_per_token, "output_cost_per_second": output_cost_per_second, + "cost_per_query": cost_per_query, "cooldown_time": cooldown_time, "text_completion": text_completion, "azure_ad_token_provider": azure_ad_token_provider, "user_continue_message": user_continue_message, "base_model": base_model - or _get_base_model_from_litellm_call_metadata(metadata=metadata), + or ( + _get_base_model_from_litellm_call_metadata(metadata=metadata) + if metadata + else None + ), "litellm_trace_id": litellm_trace_id, "litellm_session_id": litellm_session_id, "hf_model_name": hf_model_name, @@ -118,7 +124,25 @@ def get_litellm_params( "bucket_name": kwargs.get("bucket_name"), "vertex_credentials": kwargs.get("vertex_credentials"), "vertex_project": kwargs.get("vertex_project"), + "vertex_location": kwargs.get("vertex_location"), + "vertex_ai_project": kwargs.get("vertex_ai_project"), + "vertex_ai_location": kwargs.get("vertex_ai_location"), + "vertex_ai_credentials": kwargs.get("vertex_ai_credentials"), "use_litellm_proxy": use_litellm_proxy, "litellm_request_debug": litellm_request_debug, + "aws_region_name": kwargs.get("aws_region_name"), + # AWS credentials for Bedrock/Sagemaker + "aws_access_key_id": kwargs.get("aws_access_key_id"), + "aws_secret_access_key": kwargs.get("aws_secret_access_key"), + "aws_session_token": kwargs.get("aws_session_token"), + "aws_session_name": kwargs.get("aws_session_name"), + "aws_profile_name": kwargs.get("aws_profile_name"), + "aws_role_name": kwargs.get("aws_role_name"), + "aws_web_identity_token": kwargs.get("aws_web_identity_token"), + "aws_sts_endpoint": 
kwargs.get("aws_sts_endpoint"), + "aws_external_id": kwargs.get("aws_external_id"), + "aws_bedrock_runtime_endpoint": kwargs.get("aws_bedrock_runtime_endpoint"), + "tpm": kwargs.get("tpm"), + "rpm": kwargs.get("rpm"), } return litellm_params diff --git a/litellm/litellm_core_utils/get_llm_provider_logic.py b/litellm/litellm_core_utils/get_llm_provider_logic.py index fb25c5ed8403..718773a1b162 100644 --- a/litellm/litellm_core_utils/get_llm_provider_logic.py +++ b/litellm/litellm_core_utils/get_llm_provider_logic.py @@ -1,9 +1,8 @@ from typing import Optional, Tuple -import httpx - import litellm from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH +from litellm.llms.openai_like.json_loader import JSONProviderRegistry from litellm.secret_managers.main import get_secret, get_secret_str from ..types.router import LiteLLM_Params @@ -22,6 +21,18 @@ def _is_non_openai_azure_model(model: str) -> bool: return False +def _is_azure_claude_model(model: str) -> bool: + """ + Check if a model name contains 'claude' (case-insensitive). + Used to detect Claude models that need Anthropic-specific handling. + """ + try: + model_lower = model.lower() + return "claude" in model_lower or model_lower.startswith("claude") + except Exception: + return False + + def handle_cohere_chat_model_custom_llm_provider( model: str, custom_llm_provider: Optional[str] = None ) -> Tuple[str, Optional[str]]: @@ -143,6 +154,17 @@ def get_llm_provider( # noqa: PLR0915 if api_key and api_key.startswith("os.environ/"): dynamic_api_key = get_secret_str(api_key) + + # Check JSON-configured providers FIRST (before enum-based provider_list) + provider_prefix = model.split("/", 1)[0] + if len(model.split("/")) > 1 and JSONProviderRegistry.exists(provider_prefix): + return _get_openai_compatible_provider_info( + model=model, + api_base=api_base, + api_key=api_key, + dynamic_api_key=dynamic_api_key, + ) + # check if llm provider part of model name if ( @@ -205,10 +227,10 @@ def get_llm_provider( # noqa: PLR0915 elif endpoint == "https://api.ai21.com/studio/v1": custom_llm_provider = "ai21_chat" dynamic_api_key = get_secret_str("AI21_API_KEY") - elif endpoint == "https://codestral.mistral.ai/v1": + elif endpoint == "codestral.mistral.ai/v1/chat/completions": custom_llm_provider = "codestral" dynamic_api_key = get_secret_str("CODESTRAL_API_KEY") - elif endpoint == "https://codestral.mistral.ai/v1": + elif endpoint == "codestral.mistral.ai/v1/fim/completions": custom_llm_provider = "text-completion-codestral" dynamic_api_key = get_secret_str("CODESTRAL_API_KEY") elif endpoint == "app.empower.dev/api/v1": @@ -217,6 +239,9 @@ def get_llm_provider( # noqa: PLR0915 elif endpoint == "api.deepseek.com/v1": custom_llm_provider = "deepseek" dynamic_api_key = get_secret_str("DEEPSEEK_API_KEY") + elif endpoint == "ollama.com": + custom_llm_provider = "ollama" + dynamic_api_key = get_secret_str("OLLAMA_API_KEY") elif endpoint == "https://api.friendli.ai/serverless/v1": custom_llm_provider = "friendliai" dynamic_api_key = get_secret_str( @@ -240,6 +265,30 @@ def get_llm_provider( # noqa: PLR0915 elif endpoint == "api.moonshot.ai/v1": custom_llm_provider = "moonshot" dynamic_api_key = get_secret_str("MOONSHOT_API_KEY") + elif endpoint == "api.minimax.io/anthropic" or endpoint == "api.minimaxi.com/anthropic": + custom_llm_provider = "minimax" + dynamic_api_key = get_secret_str("MINIMAX_API_KEY") + elif endpoint == "api.minimax.io/v1" or endpoint == "api.minimaxi.com/v1": + custom_llm_provider = "minimax" + dynamic_api_key = 
get_secret_str("MINIMAX_API_KEY") + elif endpoint == "platform.publicai.co/v1": + custom_llm_provider = "publicai" + dynamic_api_key = get_secret_str("PUBLICAI_API_KEY") + elif endpoint == "https://api.synthetic.new/openai/v1": + custom_llm_provider = "synthetic" + dynamic_api_key = get_secret_str("SYNTHETIC_API_KEY") + elif endpoint == "https://api.stima.tech/v1": + custom_llm_provider = "apertis" + dynamic_api_key = get_secret_str("STIMA_API_KEY") + elif endpoint == "https://nano-gpt.com/api/v1": + custom_llm_provider = "nano-gpt" + dynamic_api_key = get_secret_str("NANOGPT_API_KEY") + elif endpoint == "https://api.poe.com/v1": + custom_llm_provider = "poe" + dynamic_api_key = get_secret_str("POE_API_KEY") + elif endpoint == "https://llm.chutes.ai/v1/": + custom_llm_provider = "chutes" + dynamic_api_key = get_secret_str("CHUTES_API_KEY") elif endpoint == "https://api.v0.dev/v1": custom_llm_provider = "v0" dynamic_api_key = get_secret_str("V0_API_KEY") @@ -386,6 +435,10 @@ def get_llm_provider( # noqa: PLR0915 custom_llm_provider = "lemonade" elif model.startswith("clarifai/"): custom_llm_provider = "clarifai" + elif model.startswith("amazon_nova"): + custom_llm_provider = "amazon_nova" + elif model.startswith("sap/"): + custom_llm_provider = "sap" if not custom_llm_provider: if litellm.suppress_debug_info is False: print() # noqa @@ -398,11 +451,7 @@ def get_llm_provider( # noqa: PLR0915 raise litellm.exceptions.BadRequestError( # type: ignore message=error_str, model=model, - response=httpx.Response( - status_code=400, - content=error_str, - request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore - ), + response=None, llm_provider="", ) if api_base is not None and not isinstance(api_base, str): @@ -426,11 +475,7 @@ def get_llm_provider( # noqa: PLR0915 raise litellm.exceptions.BadRequestError( # type: ignore message=f"GetLLMProvider Exception - {str(e)}\n\noriginal model: {model}", model=model, - response=httpx.Response( - status_code=400, - content=error_str, - request=httpx.Request(method="completion", url="https://github.com/BerriAI/litellm"), # type: ignore - ), + response=None, llm_provider="", ) @@ -453,6 +498,20 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 custom_llm_provider = model.split("/", 1)[0] model = model.split("/", 1)[1] + # Check JSON providers FIRST (before hardcoded ones) + from litellm.llms.openai_like.dynamic_config import create_config_class + from litellm.llms.openai_like.json_loader import JSONProviderRegistry + + if JSONProviderRegistry.exists(custom_llm_provider): + provider_config = JSONProviderRegistry.get(custom_llm_provider) + if provider_config is None: + raise ValueError(f"Provider {custom_llm_provider} not found") + config_class = create_config_class(provider_config) + api_base, dynamic_api_key = config_class()._get_openai_compatible_provider_info( + api_base, api_key + ) + return model, custom_llm_provider, dynamic_api_key, api_base + if custom_llm_provider == "perplexity": # perplexity is openai compatible, we just need to set this to custom_openai and have the api_base be https://api.perplexity.ai ( @@ -529,6 +588,13 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 or "https://api.studio.nebius.ai/v1" ) # type: ignore dynamic_api_key = api_key or get_secret_str("NEBIUS_API_KEY") + elif custom_llm_provider == "ollama": + api_base = ( + api_base + or get_secret("OLLAMA_API_BASE") + or "http://localhost:11434" + ) # type: ignore + dynamic_api_key = api_key or 
get_secret_str("OLLAMA_API_KEY") elif (custom_llm_provider == "ai21_chat") or ( custom_llm_provider == "ai21" and model in litellm.ai21_chat_models ): @@ -647,6 +713,13 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 ) = litellm.XAIChatConfig()._get_openai_compatible_provider_info( api_base, api_key ) + elif custom_llm_provider == "zai": + ( + api_base, + dynamic_api_key, + ) = litellm.ZAIChatConfig()._get_openai_compatible_provider_info( + api_base, api_key + ) elif custom_llm_provider == "together_ai": api_base = ( api_base @@ -685,6 +758,14 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 ) = litellm.GithubCopilotConfig()._get_openai_compatible_provider_info( model, api_base, api_key, custom_llm_provider ) + elif custom_llm_provider == "chatgpt": + ( + api_base, + dynamic_api_key, + custom_llm_provider, + ) = litellm.ChatGPTConfig()._get_openai_compatible_provider_info( + model, api_base, api_key, custom_llm_provider + ) elif custom_llm_provider == "novita": api_base = ( api_base @@ -693,12 +774,12 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 ) # type: ignore dynamic_api_key = api_key or get_secret_str("NOVITA_API_KEY") elif custom_llm_provider == "snowflake": - api_base = ( - api_base - or get_secret_str("SNOWFLAKE_API_BASE") - or f"https://{get_secret('SNOWFLAKE_ACCOUNT_ID')}.snowflakecomputing.com/api/v2/cortex/inference:complete" - ) # type: ignore - dynamic_api_key = api_key or get_secret_str("SNOWFLAKE_JWT") + ( + api_base, + dynamic_api_key, + ) = litellm.SnowflakeConfig()._get_openai_compatible_provider_info( + api_base, api_key + ) elif custom_llm_provider == "gradient_ai": ( api_base, @@ -741,6 +822,14 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 ) = litellm.MoonshotChatConfig()._get_openai_compatible_provider_info( api_base, api_key ) + # publicai is now handled by JSON config (see litellm/llms/openai_like/providers.json) + elif custom_llm_provider == "docker_model_runner": + ( + api_base, + dynamic_api_key, + ) = litellm.DockerModelRunnerChatConfig()._get_openai_compatible_provider_info( + api_base, api_key + ) elif custom_llm_provider == "v0": ( api_base, @@ -804,6 +893,32 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915 ) = litellm.ClarifaiConfig()._get_openai_compatible_provider_info( api_base, api_key ) + elif custom_llm_provider == "ragflow": + full_model = f"ragflow/{model}" + ( + api_base, + dynamic_api_key, + _, + ) = litellm.RAGFlowConfig()._get_openai_compatible_provider_info( + full_model, api_base, api_key, "ragflow" + ) + model = full_model + elif custom_llm_provider == "langgraph": + # LangGraph is a custom provider, just need to set api_base + api_base = ( + api_base + or get_secret_str("LANGGRAPH_API_BASE") + or "http://localhost:2024" + ) + dynamic_api_key = api_key or get_secret_str("LANGGRAPH_API_KEY") + elif custom_llm_provider == "manus": + # Manus is OpenAI compatible for responses API + api_base = ( + api_base + or get_secret_str("MANUS_API_BASE") + or "https://api.manus.im" + ) + dynamic_api_key = api_key or get_secret_str("MANUS_API_KEY") if api_base is not None and not isinstance(api_base, str): raise Exception("api base needs to be a string. 
api_base={}".format(api_base)) diff --git a/litellm/litellm_core_utils/get_model_cost_map.py b/litellm/litellm_core_utils/get_model_cost_map.py index b6a3a243c46c..9b86f4ca2f08 100644 --- a/litellm/litellm_core_utils/get_model_cost_map.py +++ b/litellm/litellm_core_utils/get_model_cost_map.py @@ -18,14 +18,15 @@ def get_model_cost_map(url: str) -> dict: os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) or os.getenv("LITELLM_LOCAL_MODEL_COST_MAP", False) == "True" ): - import importlib.resources + from importlib.resources import files import json - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content + content = json.loads( + files("litellm") + .joinpath("model_prices_and_context_window_backup.json") + .read_text(encoding="utf-8") + ) + return content try: response = httpx.get( @@ -35,11 +36,12 @@ def get_model_cost_map(url: str) -> dict: content = response.json() return content except Exception: - import importlib.resources + from importlib.resources import files import json - with importlib.resources.open_text( - "litellm", "model_prices_and_context_window_backup.json" - ) as f: - content = json.load(f) - return content + content = json.loads( + files("litellm") + .joinpath("model_prices_and_context_window_backup.json") + .read_text(encoding="utf-8") + ) + return content diff --git a/litellm/litellm_core_utils/get_supported_openai_params.py b/litellm/litellm_core_utils/get_supported_openai_params.py index 06e650f938dc..4b40f44cbc4a 100644 --- a/litellm/litellm_core_utils/get_supported_openai_params.py +++ b/litellm/litellm_core_utils/get_supported_openai_params.py @@ -116,6 +116,11 @@ def get_supported_openai_params( # noqa: PLR0915 f"Unsupported provider config: {transcription_provider_config} for model: {model}" ) return litellm.OpenAIConfig().get_supported_openai_params(model=model) + elif custom_llm_provider == "sap": + if request_type == "chat_completion": + return litellm.GenAIHubOrchestrationConfig().get_supported_openai_params(model=model) + elif request_type == "embeddings": + return litellm.GenAIHubEmbeddingConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "azure": if litellm.AzureOpenAIO1Config().is_o_series_model(model=model): return litellm.AzureOpenAIO1Config().get_supported_openai_params( @@ -266,6 +271,15 @@ def get_supported_openai_params( # noqa: PLR0915 model=model ) ) + elif custom_llm_provider == "ovhcloud": + if request_type == "transcription": + from litellm.llms.ovhcloud.audio_transcription.transformation import ( + OVHCloudAudioTranscriptionConfig, + ) + + return OVHCloudAudioTranscriptionConfig().get_supported_openai_params( + model=model + ) elif custom_llm_provider == "elevenlabs": if request_type == "transcription": from litellm.llms.elevenlabs.audio_transcription.transformation import ( diff --git a/litellm/litellm_core_utils/health_check_helpers.py b/litellm/litellm_core_utils/health_check_helpers.py index 9cbee7fc70d1..cc3916af0693 100644 --- a/litellm/litellm_core_utils/health_check_helpers.py +++ b/litellm/litellm_core_utils/health_check_helpers.py @@ -97,6 +97,7 @@ def get_mode_handlers( "audio_speech", "audio_transcription", "image_generation", + "video_generation", "rerank", "realtime", "batch", @@ -159,6 +160,10 @@ def get_mode_handlers( **_filter_model_params(model_params=model_params), prompt=prompt, ), + "video_generation": lambda: litellm.avideo_generation( + **_filter_model_params(model_params=model_params), + prompt=prompt or 
"test video generation", + ), "rerank": lambda: litellm.arerank( **_filter_model_params(model_params=model_params), query=prompt or "", diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index c9d766628a91..4ad2d1002bce 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -59,6 +59,7 @@ from litellm.integrations.deepeval.deepeval import DeepEvalLogger from litellm.integrations.mlflow import MlflowLogger from litellm.integrations.sqs import SQSLogger +from litellm.litellm_core_utils.core_helpers import reconstruct_model_name from litellm.litellm_core_utils.get_litellm_params import get_litellm_params from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import ( StandardBuiltInToolCostTracking, @@ -68,7 +69,11 @@ redact_message_input_output_from_custom_logger, redact_message_input_output_from_logging, ) +from litellm.llms.base_llm.ocr.transformation import OCRResponse +from litellm.llms.base_llm.search.transformation import SearchResponse from litellm.responses.utils import ResponseAPILoggingUtils +from litellm.types.agents import LiteLLMSendMessageResponse +from litellm.types.containers.main import ContainerObject from litellm.types.llms.openai import ( AllMessageValues, Batch, @@ -76,10 +81,12 @@ HttpxBinaryResponseContent, OpenAIFileObject, OpenAIModerationResponse, + ResponseAPIUsage, ResponseCompletedEvent, ResponsesAPIResponse, ) from litellm.types.mcp import MCPPostCallResponseObject +from litellm.types.prompts.init_prompts import PromptSpec from litellm.types.rerank import RerankResponse from litellm.types.utils import ( CachingDetails, @@ -117,11 +124,11 @@ ) from litellm.types.videos.main import VideoObject from litellm.utils import _get_base_model_from_metadata, executor, print_verbose -from litellm.llms.base_llm.ocr.transformation import OCRResponse from ..integrations.argilla import ArgillaLogger from ..integrations.arize.arize_phoenix import ArizePhoenixLogger from ..integrations.athina import AthinaLogger +from ..integrations.azure_sentinel.azure_sentinel import AzureSentinelLogger from ..integrations.azure_storage.azure_storage import AzureBlobStorageLogger from ..integrations.custom_prompt_management import CustomPromptManagement from ..integrations.datadog.datadog import DataDogLogger @@ -162,23 +169,24 @@ from litellm_enterprise.enterprise_callbacks.callback_controls import ( EnterpriseCallbackControls, ) - from litellm_enterprise.enterprise_callbacks.generic_api_callback import ( - GenericAPILogger, - ) from litellm_enterprise.enterprise_callbacks.pagerduty.pagerduty import ( PagerDutyAlerting, ) from litellm_enterprise.enterprise_callbacks.send_emails.resend_email import ( ResendEmailLogger, ) + from litellm_enterprise.enterprise_callbacks.send_emails.sendgrid_email import ( + SendGridEmailLogger, + ) from litellm_enterprise.enterprise_callbacks.send_emails.smtp_email import ( SMTPEmailLogger, ) - from litellm_enterprise.integrations.prometheus import PrometheusLogger from litellm_enterprise.litellm_core_utils.litellm_logging import ( StandardLoggingPayloadSetup as EnterpriseStandardLoggingPayloadSetup, ) + from litellm.integrations.generic_api.generic_api_callback import GenericAPILogger + EnterpriseStandardLoggingPayloadSetupVAR: Optional[ Type[EnterpriseStandardLoggingPayloadSetup] ] = EnterpriseStandardLoggingPayloadSetup @@ -188,15 +196,20 @@ ) GenericAPILogger = CustomLogger # type: ignore ResendEmailLogger = CustomLogger # type: 
ignore + SendGridEmailLogger = CustomLogger # type: ignore SMTPEmailLogger = CustomLogger # type: ignore PagerDutyAlerting = CustomLogger # type: ignore EnterpriseCallbackControls = None # type: ignore EnterpriseStandardLoggingPayloadSetupVAR = None - PrometheusLogger = None _in_memory_loggers: List[Any] = [] ### GLOBAL VARIABLES ### +# Cache custom pricing keys as frozenset for O(1) lookups instead of looping through 49 keys +_CUSTOM_PRICING_KEYS: frozenset = frozenset( + CustomPricingLiteLLMParams.model_fields.keys() +) + sentry_sdk_instance = None capture_exception = None add_breadcrumb = None @@ -246,6 +259,24 @@ def set_cache(self, litellm_call_id: str, service_name: str, trace_id: str) -> N in_memory_trace_id_cache = ServiceTraceIDCache() in_memory_dynamic_logger_cache = DynamicLoggingCache() +# Cached lazy import for PrometheusLogger +# Module-level cache to avoid repeated imports while preserving memory benefits +_PrometheusLogger = None + + +def _get_cached_prometheus_logger(): + """ + Get cached PrometheusLogger class. + Lazy imports on first call to avoid loading prometheus.py and utils.py at import time (60MB saved). + Subsequent calls use cached class for better performance. + """ + global _PrometheusLogger + if _PrometheusLogger is None: + from litellm.integrations.prometheus import PrometheusLogger + + _PrometheusLogger = PrometheusLogger + return _PrometheusLogger + class Logging(LiteLLMLoggingBaseClass): global supabaseClient, promptLayerLogger, weightsBiasesLogger, logfireLogger, capture_exception, add_breadcrumb, lunaryLogger, logfireLogger, prometheusLogger, slack_app @@ -297,18 +328,21 @@ def __init__( for m in messages: new_messages.append({"role": "user", "content": m}) messages = new_messages + self.model = model - self.messages = copy.deepcopy(messages) + self.messages = copy.deepcopy(messages) if messages is not None else None self.stream = stream self.start_time = start_time # log the call start time self.call_type = call_type self.litellm_call_id = litellm_call_id - self.litellm_trace_id: str = litellm_trace_id or str(uuid.uuid4()) + self.litellm_trace_id: str = ( + litellm_trace_id if litellm_trace_id else str(uuid.uuid4()) + ) self.function_id = function_id self.streaming_chunks: List[Any] = [] # for generating complete stream response - self.sync_streaming_chunks: List[Any] = ( - [] - ) # for generating complete stream response + self.sync_streaming_chunks: List[ + Any + ] = [] # for generating complete stream response self.log_raw_request_response = log_raw_request_response # Initialize dynamic callbacks @@ -356,6 +390,9 @@ def __init__( # Init Caching related details self.caching_details: Optional[CachingDetails] = None + # Passthrough endpoint guardrails config for field targeting + self.passthrough_guardrails_config: Optional[Dict[str, Any]] = None + self.model_call_details: Dict[str, Any] = { "litellm_trace_id": litellm_trace_id, "litellm_call_id": litellm_call_id, @@ -509,10 +546,11 @@ def update_environment_variables( if "stream_options" in additional_params: self.stream_options = additional_params["stream_options"] ## check if custom pricing set ## - custom_pricing_keys = CustomPricingLiteLLMParams.model_fields.keys() - for key in custom_pricing_keys: - if litellm_params.get(key) is not None: - self.custom_pricing = True + if any( + litellm_params.get(key) is not None + for key in _CUSTOM_PRICING_KEYS & litellm_params.keys() + ): + self.custom_pricing = True if "custom_llm_provider" in self.model_call_details: self.custom_llm_provider = 
self.model_call_details["custom_llm_provider"] @@ -575,8 +613,9 @@ def get_chat_completion_prompt( model: str, messages: List[AllMessageValues], non_default_params: Dict, - prompt_id: Optional[str], prompt_variables: Optional[dict], + prompt_id: Optional[str] = None, + prompt_spec: Optional[PromptSpec] = None, prompt_management_logger: Optional[CustomLogger] = None, prompt_label: Optional[str] = None, prompt_version: Optional[int] = None, @@ -584,7 +623,11 @@ def get_chat_completion_prompt( custom_logger = ( prompt_management_logger or self.get_custom_logger_for_prompt_management( - model=model, non_default_params=non_default_params + model=model, + non_default_params=non_default_params, + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=self.standard_callback_dynamic_params, ) ) @@ -598,6 +641,7 @@ def get_chat_completion_prompt( messages=messages, non_default_params=non_default_params or {}, prompt_id=prompt_id, + prompt_spec=prompt_spec, prompt_variables=prompt_variables, dynamic_callback_params=self.standard_callback_dynamic_params, prompt_label=prompt_label, @@ -611,8 +655,9 @@ async def async_get_chat_completion_prompt( model: str, messages: List[AllMessageValues], non_default_params: Dict, - prompt_id: Optional[str], prompt_variables: Optional[dict], + prompt_id: Optional[str] = None, + prompt_spec: Optional[PromptSpec] = None, prompt_management_logger: Optional[CustomLogger] = None, tools: Optional[List[Dict]] = None, prompt_label: Optional[str] = None, @@ -621,7 +666,12 @@ async def async_get_chat_completion_prompt( custom_logger = ( prompt_management_logger or self.get_custom_logger_for_prompt_management( - model=model, tools=tools, non_default_params=non_default_params + model=model, + tools=tools, + non_default_params=non_default_params, + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=self.standard_callback_dynamic_params, ) ) @@ -635,6 +685,7 @@ async def async_get_chat_completion_prompt( messages=messages, non_default_params=non_default_params or {}, prompt_id=prompt_id, + prompt_spec=prompt_spec, prompt_variables=prompt_variables, dynamic_callback_params=self.standard_callback_dynamic_params, litellm_logging_obj=self, @@ -645,19 +696,72 @@ async def async_get_chat_completion_prompt( self.messages = messages return model, messages, non_default_params + def _auto_detect_prompt_management_logger( + self, + prompt_id: str, + prompt_spec: Optional[PromptSpec], + dynamic_callback_params: StandardCallbackDynamicParams, + ) -> Optional[CustomLogger]: + """ + Auto-detect which prompt management system owns the given prompt_id. + + This allows a user to just pass prompt_id in the completion call and it will be auto-detected which system owns this prompt. 
+ + Args: + prompt_id: The prompt ID to check + dynamic_callback_params: Dynamic callback parameters for should_run_prompt_management checks + + Returns: + A CustomLogger instance if a matching prompt management system is found, None otherwise + """ + prompt_management_loggers = ( + litellm.logging_callback_manager.get_custom_loggers_for_type( + callback_type=CustomPromptManagement + ) + ) + + for logger in prompt_management_loggers: + if isinstance(logger, CustomPromptManagement): + try: + if logger.should_run_prompt_management( + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=dynamic_callback_params, + ): + self.model_call_details[ + "prompt_integration" + ] = logger.__class__.__name__ + return logger + except Exception: + # If check fails, continue to next logger + continue + + return None + def get_custom_logger_for_prompt_management( - self, model: str, non_default_params: Dict, tools: Optional[List[Dict]] = None + self, + model: str, + non_default_params: Dict, + tools: Optional[List[Dict]] = None, + prompt_id: Optional[str] = None, + prompt_spec: Optional[PromptSpec] = None, + dynamic_callback_params: Optional[StandardCallbackDynamicParams] = None, ) -> Optional[CustomLogger]: """ Get a custom logger for prompt management based on model name or available callbacks. Args: model: The model name to check for prompt management integration + non_default_params: Non-default parameters passed to the completion call + tools: Optional tools passed to the completion call + prompt_id: Optional prompt ID to auto-detect which system owns this prompt + dynamic_callback_params: Dynamic callback parameters for should_run_prompt_management checks Returns: A CustomLogger instance if one is found, None otherwise """ # First check if model starts with a known custom logger compatible callback + # This takes precedence for backward compatibility for callback_name in litellm._known_custom_logger_compatible_callbacks: if model.startswith(callback_name): custom_logger = _init_custom_logger_compatible_class( @@ -669,7 +773,17 @@ def get_custom_logger_for_prompt_management( self.model_call_details["prompt_integration"] = model.split("/")[0] return custom_logger - # Then check for any registered CustomPromptManagement loggers + # If prompt_id is provided, try to auto-detect which system has this prompt + if prompt_id and dynamic_callback_params is not None: + auto_detected_logger = self._auto_detect_prompt_management_logger( + prompt_id=prompt_id, + prompt_spec=prompt_spec, + dynamic_callback_params=dynamic_callback_params, + ) + if auto_detected_logger is not None: + return auto_detected_logger + + # Then check for any registered CustomPromptManagement loggers (fallback) prompt_management_loggers = ( litellm.logging_callback_manager.get_custom_loggers_for_type( callback_type=CustomPromptManagement @@ -684,9 +798,9 @@ def get_custom_logger_for_prompt_management( if anthropic_cache_control_logger := AnthropicCacheControlHook.get_custom_logger_for_anthropic_cache_control_hook( non_default_params ): - self.model_call_details["prompt_integration"] = ( - anthropic_cache_control_logger.__class__.__name__ - ) + self.model_call_details[ + "prompt_integration" + ] = anthropic_cache_control_logger.__class__.__name__ return anthropic_cache_control_logger ######################################################### @@ -698,9 +812,9 @@ def get_custom_logger_for_prompt_management( internal_usage_cache=None, llm_router=None, ) - self.model_call_details["prompt_integration"] = ( - 
vector_store_custom_logger.__class__.__name__ - ) + self.model_call_details[ + "prompt_integration" + ] = vector_store_custom_logger.__class__.__name__ # Add to global callbacks so post-call hooks are invoked if ( vector_store_custom_logger @@ -760,9 +874,9 @@ def _pre_call(self, input, api_key, model=None, additional_args={}): model ): # if model name was changes pre-call, overwrite the initial model call name with the new one self.model_call_details["model"] = model - self.model_call_details["litellm_params"]["api_base"] = ( - self._get_masked_api_base(additional_args.get("api_base", "")) - ) + self.model_call_details["litellm_params"][ + "api_base" + ] = self._get_masked_api_base(additional_args.get("api_base", "")) def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR0915 # Log the exact input to the LLM API @@ -791,10 +905,10 @@ def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR try: # [Non-blocking Extra Debug Information in metadata] if turn_off_message_logging is True: - _metadata["raw_request"] = ( - "redacted by litellm. \ + _metadata[ + "raw_request" + ] = "redacted by litellm. \ 'litellm.turn_off_message_logging=True'" - ) else: curl_command = self._get_request_curl_command( api_base=additional_args.get("api_base", ""), @@ -805,32 +919,34 @@ def pre_call(self, input, api_key, model=None, additional_args={}): # noqa: PLR _metadata["raw_request"] = str(curl_command) # split up, so it's easier to parse in the UI - self.model_call_details["raw_request_typed_dict"] = ( - RawRequestTypedDict( - raw_request_api_base=str( - additional_args.get("api_base") or "" - ), - raw_request_body=self._get_raw_request_body( - additional_args.get("complete_input_dict", {}) - ), - raw_request_headers=self._get_masked_headers( - additional_args.get("headers", {}) or {}, - ignore_sensitive_headers=True, - ), - error=None, - ) + self.model_call_details[ + "raw_request_typed_dict" + ] = RawRequestTypedDict( + raw_request_api_base=str( + additional_args.get("api_base") or "" + ), + raw_request_body=self._get_raw_request_body( + additional_args.get("complete_input_dict", {}) + ), + # NOTE: setting ignore_sensitive_headers to True will cause + # the Authorization header to be leaked when calls to the health + # endpoint are made and fail. 
+ raw_request_headers=self._get_masked_headers( + additional_args.get("headers", {}) or {}, + ), + error=None, ) except Exception as e: - self.model_call_details["raw_request_typed_dict"] = ( - RawRequestTypedDict( - error=str(e), - ) + self.model_call_details[ + "raw_request_typed_dict" + ] = RawRequestTypedDict( + error=str(e), ) - _metadata["raw_request"] = ( - "Unable to Log \ + _metadata[ + "raw_request" + ] = "Unable to Log \ raw request: {}".format( - str(e) - ) + str(e) ) if getattr(self, "logger_fn", None) and callable(self.logger_fn): try: @@ -1131,13 +1247,13 @@ async def async_post_mcp_tool_call_hook( for callback in callbacks: try: if isinstance(callback, CustomLogger): - response: Optional[MCPPostCallResponseObject] = ( - await callback.async_post_mcp_tool_call_hook( - kwargs=kwargs, - response_obj=post_mcp_tool_call_response_obj, - start_time=start_time, - end_time=end_time, - ) + response: Optional[ + MCPPostCallResponseObject + ] = await callback.async_post_mcp_tool_call_hook( + kwargs=kwargs, + response_obj=post_mcp_tool_call_response_obj, + start_time=start_time, + end_time=end_time, ) ###################################################################### # if any of the callbacks modify the response, use the modified response @@ -1181,9 +1297,13 @@ def set_cost_breakdown( output_cost: float, total_cost: float, cost_for_built_in_tools_cost_usd_dollar: float, + additional_costs: Optional[dict] = None, original_cost: Optional[float] = None, discount_percent: Optional[float] = None, discount_amount: Optional[float] = None, + margin_percent: Optional[float] = None, + margin_fixed_amount: Optional[float] = None, + margin_total_amount: Optional[float] = None, ) -> None: """ Helper method to store cost breakdown in the logging object. @@ -1193,9 +1313,13 @@ def set_cost_breakdown( output_cost: Cost of output/completion tokens cost_for_built_in_tools_cost_usd_dollar: Cost of built-in tools total_cost: Total cost of request + additional_costs: Free-form additional costs dict (e.g., {"azure_model_router_flat_cost": 0.00014}) original_cost: Cost before discount discount_percent: Discount percentage (0.05 = 5%) discount_amount: Discount amount in USD + margin_percent: Margin percentage applied (0.10 = 10%) + margin_fixed_amount: Fixed margin amount in USD + margin_total_amount: Total margin added in USD """ self.cost_breakdown = CostBreakdown( @@ -1205,6 +1329,10 @@ def set_cost_breakdown( tool_usage_cost=cost_for_built_in_tools_cost_usd_dollar, ) + # Store additional costs if provided (free-form dict for extensibility) + if additional_costs and isinstance(additional_costs, dict) and len(additional_costs) > 0: + self.cost_breakdown["additional_costs"] = additional_costs + # Store discount information if provided if original_cost is not None: self.cost_breakdown["original_cost"] = original_cost @@ -1213,6 +1341,14 @@ def set_cost_breakdown( if discount_amount is not None: self.cost_breakdown["discount_amount"] = discount_amount + # Store margin information if provided + if margin_percent is not None: + self.cost_breakdown["margin_percent"] = margin_percent + if margin_fixed_amount is not None: + self.cost_breakdown["margin_fixed_amount"] = margin_fixed_amount + if margin_total_amount is not None: + self.cost_breakdown["margin_total_amount"] = margin_total_amount + def _response_cost_calculator( self, result: Union[ @@ -1231,6 +1367,7 @@ def _response_cost_calculator( OpenAIFileObject, LiteLLMRealtimeStreamLoggingObject, OpenAIModerationResponse, + "SearchResponse", ], cache_hit: 
Optional[bool] = None, litellm_model_name: Optional[str] = None, @@ -1241,6 +1378,7 @@ def _response_cost_calculator( used for consistent cost calculation across response headers + logging integrations. """ + if isinstance(result, BaseModel) and hasattr(result, "_hidden_params"): hidden_params = getattr(result, "_hidden_params", {}) if ( @@ -1300,16 +1438,16 @@ def _response_cost_calculator( verbose_logger.debug( f"response_cost_failure_debug_information: {debug_info}" ) - self.model_call_details["response_cost_failure_debug_information"] = ( - debug_info - ) + self.model_call_details[ + "response_cost_failure_debug_information" + ] = debug_info return None try: - response_cost = litellm.response_cost_calculator( **response_cost_calculator_kwargs ) + verbose_logger.debug(f"response_cost: {response_cost}") return response_cost except Exception as e: # error calculating cost @@ -1328,9 +1466,9 @@ def _response_cost_calculator( verbose_logger.debug( f"response_cost_failure_debug_information: {debug_info}" ) - self.model_call_details["response_cost_failure_debug_information"] = ( - debug_info - ) + self.model_call_details[ + "response_cost_failure_debug_information" + ] = debug_info return None @@ -1459,6 +1597,76 @@ def normalize_logging_result(self, result: Any) -> Any: ) return logging_result + def _process_hidden_params_and_response_cost( + self, + logging_result, + start_time, + end_time, + ): + hidden_params = getattr(logging_result, "_hidden_params", {}) + if hidden_params: + if self.model_call_details.get("litellm_params") is not None: + self.model_call_details["litellm_params"].setdefault("metadata", {}) + if self.model_call_details["litellm_params"]["metadata"] is None: + self.model_call_details["litellm_params"]["metadata"] = {} + self.model_call_details["litellm_params"]["metadata"]["hidden_params"] = getattr(logging_result, "_hidden_params", {}) # type: ignore + + if "response_cost" in hidden_params: + self.model_call_details["response_cost"] = hidden_params["response_cost"] + else: + self.model_call_details["response_cost"] = self._response_cost_calculator( + result=logging_result + ) + + self.model_call_details[ + "standard_logging_object" + ] = get_standard_logging_object_payload( + kwargs=self.model_call_details, + init_response_obj=logging_result, + start_time=start_time, + end_time=end_time, + logging_obj=self, + status="success", + standard_built_in_tools_params=self.standard_built_in_tools_params, + ) + + def _transform_usage_objects(self, result): + if isinstance(result, ResponsesAPIResponse): + result = result.model_copy() + transformed_usage = ( + ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( + result.usage + ) + ) + setattr(result, "usage", transformed_usage) + if ( + standard_logging_payload := self.model_call_details.get( + "standard_logging_object" + ) + ) is not None: + response_dict = ( + result.model_dump() + if hasattr(result, "model_dump") + else dict(result) + ) + # Ensure usage is properly included with transformed chat format + if transformed_usage is not None: + response_dict["usage"] = ( + transformed_usage.model_dump() + if hasattr(transformed_usage, "model_dump") + else dict(transformed_usage) + ) + standard_logging_payload["response"] = response_dict + elif isinstance(result, TranscriptionResponse): + from litellm.litellm_core_utils.llm_cost_calc.usage_object_transformation import ( + TranscriptionUsageObjectTransformation, + ) + + result = result.model_copy() + transformed_usage = 
TranscriptionUsageObjectTransformation.transform_transcription_usage_object(result.usage) # type: ignore + setattr(result, "usage", transformed_usage) + return result + def _success_handler_helper_fn( self, result=None, @@ -1474,12 +1682,14 @@ def _success_handler_helper_fn( end_time = datetime.datetime.now() if self.completion_start_time is None: self.completion_start_time = end_time - self.model_call_details["completion_start_time"] = ( - self.completion_start_time - ) + self.model_call_details[ + "completion_start_time" + ] = self.completion_start_time + self.model_call_details["log_event_type"] = "successful_api_call" self.model_call_details["end_time"] = end_time self.model_call_details["cache_hit"] = cache_hit + if self.call_type == CallTypes.anthropic_messages.value: result = self._handle_anthropic_messages_response_logging(result=result) elif ( @@ -1489,8 +1699,11 @@ def _success_handler_helper_fn( result = self._handle_non_streaming_google_genai_generate_content_response_logging( result=result ) - ## if model in model cost map - log the response cost - ## else set cost to None + elif ( + self.call_type == CallTypes.asend_message.value + or self.call_type == CallTypes.send_message.value + ): + result = self._handle_a2a_response_logging(result=result) logging_result = self.normalize_logging_result(result=result) @@ -1502,80 +1715,31 @@ def _success_handler_helper_fn( if self._is_recognized_call_type_for_logging( logging_result=logging_result ): - ## HIDDEN PARAMS ## - hidden_params = getattr(logging_result, "_hidden_params", {}) - if hidden_params: - # add to metadata for logging - if self.model_call_details.get("litellm_params") is not None: - self.model_call_details["litellm_params"].setdefault( - "metadata", {} - ) - if ( - self.model_call_details["litellm_params"]["metadata"] - is None - ): - self.model_call_details["litellm_params"][ - "metadata" - ] = {} - - self.model_call_details["litellm_params"]["metadata"][ # type: ignore - "hidden_params" - ] = getattr( - logging_result, "_hidden_params", {} - ) - ## RESPONSE COST - Only calculate if not in hidden_params ## - if "response_cost" in hidden_params: - self.model_call_details["response_cost"] = hidden_params[ - "response_cost" - ] - else: - self.model_call_details["response_cost"] = ( - self._response_cost_calculator(result=logging_result) - ) - ## STANDARDIZED LOGGING PAYLOAD - - self.model_call_details["standard_logging_object"] = ( - get_standard_logging_object_payload( - kwargs=self.model_call_details, - init_response_obj=logging_result, - start_time=start_time, - end_time=end_time, - logging_obj=self, - status="success", - standard_built_in_tools_params=self.standard_built_in_tools_params, - ) + self._process_hidden_params_and_response_cost( + logging_result=logging_result, + start_time=start_time, + end_time=end_time, ) elif isinstance(result, dict) or isinstance(result, list): - ## STANDARDIZED LOGGING PAYLOAD - self.model_call_details["standard_logging_object"] = ( - get_standard_logging_object_payload( - kwargs=self.model_call_details, - init_response_obj=result, - start_time=start_time, - end_time=end_time, - logging_obj=self, - status="success", - standard_built_in_tools_params=self.standard_built_in_tools_params, - ) + self.model_call_details[ + "standard_logging_object" + ] = get_standard_logging_object_payload( + kwargs=self.model_call_details, + init_response_obj=result, + start_time=start_time, + end_time=end_time, + logging_obj=self, + status="success", + 
standard_built_in_tools_params=self.standard_built_in_tools_params, ) elif standard_logging_object is not None: - self.model_call_details["standard_logging_object"] = ( - standard_logging_object - ) - else: # streaming chunks + image gen. + self.model_call_details[ + "standard_logging_object" + ] = standard_logging_object + else: self.model_call_details["response_cost"] = None - ## RESPONSES API USAGE OBJECT TRANSFORMATION ## - # MAP RESPONSES API USAGE OBJECT TO LITELLM USAGE OBJECT - if isinstance(result, ResponsesAPIResponse): - result = result.model_copy() - setattr( - result, - "usage", - ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( - result.usage - ), - ) + result = self._transform_usage_objects(result=result) if ( litellm.max_budget @@ -1621,9 +1785,14 @@ def _is_recognized_call_type_for_logging( or isinstance(logging_result, LiteLLMRealtimeStreamLoggingObject) or isinstance(logging_result, OpenAIModerationResponse) or isinstance(logging_result, OCRResponse) # OCR + or isinstance(logging_result, SearchResponse) # Search API or isinstance(logging_result, dict) and logging_result.get("object") == "vector_store.search_results.page" - or isinstance(logging_result, VideoObject) + or isinstance(logging_result, dict) + and logging_result.get("object") == "search" # Search API (dict format) + or isinstance(logging_result, VideoObject) + or isinstance(logging_result, ContainerObject) + or isinstance(logging_result, LiteLLMSendMessageResponse) # A2A or (self.call_type == CallTypes.call_mcp_tool.value) ): return True @@ -1698,6 +1867,14 @@ def success_handler( # noqa: PLR0915 cache_hit=cache_hit, standard_logging_object=kwargs.get("standard_logging_object", None), ) + litellm_params = self.model_call_details.get("litellm_params", {}) + is_sync_request = ( + litellm_params.get(CallTypes.acompletion.value, False) is not True + and litellm_params.get(CallTypes.aresponses.value, False) is not True + and litellm_params.get(CallTypes.aembedding.value, False) is not True + and litellm_params.get(CallTypes.aimage_generation.value, False) is not True + and litellm_params.get(CallTypes.atranscription.value, False) is not True + ) try: ## BUILD COMPLETE STREAMED RESPONSE complete_streaming_response: Optional[ @@ -1716,24 +1893,32 @@ def success_handler( # noqa: PLR0915 verbose_logger.debug( "Logging Details LiteLLM-Success Call streaming complete" ) - self.model_call_details["complete_streaming_response"] = ( - complete_streaming_response - ) - self.model_call_details["response_cost"] = ( - self._response_cost_calculator(result=complete_streaming_response) - ) + self.model_call_details[ + "complete_streaming_response" + ] = complete_streaming_response + self.model_call_details[ + "response_cost" + ] = self._response_cost_calculator(result=complete_streaming_response) ## STANDARDIZED LOGGING PAYLOAD - self.model_call_details["standard_logging_object"] = ( - get_standard_logging_object_payload( - kwargs=self.model_call_details, - init_response_obj=complete_streaming_response, - start_time=start_time, - end_time=end_time, - logging_obj=self, - status="success", - standard_built_in_tools_params=self.standard_built_in_tools_params, - ) + self.model_call_details[ + "standard_logging_object" + ] = get_standard_logging_object_payload( + kwargs=self.model_call_details, + init_response_obj=complete_streaming_response, + start_time=start_time, + end_time=end_time, + logging_obj=self, + status="success", + standard_built_in_tools_params=self.standard_built_in_tools_params, ) + if ( + 
standard_logging_payload := self.model_call_details.get( + "standard_logging_object" + ) + ) is not None: + # Only emit for sync requests (async_success_handler handles async) + if is_sync_request: + emit_standard_logging_payload(standard_logging_payload) callbacks = self.get_combined_callback_list( dynamic_success_callbacks=self.dynamic_success_callbacks, global_callbacks=litellm.success_callback, @@ -1760,7 +1945,6 @@ def success_handler( # noqa: PLR0915 self.has_run_logging(event_type="sync_success") for callback in callbacks: try: - litellm_params = self.model_call_details.get("litellm_params", {}) should_run = self.should_run_callback( callback=callback, litellm_params=litellm_params, @@ -2028,25 +2212,7 @@ def success_handler( # noqa: PLR0915 print_verbose=print_verbose, ) - if ( - callback == "openmeter" - and self.model_call_details.get("litellm_params", {}).get( - "acompletion", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aembedding", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aimage_generation", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "atranscription", False - ) - is not True - ): + if callback == "openmeter" and is_sync_request: global openMeterLogger if openMeterLogger is None: print_verbose("Instantiates openmeter client") @@ -2060,10 +2226,10 @@ def success_handler( # noqa: PLR0915 ) else: if self.stream and complete_streaming_response: - self.model_call_details["complete_response"] = ( - self.model_call_details.get( - "complete_streaming_response", {} - ) + self.model_call_details[ + "complete_response" + ] = self.model_call_details.get( + "complete_streaming_response", {} ) result = self.model_call_details["complete_response"] openMeterLogger.log_success_event( @@ -2074,22 +2240,7 @@ def success_handler( # noqa: PLR0915 ) if ( isinstance(callback, CustomLogger) - and self.model_call_details.get("litellm_params", {}).get( - "acompletion", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aembedding", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aimage_generation", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "atranscription", False - ) - is not True + and is_sync_request and self.call_type != CallTypes.pass_through.value # pass-through endpoints call async_log_success_event ): # custom logger class @@ -2102,10 +2253,10 @@ def success_handler( # noqa: PLR0915 ) else: if self.stream and complete_streaming_response: - self.model_call_details["complete_response"] = ( - self.model_call_details.get( - "complete_streaming_response", {} - ) + self.model_call_details[ + "complete_response" + ] = self.model_call_details.get( + "complete_streaming_response", {} ) result = self.model_call_details["complete_response"] @@ -2117,22 +2268,7 @@ def success_handler( # noqa: PLR0915 ) if ( callable(callback) is True - and self.model_call_details.get("litellm_params", {}).get( - "acompletion", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aembedding", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aimage_generation", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "atranscription", False - ) - is not True + and is_sync_request and customLogger is not None ): # custom logger functions print_verbose( @@ -2159,6 +2295,11 @@ 
def success_handler( # noqa: PLR0915 ) if capture_exception: # log this error to sentry for debugging capture_exception(e) + # Track callback logging failures in Prometheus + try: + self._handle_callback_failure(callback=callback) + except Exception: + pass except Exception as e: verbose_logger.exception( "LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {}".format( @@ -2201,18 +2342,28 @@ async def async_success_handler( # noqa: PLR0915 batch_cost = kwargs.get("batch_cost", None) batch_usage = kwargs.get("batch_usage", None) batch_models = kwargs.get("batch_models", None) - if all([batch_cost, batch_usage, batch_models]) is not None: + has_explicit_batch_data = all( + x is not None for x in (batch_cost, batch_usage, batch_models) + ) + + should_compute_batch_data = ( + not is_base64_unified_file_id + or not has_explicit_batch_data + and result.status == "completed" + ) + if has_explicit_batch_data: result._hidden_params["response_cost"] = batch_cost result._hidden_params["batch_models"] = batch_models result.usage = batch_usage - elif not is_base64_unified_file_id: # only run for non-unified file ids + elif should_compute_batch_data: ( response_cost, batch_usage, batch_models, ) = await _handle_completed_batch( - batch=result, custom_llm_provider=self.custom_llm_provider + batch=result, + custom_llm_provider=self.custom_llm_provider, ) result._hidden_params["response_cost"] = response_cost @@ -2243,9 +2394,9 @@ async def async_success_handler( # noqa: PLR0915 if complete_streaming_response is not None: print_verbose("Async success callbacks: Got a complete streaming response") - self.model_call_details["async_complete_streaming_response"] = ( - complete_streaming_response - ) + self.model_call_details[ + "async_complete_streaming_response" + ] = complete_streaming_response try: if self.model_call_details.get("cache_hit", False) is True: @@ -2256,10 +2407,10 @@ async def async_success_handler( # noqa: PLR0915 model_call_details=self.model_call_details ) # base_model defaults to None if not set on model_info - self.model_call_details["response_cost"] = ( - self._response_cost_calculator( - result=complete_streaming_response - ) + self.model_call_details[ + "response_cost" + ] = self._response_cost_calculator( + result=complete_streaming_response ) verbose_logger.debug( @@ -2272,17 +2423,25 @@ async def async_success_handler( # noqa: PLR0915 self.model_call_details["response_cost"] = None ## STANDARDIZED LOGGING PAYLOAD - self.model_call_details["standard_logging_object"] = ( - get_standard_logging_object_payload( - kwargs=self.model_call_details, - init_response_obj=complete_streaming_response, - start_time=start_time, - end_time=end_time, - logging_obj=self, - status="success", - standard_built_in_tools_params=self.standard_built_in_tools_params, - ) + self.model_call_details[ + "standard_logging_object" + ] = get_standard_logging_object_payload( + kwargs=self.model_call_details, + init_response_obj=complete_streaming_response, + start_time=start_time, + end_time=end_time, + logging_obj=self, + status="success", + standard_built_in_tools_params=self.standard_built_in_tools_params, ) + + # print standard logging payload + if ( + standard_logging_payload := self.model_call_details.get( + "standard_logging_object" + ) + ) is not None: + emit_standard_logging_payload(standard_logging_payload) callbacks = self.get_combined_callback_list( dynamic_success_callbacks=self.dynamic_async_success_callbacks, global_callbacks=litellm._async_success_callback, @@ -2464,8 
+2623,31 @@ async def async_success_handler( # noqa: PLR0915 verbose_logger.error( f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" ) + self._handle_callback_failure(callback=callback) pass + def _handle_callback_failure(self, callback: Any): + """ + Handle callback logging failures by incrementing Prometheus metrics. + + Works for both sync and async contexts since Prometheus counter increment is synchronous. + + Args: + callback: The callback that failed + """ + try: + callback_name = self._get_callback_name(callback) + + all_callbacks = litellm.logging_callback_manager._get_all_callbacks() + + for callback_obj in all_callbacks: + if hasattr(callback_obj, "increment_callback_logging_failure"): + callback_obj.increment_callback_logging_failure(callback_name=callback_name) # type: ignore + break # Only increment once + + except Exception as e: + verbose_logger.debug(f"Error in _handle_callback_failure: {str(e)}") + def _failure_handler_helper_fn( self, exception, traceback_exception, start_time=None, end_time=None ): @@ -2494,18 +2676,18 @@ def _failure_handler_helper_fn( ## STANDARDIZED LOGGING PAYLOAD - self.model_call_details["standard_logging_object"] = ( - get_standard_logging_object_payload( - kwargs=self.model_call_details, - init_response_obj={}, - start_time=start_time, - end_time=end_time, - logging_obj=self, - status="failure", - error_str=str(exception), - original_exception=exception, - standard_built_in_tools_params=self.standard_built_in_tools_params, - ) + self.model_call_details[ + "standard_logging_object" + ] = get_standard_logging_object_payload( + kwargs=self.model_call_details, + init_response_obj={}, + start_time=start_time, + end_time=end_time, + logging_obj=self, + status="failure", + error_str=str(exception), + original_exception=exception, + standard_built_in_tools_params=self.standard_built_in_tools_params, ) return start_time, end_time @@ -2554,6 +2736,15 @@ def failure_handler( # noqa: PLR0915 event_type="sync_failure" ): # prevent double logging return + litellm_params = self.model_call_details.get("litellm_params", {}) + is_sync_request = ( + litellm_params.get(CallTypes.acompletion.value, False) is not True + and litellm_params.get(CallTypes.aresponses.value, False) is not True + and litellm_params.get(CallTypes.aembedding.value, False) is not True + and litellm_params.get(CallTypes.aimage_generation.value, False) is not True + and litellm_params.get(CallTypes.atranscription.value, False) is not True + ) + try: start_time, end_time = self._failure_handler_helper_fn( exception=exception, @@ -2579,7 +2770,6 @@ def failure_handler( # noqa: PLR0915 self.has_run_logging(event_type="sync_failure") for callback in callbacks: try: - litellm_params = self.model_call_details.get("litellm_params", {}) should_run = self.should_run_callback( callback=callback, litellm_params=litellm_params, @@ -2646,15 +2836,7 @@ def failure_handler( # noqa: PLR0915 callback_func=callback, ) if ( - isinstance(callback, CustomLogger) - and self.model_call_details.get("litellm_params", {}).get( - "acompletion", False - ) - is not True - and self.model_call_details.get("litellm_params", {}).get( - "aembedding", False - ) - is not True + isinstance(callback, CustomLogger) and is_sync_request ): # custom logger class callback.log_failure_event( start_time=start_time, @@ -2801,6 +2983,8 @@ async def async_failure_handler( str(e), callback ) ) + # Track callback logging failures in Prometheus + 
self._handle_callback_failure(callback=callback) def _get_trace_id(self, service_name: Literal["langfuse"]) -> Optional[str]: """ @@ -2933,15 +3117,19 @@ def _get_callback_name(self, cb) -> str: Helper to get the name of a callback function Args: - cb: The callback function/string to get the name of + cb: The callback object/function/string to get the name of Returns: The name of the callback """ + if isinstance(cb, str): + return cb if hasattr(cb, "__name__"): return cb.__name__ if hasattr(cb, "__func__"): return cb.__func__.__name__ + if hasattr(cb, "__class__"): + return cb.__class__.__name__ return str(cb) def _is_internal_litellm_proxy_callback(self, cb) -> bool: @@ -2995,6 +3183,23 @@ def _get_assembled_streaming_response( elif isinstance(result, TextCompletionResponse): return result elif isinstance(result, ResponseCompletedEvent): + ## return unified Usage object + if isinstance(result.response.usage, ResponseAPIUsage): + transformed_usage = ( + ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( + result.response.usage + ) + ) + # Set as dict instead of Usage object so model_dump() serializes it correctly + setattr( + result.response, + "usage", + ( + transformed_usage.model_dump() + if hasattr(transformed_usage, "model_dump") + else dict(transformed_usage) + ), + ) return result.response else: return None @@ -3076,6 +3281,31 @@ def _handle_non_streaming_google_genai_generate_content_response_logging( ) return result + def _handle_a2a_response_logging(self, result: Any) -> Any: + """ + Handles logging for A2A (Agent-to-Agent) responses. + + Adds usage from model_call_details to the result if available. + Uses Pydantic's model_copy to avoid modifying the original response. + + Args: + result: The LiteLLMSendMessageResponse from the A2A call + + Returns: + The response object with usage added if available + """ + # Get usage from model_call_details (set by asend_message) + usage = self.model_call_details.get("usage") + if usage is None: + return result + + # Deep copy result and add usage + result_copy = result.model_copy(deep=True) + result_copy.usage = ( + usage.model_dump() if hasattr(usage, "model_dump") else dict(usage) + ) + return result_copy + def _get_masked_values( sensitive_object: dict, @@ -3098,6 +3328,7 @@ def _get_masked_values( "token", "key", "secret", + "vertex_credentials", ] return { k: ( @@ -3316,8 +3547,8 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 _in_memory_loggers.append(_literalai_logger) return _literalai_logger # type: ignore elif logging_integration == "prometheus": - if PrometheusLogger is None: - raise ValueError("PrometheusLogger is not initialized") + PrometheusLogger = _get_cached_prometheus_logger() + for callback in _in_memory_loggers: if isinstance(callback, PrometheusLogger): return callback # type: ignore @@ -3337,6 +3568,14 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 _datadog_llm_obs_logger = DataDogLLMObsLogger() _in_memory_loggers.append(_datadog_llm_obs_logger) return _datadog_llm_obs_logger # type: ignore + elif logging_integration == "azure_sentinel": + for callback in _in_memory_loggers: + if isinstance(callback, AzureSentinelLogger): + return callback # type: ignore + + _azure_sentinel_logger = AzureSentinelLogger() + _in_memory_loggers.append(_azure_sentinel_logger) + return _azure_sentinel_logger # type: ignore elif logging_integration == "gcs_bucket": for callback in _in_memory_loggers: if isinstance(callback, GCSBucketLogger): @@ -3391,11 +3630,12 @@ def 
_init_custom_logger_compatible_class( # noqa: PLR0915 otel_config = OpenTelemetryConfig( exporter=arize_config.protocol, endpoint=arize_config.endpoint, + service_name=arize_config.project_name, ) - os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = ( - f"space_id={arize_config.space_key},api_key={arize_config.api_key}" - ) + os.environ[ + "OTEL_EXPORTER_OTLP_TRACES_HEADERS" + ] = f"space_id={arize_config.space_key or arize_config.space_id},api_key={arize_config.api_key}" for callback in _in_memory_loggers: if ( isinstance(callback, ArizeLogger) @@ -3415,25 +3655,76 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 otel_config = OpenTelemetryConfig( exporter=arize_phoenix_config.protocol, endpoint=arize_phoenix_config.endpoint, + headers=arize_phoenix_config.otlp_auth_headers, ) + if arize_phoenix_config.project_name: + existing_attrs = os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "") + # Add openinference.project.name attribute + if existing_attrs: + os.environ[ + "OTEL_RESOURCE_ATTRIBUTES" + ] = f"{existing_attrs},openinference.project.name={arize_phoenix_config.project_name}" + else: + os.environ[ + "OTEL_RESOURCE_ATTRIBUTES" + ] = f"openinference.project.name={arize_phoenix_config.project_name}" + + # Set Phoenix project name from environment variable + phoenix_project_name = os.environ.get("PHOENIX_PROJECT_NAME", None) + if phoenix_project_name: + existing_attrs = os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "") + # Add openinference.project.name attribute + if existing_attrs: + os.environ[ + "OTEL_RESOURCE_ATTRIBUTES" + ] = f"{existing_attrs},openinference.project.name={phoenix_project_name}" + else: + os.environ[ + "OTEL_RESOURCE_ATTRIBUTES" + ] = f"openinference.project.name={phoenix_project_name}" # auth can be disabled on local deployments of arize phoenix if arize_phoenix_config.otlp_auth_headers is not None: - os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = ( - arize_phoenix_config.otlp_auth_headers - ) + os.environ[ + "OTEL_EXPORTER_OTLP_TRACES_HEADERS" + ] = arize_phoenix_config.otlp_auth_headers for callback in _in_memory_loggers: if ( - isinstance(callback, OpenTelemetry) + isinstance(callback, ArizePhoenixLogger) and callback.callback_name == "arize_phoenix" ): return callback # type: ignore - _otel_logger = OpenTelemetry( + _arize_phoenix_otel_logger = ArizePhoenixLogger( config=otel_config, callback_name="arize_phoenix" ) - _in_memory_loggers.append(_otel_logger) - return _otel_logger # type: ignore + _in_memory_loggers.append(_arize_phoenix_otel_logger) + return _arize_phoenix_otel_logger # type: ignore + elif logging_integration == "levo": + from litellm.integrations.levo.levo import LevoLogger + from litellm.integrations.opentelemetry import ( + OpenTelemetry, + OpenTelemetryConfig, + ) + + levo_config = LevoLogger.get_levo_config() + otel_config = OpenTelemetryConfig( + exporter=levo_config.protocol, + endpoint=levo_config.endpoint, + headers=levo_config.otlp_auth_headers, + ) + + # Check if LevoLogger instance already exists + for callback in _in_memory_loggers: + if ( + isinstance(callback, LevoLogger) + and callback.callback_name == "levo" + ): + return callback # type: ignore + + _levo_otel_logger = LevoLogger(config=otel_config, callback_name="levo") + _in_memory_loggers.append(_levo_otel_logger) + return _levo_otel_logger # type: ignore elif logging_integration == "otel": from litellm.integrations.opentelemetry import OpenTelemetry @@ -3465,6 +3756,15 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 cloudzero_logger = CloudZeroLogger() 
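Every integration branch in _init_custom_logger_compatible_class above follows the same reuse pattern: scan _in_memory_loggers for an existing instance of that integration's logger class, return it if found, otherwise construct one, append it to the registry, and return it, so each integration is initialized at most once per process. A minimal, self-contained sketch of that pattern; get_or_create_logger and ToyLogger are hypothetical names, and the list below is a local stand-in for the module-level registry:

from typing import List

_in_memory_loggers: List[object] = []  # stand-in for the module-level registry


def get_or_create_logger(logger_cls, **init_kwargs):
    """Return the registered instance of logger_cls if one exists, else create and register it."""
    for callback in _in_memory_loggers:
        if isinstance(callback, logger_cls):
            return callback
    instance = logger_cls(**init_kwargs)
    _in_memory_loggers.append(instance)
    return instance


class ToyLogger:
    pass


first = get_or_create_logger(ToyLogger)
second = get_or_create_logger(ToyLogger)
assert first is second  # subsequent lookups reuse the same instance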
_in_memory_loggers.append(cloudzero_logger) return cloudzero_logger # type: ignore + elif logging_integration == "focus": + from litellm.integrations.focus.focus_logger import FocusLogger + + for callback in _in_memory_loggers: + if isinstance(callback, FocusLogger): + return callback # type: ignore + focus_logger = FocusLogger() + _in_memory_loggers.append(focus_logger) + return focus_logger # type: ignore elif logging_integration == "deepeval": for callback in _in_memory_loggers: if isinstance(callback, DeepEvalLogger): @@ -3481,9 +3781,12 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 OpenTelemetryConfig, ) + logfire_base_url = os.getenv( + "LOGFIRE_BASE_URL", "https://logfire-api.pydantic.dev" + ) otel_config = OpenTelemetryConfig( exporter="otlp_http", - endpoint="https://logfire-api.pydantic.dev/v1/traces", + endpoint=f"{logfire_base_url.rstrip('/')}/v1/traces", headers=f"Authorization={os.getenv('LOGFIRE_TOKEN')}", ) for callback in _in_memory_loggers: @@ -3553,9 +3856,9 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 exporter="otlp_http", endpoint="https://langtrace.ai/api/trace", ) - os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = ( - f"api_key={os.getenv('LANGTRACE_API_KEY')}" - ) + os.environ[ + "OTEL_EXPORTER_OTLP_TRACES_HEADERS" + ] = f"api_key={os.getenv('LANGTRACE_API_KEY')}" for callback in _in_memory_loggers: if ( isinstance(callback, OpenTelemetry) @@ -3608,6 +3911,32 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 ) _in_memory_loggers.append(_otel_logger) return _otel_logger # type: ignore + elif logging_integration == "weave_otel": + from litellm.integrations.opentelemetry import OpenTelemetryConfig + from litellm.integrations.weave.weave_otel import ( + WeaveOtelLogger, + get_weave_otel_config, + ) + + weave_otel_config = get_weave_otel_config() + + otel_config = OpenTelemetryConfig( + exporter=weave_otel_config.protocol, + endpoint=weave_otel_config.endpoint, + headers=weave_otel_config.otlp_auth_headers, + ) + + for callback in _in_memory_loggers: + if ( + isinstance(callback, WeaveOtelLogger) + and callback.callback_name == "weave_otel" + ): + return callback # type: ignore + _otel_logger = WeaveOtelLogger( + config=otel_config, callback_name="weave_otel" + ) + _in_memory_loggers.append(_otel_logger) + return _otel_logger # type: ignore elif logging_integration == "pagerduty": for callback in _in_memory_loggers: if isinstance(callback, PagerDutyAlerting): @@ -3654,6 +3983,13 @@ def _init_custom_logger_compatible_class( # noqa: PLR0915 resend_email_logger = ResendEmailLogger() _in_memory_loggers.append(resend_email_logger) return resend_email_logger # type: ignore + elif logging_integration == "sendgrid_email": + for callback in _in_memory_loggers: + if isinstance(callback, SendGridEmailLogger): + return callback + sendgrid_email_logger = SendGridEmailLogger() + _in_memory_loggers.append(sendgrid_email_logger) + return sendgrid_email_logger # type: ignore elif logging_integration == "smtp_email": for callback in _in_memory_loggers: if isinstance(callback, SMTPEmailLogger): @@ -3752,6 +4088,12 @@ def get_custom_logger_compatible_class( # noqa: PLR0915 for callback in _in_memory_loggers: if isinstance(callback, CloudZeroLogger): return callback + elif logging_integration == "focus": + from litellm.integrations.focus.focus_logger import FocusLogger + + for callback in _in_memory_loggers: + if isinstance(callback, FocusLogger): + return callback elif logging_integration == "deepeval": for callback in _in_memory_loggers: if 
isinstance(callback, DeepEvalLogger): @@ -3768,7 +4110,8 @@ def get_custom_logger_compatible_class( # noqa: PLR0915 for callback in _in_memory_loggers: if isinstance(callback, LiteralAILogger): return callback - elif logging_integration == "prometheus" and PrometheusLogger is not None: + elif logging_integration == "prometheus": + PrometheusLogger = _get_cached_prometheus_logger() for callback in _in_memory_loggers: if isinstance(callback, PrometheusLogger): return callback @@ -3780,6 +4123,10 @@ def get_custom_logger_compatible_class( # noqa: PLR0915 for callback in _in_memory_loggers: if isinstance(callback, DataDogLLMObsLogger): return callback + elif logging_integration == "azure_sentinel": + for callback in _in_memory_loggers: + if isinstance(callback, AzureSentinelLogger): + return callback elif logging_integration == "gcs_bucket": for callback in _in_memory_loggers: if isinstance(callback, GCSBucketLogger): @@ -3814,8 +4161,6 @@ def get_custom_logger_compatible_class( # noqa: PLR0915 if isinstance(callback, OpenTelemetry): return callback elif logging_integration == "arize": - if "ARIZE_SPACE_KEY" not in os.environ: - raise ValueError("ARIZE_SPACE_KEY not found in environment variables") if "ARIZE_API_KEY" not in os.environ: raise ValueError("ARIZE_API_KEY not found in environment variables") for callback in _in_memory_loggers: @@ -3895,6 +4240,10 @@ def get_custom_logger_compatible_class( # noqa: PLR0915 for callback in _in_memory_loggers: if isinstance(callback, ResendEmailLogger): return callback + elif logging_integration == "sendgrid_email": + for callback in _in_memory_loggers: + if isinstance(callback, SendGridEmailLogger): + return callback elif logging_integration == "smtp_email": for callback in _in_memory_loggers: if isinstance(callback, SMTPEmailLogger): @@ -3918,10 +4267,8 @@ def _get_custom_logger_settings_from_proxy_server(callback_name: str) -> Dict: otel: message_logging: False """ - from litellm.proxy.proxy_server import callback_settings - - if callback_settings: - return dict(callback_settings.get(callback_name, {})) + if litellm.callback_settings: + return dict(litellm.callback_settings.get(callback_name, {})) return {} @@ -3934,15 +4281,21 @@ def use_custom_pricing_for_model(litellm_params: Optional[dict]) -> bool: if litellm_params is None: return False + # Check litellm_params using set intersection (only check keys that exist in both) + matching_keys = _CUSTOM_PRICING_KEYS & litellm_params.keys() + for key in matching_keys: + if litellm_params.get(key) is not None: + return True + + # Check model_info metadata: dict = litellm_params.get("metadata", {}) or {} model_info: dict = metadata.get("model_info", {}) or {} - custom_pricing_keys = CustomPricingLiteLLMParams.model_fields.keys() - for key in custom_pricing_keys: - if litellm_params.get(key, None) is not None: - return True - elif model_info.get(key, None) is not None: - return True + if model_info: + matching_keys = _CUSTOM_PRICING_KEYS & model_info.keys() + for key in matching_keys: + if model_info.get(key) is not None: + return True return False @@ -3998,6 +4351,77 @@ def cleanup_timestamps( return start_time_float, end_time_float, completion_start_time_float + @staticmethod + def append_system_prompt_messages( + kwargs: Optional[Dict] = None, messages: Optional[Any] = None + ): + """ + Append system prompt messages to the messages + """ + if kwargs is not None: + if kwargs.get("system") is not None and isinstance( + kwargs.get("system"), str + ): + if messages is None: + return [{"role": "system", 
"content": kwargs.get("system")}] + elif isinstance(messages, list): + if len(messages) == 0: + return [{"role": "system", "content": kwargs.get("system")}] + # check for duplicates + if messages[0].get("role") == "system" and messages[0].get( + "content" + ) == kwargs.get("system"): + return messages + messages = [ + {"role": "system", "content": kwargs.get("system")} + ] + messages + elif isinstance(messages, str): + messages = [ + {"role": "system", "content": kwargs.get("system")}, + {"role": "user", "content": messages}, + ] + return messages + + return messages + + @staticmethod + def merge_litellm_metadata(litellm_params: dict) -> dict: + """ + Merge both litellm_metadata and metadata from litellm_params. + + litellm_metadata contains model-related fields, metadata contains user API key fields. + We need both for complete standard logging payload. + + Args: + litellm_params: Dictionary containing metadata and litellm_metadata + + Returns: + dict: Merged metadata with user API key fields taking precedence + """ + merged_metadata: dict = {} + + # Start with metadata (user API key fields) - but skip non-serializable objects + if litellm_params.get("metadata") and isinstance( + litellm_params.get("metadata"), dict + ): + for key, value in litellm_params["metadata"].items(): + # Skip non-serializable objects like UserAPIKeyAuth + if key == "user_api_key_auth": + continue + merged_metadata[key] = value + + # Then merge litellm_metadata (model-related fields) - this will NOT overwrite existing keys + if litellm_params.get("litellm_metadata") and isinstance( + litellm_params.get("litellm_metadata"), dict + ): + for key, value in litellm_params["litellm_metadata"].items(): + if ( + key not in merged_metadata + ): # Don't overwrite existing keys from metadata + merged_metadata[key] = value + + return merged_metadata + @staticmethod def get_standard_logging_metadata( metadata: Optional[Dict[str, Any]], @@ -4059,6 +4483,7 @@ def get_standard_logging_metadata( user_api_key_request_route=None, spend_logs_metadata=None, requester_ip_address=None, + user_agent=None, requester_metadata=None, prompt_management_metadata=prompt_management_metadata, applied_guardrails=applied_guardrails, @@ -4139,6 +4564,10 @@ def get_usage_from_response_obj( ) elif isinstance(usage, Usage): return usage + elif isinstance(usage, ResponseAPIUsage): + return ResponseAPILoggingUtils._transform_response_api_usage_to_chat_usage( + usage + ) elif isinstance(usage, dict): if ResponseAPILoggingUtils._is_response_api_usage(usage): return ( @@ -4194,12 +4623,12 @@ def get_final_response_obj( """ Get final response object after redacting the message input/output from logging """ - if response_obj is not None: + if response_obj: final_response_obj: Optional[Union[dict, str, list]] = response_obj elif isinstance(init_response_obj, list) or isinstance(init_response_obj, str): final_response_obj = init_response_obj else: - final_response_obj = None + final_response_obj = {} modified_final_response_obj = redact_message_input_output_from_logging( model_call_details=kwargs, @@ -4255,10 +4684,10 @@ def get_hidden_params( for key in StandardLoggingHiddenParams.__annotations__.keys(): if key in hidden_params: if key == "additional_headers": - clean_hidden_params["additional_headers"] = ( - StandardLoggingPayloadSetup.get_additional_headers( - hidden_params[key] - ) + clean_hidden_params[ + "additional_headers" + ] = StandardLoggingPayloadSetup.get_additional_headers( + hidden_params[key] ) else: clean_hidden_params[key] = hidden_params[key] 
# type: ignore @@ -4267,7 +4696,10 @@ def get_hidden_params( @staticmethod def strip_trailing_slash(api_base: Optional[str]) -> Optional[str]: if api_base: - return api_base.rstrip("/") + if api_base.endswith("//"): + return api_base.rstrip("/") + if api_base[-1] == "/": + return api_base[:-1] return api_base @staticmethod @@ -4319,7 +4751,7 @@ def _generate_cold_storage_object_key( s3_object_key = get_s3_object_key( s3_path=s3_path, # Use actual s3_path from logger configuration - team_alias_prefix="", # Don't split by team alias for cold storage + prefix="", # Don't split by team alias for cold storage start_time=start_time, s3_file_name=s3_file_name, ) @@ -4336,7 +4768,14 @@ def get_error_information( ) -> StandardLoggingPayloadErrorInformation: from litellm.constants import MAXIMUM_TRACEBACK_LINES_TO_LOG - error_status: str = str(getattr(original_exception, "status_code", "")) + # Check for 'code' first (used by ProxyException), then fall back to 'status_code' (used by LiteLLM exceptions) + # Ensure error_code is always a string for Prisma Python JSON field compatibility + error_code_attr = getattr(original_exception, "code", None) + if error_code_attr is not None and str(error_code_attr) not in ("", "None"): + error_status: str = str(error_code_attr) + else: + status_code_attr = getattr(original_exception, "status_code", None) + error_status = str(status_code_attr) if status_code_attr is not None else "" error_class: str = ( str(original_exception.__class__.__name__) if original_exception else "" ) @@ -4438,7 +4877,9 @@ def _get_extra_header_tags(proxy_server_request: dict) -> Optional[List[str]]: """ Extract additional header tags for spend tracking based on config. """ - extra_headers: List[str] = litellm.extra_spend_tag_headers or [] + extra_headers: List[str] = ( + getattr(litellm, "extra_spend_tag_headers", None) or [] + ) if not extra_headers: return None @@ -4462,9 +4903,9 @@ def _get_request_tags( metadata = litellm_params.get("metadata") or {} litellm_metadata = litellm_params.get("litellm_metadata") or {} if metadata.get("tags", []): - request_tags = metadata.get("tags", []) + request_tags = metadata.get("tags", []).copy() elif litellm_metadata.get("tags", []): - request_tags = litellm_metadata.get("tags", []) + request_tags = litellm_metadata.get("tags", []).copy() else: request_tags = [] user_agent_tags = StandardLoggingPayloadSetup._get_user_agent_tags( @@ -4482,7 +4923,7 @@ def _get_request_tags( def _get_status_fields( status: StandardLoggingPayloadStatus, - guardrail_information: Optional[dict], + guardrail_information: Optional[List[dict]], error_str: Optional[str], ) -> "StandardLoggingPayloadStatusFields": """ @@ -4513,15 +4954,57 @@ def _get_status_fields( # Map - guardrail_information.guardrail_status to guardrail_status ######################################################### guardrail_status: GuardrailStatus = "not_run" - if guardrail_information and isinstance(guardrail_information, dict): - raw_status = guardrail_information.get("guardrail_status", "not_run") - guardrail_status = GUARDRAIL_STATUS_MAP.get(raw_status, "not_run") + if guardrail_information and isinstance(guardrail_information, list): + for information in guardrail_information: + if isinstance(information, dict): + raw_status = information.get("guardrail_status", "not_run") + if raw_status != "not_run": + guardrail_status = GUARDRAIL_STATUS_MAP.get(raw_status, "not_run") + break return StandardLoggingPayloadStatusFields( llm_api_status=llm_api_status, guardrail_status=guardrail_status ) +def 
_extract_response_obj_and_hidden_params( + init_response_obj: Union[Any, BaseModel, dict], + original_exception: Optional[Exception], +) -> Tuple[dict, Optional[dict]]: + """Extract response_obj and hidden_params from init_response_obj.""" + hidden_params: Optional[dict] = None + if init_response_obj is None: + response_obj = {} + elif isinstance(init_response_obj, BaseModel): + response_obj = init_response_obj.model_dump() + hidden_params = getattr(init_response_obj, "_hidden_params", None) + elif isinstance(init_response_obj, dict): + response_obj = init_response_obj + else: + response_obj = {} + + if original_exception is not None and hidden_params is None: + response_headers = _get_response_headers(original_exception) + if response_headers is not None: + hidden_params = dict( + StandardLoggingHiddenParams( + additional_headers=StandardLoggingPayloadSetup.get_additional_headers( + dict(response_headers) + ), + model_id=None, + cache_key=None, + api_base=None, + response_cost=None, + litellm_overhead_time_ms=None, + batch_models=None, + litellm_model_name=None, + usage_object=None, + ) + ) + + return response_obj, hidden_params + + def get_standard_logging_object_payload( kwargs: Optional[dict], init_response_obj: Union[Any, BaseModel, dict], @@ -4536,44 +5019,17 @@ def get_standard_logging_object_payload( try: kwargs = kwargs or {} - hidden_params: Optional[dict] = None - if init_response_obj is None: - response_obj = {} - elif isinstance(init_response_obj, BaseModel): - response_obj = init_response_obj.model_dump() - hidden_params = getattr(init_response_obj, "_hidden_params", None) - elif isinstance(init_response_obj, dict): - response_obj = init_response_obj - else: - response_obj = {} - - if original_exception is not None and hidden_params is None: - response_headers = _get_response_headers(original_exception) - if response_headers is not None: - hidden_params = dict( - StandardLoggingHiddenParams( - additional_headers=StandardLoggingPayloadSetup.get_additional_headers( - dict(response_headers) - ), - model_id=None, - cache_key=None, - api_base=None, - response_cost=None, - litellm_overhead_time_ms=None, - batch_models=None, - litellm_model_name=None, - usage_object=None, - ) - ) + response_obj, hidden_params = _extract_response_obj_and_hidden_params( + init_response_obj, original_exception + ) # standardize this function to be used across, s3, dynamoDB, langfuse logging litellm_params = kwargs.get("litellm_params", {}) or {} proxy_server_request = litellm_params.get("proxy_server_request") or {} - metadata: dict = ( - litellm_params.get("litellm_metadata") - or litellm_params.get("metadata", None) - or {} + # Merge both litellm_metadata and metadata to get complete metadata + metadata: dict = StandardLoggingPayloadSetup.merge_litellm_metadata( + litellm_params ) completion_start_time = kwargs.get("completion_start_time", end_time) @@ -4676,6 +5132,14 @@ def get_standard_logging_object_payload( ) and kwargs.get("stream") is True: stream = True + # Reconstruct full model name with provider prefix for logging + # This ensures Bedrock models like "us.anthropic.claude-3-5-sonnet-20240620-v1:0" + # are logged as "bedrock/us.anthropic.claude-3-5-sonnet-20240620-v1:0" + custom_llm_provider = cast(Optional[str], kwargs.get("custom_llm_provider")) + model_name = reconstruct_model_name( + kwargs.get("model", "") or "", custom_llm_provider, metadata + ) + payload: StandardLoggingPayload = StandardLoggingPayload( id=str(id), 
trace_id=StandardLoggingPayloadSetup._get_standard_logging_payload_trace_id( @@ -4693,13 +5157,13 @@ def get_standard_logging_object_payload( ), error_str=error_str, ), - custom_llm_provider=cast(Optional[str], kwargs.get("custom_llm_provider")), + custom_llm_provider=custom_llm_provider, saved_cache_cost=saved_cache_cost, startTime=start_time_float, endTime=end_time_float, completionStartTime=completion_start_time_float, response_time=response_time, - model=kwargs.get("model", "") or "", + model=model_name, metadata=clean_metadata, cache_key=clean_hidden_params["cache_key"], response_cost=response_cost, @@ -4716,7 +5180,10 @@ def get_standard_logging_object_payload( model_group=_model_group, model_id=_model_id, requester_ip_address=clean_metadata.get("requester_ip_address", None), - messages=kwargs.get("messages"), + user_agent=clean_metadata.get("user_agent", None), + messages=StandardLoggingPayloadSetup.append_system_prompt_messages( + kwargs=kwargs, messages=kwargs.get("messages") + ), response=final_response_obj, model_parameters=ModelParamHelper.get_standard_logging_model_parameters( kwargs.get("optional_params", None) or {} @@ -4734,7 +5201,8 @@ def get_standard_logging_object_payload( standard_built_in_tools_params=standard_built_in_tools_params, ) - emit_standard_logging_payload(payload) + # emit_standard_logging_payload(payload) - Moved to success_handler to prevent double emitting + return payload except Exception as e: verbose_logger.exception( @@ -4778,6 +5246,7 @@ def get_standard_logging_metadata( user_api_key_team_alias=None, spend_logs_metadata=None, requester_ip_address=None, + user_agent=None, requester_metadata=None, user_api_key_end_user_id=None, prompt_management_metadata=None, @@ -4810,6 +5279,15 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]): metadata = litellm_params.get("metadata", {}) or {} + ## Extract provider-specific callable values (like langfuse_masking_function) + ## Store them separately so only the intended logger can access them + ## This prevents callables from leaking to other logging integrations + if "langfuse_masking_function" in metadata: + masking_fn = metadata.pop("langfuse_masking_function", None) + if callable(masking_fn): + litellm_params["_langfuse_masking_function"] = masking_fn + litellm_params["metadata"] = metadata + ## check user_api_key_metadata for sensitive logging keys cleaned_user_api_key_metadata = {} if "user_api_key_metadata" in metadata and isinstance( @@ -4817,9 +5295,9 @@ def scrub_sensitive_keys_in_metadata(litellm_params: Optional[dict]): ): for k, v in metadata["user_api_key_metadata"].items(): if k == "logging": # prevent logging user logging keys - cleaned_user_api_key_metadata[k] = ( - "scrubbed_by_litellm_for_sensitive_keys" - ) + cleaned_user_api_key_metadata[ + k + ] = "scrubbed_by_litellm_for_sensitive_keys" else: cleaned_user_api_key_metadata[k] = v diff --git a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py index 7d3af4ad2f80..4a4a2508d2e2 100644 --- a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py +++ b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py @@ -595,6 +595,31 @@ def get_cost_for_computer_use( # OpenAI doesn't charge separately for computer use yet return 0.0 + @staticmethod + def _get_code_interpreter_cost_from_model_map( + provider: str, + ) -> Optional[float]: + """ + Get code interpreter cost per session from model cost map. 
+ """ + import litellm + + try: + container_model = f"{provider}/container" + model_info = litellm.get_model_info( + model=container_model, + custom_llm_provider=provider + ) + model_key = model_info.get("key") if isinstance(model_info, dict) else getattr(model_info, "key", None) + + if model_key and model_key in litellm.model_cost: + return litellm.model_cost[model_key].get("code_interpreter_cost_per_session") + + except Exception: + pass + + return None + @staticmethod def get_cost_for_code_interpreter( sessions: Optional[int] = None, @@ -604,7 +629,8 @@ def get_cost_for_code_interpreter( """ Calculate cost for code interpreter feature. - Azure: $0.03 USD per session + Azure: $0.03 USD per session (from model cost map) + OpenAI: $0.03 USD per session (from model cost map) """ if sessions is None or sessions == 0: return 0.0 @@ -613,13 +639,15 @@ def get_cost_for_code_interpreter( if model_info and "code_interpreter_cost_per_session" in model_info: return sessions * model_info["code_interpreter_cost_per_session"] - # Azure pricing for code interpreter - if provider == "azure": - from litellm.constants import AZURE_CODE_INTERPRETER_COST_PER_SESSION - - return sessions * AZURE_CODE_INTERPRETER_COST_PER_SESSION + # Try to get cost from model cost map for any provider + if provider: + cost_per_session = StandardBuiltInToolCostTracking._get_code_interpreter_cost_from_model_map( + provider=provider + ) + if cost_per_session is not None: + return sessions * cost_per_session + - # OpenAI doesn't charge separately for code interpreter yet return 0.0 @staticmethod diff --git a/litellm/litellm_core_utils/llm_cost_calc/usage_object_transformation.py b/litellm/litellm_core_utils/llm_cost_calc/usage_object_transformation.py new file mode 100644 index 000000000000..1432e912fd8e --- /dev/null +++ b/litellm/litellm_core_utils/llm_cost_calc/usage_object_transformation.py @@ -0,0 +1,38 @@ +from typing import Any, Optional, Union + +from litellm.types.utils import ( + PromptTokensDetailsWrapper, + TranscriptionUsageDurationObject, + TranscriptionUsageTokensObject, + Usage, +) + + +class TranscriptionUsageObjectTransformation: + @staticmethod + def is_transcription_usage_object( + usage_object: Any, + ) -> bool: + return isinstance(usage_object, TranscriptionUsageDurationObject) or isinstance( + usage_object, TranscriptionUsageTokensObject + ) + + @staticmethod + def transform_transcription_usage_object( + usage_object: Union[ + TranscriptionUsageDurationObject, TranscriptionUsageTokensObject + ], + ) -> Optional[Usage]: + if isinstance(usage_object, TranscriptionUsageDurationObject): + return None + elif isinstance(usage_object, TranscriptionUsageTokensObject): + return Usage( + prompt_tokens=usage_object.input_tokens, + completion_tokens=usage_object.output_tokens, + total_tokens=usage_object.total_tokens, + prompt_tokens_details=PromptTokensDetailsWrapper( + text_tokens=usage_object.input_token_details.text_tokens, + audio_tokens=usage_object.input_token_details.audio_tokens, + ), + ) + return None diff --git a/litellm/litellm_core_utils/llm_cost_calc/utils.py b/litellm/litellm_core_utils/llm_cost_calc/utils.py index 2fd0b44962e1..2308dc7becac 100644 --- a/litellm/litellm_core_utils/llm_cost_calc/utils.py +++ b/litellm/litellm_core_utils/llm_cost_calc/utils.py @@ -1,7 +1,7 @@ # What is this? 
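For the code interpreter pricing just above, the charge reduces to a per-session rate fetched from the model cost map under a synthetic "<provider>/container" key, multiplied by the session count. A minimal sketch of that lookup-and-multiply flow; TOY_MODEL_COST and code_interpreter_cost are invented names, and the 0.03 rate is the figure quoted in the docstring rather than a value read from litellm.model_cost:

from typing import Optional

# Toy stand-in for litellm.model_cost; the real map is loaded by LiteLLM.
TOY_MODEL_COST = {
    "azure/container": {"code_interpreter_cost_per_session": 0.03},
    "openai/container": {"code_interpreter_cost_per_session": 0.03},
}


def code_interpreter_cost(provider: str, sessions: Optional[int]) -> float:
    """Per-session cost for the code interpreter tool: look up the rate, multiply by sessions."""
    if not sessions:
        return 0.0
    rate = TOY_MODEL_COST.get(f"{provider}/container", {}).get(
        "code_interpreter_cost_per_session"
    )
    if rate is None:
        return 0.0
    return sessions * rate


print(code_interpreter_cost("azure", 4))    # 0.12 (4 sessions * $0.03)
print(code_interpreter_cost("unknown", 4))  # 0.0  (no cost-map entry, so no charge)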
## Helper utilities for cost_per_token() -from typing import Any, Literal, Optional, Tuple, TypedDict, cast +from typing import Literal, Optional, Tuple, TypedDict, cast import litellm from litellm._logging import verbose_logger @@ -11,8 +11,8 @@ ImageResponse, ModelInfo, PassthroughCallTypes, - Usage, ServiceTier, + Usage, ) from litellm.utils import get_model_info @@ -23,6 +23,15 @@ def _is_above_128k(tokens: float) -> bool: return False +def get_billable_input_tokens(usage: Usage) -> int: + """ + Returns the number of billable input tokens. + Subtracts cached tokens from prompt tokens if applicable. + """ + details = _parse_prompt_tokens_details(usage) + return usage.prompt_tokens - details["cache_hit_tokens"] + + def select_cost_metric_for_model( model_info: ModelInfo, ) -> Literal["cost_per_character", "cost_per_token"]: @@ -118,21 +127,21 @@ def _generic_cost_per_character( def _get_service_tier_cost_key(base_key: str, service_tier: Optional[str]) -> str: """ Get the appropriate cost key based on service tier. - + Args: base_key: The base cost key (e.g., "input_cost_per_token") service_tier: The service tier ("flex", "priority", or None for standard) - + Returns: str: The cost key to use (e.g., "input_cost_per_token_flex" or "input_cost_per_token") """ if service_tier is None: return base_key - + # Only use service tier specific keys for "flex" and "priority" if service_tier.lower() in [ServiceTier.FLEX.value, ServiceTier.PRIORITY.value]: return f"{base_key}_{service_tier.lower()}" - + # For any other service tier, use standard pricing return base_key @@ -152,15 +161,24 @@ def _get_token_base_cost( # Get service tier aware cost keys input_cost_key = _get_service_tier_cost_key("input_cost_per_token", service_tier) output_cost_key = _get_service_tier_cost_key("output_cost_per_token", service_tier) - cache_creation_cost_key = _get_service_tier_cost_key("cache_creation_input_token_cost", service_tier) - cache_read_cost_key = _get_service_tier_cost_key("cache_read_input_token_cost", service_tier) - - prompt_base_cost = cast( - float, _get_cost_per_unit(model_info, input_cost_key) + cache_creation_cost_key = _get_service_tier_cost_key( + "cache_creation_input_token_cost", service_tier ) - completion_base_cost = cast( - float, _get_cost_per_unit(model_info, output_cost_key) + cache_read_cost_key = _get_service_tier_cost_key( + "cache_read_input_token_cost", service_tier ) + + prompt_base_cost = cast(float, _get_cost_per_unit(model_info, input_cost_key)) + completion_base_cost = cast(float, _get_cost_per_unit(model_info, output_cost_key)) + + # For image generation models that don't have output_cost_per_token, + # use output_cost_per_image_token as the base cost (all output tokens are image tokens) + if completion_base_cost == 0.0 or completion_base_cost is None: + output_image_cost = _get_cost_per_unit( + model_info, "output_cost_per_image_token", None + ) + if output_image_cost is not None: + completion_base_cost = cast(float, output_image_cost) cache_creation_cost = cast( float, _get_cost_per_unit(model_info, cache_creation_cost_key) ) @@ -168,9 +186,7 @@ def _get_token_base_cost( float, _get_cost_per_unit(model_info, "cache_creation_input_token_cost_above_1hr"), ) - cache_read_cost = cast( - float, _get_cost_per_unit(model_info, cache_read_cost_key) - ) + cache_read_cost = cast(float, _get_cost_per_unit(model_info, cache_read_cost_key)) ## CHECK IF ABOVE THRESHOLD threshold: Optional[float] = None @@ -183,7 +199,6 @@ def _get_token_base_cost( 1000 if "k" in threshold_str else 1 ) if 
usage.prompt_tokens > threshold: - prompt_base_cost = cast( float, _get_cost_per_unit(model_info, key, prompt_base_cost) ) @@ -200,6 +215,9 @@ def _get_token_base_cost( cache_creation_tiered_key = ( f"cache_creation_input_token_cost_above_{threshold_str}_tokens" ) + cache_creation_1hr_tiered_key = ( + f"cache_creation_input_token_cost_above_1hr_above_{threshold_str}_tokens" + ) cache_read_tiered_key = ( f"cache_read_input_token_cost_above_{threshold_str}_tokens" ) @@ -214,6 +232,16 @@ def _get_token_base_cost( ), ) + if cache_creation_1hr_tiered_key in model_info: + cache_creation_cost_above_1hr = cast( + float, + _get_cost_per_unit( + model_info, + cache_creation_1hr_tiered_key, + cache_creation_cost_above_1hr, + ), + ) + if cache_read_tiered_key in model_info: cache_read_cost = cast( float, @@ -278,7 +306,7 @@ def _get_cost_per_unit( verbose_logger.exception( f"litellm.litellm_core_utils.llm_cost_calc.utils.py::calculate_cost_per_component(): Exception occured - {cost_per_unit}\nDefaulting to 0.0" ) - + # If the service tier key doesn't exist or is None, try to fall back to the standard key if cost_per_unit is None: # Check if any service tier suffix exists in the cost key using ServiceTier enum @@ -286,7 +314,7 @@ def _get_cost_per_unit( suffix = f"_{service_tier.value}" if suffix in cost_key: # Extract the base key by removing the matched suffix - base_key = cost_key.replace(suffix, '') + base_key = cost_key.replace(suffix, "") fallback_cost = model_info.get(base_key) if isinstance(fallback_cost, float): return fallback_cost @@ -300,7 +328,7 @@ def _get_cost_per_unit( f"litellm.litellm_core_utils.llm_cost_calc.utils.py::_get_cost_per_unit(): Exception occured - {fallback_cost}\nDefaulting to 0.0" ) break # Only try the first matching suffix - + return default_value @@ -344,9 +372,10 @@ class PromptTokensDetailsResult(TypedDict): cache_creation_token_details: Optional[CacheCreationTokenDetails] text_tokens: int audio_tokens: int + image_tokens: int character_count: int image_count: int - video_length_seconds: int + video_length_seconds: float def _parse_prompt_tokens_details(usage: Usage) -> PromptTokensDetailsResult: @@ -376,6 +405,10 @@ def _parse_prompt_tokens_details(usage: Usage) -> PromptTokensDetailsResult: cast(Optional[int], getattr(usage.prompt_tokens_details, "audio_tokens", 0)) or 0 ) + image_tokens = ( + cast(Optional[int], getattr(usage.prompt_tokens_details, "image_tokens", 0)) + or 0 + ) character_count = ( cast( Optional[int], @@ -388,10 +421,10 @@ def _parse_prompt_tokens_details(usage: Usage) -> PromptTokensDetailsResult: ) video_length_seconds = ( cast( - Optional[int], + Optional[float], getattr(usage.prompt_tokens_details, "video_length_seconds", 0), ) - or 0 + or 0.0 ) return PromptTokensDetailsResult( @@ -400,9 +433,10 @@ def _parse_prompt_tokens_details(usage: Usage) -> PromptTokensDetailsResult: cache_creation_token_details=cache_creation_token_details, text_tokens=text_tokens, audio_tokens=audio_tokens, + image_tokens=image_tokens, character_count=character_count, image_count=image_count, - video_length_seconds=video_length_seconds, + video_length_seconds=float(video_length_seconds), ) @@ -410,6 +444,7 @@ class CompletionTokensDetailsResult(TypedDict): audio_tokens: int text_tokens: int reasoning_tokens: int + image_tokens: int def _parse_completion_tokens_details(usage: Usage) -> CompletionTokensDetailsResult: @@ -434,11 +469,19 @@ def _parse_completion_tokens_details(usage: Usage) -> CompletionTokensDetailsRes ) or 0 ) + image_tokens = ( + cast( + 
Optional[int], + getattr(usage.completion_tokens_details, "image_tokens", 0), + ) + or 0 + ) return CompletionTokensDetailsResult( audio_tokens=audio_tokens, text_tokens=text_tokens, reasoning_tokens=reasoning_tokens, + image_tokens=image_tokens, ) @@ -463,6 +506,16 @@ def _calculate_input_cost( model_info, "input_cost_per_audio_token", prompt_tokens_details["audio_tokens"] ) + ### IMAGE TOKEN COST + # For image token costs: + # First check if input_cost_per_image_token is available. If not, default to generic input_cost_per_token. + image_token_cost_key = "input_cost_per_image_token" + if model_info.get(image_token_cost_key) is None: + image_token_cost_key = "input_cost_per_token" + prompt_cost += calculate_cost_component( + model_info, image_token_cost_key, prompt_tokens_details["image_tokens"] + ) + ### CACHE WRITING COST - Now uses tiered pricing prompt_cost += calculate_cache_writing_cost( cache_creation_tokens=prompt_tokens_details["cache_creation_tokens"], @@ -494,8 +547,11 @@ def _calculate_input_cost( return prompt_cost -def generic_cost_per_token( - model: str, usage: Usage, custom_llm_provider: str, service_tier: Optional[str] = None +def generic_cost_per_token( # noqa: PLR0915 + model: str, + usage: Usage, + custom_llm_provider: str, + service_tier: Optional[str] = None, ) -> Tuple[float, float]: """ Calculates the cost per token for a given model, prompt tokens, and completion tokens. @@ -523,21 +579,36 @@ def generic_cost_per_token( cache_creation_token_details=None, text_tokens=usage.prompt_tokens, audio_tokens=0, + image_tokens=0, character_count=0, image_count=0, - video_length_seconds=0, + video_length_seconds=0.0, ) if usage.prompt_tokens_details: prompt_tokens_details = _parse_prompt_tokens_details(usage) - ## EDGE CASE - text tokens not set inside PromptTokensDetails + ## EDGE CASE - text tokens not set or includes cached tokens (double-counting) + ## Some providers (like xAI) report text_tokens = prompt_tokens (including cached) + ## We detect this when: text_tokens + cached_tokens + other > prompt_tokens + ## Ref: https://github.com/BerriAI/litellm/issues/19680, #14874, #14875 + + cache_hit = prompt_tokens_details["cache_hit_tokens"] + text_tokens = prompt_tokens_details["text_tokens"] + audio_tokens = prompt_tokens_details["audio_tokens"] + cache_creation = prompt_tokens_details["cache_creation_tokens"] + image_tokens = prompt_tokens_details["image_tokens"] + + # Check for double-counting: sum of details > prompt_tokens means overlap + total_details = text_tokens + cache_hit + audio_tokens + cache_creation + image_tokens + has_double_counting = cache_hit > 0 and total_details > usage.prompt_tokens - if prompt_tokens_details["text_tokens"] == 0: + if text_tokens == 0 or has_double_counting: text_tokens = ( usage.prompt_tokens - - prompt_tokens_details["cache_hit_tokens"] - - prompt_tokens_details["audio_tokens"] - - prompt_tokens_details["cache_creation_tokens"] + - cache_hit + - audio_tokens + - cache_creation + - image_tokens ) prompt_tokens_details["text_tokens"] = text_tokens @@ -547,7 +618,9 @@ def generic_cost_per_token( cache_creation_cost, cache_creation_cost_above_1hr, cache_read_cost, - ) = _get_token_base_cost(model_info=model_info, usage=usage, service_tier=service_tier) + ) = _get_token_base_cost( + model_info=model_info, usage=usage, service_tier=service_tier + ) prompt_cost = _calculate_input_cost( prompt_tokens_details=prompt_tokens_details, @@ -562,17 +635,35 @@ def generic_cost_per_token( text_tokens = 0 audio_tokens = 0 reasoning_tokens = 0 + 
image_tokens = 0 is_text_tokens_total = False if usage.completion_tokens_details is not None: completion_tokens_details = _parse_completion_tokens_details(usage) audio_tokens = completion_tokens_details["audio_tokens"] text_tokens = completion_tokens_details["text_tokens"] reasoning_tokens = completion_tokens_details["reasoning_tokens"] + image_tokens = completion_tokens_details["image_tokens"] + # Handle text_tokens calculation: + # 1. If text_tokens is explicitly provided and > 0, use it + # 2. If there's a breakdown (reasoning/audio/image tokens), calculate text_tokens as the remainder + # 3. If no breakdown at all, assume all completion_tokens are text_tokens + has_token_breakdown = image_tokens > 0 or audio_tokens > 0 or reasoning_tokens > 0 if text_tokens == 0: - text_tokens = usage.completion_tokens - if text_tokens == usage.completion_tokens: - is_text_tokens_total = True + if has_token_breakdown: + # Calculate text tokens as remainder when we have a breakdown + # This handles cases like OpenAI's reasoning models where text_tokens isn't provided + text_tokens = max( + 0, + usage.completion_tokens + - reasoning_tokens + - audio_tokens + - image_tokens, + ) + else: + # No breakdown at all, all tokens are text tokens + text_tokens = usage.completion_tokens + is_text_tokens_total = True ## TEXT COST completion_cost = float(text_tokens) * completion_base_cost @@ -582,6 +673,9 @@ def generic_cost_per_token( _output_cost_per_reasoning_token = _get_cost_per_unit( model_info, "output_cost_per_reasoning_token", None ) + _output_cost_per_image_token = _get_cost_per_unit( + model_info, "output_cost_per_image_token", None + ) ## AUDIO COST if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0: @@ -601,6 +695,15 @@ def generic_cost_per_token( ) completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token + ## IMAGE COST + if not is_text_tokens_total and image_tokens and image_tokens > 0: + _output_cost_per_image_token = ( + _output_cost_per_image_token + if _output_cost_per_image_token is not None + else completion_base_cost + ) + completion_cost += float(image_tokens) * _output_cost_per_image_token + return prompt_cost, completion_cost @@ -631,12 +734,13 @@ def _call_type_has_image_response(call_type: str) -> bool: @staticmethod def route_image_generation_cost_calculator( model: str, - completion_response: Any, + completion_response: ImageResponse, custom_llm_provider: Optional[str] = None, quality: Optional[str] = None, n: Optional[int] = None, size: Optional[str] = None, optional_params: Optional[dict] = None, + call_type: Optional[str] = None, ) -> float: """ Route the image generation cost calculator based on the custom_llm_provider @@ -645,7 +749,7 @@ def route_image_generation_cost_calculator( from litellm.llms.azure_ai.image_generation.cost_calculator import ( cost_calculator as azure_ai_image_cost_calculator, ) - from litellm.llms.bedrock.image.cost_calculator import ( + from litellm.llms.bedrock.image_generation.cost_calculator import ( cost_calculator as bedrock_image_cost_calculator, ) from litellm.llms.gemini.image_generation.cost_calculator import ( @@ -658,6 +762,13 @@ def route_image_generation_cost_calculator( cost_calculator as vertex_ai_image_cost_calculator, ) + if size is None: + size = completion_response.size or "1024-x-1024" + if quality is None: + quality = completion_response.quality or "standard" + if n is None: + n = len(completion_response.data) if completion_response.data else 0 + if custom_llm_provider == 
litellm.LlmProviders.VERTEX_AI.value: if isinstance(completion_response, ImageResponse): return vertex_ai_image_cost_calculator( @@ -703,6 +814,18 @@ def route_image_generation_cost_calculator( image_response=completion_response, ) elif custom_llm_provider == litellm.LlmProviders.GEMINI.value: + if call_type in ( + CallTypes.image_edit.value, + CallTypes.aimage_edit.value, + ): + from litellm.llms.gemini.image_edit.cost_calculator import ( + cost_calculator as gemini_image_edit_cost_calculator, + ) + + return gemini_image_edit_cost_calculator( + model=model, + image_response=completion_response, + ) from litellm.llms.gemini.image_generation.cost_calculator import ( cost_calculator as gemini_image_cost_calculator, ) @@ -716,6 +839,68 @@ def route_image_generation_cost_calculator( model=model, image_response=completion_response, ) + elif custom_llm_provider == litellm.LlmProviders.FAL_AI.value: + from litellm.llms.fal_ai.cost_calculator import ( + cost_calculator as fal_ai_image_cost_calculator, + ) + + return fal_ai_image_cost_calculator( + model=model, + image_response=completion_response, + ) + elif custom_llm_provider == litellm.LlmProviders.RUNWAYML.value: + from litellm.llms.runwayml.cost_calculator import ( + cost_calculator as runwayml_image_cost_calculator, + ) + + return runwayml_image_cost_calculator( + model=model, + image_response=completion_response, + ) + elif custom_llm_provider == litellm.LlmProviders.OPENAI.value: + # Check if this is a gpt-image model (token-based pricing) + model_lower = model.lower() + if "gpt-image-1" in model_lower: + from litellm.llms.openai.image_generation.cost_calculator import ( + cost_calculator as openai_gpt_image_cost_calculator, + ) + + return openai_gpt_image_cost_calculator( + model=model, + image_response=completion_response, + custom_llm_provider=custom_llm_provider, + ) + # Fall through to default for DALL-E models + return default_image_cost_calculator( + model=model, + quality=quality, + custom_llm_provider=custom_llm_provider, + n=n, + size=size, + optional_params=optional_params, + ) + elif custom_llm_provider == litellm.LlmProviders.AZURE.value: + # Check if this is a gpt-image model (token-based pricing) + model_lower = model.lower() + if "gpt-image-1" in model_lower: + from litellm.llms.openai.image_generation.cost_calculator import ( + cost_calculator as openai_gpt_image_cost_calculator, + ) + + return openai_gpt_image_cost_calculator( + model=model, + image_response=completion_response, + custom_llm_provider=custom_llm_provider, + ) + # Fall through to default for DALL-E models + return default_image_cost_calculator( + model=model, + quality=quality, + custom_llm_provider=custom_llm_provider, + n=n, + size=size, + optional_params=optional_params, + ) else: return default_image_cost_calculator( model=model, diff --git a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py index 6ed9d5725e9b..25ad0a570cb8 100644 --- a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py +++ b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py @@ -21,11 +21,13 @@ ChatCompletionMessageToolCall, ChatCompletionRedactedThinkingBlock, Choices, + CompletionTokensDetailsWrapper, Delta, EmbeddingResponse, Function, HiddenParams, ImageResponse, + PromptTokensDetailsWrapper, ) from litellm.types.utils import Logprobs as TextCompletionLogprobs from litellm.types.utils import ( @@ -37,6 +39,8 @@ TextChoices, 
TextCompletionResponse, TranscriptionResponse, + TranscriptionUsageDurationObject, + TranscriptionUsageTokensObject, Usage, ) @@ -302,6 +306,22 @@ def convert_to_image_response( "text_tokens": 0, } + # Map Responses API naming to Chat Completions API naming for cost calculator + if usage.get("prompt_tokens") is None: + usage["prompt_tokens"] = usage.get("input_tokens", 0) + if usage.get("completion_tokens") is None: + usage["completion_tokens"] = usage.get("output_tokens", 0) + + # Convert dicts to wrapper objects so getattr() works in cost calculation + if isinstance(usage.get("input_tokens_details"), dict): + usage["prompt_tokens_details"] = PromptTokensDetailsWrapper( + **usage["input_tokens_details"] + ) + if isinstance(usage.get("output_tokens_details"), dict): + usage["completion_tokens_details"] = CompletionTokensDetailsWrapper( + **usage["output_tokens_details"] + ) + if model_response_object is None: model_response_object = ImageResponse(**response_object) return model_response_object @@ -428,28 +448,58 @@ def convert_to_model_response_object( # noqa: PLR0915 if hidden_params is None: hidden_params = {} + + # Preserve existing additional_headers if they contain important provider headers + # For responses API, additional_headers may already be set with LLM provider headers + existing_additional_headers = hidden_params.get("additional_headers", {}) + if existing_additional_headers and _response_headers is None: + # Keep existing headers when _response_headers is None (responses API case) + additional_headers = existing_additional_headers + else: + # Merge new headers with existing ones + if existing_additional_headers: + additional_headers.update(existing_additional_headers) + hidden_params["additional_headers"] = additional_headers ### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary + # Some OpenAI-compatible providers (e.g., Apertis) return empty error objects + # even on success. Only raise if the error contains meaningful data. 
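To make the "meaningful error" rule described in the comment above concrete, here is a small standalone restatement of the predicate applied in the block that follows; is_meaningful_error and the sample payloads are illustrative only, not actual provider output:

def is_meaningful_error(error_obj) -> bool:
    """Sketch of the check below: blank messages and null codes are treated as no error."""
    if isinstance(error_obj, dict):
        return bool(error_obj.get("message", "")) or error_obj.get("code") is not None
    if isinstance(error_obj, str):
        return bool(error_obj)
    return True  # any other non-None shape is treated as a real error


assert is_meaningful_error({"message": "", "code": None}) is False  # empty error object: ignore
assert is_meaningful_error({"message": "rate limited", "code": 429}) is True
assert is_meaningful_error("upstream timeout") is True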
if ( response_object is not None and "error" in response_object and response_object["error"] is not None ): - error_args = {"status_code": 422, "message": "Error in response object"} - if isinstance(response_object["error"], dict): - if "code" in response_object["error"]: - error_args["status_code"] = response_object["error"]["code"] - if "message" in response_object["error"]: - if isinstance(response_object["error"]["message"], dict): - message_str = json.dumps(response_object["error"]["message"]) - else: - message_str = str(response_object["error"]["message"]) - error_args["message"] = message_str - raised_exception = Exception() - setattr(raised_exception, "status_code", error_args["status_code"]) - setattr(raised_exception, "message", error_args["message"]) - raise raised_exception + error_obj = response_object["error"] + has_meaningful_error = False + + if isinstance(error_obj, dict): + # Check if error dict has non-empty message or non-null code + error_message = error_obj.get("message", "") + error_code = error_obj.get("code") + has_meaningful_error = bool(error_message) or error_code is not None + elif isinstance(error_obj, str): + # String error is meaningful if non-empty + has_meaningful_error = bool(error_obj) + else: + # Any other truthy value is considered meaningful + has_meaningful_error = True + + if has_meaningful_error: + error_args = {"status_code": 422, "message": "Error in response object"} + if isinstance(error_obj, dict): + if "code" in error_obj: + error_args["status_code"] = error_obj["code"] + if "message" in error_obj: + if isinstance(error_obj["message"], dict): + message_str = json.dumps(error_obj["message"]) + else: + message_str = str(error_obj["message"]) + error_args["message"] = message_str + raised_exception = Exception() + setattr(raised_exception, "status_code", error_args["status_code"]) + setattr(raised_exception, "message", error_args["message"]) + raise raised_exception try: if response_type == "completion" and ( @@ -684,6 +734,24 @@ def convert_to_model_response_object( # noqa: PLR0915 if key in response_object: setattr(model_response_object, key, response_object[key]) + if "usage" in response_object and response_object["usage"] is not None: + tr_usage_object: Optional[ + Union[ + TranscriptionUsageDurationObject, TranscriptionUsageTokensObject + ] + ] = None + + if response_object["usage"].get("type", None) == "duration": + tr_usage_object = TranscriptionUsageDurationObject( + **response_object["usage"] + ) + elif response_object["usage"].get("type", None) == "tokens": + tr_usage_object = TranscriptionUsageTokensObject( + **response_object["usage"] + ) + if tr_usage_object is not None: + setattr(model_response_object, "usage", tr_usage_object) + if hidden_params is not None: model_response_object._hidden_params = hidden_params diff --git a/litellm/litellm_core_utils/llm_response_utils/get_formatted_prompt.py b/litellm/litellm_core_utils/llm_response_utils/get_formatted_prompt.py index fffaad79b9e8..f7406398a46d 100644 --- a/litellm/litellm_core_utils/llm_response_utils/get_formatted_prompt.py +++ b/litellm/litellm_core_utils/llm_response_utils/get_formatted_prompt.py @@ -4,6 +4,7 @@ def get_formatted_prompt( data: dict, call_type: Literal[ + "acompletion", "completion", "embedding", "image_generation", @@ -18,7 +19,7 @@ def get_formatted_prompt( Returns a string. 
""" prompt = "" - if call_type == "completion": + if call_type == "acompletion" or call_type == "completion": for message in data["messages"]: if message.get("content", None) is not None: content = message.get("content") diff --git a/litellm/litellm_core_utils/logging_callback_manager.py b/litellm/litellm_core_utils/logging_callback_manager.py index 9ec346c20a10..4f76a5bad03e 100644 --- a/litellm/litellm_core_utils/logging_callback_manager.py +++ b/litellm/litellm_core_utils/logging_callback_manager.py @@ -1,9 +1,10 @@ -from typing import TYPE_CHECKING, Callable, List, Optional, Set, Type, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Type, Union import litellm from litellm._logging import verbose_logger from litellm.integrations.additional_logging_utils import AdditionalLoggingUtils from litellm.integrations.custom_logger import CustomLogger +from litellm.integrations.generic_api.generic_api_callback import GenericAPILogger from litellm.types.utils import CallbacksByType if TYPE_CHECKING: @@ -11,6 +12,8 @@ else: _custom_logger_compatible_callbacks_literal = str +_generic_api_logger_cache: Dict[str, GenericAPILogger] = {} + class LoggingCallbackManager: """ @@ -138,6 +141,78 @@ def _check_callback_list_size( return False return True + @staticmethod + def _add_custom_callback_generic_api_str( + callback: str, + ) -> Union[GenericAPILogger, str]: + """ + litellm_settings: + success_callback: ["custom_callback_name"] + + callback_settings: + custom_callback_name: + callback_type: generic_api + endpoint: https://webhook-test.com/30343bc33591bc5e6dc44217ceae3e0a + headers: + Authorization: Bearer sk-1234 + """ + callback_config = litellm.callback_settings.get(callback) + + # Check if callback is in callback_settings with callback_type: generic_api + if ( + isinstance(callback_config, dict) + and callback_config.get("callback_type") == "generic_api" + ): + endpoint = callback_config.get("endpoint") + headers = callback_config.get("headers") + event_types = callback_config.get("event_types") + log_format = callback_config.get("log_format") + + if endpoint is None or headers is None: + verbose_logger.warning( + "generic_api callback '%s' is missing endpoint or headers, skipping.", + callback, + ) + return callback + + cached_logger = _generic_api_logger_cache.get(callback) + if ( + isinstance(cached_logger, GenericAPILogger) + and cached_logger.endpoint == endpoint + and cached_logger.headers == headers + and cached_logger.event_types == event_types + and cached_logger.log_format == log_format + ): + return cached_logger + + new_logger = GenericAPILogger( + endpoint=endpoint, + headers=headers, + event_types=event_types, + log_format=log_format, + ) + _generic_api_logger_cache[callback] = new_logger + return new_logger + + # Check if callback is in generic_api_compatible_callbacks.json + from litellm.integrations.generic_api.generic_api_callback import ( + is_callback_compatible, + ) + + if is_callback_compatible(callback): + # Check if we already have a cached logger for this callback + cached_logger = _generic_api_logger_cache.get(callback) + if isinstance(cached_logger, GenericAPILogger): + return cached_logger + + # Create new GenericAPILogger with callback_name parameter + # This will load config from generic_api_compatible_callbacks.json + new_logger = GenericAPILogger(callback_name=callback) + _generic_api_logger_cache[callback] = new_logger + return new_logger + + return callback + def _safe_add_callback_to_list( self, callback: Union[CustomLogger, Callable, 
str], @@ -152,6 +227,13 @@ def _safe_add_callback_to_list( if not self._check_callback_list_size(parent_list): return + # Check if the callback is a custom callback + + if isinstance(callback, str): + callback = LoggingCallbackManager._add_custom_callback_generic_api_str( + callback + ) + if isinstance(callback, str): self._add_string_callback_to_list( callback=callback, parent_list=parent_list @@ -161,6 +243,7 @@ def _safe_add_callback_to_list( custom_logger=callback, parent_list=parent_list, ) + elif callable(callback): self._add_callback_function_to_list( callback=callback, parent_list=parent_list @@ -348,7 +431,6 @@ def _get_callback_string(self, callback: Union[CustomLogger, Callable, str]) -> elif callable(callback): return getattr(callback, "__name__", str(callback)) return str(callback) - def get_active_custom_logger_for_callback_name( self, @@ -362,12 +444,16 @@ def get_active_custom_logger_for_callback_name( ) # get the custom logger class type - custom_logger_class_type = CustomLoggerRegistry.get_class_type_for_custom_logger_name(callback_name) + custom_logger_class_type = ( + CustomLoggerRegistry.get_class_type_for_custom_logger_name(callback_name) + ) # get the active custom logger custom_logger = self.get_custom_loggers_for_type(custom_logger_class_type) if len(custom_logger) == 0: - raise ValueError(f"No active custom logger found for callback name: {callback_name}") + raise ValueError( + f"No active custom logger found for callback name: {callback_name}" + ) return custom_logger[0] diff --git a/litellm/litellm_core_utils/logging_worker.py b/litellm/litellm_core_utils/logging_worker.py index 3c475f133a88..d5eca9eeb552 100644 --- a/litellm/litellm_core_utils/logging_worker.py +++ b/litellm/litellm_core_utils/logging_worker.py @@ -1,11 +1,22 @@ +# This file may be a good candidate to be the first one to be refactored into a separate process, +# for the sake of performance and scalability. + import asyncio -import contextlib import contextvars from typing import Coroutine, Optional - +import atexit from typing_extensions import TypedDict from litellm._logging import verbose_logger +from litellm.constants import ( + LOGGING_WORKER_CONCURRENCY, + LOGGING_WORKER_MAX_QUEUE_SIZE, + LOGGING_WORKER_MAX_TIME_PER_COROUTINE, + LOGGING_WORKER_CLEAR_PERCENTAGE, + LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS, + MAX_ITERATIONS_TO_CLEAR_QUEUE, + MAX_TIME_TO_CLEAR_QUEUE, +) class LoggingTask(TypedDict): @@ -27,42 +38,61 @@ class LoggingWorker: - Use this to queue coroutine tasks that are not critical to the main flow of the application. e.g Success/Error callbacks, logging, etc. 
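# A minimal sketch (not LiteLLM's own code) of how the callback-resolution helper
# in logging_callback_manager.py above is exercised: a plain string listed in
# success_callback is swapped for a GenericAPILogger once callback_settings marks
# it as callback_type: generic_api. The callback name, endpoint, and header value
# below are placeholders; the constructor kwargs and the
# litellm.integrations.generic_api.generic_api_callback path come from the diff.
import litellm

litellm.callback_settings = {
    "my_webhook": {
        "callback_type": "generic_api",
        "endpoint": "https://example.com/litellm-logs",   # placeholder endpoint
        "headers": {"Authorization": "Bearer sk-xxxx"},    # placeholder header
    }
}
# Listing the string is enough; _safe_add_callback_to_list resolves it to a cached
# GenericAPILogger instance (or reuses the cached one if the config is unchanged).
litellm.success_callback = ["my_webhook"]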
""" - LOGGING_WORKER_MAX_QUEUE_SIZE = 50_000 - LOGGING_WORKER_MAX_TIME_PER_COROUTINE = 20.0 - - MAX_ITERATIONS_TO_CLEAR_QUEUE = 200 - MAX_TIME_TO_CLEAR_QUEUE = 5.0 - def __init__( self, timeout: float = LOGGING_WORKER_MAX_TIME_PER_COROUTINE, max_queue_size: int = LOGGING_WORKER_MAX_QUEUE_SIZE, + concurrency: int = LOGGING_WORKER_CONCURRENCY, ): self.timeout = timeout self.max_queue_size = max_queue_size + self.concurrency = concurrency self._queue: Optional[asyncio.Queue[LoggingTask]] = None self._worker_task: Optional[asyncio.Task] = None + self._running_tasks: set[asyncio.Task] = set() + self._sem: Optional[asyncio.Semaphore] = None + self._bound_loop: Optional[asyncio.AbstractEventLoop] = None + self._last_aggressive_clear_time: float = 0.0 + self._aggressive_clear_in_progress: bool = False + + # Register cleanup handler to flush remaining events on exit + atexit.register(self._flush_on_exit) def _ensure_queue(self) -> None: - """Initialize the queue if it doesn't exist.""" + """Initialize the queue if it doesn't exist or if event loop has changed.""" + try: + current_loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop, can't initialize + return + + # Check if we need to reinitialize due to event loop change + if self._queue is not None and self._bound_loop is not current_loop: + verbose_logger.debug( + "LoggingWorker: Event loop changed, reinitializing queue and worker" + ) + # Clear old state - these are bound to the old loop + self._queue = None + self._sem = None + self._worker_task = None + self._running_tasks.clear() + if self._queue is None: self._queue = asyncio.Queue(maxsize=self.max_queue_size) + self._bound_loop = current_loop def start(self) -> None: """Start the logging worker. Idempotent - safe to call multiple times.""" self._ensure_queue() + if self._sem is None: + self._sem = asyncio.Semaphore(self.concurrency) if self._worker_task is None or self._worker_task.done(): self._worker_task = asyncio.create_task(self._worker_loop()) - async def _worker_loop(self) -> None: - """Main worker loop that processes log coroutines sequentially.""" + async def _process_log_task(self, task: LoggingTask, sem: asyncio.Semaphore): + """Runs the logging task and handles cleanup. Releases semaphore when done.""" try: - if self._queue is None: - return - - while True: - # Process one coroutine at a time to keep event loop load predictable - task = await self._queue.get() + if self._queue is not None: try: # Run the coroutine in its original context await asyncio.wait_for( @@ -71,9 +101,34 @@ async def _worker_loop(self) -> None: ) except Exception as e: verbose_logger.exception(f"LoggingWorker error: {e}") - pass finally: self._queue.task_done() + finally: + # Always release semaphore, even if queue is None + sem.release() + + async def _worker_loop(self) -> None: + """Main worker loop that gets tasks and schedules them to run concurrently.""" + try: + if self._queue is None or self._sem is None: + return + + while True: + # Acquire semaphore before removing task from queue to prevent + # unbounded growth of waiting tasks + await self._sem.acquire() + try: + task = await self._queue.get() + # Track each spawned coroutine so we can cancel on shutdown. 
+ processing_task = asyncio.create_task( + self._process_log_task(task, self._sem) + ) + self._running_tasks.add(processing_task) + processing_task.add_done_callback(self._running_tasks.discard) + except Exception: + # If task creation fails, release semaphore to prevent deadlock + self._sem.release() + raise except asyncio.CancelledError: verbose_logger.debug("LoggingWorker cancelled during shutdown") @@ -83,19 +138,207 @@ async def _worker_loop(self) -> None: def enqueue(self, coroutine: Coroutine) -> None: """ Add a coroutine to the logging queue. - Hot path: never blocks, drops logs if queue is full. + Hot path: never blocks, aggressively clears queue if full. + """ + if self._queue is None: + return + + # Capture the current context when enqueueing + task = LoggingTask(coroutine=coroutine, context=contextvars.copy_context()) + + try: + self._queue.put_nowait(task) + except asyncio.QueueFull: + # Queue is full - handle it appropriately + verbose_logger.exception("LoggingWorker queue is full") + self._handle_queue_full(task) + + def _should_start_aggressive_clear(self) -> bool: + """ + Check if we should start a new aggressive clear operation. + Returns True if cooldown period has passed and no clear is in progress. + """ + if self._aggressive_clear_in_progress: + return False + + try: + loop = asyncio.get_running_loop() + current_time = loop.time() + time_since_last_clear = current_time - self._last_aggressive_clear_time + + if time_since_last_clear < LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS: + return False + + return True + except RuntimeError: + # No event loop running, drop the task + return False + + def _mark_aggressive_clear_started(self) -> None: + """ + Mark that an aggressive clear operation has started. + + Note: This should only be called after _should_start_aggressive_clear() + returns True, which guarantees an event loop exists. + """ + loop = asyncio.get_running_loop() + self._last_aggressive_clear_time = loop.time() + self._aggressive_clear_in_progress = True + + def _handle_queue_full(self, task: LoggingTask) -> None: + """ + Handle queue full condition by either starting an aggressive clear + or scheduling a delayed retry. + """ + + if self._should_start_aggressive_clear(): + self._mark_aggressive_clear_started() + # Schedule clearing as async task so enqueue returns immediately (non-blocking) + asyncio.create_task(self._aggressively_clear_queue_async(task)) + else: + # Cooldown active or clear in progress, schedule a delayed retry + self._schedule_delayed_enqueue_retry(task) + + def _calculate_retry_delay(self) -> float: """ + Calculate the delay before retrying an enqueue operation. + Returns the delay in seconds. + """ + try: + loop = asyncio.get_running_loop() + current_time = loop.time() + time_since_last_clear = current_time - self._last_aggressive_clear_time + remaining_cooldown = max( + 0.0, + LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS + - time_since_last_clear, + ) + # Add a small buffer (10% of cooldown or 50ms, whichever is larger) to ensure + # cooldown has expired and aggressive clear has completed + return remaining_cooldown + max( + 0.05, LOGGING_WORKER_AGGRESSIVE_CLEAR_COOLDOWN_SECONDS * 0.1 + ) + except RuntimeError: + # No event loop, return minimum delay + return 0.1 + + def _schedule_delayed_enqueue_retry(self, task: LoggingTask) -> None: + """ + Schedule a delayed retry to enqueue the task after cooldown expires. + This prevents dropping tasks when the queue is full during cooldown. + Preserves the original task context. 
+ """ + try: + # Check that we have a running event loop (will raise RuntimeError if not) + asyncio.get_running_loop() + delay = self._calculate_retry_delay() + + # Schedule the retry as a background task + asyncio.create_task(self._retry_enqueue_task(task, delay)) + except RuntimeError: + # No event loop, drop the task as we can't schedule a retry + pass + + async def _retry_enqueue_task(self, task: LoggingTask, delay: float) -> None: + """ + Retry enqueueing the task after delay, preserving original context. + This is called as a background task from _schedule_delayed_enqueue_retry. + """ + await asyncio.sleep(delay) + + # Try to enqueue the task directly, preserving its original context if self._queue is None: return try: - # Capture the current context when enqueueing - task = LoggingTask(coroutine=coroutine, context=contextvars.copy_context()) self._queue.put_nowait(task) - except asyncio.QueueFull as e: - verbose_logger.exception(f"LoggingWorker queue is full: {e}") - # Drop logs on overload to protect request throughput + except asyncio.QueueFull: + # Still full - handle it appropriately (clear or retry again) + self._handle_queue_full(task) + + def _extract_tasks_from_queue(self) -> list[LoggingTask]: + """ + Extract tasks from the queue to make room. + Returns a list of extracted tasks based on percentage of queue size. + """ + if self._queue is None: + return [] + + # Calculate items based on percentage of queue size + items_to_extract = ( + self.max_queue_size * LOGGING_WORKER_CLEAR_PERCENTAGE + ) // 100 + # Use actual queue size to avoid unnecessary iterations + actual_size = self._queue.qsize() + if actual_size == 0: + return [] + items_to_extract = min(items_to_extract, actual_size) + + # Extract tasks from queue (using list comprehension would require wrapping in try/except) + extracted_tasks = [] + for _ in range(items_to_extract): + try: + extracted_tasks.append(self._queue.get_nowait()) + except asyncio.QueueEmpty: + break + + return extracted_tasks + + async def _aggressively_clear_queue_async( + self, new_task: Optional[LoggingTask] = None + ) -> None: + """ + Aggressively clear the queue by extracting and processing items. + This is called when the queue is full to prevent dropping logs. + Fully async and non-blocking - runs in background task. + """ + try: + if self._queue is None: + return + + extracted_tasks = self._extract_tasks_from_queue() + + # Add new task to extracted tasks to process directly + if new_task is not None: + extracted_tasks.append(new_task) + + # Process extracted tasks directly + if extracted_tasks: + await self._process_extracted_tasks(extracted_tasks) + except Exception as e: + verbose_logger.exception( + f"LoggingWorker error during aggressive clear: {e}" + ) + finally: + # Always reset the flag even if an error occurs + self._aggressive_clear_in_progress = False + + async def _process_single_task(self, task: LoggingTask) -> None: + """Process a single task and mark it done.""" + if self._queue is None: + return + + try: + await asyncio.wait_for( + task["context"].run(asyncio.create_task, task["coroutine"]), + timeout=self.timeout, + ) + except Exception: + # Suppress errors during processing to ensure we keep going pass + finally: + self._queue.task_done() + + async def _process_extracted_tasks(self, tasks: list[LoggingTask]) -> None: + """ + Process tasks that were extracted from the queue to make room. + Processes them concurrently without semaphore limits for maximum speed. 
+ """ + if not tasks or self._queue is None: + return + + # Process all tasks concurrently for maximum speed + await asyncio.gather(*[self._process_single_task(task) for task in tasks]) def ensure_initialized_and_enqueue(self, async_coroutine: Coroutine): """ @@ -106,11 +349,25 @@ def ensure_initialized_and_enqueue(self, async_coroutine: Coroutine): async def stop(self) -> None: """Stop the logging worker and clean up resources.""" + if self._worker_task is None and not self._running_tasks: + # No worker launched and no in-flight tasks to drain. + return + + tasks_to_cancel: list[asyncio.Task] = list(self._running_tasks) if self._worker_task: - self._worker_task.cancel() - with contextlib.suppress(Exception): - await self._worker_task - self._worker_task = None + # Include the main worker loop so it stops fetching work. + tasks_to_cancel.append(self._worker_task) + + for task in tasks_to_cancel: + # Propagate cancellation to every pending task. + task.cancel() + + # Wait for cancellation to settle; ignore errors raised during shutdown. + await asyncio.gather(*tasks_to_cancel, return_exceptions=True) + + self._worker_task = None + # Drop references to completed tasks so we can restart cleanly. + self._running_tasks.clear() async def flush(self) -> None: """Flush the logging queue.""" @@ -128,14 +385,11 @@ async def clear_queue(self): start_time = asyncio.get_event_loop().time() - for _ in range(self.MAX_ITERATIONS_TO_CLEAR_QUEUE): + for _ in range(MAX_ITERATIONS_TO_CLEAR_QUEUE): # Check if we've exceeded the maximum time - if ( - asyncio.get_event_loop().time() - start_time - >= self.MAX_TIME_TO_CLEAR_QUEUE - ): + if asyncio.get_event_loop().time() - start_time >= MAX_TIME_TO_CLEAR_QUEUE: verbose_logger.warning( - f"clear_queue exceeded max_time of {self.MAX_TIME_TO_CLEAR_QUEUE}s, stopping early" + f"clear_queue exceeded max_time of {MAX_TIME_TO_CLEAR_QUEUE}s, stopping early" ) break @@ -150,10 +404,120 @@ async def clear_queue(self): except Exception: # Suppress errors during cleanup pass + finally: + # Clear reference to prevent memory leaks + task = None self._queue.task_done() # If you're using join() elsewhere except asyncio.QueueEmpty: break + def _safe_log(self, level: str, message: str) -> None: + """ + Safely log a message during shutdown, suppressing errors if logging is closed. + """ + # Check if logger has valid handlers before attempting to log + # During shutdown, handlers may be closed, causing ValueError when writing + if not hasattr(verbose_logger, 'handlers') or not verbose_logger.handlers: + return + + # Check if any handler has a valid stream + has_valid_handler = False + for handler in verbose_logger.handlers: + try: + if hasattr(handler, 'stream') and handler.stream and not handler.stream.closed: + has_valid_handler = True + break + elif not hasattr(handler, 'stream'): + # Non-stream handlers (like NullHandler) are always valid + has_valid_handler = True + break + except (AttributeError, ValueError): + continue + + if not has_valid_handler: + return + + try: + if level == "debug": + verbose_logger.debug(message) + elif level == "info": + verbose_logger.info(message) + elif level == "warning": + verbose_logger.warning(message) + elif level == "error": + verbose_logger.error(message) + except (ValueError, OSError, AttributeError): + # Logging handlers may be closed during shutdown + # Silently ignore logging errors to prevent breaking shutdown + pass + + def _flush_on_exit(self): + """ + Flush remaining events synchronously before process exit. 
+ Called automatically via atexit handler. + + This ensures callbacks queued by async completions are processed + even when the script exits before the worker loop can handle them. + + Note: All logging in this method is wrapped to handle cases where + logging handlers are closed during shutdown. + """ + if self._queue is None: + self._safe_log("debug", "[LoggingWorker] atexit: No queue initialized") + return + + if self._queue.empty(): + self._safe_log("debug", "[LoggingWorker] atexit: Queue is empty") + return + + queue_size = self._queue.qsize() + self._safe_log( + "info", f"[LoggingWorker] atexit: Flushing {queue_size} remaining events..." + ) + + # Create a new event loop since the original is closed + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + # Process remaining queue items with time limit + processed = 0 + start_time = loop.time() + + while not self._queue.empty() and processed < MAX_ITERATIONS_TO_CLEAR_QUEUE: + if loop.time() - start_time >= MAX_TIME_TO_CLEAR_QUEUE: + self._safe_log( + "warning", + f"[LoggingWorker] atexit: Reached time limit ({MAX_TIME_TO_CLEAR_QUEUE}s), stopping flush", + ) + break + + try: + task = self._queue.get_nowait() + except asyncio.QueueEmpty: + break + + # Run the coroutine synchronously in new loop + # Note: We run the coroutine directly, not via create_task, + # since we're in a new event loop context + try: + loop.run_until_complete(task["coroutine"]) + processed += 1 + except Exception: + # Silent failure to not break user's program + pass + finally: + # Clear reference to prevent memory leaks + task = None + + self._safe_log( + "info", + f"[LoggingWorker] atexit: Successfully flushed {processed} events!", + ) + + finally: + loop.close() + # Global instance for backward compatibility GLOBAL_LOGGING_WORKER = LoggingWorker() diff --git a/litellm/litellm_core_utils/model_response_utils.py b/litellm/litellm_core_utils/model_response_utils.py index 974d12aef6f2..00462221fe35 100644 --- a/litellm/litellm_core_utils/model_response_utils.py +++ b/litellm/litellm_core_utils/model_response_utils.py @@ -46,7 +46,8 @@ def is_model_response_stream_empty(model_response: ModelResponseStream) -> bool: return False # Check for any non-base fields that are set - for model_response_field in model_response.model_fields.keys(): + # Access model_fields on the class, not the instance, to avoid Pydantic 2.11+ deprecation warnings + for model_response_field in type(model_response).model_fields.keys(): # Skip base fields that are always set if model_response_field in BASE_FIELDS: continue diff --git a/litellm/litellm_core_utils/prompt_templates/common_utils.py b/litellm/litellm_core_utils/prompt_templates/common_utils.py index 33658d490631..7790fb83361c 100644 --- a/litellm/litellm_core_utils/prompt_templates/common_utils.py +++ b/litellm/litellm_core_utils/prompt_templates/common_utils.py @@ -6,6 +6,7 @@ import mimetypes import re from os import PathLike +from pathlib import Path from typing import ( TYPE_CHECKING, Any, @@ -94,6 +95,18 @@ def handle_messages_with_content_list_to_str_conversion( return messages +def strip_name_from_message( + message: AllMessageValues, allowed_name_roles: List[str] = ["user"] +) -> AllMessageValues: + """ + Removes 'name' from message + """ + msg_copy = message.copy() + if msg_copy.get("role") not in allowed_name_roles: + msg_copy.pop("name", None) # type: ignore + return msg_copy + + def strip_name_from_messages( messages: List[AllMessageValues], allowed_name_roles: List[str] = ["user"] ) -> 
List[AllMessageValues]: @@ -428,6 +441,71 @@ def update_messages_with_model_file_ids( return messages +def update_responses_input_with_model_file_ids( + input: Any, +) -> Union[str, List[Dict[str, Any]]]: + """ + Updates responses API input with provider-specific file IDs. + File IDs are always inside the content array, not as direct input_file items. + + For managed files (unified file IDs), decodes the base64-encoded unified file ID + and extracts the llm_output_file_id directly. + """ + from litellm.proxy.openai_files_endpoints.common_utils import ( + _is_base64_encoded_unified_file_id, + convert_b64_uid_to_unified_uid, + ) + + if isinstance(input, str): + return input + + if not isinstance(input, list): + return input + + updated_input = [] + for item in input: + if not isinstance(item, dict): + updated_input.append(item) + continue + + updated_item = item.copy() + content = item.get("content") + if isinstance(content, list): + updated_content = [] + for content_item in content: + if ( + isinstance(content_item, dict) + and content_item.get("type") == "input_file" + ): + file_id = content_item.get("file_id") + if file_id: + # Check if this is a managed file ID (base64-encoded unified file ID) + is_unified_file_id = _is_base64_encoded_unified_file_id(file_id) + if is_unified_file_id: + unified_file_id = convert_b64_uid_to_unified_uid(file_id) + if "llm_output_file_id," in unified_file_id: + provider_file_id = unified_file_id.split( + "llm_output_file_id," + )[1].split(";")[0] + else: + # Fallback: keep original if we can't extract + provider_file_id = file_id + updated_content_item = content_item.copy() + updated_content_item["file_id"] = provider_file_id + updated_content.append(updated_content_item) + else: + updated_content.append(content_item) + else: + updated_content.append(content_item) + else: + updated_content.append(content_item) + updated_item["content"] = updated_content + + updated_input.append(updated_item) + + return updated_input + + def extract_file_data(file_data: FileTypes) -> ExtractedFileData: """ Extracts and processes file data from various input formats. @@ -464,6 +542,12 @@ def extract_file_data(file_data: FileTypes) -> ExtractedFileData: # Convert content to bytes if isinstance(file_content, (str, PathLike)): # If it's a path, open and read the file + # Extract filename from path if not already set + if filename is None: + if isinstance(file_content, PathLike): + filename = Path(file_content).name + else: + filename = Path(str(file_content)).name with open(file_content, "rb") as f: content = f.read() elif isinstance(file_content, io.IOBase): @@ -481,11 +565,11 @@ def extract_file_data(file_data: FileTypes) -> ExtractedFileData: # Use provided content type or guess based on filename if not content_type: - content_type = ( - mimetypes.guess_type(filename)[0] - if filename - else "application/octet-stream" - ) + if filename: + guessed_type = mimetypes.guess_type(filename)[0] + content_type = guessed_type if guessed_type else "application/octet-stream" + else: + content_type = "application/octet-stream" return ExtractedFileData( filename=filename, @@ -620,8 +704,15 @@ def _get_image_mime_type_from_url(url: str) -> Optional[str]: video/mpegps video/flv """ + from urllib.parse import urlparse + url = url.lower() + # Parse URL to extract path without query parameters + # This handles URLs like: https://example.com/image.jpg?signature=... 
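# A small stdlib illustration of the urlparse step above: signed URLs keep their
# file extension in the path, so matching extensions against parsed.path (rather
# than the full URL) ignores the query string. mimetypes is used here only to show
# the idea; the helper above checks its own extension table.
import mimetypes
from urllib.parse import urlparse

url = "https://example.com/photos/cat.jpg?X-Amz-Signature=abc123"
path = urlparse(url.lower()).path                # "/photos/cat.jpg"
print(path.endswith(".jpg"))                     # True, although the full URL does not
print(mimetypes.guess_type(path)[0])             # image/jpeg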
+ parsed = urlparse(url) + path = parsed.path + # Map file extensions to mime types mime_types = { # Images @@ -648,7 +739,7 @@ def _get_image_mime_type_from_url(url: str) -> Optional[str]: # Check each extension group against the URL for extensions, mime_type in mime_types.items(): - if any(url.endswith(ext) for ext in extensions): + if any(path.endswith(ext) for ext in extensions): return mime_type return None @@ -661,28 +752,28 @@ def infer_content_type_from_url_and_content( ) -> str: """ Infer content type from URL extension and binary content when content-type header is missing or generic. - + This helper implements a fallback strategy for determining MIME types when HTTP headers are missing or provide generic values (like binary/octet-stream). It's commonly used when processing images and documents from various sources (S3, URLs, etc.). - + Fallback Strategy: 1. If current_content_type is valid (not None and not generic octet-stream), return it 2. Try to infer from URL extension (handles query parameters) 3. Try to detect from binary content signature (magic bytes) 4. Raise ValueError if all methods fail - + Args: url: The URL of the content (used to extract file extension) content: The binary content (first ~100 bytes are sufficient for detection) current_content_type: The current content-type from headers (may be None or generic) - + Returns: str: The inferred MIME type (e.g., "image/png", "application/pdf") - + Raises: ValueError: If content type cannot be determined by any method - + Example: >>> content_type = infer_content_type_from_url_and_content( ... url="https://s3.amazonaws.com/bucket/image.png?AWSAccessKeyId=123", @@ -693,14 +784,14 @@ def infer_content_type_from_url_and_content( "image/png" """ from litellm.litellm_core_utils.token_counter import get_image_type - + # If we have a valid content type that's not generic, use it if current_content_type and current_content_type not in [ "binary/octet-stream", "application/octet-stream", ]: return current_content_type - + # Extension to MIME type mapping # Supports images, documents, and other common file types extension_to_mime = { @@ -721,14 +812,14 @@ def infer_content_type_from_url_and_content( "txt": "text/plain", "md": "text/markdown", } - + # Try to infer from URL extension if url: extension = url.split(".")[-1].lower().split("?")[0] # Remove query params inferred_type = extension_to_mime.get(extension) if inferred_type: return inferred_type - + # Try to detect from binary content signature (magic bytes) if content: detected_type = get_image_type(content[:100]) @@ -742,7 +833,7 @@ def infer_content_type_from_url_and_content( } if detected_type in type_to_mime: return type_to_mime[detected_type] - + # If all fallbacks failed, raise error raise ValueError( f"Unable to determine content type from URL: {url}. 
" @@ -980,9 +1071,9 @@ def _extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[s """ message_content = message.get("content") if "reasoning_content" in message: - return message["reasoning_content"], message["content"] + return message["reasoning_content"], message_content elif "reasoning" in message: - return message["reasoning"], message["content"] + return message["reasoning"], message_content elif isinstance(message_content, str): return _parse_content_for_reasoning(message_content) return None, message_content @@ -1002,7 +1093,9 @@ def _parse_content_for_reasoning( return None, message_text reasoning_match = re.match( - r"<(?:think|thinking)>(.*?)(.*)", message_text, re.DOTALL + r"<(?:think|thinking|budget:thinking)>(.*?)(.*)", + message_text, + re.DOTALL, ) if reasoning_match: @@ -1011,9 +1104,35 @@ def _parse_content_for_reasoning( return None, message_text +def _extract_base64_data(image_url: str) -> str: + """ + Extract pure base64 data from an image URL. + + If the URL is a data URL (e.g., "data:image/png;base64,iVBOR..."), + extract and return only the base64 data portion. + Otherwise, return the original URL unchanged. + + This is needed for providers like Ollama that expect pure base64 data + rather than full data URLs. + + Args: + image_url: The image URL or data URL to process + + Returns: + The base64 data if it's a data URL, otherwise the original URL + """ + if image_url.startswith("data:") and ";base64," in image_url: + return image_url.split(";base64,", 1)[1] + return image_url + + def extract_images_from_message(message: AllMessageValues) -> List[str]: """ - Extract images from a message + Extract images from a message. + + For data URLs (e.g., "data:image/png;base64,iVBOR..."), only the base64 + data portion is extracted. This is required for providers like Ollama + that expect pure base64 data rather than full data URLs. """ images = [] message_content = message.get("content") @@ -1022,7 +1141,51 @@ def extract_images_from_message(message: AllMessageValues) -> List[str]: image_url = m.get("image_url") if image_url: if isinstance(image_url, str): - images.append(image_url) + images.append(_extract_base64_data(image_url)) elif isinstance(image_url, dict) and "url" in image_url: - images.append(image_url["url"]) + images.append(_extract_base64_data(image_url["url"])) return images + + +def parse_tool_call_arguments( + arguments: Optional[str], + tool_name: Optional[str] = None, + context: Optional[str] = None, +) -> Dict[str, Any]: + """ + Parse tool call arguments from a JSON string. + + This function handles malformed JSON gracefully by raising a ValueError + with context about what failed and what the problematic input was. + + Args: + arguments: The JSON string containing tool arguments, or None. + tool_name: Optional name of the tool (for error messages). + context: Optional context string (e.g., "Anthropic Messages API"). + + Returns: + Parsed arguments as a dictionary. Returns empty dict if arguments is None or empty. + + Raises: + ValueError: If the arguments string is not valid JSON. + """ + import json + + if not arguments: + return {} + + try: + return json.loads(arguments) + except json.JSONDecodeError as e: + error_parts = ["Failed to parse tool call arguments"] + + if tool_name: + error_parts.append(f"for tool '{tool_name}'") + if context: + error_parts.append(f"({context})") + + error_message = ( + " ".join(error_parts) + f". Error: {str(e)}. 
Arguments: {arguments}" + ) + + raise ValueError(error_message) from e diff --git a/litellm/litellm_core_utils/prompt_templates/factory.py b/litellm/litellm_core_utils/prompt_templates/factory.py index 18bd9fc16849..0e1637a65bab 100644 --- a/litellm/litellm_core_utils/prompt_templates/factory.py +++ b/litellm/litellm_core_utils/prompt_templates/factory.py @@ -1,10 +1,12 @@ +import base64 import copy +import hashlib import json import mimetypes import re import xml.etree.ElementTree as ET from enum import Enum -from typing import Any, List, Optional, Tuple, cast, overload +from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload from jinja2.sandbox import ImmutableSandboxedEnvironment @@ -42,6 +44,7 @@ convert_content_list_to_str, infer_content_type_from_url_and_content, is_non_content_values_set, + parse_tool_call_arguments, ) from .image_handling import convert_url_to_base64 @@ -56,6 +59,10 @@ def prompt_injection_detection_default_pt(): BAD_MESSAGE_ERROR_STR = "Invalid Message " +# Separator used to embed Gemini thought signatures in tool call IDs +# See: https://ai.google.dev/gemini-api/docs/thought-signatures +THOUGHT_SIGNATURE_SEPARATOR = "__thought__" + # used to interweave user messages, to ensure user/assistant alternating DEFAULT_USER_CONTINUE_MESSAGE = { "role": "user", @@ -896,11 +903,70 @@ def convert_to_anthropic_image_obj( media_type=media_type, data=base64_data, ) + except litellm.ImageFetchError: + raise except Exception as e: - if "Error: Unable to fetch image from URL" in str(e): - raise e raise Exception( - """Image url not in expected format. Example Expected input - "image_url": "data:image/jpeg;base64,{base64_image}". Supported formats - ['image/jpeg', 'image/png', 'image/gif', 'image/webp'].""" + f"""Image url not in expected format. Example Expected input - "image_url": "data:image/jpeg;base64,{{base64_image}}". Supported formats - ['image/jpeg', 'image/png', 'image/gif', 'image/webp']. Error: {str(e)}""" + ) + + +def create_anthropic_image_param( + image_url_input: Union[str, dict], + format: Optional[str] = None, + is_bedrock_invoke: bool = False, +) -> AnthropicMessagesImageParam: + """ + Create an AnthropicMessagesImageParam from an image URL input. + + Supports both URL references (for HTTP/HTTPS URLs) and base64 encoding. 
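# A condensed sketch of the source-selection rule that create_anthropic_image_param
# below implements (not the function itself): https URLs can be passed through as a
# "url" source, while http URLs, data URIs, Bedrock invoke models, and Vertex AI
# Anthropic all fall back to a base64 source.
def pick_image_source_type(image_url: str, force_base64: bool = False) -> str:
    if image_url.startswith(("http://", "https://")):
        if force_base64 or image_url.startswith("http://"):
            return "base64"                      # provider cannot fetch the URL itself
        return "url"                             # plain Anthropic accepts https URL sources
    return "base64"                              # data URIs and anything else are inlined


assert pick_image_source_type("https://example.com/a.png") == "url"
assert pick_image_source_type("https://example.com/a.png", force_base64=True) == "base64"
assert pick_image_source_type("data:image/png;base64,iVBOR...") == "base64"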
+ """ + # Extract URL and format from input + if isinstance(image_url_input, str): + image_url = image_url_input + else: + image_url = image_url_input.get("url", "") + if format is None: + format = image_url_input.get("format") + + # Check if the image URL is an HTTP/HTTPS URL + if image_url.startswith("http://") or image_url.startswith("https://"): + # For Bedrock invoke and Vertex AI Anthropic, always convert URLs to base64 + # as these providers don't support URL sources for images + if is_bedrock_invoke or image_url.startswith("http://"): + base64_url = convert_url_to_base64(url=image_url) + image_chunk = convert_to_anthropic_image_obj( + openai_image_url=base64_url, format=format + ) + return AnthropicMessagesImageParam( + type="image", + source=AnthropicContentParamSource( + type="base64", + media_type=image_chunk["media_type"], + data=image_chunk["data"], + ), + ) + else: + # HTTPS URL - pass directly for regular Anthropic + return AnthropicMessagesImageParam( + type="image", + source=AnthropicContentParamSourceUrl( + type="url", + url=image_url, + ), + ) + else: + # Convert to base64 for data URIs or other formats + image_chunk = convert_to_anthropic_image_obj( + openai_image_url=image_url, format=format + ) + return AnthropicMessagesImageParam( + type="image", + source=AnthropicContentParamSource( + type="base64", + media_type=image_chunk["media_type"], + data=image_chunk["data"], + ), ) @@ -966,9 +1032,11 @@ def convert_to_anthropic_tool_invoke_xml(tool_calls: list) -> str: tool_function = get_attribute_or_key(tool, "function") tool_name = get_attribute_or_key(tool_function, "name") tool_arguments = get_attribute_or_key(tool_function, "arguments") + parsed_args = parse_tool_call_arguments( + tool_arguments, tool_name=tool_name, context="Anthropic XML tool invoke" + ) parameters = "".join( - f"<{param}>{val}\n" - for param, val in json.loads(tool_arguments).items() + f"<{param}>{val}\n" for param, val in parsed_args.items() ) invokes += ( "\n" @@ -1006,15 +1074,41 @@ def anthropic_messages_pt_xml(messages: list): if isinstance(messages[msg_i]["content"], list): for m in messages[msg_i]["content"]: if m.get("type", "") == "image_url": - format = m["image_url"].get("format") - user_content.append( - { - "type": "image", - "source": convert_to_anthropic_image_obj( - m["image_url"]["url"], format=format - ), - } + format = ( + m["image_url"].get("format") + if isinstance(m["image_url"], dict) + else None + ) + image_param = create_anthropic_image_param( + m["image_url"], format=format ) + # Convert to dict format for XML version + source = image_param["source"] + if isinstance(source, dict) and source.get("type") == "url": + # Type narrowing for URL source + url_source = cast(AnthropicContentParamSourceUrl, source) + user_content.append( + { + "type": "image", + "source": { + "type": "url", + "url": url_source["url"], + }, + } + ) + else: + # Type narrowing for base64 source + base64_source = cast(AnthropicContentParamSource, source) + user_content.append( + { + "type": "image", + "source": { + "type": "base64", + "media_type": base64_source["media_type"], + "data": base64_source["data"], + }, + } + ) elif m.get("type", "") == "text": user_content.append({"type": "text", "text": m["text"]}) else: @@ -1160,8 +1254,94 @@ def _gemini_tool_call_invoke_helper( return function_call +def _encode_tool_call_id_with_signature( + tool_call_id: str, thought_signature: Optional[str] +) -> str: + """ + Embed thought signature into tool call ID for OpenAI client compatibility. 
+ + Args: + tool_call_id: The tool call ID (e.g., "call_abc123...") + thought_signature: Base64-encoded signature from Gemini response + + Returns: + Tool call ID with embedded signature if present, otherwise original ID + Format: call___thought__ + + See: https://ai.google.dev/gemini-api/docs/thought-signatures + """ + if thought_signature: + return f"{tool_call_id}{THOUGHT_SIGNATURE_SEPARATOR}{thought_signature}" + return tool_call_id + + +def _get_thought_signature_from_tool( + tool: dict, model: Optional[str] = None +) -> Optional[str]: + """Extract thought signature from tool call's provider_specific_fields. + + If not provided try to extract thought signature from tool call id + + Checks both tool.provider_specific_fields and tool.function.provider_specific_fields. + If no signature is found and model is gemini-3, returns a dummy signature. + """ + # First check tool's provider_specific_fields + provider_fields = tool.get("provider_specific_fields") or {} + if isinstance(provider_fields, dict): + signature = provider_fields.get("thought_signature") + if signature: + return signature + + # Then check function's provider_specific_fields + function = tool.get("function") + if function: + if isinstance(function, dict): + func_provider_fields = function.get("provider_specific_fields") or {} + if isinstance(func_provider_fields, dict): + signature = func_provider_fields.get("thought_signature") + if signature: + return signature + elif ( + hasattr(function, "provider_specific_fields") + and function.provider_specific_fields + ): + if isinstance(function.provider_specific_fields, dict): + signature = function.provider_specific_fields.get("thought_signature") + if signature: + return signature + # Check if thought signature is embedded in tool call ID + tool_call_id = tool.get("id") + if tool_call_id and THOUGHT_SIGNATURE_SEPARATOR in tool_call_id: + parts = tool_call_id.split(THOUGHT_SIGNATURE_SEPARATOR, 1) + if len(parts) == 2: + _, signature = parts + return signature + # If no signature found and model is gemini-3, return dummy signature + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + + if model and VertexGeminiConfig._is_gemini_3_or_newer(model): + return _get_dummy_thought_signature() + return None + + +def _get_dummy_thought_signature() -> str: + """Generate a dummy thought signature for models that require it. + + This is used when transferring conversation history from older models + (like gemini-2.5-flash) to gemini-3, which requires thought_signature + for strict validation. 
+ """ + # Return a base64-encoded dummy signature string + # Below dummy signature is recommended by google - https://ai.google.dev/gemini-api/docs/thought-signatures#faqs + dummy_data = b"skip_thought_signature_validator" + return base64.b64encode(dummy_data).decode("utf-8") + + def convert_to_gemini_tool_call_invoke( message: ChatCompletionAssistantMessage, + model: Optional[str] = None, ) -> List[VertexPartType]: """ OpenAI tool invokes: @@ -1206,8 +1386,9 @@ def convert_to_gemini_tool_call_invoke( _parts_list: List[VertexPartType] = [] tool_calls = message.get("tool_calls", None) function_call = message.get("function_call", None) + if tool_calls is not None: - for tool in tool_calls: + for idx, tool in enumerate(tool_calls): if "function" in tool: gemini_function_call: Optional[VertexFunctionCall] = ( _gemini_tool_call_invoke_helper( @@ -1215,9 +1396,16 @@ def convert_to_gemini_tool_call_invoke( ) ) if gemini_function_call is not None: - _parts_list.append( - VertexPartType(function_call=gemini_function_call) + part_dict: VertexPartType = { + "function_call": gemini_function_call + } + thought_signature = _get_thought_signature_from_tool( + dict(tool), model=model ) + if thought_signature: + part_dict["thoughtSignature"] = thought_signature + + _parts_list.append(part_dict) else: # don't silently drop params. Make it clear to user what's happening. raise Exception( "function_call missing. Received tool call with 'type': 'function'. No function call in argument - {}".format( @@ -1229,7 +1417,36 @@ def convert_to_gemini_tool_call_invoke( function_call_params=function_call ) if gemini_function_call is not None: - _parts_list.append(VertexPartType(function_call=gemini_function_call)) + part_dict_function: VertexPartType = { + "function_call": gemini_function_call + } + + # Extract thought signature from function_call's provider_specific_fields + thought_signature = None + provider_fields = ( + function_call.get("provider_specific_fields") + if isinstance(function_call, dict) + else {} + ) + if isinstance(provider_fields, dict): + thought_signature = provider_fields.get("thought_signature") + + # If no signature found and model is gemini-3, use dummy signature + from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import ( + VertexGeminiConfig, + ) + + if ( + not thought_signature + and model + and VertexGeminiConfig._is_gemini_3_or_newer(model) + ): + thought_signature = _get_dummy_thought_signature() + + if thought_signature: + part_dict_function["thoughtSignature"] = thought_signature + + _parts_list.append(part_dict_function) else: # don't silently drop params. Make it clear to user what's happening. raise Exception( "function_call missing. Received tool call with 'type': 'function'. 
No function call in argument - {}".format( @@ -1245,10 +1462,10 @@ def convert_to_gemini_tool_call_invoke( ) -def convert_to_gemini_tool_call_result( +def convert_to_gemini_tool_call_result( # noqa: PLR0915 message: Union[ChatCompletionToolMessage, ChatCompletionFunctionMessage], last_message_with_tool_calls: Optional[dict], -) -> VertexPartType: +) -> Union[VertexPartType, List[VertexPartType]]: """ OpenAI message with a tool result looks like: { @@ -1264,16 +1481,81 @@ def convert_to_gemini_tool_call_result( "name": "get_current_weather", "content": "function result goes here", } + + Supports content with images for Computer Use: + { + "role": "tool", + "tool_call_id": "call_abc123", + "content": [ + {"type": "text", "text": "I found the requested image:"}, + {"type": "input_image", "image_url": "https://example.com/image.jpg" } + ] + } """ + from litellm.types.llms.vertex_ai import BlobType + content_str: str = "" + inline_data: Optional[BlobType] = None + if "content" in message: if isinstance(message["content"], str): content_str = message["content"] elif isinstance(message["content"], List): content_list = message["content"] for content in content_list: - if content["type"] == "text": - content_str += content["text"] + content_type = content.get("type", "") + if content_type == "text": + content_str += content.get("text", "") + elif content_type in ("input_image", "image_url"): + # Extract image for inline_data (for Computer Use screenshots and tool results) + image_url_data = content.get("image_url", "") + image_url = ( + image_url_data.get("url", "") + if isinstance(image_url_data, dict) + else image_url_data + ) + + if image_url: + # Convert image to base64 blob format for Gemini + try: + image_obj = convert_to_anthropic_image_obj( + image_url, format=None + ) + inline_data = BlobType( + data=image_obj["data"], + mime_type=image_obj["media_type"], + ) + except Exception as e: + verbose_logger.warning( + f"Failed to process image in tool response: {e}" + ) + elif content_type in ("file", "input_file"): + # Extract file for inline_data (for tool results with PDF, audio, video, etc.) 
+ file_data = content.get("file_data", "") + if not file_data: + file_content = content.get("file", {}) + file_data = ( + file_content.get("file_data", "") + if isinstance(file_content, dict) + else file_content + if isinstance(file_content, str) + else "" + ) + + if file_data: + # Convert file to base64 blob format for Gemini + try: + file_obj = convert_to_anthropic_image_obj( + file_data, format=None + ) + inline_data = BlobType( + data=file_obj["data"], + mime_type=file_obj["media_type"], + ) + except Exception as e: + verbose_logger.warning( + f"Failed to process file in tool response: {e}" + ) name: Optional[str] = message.get("name", "") # type: ignore # Recover name from last message with tool calls @@ -1296,19 +1578,61 @@ def convert_to_gemini_tool_call_result( ) ) + # Parse response data - support both JSON string and plain string + # For Computer Use, the response should contain structured data like {"url": "..."} + response_data: dict + try: + if content_str.strip().startswith("{") or content_str.strip().startswith("["): + # Try to parse as JSON (for Computer Use structured responses) + parsed = json.loads(content_str) + if isinstance(parsed, dict): + response_data = parsed # Use the parsed JSON directly + else: + response_data = {"content": content_str} + else: + response_data = {"content": content_str} + except (json.JSONDecodeError, ValueError): + # Not valid JSON, wrap in content field + response_data = {"content": content_str} + # We can't determine from openai message format whether it's a successful or # error call result so default to the successful result template _function_response = VertexFunctionResponse( - name=name, response={"content": content_str} # type: ignore + name=name, response=response_data # type: ignore ) - _part = VertexPartType(function_response=_function_response) + # Create part with function_response, and optionally inline_data for images (Computer Use) + _part: VertexPartType = {"function_response": _function_response} + + # For Computer Use, if we have an image, we need separate parts: + # - One part with function_response + # - One part with inline_data + # Gemini's PartType is a oneof, so we can't have both in the same part + if inline_data: + image_part: VertexPartType = {"inline_data": inline_data} + return [_part, image_part] return _part +def _sanitize_anthropic_tool_use_id(tool_use_id: str) -> str: + """ + Sanitize tool_use_id to match Anthropic's required pattern: ^[a-zA-Z0-9_-]+$ + + Anthropic requires tool_use_id to only contain alphanumeric characters, underscores, and hyphens. + This function replaces any invalid characters with underscores. 
+ """ + # Replace any character that's not alphanumeric, underscore, or hyphen with underscore + sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", tool_use_id) + # Ensure it's not empty (fallback to a default if needed) + if not sanitized: + sanitized = "tool_use_id" + return sanitized + + def convert_to_anthropic_tool_result( message: Union[ChatCompletionToolMessage, ChatCompletionFunctionMessage], + force_base64: bool = False, ) -> AnthropicMessagesToolResultParam: """ OpenAI message with a tool result looks like: @@ -1354,33 +1678,30 @@ def convert_to_anthropic_tool_result( ] = [] for content in content_list: if content["type"] == "text": - anthropic_content_list.append( - AnthropicMessagesToolResultContent( - type="text", - text=content["text"], - cache_control=content.get("cache_control", None), - ) - ) + # Only include cache_control if explicitly set and not None + # to avoid sending "cache_control": null which breaks some API channels + text_content: AnthropicMessagesToolResultContent = { + "type": "text", + "text": content["text"], + } + cache_control_value = content.get("cache_control") + if cache_control_value is not None: + text_content["cache_control"] = cache_control_value + anthropic_content_list.append(text_content) elif content["type"] == "image_url": - if isinstance(content["image_url"], str): - image_chunk = convert_to_anthropic_image_obj( - content["image_url"], format=None - ) - else: - format = content["image_url"].get("format") - image_chunk = convert_to_anthropic_image_obj( - content["image_url"]["url"], format=format - ) - anthropic_content_list.append( - AnthropicMessagesImageParam( - type="image", - source=AnthropicContentParamSource( - type="base64", - media_type=image_chunk["media_type"], - data=image_chunk["data"], - ), - ) + format = ( + content["image_url"].get("format") + if isinstance(content["image_url"], dict) + else None + ) + _anthropic_image_param = create_anthropic_image_param( + content["image_url"], format=format, is_bedrock_invoke=force_base64 + ) + _anthropic_image_param = add_cache_control_to_content( + anthropic_content_element=_anthropic_image_param, + original_content_element=content, ) + anthropic_content_list.append(cast(AnthropicMessagesImageParam, _anthropic_image_param)) anthropic_content = anthropic_content_list anthropic_tool_result: Optional[AnthropicMessagesToolResultParam] = None @@ -1389,18 +1710,26 @@ def convert_to_anthropic_tool_result( if message["role"] == "tool": tool_message: ChatCompletionToolMessage = message tool_call_id: str = tool_message["tool_call_id"] + # Sanitize tool_use_id to match Anthropic's pattern requirement: ^[a-zA-Z0-9_-]+$ + sanitized_tool_use_id = _sanitize_anthropic_tool_use_id(tool_call_id) # We can't determine from openai message format whether it's a successful or # error call result so default to the successful result template anthropic_tool_result = AnthropicMessagesToolResultParam( - type="tool_result", tool_use_id=tool_call_id, content=anthropic_content + type="tool_result", + tool_use_id=sanitized_tool_use_id, + content=anthropic_content, ) if message["role"] == "function": function_message: ChatCompletionFunctionMessage = message tool_call_id = function_message.get("tool_call_id") or str(uuid.uuid4()) + # Sanitize tool_use_id to match Anthropic's pattern requirement: ^[a-zA-Z0-9_-]+$ + sanitized_tool_use_id = _sanitize_anthropic_tool_use_id(tool_call_id) anthropic_tool_result = AnthropicMessagesToolResultParam( - type="tool_result", tool_use_id=tool_call_id, content=anthropic_content + 
type="tool_result", + tool_use_id=sanitized_tool_use_id, + content=anthropic_content, ) if anthropic_tool_result is None: @@ -1416,12 +1745,17 @@ def convert_function_to_anthropic_tool_invoke( try: _name = get_attribute_or_key(function_call, "name") or "" _arguments = get_attribute_or_key(function_call, "arguments") + + tool_input = parse_tool_call_arguments( + _arguments, tool_name=_name, context="Anthropic function to tool invoke" + ) + anthropic_tool_invoke = [ AnthropicMessagesToolUseParam( type="tool_use", id=str(uuid.uuid4()), name=_name, - input=json.loads(_arguments) if _arguments else {}, + input=tool_input, ) ] return anthropic_tool_invoke @@ -1431,7 +1765,8 @@ def convert_function_to_anthropic_tool_invoke( def convert_to_anthropic_tool_invoke( tool_calls: List[ChatCompletionAssistantToolCall], -) -> List[AnthropicMessagesToolUseParam]: + web_search_results: Optional[List[Any]] = None, +) -> List[Union[AnthropicMessagesToolUseParam, Dict[str, Any]]]: """ OpenAI tool invokes: { @@ -1467,38 +1802,70 @@ def convert_to_anthropic_tool_invoke( } ] } + + For server-side tools (web_search), we need to reconstruct: + - server_tool_use blocks (id starts with "srvtoolu_") + - web_search_tool_result blocks (from provider_specific_fields) + + Fixes: https://github.com/BerriAI/litellm/issues/17737 """ - anthropic_tool_invoke = [] + anthropic_tool_invoke: List[ + Union[AnthropicMessagesToolUseParam, Dict[str, Any]] + ] = [] for tool in tool_calls: if not get_attribute_or_key(tool, "type") == "function": continue - _anthropic_tool_use_param = AnthropicMessagesToolUseParam( - type="tool_use", - id=cast(str, get_attribute_or_key(tool, "id")), - name=cast( - str, - get_attribute_or_key(get_attribute_or_key(tool, "function"), "name"), - ), - input=json.loads( - get_attribute_or_key( - get_attribute_or_key(tool, "function"), "arguments" - ) - ), + tool_id = cast(str, get_attribute_or_key(tool, "id")) + tool_name = cast( + str, + get_attribute_or_key(get_attribute_or_key(tool, "function"), "name"), ) - - _content_element = add_cache_control_to_content( - anthropic_content_element=_anthropic_tool_use_param, - orignal_content_element=dict(tool), + tool_input = parse_tool_call_arguments( + get_attribute_or_key(get_attribute_or_key(tool, "function"), "arguments"), + tool_name=tool_name, + context="Anthropic tool invoke", ) - if "cache_control" in _content_element: - _anthropic_tool_use_param["cache_control"] = _content_element[ - "cache_control" - ] + # Check if this is a server-side tool (web_search, tool_search, etc.) 
+ # Server tool IDs start with "srvtoolu_" + if tool_id.startswith("srvtoolu_"): + # Create server_tool_use block instead of tool_use + _anthropic_server_tool_use: Dict[str, Any] = { + "type": "server_tool_use", + "id": tool_id, + "name": tool_name, + "input": tool_input, + } + anthropic_tool_invoke.append(_anthropic_server_tool_use) + + # Add corresponding web_search_tool_result if available + if web_search_results: + for result in web_search_results: + if result.get("tool_use_id") == tool_id: + anthropic_tool_invoke.append(result) + break + else: + # Regular tool_use + _anthropic_tool_use_param = AnthropicMessagesToolUseParam( + type="tool_use", + id=tool_id, + name=tool_name, + input=tool_input, + ) - anthropic_tool_invoke.append(_anthropic_tool_use_param) + _content_element = add_cache_control_to_content( + anthropic_content_element=_anthropic_tool_use_param, + original_content_element=dict(tool), + ) + + if "cache_control" in _content_element: + _anthropic_tool_use_param["cache_control"] = _content_element[ + "cache_control" + ] + + anthropic_tool_invoke.append(_anthropic_tool_use_param) return anthropic_tool_invoke @@ -1512,9 +1879,9 @@ def add_cache_control_to_content( AnthropicMessagesToolUseParam, ChatCompletionThinkingBlock, ], - orignal_content_element: Union[dict, AllMessageValues], + original_content_element: Union[dict, AllMessageValues], ): - cache_control_param = orignal_content_element.get("cache_control") + cache_control_param = original_content_element.get("cache_control") if cache_control_param is not None and isinstance(cache_control_param, dict): transformed_param = ChatCompletionCachedContent(**cache_control_param) # type: ignore @@ -1690,6 +2057,12 @@ def anthropic_messages_pt( # noqa: PLR0915 else: messages.append(DEFAULT_USER_CONTINUE_MESSAGE_TYPED) + # Bedrock invoke models have format: invoke/... + # Vertex AI Anthropic also doesn't support URL sources for images + is_bedrock_invoke = model.lower().startswith("invoke/") + is_vertex_ai = llm_provider.startswith("vertex_ai") if llm_provider else False + force_base64 = is_bedrock_invoke or is_vertex_ai + msg_i = 0 while msg_i < len(messages): user_content: List[AnthropicMessagesUserMessageValues] = [] @@ -1710,24 +2083,40 @@ def anthropic_messages_pt( # noqa: PLR0915 for m in user_message_types_block["content"]: if m.get("type", "") == "image_url": m = cast(ChatCompletionImageObject, m) - format: Optional[str] = None - if isinstance(m["image_url"], str): - image_chunk = convert_to_anthropic_image_obj( - openai_image_url=m["image_url"], format=None + format = ( + m["image_url"].get("format") + if isinstance(m["image_url"], dict) + else None + ) + # Convert ChatCompletionImageUrlObject to dict if needed + image_url_value = m["image_url"] + if isinstance(image_url_value, str): + image_url_input: Union[str, dict[str, Any]] = ( + image_url_value ) else: - format = m["image_url"].get("format") - image_chunk = convert_to_anthropic_image_obj( - openai_image_url=m["image_url"]["url"], - format=format, - ) - - _anthropic_content_element = ( - _anthropic_content_element_factory(image_chunk) + # ChatCompletionImageUrlObject or dict case - convert to dict + image_url_input = { + "url": image_url_value["url"], + "format": image_url_value.get("format"), + } + # Bedrock invoke models have format: invoke/... 
+ # Vertex AI Anthropic also doesn't support URL sources for images + is_bedrock_invoke = model.lower().startswith("invoke/") + is_vertex_ai = ( + llm_provider.startswith("vertex_ai") + if llm_provider + else False + ) + force_base64 = is_bedrock_invoke or is_vertex_ai + _anthropic_content_element = create_anthropic_image_param( + image_url_input, + format=format, + is_bedrock_invoke=force_base64, ) _content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_content_element, - orignal_content_element=dict(m), + original_content_element=dict(m), ) if "cache_control" in _content_element: @@ -1745,7 +2134,7 @@ def anthropic_messages_pt( # noqa: PLR0915 ) _content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_text_content_element, - orignal_content_element=dict(m), + original_content_element=dict(m), ) _content_element = cast( AnthropicMessagesTextParam, _content_element @@ -1767,7 +2156,7 @@ def anthropic_messages_pt( # noqa: PLR0915 } _content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_content_text_element, - orignal_content_element=dict(user_message_types_block), + original_content_element=dict(user_message_types_block), ) if "cache_control" in _content_element: @@ -1783,7 +2172,9 @@ def anthropic_messages_pt( # noqa: PLR0915 ): # OpenAI's tool message content will always be a string user_content.append( - convert_to_anthropic_tool_result(user_message_types_block) + convert_to_anthropic_tool_result( + user_message_types_block, force_base64=force_base64 + ) ) msg_i += 1 @@ -1791,6 +2182,9 @@ def anthropic_messages_pt( # noqa: PLR0915 if user_content: new_messages.append({"role": "user", "content": user_content}) + # Track unique tool IDs in this merge block to avoid duplication + unique_tool_ids: Set[str] = set() + assistant_content: List[AnthropicMessagesAssistantMessageValues] = [] ## MERGE CONSECUTIVE ASSISTANT CONTENT ## while msg_i < len(messages) and messages[msg_i]["role"] == "assistant": @@ -1825,12 +2219,20 @@ def anthropic_messages_pt( # noqa: PLR0915 ) _cached_message = add_cache_control_to_content( anthropic_content_element=anthropic_message, - orignal_content_element=dict(m), + original_content_element=dict(m), ) assistant_content.append( cast(AnthropicMessagesTextParam, _cached_message) ) + # handle server_tool_use blocks (tool search, web search, etc.) 
+ # Pass through as-is since these are Anthropic-native content types + elif m.get("type", "") == "server_tool_use": + assistant_content.append(m) # type: ignore + # handle tool_search_tool_result blocks + # Pass through as-is since these are Anthropic-native content types + elif m.get("type", "") == "tool_search_tool_result": + assistant_content.append(m) # type: ignore elif ( "content" in assistant_content_block and isinstance(assistant_content_block["content"], str) @@ -1845,7 +2247,7 @@ def anthropic_messages_pt( # noqa: PLR0915 _content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_text_content_element, - orignal_content_element=dict(assistant_content_block), + original_content_element=dict(assistant_content_block), ) if "cache_control" in _content_element: @@ -1859,9 +2261,42 @@ def anthropic_messages_pt( # noqa: PLR0915 if ( assistant_tool_calls is not None ): # support assistant tool invoke conversion - assistant_content.extend( - convert_to_anthropic_tool_invoke(assistant_tool_calls) + # Get web_search_results from provider_specific_fields for server_tool_use reconstruction + # Fixes: https://github.com/BerriAI/litellm/issues/17737 + _provider_specific_fields_raw = assistant_content_block.get( + "provider_specific_fields" ) + _provider_specific_fields: Dict[str, Any] = {} + if isinstance(_provider_specific_fields_raw, dict): + _provider_specific_fields = cast( + Dict[str, Any], _provider_specific_fields_raw + ) + _web_search_results = _provider_specific_fields.get( + "web_search_results" + ) + tool_invoke_results = convert_to_anthropic_tool_invoke( + assistant_tool_calls, + web_search_results=_web_search_results, + ) + + # Prevent "tool_use ids must be unique" errors by filtering duplicates + # This can happen when merging history that already contains the tool calls + for item in tool_invoke_results: + # tool_use items are typically dicts, but handle objects just in case + item_id = ( + item.get("id") + if isinstance(item, dict) + else getattr(item, "id", None) + ) + + if item_id: + if item_id in unique_tool_ids: + continue + unique_tool_ids.add(item_id) + + assistant_content.append( + cast(AnthropicMessagesAssistantMessageValues, item) + ) assistant_function_call = assistant_content_block.get("function_call") @@ -2495,7 +2930,6 @@ def stringify_json_tool_call_content(messages: List) -> List: ###### AMAZON BEDROCK ####### -import base64 from email.message import Message import httpx @@ -2540,17 +2974,19 @@ class BedrockImageProcessor: """Handles both sync and async image processing for Bedrock conversations.""" @staticmethod - def _post_call_image_processing(response: httpx.Response, image_url: str = "") -> Tuple[str, str]: + def _post_call_image_processing( + response: httpx.Response, image_url: str = "" + ) -> Tuple[str, str]: # Check the response's content type to ensure it is an image content_type = response.headers.get("content-type") - + # Use helper function to infer content type with fallback logic content_type = infer_content_type_from_url_and_content( url=image_url, content=response.content, current_content_type=content_type, ) - + content_type = _parse_content_type(content_type) # Convert the image content to base64 bytes @@ -2569,7 +3005,9 @@ async def get_image_details_async(image_url) -> Tuple[str, str]: response = await client.get(image_url, follow_redirects=True) response.raise_for_status() # Raise an exception for HTTP errors - return BedrockImageProcessor._post_call_image_processing(response, image_url) + return 
BedrockImageProcessor._post_call_image_processing( + response, image_url + ) except Exception as e: raise e @@ -2582,7 +3020,9 @@ def get_image_details(image_url) -> Tuple[str, str]: response = client.get(image_url, follow_redirects=True) response.raise_for_status() # Raise an exception for HTTP errors - return BedrockImageProcessor._post_call_image_processing(response, image_url) + return BedrockImageProcessor._post_call_image_processing( + response, image_url + ) except Exception as e: raise e @@ -2706,12 +3146,39 @@ def _create_bedrock_block( for video_type in supported_video_formats ) + HASH_SAMPLE_BYTES = 64 * 1024 # hash up to 64 KB of data + if is_document: + # --- Prepare normalized bytes for hashing (without modifying original) --- + if isinstance(image_bytes, str): + # Remove whitespace/newlines so base64 variations hash identically + normalized = "".join(image_bytes.split()).encode("utf-8") + else: + normalized = image_bytes + + # --- Use only the first 64 KB for speed --- + if len(normalized) <= HASH_SAMPLE_BYTES: + sample = normalized + else: + sample = normalized[:HASH_SAMPLE_BYTES] + + # --- Compute deterministic hash (sample + total length) --- + hasher = hashlib.sha256() + hasher.update(sample) + hasher.update( + str(len(normalized)).encode("utf-8") + ) # include full length for uniqueness + full_hash = hasher.hexdigest() + content_hash = full_hash[:16] # short deterministic ID + + document_name = f"DocumentPDFmessages_{content_hash}_{image_format}" + + # --- Return content block --- return BedrockContentBlock( document=BedrockDocumentBlock( source=_blob, format=image_format, - name=f"DocumentPDFmessages_{str(uuid.uuid4())}", + name=document_name, ) ) elif is_video: @@ -2818,6 +3285,11 @@ def _convert_to_bedrock_tool_call_invoke( id = tool["id"] name = tool["function"].get("name", "") arguments = tool["function"].get("arguments", "") + arguments_dict = json.loads(arguments) if arguments else {} + # Ensure arguments_dict is always a dict (Bedrock requires toolUse.input to be an object) + # When some providers return arguments: '""' (JSON-encoded empty string), json.loads returns "" + if not isinstance(arguments_dict, dict): + arguments_dict = {} if not arguments or not arguments.strip(): arguments_dict = {} else: @@ -2886,21 +3358,39 @@ def _convert_to_bedrock_tool_call_result( """ - """ - content_str: str = "" + tool_result_content_blocks: List[BedrockToolResultContentBlock] = [] if isinstance(message["content"], str): - content_str = message["content"] + tool_result_content_blocks.append( + BedrockToolResultContentBlock(text=message["content"]) + ) elif isinstance(message["content"], List): content_list = message["content"] for content in content_list: if content["type"] == "text": - content_str += content["text"] + tool_result_content_blocks.append( + BedrockToolResultContentBlock(text=content["text"]) + ) + elif content["type"] == "image_url": + format: Optional[str] = None + if isinstance(content["image_url"], dict): + image_url = content["image_url"]["url"] + format = content["image_url"].get("format") + else: + image_url = content["image_url"] + _block: BedrockContentBlock = BedrockImageProcessor.process_image_sync( + image_url=image_url, + format=format, + ) + if "image" in _block: + tool_result_content_blocks.append( + BedrockToolResultContentBlock(image=_block["image"]) + ) message.get("name", "") id = str(message.get("tool_call_id", str(uuid.uuid4()))) - tool_result_content_block = BedrockToolResultContentBlock(text=content_str) tool_result = 
BedrockToolResultBlock( - content=[tool_result_content_block], + content=tool_result_content_blocks, toolUseId=id, ) @@ -3209,8 +3699,25 @@ class BedrockConverseMessagesProcessor: @staticmethod def _initial_message_setup( messages: List, + model: str, + llm_provider: str, user_continue_message: Optional[ChatCompletionUserMessage] = None, ) -> List: + # gracefully handle base case of no messages at all + if len(messages) == 0: + if user_continue_message is not None: + messages.append(user_continue_message) + elif litellm.modify_params: + messages.append(DEFAULT_USER_CONTINUE_MESSAGE) + else: + raise litellm.BadRequestError( + message=BAD_MESSAGE_ERROR_STR + + "bedrock requires at least one non-system message", + model=model, + llm_provider=llm_provider, + ) + + # if initial message is assistant message if messages[0].get("role") is not None and messages[0]["role"] == "assistant": if user_continue_message is not None: messages.insert(0, user_continue_message) @@ -3238,18 +3745,8 @@ async def _bedrock_converse_messages_pt_async( # noqa: PLR0915 contents: List[BedrockMessageBlock] = [] msg_i = 0 - ## BASE CASE ## - if len(messages) == 0: - raise litellm.BadRequestError( - message=BAD_MESSAGE_ERROR_STR - + "bedrock requires at least one non-system message", - model=model, - llm_provider=llm_provider, - ) - - # if initial message is assistant message messages = BedrockConverseMessagesProcessor._initial_message_setup( - messages, user_continue_message + messages, model, llm_provider, user_continue_message ) while msg_i < len(messages): @@ -3610,28 +4107,9 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915 contents: List[BedrockMessageBlock] = [] msg_i = 0 - ## BASE CASE ## - if len(messages) == 0: - raise litellm.BadRequestError( - message=BAD_MESSAGE_ERROR_STR - + "bedrock requires at least one non-system message", - model=model, - llm_provider=llm_provider, - ) - - # if initial message is assistant message - if messages[0].get("role") is not None and messages[0]["role"] == "assistant": - if user_continue_message is not None: - messages.insert(0, user_continue_message) - elif litellm.modify_params: - messages.insert(0, DEFAULT_USER_CONTINUE_MESSAGE) - - # if final message is assistant message - if messages[-1].get("role") is not None and messages[-1]["role"] == "assistant": - if user_continue_message is not None: - messages.append(user_continue_message) - elif litellm.modify_params: - messages.append(DEFAULT_USER_CONTINUE_MESSAGE) + messages = BedrockConverseMessagesProcessor._initial_message_setup( + messages, model, llm_provider, user_continue_message + ) while msg_i < len(messages): user_content: List[BedrockContentBlock] = [] @@ -3811,7 +4289,11 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915 assistant_parts=assistants_parts, ) elif element["type"] == "text": - assistants_part = BedrockContentBlock(text=element["text"]) + # AWS Bedrock doesn't allow empty or whitespace-only text content, so use placeholder for empty strings + text_content = ( + element["text"] if element["text"].strip() else "." 
+ ) + assistants_part = BedrockContentBlock(text=text_content) assistants_parts.append(assistants_part) elif element["type"] == "image_url": if isinstance(element["image_url"], dict): @@ -3835,7 +4317,9 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915 assistants_parts.append(_cache_point_block) assistant_content.extend(assistants_parts) elif _assistant_content is not None and isinstance(_assistant_content, str): - assistant_content.append(BedrockContentBlock(text=_assistant_content)) + # AWS Bedrock doesn't allow empty or whitespace-only text content, so use placeholder for empty strings + text_content = _assistant_content if _assistant_content.strip() else "." + assistant_content.append(BedrockContentBlock(text=text_content)) # Add cache point block for assistant string content _cache_point_block = ( litellm.AmazonConverseConfig()._get_cache_point_block( @@ -3911,6 +4395,32 @@ def add_cache_point_tool_block(tool: dict) -> Optional[BedrockToolBlock]: return None +def _is_bedrock_tool_block(tool: dict) -> bool: + """ + Check if a tool is already a BedrockToolBlock. + + BedrockToolBlock has one of: systemTool, toolSpec, or cachePoint. + This is used to detect tools that are already in Bedrock format + (e.g., systemTool for Nova grounding) vs OpenAI-style function tools + that need transformation. + + Args: + tool: The tool dict to check + + Returns: + True if the tool is already a BedrockToolBlock, False otherwise + + Examples: + >>> _is_bedrock_tool_block({"systemTool": {"name": "nova_grounding"}}) + True + >>> _is_bedrock_tool_block({"type": "function", "function": {...}}) + False + """ + return isinstance(tool, dict) and ( + "systemTool" in tool or "toolSpec" in tool or "cachePoint" in tool + ) + + def _bedrock_tools_pt(tools: List) -> List[BedrockToolBlock]: """ OpenAI tools looks like: @@ -3936,7 +4446,7 @@ def _bedrock_tools_pt(tools: List) -> List[BedrockToolBlock]: ] """ """ - Bedrock toolConfig looks like: + Bedrock toolConfig looks like: "tools": [ { "toolSpec": { @@ -3964,6 +4474,13 @@ def _bedrock_tools_pt(tools: List) -> List[BedrockToolBlock]: tool_block_list: List[BedrockToolBlock] = [] for tool in tools: + # Check if tool is already a BedrockToolBlock (e.g., systemTool for Nova grounding) + if _is_bedrock_tool_block(tool): + # Already a BedrockToolBlock, pass it through + tool_block_list.append(tool) # type: ignore + continue + + # Handle regular OpenAI-style function tools parameters = tool.get("function", {}).get( "parameters", {"type": "object", "properties": {}} ) @@ -3980,9 +4497,10 @@ def _bedrock_tools_pt(tools: List) -> List[BedrockToolBlock]: defs = parameters.pop("$defs", {}) defs_copy = copy.deepcopy(defs) - # flatten the defs - for _, value in defs_copy.items(): - unpack_defs(value, defs_copy) + # Expand $ref references in parameters using the definitions + # Note: We don't pre-flatten defs as that causes exponential memory growth + # with circular references (see issue #19098). unpack_defs handles nested + # refs recursively and correctly detects/skips circular references. 
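+        # e.g. a property schema of {"$ref": "#/$defs/Address"} is expanded in place using the
+        # matching entry from defs_copy, so Bedrock receives the referenced definition inline
+        # rather than a bare $ref pointer (illustrative example; exact schema names will vary).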
unpack_defs(parameters, defs_copy) tool_input_schema = BedrockToolInputSchemaBlock( json=BedrockToolJsonSchemaBlock( diff --git a/litellm/litellm_core_utils/prompt_templates/image_handling.py b/litellm/litellm_core_utils/prompt_templates/image_handling.py index 4fa10e421112..7137a4e42229 100644 --- a/litellm/litellm_core_utils/prompt_templates/image_handling.py +++ b/litellm/litellm_core_utils/prompt_templates/image_handling.py @@ -9,6 +9,7 @@ import litellm from litellm import verbose_logger from litellm.caching.caching import InMemoryCache +from litellm.constants import MAX_IMAGE_URL_DOWNLOAD_SIZE_MB MAX_IMGS_IN_MEMORY = 10 @@ -21,7 +22,29 @@ def _process_image_response(response: Response, url: str) -> str: f"Error: Unable to fetch image from URL. Status code: {response.status_code}, url={url}" ) - image_bytes = response.content + # Check size before downloading if Content-Length header is present + content_length = response.headers.get("Content-Length") + if content_length is not None: + size_mb = int(content_length) / (1024 * 1024) + if size_mb > MAX_IMAGE_URL_DOWNLOAD_SIZE_MB: + raise litellm.ImageFetchError( + f"Error: Image size ({size_mb:.2f}MB) exceeds maximum allowed size ({MAX_IMAGE_URL_DOWNLOAD_SIZE_MB}MB). url={url}" + ) + + # Stream download with size checking to prevent downloading huge files + max_bytes = int(MAX_IMAGE_URL_DOWNLOAD_SIZE_MB * 1024 * 1024) + image_bytes = bytearray() + bytes_downloaded = 0 + + for chunk in response.iter_bytes(chunk_size=8192): + bytes_downloaded += len(chunk) + if bytes_downloaded > max_bytes: + size_mb = bytes_downloaded / (1024 * 1024) + raise litellm.ImageFetchError( + f"Error: Image size ({size_mb:.2f}MB) exceeds maximum allowed size ({MAX_IMAGE_URL_DOWNLOAD_SIZE_MB}MB). url={url}" + ) + image_bytes.extend(chunk) + base64_image = base64.b64encode(image_bytes).decode("utf-8") image_type = response.headers.get("Content-Type") @@ -48,6 +71,12 @@ def _process_image_response(response: Response, url: str) -> str: async def async_convert_url_to_base64(url: str) -> str: + # If MAX_IMAGE_URL_DOWNLOAD_SIZE_MB is 0, block all image downloads + if MAX_IMAGE_URL_DOWNLOAD_SIZE_MB == 0: + raise litellm.ImageFetchError( + f"Error: Image URL download is disabled (MAX_IMAGE_URL_DOWNLOAD_SIZE_MB=0). url={url}" + ) + cached_result = in_memory_cache.get_cache(url) if cached_result: return cached_result @@ -67,6 +96,12 @@ async def async_convert_url_to_base64(url: str) -> str: def convert_url_to_base64(url: str) -> str: + # If MAX_IMAGE_URL_DOWNLOAD_SIZE_MB is 0, block all image downloads + if MAX_IMAGE_URL_DOWNLOAD_SIZE_MB == 0: + raise litellm.ImageFetchError( + f"Error: Image URL download is disabled (MAX_IMAGE_URL_DOWNLOAD_SIZE_MB=0). url={url}" + ) + cached_result = in_memory_cache.get_cache(url) if cached_result: return cached_result diff --git a/litellm/litellm_core_utils/redact_messages.py b/litellm/litellm_core_utils/redact_messages.py index 849cb20cdc5f..0effed3db707 100644 --- a/litellm/litellm_core_utils/redact_messages.py +++ b/litellm/litellm_core_utils/redact_messages.py @@ -7,17 +7,17 @@ # # Thank you users! We ❤️ you! 
- Krrish & Ishaan +import asyncio import copy from typing import TYPE_CHECKING, Any, Optional import litellm from litellm.integrations.custom_logger import CustomLogger -from litellm.secret_managers.main import str_to_bool -from litellm.types.utils import StandardCallbackDynamicParams from litellm.litellm_core_utils.core_helpers import ( get_metadata_variable_name_from_kwargs, ) -import asyncio +from litellm.secret_managers.main import str_to_bool +from litellm.types.utils import StandardCallbackDynamicParams if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import ( diff --git a/litellm/litellm_core_utils/safe_json_dumps.py b/litellm/litellm_core_utils/safe_json_dumps.py index c714e36b5f93..8b50e41a795d 100644 --- a/litellm/litellm_core_utils/safe_json_dumps.py +++ b/litellm/litellm_core_utils/safe_json_dumps.py @@ -49,4 +49,4 @@ def _serialize(obj: Any, seen: set, depth: int) -> Any: return "Unserializable Object" safe_data = _serialize(data, set(), 0) - return json.dumps(safe_data, default=str) + return json.dumps(safe_data, default=str) \ No newline at end of file diff --git a/litellm/litellm_core_utils/sensitive_data_masker.py b/litellm/litellm_core_utils/sensitive_data_masker.py index ea0bed304162..8b6ae7446373 100644 --- a/litellm/litellm_core_utils/sensitive_data_masker.py +++ b/litellm/litellm_core_utils/sensitive_data_masker.py @@ -1,4 +1,5 @@ -from typing import Any, Dict, Optional, Set +from collections.abc import Mapping +from typing import Any, Dict, List, Optional, Set from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH_SENSITIVE_DATA_MASKER @@ -17,6 +18,7 @@ def __init__( "key", "token", "auth", + "authorization", "credential", "access", "private", @@ -42,23 +44,58 @@ def _mask_value(self, value: str) -> str: else: return f"{value_str[:self.visible_prefix]}{self.mask_char * masked_length}{value_str[-self.visible_suffix:]}" - def is_sensitive_key(self, key: str) -> bool: + def is_sensitive_key( + self, key: str, excluded_keys: Optional[Set[str]] = None + ) -> bool: + # Check if key is in excluded_keys first (exact match) + if excluded_keys and key in excluded_keys: + return False + key_lower = str(key).lower() - # Split on underscores and check if any segment matches the pattern + # Split on underscores/hyphens and check if any segment matches the pattern # This avoids false positives like "max_tokens" matching "token" # but still catches "api_key", "access_token", etc. 
- key_segments = key_lower.replace('-', '_').split('_') - result = any( - pattern in key_segments - for pattern in self.sensitive_patterns - ) + key_segments = key_lower.replace("-", "_").split("_") + result = any(pattern in key_segments for pattern in self.sensitive_patterns) return result + def _mask_sequence( + self, + values: List[Any], + depth: int, + max_depth: int, + excluded_keys: Optional[Set[str]], + key_is_sensitive: bool, + ) -> List[Any]: + masked_items: List[Any] = [] + if depth >= max_depth: + return values + + for item in values: + if isinstance(item, Mapping): + masked_items.append( + self.mask_dict(dict(item), depth + 1, max_depth, excluded_keys) + ) + elif isinstance(item, list): + masked_items.append( + self._mask_sequence( + item, depth + 1, max_depth, excluded_keys, key_is_sensitive + ) + ) + elif key_is_sensitive and isinstance(item, str): + masked_items.append(self._mask_value(item)) + else: + masked_items.append( + item if isinstance(item, (int, float, bool, str, list)) else str(item) + ) + return masked_items + def mask_dict( self, data: Dict[str, Any], depth: int = 0, max_depth: int = DEFAULT_MAX_RECURSE_DEPTH_SENSITIVE_DATA_MASKER, + excluded_keys: Optional[Set[str]] = None, ) -> Dict[str, Any]: if depth >= max_depth: return data @@ -66,11 +103,20 @@ def mask_dict( masked_data: Dict[str, Any] = {} for k, v in data.items(): try: - if isinstance(v, dict): - masked_data[k] = self.mask_dict(v, depth + 1) + key_is_sensitive = self.is_sensitive_key(k, excluded_keys) + if isinstance(v, Mapping): + masked_data[k] = self.mask_dict( + dict(v), depth + 1, max_depth, excluded_keys + ) + elif isinstance(v, list): + masked_data[k] = self._mask_sequence( + v, depth + 1, max_depth, excluded_keys, key_is_sensitive + ) elif hasattr(v, "__dict__") and not isinstance(v, type): - masked_data[k] = self.mask_dict(vars(v), depth + 1) - elif self.is_sensitive_key(k): + masked_data[k] = self.mask_dict( + vars(v), depth + 1, max_depth, excluded_keys + ) + elif key_is_sensitive: str_value = str(v) if v is not None else "" masked_data[k] = self._mask_value(str_value) else: diff --git a/litellm/litellm_core_utils/streaming_chunk_builder_utils.py b/litellm/litellm_core_utils/streaming_chunk_builder_utils.py index 89c933461551..77f6287ec560 100644 --- a/litellm/litellm_core_utils/streaming_chunk_builder_utils.py +++ b/litellm/litellm_core_utils/streaming_chunk_builder_utils.py @@ -19,6 +19,7 @@ ModelResponse, ModelResponseStream, PromptTokensDetailsWrapper, + ServerToolUse, Usage, ) from litellm.utils import print_verbose, token_counter @@ -114,12 +115,31 @@ def _get_chunk_id(chunks: List[Dict[str, Any]]) -> str: return chunk["id"] return "" + @staticmethod + def _get_model_from_chunks(chunks: List[Dict[str, Any]], first_chunk_model: str) -> str: + """ + Get the actual model from chunks, preferring a model that differs from the first chunk. + + For Azure Model Router, the first chunk may have the request model (e.g., 'azure-model-router') + while subsequent chunks have the actual model (e.g., 'gpt-4.1-nano-2025-04-14'). + This method finds the actual model for accurate cost calculation. 
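+
+        Example: if first_chunk_model is 'azure-model-router' and a later chunk reports
+        'gpt-4.1-nano-2025-04-14', the latter is returned for cost calculation.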
+ """ + # Look for a model in chunks that differs from the first chunk's model + for chunk in chunks: + chunk_model = chunk.get("model") + if chunk_model and chunk_model != first_chunk_model: + return chunk_model + # Fall back to first chunk's model if no different model found + return first_chunk_model + def build_base_response(self, chunks: List[Dict[str, Any]]) -> ModelResponse: chunk = self.first_chunk id = ChunkProcessor._get_chunk_id(chunks) object = chunk["object"] created = chunk["created"] - model = chunk["model"] + first_chunk_model = chunk["model"] + # Get the actual model - for Azure Model Router, this finds the real model from later chunks + model = ChunkProcessor._get_model_from_chunks(chunks, first_chunk_model) system_fingerprint = chunk.get("system_fingerprint", None) role = chunk["choices"][0]["delta"]["role"] @@ -159,7 +179,7 @@ def build_base_response(self, chunks: List[Dict[str, Any]]) -> ModelResponse: ) return response - def get_combined_tool_content( + def get_combined_tool_content( # noqa: PLR0915 self, tool_call_chunks: List[Dict[str, Any]] ) -> List[ChatCompletionMessageToolCall]: tool_calls_list: List[ChatCompletionMessageToolCall] = [] @@ -174,34 +194,93 @@ def get_combined_tool_content( tool_calls = delta.get("tool_calls", []) for tool_call in tool_calls: - if not tool_call or not hasattr(tool_call, "function"): + # Handle both dict and object formats + if not tool_call: + continue + + # Check if tool_call has function (either as attribute or dict key) + has_function = False + if isinstance(tool_call, dict): + has_function = "function" in tool_call and tool_call["function"] is not None + else: + has_function = hasattr(tool_call, "function") and tool_call.function is not None + + if not has_function: continue - index = getattr(tool_call, "index", 0) + # Get index (handle both dict and object) + if isinstance(tool_call, dict): + index = tool_call.get("index", 0) + else: + index = getattr(tool_call, "index", 0) + if index not in tool_call_map: tool_call_map[index] = { "id": None, "name": None, "type": None, "arguments": [], + "provider_specific_fields": None, } - if hasattr(tool_call, "id") and tool_call.id: - tool_call_map[index]["id"] = tool_call.id - if hasattr(tool_call, "type") and tool_call.type: - tool_call_map[index]["type"] = tool_call.type - if hasattr(tool_call, "function"): - if ( - hasattr(tool_call.function, "name") - and tool_call.function.name - ): - tool_call_map[index]["name"] = tool_call.function.name - if ( - hasattr(tool_call.function, "arguments") - and tool_call.function.arguments - ): - tool_call_map[index]["arguments"].append( - tool_call.function.arguments + # Extract id, type, and function data (handle both dict and object) + if isinstance(tool_call, dict): + if tool_call.get("id"): + tool_call_map[index]["id"] = tool_call["id"] + if tool_call.get("type"): + tool_call_map[index]["type"] = tool_call["type"] + + function = tool_call.get("function", {}) + if isinstance(function, dict): + if function.get("name"): + tool_call_map[index]["name"] = function["name"] + if function.get("arguments"): + tool_call_map[index]["arguments"].append(function["arguments"]) + else: + # function is an object + if hasattr(function, "name") and function.name: + tool_call_map[index]["name"] = function.name + if hasattr(function, "arguments") and function.arguments: + tool_call_map[index]["arguments"].append(function.arguments) + else: + # tool_call is an object + if hasattr(tool_call, "id") and tool_call.id: + tool_call_map[index]["id"] = tool_call.id + if 
hasattr(tool_call, "type") and tool_call.type: + tool_call_map[index]["type"] = tool_call.type + if hasattr(tool_call, "function"): + if ( + hasattr(tool_call.function, "name") + and tool_call.function.name + ): + tool_call_map[index]["name"] = tool_call.function.name + if ( + hasattr(tool_call.function, "arguments") + and tool_call.function.arguments + ): + tool_call_map[index]["arguments"].append( + tool_call.function.arguments + ) + + # Preserve provider_specific_fields from streaming chunks + provider_fields = None + if isinstance(tool_call, dict): + provider_fields = tool_call.get("provider_specific_fields") + if not provider_fields and isinstance(tool_call.get("function"), dict): + provider_fields = tool_call["function"].get("provider_specific_fields") + else: + if hasattr(tool_call, "provider_specific_fields") and tool_call.provider_specific_fields: + provider_fields = tool_call.provider_specific_fields + elif hasattr(tool_call, "function") and hasattr(tool_call.function, "provider_specific_fields") and tool_call.function.provider_specific_fields: + provider_fields = tool_call.function.provider_specific_fields + + if provider_fields: + # Merge provider_specific_fields if multiple chunks have them + if tool_call_map[index]["provider_specific_fields"] is None: + tool_call_map[index]["provider_specific_fields"] = {} + if isinstance(provider_fields, dict): + tool_call_map[index]["provider_specific_fields"].update( + provider_fields ) # Convert the map to a list of tool calls @@ -210,19 +289,30 @@ def get_combined_tool_content( if tool_call_data["id"] and tool_call_data["name"]: raw_arguments = "".join(tool_call_data["arguments"]) combined_arguments = _validate_and_repair_tool_arguments(raw_arguments) - tool_calls_list.append( - ChatCompletionMessageToolCall( - id=tool_call_data["id"], - function=Function( - arguments=combined_arguments, - name=tool_call_data["name"], - ), - type=tool_call_data["type"] or "function", - ) + + # Build function - provider_specific_fields should be on tool_call level, not function level + function = Function( + arguments=combined_arguments, + name=tool_call_data["name"], ) + + # Prepare params for ChatCompletionMessageToolCall + tool_call_params = { + "id": tool_call_data["id"], + "function": function, + "type": tool_call_data["type"] or "function", + } + + # Add provider_specific_fields if present (for thought signatures in Gemini 3) + if tool_call_data.get("provider_specific_fields"): + tool_call_params["provider_specific_fields"] = tool_call_data["provider_specific_fields"] + + tool_call = ChatCompletionMessageToolCall(**tool_call_params) + tool_calls_list.append(tool_call) return tool_calls_list + def get_combined_function_call_content( self, function_call_chunks: List[Dict[str, Any]] ) -> FunctionCall: @@ -440,7 +530,8 @@ def _calculate_usage_per_chunk( ## anthropic prompt caching information ## cache_creation_input_tokens: Optional[int] = None cache_read_input_tokens: Optional[int] = None - + + server_tool_use: Optional[ServerToolUse] = None web_search_requests: Optional[int] = None completion_tokens_details: Optional[CompletionTokensDetails] = None prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None @@ -484,6 +575,8 @@ def _calculate_usage_per_chunk( completion_tokens_details = usage_chunk_dict[ "completion_tokens_details" ] + if hasattr(usage_chunk, 'server_tool_use') and usage_chunk.server_tool_use is not None: + server_tool_use = usage_chunk.server_tool_use if ( usage_chunk_dict["prompt_tokens_details"] is not None and getattr( @@ 
-505,6 +598,7 @@ def _calculate_usage_per_chunk( completion_tokens=completion_tokens, cache_creation_input_tokens=cache_creation_input_tokens, cache_read_input_tokens=cache_read_input_tokens, + server_tool_use=server_tool_use, web_search_requests=web_search_requests, completion_tokens_details=completion_tokens_details, prompt_tokens_details=prompt_tokens_details, @@ -535,6 +629,9 @@ def calculate_usage( "cache_read_input_tokens" ] + server_tool_use: Optional[ServerToolUse] = calculated_usage_per_chunk[ + "server_tool_use" + ] web_search_requests: Optional[int] = calculated_usage_per_chunk[ "web_search_requests" ] @@ -598,6 +695,8 @@ def calculate_usage( if prompt_tokens_details is not None: returned_usage.prompt_tokens_details = prompt_tokens_details + if server_tool_use is not None: + returned_usage.server_tool_use = server_tool_use if web_search_requests is not None: if returned_usage.prompt_tokens_details is None: returned_usage.prompt_tokens_details = PromptTokensDetailsWrapper( diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py index 64223a9ba4e8..c6f0f67976f3 100644 --- a/litellm/litellm_core_utils/streaming_handler.py +++ b/litellm/litellm_core_utils/streaming_handler.py @@ -25,6 +25,7 @@ ) from litellm.types.utils import GenericStreamingChunk as GChunk from litellm.types.utils import ( + LlmProviders, ModelResponse, ModelResponseStream, StreamingChoices, @@ -96,9 +97,9 @@ def __init__( self.system_fingerprint: Optional[str] = None self.received_finish_reason: Optional[str] = None - self.intermittent_finish_reason: Optional[str] = ( - None # finish reasons that show up mid-stream - ) + self.intermittent_finish_reason: Optional[ + str + ] = None # finish reasons that show up mid-stream self.special_tokens = [ "<|assistant|>", "<|system|>", @@ -441,7 +442,6 @@ def handle_openai_chat_completion_chunk(self, chunk): finish_reason = None logprobs = None usage = None - if str_line and str_line.choices and len(str_line.choices) > 0: if ( str_line.choices[0].delta is not None @@ -734,6 +734,15 @@ def is_chunk_non_empty( "function_call" in completion_obj and completion_obj["function_call"] is not None ) + or ( + "tool_calls" in model_response.choices[0].delta + and model_response.choices[0].delta["tool_calls"] is not None + and len(model_response.choices[0].delta["tool_calls"]) > 0 + ) + or ( + "function_call" in model_response.choices[0].delta + and model_response.choices[0].delta["function_call"] is not None + ) or ( "reasoning_content" in model_response.choices[0].delta and model_response.choices[0].delta.reasoning_content is not None @@ -881,7 +890,6 @@ def return_processed_chunk_logic( # noqa ## check if openai/azure chunk original_chunk = response_obj.get("original_chunk", None) if original_chunk: - if len(original_chunk.choices) > 0: choices = [] for choice in original_chunk.choices: @@ -898,7 +906,6 @@ def return_processed_chunk_logic( # noqa print_verbose(f"choices in streaming: {choices}") setattr(model_response, "choices", choices) else: - return model_response.system_fingerprint = ( original_chunk.system_fingerprint @@ -1295,9 +1302,9 @@ def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915 if response_obj["is_finished"]: self.received_finish_reason = response_obj["finish_reason"] else: # openai / azure chat model - if self.custom_llm_provider == "azure": + if self.custom_llm_provider in [LlmProviders.AZURE.value, LlmProviders.AZURE_AI.value]: if isinstance(chunk, BaseModel) and hasattr(chunk, "model"): - # 
for azure, we need to pass the model from the orignal chunk + # for azure, we need to pass the model from the original chunk self.model = getattr(chunk, "model", self.model) response_obj = self.handle_openai_chat_completion_chunk(chunk) if response_obj is None: @@ -1427,9 +1434,9 @@ def chunk_creator(self, chunk: Any): # type: ignore # noqa: PLR0915 _json_delta = delta.model_dump() print_verbose(f"_json_delta: {_json_delta}") if "role" not in _json_delta or _json_delta["role"] is None: - _json_delta["role"] = ( - "assistant" # mistral's api returns role as None - ) + _json_delta[ + "role" + ] = "assistant" # mistral's api returns role as None if "tool_calls" in _json_delta and isinstance( _json_delta["tool_calls"], list ): @@ -1525,7 +1532,7 @@ def set_logging_event_loop(self, loop): async def _call_post_streaming_deployment_hook(self, chunk): """ Call the post-call streaming deployment hook for callbacks. - + This allows callbacks to modify streaming chunks before they're returned. """ try: @@ -1536,15 +1543,17 @@ async def _call_post_streaming_deployment_hook(self, chunk): # Get request kwargs from logging object request_data = self.logging_obj.model_call_details call_type_str = self.logging_obj.call_type - + try: typed_call_type = CallTypes(call_type_str) except ValueError: typed_call_type = None - + # Call hooks for all callbacks for callback in litellm.callbacks: - if isinstance(callback, CustomLogger) and hasattr(callback, "async_post_call_streaming_deployment_hook"): + if isinstance(callback, CustomLogger) and hasattr( + callback, "async_post_call_streaming_deployment_hook" + ): result = await callback.async_post_call_streaming_deployment_hook( request_data=request_data, response_chunk=chunk, @@ -1552,13 +1561,100 @@ async def _call_post_streaming_deployment_hook(self, chunk): ) if result is not None: chunk = result - + return chunk except Exception as e: from litellm._logging import verbose_logger - verbose_logger.exception(f"Error in post-call streaming deployment hook: {str(e)}") + + verbose_logger.exception( + f"Error in post-call streaming deployment hook: {str(e)}" + ) return chunk + def _add_mcp_list_tools_to_first_chunk(self, chunk: ModelResponseStream) -> ModelResponseStream: + """ + Add mcp_list_tools from _hidden_params to the first chunk's delta.provider_specific_fields. + + This method checks if MCP metadata with mcp_list_tools is stored in _hidden_params + and adds it to the first chunk's delta.provider_specific_fields. 
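+        Returns the chunk unchanged when no MCP metadata is present or an error occurs.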
+ """ + try: + # Check if MCP metadata should be added to first chunk + if not hasattr(self, "_hidden_params") or not self._hidden_params: + return chunk + + mcp_metadata = self._hidden_params.get("mcp_metadata") + if not mcp_metadata or not isinstance(mcp_metadata, dict): + return chunk + + # Only add mcp_list_tools to first chunk (not tool_calls or tool_results) + mcp_list_tools = mcp_metadata.get("mcp_list_tools") + if not mcp_list_tools: + return chunk + + # Add mcp_list_tools to delta.provider_specific_fields + if hasattr(chunk, "choices") and chunk.choices: + for choice in chunk.choices: + if isinstance(choice, StreamingChoices) and hasattr(choice, "delta") and choice.delta: + # Get existing provider_specific_fields or create new dict + provider_fields = ( + getattr(choice.delta, "provider_specific_fields", None) or {} + ) + + # Add only mcp_list_tools to first chunk + provider_fields["mcp_list_tools"] = mcp_list_tools + + # Set the provider_specific_fields + setattr(choice.delta, "provider_specific_fields", provider_fields) + + except Exception as e: + from litellm._logging import verbose_logger + verbose_logger.exception( + f"Error adding MCP list tools to first chunk: {str(e)}" + ) + + return chunk + + def _add_mcp_metadata_to_final_chunk(self, chunk: ModelResponseStream) -> ModelResponseStream: + """ + Add MCP metadata from _hidden_params to the final chunk's delta.provider_specific_fields. + + This method checks if MCP metadata is stored in _hidden_params and adds it to + the chunk's delta.provider_specific_fields, similar to how RAG adds search results. + """ + try: + # Check if MCP metadata should be added to final chunk + if not hasattr(self, "_hidden_params") or not self._hidden_params: + return chunk + + mcp_metadata = self._hidden_params.get("mcp_metadata") + if not mcp_metadata: + return chunk + + # Add MCP metadata to delta.provider_specific_fields + if hasattr(chunk, "choices") and chunk.choices: + for choice in chunk.choices: + if isinstance(choice, StreamingChoices) and hasattr(choice, "delta") and choice.delta: + # Get existing provider_specific_fields or create new dict + provider_fields = ( + getattr(choice.delta, "provider_specific_fields", None) or {} + ) + + # Add MCP metadata + if isinstance(mcp_metadata, dict): + provider_fields.update(mcp_metadata) + + # Set the provider_specific_fields + setattr(choice.delta, "provider_specific_fields", provider_fields) + + except Exception as e: + from litellm._logging import verbose_logger + verbose_logger.exception( + f"Error adding MCP metadata to final chunk: {str(e)}" + ) + + return chunk + def cache_streaming_response(self, processed_chunk, cache_hit: bool): """ Caches the streaming response @@ -1675,11 +1771,17 @@ def __next__(self): # noqa: PLR0915 ) # HANDLE STREAM OPTIONS self.chunks.append(response) + + # Add mcp_list_tools to first chunk if present + if not self.sent_first_chunk: + response = self._add_mcp_list_tools_to_first_chunk(response) + self.sent_first_chunk = True + if hasattr( response, "usage" ): # remove usage from chunk, only send on final chunk # Convert the object to a dictionary - obj_dict = response.dict() + obj_dict = response.model_dump() # Remove an attribute (e.g., 'attr2') if "usage" in obj_dict: @@ -1700,6 +1802,8 @@ def __next__(self): # noqa: PLR0915 if self.sent_last_chunk is True and self.stream_options is None: usage = calculate_total_usage(chunks=self.chunks) response._hidden_params["usage"] = usage + # Add MCP metadata to final chunk if present + response = 
self._add_mcp_metadata_to_final_chunk(response) # RETURN RESULT return response @@ -1840,11 +1944,16 @@ async def __anext__(self): # noqa: PLR0915 input=self.response_uptil_now, model=self.model ) self.chunks.append(processed_chunk) + + # Add mcp_list_tools to first chunk if present + if not self.sent_first_chunk: + processed_chunk = self._add_mcp_list_tools_to_first_chunk(processed_chunk) + self.sent_first_chunk = True if hasattr( processed_chunk, "usage" ): # remove usage from chunk, only send on final chunk # Convert the object to a dictionary - obj_dict = processed_chunk.dict() + obj_dict = processed_chunk.model_dump() # Remove an attribute (e.g., 'attr2') if "usage" in obj_dict: @@ -1864,11 +1973,17 @@ async def __anext__(self): # noqa: PLR0915 if self.sent_last_chunk is True and self.stream_options is None: usage = calculate_total_usage(chunks=self.chunks) processed_chunk._hidden_params["usage"] = usage - + # Call post-call streaming deployment hook for final chunk if self.sent_last_chunk is True: - processed_chunk = await self._call_post_streaming_deployment_hook(processed_chunk) - + processed_chunk = ( + await self._call_post_streaming_deployment_hook( + processed_chunk + ) + ) + # Add MCP metadata to final chunk if present (after hooks) + processed_chunk = self._add_mcp_metadata_to_final_chunk(processed_chunk) + return processed_chunk raise StopAsyncIteration else: # temporary patch for non-aiohttp async calls @@ -1882,9 +1997,9 @@ async def __anext__(self): # noqa: PLR0915 chunk = next(self.completion_stream) if chunk is not None and chunk != b"": print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") - processed_chunk: Optional[ModelResponseStream] = ( - self.chunk_creator(chunk=chunk) - ) + processed_chunk: Optional[ + ModelResponseStream + ] = self.chunk_creator(chunk=chunk) print_verbose( f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" ) @@ -1985,24 +2100,56 @@ async def __anext__(self): # noqa: PLR0915 ) ## Map to OpenAI Exception try: - raise exception_type( + mapped_exception = exception_type( model=self.model, custom_llm_provider=self.custom_llm_provider, original_exception=e, completion_kwargs={}, extra_kwargs={}, ) - except Exception as e: - from litellm.exceptions import MidStreamFallbackError + except Exception as mapping_error: + mapped_exception = mapping_error + + def _normalize_status_code(exc: Exception) -> Optional[int]: + """ + Best-effort status_code extraction. + Uses status_code on the exception, then falls back to the response. 
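+                        Returns None when neither carries a usable status code.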
+ """ + try: + code = getattr(exc, "status_code", None) + if code is not None: + return int(code) + except Exception: + pass - raise MidStreamFallbackError( - message=str(e), - model=self.model, - llm_provider=self.custom_llm_provider or "anthropic", - original_exception=e, - generated_content=self.response_uptil_now, - is_pre_first_chunk=not self.sent_first_chunk, - ) + response = getattr(exc, "response", None) + if response is not None: + try: + status_code = getattr(response, "status_code", None) + if status_code is not None: + return int(status_code) + except Exception: + pass + return None + + mapped_status_code = _normalize_status_code(mapped_exception) + original_status_code = _normalize_status_code(e) + + if mapped_status_code is not None and 400 <= mapped_status_code < 500: + raise mapped_exception + if original_status_code is not None and 400 <= original_status_code < 500: + raise mapped_exception + + from litellm.exceptions import MidStreamFallbackError + + raise MidStreamFallbackError( + message=str(mapped_exception), + model=self.model, + llm_provider=self.custom_llm_provider or "anthropic", + original_exception=mapped_exception, + generated_content=self.response_uptil_now, + is_pre_first_chunk=not self.sent_first_chunk, + ) @staticmethod def _strip_sse_data_from_chunk(chunk: Optional[str]) -> Optional[str]: diff --git a/litellm/litellm_core_utils/token_counter.py b/litellm/litellm_core_utils/token_counter.py index fab2c1e76ee4..a99bd1cd0f3a 100644 --- a/litellm/litellm_core_utils/token_counter.py +++ b/litellm/litellm_core_utils/token_counter.py @@ -3,7 +3,17 @@ import base64 import io import struct -from typing import Callable, List, Literal, Optional, Tuple, Union, cast +from typing import ( + Any, + Callable, + List, + Literal, + Mapping, + Optional, + Tuple, + Union, + cast, +) import tiktoken @@ -20,6 +30,10 @@ ) from litellm.litellm_core_utils.default_encoding import encoding as default_encoding from litellm.llms.custom_httpx.http_handler import _get_httpx_client +from litellm.types.llms.anthropic import ( + AnthropicMessagesToolResultParam, + AnthropicMessagesToolUseParam, +) from litellm.types.llms.openai import ( AllMessageValues, ChatCompletionNamedToolChoiceParam, @@ -552,6 +566,131 @@ def _fix_model_name(model: str) -> str: return "gpt-3.5-turbo" +def _count_image_tokens( + image_url: Any, + use_default_image_token_count: bool, +) -> int: + """ + Count tokens for an image_url content block. + + Args: + image_url: The image URL data - can be a string URL or dict with 'url' and 'detail' + use_default_image_token_count: Whether to use default image token counts + + Returns: + int: Number of tokens for the image + + Raises: + ValueError: If image_url is invalid type or detail value is invalid + """ + if isinstance(image_url, dict): + detail = image_url.get("detail", "auto") + if detail not in ["low", "high", "auto"]: + raise ValueError( + f"Invalid detail value: {detail}. Expected 'low', 'high', or 'auto'." 
+ ) + url = image_url.get("url") + if not url: + raise ValueError("Missing required key 'url' in image_url dict.") + return calculate_img_tokens( + data=url, + mode=detail, # type: ignore + use_default_image_token_count=use_default_image_token_count, + ) + elif isinstance(image_url, str): + if not image_url.strip(): + raise ValueError("Empty image_url string is not valid.") + return calculate_img_tokens( + data=image_url, + mode="auto", + use_default_image_token_count=use_default_image_token_count, + ) + else: + raise ValueError( + f"Invalid image_url type: {type(image_url).__name__}. " + "Expected str or dict with 'url' field." + ) + + +def _validate_anthropic_content(content: Mapping[str, Any]) -> type: + """ + Validate and determine which Anthropic TypedDict applies. + + Returns the corresponding TypedDict class if recognized, otherwise raises. + """ + content_type = content.get("type") + if not content_type: + raise ValueError("Anthropic content missing required field: 'type'") + + mapping = { + "tool_use": AnthropicMessagesToolUseParam, + "tool_result": AnthropicMessagesToolResultParam, + } + + expected_cls = mapping.get(content_type) + if expected_cls is None: + raise ValueError(f"Unknown Anthropic content type: '{content_type}'") + + missing = [ + k for k in getattr(expected_cls, "__required_keys__", set()) if k not in content + ] + if missing: + raise ValueError( + f"Missing required fields in {content_type} block: {', '.join(missing)}" + ) + + return expected_cls + + +def _count_anthropic_content( + content: Mapping[str, Any], + count_function: TokenCounterFunction, + use_default_image_token_count: bool, + default_token_count: Optional[int], +) -> int: + """ + Count tokens in Anthropic-specific content blocks (tool_use, tool_result, etc.). + + Uses TypedDict definitions from litellm.types.llms.anthropic to determine + what fields to count and how to handle nested structures. + + Dynamically infers which fields to count based on the TypedDict definition, + avoiding hardcoded field names. + """ + typeddict_cls = _validate_anthropic_content(content) + type_hints = getattr(typeddict_cls, "__annotations__", {}) + tokens = 0 + + # Fields to skip (metadata/identifiers that don't contribute to prompt tokens) + skip_fields = {"type", "id", "tool_use_id", "cache_control", "is_error"} + + # Iterate over all fields defined in the TypedDict + for field_name, field_type in type_hints.items(): + if field_name in skip_fields: + continue + + field_value = content.get(field_name) + if field_value is None: + continue + try: + if isinstance(field_value, str): + tokens += count_function(field_value) + elif isinstance(field_value, list): + tokens += _count_content_list( + count_function, + field_value, # type: ignore + use_default_image_token_count, + default_token_count, + ) + elif isinstance(field_value, dict): + tokens += count_function(str(field_value)) + except Exception as e: + if default_token_count is not None: + return default_token_count + raise ValueError(f"Error counting field '{field_name}': {e}") + return tokens + + def _count_content_list( count_function: TokenCounterFunction, content_list: OpenAIMessageContent, @@ -559,7 +698,7 @@ def _count_content_list( default_token_count: Optional[int], ) -> int: """ - Get the number of tokens from a list of content. + Recursively count tokens from a list of content blocks. 
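+    Handles plain strings as well as 'text', 'image_url', 'tool_use', 'tool_result',
+    and 'thinking' blocks.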
""" try: num_tokens = 0 @@ -567,42 +706,38 @@ def _count_content_list( if isinstance(c, str): num_tokens += count_function(c) elif c["type"] == "text": - num_tokens += count_function(c["text"]) + num_tokens += count_function(c.get("text", "")) elif c["type"] == "image_url": - if isinstance(c["image_url"], dict): - image_url_dict = c["image_url"] - detail = image_url_dict.get("detail", "auto") - if detail not in ["low", "high", "auto"]: - raise ValueError( - f"Invalid detail value: {detail}. Expected 'low', 'high', or 'auto'." - ) - url = image_url_dict.get("url") - num_tokens += calculate_img_tokens( - data=url, - mode=detail, # type: ignore - use_default_image_token_count=use_default_image_token_count, - ) - elif isinstance(c["image_url"], str): - image_url_str = c["image_url"] - num_tokens += calculate_img_tokens( - data=image_url_str, - mode="auto", - use_default_image_token_count=use_default_image_token_count, - ) - else: - raise ValueError( - f"Invalid image_url type: {type(c['image_url'])}. Expected str or dict." - ) + image_url = c.get("image_url") + num_tokens += _count_image_tokens( + image_url, use_default_image_token_count + ) + elif c["type"] in ("tool_use", "tool_result"): + num_tokens += _count_anthropic_content( + c, + count_function, + use_default_image_token_count, + default_token_count, + ) + elif c["type"] == "thinking": + # Claude extended thinking content block + # Count the thinking text and skip signature (opaque signature blob) + thinking_text = c.get("thinking", "") + if thinking_text: + num_tokens += count_function(thinking_text) else: raise ValueError( - f"Invalid content type: {type(c)}. Expected str or dict." + f"Invalid content item type: {type(c).__name__}. " + f"Expected str or dict with 'type' field. " + f"Value: {c!r}" ) return num_tokens except Exception as e: if default_token_count is not None: return default_token_count raise ValueError( - f"Error getting number of tokens from content list: {e}, default_token_count={default_token_count}" + f"Error getting number of tokens from content list: {e}, " + f"default_token_count={default_token_count}" ) diff --git a/litellm/llms/__init__.py b/litellm/llms/__init__.py index 44e1fb4eb933..c73f0b22b4b2 100644 --- a/litellm/llms/__init__.py +++ b/litellm/llms/__init__.py @@ -43,6 +43,10 @@ def get_cost_for_web_search_request( # Perplexity handles search costs internally in its own cost calculator # Return 0.0 to indicate costs are already accounted for return 0.0 + elif custom_llm_provider == "xai": + from .xai.cost_calculator import cost_per_web_search_request + + return cost_per_web_search_request(usage=usage, model_info=model_info) else: return None @@ -107,6 +111,21 @@ def discover_guardrail_translation_mappings() -> ( verbose_logger.error(f"Error processing {module_path}: {e}") continue + try: + from litellm.proxy._experimental.mcp_server.guardrail_translation import ( + guardrail_translation_mappings as mcp_guardrail_translation_mappings, + ) + + discovered_mappings.update(mcp_guardrail_translation_mappings) + verbose_logger.debug( + "Loaded MCP guardrail translation mappings: %s", + list(mcp_guardrail_translation_mappings.keys()), + ) + except ImportError: + verbose_logger.debug( + "MCP guardrail translation mappings not available; skipping" + ) + verbose_logger.debug( f"Discovered {len(discovered_mappings)} guardrail translation mappings: {list(discovered_mappings.keys())}" ) diff --git a/litellm/llms/aiml/image_generation/transformation.py b/litellm/llms/aiml/image_generation/transformation.py index 
006a2c16d7e7..d8f3e23fe7e7 100644 --- a/litellm/llms/aiml/image_generation/transformation.py +++ b/litellm/llms/aiml/image_generation/transformation.py @@ -97,6 +97,9 @@ def get_complete_url( ) complete_url = complete_url.rstrip("/") + # Strip /v1 suffix if present since IMAGE_GENERATION_ENDPOINT already includes v1 + if complete_url.endswith("/v1"): + complete_url = complete_url[:-3] complete_url = f"{complete_url}/{self.IMAGE_GENERATION_ENDPOINT}" return complete_url diff --git a/litellm/llms/amazon_nova/chat/transformation.py b/litellm/llms/amazon_nova/chat/transformation.py new file mode 100644 index 000000000000..6d321e298b80 --- /dev/null +++ b/litellm/llms/amazon_nova/chat/transformation.py @@ -0,0 +1,115 @@ +""" +Translate from OpenAI's `/v1/chat/completions` to Amazon Nova's `/v1/chat/completions` +""" +from typing import Any, List, Optional, Tuple + +import httpx + +import litellm +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.secret_managers.main import get_secret_str +from litellm.types.llms.openai import ( + AllMessageValues, +) +from litellm.types.utils import ModelResponse + +from ...openai_like.chat.transformation import OpenAILikeChatConfig + + +class AmazonNovaChatConfig(OpenAILikeChatConfig): + max_completion_tokens: Optional[int] = None + max_tokens: Optional[int] = None + metadata: Optional[int] = None + temperature: Optional[int] = None + top_p: Optional[int] = None + tools: Optional[list] = None + reasoning_effort: Optional[list] = None + + def __init__( + self, + max_completion_tokens: Optional[int] = None, + max_tokens: Optional[int] = None, + temperature: Optional[int] = None, + top_p: Optional[int] = None, + tools: Optional[list] = None, + reasoning_effort: Optional[list] = None, + ) -> None: + locals_ = locals().copy() + for key, value in locals_.items(): + if key != "self" and value is not None: + setattr(self.__class__, key, value) + + @property + def custom_llm_provider(self) -> Optional[str]: + return "amazon_nova" + + @classmethod + def get_config(cls): + return super().get_config() + + def _get_openai_compatible_provider_info( + self, api_base: Optional[str], api_key: Optional[str] + ) -> Tuple[Optional[str], Optional[str]]: + # Amazon Nova is openai compatible, we just need to set this to custom_openai and have the api_base be Nova's endpoint + api_base = ( + api_base + or get_secret_str("AMAZON_NOVA_API_BASE") + or "https://api.nova.amazon.com/v1" + ) # type: ignore + + # Get API key from multiple sources + key = ( + api_key + or litellm.amazon_nova_api_key + or get_secret_str("AMAZON_NOVA_API_KEY") + or litellm.api_key + ) + return api_base, key + + def get_supported_openai_params(self, model: str) -> List: + return [ + "top_p", + "temperature", + "max_tokens", + "max_completion_tokens", + "metadata", + "stop", + "stream", + "stream_options", + "tools", + "tool_choice", + "reasoning_effort" + ] + + def transform_response( + self, + model: str, + raw_response: httpx.Response, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> ModelResponse: + model_response = super().transform_response( + model=model, + model_response=model_response, + raw_response=raw_response, + messages=messages, + logging_obj=logging_obj, + request_data=request_data, + encoding=encoding, + 
optional_params=optional_params, + json_mode=json_mode, + litellm_params=litellm_params, + api_key=api_key, + ) + + # Storing amazon_nova in the model response for easier cost calculation later + setattr(model_response, "model", "amazon-nova/" + model) + + return model_response \ No newline at end of file diff --git a/litellm/llms/amazon_nova/cost_calculation.py b/litellm/llms/amazon_nova/cost_calculation.py new file mode 100644 index 000000000000..9d9cedde875d --- /dev/null +++ b/litellm/llms/amazon_nova/cost_calculation.py @@ -0,0 +1,21 @@ +""" +Helper util for handling amazon nova cost calculation +- e.g.: prompt caching +""" + +from typing import TYPE_CHECKING, Tuple + +from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token + +if TYPE_CHECKING: + from litellm.types.utils import Usage + + +def cost_per_token(model: str, usage: "Usage") -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. + Follows the same logic as Anthropic's cost per token calculation. + """ + return generic_cost_per_token( + model=model, usage=usage, custom_llm_provider="amazon_nova" + ) \ No newline at end of file diff --git a/litellm/llms/anthropic/batches/__init__.py b/litellm/llms/anthropic/batches/__init__.py new file mode 100644 index 000000000000..66d1a8f77f43 --- /dev/null +++ b/litellm/llms/anthropic/batches/__init__.py @@ -0,0 +1,5 @@ +from .handler import AnthropicBatchesHandler +from .transformation import AnthropicBatchesConfig + +__all__ = ["AnthropicBatchesHandler", "AnthropicBatchesConfig"] + diff --git a/litellm/llms/anthropic/batches/handler.py b/litellm/llms/anthropic/batches/handler.py new file mode 100644 index 000000000000..fd303e60afc5 --- /dev/null +++ b/litellm/llms/anthropic/batches/handler.py @@ -0,0 +1,168 @@ +""" +Anthropic Batches API Handler +""" + +import asyncio +from typing import TYPE_CHECKING, Any, Coroutine, Optional, Union + +import httpx + +from litellm.llms.custom_httpx.http_handler import ( + get_async_httpx_client, +) +from litellm.types.utils import LiteLLMBatch, LlmProviders + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + +from ..common_utils import AnthropicModelInfo +from .transformation import AnthropicBatchesConfig + + +class AnthropicBatchesHandler: + """ + Handler for Anthropic Message Batches API. + + Supports: + - retrieve_batch() - Retrieve batch status and information + """ + + def __init__(self): + self.anthropic_model_info = AnthropicModelInfo() + self.provider_config = AnthropicBatchesConfig() + + async def aretrieve_batch( + self, + batch_id: str, + api_base: Optional[str], + api_key: Optional[str], + timeout: Union[float, httpx.Timeout], + max_retries: Optional[int], + logging_obj: Optional[LiteLLMLoggingObj] = None, + ) -> LiteLLMBatch: + """ + Async: Retrieve a batch from Anthropic. 
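+        Calls the Anthropic Message Batches retrieve endpoint and maps the
+        result to OpenAI's batch format (LiteLLMBatch).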
+ + Args: + batch_id: The batch ID to retrieve + api_base: Anthropic API base URL + api_key: Anthropic API key + timeout: Request timeout + max_retries: Max retry attempts (unused for now) + logging_obj: Optional logging object + + Returns: + LiteLLMBatch: Batch information in OpenAI format + """ + # Resolve API credentials + api_base = api_base or self.anthropic_model_info.get_api_base(api_base) + api_key = api_key or self.anthropic_model_info.get_api_key() + + if not api_key: + raise ValueError("Missing Anthropic API Key") + + # Create a minimal logging object if not provided + if logging_obj is None: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObjClass + logging_obj = LiteLLMLoggingObjClass( + model="anthropic/unknown", + messages=[], + stream=False, + call_type="batch_retrieve", + start_time=None, + litellm_call_id=f"batch_retrieve_{batch_id}", + function_id="batch_retrieve", + ) + + # Get the complete URL for batch retrieval + retrieve_url = self.provider_config.get_retrieve_batch_url( + api_base=api_base, + batch_id=batch_id, + optional_params={}, + litellm_params={}, + ) + + # Validate environment and get headers + headers = self.provider_config.validate_environment( + headers={}, + model="", + messages=[], + optional_params={}, + litellm_params={}, + api_key=api_key, + api_base=api_base, + ) + + logging_obj.pre_call( + input=batch_id, + api_key=api_key, + additional_args={ + "api_base": retrieve_url, + "headers": headers, + "complete_input_dict": {}, + }, + ) + # Make the request + async_client = get_async_httpx_client(llm_provider=LlmProviders.ANTHROPIC) + response = await async_client.get( + url=retrieve_url, + headers=headers + ) + response.raise_for_status() + + # Transform response to LiteLLM format + return self.provider_config.transform_retrieve_batch_response( + model=None, + raw_response=response, + logging_obj=logging_obj, + litellm_params={}, + ) + + def retrieve_batch( + self, + _is_async: bool, + batch_id: str, + api_base: Optional[str], + api_key: Optional[str], + timeout: Union[float, httpx.Timeout], + max_retries: Optional[int], + logging_obj: Optional[LiteLLMLoggingObj] = None, + ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]: + """ + Retrieve a batch from Anthropic. 
+ + Args: + _is_async: Whether to run asynchronously + batch_id: The batch ID to retrieve + api_base: Anthropic API base URL + api_key: Anthropic API key + timeout: Request timeout + max_retries: Max retry attempts (unused for now) + logging_obj: Optional logging object + + Returns: + LiteLLMBatch or Coroutine: Batch information in OpenAI format + """ + if _is_async: + return self.aretrieve_batch( + batch_id=batch_id, + api_base=api_base, + api_key=api_key, + timeout=timeout, + max_retries=max_retries, + logging_obj=logging_obj, + ) + else: + return asyncio.run( + self.aretrieve_batch( + batch_id=batch_id, + api_base=api_base, + api_key=api_key, + timeout=timeout, + max_retries=max_retries, + logging_obj=logging_obj, + ) + ) + diff --git a/litellm/llms/anthropic/batches/transformation.py b/litellm/llms/anthropic/batches/transformation.py index c20136894bdf..750dd002ff9a 100644 --- a/litellm/llms/anthropic/batches/transformation.py +++ b/litellm/llms/anthropic/batches/transformation.py @@ -1,10 +1,14 @@ import json -from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast +import time +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union, cast -from httpx import Response +import httpx +from httpx import Headers, Response -from litellm.types.llms.openai import AllMessageValues -from litellm.types.utils import ModelResponse +from litellm.llms.base_llm.batches.transformation import BaseBatchesConfig +from litellm.llms.base_llm.chat.transformation import BaseLLMException +from litellm.types.llms.openai import AllMessageValues, CreateBatchRequest +from litellm.types.utils import LiteLLMBatch, LlmProviders, ModelResponse if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj @@ -14,11 +18,221 @@ LoggingClass = Any -class AnthropicBatchesConfig: +class AnthropicBatchesConfig(BaseBatchesConfig): def __init__(self): from ..chat.transformation import AnthropicConfig + from ..common_utils import AnthropicModelInfo self.anthropic_chat_config = AnthropicConfig() # initialize once + self.anthropic_model_info = AnthropicModelInfo() + + @property + def custom_llm_provider(self) -> LlmProviders: + """Return the LLM provider type for this configuration.""" + return LlmProviders.ANTHROPIC + + def validate_environment( + self, + headers: dict, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> dict: + """Validate and prepare environment-specific headers and parameters.""" + # Resolve api_key from environment if not provided + api_key = api_key or self.anthropic_model_info.get_api_key() + if api_key is None: + raise ValueError( + "Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params" + ) + _headers = { + "accept": "application/json", + "anthropic-version": "2023-06-01", + "content-type": "application/json", + "x-api-key": api_key, + } + # Add beta header for message batches + if "anthropic-beta" not in headers: + headers["anthropic-beta"] = "message-batches-2024-09-24" + headers.update(_headers) + return headers + + def get_complete_batch_url( + self, + api_base: Optional[str], + api_key: Optional[str], + model: str, + optional_params: Dict, + litellm_params: Dict, + data: CreateBatchRequest, + ) -> str: + """Get the complete URL for batch creation request.""" + api_base = api_base or self.anthropic_model_info.get_api_base(api_base) + if 
not api_base.endswith("/v1/messages/batches"): + api_base = f"{api_base.rstrip('/')}/v1/messages/batches" + return api_base + + def transform_create_batch_request( + self, + model: str, + create_batch_data: CreateBatchRequest, + optional_params: dict, + litellm_params: dict, + ) -> Union[bytes, str, Dict[str, Any]]: + """ + Transform the batch creation request to Anthropic format. + + Not currently implemented - placeholder to satisfy abstract base class. + """ + raise NotImplementedError("Batch creation not yet implemented for Anthropic") + + def transform_create_batch_response( + self, + model: Optional[str], + raw_response: httpx.Response, + logging_obj: LoggingClass, + litellm_params: dict, + ) -> LiteLLMBatch: + """ + Transform Anthropic MessageBatch creation response to LiteLLM format. + + Not currently implemented - placeholder to satisfy abstract base class. + """ + raise NotImplementedError("Batch creation not yet implemented for Anthropic") + + def get_retrieve_batch_url( + self, + api_base: Optional[str], + batch_id: str, + optional_params: Dict, + litellm_params: Dict, + ) -> str: + """ + Get the complete URL for batch retrieval request. + + Args: + api_base: Base API URL (optional, will use default if not provided) + batch_id: Batch ID to retrieve + optional_params: Optional parameters + litellm_params: LiteLLM parameters + + Returns: + Complete URL for Anthropic batch retrieval: {api_base}/v1/messages/batches/{batch_id} + """ + api_base = api_base or self.anthropic_model_info.get_api_base(api_base) + return f"{api_base.rstrip('/')}/v1/messages/batches/{batch_id}" + + def transform_retrieve_batch_request( + self, + batch_id: str, + optional_params: dict, + litellm_params: dict, + ) -> Union[bytes, str, Dict[str, Any]]: + """ + Transform batch retrieval request for Anthropic. + + For Anthropic, the URL is constructed by get_retrieve_batch_url(), + so this method returns an empty dict (no additional request params needed). 
+ """ + # No additional request params needed - URL is handled by get_retrieve_batch_url + return {} + + def transform_retrieve_batch_response( + self, + model: Optional[str], + raw_response: httpx.Response, + logging_obj: LoggingClass, + litellm_params: dict, + ) -> LiteLLMBatch: + """Transform Anthropic MessageBatch retrieval response to LiteLLM format.""" + try: + response_data = raw_response.json() + except Exception as e: + raise ValueError(f"Failed to parse Anthropic batch response: {e}") + + # Map Anthropic MessageBatch to OpenAI Batch format + batch_id = response_data.get("id", "") + processing_status = response_data.get("processing_status", "in_progress") + + # Map Anthropic processing_status to OpenAI status + status_mapping: Dict[str, Literal["validating", "failed", "in_progress", "finalizing", "completed", "expired", "cancelling", "cancelled"]] = { + "in_progress": "in_progress", + "canceling": "cancelling", + "ended": "completed", + } + openai_status = status_mapping.get(processing_status, "in_progress") + + # Parse timestamps + def parse_timestamp(ts_str: Optional[str]) -> Optional[int]: + if not ts_str: + return None + try: + from datetime import datetime + dt = datetime.fromisoformat(ts_str.replace('Z', '+00:00')) + return int(dt.timestamp()) + except Exception: + return None + + created_at = parse_timestamp(response_data.get("created_at")) + ended_at = parse_timestamp(response_data.get("ended_at")) + expires_at = parse_timestamp(response_data.get("expires_at")) + cancel_initiated_at = parse_timestamp(response_data.get("cancel_initiated_at")) + archived_at = parse_timestamp(response_data.get("archived_at")) + + # Extract request counts + request_counts_data = response_data.get("request_counts", {}) + from openai.types.batch import BatchRequestCounts + request_counts = BatchRequestCounts( + total=sum([ + request_counts_data.get("processing", 0), + request_counts_data.get("succeeded", 0), + request_counts_data.get("errored", 0), + request_counts_data.get("canceled", 0), + request_counts_data.get("expired", 0), + ]), + completed=request_counts_data.get("succeeded", 0), + failed=request_counts_data.get("errored", 0), + ) + + return LiteLLMBatch( + id=batch_id, + object="batch", + endpoint="/v1/messages", + errors=None, + input_file_id="None", + completion_window="24h", + status=openai_status, + output_file_id=batch_id, + error_file_id=None, + created_at=created_at or int(time.time()), + in_progress_at=created_at if processing_status == "in_progress" else None, + expires_at=expires_at, + finalizing_at=None, + completed_at=ended_at if processing_status == "ended" else None, + failed_at=None, + expired_at=archived_at if archived_at else None, + cancelling_at=cancel_initiated_at if processing_status == "canceling" else None, + cancelled_at=ended_at if processing_status == "canceling" and ended_at else None, + request_counts=request_counts, + metadata={}, + ) + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[Dict, Headers] + ) -> "BaseLLMException": + """Get the appropriate error class for Anthropic.""" + from ..common_utils import AnthropicError + + # Convert Dict to Headers if needed + if isinstance(headers, dict): + headers_obj: Optional[Headers] = Headers(headers) + else: + headers_obj = headers if isinstance(headers, Headers) else None + + return AnthropicError(status_code=status_code, message=error_message, headers=headers_obj) def transform_response( self, diff --git a/litellm/llms/anthropic/chat/guardrail_translation/handler.py 
b/litellm/llms/anthropic/chat/guardrail_translation/handler.py index 06a1b92e1b00..71d74121a300 100644 --- a/litellm/llms/anthropic/chat/guardrail_translation/handler.py +++ b/litellm/llms/anthropic/chat/guardrail_translation/handler.py @@ -12,17 +12,37 @@ 4. Apply guardrail responses back to the original structure """ -import asyncio -from typing import TYPE_CHECKING, Any, Coroutine, Dict, List, Optional, Tuple, cast +import json +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, cast from litellm._logging import verbose_proxy_logger +from litellm.llms.anthropic.chat.transformation import AnthropicConfig +from litellm.llms.anthropic.experimental_pass_through.adapters.transformation import ( + LiteLLMAnthropicMessagesAdapter, +) from litellm.llms.base_llm.guardrail_translation.base_translation import BaseTranslation +from litellm.proxy.pass_through_endpoints.llm_provider_handlers.anthropic_passthrough_logging_handler import ( + AnthropicPassthroughLoggingHandler, +) +from litellm.types.llms.anthropic import ( + AllAnthropicToolsValues, + AnthropicMessagesRequest, +) +from litellm.types.llms.openai import ( + ChatCompletionToolCallChunk, + ChatCompletionToolParam, +) +from litellm.types.utils import ( + ChatCompletionMessageToolCall, + GenericGuardrailAPIInputs, + ModelResponse, +) if TYPE_CHECKING: from litellm.integrations.custom_guardrail import CustomGuardrail + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.types.llms.anthropic_messages.anthropic_response import ( AnthropicMessagesResponse, - AnthropicResponseTextBlock, ) @@ -37,10 +57,15 @@ class AnthropicMessagesHandler(BaseTranslation): Methods can be overridden to customize behavior for different message formats. """ + def __init__(self): + super().__init__() + self.adapter = LiteLLMAnthropicMessagesAdapter() + async def process_input_messages( self, data: dict, guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional[Any] = None, ) -> Any: """ Process input messages by applying guardrails to text content. 
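The hunk that follows replaces the old one-guardrail-task-per-text approach with a single batched apply_guardrail call: texts (plus any images and tools) are collected together with an index mapping, checked in one request, and the returned texts are written back by position. Below is a minimal standalone sketch of that collect/apply/remap pattern under simplified assumptions; the RedactingGuardrail stub and the message shapes are illustrative only and are not litellm's CustomGuardrail API.

import asyncio
from typing import Any, Dict, List, Optional, Tuple


class RedactingGuardrail:
    """Illustrative stand-in for a guardrail that checks a batch of texts at once."""

    async def apply_guardrail(self, texts: List[str]) -> List[str]:
        # Pretend "secret" is disallowed content and redact it.
        return [t.replace("secret", "*****") for t in texts]


async def guard_messages(messages: List[Dict[str, Any]], guardrail: RedactingGuardrail) -> None:
    texts: List[str] = []
    mappings: List[Tuple[int, Optional[int]]] = []  # (message_idx, content_idx or None for string content)

    # Step 1: collect every text segment and remember where it came from
    for msg_idx, message in enumerate(messages):
        content = message.get("content")
        if isinstance(content, str):
            texts.append(content)
            mappings.append((msg_idx, None))
        elif isinstance(content, list):
            for content_idx, item in enumerate(content):
                if isinstance(item, dict) and item.get("text") is not None:
                    texts.append(item["text"])
                    mappings.append((msg_idx, content_idx))

    if not texts:
        return

    # Step 2: one batched guardrail call instead of one task per text
    guarded = await guardrail.apply_guardrail(texts)

    # Step 3: write the guardrailed texts back into the original structure
    for (msg_idx, content_idx), new_text in zip(mappings, guarded):
        if content_idx is None:
            messages[msg_idx]["content"] = new_text
        else:
            messages[msg_idx]["content"][content_idx]["text"] = new_text


msgs = [{"role": "user", "content": "my secret token"}]
asyncio.run(guard_messages(msgs, RedactingGuardrail()))
print(msgs)  # [{'role': 'user', 'content': 'my ***** token'}]

The same index-mapping idea is what lets the handler below support both plain string content and multimodal list content without losing track of which guardrailed text belongs to which block.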
@@ -49,30 +74,61 @@ async def process_input_messages( if messages is None: return data - tasks: List[Coroutine[Any, Any, str]] = [] + chat_completion_compatible_request = ( + LiteLLMAnthropicMessagesAdapter().translate_anthropic_to_openai( + anthropic_message_request=cast(AnthropicMessagesRequest, data) + ) + ) + + structured_messages = chat_completion_compatible_request.get("messages", []) + + texts_to_check: List[str] = [] + images_to_check: List[str] = [] + tools_to_check: List[ChatCompletionToolParam] = ( + chat_completion_compatible_request.get("tools", []) + ) task_mappings: List[Tuple[int, Optional[int]]] = [] - # Track (message_index, content_index) for each task + # Track (message_index, content_index) for each text # content_index is None for string content, int for list content - # Step 1: Extract all text content and create guardrail tasks + # Step 1: Extract all text content and images for msg_idx, message in enumerate(messages): - await self._extract_input_text_and_create_tasks( + self._extract_input_text_and_images( message=message, msg_idx=msg_idx, - tasks=tasks, + texts_to_check=texts_to_check, + images_to_check=images_to_check, task_mappings=task_mappings, - guardrail_to_apply=guardrail_to_apply, ) - # Step 2: Run all guardrail tasks in parallel - responses = await asyncio.gather(*tasks) + # Step 2: Apply guardrail to all texts in batch + if texts_to_check: + inputs = GenericGuardrailAPIInputs(texts=texts_to_check) + if images_to_check: + inputs["images"] = images_to_check + if tools_to_check: + inputs["tools"] = tools_to_check + if structured_messages: + inputs["structured_messages"] = structured_messages + # Include model information if available + model = data.get("model") + if model: + inputs["model"] = model + guardrailed_inputs = await guardrail_to_apply.apply_guardrail( + inputs=inputs, + request_data=data, + input_type="request", + logging_obj=litellm_logging_obj, + ) - # Step 3: Map guardrail responses back to original message structure - await self._apply_guardrail_responses_to_input( - messages=messages, - responses=responses, - task_mappings=task_mappings, - ) + guardrailed_texts = guardrailed_inputs.get("texts", []) + + # Step 3: Map guardrail responses back to original message structure + await self._apply_guardrail_responses_to_input( + messages=messages, + responses=guardrailed_texts, + task_mappings=task_mappings, + ) verbose_proxy_logger.debug( "Anthropic Messages: Processed input messages: %s", messages @@ -80,36 +136,63 @@ async def process_input_messages( return data - async def _extract_input_text_and_create_tasks( + def _extract_input_text_and_images( self, message: Dict[str, Any], msg_idx: int, - tasks: List, + texts_to_check: List[str], + images_to_check: List[str], task_mappings: List[Tuple[int, Optional[int]]], - guardrail_to_apply: "CustomGuardrail", ) -> None: """ - Extract text content from a message and create guardrail tasks. + Extract text content and images from a message. - Override this method to customize text extraction logic. + Override this method to customize text/image extraction logic. 
""" content = message.get("content", None) - if content is None: + tools = message.get("tools", None) + if content is None and tools is None: return - if isinstance(content, str): + ## CHECK FOR TEXT + IMAGES + if content is not None and isinstance(content, str): # Simple string content - tasks.append(guardrail_to_apply.apply_guardrail(text=content)) + texts_to_check.append(content) task_mappings.append((msg_idx, None)) - elif isinstance(content, list): + elif content is not None and isinstance(content, list): # List content (e.g., multimodal with text and images) for content_idx, content_item in enumerate(content): + # Extract text text_str = content_item.get("text", None) - if text_str is None: - continue - tasks.append(guardrail_to_apply.apply_guardrail(text=text_str)) - task_mappings.append((msg_idx, int(content_idx))) + if text_str is not None: + texts_to_check.append(text_str) + task_mappings.append((msg_idx, int(content_idx))) + + # Extract images + if content_item.get("type") == "image": + source = content_item.get("source", {}) + if isinstance(source, dict): + # Could be base64 or url + data = source.get("data") + if data: + images_to_check.append(data) + + def _extract_input_tools( + self, + tools: List[Dict[str, Any]], + tools_to_check: List[ChatCompletionToolParam], + ) -> None: + """ + Extract tools from a message. + """ + ## CHECK FOR TOOLS + if tools is not None and isinstance(tools, list): + # TRANSFORM ANTHROPIC TOOLS TO OPENAI TOOLS + openai_tools = self.adapter.translate_anthropic_tools_to_openai( + tools=cast(List[AllAnthropicToolsValues], tools) + ) + tools_to_check.extend(openai_tools) async def _apply_guardrail_responses_to_input( self, @@ -145,56 +228,115 @@ async def process_output_response( self, response: "AnthropicMessagesResponse", guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional[Any] = None, + user_api_key_dict: Optional[Any] = None, ) -> Any: """ - Process output response by applying guardrails to text content. + Process output response by applying guardrails to text content and tool calls. Args: response: Anthropic MessagesResponse object guardrail_to_apply: The guardrail instance to apply + litellm_logging_obj: Optional logging object + user_api_key_dict: User API key metadata to pass to guardrails Returns: Modified response with guardrail applied to content Response Format Support: - - List content: response.content = [{"type": "text", "text": "text here"}, ...] + - List content: response.content = [ + {"type": "text", "text": "text here"}, + {"type": "tool_use", "id": "...", "name": "...", "input": {...}}, + ... 
+ ] """ - # Step 0: Check if response has any text content to process - if not self._has_text_content(response): - verbose_proxy_logger.warning( - "Anthropic Messages: No text content in response, skipping guardrail" - ) - return response - - tasks: List[Coroutine[Any, Any, str]] = [] + texts_to_check: List[str] = [] + images_to_check: List[str] = [] + tool_calls_to_check: List[ChatCompletionToolCallChunk] = [] task_mappings: List[Tuple[int, Optional[int]]] = [] - # Track (choice_index, content_index) for each task + # Track (content_index, None) for each text + + # Handle both dict and object responses + response_content: List[Any] = [] + if isinstance(response, dict): + response_content = response.get("content", []) or [] + elif hasattr(response, "content"): + content = getattr(response, "content", None) + response_content = content or [] + else: + response_content = [] - response_content = response.get("content", []) if not response_content: return response - # Step 1: Extract all text content from response choices + + # Step 1: Extract all text content and tool calls from response for content_idx, content_block in enumerate(response_content): - # Check if this is a text block by checking the 'type' field - if isinstance(content_block, dict) and content_block.get("type") == "text": - # Cast to dict to handle the union type properly - await self._extract_output_text_and_create_tasks( - content_block=cast(Dict[str, Any], content_block), + # Handle both dict and Pydantic object content blocks + block_dict: Dict[str, Any] = {} + if isinstance(content_block, dict): + block_type = content_block.get("type") + block_dict = cast(Dict[str, Any], content_block) + elif hasattr(content_block, "type"): + block_type = getattr(content_block, "type", None) + # Convert Pydantic object to dict for processing + if hasattr(content_block, "model_dump"): + block_dict = content_block.model_dump() + else: + block_dict = {"type": block_type, "text": getattr(content_block, "text", None)} + else: + continue + + if block_type in ["text", "tool_use"]: + self._extract_output_text_and_images( + content_block=block_dict, content_idx=content_idx, - tasks=tasks, + texts_to_check=texts_to_check, + images_to_check=images_to_check, task_mappings=task_mappings, - guardrail_to_apply=guardrail_to_apply, + tool_calls_to_check=tool_calls_to_check, ) - # Step 2: Run all guardrail tasks in parallel - responses = await asyncio.gather(*tasks) + # Step 2: Apply guardrail to all texts in batch + if texts_to_check or tool_calls_to_check: + # Create a request_data dict with response info and user API key metadata + request_data: dict = {"response": response} - # Step 3: Map guardrail responses back to original response structure - await self._apply_guardrail_responses_to_output( - response=response, - responses=responses, - task_mappings=task_mappings, - ) + # Add user API key metadata with prefixed keys + user_metadata = self.transform_user_api_key_dict_to_metadata( + user_api_key_dict + ) + if user_metadata: + request_data["litellm_metadata"] = user_metadata + + inputs = GenericGuardrailAPIInputs(texts=texts_to_check) + if images_to_check: + inputs["images"] = images_to_check + if tool_calls_to_check: + inputs["tool_calls"] = tool_calls_to_check + # Include model information from the response if available + response_model = None + if isinstance(response, dict): + response_model = response.get("model") + elif hasattr(response, "model"): + response_model = getattr(response, "model", None) + if response_model: + inputs["model"] = 
response_model + + guardrailed_inputs = await guardrail_to_apply.apply_guardrail( + inputs=inputs, + request_data=request_data, + input_type="response", + logging_obj=litellm_logging_obj, + ) + + guardrailed_texts = guardrailed_inputs.get("texts", []) + + # Step 3: Map guardrail responses back to original response structure + await self._apply_guardrail_responses_to_output( + response=response, + responses=guardrailed_texts, + task_mappings=task_mappings, + ) verbose_proxy_logger.debug( "Anthropic Messages: Processed output response: %s", response @@ -202,13 +344,227 @@ async def process_output_response( return response + async def process_output_streaming_response( + self, + responses_so_far: List[Any], + guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional[Any] = None, + user_api_key_dict: Optional[Any] = None, + ) -> List[Any]: + """ + Process output streaming response by applying guardrails to text content. + + Get the string so far, check the apply guardrail to the string so far, and return the list of responses so far. + """ + has_ended = self._check_streaming_has_ended(responses_so_far) + if has_ended: + + # build the model response from the responses_so_far + model_response = cast( + ModelResponse, + AnthropicPassthroughLoggingHandler._build_complete_streaming_response( + all_chunks=responses_so_far, + litellm_logging_obj=cast("LiteLLMLoggingObj", litellm_logging_obj), + model="", + ), + ) + tool_calls_list = cast(Optional[List[ChatCompletionMessageToolCall]], model_response.choices[0].message.tool_calls) # type: ignore + string_so_far = model_response.choices[0].message.content # type: ignore + guardrail_inputs = GenericGuardrailAPIInputs() + if string_so_far: + guardrail_inputs["texts"] = [string_so_far] + if tool_calls_list: + guardrail_inputs["tool_calls"] = tool_calls_list + + _guardrailed_inputs = await guardrail_to_apply.apply_guardrail( # allow rejecting the response, if invalid + inputs=guardrail_inputs, + request_data={}, + input_type="response", + logging_obj=litellm_logging_obj, + ) + return responses_so_far + + string_so_far = self.get_streaming_string_so_far(responses_so_far) + _guardrailed_inputs = await guardrail_to_apply.apply_guardrail( # allow rejecting the response, if invalid + inputs={"texts": [string_so_far]}, + request_data={}, + input_type="response", + logging_obj=litellm_logging_obj, + ) + return responses_so_far + + def get_streaming_string_so_far(self, responses_so_far: List[Any]) -> str: + """ + Parse streaming responses and extract accumulated text content. + + Handles two formats: + 1. Raw bytes in SSE (Server-Sent Events) format from Anthropic API + 2. 
Parsed dict objects (for backwards compatibility) + + SSE format example: + b'event: content_block_delta\\ndata: {"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":" curious"}}\\n\\n' + + Dict format example: + { + "type": "content_block_delta", + "index": 0, + "delta": { + "type": "text_delta", + "text": " curious" + } + } + """ + text_so_far = "" + for response in responses_so_far: + # Handle raw bytes in SSE format + if isinstance(response, bytes): + text_so_far += self._extract_text_from_sse(response) + # Handle already-parsed dict format + elif isinstance(response, dict): + delta = response.get("delta") if response.get("delta") else None + if delta and delta.get("type") == "text_delta": + text = delta.get("text", "") + if text: + text_so_far += text + return text_so_far + + def _extract_text_from_sse(self, sse_bytes: bytes) -> str: + """ + Extract text content from Server-Sent Events (SSE) format. + + Args: + sse_bytes: Raw bytes in SSE format + + Returns: + Accumulated text from all content_block_delta events + """ + text = "" + try: + # Decode bytes to string + sse_string = sse_bytes.decode("utf-8") + + # Split by double newline to get individual events + events = sse_string.split("\n\n") + + for event in events: + if not event.strip(): + continue + + # Parse event lines + lines = event.strip().split("\n") + event_type = None + data_line = None + + for line in lines: + if line.startswith("event:"): + event_type = line[6:].strip() + elif line.startswith("data:"): + data_line = line[5:].strip() + + # Only process content_block_delta events + if event_type == "content_block_delta" and data_line: + try: + data = json.loads(data_line) + delta = data.get("delta", {}) + if delta.get("type") == "text_delta": + text += delta.get("text", "") + except json.JSONDecodeError: + verbose_proxy_logger.warning( + f"Failed to parse JSON from SSE data: {data_line}" + ) + + except Exception as e: + verbose_proxy_logger.error(f"Error extracting text from SSE: {e}") + + return text + + def _check_streaming_has_ended(self, responses_so_far: List[Any]) -> bool: + """ + Check if streaming response has ended by looking for non-null stop_reason. + + Handles two formats: + 1. Raw bytes in SSE (Server-Sent Events) format from Anthropic API + 2. 
Parsed dict objects (for backwards compatibility) + + SSE format example: + b'event: message_delta\\ndata: {"type":"message_delta","delta":{"stop_reason":"tool_use","stop_sequence":null},...}\\n\\n' + + Dict format example: + { + "type": "message_delta", + "delta": { + "stop_reason": "tool_use", + "stop_sequence": null + } + } + + Returns: + True if stop_reason is set to a non-null value, indicating stream has ended + """ + for response in responses_so_far: + # Handle raw bytes in SSE format + if isinstance(response, bytes): + try: + # Decode bytes to string + sse_string = response.decode("utf-8") + + # Split by double newline to get individual events + events = sse_string.split("\n\n") + + for event in events: + if not event.strip(): + continue + + # Parse event lines + lines = event.strip().split("\n") + event_type = None + data_line = None + + for line in lines: + if line.startswith("event:"): + event_type = line[6:].strip() + elif line.startswith("data:"): + data_line = line[5:].strip() + + # Check for message_delta event with stop_reason + if event_type == "message_delta" and data_line: + try: + data = json.loads(data_line) + delta = data.get("delta", {}) + stop_reason = delta.get("stop_reason") + if stop_reason is not None: + return True + except json.JSONDecodeError: + verbose_proxy_logger.warning( + f"Failed to parse JSON from SSE data: {data_line}" + ) + + except Exception as e: + verbose_proxy_logger.error( + f"Error checking streaming end in SSE: {e}" + ) + + # Handle already-parsed dict format + elif isinstance(response, dict): + if response.get("type") == "message_delta": + delta = response.get("delta", {}) + stop_reason = delta.get("stop_reason") + if stop_reason is not None: + return True + + return False + def _has_text_content(self, response: "AnthropicMessagesResponse") -> bool: """ Check if response has any text content to process. Override this method to customize text content detection. """ - response_content = response.get("content", []) + if isinstance(response, dict): + response_content = response.get("content", []) + else: + response_content = getattr(response, "content", None) or [] + if not response_content: return False for content_block in response_content: @@ -219,24 +575,39 @@ def _has_text_content(self, response: "AnthropicMessagesResponse") -> bool: return True return False - async def _extract_output_text_and_create_tasks( + def _extract_output_text_and_images( self, content_block: Dict[str, Any], content_idx: int, - tasks: List, + texts_to_check: List[str], + images_to_check: List[str], task_mappings: List[Tuple[int, Optional[int]]], - guardrail_to_apply: "CustomGuardrail", + tool_calls_to_check: Optional[List[ChatCompletionToolCallChunk]] = None, ) -> None: """ - Extract text content from a response choice and create guardrail tasks. + Extract text content, images, and tool calls from a response content block. - Override this method to customize text extraction logic. + Override this method to customize text/image/tool extraction logic. 
""" - content_text = content_block.get("text") - if content_text and isinstance(content_text, str): - # Simple string content - tasks.append(guardrail_to_apply.apply_guardrail(text=content_text)) - task_mappings.append((content_idx, None)) + content_type = content_block.get("type") + + # Extract text content + if content_type == "text": + content_text = content_block.get("text") + if content_text and isinstance(content_text, str): + # Simple string content + texts_to_check.append(content_text) + task_mappings.append((content_idx, None)) + + # Extract tool calls + elif content_type == "tool_use": + tool_call = AnthropicConfig.convert_tool_use_to_openai_format( + anthropic_tool_content=content_block, + index=content_idx, + ) + if tool_calls_to_check is None: + tool_calls_to_check = [] + tool_calls_to_check.append(tool_call) async def _apply_guardrail_responses_to_output( self, @@ -253,7 +624,16 @@ async def _apply_guardrail_responses_to_output( mapping = task_mappings[task_idx] content_idx = cast(int, mapping[0]) - response_content = response.get("content", []) + # Handle both dict and object responses + response_content: List[Any] = [] + if isinstance(response, dict): + response_content = response.get("content", []) or [] + elif hasattr(response, "content"): + content = getattr(response, "content", None) + response_content = content or [] + else: + continue + if not response_content: continue @@ -264,7 +644,11 @@ async def _apply_guardrail_responses_to_output( content_block = response_content[content_idx] # Verify it's a text block and update the text field - if isinstance(content_block, dict) and content_block.get("type") == "text": - # Cast to dict to handle the union type properly for assignment - content_block = cast("AnthropicResponseTextBlock", content_block) - content_block["text"] = guardrail_response + # Handle both dict and Pydantic object content blocks + if isinstance(content_block, dict): + if content_block.get("type") == "text": + cast(Dict[str, Any], content_block)["text"] = guardrail_response + elif hasattr(content_block, "type") and getattr(content_block, "type", None) == "text": + # Update Pydantic object's text attribute + if hasattr(content_block, "text"): + content_block.text = guardrail_response diff --git a/litellm/llms/anthropic/chat/handler.py b/litellm/llms/anthropic/chat/handler.py index b7b39f103954..6a9aafd076bd 100644 --- a/litellm/llms/anthropic/chat/handler.py +++ b/litellm/llms/anthropic/chat/handler.py @@ -10,6 +10,7 @@ Callable, Dict, List, + Literal, Optional, Tuple, Union, @@ -42,6 +43,7 @@ ChatCompletionRedactedThinkingBlock, ChatCompletionThinkingBlock, ChatCompletionToolCallChunk, + ChatCompletionToolCallFunctionChunk, ) from litellm.types.utils import ( Delta, @@ -315,6 +317,7 @@ def completion( stream = optional_params.pop("stream", None) json_mode: bool = optional_params.pop("json_mode", False) is_vertex_request: bool = optional_params.pop("is_vertex_request", False) + optional_params.pop("vertex_count_tokens_location", None) _is_function_call = False messages = copy.deepcopy(messages) headers = AnthropicConfig().validate_environment( @@ -338,7 +341,7 @@ def completion( data = config.transform_request( model=model, messages=messages, - optional_params=optional_params, + optional_params={**optional_params, "is_vertex_request": is_vertex_request}, litellm_params=litellm_params, headers=headers, ) @@ -435,9 +438,7 @@ def completion( else: if client is None or not isinstance(client, HTTPHandler): - client = _get_httpx_client( - params={"timeout": 
timeout} - ) + client = _get_httpx_client(params={"timeout": timeout}) else: client = client @@ -499,6 +500,19 @@ def __init__( # Track if we've converted any response_format tools (affects finish_reason) self.converted_response_format_tool: bool = False + # For handling partial JSON chunks from fragmentation + # See: https://github.com/BerriAI/litellm/issues/17473 + self.accumulated_json: str = "" + self.chunk_type: Literal["valid_json", "accumulated_json"] = "valid_json" + + # Track current content block type to avoid emitting tool calls for non-tool blocks + # See: https://github.com/BerriAI/litellm/issues/17254 + self.current_content_block_type: Optional[str] = None + + # Accumulate web_search_tool_result blocks for multi-turn reconstruction + # See: https://github.com/BerriAI/litellm/issues/17737 + self.web_search_results: List[Dict[str, Any]] = [] + def check_empty_tool_call_args(self) -> bool: """ Check if the tool call block so far has been an empty string @@ -527,9 +541,7 @@ def _handle_usage(self, anthropic_usage_chunk: Union[dict, UsageDelta]) -> Usage usage_object=cast(dict, anthropic_usage_chunk), reasoning_content=None ) - def _content_block_delta_helper( - self, chunk: dict - ) -> Tuple[ + def _content_block_delta_helper(self, chunk: dict) -> Tuple[ str, Optional[ChatCompletionToolCallChunk], List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]], @@ -550,15 +562,22 @@ def _content_block_delta_helper( if "text" in content_block["delta"]: text = content_block["delta"]["text"] elif "partial_json" in content_block["delta"]: - tool_use = { - "id": None, - "type": "function", - "function": { - "name": None, - "arguments": content_block["delta"]["partial_json"], - }, - "index": self.tool_index, - } + # Only emit tool calls if we're in a tool_use or server_tool_use block + # web_search_tool_result blocks also have input_json_delta but should not be treated as tool calls + # See: https://github.com/BerriAI/litellm/issues/17254 + if self.current_content_block_type in ("tool_use", "server_tool_use"): + tool_use = cast( + ChatCompletionToolCallChunk, + { + "id": None, + "type": "function", + "function": { + "name": None, + "arguments": content_block["delta"]["partial_json"], + }, + "index": self.tool_index, + }, + ) elif "citation" in content_block["delta"]: provider_specific_fields["citation"] = content_block["delta"]["citation"] elif ( @@ -569,7 +588,7 @@ def _content_block_delta_helper( ChatCompletionThinkingBlock( type="thinking", thinking=content_block["delta"].get("thinking") or "", - signature=content_block["delta"].get("signature"), + signature=str(content_block["delta"].get("signature") or ""), ) ] provider_specific_fields["thinking_blocks"] = thinking_blocks @@ -625,7 +644,7 @@ def get_content_block_start(self, chunk: dict) -> ContentBlockStart: return content_block_start - def chunk_parser(self, chunk: dict) -> ModelResponseStream: + def chunk_parser(self, chunk: dict) -> ModelResponseStream: # noqa: PLR0915 try: type_chunk = chunk.get("type", "") or "" @@ -668,19 +687,29 @@ def chunk_parser(self, chunk: dict) -> ModelResponseStream: content_block_start = self.get_content_block_start(chunk=chunk) self.content_blocks = [] # reset content blocks when new block starts + # Track current content block type for filtering deltas + self.current_content_block_type = content_block_start["content_block"]["type"] if content_block_start["content_block"]["type"] == "text": text = content_block_start["content_block"]["text"] - elif 
content_block_start["content_block"]["type"] == "tool_use": + elif content_block_start["content_block"]["type"] == "tool_use" or content_block_start["content_block"]["type"] == "server_tool_use": self.tool_index += 1 - tool_use = { - "id": content_block_start["content_block"]["id"], - "type": "function", - "function": { - "name": content_block_start["content_block"]["name"], - "arguments": "", - }, - "index": self.tool_index, - } + # Use empty string for arguments in content_block_start - actual arguments + # come in subsequent content_block_delta chunks and get accumulated. + # Using str(input) here would prepend '{}' causing invalid JSON accumulation. + tool_use = ChatCompletionToolCallChunk( + id=content_block_start["content_block"]["id"], + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=content_block_start["content_block"]["name"], + arguments="", + ), + index=self.tool_index, + ) + # Include caller information if present (for programmatic tool calling) + if "caller" in content_block_start["content_block"]: + caller_data = content_block_start["content_block"]["caller"] + if caller_data: + tool_use["caller"] = cast(Dict[str, Any], caller_data) # type: ignore[typeddict-item] elif ( content_block_start["content_block"]["type"] == "redacted_thinking" ): @@ -691,24 +720,67 @@ def chunk_parser(self, chunk: dict) -> ModelResponseStream: content_block_start=content_block_start, provider_specific_fields=provider_specific_fields, ) + + elif content_block_start["content_block"]["type"].endswith("_tool_result"): + # Handle all tool result types (web_search, bash_code_execution, text_editor, etc.) + content_type = content_block_start["content_block"]["type"] + + # Special handling for web_search_tool_result for backwards compatibility + if content_type == "web_search_tool_result": + # Capture web_search_tool_result for multi-turn reconstruction + # The full content comes in content_block_start, not in deltas + # See: https://github.com/BerriAI/litellm/issues/17737 + self.web_search_results.append( + content_block_start["content_block"] + ) + provider_specific_fields["web_search_results"] = ( + self.web_search_results + ) + elif content_type == "web_fetch_tool_result": + # Capture web_fetch_tool_result for multi-turn reconstruction + # The full content comes in content_block_start, not in deltas + # Fixes: https://github.com/BerriAI/litellm/issues/18137 + self.web_search_results.append( + content_block_start["content_block"] + ) + provider_specific_fields["web_search_results"] = ( + self.web_search_results + ) + elif content_type != "tool_search_tool_result": + # Handle other tool results (code execution, etc.) 
+ # Skip tool_search_tool_result as it's internal metadata + if not hasattr(self, "tool_results"): + self.tool_results = [] + self.tool_results.append(content_block_start["content_block"]) + provider_specific_fields["tool_results"] = self.tool_results + elif type_chunk == "content_block_stop": ContentBlockStop(**chunk) # type: ignore - # check if tool call content block - is_empty = self.check_empty_tool_call_args() - if is_empty: - tool_use = { - "id": None, - "type": "function", - "function": { - "name": None, - "arguments": "{}", - }, - "index": self.tool_index, - } + # check if tool call content block - only for tool_use and server_tool_use blocks + if self.current_content_block_type in ("tool_use", "server_tool_use"): + is_empty = self.check_empty_tool_call_args() + if is_empty: + tool_use = ChatCompletionToolCallChunk( + id=None, # type: ignore[typeddict-item] + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=None, # type: ignore[typeddict-item] + arguments="{}", + ), + index=self.tool_index, + ) # Reset response_format tool tracking when block stops self.is_response_format_tool = False + # Reset current content block type + self.current_content_block_type = None + elif type_chunk == "tool_result": + # Handle tool_result blocks (for tool search results with tool_reference) + # These are automatically handled by Anthropic API, we just pass them through + pass elif type_chunk == "message_delta": - finish_reason, usage = self._handle_message_delta(chunk) + finish_reason, usage, container = self._handle_message_delta(chunk) + if container: + provider_specific_fields["container"] = container elif type_chunk == "message_start": """ Anthropic @@ -824,15 +896,15 @@ def _handle_json_mode_chunk( return text, tool_use - def _handle_message_delta(self, chunk: dict) -> Tuple[str, Optional[Usage]]: + def _handle_message_delta(self, chunk: dict) -> Tuple[str, Optional[Usage], Optional[Dict[str, Any]]]: """ - Handle message_delta event for finish_reason and usage. + Handle message_delta event for finish_reason, usage, and container. Args: chunk: The message_delta chunk Returns: - Tuple of (finish_reason, usage) + Tuple of (finish_reason, usage, container) """ message_delta = MessageBlockDelta(**chunk) # type: ignore finish_reason = map_finish_reason( @@ -843,44 +915,108 @@ def _handle_message_delta(self, chunk: dict) -> Tuple[str, Optional[Usage]]: if self.converted_response_format_tool: finish_reason = "stop" usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"]) - return finish_reason, usage + container = message_delta["delta"].get("container") + return finish_reason, usage, container + + def _handle_accumulated_json_chunk( + self, data_str: str + ) -> Optional[ModelResponseStream]: + """ + Handle partial JSON chunks by accumulating them until valid JSON is received. + + This fixes network fragmentation issues where SSE data chunks may be split + across TCP packets. 
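As an illustration of this accumulate-until-parsable approach, here is a minimal sketch outside the streaming class; the class and function names below are illustrative, only the buffering-then-json.loads idea comes from the handler in this diff.

import json
from typing import Iterable, List, Optional


class PartialJSONAccumulator:
    """Accumulates SSE `data:` payloads until they form valid JSON."""

    def __init__(self) -> None:
        self.buffer: str = ""

    def feed(self, data_str: str) -> Optional[dict]:
        """Return a parsed event once the buffered text is valid JSON, else None."""
        self.buffer += data_str
        try:
            event = json.loads(self.buffer)
        except json.JSONDecodeError:
            return None  # still waiting for the rest of the fragment
        self.buffer = ""  # reset after a successful parse
        return event


def parse_fragments(fragments: Iterable[str]) -> List[dict]:
    acc = PartialJSONAccumulator()
    events: List[dict] = []
    for fragment in fragments:
        event = acc.feed(fragment)
        if event is not None:
            events.append(event)
    return events


# A chunk split across two network reads still yields exactly one parsed event:
assert parse_fragments(['{"type":"content_block_delta","delta":{"te', 'xt":"hi"}}']) == [
    {"type": "content_block_delta", "delta": {"text": "hi"}}
]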
See: https://github.com/BerriAI/litellm/issues/17473 + + Args: + data_str: The JSON string to parse (without "data:" prefix) + + Returns: + ModelResponseStream if JSON is complete, None if still accumulating + """ + # Accumulate JSON data + self.accumulated_json += data_str + + # Try to parse the accumulated JSON + try: + data_json = json.loads(self.accumulated_json) + self.accumulated_json = "" # Reset after successful parsing + return self.chunk_parser(chunk=data_json) + except json.JSONDecodeError: + # If it's not valid JSON yet, continue to the next chunk + return None + + def _parse_sse_data(self, str_line: str) -> Optional[ModelResponseStream]: + """ + Parse SSE data line, handling both complete and partial JSON chunks. + + Args: + str_line: The SSE line starting with "data:" + + Returns: + ModelResponseStream if parsing succeeded, None if accumulating partial JSON + """ + data_str = str_line[5:] # Remove "data:" prefix + + if self.chunk_type == "accumulated_json": + # Already in accumulation mode, keep accumulating + return self._handle_accumulated_json_chunk(data_str) + + # Try to parse as valid JSON first + try: + data_json = json.loads(data_str) + return self.chunk_parser(chunk=data_json) + except json.JSONDecodeError: + # Switch to accumulation mode and start accumulating + self.chunk_type = "accumulated_json" + return self._handle_accumulated_json_chunk(data_str) # Sync iterator def __iter__(self): return self def __next__(self): - try: - chunk = self.response_iterator.__next__() - except StopIteration: - raise StopIteration - except ValueError as e: - raise RuntimeError(f"Error receiving chunk from stream: {e}") - - try: - str_line = chunk - if isinstance(chunk, bytes): # Handle binary data - str_line = chunk.decode("utf-8") # Convert bytes to string - index = str_line.find("data:") - if index != -1: - str_line = str_line[index:] - - if str_line.startswith("data:"): - data_json = json.loads(str_line[5:]) - return self.chunk_parser(chunk=data_json) - else: - return GenericStreamingChunk( - text="", - is_finished=False, - finish_reason="", - usage=None, - index=0, - tool_use=None, - ) - except StopIteration: - raise StopIteration - except ValueError as e: - raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + while True: + try: + chunk = self.response_iterator.__next__() + except StopIteration: + # If we have accumulated JSON when stream ends, try to parse it + if self.accumulated_json: + try: + data_json = json.loads(self.accumulated_json) + self.accumulated_json = "" + return self.chunk_parser(chunk=data_json) + except json.JSONDecodeError: + pass + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + str_line = chunk + if isinstance(chunk, bytes): # Handle binary data + str_line = chunk.decode("utf-8") # Convert bytes to string + index = str_line.find("data:") + if index != -1: + str_line = str_line[index:] + + if str_line.startswith("data:"): + result = self._parse_sse_data(str_line) + if result is not None: + return result + # If None, continue loop to get more chunks for accumulation + else: + return GenericStreamingChunk( + text="", + is_finished=False, + finish_reason="", + usage=None, + index=0, + tool_use=None, + ) + except StopIteration: + raise StopIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") # Async iterator def __aiter__(self): @@ -888,37 +1024,48 @@ def __aiter__(self): return self async def __anext__(self): 
- try: - chunk = await self.async_response_iterator.__anext__() - except StopAsyncIteration: - raise StopAsyncIteration - except ValueError as e: - raise RuntimeError(f"Error receiving chunk from stream: {e}") - - try: - str_line = chunk - if isinstance(chunk, bytes): # Handle binary data - str_line = chunk.decode("utf-8") # Convert bytes to string - index = str_line.find("data:") - if index != -1: - str_line = str_line[index:] - - if str_line.startswith("data:"): - data_json = json.loads(str_line[5:]) - return self.chunk_parser(chunk=data_json) - else: - return GenericStreamingChunk( - text="", - is_finished=False, - finish_reason="", - usage=None, - index=0, - tool_use=None, - ) - except StopAsyncIteration: - raise StopAsyncIteration - except ValueError as e: - raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") + while True: + try: + chunk = await self.async_response_iterator.__anext__() + except StopAsyncIteration: + # If we have accumulated JSON when stream ends, try to parse it + if self.accumulated_json: + try: + data_json = json.loads(self.accumulated_json) + self.accumulated_json = "" + return self.chunk_parser(chunk=data_json) + except json.JSONDecodeError: + pass + raise StopAsyncIteration + except ValueError as e: + raise RuntimeError(f"Error receiving chunk from stream: {e}") + + try: + str_line = chunk + if isinstance(chunk, bytes): # Handle binary data + str_line = chunk.decode("utf-8") # Convert bytes to string + index = str_line.find("data:") + if index != -1: + str_line = str_line[index:] + + if str_line.startswith("data:"): + result = self._parse_sse_data(str_line) + if result is not None: + return result + # If None, continue loop to get more chunks for accumulation + else: + return GenericStreamingChunk( + text="", + is_finished=False, + finish_reason="", + usage=None, + index=0, + tool_use=None, + ) + except StopAsyncIteration: + raise StopAsyncIteration + except ValueError as e: + raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}") def convert_str_chunk_to_generic_chunk(self, chunk: str) -> ModelResponseStream: """ @@ -932,9 +1079,12 @@ def convert_str_chunk_to_generic_chunk(self, chunk: str) -> ModelResponseStream: str_line = chunk if isinstance(chunk, bytes): # Handle binary data str_line = chunk.decode("utf-8") # Convert bytes to string - index = str_line.find("data:") - if index != -1: - str_line = str_line[index:] + + # Extract the data line from SSE format + # SSE events can be: "event: X\ndata: {...}\n\n" or just "data: {...}\n\n" + index = str_line.find("data:") + if index != -1: + str_line = str_line[index:] if str_line.startswith("data:"): data_json = json.loads(str_line[5:]) diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 691b46af8da1..1b61b5332753 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -12,6 +12,7 @@ DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, DEFAULT_REASONING_EFFORT_LOW_THINKING_BUDGET, DEFAULT_REASONING_EFFORT_MEDIUM_THINKING_BUDGET, + DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET, RESPONSE_FORMAT_TOOL_NAME, ) from litellm.litellm_core_utils.core_helpers import map_finish_reason @@ -29,6 +30,7 @@ AnthropicMcpServerTool, AnthropicMessagesTool, AnthropicMessagesToolChoice, + AnthropicOutputSchema, AnthropicSystemMessageContent, AnthropicThinkingParam, AnthropicWebSearchTool, @@ -60,7 +62,10 @@ ModelResponse, Usage, add_dummy_tool, + 
any_assistant_message_has_thinking_blocks, + get_max_tokens, has_tool_call_blocks, + last_assistant_with_tool_calls_has_no_thinking_blocks, supports_reasoning, token_counter, ) @@ -82,9 +87,7 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig): to pass metadata to anthropic, it's {"user_id": "any-relevant-information"} """ - max_tokens: Optional[int] = ( - DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) - ) + max_tokens: Optional[int] = None stop_sequences: Optional[list] = None temperature: Optional[int] = None top_p: Optional[int] = None @@ -94,9 +97,7 @@ class AnthropicConfig(AnthropicModelInfo, BaseConfig): def __init__( self, - max_tokens: Optional[ - int - ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, # You can pass in a value yourself or use the default value 4096 + max_tokens: Optional[int] = None, stop_sequences: Optional[list] = None, temperature: Optional[int] = None, top_p: Optional[int] = None, @@ -114,11 +115,66 @@ def custom_llm_provider(self) -> Optional[str]: return "anthropic" @classmethod - def get_config(cls): - return super().get_config() + def get_config(cls, *, model: Optional[str] = None): + config = super().get_config() - def get_supported_openai_params(self, model: str): + # anthropic requires a default value for max_tokens + if config.get("max_tokens") is None: + config["max_tokens"] = cls.get_max_tokens_for_model(model) + + return config + + @staticmethod + def get_max_tokens_for_model(model: Optional[str] = None) -> int: + """ + Get the max output tokens for a given model. + Falls back to DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS (configurable via env var) if model is not found. + """ + if model is None: + return DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS + try: + max_tokens = get_max_tokens(model) + if max_tokens is None: + return DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS + return max_tokens + except Exception: + return DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS + + @staticmethod + def convert_tool_use_to_openai_format( + anthropic_tool_content: Dict[str, Any], + index: int, + ) -> ChatCompletionToolCallChunk: + """ + Convert Anthropic tool_use format to OpenAI ChatCompletionToolCallChunk format. + Args: + anthropic_tool_content: Anthropic tool_use content block with format: + {"type": "tool_use", "id": "...", "name": "...", "input": {...}} + index: The index of this tool call + + Returns: + ChatCompletionToolCallChunk in OpenAI format + """ + tool_call = ChatCompletionToolCallChunk( + id=anthropic_tool_content["id"], + type="function", + function=ChatCompletionToolCallFunctionChunk( + name=anthropic_tool_content["name"], + arguments=json.dumps(anthropic_tool_content["input"]), + ), + index=index, + ) + # Include caller information if present (for programmatic tool calling) + if "caller" in anthropic_tool_content: + tool_call["caller"] = cast(Dict[str, Any], anthropic_tool_content["caller"]) # type: ignore[typeddict-item] + return tool_call + + def _is_claude_opus_4_5(self, model: str) -> bool: + """Check if the model is Claude Opus 4.5.""" + return "opus-4-5" in model.lower() or "opus_4_5" in model.lower() + + def get_supported_openai_params(self, model: str): params = [ "stream", "stop", @@ -144,6 +200,68 @@ def get_supported_openai_params(self, model: str): return params + @staticmethod + def filter_anthropic_output_schema(schema: Dict[str, Any]) -> Dict[str, Any]: + """ + Filter out unsupported fields from JSON schema for Anthropic's output_format API. 
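For context, a small before/after sketch of the recursive filter this method performs; the call assumes this diff is applied (filter_anthropic_output_schema is a staticmethod added below), and the schema itself is made up for illustration.

from litellm.llms.anthropic.chat.transformation import AnthropicConfig

raw_schema = {
    "type": "object",
    "properties": {
        "tags": {
            "type": "array",
            "items": {"type": "string"},
            "minItems": 1,   # not accepted by Anthropic's output_format
            "maxItems": 10,  # not accepted by Anthropic's output_format
        }
    },
    "required": ["tags"],
}

filtered = AnthropicConfig.filter_anthropic_output_schema(raw_schema)
assert "maxItems" not in filtered["properties"]["tags"]
assert "minItems" not in filtered["properties"]["tags"]
assert filtered["properties"]["tags"]["items"] == {"type": "string"}  # other fields are preserved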
+ + Anthropic's output_format doesn't support certain JSON schema properties: + - maxItems: Not supported for array types + - minItems: Not supported for array types + + This function recursively removes these unsupported fields while preserving + all other valid schema properties. + + Args: + schema: The JSON schema dictionary to filter + + Returns: + A new dictionary with unsupported fields removed + + Related issue: https://github.com/BerriAI/litellm/issues/19444 + """ + if not isinstance(schema, dict): + return schema + + unsupported_fields = {"maxItems", "minItems"} + + result: Dict[str, Any] = {} + for key, value in schema.items(): + if key in unsupported_fields: + continue + + if key == "properties" and isinstance(value, dict): + result[key] = { + k: AnthropicConfig.filter_anthropic_output_schema(v) + for k, v in value.items() + } + elif key == "items" and isinstance(value, dict): + result[key] = AnthropicConfig.filter_anthropic_output_schema(value) + elif key == "$defs" and isinstance(value, dict): + result[key] = { + k: AnthropicConfig.filter_anthropic_output_schema(v) + for k, v in value.items() + } + elif key == "anyOf" and isinstance(value, list): + result[key] = [ + AnthropicConfig.filter_anthropic_output_schema(item) + for item in value + ] + elif key == "allOf" and isinstance(value, list): + result[key] = [ + AnthropicConfig.filter_anthropic_output_schema(item) + for item in value + ] + elif key == "oneOf" and isinstance(value, list): + result[key] = [ + AnthropicConfig.filter_anthropic_output_schema(item) + for item in value + ] + else: + result[key] = value + + return result + def get_json_schema_from_pydantic_object( self, response_format: Union[Any, Dict, None] ) -> Optional[dict]: @@ -152,9 +270,11 @@ def get_json_schema_from_pydantic_object( ) # Relevant issue: https://github.com/BerriAI/litellm/issues/7755 def get_cache_control_headers(self) -> dict: + # Anthropic no longer requires the prompt-caching beta header + # Prompt caching now works automatically when cache_control is used in messages + # Reference: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching return { "anthropic-version": "2023-06-01", - "anthropic-beta": "prompt-caching-2024-07-31", } def _map_tool_choice( @@ -170,10 +290,19 @@ def _map_tool_choice( elif tool_choice == "none": _tool_choice = AnthropicMessagesToolChoice(type="none") elif isinstance(tool_choice, dict): - _tool_name = tool_choice.get("function", {}).get("name") - _tool_choice = AnthropicMessagesToolChoice(type="tool") - if _tool_name is not None: - _tool_choice["name"] = _tool_name + if "type" in tool_choice and "function" not in tool_choice: + tool_type = tool_choice.get("type") + if tool_type == "auto": + _tool_choice = AnthropicMessagesToolChoice(type="auto") + elif tool_type == "required" or tool_type == "any": + _tool_choice = AnthropicMessagesToolChoice(type="any") + elif tool_type == "none": + _tool_choice = AnthropicMessagesToolChoice(type="none") + else: + _tool_name = tool_choice.get("function", {}).get("name") + if _tool_name is not None: + _tool_choice = AnthropicMessagesToolChoice(type="tool") + _tool_choice["name"] = _tool_name if parallel_tool_use is not None: # Anthropic uses 'disable_parallel_tool_use' flag to determine if parallel tool use is allowed @@ -189,7 +318,7 @@ def _map_tool_choice( ) return _tool_choice - def _map_tool_helper( + def _map_tool_helper( # noqa: PLR0915 self, tool: ChatCompletionToolParam ) -> Tuple[Optional[AllAnthropicToolsValues], Optional[AnthropicMcpServerTool]]: 
returned_tool: Optional[AllAnthropicToolsValues] = None @@ -252,9 +381,10 @@ def _map_tool_helper( returned_tool = _computer_tool elif any(tool["type"].startswith(t) for t in ANTHROPIC_HOSTED_TOOLS): - function_name = tool.get("name", tool.get("function", {}).get("name")) - if function_name is None or not isinstance(function_name, str): + function_name_obj = tool.get("name", tool.get("function", {}).get("name")) + if function_name_obj is None or not isinstance(function_name_obj, str): raise ValueError("Missing required parameter: name") + function_name = function_name_obj additional_tool_params = {} for k, v in tool.items(): @@ -270,6 +400,30 @@ def _map_tool_helper( mcp_server = self._map_openai_mcp_server_tool( cast(OpenAIMcpServerTool, tool) ) + elif tool["type"] == "tool_search_tool_regex_20251119": + # Tool search tool using regex + from litellm.types.llms.anthropic import AnthropicToolSearchToolRegex + + tool_name_obj = tool.get("name", "tool_search_tool_regex") + if not isinstance(tool_name_obj, str): + raise ValueError("Tool search tool must have a valid name") + tool_name = tool_name_obj + returned_tool = AnthropicToolSearchToolRegex( + type="tool_search_tool_regex_20251119", + name=tool_name, + ) + elif tool["type"] == "tool_search_tool_bm25_20251119": + # Tool search tool using BM25 + from litellm.types.llms.anthropic import AnthropicToolSearchToolBM25 + + tool_name_obj = tool.get("name", "tool_search_tool_bm25") + if not isinstance(tool_name_obj, str): + raise ValueError("Tool search tool must have a valid name") + tool_name = tool_name_obj + returned_tool = AnthropicToolSearchToolBM25( + type="tool_search_tool_bm25_20251119", + name=tool_name, + ) if returned_tool is None and mcp_server is None: raise ValueError(f"Unsupported tool type: {tool['type']}") @@ -277,14 +431,82 @@ def _map_tool_helper( _cache_control = tool.get("cache_control", None) _cache_control_function = tool.get("function", {}).get("cache_control", None) if returned_tool is not None: - if _cache_control is not None: - returned_tool["cache_control"] = _cache_control - elif _cache_control_function is not None and isinstance( - _cache_control_function, dict + # Only set cache_control on tools that support it (not tool search tools) + tool_type = returned_tool.get("type", "") + if tool_type not in ( + "tool_search_tool_regex_20251119", + "tool_search_tool_bm25_20251119", ): - returned_tool["cache_control"] = ChatCompletionCachedContent( - **_cache_control_function # type: ignore - ) + if _cache_control is not None: + returned_tool["cache_control"] = _cache_control # type: ignore[typeddict-item] + elif _cache_control_function is not None and isinstance( + _cache_control_function, dict + ): + returned_tool["cache_control"] = ChatCompletionCachedContent( # type: ignore[typeddict-item] + **_cache_control_function # type: ignore + ) + + ## check if defer_loading is set in the tool + _defer_loading = tool.get("defer_loading", None) + _defer_loading_function = tool.get("function", {}).get("defer_loading", None) + if returned_tool is not None: + # Only set defer_loading on tools that support it (not tool search tools or computer tools) + tool_type = returned_tool.get("type", "") + if tool_type not in ( + "tool_search_tool_regex_20251119", + "tool_search_tool_bm25_20251119", + "computer_20241022", + "computer_20250124", + ): + if _defer_loading is not None: + if not isinstance(_defer_loading, bool): + raise ValueError("defer_loading must be a boolean") + returned_tool["defer_loading"] = _defer_loading # type: 
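A hedged sketch of the per-tool gating introduced above: cache_control is only attached to mapped tools whose type supports it, skipping the tool-search tools. The tool type strings come from the diff; the helper name and the plain-dict tool shape are illustrative.

from typing import Any, Dict, Optional

TOOL_SEARCH_TYPES = {
    "tool_search_tool_regex_20251119",
    "tool_search_tool_bm25_20251119",
}


def attach_cache_control(tool: Dict[str, Any], cache_control: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Attach cache_control to a mapped Anthropic tool unless its type does not support it."""
    if cache_control is not None and tool.get("type", "") not in TOOL_SEARCH_TYPES:
        tool["cache_control"] = cache_control
    return tool


regex_tool = {"type": "tool_search_tool_regex_20251119", "name": "tool_search_tool_regex"}
custom_tool = {"name": "get_weather", "input_schema": {"type": "object"}}
attach_cache_control(regex_tool, {"type": "ephemeral"})   # ignored for tool-search tools
attach_cache_control(custom_tool, {"type": "ephemeral"})  # attached to user-defined tools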
ignore[typeddict-item] + elif _defer_loading_function is not None: + if not isinstance(_defer_loading_function, bool): + raise ValueError("defer_loading must be a boolean") + returned_tool["defer_loading"] = _defer_loading_function # type: ignore[typeddict-item] + + ## check if allowed_callers is set in the tool + _allowed_callers = tool.get("allowed_callers", None) + _allowed_callers_function = tool.get("function", {}).get( + "allowed_callers", None + ) + if returned_tool is not None: + # Only set allowed_callers on tools that support it (not tool search tools or computer tools) + tool_type = returned_tool.get("type", "") + if tool_type not in ( + "tool_search_tool_regex_20251119", + "tool_search_tool_bm25_20251119", + "computer_20241022", + "computer_20250124", + ): + if _allowed_callers is not None: + if not isinstance(_allowed_callers, list) or not all( + isinstance(item, str) for item in _allowed_callers + ): + raise ValueError("allowed_callers must be a list of strings") + returned_tool["allowed_callers"] = _allowed_callers # type: ignore[typeddict-item] + elif _allowed_callers_function is not None: + if not isinstance(_allowed_callers_function, list) or not all( + isinstance(item, str) for item in _allowed_callers_function + ): + raise ValueError("allowed_callers must be a list of strings") + returned_tool["allowed_callers"] = _allowed_callers_function # type: ignore[typeddict-item] + + ## check if input_examples is set in the tool + _input_examples = tool.get("input_examples", None) + _input_examples_function = tool.get("function", {}).get("input_examples", None) + if returned_tool is not None: + # Only set input_examples on user-defined tools (type "custom" or no type) + tool_type = returned_tool.get("type", "") + if tool_type == "custom" or (tool_type == "" and "name" in returned_tool): + if _input_examples is not None and isinstance(_input_examples, list): + returned_tool["input_examples"] = _input_examples # type: ignore[typeddict-item] + elif _input_examples_function is not None and isinstance( + _input_examples_function, list + ): + returned_tool["input_examples"] = _input_examples_function # type: ignore[typeddict-item] return returned_tool, mcp_server @@ -336,6 +558,83 @@ def _map_tools( mcp_servers.append(mcp_server_tool) return anthropic_tools, mcp_servers + def _detect_tool_search_tools(self, tools: Optional[List]) -> bool: + """Check if tool search tools are present in the tools list.""" + if not tools: + return False + + for tool in tools: + tool_type = tool.get("type", "") + if tool_type in [ + "tool_search_tool_regex_20251119", + "tool_search_tool_bm25_20251119", + ]: + return True + return False + + def _separate_deferred_tools(self, tools: List) -> Tuple[List, List]: + """ + Separate tools into deferred and non-deferred lists. + + Returns: + Tuple of (non_deferred_tools, deferred_tools) + """ + non_deferred = [] + deferred = [] + + for tool in tools: + if tool.get("defer_loading", False): + deferred.append(tool) + else: + non_deferred.append(tool) + + return non_deferred, deferred + + def _expand_tool_references( + self, + content: List, + deferred_tools: List, + ) -> List: + """ + Expand tool_reference blocks to full tool definitions. + + When Anthropic's tool search returns results, it includes tool_reference blocks + that reference tools by name. This method expands those references to full + tool definitions from the deferred_tools catalog. 
+ + Args: + content: Response content that may contain tool_reference blocks + deferred_tools: List of deferred tools that can be referenced + + Returns: + Content with tool_reference blocks expanded to full tool definitions + """ + if not deferred_tools: + return content + + # Create a mapping of tool names to tool definitions + tool_map = {} + for tool in deferred_tools: + tool_name = tool.get("name") or tool.get("function", {}).get("name") + if tool_name: + tool_map[tool_name] = tool + + # Expand tool references in content + expanded_content = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "tool_reference": + tool_name = item.get("tool_name") + if tool_name and tool_name in tool_map: + # Replace reference with full tool definition + expanded_content.append(tool_map[tool_name]) + else: + # Keep the reference if we can't find the tool + expanded_content.append(item) + else: + expanded_content.append(item) + + return expanded_content + def _map_stop_sequences( self, stop: Optional[Union[str, List[str]]] ) -> Optional[List[str]]: @@ -379,9 +678,44 @@ def _map_reasoning_effort( type="enabled", budget_tokens=DEFAULT_REASONING_EFFORT_HIGH_THINKING_BUDGET, ) + elif reasoning_effort == "minimal": + return AnthropicThinkingParam( + type="enabled", + budget_tokens=DEFAULT_REASONING_EFFORT_MINIMAL_THINKING_BUDGET, + ) else: raise ValueError(f"Unmapped reasoning effort: {reasoning_effort}") + def _extract_json_schema_from_response_format( + self, value: Optional[dict] + ) -> Optional[dict]: + if value is None: + return None + json_schema: Optional[dict] = None + if "response_schema" in value: + json_schema = value["response_schema"] + elif "json_schema" in value: + json_schema = value["json_schema"]["schema"] + + return json_schema + + def map_response_format_to_anthropic_output_format( + self, value: Optional[dict] + ) -> Optional[AnthropicOutputSchema]: + json_schema: Optional[dict] = self._extract_json_schema_from_response_format( + value + ) + if json_schema is None: + return None + + # Filter out unsupported fields for Anthropic's output_format API + filtered_schema = self.filter_anthropic_output_schema(json_schema) + + return AnthropicOutputSchema( + type="json_schema", + schema=filtered_schema, + ) + def map_response_format_to_anthropic_tool( self, value: Optional[dict], optional_params: dict, is_thinking_enabled: bool ) -> Optional[AnthropicMessagesTool]: @@ -391,11 +725,11 @@ def map_response_format_to_anthropic_tool( ): # value is a no-op return None - json_schema: Optional[dict] = None - if "response_schema" in value: - json_schema = value["response_schema"] - elif "json_schema" in value: - json_schema = value["json_schema"]["schema"] + json_schema: Optional[dict] = self._extract_json_schema_from_response_format( + value + ) + if json_schema is None: + return None """ When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode - You usually want to provide a single tool @@ -440,7 +774,7 @@ def map_web_search_tool( return hosted_web_search_tool - def map_openai_params( + def map_openai_params( # noqa: PLR0915 self, non_default_params: dict, optional_params: dict, @@ -485,18 +819,37 @@ def map_openai_params( if param == "top_p": optional_params["top_p"] = value if param == "response_format" and isinstance(value, dict): - _tool = self.map_response_format_to_anthropic_tool( - value, optional_params, is_thinking_enabled - ) - if _tool is None: - continue - if not is_thinking_enabled: - _tool_choice = {"name": 
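A self-contained sketch of the deferred-tool flow described above: split the tool list on defer_loading, then expand any tool_reference blocks returned by Anthropic's tool search back into full definitions from the deferred catalog. Function names are illustrative stand-ins for _separate_deferred_tools and _expand_tool_references.

from typing import Any, Dict, List, Tuple


def separate_deferred(tools: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    non_deferred = [t for t in tools if not t.get("defer_loading", False)]
    deferred = [t for t in tools if t.get("defer_loading", False)]
    return non_deferred, deferred


def expand_tool_references(content: List[Any], deferred: List[Dict[str, Any]]) -> List[Any]:
    # Map tool names to full definitions from the deferred catalog.
    catalog = {
        (t.get("name") or t.get("function", {}).get("name")): t
        for t in deferred
        if t.get("name") or t.get("function", {}).get("name")
    }
    expanded = []
    for item in content:
        if isinstance(item, dict) and item.get("type") == "tool_reference":
            # Swap the reference for the full definition when the tool is known,
            # otherwise keep the reference as-is.
            expanded.append(catalog.get(item.get("tool_name"), item))
        else:
            expanded.append(item)
    return expanded


tools = [{"name": "lookup_order", "defer_loading": True}, {"name": "get_weather"}]
_, deferred = separate_deferred(tools)
content = [{"type": "tool_reference", "tool_name": "lookup_order"}]
assert expand_tool_references(content, deferred)[0]["name"] == "lookup_order"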
RESPONSE_FORMAT_TOOL_NAME, "type": "tool"} - optional_params["tool_choice"] = _tool_choice + if any( + substring in model + for substring in { + "sonnet-4.5", + "sonnet-4-5", + "opus-4.1", + "opus-4-1", + } + ): + _output_format = ( + self.map_response_format_to_anthropic_output_format(value) + ) + if _output_format is not None: + optional_params["output_format"] = _output_format + else: + _tool = self.map_response_format_to_anthropic_tool( + value, optional_params, is_thinking_enabled + ) + if _tool is None: + continue + if not is_thinking_enabled: + _tool_choice = { + "name": RESPONSE_FORMAT_TOOL_NAME, + "type": "tool", + } + optional_params["tool_choice"] = _tool_choice + + optional_params = self._add_tools_to_optional_params( + optional_params=optional_params, tools=[_tool] + ) optional_params["json_mode"] = True - optional_params = self._add_tools_to_optional_params( - optional_params=optional_params, tools=[_tool] - ) if ( param == "user" and value is not None @@ -507,6 +860,11 @@ def map_openai_params( if param == "thinking": optional_params["thinking"] = value elif param == "reasoning_effort" and isinstance(value, str): + # For Claude Opus 4.5, map reasoning_effort to output_config + if self._is_claude_opus_4_5(model): + optional_params["output_config"] = {"effort": value} + + # For other models, map to thinking parameter optional_params["thinking"] = AnthropicConfig._map_reasoning_effort( value ) @@ -517,6 +875,8 @@ def map_openai_params( self._add_tools_to_optional_params( optional_params=optional_params, tools=[hosted_web_search_tool] ) + elif param == "extra_headers": + optional_params["extra_headers"] = value ## handle thinking tokens self.update_optional_params_with_thinking_tokens( @@ -570,6 +930,9 @@ def translate_system_message( valid_content: bool = False system_message_block = ChatCompletionSystemMessage(**message) if isinstance(system_message_block["content"], str): + # Skip empty text blocks - Anthropic API raises errors for empty text + if not system_message_block["content"]: + continue anthropic_system_message_content = AnthropicSystemMessageContent( type="text", text=system_message_block["content"], @@ -584,10 +947,14 @@ def translate_system_message( valid_content = True elif isinstance(message["content"], list): for _content in message["content"]: + # Skip empty text blocks - Anthropic API raises errors for empty text + text_value = _content.get("text") + if _content.get("type") == "text" and not text_value: + continue anthropic_system_message_content = ( AnthropicSystemMessageContent( type=_content.get("type"), - text=_content.get("text"), + text=text_value, ) ) if "cache_control" in _content: @@ -641,13 +1008,59 @@ def add_code_execution_tool( ) ) return tools - - def update_headers_with_optional_anthropic_beta(self, headers: dict, optional_params: dict) -> dict: + + def _ensure_beta_header(self, headers: dict, beta_value: str) -> None: + """ + Ensure a beta header value is present in the anthropic-beta header. + Merges with existing values instead of overriding them. 
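An illustrative sketch of the response_format routing above: models matching the sonnet-4.5 / opus-4.1 substrings checked in the diff get a native output_format block, while other models fall back to the forced JSON-mode tool. The helper name and return labels are hypothetical; the substrings are the ones from the change.

NATIVE_OUTPUT_FORMAT_MARKERS = {"sonnet-4.5", "sonnet-4-5", "opus-4.1", "opus-4-1"}


def response_format_strategy(model: str) -> str:
    if any(marker in model for marker in NATIVE_OUTPUT_FORMAT_MARKERS):
        return "output_format"   # send {"type": "json_schema", "schema": ...}
    return "json_mode_tool"      # send a single forced tool named RESPONSE_FORMAT_TOOL_NAME


assert response_format_strategy("claude-sonnet-4-5") == "output_format"
assert response_format_strategy("claude-3-5-sonnet-20241022") == "json_mode_tool"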
+ + Args: + headers: Dictionary of headers to update + beta_value: The beta header value to add + """ + existing_beta = headers.get("anthropic-beta") + if existing_beta is None: + headers["anthropic-beta"] = beta_value + return + existing_values = [beta.strip() for beta in existing_beta.split(",")] + if beta_value not in existing_values: + headers["anthropic-beta"] = f"{existing_beta}, {beta_value}" + + def _ensure_context_management_beta_header(self, headers: dict) -> None: + beta_value = ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value + self._ensure_beta_header(headers, beta_value) + + def update_headers_with_optional_anthropic_beta( + self, headers: dict, optional_params: dict + ) -> dict: """Update headers with optional anthropic beta.""" + + # Skip adding beta headers for Vertex requests + # Vertex AI handles these headers differently + is_vertex_request = optional_params.get("is_vertex_request", False) + if is_vertex_request: + return headers + _tools = optional_params.get("tools", []) for tool in _tools: - if tool.get("type", None) and tool.get("type").startswith(ANTHROPIC_HOSTED_TOOLS.WEB_FETCH.value): - headers["anthropic-beta"] = ANTHROPIC_BETA_HEADER_VALUES.WEB_FETCH_2025_09_10.value + if tool.get("type", None) and tool.get("type").startswith( + ANTHROPIC_HOSTED_TOOLS.WEB_FETCH.value + ): + self._ensure_beta_header( + headers, ANTHROPIC_BETA_HEADER_VALUES.WEB_FETCH_2025_09_10.value + ) + elif tool.get("type", None) and tool.get("type").startswith( + ANTHROPIC_HOSTED_TOOLS.MEMORY.value + ): + self._ensure_beta_header( + headers, ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value + ) + if optional_params.get("context_management") is not None: + self._ensure_context_management_beta_header(headers) + if optional_params.get("output_format") is not None: + self._ensure_beta_header( + headers, ANTHROPIC_BETA_HEADER_VALUES.STRUCTURED_OUTPUT_2025_09_25.value + ) return headers def transform_request( @@ -685,7 +1098,29 @@ def transform_request( llm_provider="anthropic", ) - headers = self.update_headers_with_optional_anthropic_beta(headers=headers, optional_params=optional_params) + # Drop thinking param if thinking is enabled but thinking_blocks are missing + # This prevents the error: "Expected thinking or redacted_thinking, but found tool_use" + # + # IMPORTANT: Only drop thinking if NO assistant messages have thinking_blocks. + # If any message has thinking_blocks, we must keep thinking enabled, otherwise + # Anthropic errors with: "When thinking is disabled, an assistant message cannot contain thinking" + # Related issue: https://github.com/BerriAI/litellm/issues/18926 + if ( + optional_params.get("thinking") is not None + and messages is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks(messages) + and not any_assistant_message_has_thinking_blocks(messages) + ): + if litellm.modify_params: + optional_params.pop("thinking", None) + litellm.verbose_logger.warning( + "Dropping 'thinking' param because the last assistant message with tool_calls " + "has no thinking_blocks. The model won't use extended thinking for this turn." 
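A minimal standalone version of the header merge shown above: add a beta flag to the comma-separated "anthropic-beta" header without clobbering or duplicating existing values. The beta strings in the demo are sample values inferred from the enum member names (WEB_FETCH_2025_09_10, CONTEXT_MANAGEMENT_2025_06_27), not confirmed literals.

from typing import Dict


def ensure_beta_header(headers: Dict[str, str], beta_value: str) -> None:
    existing = headers.get("anthropic-beta")
    if existing is None:
        headers["anthropic-beta"] = beta_value
        return
    values = [v.strip() for v in existing.split(",")]
    if beta_value not in values:
        headers["anthropic-beta"] = f"{existing}, {beta_value}"


headers = {"anthropic-beta": "web-fetch-2025-09-10"}
ensure_beta_header(headers, "context-management-2025-06-27")
ensure_beta_header(headers, "web-fetch-2025-09-10")  # already present, no duplicate added
assert headers["anthropic-beta"] == "web-fetch-2025-09-10, context-management-2025-06-27"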
+ ) + + headers = self.update_headers_with_optional_anthropic_beta( + headers=headers, optional_params=optional_params + ) # Separate system prompt from rest of message anthropic_system_message_list = self.translate_system_message(messages=messages) @@ -697,7 +1132,7 @@ def transform_request( anthropic_messages = anthropic_messages_pt( model=model, messages=messages, - llm_provider="anthropic", + llm_provider=self.custom_llm_provider or "anthropic", ) except Exception as e: raise AnthropicError( @@ -718,7 +1153,7 @@ def transform_request( optional_params["tools"] = tools ## Load Config - config = litellm.AnthropicConfig.get_config() + config = litellm.AnthropicConfig.get_config(model=model) for k, v in config.items(): if ( k not in optional_params @@ -736,12 +1171,26 @@ def transform_request( ): optional_params["metadata"] = {"user_id": _litellm_metadata["user_id"]} + # Remove internal LiteLLM parameters that should not be sent to Anthropic API + optional_params.pop("is_vertex_request", None) + data = { "model": model, "messages": anthropic_messages, **optional_params, } + ## Handle output_config (Anthropic-specific parameter) + if "output_config" in optional_params: + output_config = optional_params.get("output_config") + if output_config and isinstance(output_config, dict): + effort = output_config.get("effort") + if effort and effort not in ["high", "medium", "low"]: + raise ValueError( + f"Invalid effort value: {effort}. Must be one of: 'high', 'medium', 'low'" + ) + data["output_config"] = output_config + return data def _transform_response_for_json_mode( @@ -774,6 +1223,8 @@ def extract_response_content(self, completion_response: dict) -> Tuple[ ], Optional[str], List[ChatCompletionToolCallChunk], + Optional[List[Any]], + Optional[List[Any]], ]: text_content = "" citations: Optional[List[Any]] = None @@ -784,22 +1235,38 @@ def extract_response_content(self, completion_response: dict) -> Tuple[ ] = None reasoning_content: Optional[str] = None tool_calls: List[ChatCompletionToolCallChunk] = [] + web_search_results: Optional[List[Any]] = None + tool_results: Optional[List[Any]] = None for idx, content in enumerate(completion_response["content"]): if content["type"] == "text": text_content += content["text"] ## TOOL CALLING - elif content["type"] == "tool_use": - tool_calls.append( - ChatCompletionToolCallChunk( - id=content["id"], - type="function", - function=ChatCompletionToolCallFunctionChunk( - name=content["name"], - arguments=json.dumps(content["input"]), - ), - index=idx, - ) + elif content["type"] == "tool_use" or content["type"] == "server_tool_use": + tool_call = AnthropicConfig.convert_tool_use_to_openai_format( + anthropic_tool_content=content, + index=idx, ) + tool_calls.append(tool_call) + + ## TOOL RESULTS - handle all tool result types (code execution, etc.) + elif content["type"].endswith("_tool_result"): + # Skip tool_search_tool_result as it's internal metadata + if content["type"] == "tool_search_tool_result": + continue + # Handle web_search_tool_result separately for backwards compatibility + if content["type"] == "web_search_tool_result": + if web_search_results is None: + web_search_results = [] + web_search_results.append(content) + elif content["type"] == "web_fetch_tool_result": + if web_search_results is None: + web_search_results = [] + web_search_results.append(content) + else: + # All other tool results (bash_code_execution_tool_result, text_editor_code_execution_tool_result, etc.) 
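A small sketch of the output_config handling above: only "high", "medium", or "low" are accepted for the effort field before it is copied into the request body. The helper name is illustrative.

from typing import Any, Dict

VALID_EFFORT = {"high", "medium", "low"}


def apply_output_config(data: Dict[str, Any], optional_params: Dict[str, Any]) -> Dict[str, Any]:
    output_config = optional_params.get("output_config")
    if output_config and isinstance(output_config, dict):
        effort = output_config.get("effort")
        if effort and effort not in VALID_EFFORT:
            raise ValueError(
                f"Invalid effort value: {effort}. Must be one of: 'high', 'medium', 'low'"
            )
        data["output_config"] = output_config
    return data


apply_output_config({"model": "claude-opus-4-5"}, {"output_config": {"effort": "low"}})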
+ if tool_results is None: + tool_results = [] + tool_results.append(content) elif content.get("thinking", None) is not None: if thinking_blocks is None: @@ -832,10 +1299,13 @@ def extract_response_content(self, completion_response: dict) -> Tuple[ if thinking_content is not None: reasoning_content += thinking_content - return text_content, citations, thinking_blocks, reasoning_content, tool_calls + return text_content, citations, thinking_blocks, reasoning_content, tool_calls, web_search_results, tool_results def calculate_usage( - self, usage_object: dict, reasoning_content: Optional[str] + self, + usage_object: dict, + reasoning_content: Optional[str], + completion_response: Optional[dict] = None, ) -> Usage: # NOTE: Sometimes the usage object has None set explicitly for token counts, meaning .get() & key access returns None, and we need to account for this prompt_tokens = usage_object.get("input_tokens", 0) or 0 @@ -845,6 +1315,7 @@ def calculate_usage( cache_read_input_tokens: int = 0 cache_creation_token_details: Optional[CacheCreationTokenDetails] = None web_search_requests: Optional[int] = None + tool_search_requests: Optional[int] = None if ( "cache_creation_input_tokens" in _usage and _usage["cache_creation_input_tokens"] is not None @@ -865,6 +1336,25 @@ def calculate_usage( web_search_requests = cast( int, _usage["server_tool_use"]["web_search_requests"] ) + if ( + "tool_search_requests" in _usage["server_tool_use"] + and _usage["server_tool_use"]["tool_search_requests"] is not None + ): + tool_search_requests = cast( + int, _usage["server_tool_use"]["tool_search_requests"] + ) + + # Count tool_search_requests from content blocks if not in usage + # Anthropic doesn't always include tool_search_requests in the usage object + if tool_search_requests is None and completion_response is not None: + tool_search_count = 0 + for content in completion_response.get("content", []): + if content.get("type") == "server_tool_use": + tool_name = content.get("name", "") + if "tool_search" in tool_name: + tool_search_count += 1 + if tool_search_count > 0: + tool_search_requests = tool_search_count if "cache_creation" in _usage and _usage["cache_creation"] is not None: cache_creation_token_details = CacheCreationTokenDetails( @@ -881,14 +1371,15 @@ def calculate_usage( cache_creation_tokens=cache_creation_input_tokens, cache_creation_token_details=cache_creation_token_details, ) - completion_token_details = ( - CompletionTokensDetailsWrapper( - reasoning_tokens=token_counter( - text=reasoning_content, count_response_tokens=True - ) - ) + # Always populate completion_token_details, not just when there's reasoning_content + reasoning_tokens = ( + token_counter(text=reasoning_content, count_response_tokens=True) if reasoning_content - else None + else 0 + ) + completion_token_details = CompletionTokensDetailsWrapper( + reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else 0, + text_tokens=completion_tokens - reasoning_tokens if reasoning_tokens > 0 else completion_tokens, ) total_tokens = prompt_tokens + completion_tokens @@ -901,8 +1392,11 @@ def calculate_usage( cache_read_input_tokens=cache_read_input_tokens, completion_tokens_details=completion_token_details, server_tool_use=( - ServerToolUse(web_search_requests=web_search_requests) - if web_search_requests is not None + ServerToolUse( + web_search_requests=web_search_requests, + tool_search_requests=tool_search_requests, + ) + if (web_search_requests is not None or tool_search_requests is not None) else None ), ) @@ -946,6 +1440,8 @@ 
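A hedged sketch of the fallback counting above: when the usage block omits tool_search_requests, count server_tool_use content blocks whose tool name mentions "tool_search". The function mirrors the logic in calculate_usage but works on plain dicts.

from typing import Any, Dict, List, Optional


def count_tool_search_requests(usage: Dict[str, Any], content: List[Dict[str, Any]]) -> Optional[int]:
    server_tool_use = usage.get("server_tool_use") or {}
    if server_tool_use.get("tool_search_requests") is not None:
        return server_tool_use["tool_search_requests"]
    counted = sum(
        1
        for block in content
        if block.get("type") == "server_tool_use" and "tool_search" in block.get("name", "")
    )
    return counted or None


content = [{"type": "server_tool_use", "name": "tool_search_tool_regex", "id": "srv_1", "input": {}}]
assert count_tool_search_requests({}, content) == 1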
def transform_parsed_response( thinking_blocks, reasoning_content, tool_calls, + web_search_results, + tool_results, ) = self.extract_response_content(completion_response=completion_response) if ( @@ -955,13 +1451,29 @@ def transform_parsed_response( ): text_content = prefix_prompt + text_content + context_management: Optional[Dict] = completion_response.get( + "context_management" + ) + + container: Optional[Dict] = completion_response.get("container") + + provider_specific_fields: Dict[str, Any] = { + "citations": citations, + "thinking_blocks": thinking_blocks, + } + if context_management is not None: + provider_specific_fields["context_management"] = context_management + if web_search_results is not None: + provider_specific_fields["web_search_results"] = web_search_results + if tool_results is not None: + provider_specific_fields["tool_results"] = tool_results + if container is not None: + provider_specific_fields["container"] = container + _message = litellm.Message( tool_calls=tool_calls, content=text_content or None, - provider_specific_fields={ - "citations": citations, - "thinking_blocks": thinking_blocks, - }, + provider_specific_fields=provider_specific_fields, thinking_blocks=thinking_blocks, reasoning_content=reasoning_content, ) @@ -988,12 +1500,23 @@ def transform_parsed_response( usage = self.calculate_usage( usage_object=completion_response["usage"], reasoning_content=reasoning_content, + completion_response=completion_response, ) setattr(model_response, "usage", usage) # type: ignore model_response.created = int(time.time()) model_response.model = completion_response["model"] + context_management_response = completion_response.get("context_management") + if context_management_response is not None: + _hidden_params["context_management"] = context_management_response + try: + model_response.__dict__["context_management"] = ( + context_management_response + ) + except Exception: + pass + model_response._hidden_params = _hidden_params return model_response diff --git a/litellm/llms/anthropic/common_utils.py b/litellm/llms/anthropic/common_utils.py index 68b5341e954a..cb23d21fbc93 100644 --- a/litellm/llms/anthropic/common_utils.py +++ b/litellm/llms/anthropic/common_utils.py @@ -2,7 +2,7 @@ This file contains common utils for anthropic calls. """ -from typing import Any, Dict, List, Optional, Union +from typing import Dict, List, Optional, Union import httpx @@ -12,9 +12,38 @@ ) from litellm.llms.base_llm.base_utils import BaseLLMModelInfo, BaseTokenCounter from litellm.llms.base_llm.chat.transformation import BaseLLMException -from litellm.types.llms.anthropic import AllAnthropicToolsValues, AnthropicMcpServerTool +from litellm.types.llms.anthropic import ( + ANTHROPIC_HOSTED_TOOLS, + ANTHROPIC_OAUTH_BETA_HEADER, + ANTHROPIC_OAUTH_TOKEN_PREFIX, + AllAnthropicToolsValues, + AnthropicMcpServerTool, +) from litellm.types.llms.openai import AllMessageValues -from litellm.types.utils import TokenCountResponse + + +def optionally_handle_anthropic_oauth( + headers: dict, api_key: Optional[str] +) -> tuple[dict, Optional[str]]: + """ + Handle Anthropic OAuth token detection and header setup. + + If an OAuth token is detected in the Authorization header, extracts it + and sets the required OAuth headers. 
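A sketch of the OAuth detection flow described above. The real prefix and beta header come from litellm.types.llms.anthropic (ANTHROPIC_OAUTH_TOKEN_PREFIX / ANTHROPIC_OAUTH_BETA_HEADER); the literal values below are placeholders, not the actual constants.

from typing import Dict, Optional, Tuple

OAUTH_TOKEN_PREFIX = "sk-ant-oat"                 # placeholder value, not the real constant
OAUTH_BETA_HEADER = "oauth-beta-placeholder"      # placeholder value, not the real constant


def handle_anthropic_oauth(headers: Dict[str, str], api_key: Optional[str]) -> Tuple[Dict[str, str], Optional[str]]:
    """If the Authorization header carries an OAuth token, promote it to the API key
    and set the OAuth-specific headers, mirroring optionally_handle_anthropic_oauth."""
    auth_header = headers.get("authorization", "")
    if auth_header.startswith(f"Bearer {OAUTH_TOKEN_PREFIX}"):
        api_key = auth_header.replace("Bearer ", "")
        headers["anthropic-beta"] = OAUTH_BETA_HEADER
        headers["anthropic-dangerous-direct-browser-access"] = "true"
    return headers, api_key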
+ + Args: + headers: Request headers dict + api_key: Current API key (may be None) + + Returns: + Tuple of (updated headers, api_key) + """ + auth_header = headers.get("authorization", "") + if auth_header and auth_header.startswith(f"Bearer {ANTHROPIC_OAUTH_TOKEN_PREFIX}"): + api_key = auth_header.replace("Bearer ", "") + headers["anthropic-beta"] = ANTHROPIC_OAUTH_BETA_HEADER + headers["anthropic-dangerous-direct-browser-access"] = "true" + return headers, api_key class AnthropicError(BaseLLMException): @@ -63,11 +92,23 @@ def is_mcp_server_used( def is_computer_tool_used( self, tools: Optional[List[AllAnthropicToolsValues]] + ) -> Optional[str]: + """Returns the computer tool version if used, e.g. 'computer_20250124' or None""" + if tools is None: + return None + for tool in tools: + if "type" in tool and tool["type"].startswith("computer_"): + return tool["type"] + return None + + def is_web_search_tool_used( + self, tools: Optional[List[AllAnthropicToolsValues]] ) -> bool: + """Returns True if web_search tool is used""" if tools is None: return False for tool in tools: - if "type" in tool and tool["type"].startswith("computer_"): + if "type" in tool and tool["type"].startswith(ANTHROPIC_HOSTED_TOOLS.WEB_SEARCH.value): return True return False @@ -87,6 +128,124 @@ def is_pdf_used(self, messages: List[AllMessageValues]) -> bool: return True return False + def is_tool_search_used(self, tools: Optional[List]) -> bool: + """ + Check if tool search tools are present in the tools list. + """ + if not tools: + return False + + for tool in tools: + tool_type = tool.get("type", "") + if tool_type in ["tool_search_tool_regex_20251119", "tool_search_tool_bm25_20251119"]: + return True + return False + + def is_programmatic_tool_calling_used(self, tools: Optional[List]) -> bool: + """ + Check if programmatic tool calling is being used (tools with allowed_callers field). + + Returns True if any tool has allowed_callers containing 'code_execution_20250825'. + """ + if not tools: + return False + + for tool in tools: + # Check top-level allowed_callers + allowed_callers = tool.get("allowed_callers", None) + if allowed_callers and isinstance(allowed_callers, list): + if "code_execution_20250825" in allowed_callers: + return True + + # Check function.allowed_callers for OpenAI format tools + function = tool.get("function", {}) + if isinstance(function, dict): + function_allowed_callers = function.get("allowed_callers", None) + if function_allowed_callers and isinstance(function_allowed_callers, list): + if "code_execution_20250825" in function_allowed_callers: + return True + + return False + + def is_input_examples_used(self, tools: Optional[List]) -> bool: + """ + Check if input_examples is being used in any tools. + + Returns True if any tool has input_examples field. + """ + if not tools: + return False + + for tool in tools: + # Check top-level input_examples + input_examples = tool.get("input_examples", None) + if input_examples and isinstance(input_examples, list) and len(input_examples) > 0: + return True + + # Check function.input_examples for OpenAI format tools + function = tool.get("function", {}) + if isinstance(function, dict): + function_input_examples = function.get("input_examples", None) + if function_input_examples and isinstance(function_input_examples, list) and len(function_input_examples) > 0: + return True + + return False + + def is_effort_used(self, optional_params: Optional[dict], model: Optional[str] = None) -> bool: + """ + Check if effort parameter is being used. 
+ + Returns True if effort-related parameters are present. + """ + if not optional_params: + return False + + # Check if reasoning_effort is provided for Claude Opus 4.5 + if model and ("opus-4-5" in model.lower() or "opus_4_5" in model.lower()): + reasoning_effort = optional_params.get("reasoning_effort") + if reasoning_effort and isinstance(reasoning_effort, str): + return True + + # Check if output_config is directly provided + output_config = optional_params.get("output_config") + if output_config and isinstance(output_config, dict): + effort = output_config.get("effort") + if effort and isinstance(effort, str): + return True + + return False + + def is_code_execution_tool_used(self, tools: Optional[List]) -> bool: + """ + Check if code execution tool is being used. + + Returns True if any tool has type "code_execution_20250825". + """ + if not tools: + return False + + for tool in tools: + tool_type = tool.get("type", "") + if tool_type == "code_execution_20250825": + return True + return False + + def is_container_with_skills_used(self, optional_params: Optional[dict]) -> bool: + """ + Check if container with skills is being used. + + Returns True if optional_params contains container with skills. + """ + if not optional_params: + return False + + container = optional_params.get("container") + if container and isinstance(container, dict): + skills = container.get("skills") + if skills and isinstance(skills, list) and len(skills) > 0: + return True + return False + def _get_user_anthropic_beta_headers( self, anthropic_beta_header: Optional[str] ) -> Optional[List[str]]: @@ -94,23 +253,94 @@ def _get_user_anthropic_beta_headers( return None return anthropic_beta_header.split(",") + def get_computer_tool_beta_header(self, computer_tool_version: str) -> str: + """ + Get the appropriate beta header for a given computer tool version. + + Args: + computer_tool_version: The computer tool version (e.g., 'computer_20250124', 'computer_20241022') + + Returns: + The corresponding beta header string + """ + computer_tool_beta_mapping = { + "computer_20250124": "computer-use-2025-01-24", + "computer_20241022": "computer-use-2024-10-22", + } + return computer_tool_beta_mapping.get( + computer_tool_version, "computer-use-2024-10-22" # Default fallback + ) + + def get_anthropic_beta_list( + self, + model: str, + optional_params: Optional[dict] = None, + computer_tool_used: Optional[str] = None, + prompt_caching_set: bool = False, + file_id_used: bool = False, + mcp_server_used: bool = False, + ) -> List[str]: + """ + Get list of common beta headers based on the features that are active. 
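A standalone version of the computer-tool beta mapping above: pick the beta header matching the computer tool version, falling back to the 2024-10-22 header for unknown versions.

COMPUTER_TOOL_BETA = {
    "computer_20250124": "computer-use-2025-01-24",
    "computer_20241022": "computer-use-2024-10-22",
}


def computer_tool_beta_header(version: str) -> str:
    return COMPUTER_TOOL_BETA.get(version, "computer-use-2024-10-22")


assert computer_tool_beta_header("computer_20250124") == "computer-use-2025-01-24"
assert computer_tool_beta_header("computer_20990101") == "computer-use-2024-10-22"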
+ + Returns: + List of beta header strings + """ + from litellm.types.llms.anthropic import ( + ANTHROPIC_EFFORT_BETA_HEADER, + ) + + betas = [] + + # Detect features + effort_used = self.is_effort_used(optional_params, model) + + if effort_used: + betas.append(ANTHROPIC_EFFORT_BETA_HEADER) # effort-2025-11-24 + + if computer_tool_used: + beta_header = self.get_computer_tool_beta_header(computer_tool_used) + betas.append(beta_header) + + # Anthropic no longer requires the prompt-caching beta header + # Prompt caching now works automatically when cache_control is used in messages + # Reference: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching + + if file_id_used: + betas.append("files-api-2025-04-14") + betas.append("code-execution-2025-05-22") + + if mcp_server_used: + betas.append("mcp-client-2025-04-04") + + return list(set(betas)) + def get_anthropic_headers( self, api_key: str, anthropic_version: Optional[str] = None, - computer_tool_used: bool = False, + computer_tool_used: Optional[str] = None, prompt_caching_set: bool = False, pdf_used: bool = False, file_id_used: bool = False, mcp_server_used: bool = False, + web_search_tool_used: bool = False, + tool_search_used: bool = False, + programmatic_tool_calling_used: bool = False, + input_examples_used: bool = False, + effort_used: bool = False, is_vertex_request: bool = False, user_anthropic_beta_headers: Optional[List[str]] = None, + code_execution_tool_used: bool = False, + container_with_skills_used: bool = False, ) -> dict: betas = set() - if prompt_caching_set: - betas.add("prompt-caching-2024-07-31") + # Anthropic no longer requires the prompt-caching beta header + # Prompt caching now works automatically when cache_control is used in messages + # Reference: https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching if computer_tool_used: - betas.add("computer-use-2024-10-22") + beta_header = self.get_computer_tool_beta_header(computer_tool_used) + betas.add(beta_header) # if pdf_used: # betas.add("pdfs-2024-09-25") if file_id_used: @@ -118,6 +348,23 @@ def get_anthropic_headers( betas.add("code-execution-2025-05-22") if mcp_server_used: betas.add("mcp-client-2025-04-04") + # Tool search, programmatic tool calling, and input_examples all use the same beta header + if tool_search_used or programmatic_tool_calling_used or input_examples_used: + from litellm.types.llms.anthropic import ANTHROPIC_TOOL_SEARCH_BETA_HEADER + betas.add(ANTHROPIC_TOOL_SEARCH_BETA_HEADER) + + # Effort parameter uses a separate beta header + if effort_used: + from litellm.types.llms.anthropic import ANTHROPIC_EFFORT_BETA_HEADER + betas.add(ANTHROPIC_EFFORT_BETA_HEADER) + + # Code execution tool uses a separate beta header + if code_execution_tool_used: + betas.add("code-execution-2025-08-25") + + # Container with skills uses a separate beta header + if container_with_skills_used: + betas.add("skills-2025-10-02") headers = { "anthropic-version": anthropic_version or "2023-06-01", @@ -129,9 +376,12 @@ def get_anthropic_headers( if user_anthropic_beta_headers is not None: betas.update(user_anthropic_beta_headers) - # Don't send any beta headers to Vertex, Vertex has failed requests when they are sent + # Don't send any beta headers to Vertex, except web search which is required if is_vertex_request is True: - pass + # Vertex AI requires web search beta header for web search to work + if web_search_tool_used: + from litellm.types.llms.anthropic import ANTHROPIC_BETA_HEADER_VALUES + headers["anthropic-beta"] = 
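A hedged sketch of how the beta set is assembled from feature flags and joined into a single header, as in get_anthropic_headers. TOOL_SEARCH_BETA stands in for ANTHROPIC_TOOL_SEARCH_BETA_HEADER, whose literal value is not shown in the diff; the effort value is the one noted in the diff comment.

from typing import Dict

TOOL_SEARCH_BETA = "tool-search-beta-placeholder"  # placeholder for ANTHROPIC_TOOL_SEARCH_BETA_HEADER
EFFORT_BETA = "effort-2025-11-24"                  # value noted in the diff comment


def build_beta_header(
    *,
    tool_search_used: bool = False,
    programmatic_tool_calling_used: bool = False,
    input_examples_used: bool = False,
    effort_used: bool = False,
    code_execution_tool_used: bool = False,
    container_with_skills_used: bool = False,
) -> Dict[str, str]:
    betas = set()
    # Tool search, programmatic tool calling, and input_examples share one beta flag.
    if tool_search_used or programmatic_tool_calling_used or input_examples_used:
        betas.add(TOOL_SEARCH_BETA)
    if effort_used:
        betas.add(EFFORT_BETA)
    if code_execution_tool_used:
        betas.add("code-execution-2025-08-25")
    if container_with_skills_used:
        betas.add("skills-2025-10-02")
    return {"anthropic-beta": ",".join(sorted(betas))} if betas else {}


print(build_beta_header(effort_used=True, container_with_skills_used=True))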
ANTHROPIC_BETA_HEADER_VALUES.WEB_SEARCH_2025_03_05.value elif len(betas) > 0: headers["anthropic-beta"] = ",".join(betas) @@ -147,6 +397,8 @@ def validate_environment( api_key: Optional[str] = None, api_base: Optional[str] = None, ) -> Dict: + # Check for Anthropic OAuth token in headers + headers, api_key = optionally_handle_anthropic_oauth(headers=headers, api_key=api_key) if api_key is None: raise litellm.AuthenticationError( message="Missing Anthropic API Key - A call is being made to anthropic but no key is set either in the environment variables or via params. Please set `ANTHROPIC_API_KEY` in your environment vars", @@ -162,6 +414,13 @@ def validate_environment( ) pdf_used = self.is_pdf_used(messages=messages) file_id_used = self.is_file_id_used(messages=messages) + web_search_tool_used = self.is_web_search_tool_used(tools=tools) + tool_search_used = self.is_tool_search_used(tools=tools) + programmatic_tool_calling_used = self.is_programmatic_tool_calling_used(tools=tools) + input_examples_used = self.is_input_examples_used(tools=tools) + effort_used = self.is_effort_used(optional_params=optional_params, model=model) + code_execution_tool_used = self.is_code_execution_tool_used(tools=tools) + container_with_skills_used = self.is_container_with_skills_used(optional_params=optional_params) user_anthropic_beta_headers = self._get_user_anthropic_beta_headers( anthropic_beta_header=headers.get("anthropic-beta") ) @@ -171,9 +430,16 @@ def validate_environment( pdf_used=pdf_used, api_key=api_key, file_id_used=file_id_used, + web_search_tool_used=web_search_tool_used, is_vertex_request=optional_params.get("is_vertex_request", False), user_anthropic_beta_headers=user_anthropic_beta_headers, mcp_server_used=mcp_server_used, + tool_search_used=tool_search_used, + programmatic_tool_calling_used=programmatic_tool_calling_used, + input_examples_used=input_examples_used, + effort_used=effort_used, + code_execution_tool_used=code_execution_tool_used, + container_with_skills_used=container_with_skills_used, ) headers = {**headers, **anthropic_headers} @@ -237,45 +503,11 @@ def get_token_counter(self) -> Optional[BaseTokenCounter]: Returns: AnthropicTokenCounter instance for this provider. 
""" - return AnthropicTokenCounter() - - -class AnthropicTokenCounter(BaseTokenCounter): - """Token counter implementation for Anthropic provider.""" - - def should_use_token_counting_api( - self, - custom_llm_provider: Optional[str] = None, - ) -> bool: - from litellm.types.utils import LlmProviders - return custom_llm_provider == LlmProviders.ANTHROPIC.value - - async def count_tokens( - self, - model_to_use: str, - messages: Optional[List[Dict[str, Any]]], - contents: Optional[List[Dict[str, Any]]], - deployment: Optional[Dict[str, Any]] = None, - request_model: str = "", - ) -> Optional[TokenCountResponse]: - from litellm.proxy.utils import count_tokens_with_anthropic_api - - result = await count_tokens_with_anthropic_api( - model_to_use=model_to_use, - messages=messages, - deployment=deployment, + from litellm.llms.anthropic.count_tokens.token_counter import ( + AnthropicTokenCounter, ) - - if result is not None: - return TokenCountResponse( - total_tokens=result.get("total_tokens", 0), - request_model=request_model, - model_used=model_to_use, - tokenizer_type=result.get("tokenizer_used", ""), - original_response=result, - ) - - return None + + return AnthropicTokenCounter() def process_anthropic_headers(headers: Union[httpx.Headers, dict]) -> dict: diff --git a/litellm/llms/anthropic/count_tokens/__init__.py b/litellm/llms/anthropic/count_tokens/__init__.py new file mode 100644 index 000000000000..ef46862bda68 --- /dev/null +++ b/litellm/llms/anthropic/count_tokens/__init__.py @@ -0,0 +1,15 @@ +""" +Anthropic CountTokens API implementation. +""" + +from litellm.llms.anthropic.count_tokens.handler import AnthropicCountTokensHandler +from litellm.llms.anthropic.count_tokens.token_counter import AnthropicTokenCounter +from litellm.llms.anthropic.count_tokens.transformation import ( + AnthropicCountTokensConfig, +) + +__all__ = [ + "AnthropicCountTokensHandler", + "AnthropicCountTokensConfig", + "AnthropicTokenCounter", +] diff --git a/litellm/llms/anthropic/count_tokens/handler.py b/litellm/llms/anthropic/count_tokens/handler.py new file mode 100644 index 000000000000..5b5354228f90 --- /dev/null +++ b/litellm/llms/anthropic/count_tokens/handler.py @@ -0,0 +1,122 @@ +""" +Anthropic CountTokens API handler. + +Uses httpx for HTTP requests instead of the Anthropic SDK. +""" + +from typing import Any, Dict, List, Optional, Union + +import httpx + +import litellm +from litellm._logging import verbose_logger +from litellm.llms.anthropic.common_utils import AnthropicError +from litellm.llms.anthropic.count_tokens.transformation import ( + AnthropicCountTokensConfig, +) +from litellm.llms.custom_httpx.http_handler import get_async_httpx_client + + +class AnthropicCountTokensHandler(AnthropicCountTokensConfig): + """ + Handler for Anthropic CountTokens API requests. + + Uses httpx for HTTP requests, following the same pattern as BedrockCountTokensHandler. + """ + + async def handle_count_tokens_request( + self, + model: str, + messages: List[Dict[str, Any]], + api_key: str, + api_base: Optional[str] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + ) -> Dict[str, Any]: + """ + Handle a CountTokens request using httpx. 
+ + Args: + model: The model identifier (e.g., "claude-3-5-sonnet-20241022") + messages: The messages to count tokens for + api_key: The Anthropic API key + api_base: Optional custom API base URL + timeout: Optional timeout for the request (defaults to litellm.request_timeout) + + Returns: + Dictionary containing token count response + + Raises: + AnthropicError: If the API request fails + """ + try: + # Validate the request + self.validate_request(model, messages) + + verbose_logger.debug( + f"Processing Anthropic CountTokens request for model: {model}" + ) + + # Transform request to Anthropic format + request_body = self.transform_request_to_count_tokens( + model=model, + messages=messages, + ) + + verbose_logger.debug(f"Transformed request: {request_body}") + + # Get endpoint URL + endpoint_url = api_base or self.get_anthropic_count_tokens_endpoint() + + verbose_logger.debug(f"Making request to: {endpoint_url}") + + # Get required headers + headers = self.get_required_headers(api_key) + + # Use LiteLLM's async httpx client + async_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.ANTHROPIC + ) + + # Use provided timeout or fall back to litellm.request_timeout + request_timeout = timeout if timeout is not None else litellm.request_timeout + + response = await async_client.post( + endpoint_url, + headers=headers, + json=request_body, + timeout=request_timeout, + ) + + verbose_logger.debug(f"Response status: {response.status_code}") + + if response.status_code != 200: + error_text = response.text + verbose_logger.error(f"Anthropic API error: {error_text}") + raise AnthropicError( + status_code=response.status_code, + message=error_text, + ) + + anthropic_response = response.json() + + verbose_logger.debug(f"Anthropic response: {anthropic_response}") + + # Return Anthropic response directly - no transformation needed + return anthropic_response + + except AnthropicError: + # Re-raise Anthropic exceptions as-is + raise + except httpx.HTTPStatusError as e: + # HTTP errors - preserve the actual status code + verbose_logger.error(f"HTTP error in CountTokens handler: {str(e)}") + raise AnthropicError( + status_code=e.response.status_code, + message=e.response.text, + ) + except Exception as e: + verbose_logger.error(f"Error in CountTokens handler: {str(e)}") + raise AnthropicError( + status_code=500, + message=f"CountTokens processing error: {str(e)}", + ) diff --git a/litellm/llms/anthropic/count_tokens/token_counter.py b/litellm/llms/anthropic/count_tokens/token_counter.py new file mode 100644 index 000000000000..266b2794fc37 --- /dev/null +++ b/litellm/llms/anthropic/count_tokens/token_counter.py @@ -0,0 +1,104 @@ +""" +Anthropic Token Counter implementation using the CountTokens API. 
+""" + +import os +from typing import Any, Dict, List, Optional + +from litellm._logging import verbose_logger +from litellm.llms.anthropic.count_tokens.handler import AnthropicCountTokensHandler +from litellm.llms.base_llm.base_utils import BaseTokenCounter +from litellm.types.utils import LlmProviders, TokenCountResponse + +# Global handler instance - reuse across all token counting requests +anthropic_count_tokens_handler = AnthropicCountTokensHandler() + + +class AnthropicTokenCounter(BaseTokenCounter): + """Token counter implementation for Anthropic provider using the CountTokens API.""" + + def should_use_token_counting_api( + self, + custom_llm_provider: Optional[str] = None, + ) -> bool: + return custom_llm_provider == LlmProviders.ANTHROPIC.value + + async def count_tokens( + self, + model_to_use: str, + messages: Optional[List[Dict[str, Any]]], + contents: Optional[List[Dict[str, Any]]], + deployment: Optional[Dict[str, Any]] = None, + request_model: str = "", + ) -> Optional[TokenCountResponse]: + """ + Count tokens using Anthropic's CountTokens API. + + Args: + model_to_use: The model identifier + messages: The messages to count tokens for + contents: Alternative content format (not used for Anthropic) + deployment: Deployment configuration containing litellm_params + request_model: The original request model name + + Returns: + TokenCountResponse with token count, or None if counting fails + """ + from litellm.llms.anthropic.common_utils import AnthropicError + + if not messages: + return None + + deployment = deployment or {} + litellm_params = deployment.get("litellm_params", {}) + + # Get Anthropic API key from deployment config or environment + api_key = litellm_params.get("api_key") + if not api_key: + api_key = os.getenv("ANTHROPIC_API_KEY") + + if not api_key: + verbose_logger.warning("No Anthropic API key found for token counting") + return None + + try: + result = await anthropic_count_tokens_handler.handle_count_tokens_request( + model=model_to_use, + messages=messages, + api_key=api_key, + ) + + if result is not None: + return TokenCountResponse( + total_tokens=result.get("input_tokens", 0), + request_model=request_model, + model_used=model_to_use, + tokenizer_type="anthropic_api", + original_response=result, + ) + except AnthropicError as e: + verbose_logger.warning( + f"Anthropic CountTokens API error: status={e.status_code}, message={e.message}" + ) + return TokenCountResponse( + total_tokens=0, + request_model=request_model, + model_used=model_to_use, + tokenizer_type="anthropic_api", + error=True, + error_message=e.message, + status_code=e.status_code, + ) + except Exception as e: + verbose_logger.warning(f"Error calling Anthropic CountTokens API: {e}") + return TokenCountResponse( + total_tokens=0, + request_model=request_model, + model_used=model_to_use, + tokenizer_type="anthropic_api", + error=True, + error_message=str(e), + status_code=500, + ) + + return None diff --git a/litellm/llms/anthropic/count_tokens/transformation.py b/litellm/llms/anthropic/count_tokens/transformation.py new file mode 100644 index 000000000000..c3ad72436b4f --- /dev/null +++ b/litellm/llms/anthropic/count_tokens/transformation.py @@ -0,0 +1,103 @@ +""" +Anthropic CountTokens API transformation logic. + +This module handles the transformation of requests to Anthropic's CountTokens API format. 
+""" + +from typing import Any, Dict, List + +from litellm.constants import ANTHROPIC_TOKEN_COUNTING_BETA_VERSION + + +class AnthropicCountTokensConfig: + """ + Configuration and transformation logic for Anthropic CountTokens API. + + Anthropic CountTokens API Specification: + - Endpoint: POST https://api.anthropic.com/v1/messages/count_tokens + - Beta header required: anthropic-beta: token-counting-2024-11-01 + - Response: {"input_tokens": } + """ + + def get_anthropic_count_tokens_endpoint(self) -> str: + """ + Get the Anthropic CountTokens API endpoint. + + Returns: + The endpoint URL for the CountTokens API + """ + return "https://api.anthropic.com/v1/messages/count_tokens" + + def transform_request_to_count_tokens( + self, + model: str, + messages: List[Dict[str, Any]], + ) -> Dict[str, Any]: + """ + Transform request to Anthropic CountTokens format. + + Input: + { + "model": "claude-3-5-sonnet-20241022", + "messages": [{"role": "user", "content": "Hello!"}] + } + + Output (Anthropic CountTokens format): + { + "model": "claude-3-5-sonnet-20241022", + "messages": [{"role": "user", "content": "Hello!"}] + } + """ + return { + "model": model, + "messages": messages, + } + + def get_required_headers(self, api_key: str) -> Dict[str, str]: + """ + Get the required headers for the CountTokens API. + + Args: + api_key: The Anthropic API key + + Returns: + Dictionary of required headers + """ + return { + "Content-Type": "application/json", + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + "anthropic-beta": ANTHROPIC_TOKEN_COUNTING_BETA_VERSION, + } + + def validate_request( + self, model: str, messages: List[Dict[str, Any]] + ) -> None: + """ + Validate the incoming count tokens request. + + Args: + model: The model name + messages: The messages to count tokens for + + Raises: + ValueError: If the request is invalid + """ + if not model: + raise ValueError("model parameter is required") + + if not messages: + raise ValueError("messages parameter is required") + + if not isinstance(messages, list): + raise ValueError("messages must be a list") + + for i, message in enumerate(messages): + if not isinstance(message, dict): + raise ValueError(f"Message {i} must be a dictionary") + + if "role" not in message: + raise ValueError(f"Message {i} must have a 'role' field") + + if "content" not in message: + raise ValueError(f"Message {i} must have a 'content' field") diff --git a/litellm/llms/anthropic/experimental_pass_through/adapters/handler.py b/litellm/llms/anthropic/experimental_pass_through/adapters/handler.py index 88a63fc6f5d2..8fa7bb7e65e0 100644 --- a/litellm/llms/anthropic/experimental_pass_through/adapters/handler.py +++ b/litellm/llms/anthropic/experimental_pass_through/adapters/handler.py @@ -45,6 +45,7 @@ def _prepare_completion_kwargs( tools: Optional[List[Dict]] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + output_format: Optional[Dict] = None, extra_kwargs: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """Prepare kwargs for litellm.completion/acompletion""" @@ -76,6 +77,8 @@ def _prepare_completion_kwargs( request_data["top_k"] = top_k if top_p is not None: request_data["top_p"] = top_p + if output_format: + request_data["output_format"] = output_format openai_request = ANTHROPIC_ADAPTER.translate_completion_input_params( request_data @@ -130,6 +133,7 @@ async def async_anthropic_messages_handler( tools: Optional[List[Dict]] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + output_format: Optional[Dict] = None, 
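A minimal usage sketch of the CountTokens flow documented above: POST the model and messages to /v1/messages/count_tokens with the version and token-counting beta headers, then read input_tokens from the JSON response. Plain httpx is used here instead of litellm's wrapped async client, and ANTHROPIC_API_KEY is assumed to be set in the environment.

import asyncio
import os

import httpx


async def count_tokens(model: str, messages: list) -> int:
    headers = {
        "Content-Type": "application/json",
        "x-api-key": os.environ["ANTHROPIC_API_KEY"],
        "anthropic-version": "2023-06-01",
        "anthropic-beta": "token-counting-2024-11-01",
    }
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            "https://api.anthropic.com/v1/messages/count_tokens",
            headers=headers,
            json={"model": model, "messages": messages},
            timeout=30.0,
        )
        resp.raise_for_status()
        return resp.json()["input_tokens"]


# asyncio.run(count_tokens("claude-3-5-sonnet-20241022", [{"role": "user", "content": "Hello!"}]))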
**kwargs, ) -> Union[AnthropicMessagesResponse, AsyncIterator]: """Handle non-Anthropic models asynchronously using the adapter""" @@ -148,36 +152,32 @@ async def async_anthropic_messages_handler( tools=tools, top_k=top_k, top_p=top_p, + output_format=output_format, extra_kwargs=kwargs, ) ) - try: - completion_response = await litellm.acompletion(**completion_kwargs) + completion_response = await litellm.acompletion(**completion_kwargs) - if stream: - transformed_stream = ( - ANTHROPIC_ADAPTER.translate_completion_output_params_streaming( - completion_response, - model=model, - ) + if stream: + transformed_stream = ( + ANTHROPIC_ADAPTER.translate_completion_output_params_streaming( + completion_response, + model=model, ) - if transformed_stream is not None: - return transformed_stream - raise ValueError("Failed to transform streaming response") - else: - anthropic_response = ( - ANTHROPIC_ADAPTER.translate_completion_output_params( - cast(ModelResponse, completion_response) - ) + ) + if transformed_stream is not None: + return transformed_stream + raise ValueError("Failed to transform streaming response") + else: + anthropic_response = ( + ANTHROPIC_ADAPTER.translate_completion_output_params( + cast(ModelResponse, completion_response) ) - if anthropic_response is not None: - return anthropic_response - raise ValueError("Failed to transform response to Anthropic format") - except Exception as e: # noqa: BLE001 - raise ValueError( - f"Error calling litellm.acompletion for non-Anthropic model: {str(e)}" ) + if anthropic_response is not None: + return anthropic_response + raise ValueError("Failed to transform response to Anthropic format") @staticmethod def anthropic_messages_handler( @@ -194,6 +194,7 @@ def anthropic_messages_handler( tools: Optional[List[Dict]] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + output_format: Optional[Dict] = None, _is_async: bool = False, **kwargs, ) -> Union[ @@ -217,6 +218,7 @@ def anthropic_messages_handler( tools=tools, top_k=top_k, top_p=top_p, + output_format=output_format, **kwargs, ) @@ -235,33 +237,29 @@ def anthropic_messages_handler( tools=tools, top_k=top_k, top_p=top_p, + output_format=output_format, extra_kwargs=kwargs, ) ) - try: - completion_response = litellm.completion(**completion_kwargs) + completion_response = litellm.completion(**completion_kwargs) - if stream: - transformed_stream = ( - ANTHROPIC_ADAPTER.translate_completion_output_params_streaming( - completion_response, - model=model, - ) + if stream: + transformed_stream = ( + ANTHROPIC_ADAPTER.translate_completion_output_params_streaming( + completion_response, + model=model, ) - if transformed_stream is not None: - return transformed_stream - raise ValueError("Failed to transform streaming response") - else: - anthropic_response = ( - ANTHROPIC_ADAPTER.translate_completion_output_params( - cast(ModelResponse, completion_response) - ) + ) + if transformed_stream is not None: + return transformed_stream + raise ValueError("Failed to transform streaming response") + else: + anthropic_response = ( + ANTHROPIC_ADAPTER.translate_completion_output_params( + cast(ModelResponse, completion_response) ) - if anthropic_response is not None: - return anthropic_response - raise ValueError("Failed to transform response to Anthropic format") - except Exception as e: # noqa: BLE001 - raise ValueError( - f"Error calling litellm.completion for non-Anthropic model: {str(e)}" ) + if anthropic_response is not None: + return anthropic_response + raise ValueError("Failed to transform 
response to Anthropic format") diff --git a/litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py b/litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py index ecad7a500111..24524233ddfc 100644 --- a/litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py +++ b/litellm/llms/anthropic/experimental_pass_through/adapters/streaming_iterator.py @@ -2,11 +2,11 @@ ## Translates OpenAI call to Anthropic `/v1/messages` format import json import traceback -from litellm._uuid import uuid from collections import deque from typing import TYPE_CHECKING, Any, AsyncIterator, Iterator, Literal, Optional from litellm import verbose_logger +from litellm._uuid import uuid from litellm.types.llms.anthropic import UsageDelta from litellm.types.utils import AdapterCompletionStreamWrapper @@ -48,6 +48,27 @@ def __init__(self, completion_stream: Any, model: str): super().__init__(completion_stream) self.model = model + def _create_initial_usage_delta(self) -> UsageDelta: + """ + Create the initial UsageDelta for the message_start event. + + Initializes cache token fields (cache_creation_input_tokens, cache_read_input_tokens) + to 0 to indicate to clients (like Claude Code) that prompt caching is supported. + + The actual cache token values will be provided in the message_delta event at the + end of the stream, since Bedrock Converse API only returns usage data in the final + response chunk. + + Returns: + UsageDelta with all token counts initialized to 0. + """ + return UsageDelta( + input_tokens=0, + output_tokens=0, + cache_creation_input_tokens=0, + cache_read_input_tokens=0, + ) + def __next__(self): from .transformation import LiteLLMAnthropicMessagesAdapter @@ -64,7 +85,7 @@ def __next__(self): "model": self.model, "stop_reason": None, "stop_sequence": None, - "usage": UsageDelta(input_tokens=0, output_tokens=0), + "usage": self._create_initial_usage_delta(), }, } if self.sent_content_block_start is False: @@ -169,7 +190,7 @@ async def __anext__(self): # noqa: PLR0915 "model": self.model, "stop_reason": None, "stop_sequence": None, - "usage": UsageDelta(input_tokens=0, output_tokens=0), + "usage": self._create_initial_usage_delta(), }, } ) @@ -211,10 +232,16 @@ async def __anext__(self): # noqa: PLR0915 merged_chunk["delta"] = {} # Add usage to the held chunk - merged_chunk["usage"] = { + usage_dict: UsageDelta = { "input_tokens": chunk.usage.prompt_tokens or 0, "output_tokens": chunk.usage.completion_tokens or 0, } + # Add cache tokens if available (for prompt caching support) + if hasattr(chunk.usage, "_cache_creation_input_tokens") and chunk.usage._cache_creation_input_tokens > 0: + usage_dict["cache_creation_input_tokens"] = chunk.usage._cache_creation_input_tokens + if hasattr(chunk.usage, "_cache_read_input_tokens") and chunk.usage._cache_read_input_tokens > 0: + usage_dict["cache_read_input_tokens"] = chunk.usage._cache_read_input_tokens + merged_chunk["usage"] = usage_dict # Queue the merged chunk and reset self.chunk_queue.append(merged_chunk) diff --git a/litellm/llms/anthropic/experimental_pass_through/adapters/transformation.py b/litellm/llms/anthropic/experimental_pass_through/adapters/transformation.py index 922e6626f238..5ba0754b7441 100644 --- a/litellm/llms/anthropic/experimental_pass_through/adapters/transformation.py +++ b/litellm/llms/anthropic/experimental_pass_through/adapters/transformation.py @@ -3,6 +3,7 @@ TYPE_CHECKING, Any, AsyncIterator, + Dict, List, Literal, Optional, @@ -13,6 +14,9 @@ from 
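A sketch of the streaming usage handling above: message_start advertises zeroed cache token fields so clients know prompt caching is supported, and the final usage delta merges the cache counts when the wrapped usage object carries them. Plain dicts stand in for the UsageDelta TypedDict.

from typing import Any, Dict


def initial_usage_delta() -> Dict[str, int]:
    return {
        "input_tokens": 0,
        "output_tokens": 0,
        "cache_creation_input_tokens": 0,
        "cache_read_input_tokens": 0,
    }


def final_usage_delta(usage: Any) -> Dict[str, int]:
    out = {
        "input_tokens": getattr(usage, "prompt_tokens", 0) or 0,
        "output_tokens": getattr(usage, "completion_tokens", 0) or 0,
    }
    # Cache token counts only appear on the final chunk's usage object.
    cache_creation = getattr(usage, "_cache_creation_input_tokens", 0)
    cache_read = getattr(usage, "_cache_read_input_tokens", 0)
    if cache_creation:
        out["cache_creation_input_tokens"] = cache_creation
    if cache_read:
        out["cache_read_input_tokens"] = cache_read
    return out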
openai.types.chat.chat_completion_chunk import Choice as OpenAIStreamingChoice +from litellm.litellm_core_utils.prompt_templates.common_utils import ( + parse_tool_call_arguments, +) from litellm.types.llms.anthropic import ( AllAnthropicToolsValues, AnthopicMessagesAssistantMessageParam, @@ -129,11 +133,76 @@ def __init__(self): ### FOR [BETA] `/v1/messages` endpoint support + def _extract_signature_from_tool_call(self, tool_call: Any) -> Optional[str]: + """ + Extract signature from a tool call's provider_specific_fields. + Only checks provider_specific_fields, not thinking blocks. + """ + signature = None + + if ( + hasattr(tool_call, "provider_specific_fields") + and tool_call.provider_specific_fields + ): + if "thought_signature" in tool_call.provider_specific_fields: + signature = tool_call.provider_specific_fields["thought_signature"] + elif ( + hasattr(tool_call.function, "provider_specific_fields") + and tool_call.function.provider_specific_fields + ): + if "thought_signature" in tool_call.function.provider_specific_fields: + signature = tool_call.function.provider_specific_fields[ + "thought_signature" + ] + + return signature + + def _extract_signature_from_tool_use_content( + self, content: Dict[str, Any] + ) -> Optional[str]: + """ + Extract signature from a tool_use content block's provider_specific_fields. + """ + provider_specific_fields = content.get("provider_specific_fields", {}) + if provider_specific_fields: + return provider_specific_fields.get("signature") + return None + + def _add_cache_control_if_applicable( + self, + source: Any, + target: Any, + model: Optional[str], + ) -> None: + """ + Extract cache_control from source and add to target if it should be preserved. + + This method accepts Any type to support both regular dicts and TypedDict objects. + TypedDict objects (like ChatCompletionTextObject, ChatCompletionImageObject, etc.) + are dicts at runtime but have specific types at type-check time. Using Any allows + this method to work with both while maintaining runtime correctness. + + Args: + source: Dict or TypedDict containing potential cache_control field + target: Dict or TypedDict to add cache_control to + model: Model name to check if cache_control should be preserved + """ + # TypedDict objects are dicts at runtime, so .get() works + cache_control = source.get("cache_control") if isinstance(source, dict) else getattr(source, "cache_control", None) + if cache_control and model and self.is_anthropic_claude_model(model): + # TypedDict objects support dict operations at runtime + # Use type ignore consistent with codebase pattern (see anthropic/chat/transformation.py:432) + if isinstance(target, dict): + target["cache_control"] = cache_control # type: ignore[typeddict-item] + else: + # Fallback for non-dict objects (shouldn't happen in practice) + cast(Dict[str, Any], target)["cache_control"] = cache_control + def translatable_anthropic_params(self) -> List: """ Which anthropic params, we need to translate to the openai format. 
""" - return ["messages", "metadata", "system", "tool_choice", "tools"] + return ["messages", "metadata", "system", "tool_choice", "tools", "thinking", "output_format"] def translate_anthropic_messages_to_openai( # noqa: PLR0915 self, @@ -143,6 +212,7 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 AnthopicMessagesAssistantMessageParam, ] ], + model: Optional[str] = None, ) -> List: new_messages: List[AllMessageValues] = [] for m in messages: @@ -165,16 +235,40 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 text_obj = ChatCompletionTextObject( type="text", text=content.get("text", "") ) - new_user_content_list.append(text_obj) + self._add_cache_control_if_applicable(content, text_obj, model) + new_user_content_list.append(text_obj) # type: ignore elif content.get("type") == "image": - image_url = ChatCompletionImageUrlObject( - url=f"data:{content.get('type', '')};base64,{content.get('source', '')}" + # Convert Anthropic image format to OpenAI format + source = content.get("source", {}) + openai_image_url = ( + self._translate_anthropic_image_to_openai(cast(dict, source)) ) - image_obj = ChatCompletionImageObject( - type="image_url", image_url=image_url + + if openai_image_url: + image_url_obj = ChatCompletionImageUrlObject( + url=openai_image_url + ) + image_obj = ChatCompletionImageObject( + type="image_url", image_url=image_url_obj + ) + self._add_cache_control_if_applicable(content, image_obj, model) + new_user_content_list.append(image_obj) # type: ignore + elif content.get("type") == "document": + # Convert Anthropic document format (PDF, etc.) to OpenAI format + source = content.get("source", {}) + openai_image_url = ( + self._translate_anthropic_image_to_openai(cast(dict, source)) ) - new_user_content_list.append(image_obj) + if openai_image_url: + image_url_obj = ChatCompletionImageUrlObject( + url=openai_image_url + ) + doc_obj = ChatCompletionImageObject( + type="image_url", image_url=image_url_obj + ) + self._add_cache_control_if_applicable(content, doc_obj, model) + new_user_content_list.append(doc_obj) # type: ignore elif content.get("type") == "tool_result": if "content" not in content: tool_result = ChatCompletionToolMessage( @@ -182,23 +276,33 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 tool_call_id=content.get("tool_use_id", ""), content="", ) - tool_message_list.append(tool_result) + self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] elif isinstance(content.get("content"), str): tool_result = ChatCompletionToolMessage( role="tool", tool_call_id=content.get("tool_use_id", ""), content=str(content.get("content", "")), ) - tool_message_list.append(tool_result) + self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] elif isinstance(content.get("content"), list): - for c in content.get("content", []): + # Combine all content items into a single tool message + # to avoid creating multiple tool_result blocks with the same ID + # (each tool_use must have exactly one tool_result) + content_items = list(content.get("content", [])) + + # For single-item content, maintain backward compatibility with string/url format + if len(content_items) == 1: + c = content_items[0] if isinstance(c, str): tool_result = ChatCompletionToolMessage( role="tool", tool_call_id=content.get("tool_use_id", ""), content=c, ) - tool_message_list.append(tool_result) + 
self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] elif isinstance(c, dict): if c.get("type") == "text": tool_result = ChatCompletionToolMessage( @@ -208,17 +312,75 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 ), content=c.get("text", ""), ) - tool_message_list.append(tool_result) + self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] elif c.get("type") == "image": - image_str = f"data:{c.get('type', '')};base64,{c.get('source', '')}" + source = c.get("source", {}) + openai_image_url = ( + self._translate_anthropic_image_to_openai( + cast(dict, source) + ) + or "" + ) tool_result = ChatCompletionToolMessage( role="tool", tool_call_id=content.get( "tool_use_id", "" ), - content=image_str, + content=openai_image_url, + ) + self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] + else: + # For multiple content items, combine into a single tool message + # with list content to preserve all items while having one tool_use_id + combined_content_parts: List[ + Union[ + ChatCompletionTextObject, + ChatCompletionImageObject, + ] + ] = [] + for c in content_items: + if isinstance(c, str): + combined_content_parts.append( + ChatCompletionTextObject( + type="text", text=c + ) ) - tool_message_list.append(tool_result) + elif isinstance(c, dict): + if c.get("type") == "text": + combined_content_parts.append( + ChatCompletionTextObject( + type="text", + text=c.get("text", ""), + ) + ) + elif c.get("type") == "image": + source = c.get("source", {}) + openai_image_url = ( + self._translate_anthropic_image_to_openai( + cast(dict, source) + ) + or "" + ) + if openai_image_url: + combined_content_parts.append( + ChatCompletionImageObject( + type="image_url", + image_url=ChatCompletionImageUrlObject( + url=openai_image_url + ), + ) + ) + # Create a single tool message with combined content + if combined_content_parts: + tool_result = ChatCompletionToolMessage( + role="tool", + tool_call_id=content.get("tool_use_id", ""), + content=combined_content_parts, # type: ignore + ) + self._add_cache_control_if_applicable(content, tool_result, model) + tool_message_list.append(tool_result) # type: ignore[arg-type] if len(tool_message_list) > 0: new_messages.extend(tool_message_list) @@ -231,8 +393,12 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 ## ASSISTANT MESSAGE ## assistant_message_str: Optional[str] = None + assistant_content_list: List[Dict[str, Any]] = [] # For content blocks with cache_control + has_cache_control_in_text = False tool_calls: List[ChatCompletionAssistantToolCall] = [] - thinking_blocks: List[Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock]] = [] + thinking_blocks: List[ + Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock] + ] = [] if m["role"] == "assistant": if isinstance(m.get("content"), str): assistant_message_str = str(m.get("content", "")) @@ -242,54 +408,177 @@ def translate_anthropic_messages_to_openai( # noqa: PLR0915 assistant_message_str = str(content) elif isinstance(content, dict): if content.get("type") == "text": - if assistant_message_str is None: - assistant_message_str = content.get("text", "") - else: - assistant_message_str += content.get("text", "") + text_block: Dict[str, Any] = { + "type": "text", + "text": content.get("text", ""), + } + 
self._add_cache_control_if_applicable(content, text_block, model) + if "cache_control" in text_block: + has_cache_control_in_text = True + assistant_content_list.append(text_block) elif content.get("type") == "tool_use": - function_chunk = ChatCompletionToolCallFunctionChunk( - name=content.get("name", ""), - arguments=json.dumps(content.get("input", {})), + function_chunk: ChatCompletionToolCallFunctionChunk = { + "name": content.get("name", ""), + "arguments": json.dumps(content.get("input", {})), + } + signature = ( + self._extract_signature_from_tool_use_content( + cast(Dict[str, Any], content) + ) ) - tool_calls.append( - ChatCompletionAssistantToolCall( - id=content.get("id", ""), - type="function", - function=function_chunk, + if signature: + provider_specific_fields: Dict[str, Any] = ( + function_chunk.get("provider_specific_fields") + or {} ) + provider_specific_fields["thought_signature"] = ( + signature + ) + function_chunk["provider_specific_fields"] = ( + provider_specific_fields + ) + + tool_call = ChatCompletionAssistantToolCall( + id=content.get("id", ""), + type="function", + function=function_chunk, ) + self._add_cache_control_if_applicable(content, tool_call, model) + tool_calls.append(tool_call) elif content.get("type") == "thinking": thinking_block = ChatCompletionThinkingBlock( type="thinking", thinking=content.get("thinking") or "", signature=content.get("signature") or "", - cache_control=content.get("cache_control", {}) + cache_control=content.get("cache_control", {}), ) thinking_blocks.append(thinking_block) elif content.get("type") == "redacted_thinking": - redacted_thinking_block = ChatCompletionRedactedThinkingBlock( - type="redacted_thinking", - data=content.get("data") or "", - cache_control=content.get("cache_control", {}) + redacted_thinking_block = ( + ChatCompletionRedactedThinkingBlock( + type="redacted_thinking", + data=content.get("data") or "", + cache_control=content.get("cache_control", {}), + ) ) thinking_blocks.append(redacted_thinking_block) + if ( + assistant_message_str is not None + or len(assistant_content_list) > 0 + or len(tool_calls) > 0 + or len(thinking_blocks) > 0 + ): + # Use list format if any text block has cache_control, otherwise use string + if has_cache_control_in_text and len(assistant_content_list) > 0: + assistant_content: Any = assistant_content_list + elif len(assistant_content_list) > 0 and not has_cache_control_in_text: + # Concatenate text blocks into string when no cache_control + assistant_content = "".join( + block.get("text", "") for block in assistant_content_list + ) + else: + assistant_content = assistant_message_str - if assistant_message_str is not None or len(tool_calls) > 0 or len(thinking_blocks) > 0: assistant_message = ChatCompletionAssistantMessage( role="assistant", - content=assistant_message_str, - thinking_blocks=thinking_blocks if len(thinking_blocks) > 0 else None, + content=assistant_content, + thinking_blocks=( + thinking_blocks if len(thinking_blocks) > 0 else None + ), ) if len(tool_calls) > 0: - assistant_message["tool_calls"] = tool_calls + assistant_message["tool_calls"] = tool_calls # type: ignore if len(thinking_blocks) > 0: assistant_message["thinking_blocks"] = thinking_blocks # type: ignore new_messages.append(assistant_message) return new_messages + @staticmethod + def translate_anthropic_thinking_to_reasoning_effort( + thinking: Dict[str, Any] + ) -> Optional[str]: + """ + Translate Anthropic's thinking parameter to OpenAI's reasoning_effort. 
+ + Anthropic thinking format: {'type': 'enabled'|'disabled', 'budget_tokens': int} + OpenAI reasoning_effort: 'none' | 'minimal' | 'low' | 'medium' | 'high' | 'xhigh' | 'default' + + Mapping: + - budget_tokens >= 10000 -> 'high' + - budget_tokens >= 5000 -> 'medium' + - budget_tokens >= 2000 -> 'low' + - budget_tokens < 2000 -> 'minimal' + """ + if not isinstance(thinking, dict): + return None + + thinking_type = thinking.get("type", "disabled") + + if thinking_type == "disabled": + return None + elif thinking_type == "enabled": + budget_tokens = thinking.get("budget_tokens", 0) + if budget_tokens >= 10000: + return "high" + elif budget_tokens >= 5000: + return "medium" + elif budget_tokens >= 2000: + return "low" + else: + return "minimal" + + return None + + @staticmethod + def is_anthropic_claude_model(model: str) -> bool: + """ + Check if the model is an Anthropic Claude model that supports the thinking parameter. + + Returns True for: + - anthropic/* models + - bedrock/*anthropic* models (including converse) + - vertex_ai/*claude* models + """ + model_lower = model.lower() + return ( + "anthropic" in model_lower + or "claude" in model_lower + ) + + @staticmethod + def translate_thinking_for_model( + thinking: Dict[str, Any], + model: str, + ) -> Dict[str, Any]: + """ + Translate Anthropic thinking parameter based on the target model. + + For Claude/Anthropic models: returns {'thinking': } + - Preserves exact budget_tokens value + + For non-Claude models: returns {'reasoning_effort': } + - Converts thinking to reasoning_effort to avoid UnsupportedParamsError + + Args: + thinking: Anthropic thinking dict with 'type' and 'budget_tokens' + model: The target model name + + Returns: + Dict with either 'thinking' or 'reasoning_effort' key + """ + if LiteLLMAnthropicMessagesAdapter.is_anthropic_claude_model(model): + return {"thinking": thinking} + else: + reasoning_effort = LiteLLMAnthropicMessagesAdapter.translate_anthropic_thinking_to_reasoning_effort( + thinking + ) + if reasoning_effort: + return {"reasoning_effort": reasoning_effort} + return {} + def translate_anthropic_tool_choice_to_openai( self, tool_choice: AnthropicMessagesToolChoice ) -> ChatCompletionToolChoiceValues: @@ -310,10 +599,10 @@ def translate_anthropic_tool_choice_to_openai( ) def translate_anthropic_tools_to_openai( - self, tools: List[AllAnthropicToolsValues] + self, tools: List[AllAnthropicToolsValues], model: Optional[str] = None ) -> List[ChatCompletionToolParam]: new_tools: List[ChatCompletionToolParam] = [] - mapped_tool_params = ["name", "input_schema", "description"] + mapped_tool_params = ["name", "input_schema", "description", "cache_control"] for tool in tools: function_chunk = ChatCompletionToolParamFunctionChunk( name=tool["name"], @@ -326,11 +615,82 @@ def translate_anthropic_tools_to_openai( for k, v in tool.items(): if k not in mapped_tool_params: # pass additional computer kwargs function_chunk.setdefault("parameters", {}).update({k: v}) - new_tools.append( - ChatCompletionToolParam(type="function", function=function_chunk) - ) + tool_param = ChatCompletionToolParam(type="function", function=function_chunk) + self._add_cache_control_if_applicable(tool, tool_param, model) + new_tools.append(tool_param) # type: ignore[arg-type] - return new_tools + return new_tools # type: ignore[return-value] + + def translate_anthropic_output_format_to_openai( + self, output_format: Any + ) -> Optional[Dict[str, Any]]: + """ + Translate Anthropic's output_format to OpenAI's response_format. 
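As a quick illustration of the budget_tokens mapping implemented by `translate_anthropic_thinking_to_reasoning_effort` above, a minimal standalone sketch — the helper below is illustrative only, not part of the library API:

```python
from typing import Any, Dict, Optional


def thinking_to_reasoning_effort(thinking: Dict[str, Any]) -> Optional[str]:
    """Illustrative re-implementation of the budget_tokens -> reasoning_effort mapping."""
    if thinking.get("type") != "enabled":
        return None  # disabled (or missing) thinking maps to no reasoning_effort
    budget = thinking.get("budget_tokens", 0)
    if budget >= 10000:
        return "high"
    if budget >= 5000:
        return "medium"
    if budget >= 2000:
        return "low"
    return "minimal"


# Examples of the thresholds documented in the method above
assert thinking_to_reasoning_effort({"type": "enabled", "budget_tokens": 16000}) == "high"
assert thinking_to_reasoning_effort({"type": "enabled", "budget_tokens": 6000}) == "medium"
assert thinking_to_reasoning_effort({"type": "enabled", "budget_tokens": 2048}) == "low"
assert thinking_to_reasoning_effort({"type": "enabled", "budget_tokens": 500}) == "minimal"
assert thinking_to_reasoning_effort({"type": "disabled"}) is None
```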
+ + Anthropic output_format: {"type": "json_schema", "schema": {...}} + OpenAI response_format: {"type": "json_schema", "json_schema": {"name": "...", "schema": {...}}} + + Args: + output_format: Anthropic output_format dict with 'type' and 'schema' + + Returns: + OpenAI-compatible response_format dict, or None if invalid + """ + if not isinstance(output_format, dict): + return None + + output_type = output_format.get("type") + if output_type != "json_schema": + return None + + schema = output_format.get("schema") + if not schema: + return None + + # Convert to OpenAI response_format structure + return { + "type": "json_schema", + "json_schema": { + "name": "structured_output", + "schema": schema, + "strict": True, + }, + } + + def _add_system_message_to_messages( + self, + new_messages: List[AllMessageValues], + anthropic_message_request: AnthropicMessagesRequest, + ) -> None: + """Add system message to messages list if present in request.""" + if "system" not in anthropic_message_request: + return + system_content = anthropic_message_request["system"] + if not system_content: + return + # Handle system as string or array of content blocks + if isinstance(system_content, str): + new_messages.insert( + 0, + ChatCompletionSystemMessage(role="system", content=system_content), + ) + elif isinstance(system_content, list): + # Convert Anthropic system content blocks to OpenAI format + openai_system_content: List[Dict[str, Any]] = [] + model_name = anthropic_message_request.get("model", "") + for block in system_content: + if isinstance(block, dict) and block.get("type") == "text": + text_block: Dict[str, Any] = { + "type": "text", + "text": block.get("text", ""), + } + self._add_cache_control_if_applicable(block, text_block, model_name) + openai_system_content.append(text_block) + if openai_system_content: + new_messages.insert( + 0, + ChatCompletionSystemMessage(role="system", content=openai_system_content), # type: ignore + ) def translate_anthropic_to_openai( self, anthropic_message_request: AnthropicMessagesRequest @@ -356,16 +716,11 @@ def translate_anthropic_to_openai( anthropic_message_request["messages"], ) new_messages = self.translate_anthropic_messages_to_openai( - messages=messages_list + messages=messages_list, + model=anthropic_message_request.get("model"), ) ## ADD SYSTEM MESSAGE TO MESSAGES - if "system" in anthropic_message_request: - system_content = anthropic_message_request["system"] - if system_content: - new_messages.insert( - 0, - ChatCompletionSystemMessage(role="system", content=system_content), - ) + self._add_system_message_to_messages(new_messages, anthropic_message_request) new_kwargs: ChatCompletionRequest = { "model": anthropic_message_request["model"], @@ -396,9 +751,34 @@ def translate_anthropic_to_openai( tools = anthropic_message_request["tools"] if tools: new_kwargs["tools"] = self.translate_anthropic_tools_to_openai( - tools=cast(List[AllAnthropicToolsValues], tools) + tools=cast(List[AllAnthropicToolsValues], tools), + model=new_kwargs.get("model"), ) + ## CONVERT THINKING + if "thinking" in anthropic_message_request: + thinking = anthropic_message_request["thinking"] + if thinking: + model = new_kwargs.get("model", "") + if self.is_anthropic_claude_model(model): + new_kwargs["thinking"] = thinking # type: ignore + else: + reasoning_effort = self.translate_anthropic_thinking_to_reasoning_effort( + cast(Dict[str, Any], thinking) + ) + if reasoning_effort: + new_kwargs["reasoning_effort"] = reasoning_effort + + ## CONVERT OUTPUT_FORMAT to RESPONSE_FORMAT + if 
"output_format" in anthropic_message_request: + output_format = anthropic_message_request["output_format"] + if output_format: + response_format = self.translate_anthropic_output_format_to_openai( + output_format=output_format + ) + if response_format: + new_kwargs["response_format"] = response_format + translatable_params = self.translatable_anthropic_params() for k, v in anthropic_message_request.items(): if k not in translatable_params: # pass remaining params as is @@ -406,19 +786,55 @@ def translate_anthropic_to_openai( return new_kwargs - def _translate_openai_content_to_anthropic( - self, choices: List[Choices] - ) -> List[ - Union[AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse, AnthropicResponseContentBlockThinking, AnthropicResponseContentBlockRedactedThinking] + def _translate_anthropic_image_to_openai(self, image_source: dict) -> Optional[str]: + """ + Translate Anthropic image source format to OpenAI-compatible image URL. + + Anthropic supports two image source formats: + 1. Base64: {"type": "base64", "media_type": "image/jpeg", "data": "..."} + 2. URL: {"type": "url", "url": "https://..."} + + Returns the properly formatted image URL string, or None if invalid format. + """ + if not isinstance(image_source, dict): + return None + + source_type = image_source.get("type") + + if source_type == "base64": + # Base64 image format + media_type = image_source.get("media_type", "image/jpeg") + image_data = image_source.get("data", "") + if image_data: + return f"data:{media_type};base64,{image_data}" + elif source_type == "url": + # URL-referenced image format + return image_source.get("url", "") + + return None + + def _translate_openai_content_to_anthropic(self, choices: List[Choices]) -> List[ + Union[ + AnthropicResponseContentBlockText, + AnthropicResponseContentBlockToolUse, + AnthropicResponseContentBlockThinking, + AnthropicResponseContentBlockRedactedThinking, + ] ]: new_content: List[ Union[ - AnthropicResponseContentBlockText, AnthropicResponseContentBlockToolUse, AnthropicResponseContentBlockThinking, AnthropicResponseContentBlockRedactedThinking + AnthropicResponseContentBlockText, + AnthropicResponseContentBlockToolUse, + AnthropicResponseContentBlockThinking, + AnthropicResponseContentBlockRedactedThinking, ] ] = [] for choice in choices: # Handle thinking blocks first - if hasattr(choice.message, 'thinking_blocks') and choice.message.thinking_blocks: + if ( + hasattr(choice.message, "thinking_blocks") + and choice.message.thinking_blocks + ): for thinking_block in choice.message.thinking_blocks: if thinking_block.get("type") == "thinking": thinking_value = thinking_block.get("thinking", "") @@ -426,8 +842,16 @@ def _translate_openai_content_to_anthropic( new_content.append( AnthropicResponseContentBlockThinking( type="thinking", - thinking=str(thinking_value) if thinking_value is not None else "", - signature=str(signature_value) if signature_value is not None else None, + thinking=( + str(thinking_value) + if thinking_value is not None + else "" + ), + signature=( + str(signature_value) + if signature_value is not None + else None + ), ) ) elif thinking_block.get("type") == "redacted_thinking": @@ -438,28 +862,43 @@ def _translate_openai_content_to_anthropic( data=str(data_value) if data_value is not None else "", ) ) - - # Handle tool calls - if ( - choice.message.tool_calls is not None - and len(choice.message.tool_calls) > 0 - ): - for tool_call in choice.message.tool_calls: - new_content.append( - AnthropicResponseContentBlockToolUse( - 
type="tool_use", - id=tool_call.id, - name=tool_call.function.name or "", - input=json.loads(tool_call.function.arguments) if tool_call.function.arguments else {}, - ) - ) + # Handle text content - elif choice.message.content is not None: + if choice.message.content is not None: new_content.append( AnthropicResponseContentBlockText( type="text", text=choice.message.content ) ) + # Handle tool calls (in parallel to text content) + if ( + choice.message.tool_calls is not None + and len(choice.message.tool_calls) > 0 + ): + for tool_call in choice.message.tool_calls: + # Extract signature from provider_specific_fields only + signature = self._extract_signature_from_tool_call(tool_call) + + provider_specific_fields = {} + if signature: + provider_specific_fields["signature"] = signature + + tool_use_block = AnthropicResponseContentBlockToolUse( + type="tool_use", + id=tool_call.id, + name=tool_call.function.name or "", + input=parse_tool_call_arguments( + tool_call.function.arguments, + tool_name=tool_call.function.name, + context="Anthropic pass-through adapter", + ), + ) + # Add provider_specific_fields if signature is present + if provider_specific_fields: + tool_use_block.provider_specific_fields = ( + provider_specific_fields + ) + new_content.append(tool_use_block) return new_content @@ -489,13 +928,19 @@ def translate_openai_response_to_anthropic( input_tokens=usage.prompt_tokens or 0, output_tokens=usage.completion_tokens or 0, ) + # Add cache tokens if available (for prompt caching support) + if hasattr(usage, "_cache_creation_input_tokens") and usage._cache_creation_input_tokens > 0: + anthropic_usage["cache_creation_input_tokens"] = usage._cache_creation_input_tokens + if hasattr(usage, "_cache_read_input_tokens") and usage._cache_read_input_tokens > 0: + anthropic_usage["cache_read_input_tokens"] = usage._cache_read_input_tokens + translated_obj = AnthropicMessagesResponse( id=response.id, type="message", role="assistant", model=response.model or "unknown-model", stop_sequence=None, - usage=anthropic_usage, + usage=anthropic_usage, # type: ignore content=anthropic_content, # type: ignore stop_reason=anthropic_finish_reason, ) @@ -512,9 +957,7 @@ def _translate_streaming_openai_chunk_to_anthropic_content_block( from litellm.types.llms.anthropic import TextBlock, ToolUseBlock for choice in choices: - if choice.delta.content is not None and len(choice.delta.content) > 0: - return "text", TextBlock(type="text", text="") - elif ( + if ( choice.delta.tool_calls is not None and len(choice.delta.tool_calls) > 0 and choice.delta.tool_calls[0].function is not None @@ -523,10 +966,12 @@ def _translate_streaming_openai_chunk_to_anthropic_content_block( type="tool_use", id=choice.delta.tool_calls[0].id or str(uuid.uuid4()), name=choice.delta.tool_calls[0].function.name or "", - input={}, + input={}, # type: ignore[typeddict-item] ) - elif ( - isinstance(choice, StreamingChoices) and hasattr(choice.delta, "thinking_blocks") + elif choice.delta.content is not None and len(choice.delta.content) > 0: + return "text", TextBlock(type="text", text="") + elif isinstance(choice, StreamingChoices) and hasattr( + choice.delta, "thinking_blocks" ): thinking_blocks = choice.delta.thinking_blocks or [] if len(thinking_blocks) > 0: @@ -539,22 +984,26 @@ def _translate_streaming_openai_chunk_to_anthropic_content_block( assert isinstance(signature, str) if thinking and signature: - raise ValueError("Both `thinking` and `signature` in a single streaming chunk isn't supported.") + raise ValueError( + "Both 
`thinking` and `signature` in a single streaming chunk isn't supported." + ) return "thinking", ChatCompletionThinkingBlock( - type="thinking", - thinking=thinking, - signature=signature + type="thinking", thinking=thinking, signature=signature ) - return "text", TextBlock(type="text", text="") def _translate_streaming_openai_chunk_to_anthropic( self, choices: List[Union[OpenAIStreamingChoice, StreamingChoices]] ) -> Tuple[ Literal["text_delta", "input_json_delta", "thinking_delta", "signature_delta"], - Union[ContentTextBlockDelta, ContentJsonBlockDelta, ContentThinkingBlockDelta, ContentThinkingSignatureBlockDelta], + Union[ + ContentTextBlockDelta, + ContentJsonBlockDelta, + ContentThinkingBlockDelta, + ContentThinkingSignatureBlockDelta, + ], ]: text: str = "" @@ -564,7 +1013,7 @@ def _translate_streaming_openai_chunk_to_anthropic( for choice in choices: if choice.delta.content is not None and len(choice.delta.content) > 0: text += choice.delta.content - elif choice.delta.tool_calls is not None: + if choice.delta.tool_calls is not None: partial_json = "" for tool in choice.delta.tool_calls: if ( @@ -572,7 +1021,9 @@ def _translate_streaming_openai_chunk_to_anthropic( and tool.function.arguments is not None ): partial_json = (partial_json or "") + tool.function.arguments - elif isinstance(choice, StreamingChoices) and hasattr(choice.delta, "thinking_blocks"): + elif isinstance(choice, StreamingChoices) and hasattr( + choice.delta, "thinking_blocks" + ): thinking_blocks = choice.delta.thinking_blocks or [] if len(thinking_blocks) > 0: for thinking_block in thinking_blocks: @@ -585,19 +1036,24 @@ def _translate_streaming_openai_chunk_to_anthropic( reasoning_content += thinking reasoning_signature += signature - - if reasoning_content and reasoning_signature: - raise ValueError("Both `reasoning` and `signature` in a single streaming chunk isn't supported.") + if reasoning_content and reasoning_signature: + raise ValueError( + "Both `reasoning` and `signature` in a single streaming chunk isn't supported." 
+ ) if partial_json is not None: return "input_json_delta", ContentJsonBlockDelta( type="input_json_delta", partial_json=partial_json ) elif reasoning_content: - return "thinking_delta", ContentThinkingBlockDelta(type="thinking_delta", thinking=reasoning_content) + return "thinking_delta", ContentThinkingBlockDelta( + type="thinking_delta", thinking=reasoning_content + ) elif reasoning_signature: - return "signature_delta", ContentThinkingSignatureBlockDelta(type="signature_delta", signature=reasoning_signature) + return "signature_delta", ContentThinkingSignatureBlockDelta( + type="signature_delta", signature=reasoning_signature + ) else: return "text_delta", ContentTextBlockDelta(type="text_delta", text=text) @@ -625,10 +1081,15 @@ def translate_streaming_openai_response_to_anthropic( input_tokens=litellm_usage_chunk.prompt_tokens or 0, output_tokens=litellm_usage_chunk.completion_tokens or 0, ) + # Add cache tokens if available (for prompt caching support) + if hasattr(litellm_usage_chunk, "_cache_creation_input_tokens") and litellm_usage_chunk._cache_creation_input_tokens > 0: + usage_delta["cache_creation_input_tokens"] = litellm_usage_chunk._cache_creation_input_tokens + if hasattr(litellm_usage_chunk, "_cache_read_input_tokens") and litellm_usage_chunk._cache_read_input_tokens > 0: + usage_delta["cache_read_input_tokens"] = litellm_usage_chunk._cache_read_input_tokens else: usage_delta = UsageDelta(input_tokens=0, output_tokens=0) return MessageBlockDelta( - type="message_delta", delta=delta, usage=usage_delta + type="message_delta", delta=delta, usage=usage_delta # type: ignore ) ( type_of_content, diff --git a/litellm/llms/anthropic/experimental_pass_through/architecture.md b/litellm/llms/anthropic/experimental_pass_through/architecture.md new file mode 100644 index 000000000000..b939723513e4 --- /dev/null +++ b/litellm/llms/anthropic/experimental_pass_through/architecture.md @@ -0,0 +1,51 @@ +# Anthropic Messages Pass-Through Architecture + +## Request Flow + +```mermaid +flowchart TD + A[litellm.anthropic.messages.acreate] --> B{Provider?} + + B -->|anthropic| C[AnthropicMessagesConfig] + B -->|azure_ai| D[AzureAnthropicMessagesConfig] + B -->|bedrock invoke| E[BedrockAnthropicMessagesConfig] + B -->|vertex_ai| F[VertexAnthropicMessagesConfig] + B -->|Other providers| G[LiteLLMAnthropicMessagesAdapter] + + C --> H[Direct Anthropic API] + D --> I[Azure AI Foundry API] + E --> J[Bedrock Invoke API] + F --> K[Vertex AI API] + + G --> L[translate_anthropic_to_openai] + L --> M[litellm.completion] + M --> N[Provider API] + N --> O[translate_openai_response_to_anthropic] + O --> P[Anthropic Response Format] + + H --> P + I --> P + J --> P + K --> P +``` + +## Adapter Flow (Non-Native Providers) + +```mermaid +sequenceDiagram + participant User + participant Handler as anthropic_messages_handler + participant Adapter as LiteLLMAnthropicMessagesAdapter + participant LiteLLM as litellm.completion + participant Provider as Provider API + + User->>Handler: Anthropic Messages Request + Handler->>Adapter: translate_anthropic_to_openai() + Note over Adapter: messages, tools, thinking,
output_format → response_format + Adapter->>LiteLLM: OpenAI Format Request + LiteLLM->>Provider: Provider-specific Request + Provider->>LiteLLM: Provider Response + LiteLLM->>Adapter: OpenAI Format Response + Adapter->>Handler: translate_openai_response_to_anthropic() + Handler->>User: Anthropic Messages Response +``` diff --git a/litellm/llms/anthropic/experimental_pass_through/messages/fake_stream_iterator.py b/litellm/llms/anthropic/experimental_pass_through/messages/fake_stream_iterator.py new file mode 100644 index 000000000000..542ae20b602e --- /dev/null +++ b/litellm/llms/anthropic/experimental_pass_through/messages/fake_stream_iterator.py @@ -0,0 +1,246 @@ +""" +Fake Streaming Iterator for Anthropic Messages + +This module provides a fake streaming iterator that converts non-streaming +Anthropic Messages responses into proper streaming format. + +Used when WebSearch interception converts stream=True to stream=False but +the LLM doesn't make a tool call, and we need to return a stream to the user. +""" + +import json +from typing import Any, Dict, List, cast + +from litellm.types.llms.anthropic_messages.anthropic_response import ( + AnthropicMessagesResponse, +) + + +class FakeAnthropicMessagesStreamIterator: + """ + Fake streaming iterator for Anthropic Messages responses. + + Used when we need to convert a non-streaming response to a streaming format, + such as when WebSearch interception converts stream=True to stream=False but + the LLM doesn't make a tool call. + + This creates a proper Anthropic-style streaming response with multiple events: + - message_start + - content_block_start (for each content block) + - content_block_delta (for text content, chunked) + - content_block_stop + - message_delta (for usage) + - message_stop + """ + + def __init__(self, response: AnthropicMessagesResponse): + self.response = response + self.chunks = self._create_streaming_chunks() + self.current_index = 0 + + def _create_streaming_chunks(self) -> List[bytes]: + """Convert the non-streaming response to streaming chunks""" + chunks = [] + + # Cast response to dict for easier access + response_dict = cast(Dict[str, Any], self.response) + + # 1. message_start event + usage = response_dict.get("usage", {}) + message_start = { + "type": "message_start", + "message": { + "id": response_dict.get("id"), + "type": "message", + "role": response_dict.get("role", "assistant"), + "model": response_dict.get("model"), + "content": [], + "stop_reason": None, + "stop_sequence": None, + "usage": { + "input_tokens": usage.get("input_tokens", 0) if usage else 0, + "output_tokens": 0 + } + } + } + chunks.append(f"event: message_start\ndata: {json.dumps(message_start)}\n\n".encode()) + + # 2-4. 
For each content block, send start/delta/stop events + content_blocks = response_dict.get("content", []) + if content_blocks: + for index, block in enumerate(content_blocks): + # Cast block to dict for easier access + block_dict = cast(Dict[str, Any], block) + block_type = block_dict.get("type") + + if block_type == "text": + # content_block_start + content_block_start = { + "type": "content_block_start", + "index": index, + "content_block": { + "type": "text", + "text": "" + } + } + chunks.append(f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n".encode()) + + # content_block_delta (send full text as one delta for simplicity) + text = block_dict.get("text", "") + content_block_delta = { + "type": "content_block_delta", + "index": index, + "delta": { + "type": "text_delta", + "text": text + } + } + chunks.append(f"event: content_block_delta\ndata: {json.dumps(content_block_delta)}\n\n".encode()) + + # content_block_stop + content_block_stop = { + "type": "content_block_stop", + "index": index + } + chunks.append(f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n".encode()) + + elif block_type == "thinking": + # content_block_start for thinking + content_block_start = { + "type": "content_block_start", + "index": index, + "content_block": { + "type": "thinking", + "thinking": "", + "signature": "" + } + } + chunks.append(f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n".encode()) + + # content_block_delta for thinking text + thinking_text = block_dict.get("thinking", "") + if thinking_text: + content_block_delta = { + "type": "content_block_delta", + "index": index, + "delta": { + "type": "thinking_delta", + "thinking": thinking_text + } + } + chunks.append(f"event: content_block_delta\ndata: {json.dumps(content_block_delta)}\n\n".encode()) + + # content_block_delta for signature (if present) + signature = block_dict.get("signature", "") + if signature: + signature_delta = { + "type": "content_block_delta", + "index": index, + "delta": { + "type": "signature_delta", + "signature": signature + } + } + chunks.append(f"event: content_block_delta\ndata: {json.dumps(signature_delta)}\n\n".encode()) + + # content_block_stop + content_block_stop = { + "type": "content_block_stop", + "index": index + } + chunks.append(f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n".encode()) + + elif block_type == "redacted_thinking": + # content_block_start for redacted_thinking + content_block_start = { + "type": "content_block_start", + "index": index, + "content_block": { + "type": "redacted_thinking" + } + } + chunks.append(f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n".encode()) + + # content_block_stop (no delta for redacted thinking) + content_block_stop = { + "type": "content_block_stop", + "index": index + } + chunks.append(f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n".encode()) + + elif block_type == "tool_use": + # content_block_start + content_block_start = { + "type": "content_block_start", + "index": index, + "content_block": { + "type": "tool_use", + "id": block_dict.get("id"), + "name": block_dict.get("name"), + "input": {} + } + } + chunks.append(f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n".encode()) + + # content_block_delta (send input as JSON delta) + input_data = block_dict.get("input", {}) + content_block_delta = { + "type": "content_block_delta", + "index": index, + "delta": { + "type": "input_json_delta", + 
"partial_json": json.dumps(input_data) + } + } + chunks.append(f"event: content_block_delta\ndata: {json.dumps(content_block_delta)}\n\n".encode()) + + # content_block_stop + content_block_stop = { + "type": "content_block_stop", + "index": index + } + chunks.append(f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n".encode()) + + # 5. message_delta event (with final usage and stop_reason) + message_delta = { + "type": "message_delta", + "delta": { + "stop_reason": response_dict.get("stop_reason"), + "stop_sequence": response_dict.get("stop_sequence") + }, + "usage": { + "output_tokens": usage.get("output_tokens", 0) if usage else 0 + } + } + chunks.append(f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n".encode()) + + # 6. message_stop event + message_stop = { + "type": "message_stop", + "usage": usage if usage else {} + } + chunks.append(f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n".encode()) + + return chunks + + def __aiter__(self): + return self + + async def __anext__(self): + if self.current_index >= len(self.chunks): + raise StopAsyncIteration + + chunk = self.chunks[self.current_index] + self.current_index += 1 + return chunk + + def __iter__(self): + return self + + def __next__(self): + if self.current_index >= len(self.chunks): + raise StopIteration + + chunk = self.chunks[self.current_index] + self.current_index += 1 + return chunk diff --git a/litellm/llms/anthropic/experimental_pass_through/messages/handler.py b/litellm/llms/anthropic/experimental_pass_through/messages/handler.py index cc9334ae68bf..7e5a4f22a7f9 100644 --- a/litellm/llms/anthropic/experimental_pass_through/messages/handler.py +++ b/litellm/llms/anthropic/experimental_pass_through/messages/handler.py @@ -33,6 +33,70 @@ ################################################# +async def _execute_pre_request_hooks( + model: str, + messages: List[Dict], + tools: Optional[List[Dict]], + stream: Optional[bool], + custom_llm_provider: Optional[str], + **kwargs, +) -> Dict: + """ + Execute pre-request hooks from CustomLogger callbacks. + + Allows CustomLoggers to modify request parameters before the API call. + Used for WebSearch tool conversion, stream modification, etc. 
+ + Args: + model: Model name + messages: List of messages + tools: Optional tools list + stream: Optional stream flag + custom_llm_provider: Provider name (if not set, will be extracted from model) + **kwargs: Additional request parameters + + Returns: + Dict containing all (potentially modified) request parameters including tools, stream + """ + # If custom_llm_provider not provided, extract from model + if not custom_llm_provider: + try: + _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) + except Exception: + # If extraction fails, continue without provider + pass + + # Build complete request kwargs dict + request_kwargs = { + "tools": tools, + "stream": stream, + "litellm_params": { + "custom_llm_provider": custom_llm_provider, + }, + **kwargs, + } + + if not litellm.callbacks: + return request_kwargs + + from litellm.integrations.custom_logger import CustomLogger as _CustomLogger + + for callback in litellm.callbacks: + if not isinstance(callback, _CustomLogger): + continue + + # Call the pre-request hook + modified_kwargs = await callback.async_pre_request_hook( + model, messages, request_kwargs + ) + + # If hook returned modified kwargs, use them + if modified_kwargs is not None: + request_kwargs = modified_kwargs + + return request_kwargs + + @client async def anthropic_messages( max_tokens: int, @@ -57,7 +121,24 @@ async def anthropic_messages( """ Async: Make llm api request in Anthropic /messages API spec """ - local_vars = locals() + # Execute pre-request hooks to allow CustomLoggers to modify request + request_kwargs = await _execute_pre_request_hooks( + model=model, + messages=messages, + tools=tools, + stream=stream, + custom_llm_provider=custom_llm_provider, + **kwargs, + ) + + # Extract modified parameters + tools = request_kwargs.pop("tools", tools) + stream = request_kwargs.pop("stream", stream) + # Remove litellm_params from kwargs (only needed for hooks) + request_kwargs.pop("litellm_params", None) + # Merge back any other modifications + kwargs.update(request_kwargs) + loop = asyncio.get_event_loop() kwargs["is_async"] = True @@ -119,6 +200,7 @@ def anthropic_messages_handler( tools: Optional[List[Dict]] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, + container: Optional[Dict] = None, api_key: Optional[str] = None, api_base: Optional[str] = None, client: Optional[AsyncHTTPHandler] = None, @@ -131,6 +213,9 @@ def anthropic_messages_handler( ]: """ Makes Anthropic `/v1/messages` API calls In the Anthropic API Spec + + Args: + container: Container config with skills for code execution """ from litellm.types.utils import LlmProviders @@ -141,6 +226,10 @@ def anthropic_messages_handler( # Use provided client or create a new one litellm_logging_obj: LiteLLMLoggingObj = kwargs.get("litellm_logging_obj") # type: ignore + # Store original model name before get_llm_provider strips the provider prefix + # This is needed by agentic hooks (e.g., websearch_interception) to make follow-up requests + original_model = model + litellm_params = GenericLiteLLMParams( **kwargs, api_key=api_key, @@ -158,6 +247,19 @@ def anthropic_messages_handler( api_base=litellm_params.api_base, api_key=litellm_params.api_key, ) + + # Store agentic loop params in logging object for agentic hooks + # This provides original request context needed for follow-up calls + if litellm_logging_obj is not None: + litellm_logging_obj.model_call_details["agentic_loop_params"] = { + "model": original_model, + "custom_llm_provider": custom_llm_provider, + } + + # Check if 
stream was converted for WebSearch interception + # This is set in the async wrapper above when stream=True is converted to stream=False + if kwargs.get("_websearch_interception_converted_stream", False): + litellm_logging_obj.model_call_details["websearch_interception_converted_stream"] = True if litellm_params.mock_response and isinstance(litellm_params.mock_response, str): diff --git a/litellm/llms/anthropic/experimental_pass_through/messages/transformation.py b/litellm/llms/anthropic/experimental_pass_through/messages/transformation.py index e04a1aef5e88..308bf367d066 100644 --- a/litellm/llms/anthropic/experimental_pass_through/messages/transformation.py +++ b/litellm/llms/anthropic/experimental_pass_through/messages/transformation.py @@ -2,17 +2,26 @@ import httpx -from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj, verbose_logger +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.litellm_core_utils.litellm_logging import verbose_logger from litellm.llms.base_llm.anthropic_messages.transformation import ( BaseAnthropicMessagesConfig, ) -from litellm.types.llms.anthropic import AnthropicMessagesRequest +from litellm.types.llms.anthropic import ( + ANTHROPIC_BETA_HEADER_VALUES, + AnthropicMessagesRequest, +) from litellm.types.llms.anthropic_messages.anthropic_response import ( AnthropicMessagesResponse, ) +from litellm.types.llms.anthropic_tool_search import get_tool_search_beta_header from litellm.types.router import GenericLiteLLMParams -from ...common_utils import AnthropicError +from ...common_utils import ( + AnthropicError, + AnthropicModelInfo, + optionally_handle_anthropic_oauth, +) DEFAULT_ANTHROPIC_API_BASE = "https://api.anthropic.com" DEFAULT_ANTHROPIC_API_VERSION = "2023-06-01" @@ -32,6 +41,8 @@ def get_supported_anthropic_messages_params(self, model: str) -> list: "tools", "tool_choice", "thinking", + "context_management", + "output_format", # TODO: Add Anthropic `metadata` support # "metadata", ] @@ -62,8 +73,11 @@ def validate_anthropic_messages_environment( ) -> Tuple[dict, Optional[str]]: import os + # Check for Anthropic OAuth token in Authorization header + headers, api_key = optionally_handle_anthropic_oauth(headers=headers, api_key=api_key) if api_key is None: api_key = os.getenv("ANTHROPIC_API_KEY") + if "x-api-key" not in headers and api_key: headers["x-api-key"] = api_key if "anthropic-version" not in headers: @@ -71,6 +85,11 @@ def validate_anthropic_messages_environment( if "content-type" not in headers: headers["content-type"] = "application/json" + headers = self._update_headers_with_anthropic_beta( + headers=headers, + optional_params=optional_params, + ) + return headers, api_base def transform_anthropic_messages_request( @@ -94,7 +113,7 @@ def transform_anthropic_messages_request( status_code=400, ) ####### get required params for all anthropic messages requests ###### - verbose_logger.info(f"🔍 TRANSFORMATION DEBUG - Messages: {messages}") + verbose_logger.debug(f"TRANSFORMATION DEBUG - Messages: {messages}") anthropic_messages_request: AnthropicMessagesRequest = AnthropicMessagesRequest( messages=messages, max_tokens=max_tokens, @@ -142,3 +161,51 @@ def get_async_streaming_response_iterator( request_body=request_body, litellm_logging_obj=litellm_logging_obj, ) + + @staticmethod + def _update_headers_with_anthropic_beta( + headers: dict, + optional_params: dict, + custom_llm_provider: str = "anthropic", + ) -> dict: + """ + Auto-inject anthropic-beta headers based on 
features used. + + Handles: + - context_management: adds 'context-management-2025-06-27' + - tool_search: adds provider-specific tool search header + - output_format: adds 'structured-outputs-2025-11-13' + + Args: + headers: Request headers dict + optional_params: Optional parameters including tools, context_management, output_format + custom_llm_provider: Provider name for looking up correct tool search header + """ + beta_values: set = set() + + # Get existing beta headers if any + existing_beta = headers.get("anthropic-beta") + if existing_beta: + beta_values.update(b.strip() for b in existing_beta.split(",")) + + # Check for context management + if optional_params.get("context_management") is not None: + beta_values.add(ANTHROPIC_BETA_HEADER_VALUES.CONTEXT_MANAGEMENT_2025_06_27.value) + + # Check for structured outputs + if optional_params.get("output_format") is not None: + beta_values.add(ANTHROPIC_BETA_HEADER_VALUES.STRUCTURED_OUTPUT_2025_09_25.value) + + # Check for tool search tools + tools = optional_params.get("tools") + if tools: + anthropic_model_info = AnthropicModelInfo() + if anthropic_model_info.is_tool_search_used(tools): + # Use provider-specific tool search header + tool_search_header = get_tool_search_beta_header(custom_llm_provider) + beta_values.add(tool_search_header) + + if beta_values: + headers["anthropic-beta"] = ",".join(sorted(beta_values)) + + return headers diff --git a/litellm/llms/anthropic/files/__init__.py b/litellm/llms/anthropic/files/__init__.py new file mode 100644 index 000000000000..b8b538ffb62a --- /dev/null +++ b/litellm/llms/anthropic/files/__init__.py @@ -0,0 +1,4 @@ +from .handler import AnthropicFilesHandler + +__all__ = ["AnthropicFilesHandler"] + diff --git a/litellm/llms/anthropic/files/handler.py b/litellm/llms/anthropic/files/handler.py new file mode 100644 index 000000000000..d46fc4013109 --- /dev/null +++ b/litellm/llms/anthropic/files/handler.py @@ -0,0 +1,367 @@ +import asyncio +import json +import time +from typing import Any, Coroutine, Optional, Union + +import httpx + +import litellm +from litellm._logging import verbose_logger +from litellm._uuid import uuid +from litellm.llms.custom_httpx.http_handler import ( + get_async_httpx_client, +) +from litellm.litellm_core_utils.litellm_logging import Logging +from litellm.types.llms.openai import ( + FileContentRequest, + HttpxBinaryResponseContent, + OpenAIBatchResult, + OpenAIChatCompletionResponse, + OpenAIErrorBody, +) +from litellm.types.utils import CallTypes, LlmProviders, ModelResponse + +from ..chat.transformation import AnthropicConfig +from ..common_utils import AnthropicModelInfo + +# Map Anthropic error types to HTTP status codes +ANTHROPIC_ERROR_STATUS_CODE_MAP = { + "invalid_request_error": 400, + "authentication_error": 401, + "permission_error": 403, + "not_found_error": 404, + "rate_limit_error": 429, + "api_error": 500, + "overloaded_error": 503, + "timeout_error": 504, +} + + +class AnthropicFilesHandler: + """ + Handles Anthropic Files API operations. + + Currently supports: + - file_content() for retrieving Anthropic Message Batch results + """ + + def __init__(self): + self.anthropic_model_info = AnthropicModelInfo() + + async def afile_content( + self, + file_content_request: FileContentRequest, + api_base: Optional[str] = None, + api_key: Optional[str] = None, + timeout: Union[float, httpx.Timeout] = 600.0, + max_retries: Optional[int] = None, + ) -> HttpxBinaryResponseContent: + """ + Async: Retrieve file content from Anthropic. 
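A minimal sketch of the header-merge behaviour of `_update_headers_with_anthropic_beta` above. `merge_beta_headers` is an illustrative re-implementation, `existing-beta-1` is a made-up value, and the real beta strings come from `ANTHROPIC_BETA_HEADER_VALUES` / `get_tool_search_beta_header`:

```python
from typing import Dict, Set


def merge_beta_headers(headers: Dict[str, str], feature_betas: Set[str]) -> Dict[str, str]:
    """Existing comma-separated anthropic-beta values are preserved, feature-driven
    values are unioned in, and the result is re-emitted sorted and comma-joined."""
    beta_values = set(feature_betas)
    existing = headers.get("anthropic-beta")
    if existing:
        beta_values.update(b.strip() for b in existing.split(","))
    if beta_values:
        headers["anthropic-beta"] = ",".join(sorted(beta_values))
    return headers


print(merge_beta_headers(
    {"anthropic-beta": "existing-beta-1"},
    {"context-management-2025-06-27"},  # value documented for context_management above
))
# {'anthropic-beta': 'context-management-2025-06-27,existing-beta-1'}
```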
+ + For batch results, the file_id should be the batch_id. + This will call Anthropic's /v1/messages/batches/{batch_id}/results endpoint. + + Args: + file_content_request: Contains file_id (batch_id for batch results) + api_base: Anthropic API base URL + api_key: Anthropic API key + timeout: Request timeout + max_retries: Max retry attempts (unused for now) + + Returns: + HttpxBinaryResponseContent: Binary content wrapped in compatible response format + """ + file_id = file_content_request.get("file_id") + if not file_id: + raise ValueError("file_id is required in file_content_request") + + # Extract batch_id from file_id + # Handle both formats: "anthropic_batch_results:{batch_id}" or just "{batch_id}" + if file_id.startswith("anthropic_batch_results:"): + batch_id = file_id.replace("anthropic_batch_results:", "", 1) + else: + batch_id = file_id + + # Get Anthropic API credentials + api_base = self.anthropic_model_info.get_api_base(api_base) + api_key = api_key or self.anthropic_model_info.get_api_key() + + if not api_key: + raise ValueError("Missing Anthropic API Key") + + # Construct the Anthropic batch results URL + results_url = f"{api_base.rstrip('/')}/v1/messages/batches/{batch_id}/results" + + # Prepare headers + headers = { + "accept": "application/json", + "anthropic-version": "2023-06-01", + "x-api-key": api_key, + } + + # Make the request to Anthropic + async_client = get_async_httpx_client(llm_provider=LlmProviders.ANTHROPIC) + anthropic_response = await async_client.get( + url=results_url, + headers=headers + ) + anthropic_response.raise_for_status() + + # Transform Anthropic batch results to OpenAI format + transformed_content = self._transform_anthropic_batch_results_to_openai_format( + anthropic_response.content + ) + + # Create a new response with transformed content + transformed_response = httpx.Response( + status_code=anthropic_response.status_code, + headers=anthropic_response.headers, + content=transformed_content, + request=anthropic_response.request, + ) + + # Return the transformed response content + return HttpxBinaryResponseContent(response=transformed_response) + + + def file_content( + self, + _is_async: bool, + file_content_request: FileContentRequest, + api_base: Optional[str] = None, + api_key: Optional[str] = None, + timeout: Union[float, httpx.Timeout] = 600.0, + max_retries: Optional[int] = None, + ) -> Union[ + HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent] + ]: + """ + Retrieve file content from Anthropic. + + For batch results, the file_id should be the batch_id. + This will call Anthropic's /v1/messages/batches/{batch_id}/results endpoint. 
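A hedged usage sketch for retrieving batch results through the files handler above; the batch id and API key are placeholders, and both accepted `file_id` formats resolve to the same `/v1/messages/batches/{batch_id}/results` call:

```python
from litellm.llms.anthropic.files.handler import AnthropicFilesHandler

handler = AnthropicFilesHandler()

# The sync path internally runs the async implementation via asyncio.run().
result = handler.file_content(
    _is_async=False,
    file_content_request={"file_id": "anthropic_batch_results:msgbatch_abc123"},
    # {"file_id": "msgbatch_abc123"} is treated identically (prefix is optional)
    api_key="sk-ant-...",  # placeholder; falls back to ANTHROPIC_API_KEY if omitted
)
# `result` wraps the transformed OpenAI-style batch-results JSONL; the exact
# accessor (.content / .read()) depends on HttpxBinaryResponseContent.
```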
+ + Args: + _is_async: Whether to run asynchronously + file_content_request: Contains file_id (batch_id for batch results) + api_base: Anthropic API base URL + api_key: Anthropic API key + timeout: Request timeout + max_retries: Max retry attempts (unused for now) + + Returns: + HttpxBinaryResponseContent or Coroutine: Binary content wrapped in compatible response format + """ + if _is_async: + return self.afile_content( + file_content_request=file_content_request, + api_base=api_base, + api_key=api_key, + max_retries=max_retries, + ) + else: + return asyncio.run( + self.afile_content( + file_content_request=file_content_request, + api_base=api_base, + api_key=api_key, + timeout=timeout, + max_retries=max_retries, + ) + ) + + def _transform_anthropic_batch_results_to_openai_format( + self, anthropic_content: bytes + ) -> bytes: + """ + Transform Anthropic batch results JSONL to OpenAI batch results JSONL format. + + Anthropic format: + { + "custom_id": "...", + "result": { + "type": "succeeded", + "message": { ... } // Anthropic message format + } + } + + OpenAI format: + { + "custom_id": "...", + "response": { + "status_code": 200, + "request_id": "...", + "body": { ... } // OpenAI chat completion format + } + } + """ + try: + anthropic_config = AnthropicConfig() + transformed_lines = [] + + # Parse JSONL content + content_str = anthropic_content.decode("utf-8") + for line in content_str.strip().split("\n"): + if not line.strip(): + continue + + anthropic_result = json.loads(line) + custom_id = anthropic_result.get("custom_id", "") + result = anthropic_result.get("result", {}) + result_type = result.get("type", "") + + # Transform based on result type + if result_type == "succeeded": + # Transform Anthropic message to OpenAI format + anthropic_message = result.get("message", {}) + if anthropic_message: + openai_response_body = self._transform_anthropic_message_to_openai_format( + anthropic_message=anthropic_message, + anthropic_config=anthropic_config, + ) + + # Create OpenAI batch result format + openai_result: OpenAIBatchResult = { + "custom_id": custom_id, + "response": { + "status_code": 200, + "request_id": anthropic_message.get("id", ""), + "body": openai_response_body, + }, + } + transformed_lines.append(json.dumps(openai_result)) + elif result_type == "errored": + # Handle error case + error = result.get("error", {}) + error_obj = error.get("error", {}) + error_message = error_obj.get("message", "Unknown error") + error_type = error_obj.get("type", "api_error") + + status_code = ANTHROPIC_ERROR_STATUS_CODE_MAP.get(error_type, 500) + + error_body_errored: OpenAIErrorBody = { + "error": { + "message": error_message, + "type": error_type, + } + } + openai_result_errored: OpenAIBatchResult = { + "custom_id": custom_id, + "response": { + "status_code": status_code, + "request_id": error.get("request_id", ""), + "body": error_body_errored, + }, + } + transformed_lines.append(json.dumps(openai_result_errored)) + elif result_type in ["canceled", "expired"]: + # Handle canceled/expired cases + error_body_canceled: OpenAIErrorBody = { + "error": { + "message": f"Batch request was {result_type}", + "type": "invalid_request_error", + } + } + openai_result_canceled: OpenAIBatchResult = { + "custom_id": custom_id, + "response": { + "status_code": 400, + "request_id": "", + "body": error_body_canceled, + }, + } + transformed_lines.append(json.dumps(openai_result_canceled)) + + # Join lines and encode back to bytes + transformed_content = "\n".join(transformed_lines) + if transformed_lines: + 
transformed_content += "\n" # Add trailing newline for JSONL format + return transformed_content.encode("utf-8") + except Exception as e: + verbose_logger.error( + f"Error transforming Anthropic batch results to OpenAI format: {e}" + ) + # Return original content if transformation fails + return anthropic_content + + def _transform_anthropic_message_to_openai_format( + self, anthropic_message: dict, anthropic_config: AnthropicConfig + ) -> OpenAIChatCompletionResponse: + """ + Transform a single Anthropic message to OpenAI chat completion format. + """ + try: + # Create a mock httpx.Response for transformation + mock_response = httpx.Response( + status_code=200, + content=json.dumps(anthropic_message).encode("utf-8"), + ) + + # Create a ModelResponse object + model_response = ModelResponse() + # Initialize with required fields - will be populated by transform_parsed_response + model_response.choices = [ + litellm.Choices( + finish_reason="stop", + index=0, + message=litellm.Message(content="", role="assistant"), + ) + ] # type: ignore + + # Create a logging object for transformation + logging_obj = Logging( + model=anthropic_message.get("model", "claude-3-5-sonnet-20241022"), + messages=[{"role": "user", "content": "batch_request"}], + stream=False, + call_type=CallTypes.aretrieve_batch, + start_time=time.time(), + litellm_call_id="batch_" + str(uuid.uuid4()), + function_id="batch_processing", + litellm_trace_id=str(uuid.uuid4()), + kwargs={"optional_params": {}}, + ) + logging_obj.optional_params = {} + + # Transform using AnthropicConfig + transformed_response = anthropic_config.transform_parsed_response( + completion_response=anthropic_message, + raw_response=mock_response, + model_response=model_response, + json_mode=False, + prefix_prompt=None, + ) + + # Convert ModelResponse to OpenAI format dict - it's already in OpenAI format + openai_body: OpenAIChatCompletionResponse = transformed_response.model_dump(exclude_none=True) + + # Ensure id comes from anthropic_message if not set + if not openai_body.get("id"): + openai_body["id"] = anthropic_message.get("id", "") + + return openai_body + except Exception as e: + verbose_logger.error( + f"Error transforming Anthropic message to OpenAI format: {e}" + ) + # Return a basic error response if transformation fails + error_response: OpenAIChatCompletionResponse = { + "id": anthropic_message.get("id", ""), + "object": "chat.completion", + "created": int(time.time()), + "model": anthropic_message.get("model", ""), + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": ""}, + "finish_reason": "error", + } + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + }, + } + return error_response + diff --git a/litellm/llms/anthropic/skills/__init__.py b/litellm/llms/anthropic/skills/__init__.py new file mode 100644 index 000000000000..60e78c240659 --- /dev/null +++ b/litellm/llms/anthropic/skills/__init__.py @@ -0,0 +1,6 @@ +"""Anthropic Skills API integration""" + +from .transformation import AnthropicSkillsConfig + +__all__ = ["AnthropicSkillsConfig"] + diff --git a/litellm/llms/anthropic/skills/readme.md b/litellm/llms/anthropic/skills/readme.md new file mode 100644 index 000000000000..0602272256ce --- /dev/null +++ b/litellm/llms/anthropic/skills/readme.md @@ -0,0 +1,279 @@ +# Anthropic Skills API Integration + +This module provides comprehensive support for the Anthropic Skills API through LiteLLM. 
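+
+At a glance, the full lifecycle looks like this (a minimal sketch; it assumes `ANTHROPIC_API_KEY` is set and omits the `files` upload covered in the Quick Start below):
+
+```python
+import litellm
+
+# Create a skill, fetch it back, then clean up
+skill = litellm.create_skill(
+    display_title="My First Skill",
+    custom_llm_provider="anthropic",
+)
+fetched = litellm.get_skill(skill_id=skill.id, custom_llm_provider="anthropic")
+print(fetched.display_title, fetched.latest_version)
+litellm.delete_skill(skill_id=skill.id, custom_llm_provider="anthropic")
+```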
+ +## Features + +The Skills API allows you to: +- **Create skills**: Define reusable AI capabilities +- **List skills**: Browse all available skills +- **Get skills**: Retrieve detailed information about a specific skill +- **Delete skills**: Remove skills that are no longer needed + +## Quick Start + +### Prerequisites + +Set your Anthropic API key: +```python +import os +os.environ["ANTHROPIC_API_KEY"] = "your-api-key-here" +``` + +### Basic Usage + +#### Create a Skill + +```python +import litellm + +# Create a skill with files +# Note: All files must be in the same top-level directory +# and must include a SKILL.md file at the root +skill = litellm.create_skill( + files=[ + # List of file objects to upload + # Must include SKILL.md + ], + display_title="Python Code Generator", + custom_llm_provider="anthropic" +) +print(f"Created skill: {skill.id}") + +# Asynchronous version +skill = await litellm.acreate_skill( + files=[...], # Your files here + display_title="Python Code Generator", + custom_llm_provider="anthropic" +) +``` + +#### List Skills + +```python +# List all skills +skills = litellm.list_skills( + custom_llm_provider="anthropic" +) + +for skill in skills.data: + print(f"{skill.display_title}: {skill.id}") + +# With pagination and filtering +skills = litellm.list_skills( + limit=20, + source="custom", # Filter by 'custom' or 'anthropic' + custom_llm_provider="anthropic" +) + +# Get next page if available +if skills.has_more: + next_page = litellm.list_skills( + page=skills.next_page, + custom_llm_provider="anthropic" + ) +``` + +#### Get a Skill + +```python +skill = litellm.get_skill( + skill_id="skill_abc123", + custom_llm_provider="anthropic" +) + +print(f"Skill: {skill.display_title}") +print(f"Created: {skill.created_at}") +print(f"Latest version: {skill.latest_version}") +print(f"Source: {skill.source}") +``` + +#### Delete a Skill + +```python +result = litellm.delete_skill( + skill_id="skill_abc123", + custom_llm_provider="anthropic" +) + +print(f"Deleted skill {result.id}, type: {result.type}") +``` + +## API Reference + +### `create_skill()` + +Create a new skill. + +**Parameters:** +- `files` (List[Any], optional): Files to upload for the skill. All files must be in the same top-level directory and must include a SKILL.md file at the root. +- `display_title` (str, optional): Display title for the skill +- `custom_llm_provider` (str, optional): Provider name (default: "anthropic") +- `extra_headers` (dict, optional): Additional HTTP headers +- `timeout` (float, optional): Request timeout + +**Returns:** +- `Skill`: The created skill object + +**Async version:** `acreate_skill()` + +### `list_skills()` + +List all skills. + +**Parameters:** +- `limit` (int, optional): Number of results to return per page (max 100, default 20) +- `page` (str, optional): Pagination token for fetching a specific page of results +- `source` (str, optional): Filter skills by source ('custom' or 'anthropic') +- `custom_llm_provider` (str, optional): Provider name (default: "anthropic") +- `extra_headers` (dict, optional): Additional HTTP headers +- `timeout` (float, optional): Request timeout + +**Returns:** +- `ListSkillsResponse`: Object containing a list of skills and pagination info + +**Async version:** `alist_skills()` + +### `get_skill()` + +Get a specific skill by ID. 
+ +**Parameters:** +- `skill_id` (str, required): The skill ID +- `custom_llm_provider` (str, optional): Provider name (default: "anthropic") +- `extra_headers` (dict, optional): Additional HTTP headers +- `timeout` (float, optional): Request timeout + +**Returns:** +- `Skill`: The requested skill object + +**Async version:** `aget_skill()` + +### `delete_skill()` + +Delete a skill. + +**Parameters:** +- `skill_id` (str, required): The skill ID to delete +- `custom_llm_provider` (str, optional): Provider name (default: "anthropic") +- `extra_headers` (dict, optional): Additional HTTP headers +- `timeout` (float, optional): Request timeout + +**Returns:** +- `DeleteSkillResponse`: Object with `id` and `type` fields + +**Async version:** `adelete_skill()` + +## Response Types + +### `Skill` + +Represents a skill from the Anthropic Skills API. + +**Fields:** +- `id` (str): Unique identifier +- `created_at` (str): ISO 8601 timestamp +- `display_title` (str, optional): Display title +- `latest_version` (str, optional): Latest version identifier +- `source` (str): Source ("custom" or "anthropic") +- `type` (str): Object type (always "skill") +- `updated_at` (str): ISO 8601 timestamp + +### `ListSkillsResponse` + +Response from listing skills. + +**Fields:** +- `data` (List[Skill]): List of skills +- `next_page` (str, optional): Pagination token for the next page +- `has_more` (bool): Whether more skills are available + +### `DeleteSkillResponse` + +Response from deleting a skill. + +**Fields:** +- `id` (str): The deleted skill ID +- `type` (str): Deleted object type (always "skill_deleted") + +## Architecture + +The Skills API implementation follows LiteLLM's standard patterns: + +1. **Type Definitions** (`litellm/types/llms/anthropic_skills.py`) + - Pydantic models for request/response types + - TypedDict definitions for request parameters + +2. **Base Configuration** (`litellm/llms/base_llm/skills/transformation.py`) + - Abstract base class `BaseSkillsAPIConfig` + - Defines transformation interface for provider-specific implementations + +3. **Provider Implementation** (`litellm/llms/anthropic/skills/transformation.py`) + - `AnthropicSkillsConfig` - Anthropic-specific transformations + - Handles API authentication, URL construction, and response mapping + +4. **Main Handler** (`litellm/skills/main.py`) + - Public API functions (sync and async) + - Request validation and routing + - Error handling + +5. **HTTP Handlers** (`litellm/llms/custom_httpx/llm_http_handler.py`) + - Low-level HTTP request/response handling + - Connection pooling and retry logic + +## Beta API Support + +The Skills API is in beta. The beta header (`skills-2025-10-02`) is automatically added by the Anthropic provider configuration. You can customize it if needed: + +```python +skill = litellm.create_skill( + display_title="My Skill", + extra_headers={ + "anthropic-beta": "skills-2025-10-02" # Or any other beta version + }, + custom_llm_provider="anthropic" +) +``` + +The default beta version is configured in `litellm.constants.ANTHROPIC_SKILLS_API_BETA_VERSION`. 
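+
+For example, if you pass your own `anthropic-beta` header, the provider config appends the Skills beta version rather than overwriting it (a sketch; `my-other-beta-flag` is a placeholder, not a real beta version):
+
+```python
+skills = litellm.list_skills(
+    extra_headers={"anthropic-beta": "my-other-beta-flag"},  # placeholder value
+    custom_llm_provider="anthropic",
+)
+# The configured Skills beta version (litellm.constants.ANTHROPIC_SKILLS_API_BETA_VERSION)
+# is appended automatically, so the request carries both flags.
+```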
+ +## Error Handling + +All Skills API functions follow LiteLLM's standard error handling: + +```python +import litellm + +try: + skill = litellm.create_skill( + display_title="My Skill", + custom_llm_provider="anthropic" + ) +except litellm.exceptions.AuthenticationError as e: + print(f"Authentication failed: {e}") +except litellm.exceptions.RateLimitError as e: + print(f"Rate limit exceeded: {e}") +except litellm.exceptions.APIError as e: + print(f"API error: {e}") +``` + +## Contributing + +To add support for Skills API to a new provider: + +1. Create provider-specific configuration class inheriting from `BaseSkillsAPIConfig` +2. Implement all abstract methods for request/response transformations +3. Register the config in `ProviderConfigManager.get_provider_skills_api_config()` +4. Add appropriate tests + +## Related Documentation + +- [Anthropic Skills API Documentation](https://platform.claude.com/docs/en/api/beta/skills/create) +- [LiteLLM Responses API](../../../responses/) +- [Provider Configuration System](../../base_llm/) + +## Support + +For issues or questions: +- GitHub Issues: https://github.com/BerriAI/litellm/issues +- Discord: https://discord.gg/wuPM9dRgDw diff --git a/litellm/llms/anthropic/skills/transformation.py b/litellm/llms/anthropic/skills/transformation.py new file mode 100644 index 000000000000..832b74cf51dd --- /dev/null +++ b/litellm/llms/anthropic/skills/transformation.py @@ -0,0 +1,211 @@ +""" +Anthropic Skills API configuration and transformations +""" + +from typing import Any, Dict, Optional, Tuple + +import httpx + +from litellm._logging import verbose_logger +from litellm.llms.base_llm.skills.transformation import ( + BaseSkillsAPIConfig, + LiteLLMLoggingObj, +) +from litellm.types.llms.anthropic_skills import ( + CreateSkillRequest, + DeleteSkillResponse, + ListSkillsParams, + ListSkillsResponse, + Skill, +) +from litellm.types.router import GenericLiteLLMParams +from litellm.types.utils import LlmProviders + + +class AnthropicSkillsConfig(BaseSkillsAPIConfig): + """Anthropic-specific Skills API configuration""" + + @property + def custom_llm_provider(self) -> LlmProviders: + return LlmProviders.ANTHROPIC + + def validate_environment( + self, headers: dict, litellm_params: Optional[GenericLiteLLMParams] + ) -> dict: + """Add Anthropic-specific headers""" + from litellm.llms.anthropic.common_utils import AnthropicModelInfo + + # Get API key + api_key = None + if litellm_params: + api_key = litellm_params.api_key + api_key = AnthropicModelInfo.get_api_key(api_key) + + if not api_key: + raise ValueError("ANTHROPIC_API_KEY is required for Skills API") + + # Add required headers + headers["x-api-key"] = api_key + headers["anthropic-version"] = "2023-06-01" + + # Add beta header for skills API + from litellm.constants import ANTHROPIC_SKILLS_API_BETA_VERSION + + if "anthropic-beta" not in headers: + headers["anthropic-beta"] = ANTHROPIC_SKILLS_API_BETA_VERSION + elif isinstance(headers["anthropic-beta"], list): + if ANTHROPIC_SKILLS_API_BETA_VERSION not in headers["anthropic-beta"]: + headers["anthropic-beta"].append(ANTHROPIC_SKILLS_API_BETA_VERSION) + elif isinstance(headers["anthropic-beta"], str): + if ANTHROPIC_SKILLS_API_BETA_VERSION not in headers["anthropic-beta"]: + headers["anthropic-beta"] = [headers["anthropic-beta"], ANTHROPIC_SKILLS_API_BETA_VERSION] + + headers["content-type"] = "application/json" + + return headers + + def get_complete_url( + self, + api_base: Optional[str], + endpoint: str, + skill_id: Optional[str] = None, + ) -> str: + 
"""Get complete URL for Anthropic Skills API""" + from litellm.llms.anthropic.common_utils import AnthropicModelInfo + + if api_base is None: + api_base = AnthropicModelInfo.get_api_base() + + if skill_id: + return f"{api_base}/v1/skills/{skill_id}?beta=true" + return f"{api_base}/v1/{endpoint}?beta=true" + + def transform_create_skill_request( + self, + create_request: CreateSkillRequest, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Dict: + """Transform create skill request for Anthropic""" + verbose_logger.debug( + "Transforming create skill request: %s", create_request + ) + + # Anthropic expects the request body directly + request_body = {k: v for k, v in create_request.items() if v is not None} + + return request_body + + def transform_create_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> Skill: + """Transform Anthropic response to Skill object""" + response_json = raw_response.json() + verbose_logger.debug( + "Transforming create skill response: %s", response_json + ) + + return Skill(**response_json) + + def transform_list_skills_request( + self, + list_params: ListSkillsParams, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """Transform list skills request for Anthropic""" + from litellm.llms.anthropic.common_utils import AnthropicModelInfo + + api_base = AnthropicModelInfo.get_api_base( + litellm_params.api_base if litellm_params else None + ) + url = self.get_complete_url(api_base=api_base, endpoint="skills") + + # Build query parameters + query_params: Dict[str, Any] = {} + if "limit" in list_params and list_params["limit"]: + query_params["limit"] = list_params["limit"] + if "page" in list_params and list_params["page"]: + query_params["page"] = list_params["page"] + if "source" in list_params and list_params["source"]: + query_params["source"] = list_params["source"] + + verbose_logger.debug( + "List skills request made to Anthropic Skills endpoint with params: %s", query_params + ) + + return url, query_params + + def transform_list_skills_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ListSkillsResponse: + """Transform Anthropic response to ListSkillsResponse""" + response_json = raw_response.json() + verbose_logger.debug( + "Transforming list skills response: %s", response_json + ) + + return ListSkillsResponse(**response_json) + + def transform_get_skill_request( + self, + skill_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """Transform get skill request for Anthropic""" + url = self.get_complete_url( + api_base=api_base, endpoint="skills", skill_id=skill_id + ) + + verbose_logger.debug("Get skill request - URL: %s", url) + + return url, headers + + def transform_get_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> Skill: + """Transform Anthropic response to Skill object""" + response_json = raw_response.json() + verbose_logger.debug( + "Transforming get skill response: %s", response_json + ) + + return Skill(**response_json) + + def transform_delete_skill_request( + self, + skill_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """Transform delete skill request for Anthropic""" + url = self.get_complete_url( + api_base=api_base, endpoint="skills", skill_id=skill_id + ) + + verbose_logger.debug("Delete skill request - URL: %s", url) + + return url, headers 
+ + def transform_delete_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> DeleteSkillResponse: + """Transform Anthropic response to DeleteSkillResponse""" + response_json = raw_response.json() + verbose_logger.debug( + "Transforming delete skill response: %s", response_json + ) + + return DeleteSkillResponse(**response_json) + diff --git a/litellm/llms/aws_polly/__init__.py b/litellm/llms/aws_polly/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/litellm/llms/aws_polly/text_to_speech/__init__.py b/litellm/llms/aws_polly/text_to_speech/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/litellm/llms/aws_polly/text_to_speech/transformation.py b/litellm/llms/aws_polly/text_to_speech/transformation.py new file mode 100644 index 000000000000..dc6c40000f1e --- /dev/null +++ b/litellm/llms/aws_polly/text_to_speech/transformation.py @@ -0,0 +1,391 @@ +""" +AWS Polly Text-to-Speech transformation + +Maps OpenAI TTS spec to AWS Polly SynthesizeSpeech API +Reference: https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html +""" + +import json +from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Tuple, Union + +import httpx + +from litellm.llms.base_llm.text_to_speech.transformation import ( + BaseTextToSpeechConfig, + TextToSpeechRequestData, +) +from litellm.llms.bedrock.base_aws_llm import BaseAWSLLM + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj + from litellm.types.llms.openai import HttpxBinaryResponseContent +else: + LiteLLMLoggingObj = Any + HttpxBinaryResponseContent = Any + + +class AWSPollyTextToSpeechConfig(BaseTextToSpeechConfig, BaseAWSLLM): + """ + Configuration for AWS Polly Text-to-Speech + + Reference: https://docs.aws.amazon.com/polly/latest/dg/API_SynthesizeSpeech.html + """ + + def __init__(self): + BaseTextToSpeechConfig.__init__(self) + BaseAWSLLM.__init__(self) + + # Default settings + DEFAULT_VOICE = "Joanna" + DEFAULT_ENGINE = "neural" + DEFAULT_OUTPUT_FORMAT = "mp3" + DEFAULT_REGION = "us-east-1" + + # Voice name mappings from OpenAI voices to Polly voices + VOICE_MAPPINGS = { + "alloy": "Joanna", # US English female + "echo": "Matthew", # US English male + "fable": "Amy", # British English female + "onyx": "Brian", # British English male + "nova": "Ivy", # US English female (child) + "shimmer": "Kendra", # US English female + } + + # Response format mappings from OpenAI to Polly + FORMAT_MAPPINGS = { + "mp3": "mp3", + "opus": "ogg_vorbis", + "aac": "mp3", # Polly doesn't support AAC, use MP3 + "flac": "mp3", # Polly doesn't support FLAC, use MP3 + "wav": "pcm", + "pcm": "pcm", + } + + # Valid Polly engines + VALID_ENGINES = {"standard", "neural", "long-form", "generative"} + + def dispatch_text_to_speech( + self, + model: str, + input: str, + voice: Optional[Union[str, Dict]], + optional_params: Dict, + litellm_params_dict: Dict, + logging_obj: "LiteLLMLoggingObj", + timeout: Union[float, httpx.Timeout], + extra_headers: Optional[Dict[str, Any]], + base_llm_http_handler: Any, + aspeech: bool, + api_base: Optional[str], + api_key: Optional[str], + **kwargs: Any, + ) -> Union[ + "HttpxBinaryResponseContent", + Coroutine[Any, Any, "HttpxBinaryResponseContent"], + ]: + """ + Dispatch method to handle AWS Polly TTS requests + + This method encapsulates AWS-specific credential resolution and parameter handling + + Args: + base_llm_http_handler: The BaseLLMHTTPHandler instance from main.py + 
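+
+        Example (illustrative sketch only -- assumes this provider is wired up
+        under the ``aws_polly`` prefix and that AWS credentials are available
+        via the usual boto3 environment variables):
+
+            response = litellm.speech(
+                model="aws_polly/neural",       # engine taken from the model suffix
+                voice="alloy",                  # mapped to the Polly voice "Joanna"
+                input="Hello from Amazon Polly!",
+            )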
""" + # Get AWS region from kwargs or environment + aws_region_name = kwargs.get("aws_region_name") or self._get_aws_region_name_for_polly( + optional_params=optional_params + ) + + # Convert voice to string if it's a dict + voice_str: Optional[str] = None + if isinstance(voice, str): + voice_str = voice + elif isinstance(voice, dict): + voice_str = voice.get("name") if voice else None + + # Update litellm_params with resolved values + # Note: AWS credentials (aws_access_key_id, aws_secret_access_key, etc.) + # are already in litellm_params_dict via get_litellm_params() in main.py + litellm_params_dict["aws_region_name"] = aws_region_name + litellm_params_dict["api_base"] = api_base + litellm_params_dict["api_key"] = api_key + + # Call the text_to_speech_handler + response = base_llm_http_handler.text_to_speech_handler( + model=model, + input=input, + voice=voice_str, + text_to_speech_provider_config=self, + text_to_speech_optional_params=optional_params, + custom_llm_provider="aws_polly", + litellm_params=litellm_params_dict, + logging_obj=logging_obj, + timeout=timeout, + extra_headers=extra_headers, + client=None, + _is_async=aspeech, + ) + + return response + + def _get_aws_region_name_for_polly(self, optional_params: Dict) -> str: + """Get AWS region name for Polly API calls.""" + aws_region_name = optional_params.get("aws_region_name") + if aws_region_name is None: + aws_region_name = self.get_aws_region_name_for_non_llm_api_calls() + return aws_region_name + + def get_supported_openai_params(self, model: str) -> list: + """ + AWS Polly TTS supports these OpenAI parameters + """ + return ["voice", "response_format", "speed"] + + def map_openai_params( + self, + model: str, + optional_params: Dict, + voice: Optional[Union[str, Dict]] = None, + drop_params: bool = False, + kwargs: Dict = {}, + ) -> Tuple[Optional[str], Dict]: + """ + Map OpenAI parameters to AWS Polly parameters + """ + mapped_params = {} + + # Map voice - support both native Polly voices and OpenAI voice mappings + mapped_voice: Optional[str] = None + if isinstance(voice, str): + if voice in self.VOICE_MAPPINGS: + # OpenAI voice -> Polly voice + mapped_voice = self.VOICE_MAPPINGS[voice] + else: + # Assume it's already a Polly voice name + mapped_voice = voice + + # Map response format + if "response_format" in optional_params: + format_name = optional_params["response_format"] + if format_name in self.FORMAT_MAPPINGS: + mapped_params["output_format"] = self.FORMAT_MAPPINGS[format_name] + else: + mapped_params["output_format"] = format_name + else: + mapped_params["output_format"] = self.DEFAULT_OUTPUT_FORMAT + + # Extract engine from model name (e.g., "aws_polly/neural" -> "neural") + engine = self._extract_engine_from_model(model) + mapped_params["engine"] = engine + + # Pass through Polly-specific parameters (use AWS API casing) + if "language_code" in kwargs: + mapped_params["LanguageCode"] = kwargs["language_code"] + if "lexicon_names" in kwargs: + mapped_params["LexiconNames"] = kwargs["lexicon_names"] + if "sample_rate" in kwargs: + mapped_params["SampleRate"] = kwargs["sample_rate"] + + return mapped_voice, mapped_params + + def _extract_engine_from_model(self, model: str) -> str: + """ + Extract engine from model name. 
+
+        Examples:
+        - aws_polly/neural -> neural
+        - aws_polly/standard -> standard
+        - aws_polly/long-form -> long-form
+        - aws_polly -> neural (default)
+        """
+        if "/" in model:
+            parts = model.split("/")
+            if len(parts) >= 2:
+                engine = parts[1].lower()
+                if engine in self.VALID_ENGINES:
+                    return engine
+        return self.DEFAULT_ENGINE
+
+    def validate_environment(
+        self,
+        headers: dict,
+        model: str,
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+    ) -> dict:
+        """
+        Validate AWS environment and set up headers.
+        AWS SigV4 signing will be done in transform_text_to_speech_request.
+        """
+        validated_headers = headers.copy()
+        validated_headers["Content-Type"] = "application/json"
+        return validated_headers
+
+    def get_complete_url(
+        self,
+        model: str,
+        api_base: Optional[str],
+        litellm_params: dict,
+    ) -> str:
+        """
+        Get the complete URL for AWS Polly SynthesizeSpeech request
+
+        Polly endpoint format:
+        https://polly.{region}.amazonaws.com/v1/speech
+        """
+        if api_base is not None:
+            return api_base.rstrip("/") + "/v1/speech"
+
+        aws_region_name = litellm_params.get("aws_region_name", self.DEFAULT_REGION)
+        return f"https://polly.{aws_region_name}.amazonaws.com/v1/speech"
+
+    def is_ssml_input(self, input: str) -> bool:
+        """
+        Returns True if input is SSML, False otherwise.
+
+        Based on AWS Polly SSML requirements - must contain a <speak> tag.
+        """
+        return "<speak>" in input or "<speak " in input
+
+    def _sign_polly_request(
+        self,
+        request_body: Dict[str, Any],
+        endpoint_url: str,
+        litellm_params: Dict,
+    ) -> Tuple[Dict[str, str], str]:
+        """
+        Sign the AWS Polly request using SigV4.
+
+        Returns:
+            Tuple of (signed_headers, json_body_string)
+        """
+        try:
+            from botocore.auth import SigV4Auth
+            from botocore.awsrequest import AWSRequest
+        except ImportError:
+            raise ImportError("Missing boto3 to call AWS Polly. Run 'pip install boto3'.")
+
+        # Get AWS region
+        aws_region_name = litellm_params.get("aws_region_name", self.DEFAULT_REGION)
+
+        # Get AWS credentials
+        credentials = self.get_credentials(
+            aws_access_key_id=litellm_params.get("aws_access_key_id"),
+            aws_secret_access_key=litellm_params.get("aws_secret_access_key"),
+            aws_session_token=litellm_params.get("aws_session_token"),
+            aws_region_name=aws_region_name,
+            aws_session_name=litellm_params.get("aws_session_name"),
+            aws_profile_name=litellm_params.get("aws_profile_name"),
+            aws_role_name=litellm_params.get("aws_role_name"),
+            aws_web_identity_token=litellm_params.get("aws_web_identity_token"),
+            aws_sts_endpoint=litellm_params.get("aws_sts_endpoint"),
+            aws_external_id=litellm_params.get("aws_external_id"),
+        )
+
+        # Serialize request body to JSON
+        json_body = json.dumps(request_body)
+
+        # Create headers for signing
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        # Create AWS request for signing
+        aws_request = AWSRequest(
+            method="POST",
+            url=endpoint_url,
+            data=json_body,
+            headers=headers,
+        )
+
+        # Sign the request
+        SigV4Auth(credentials, "polly", aws_region_name).add_auth(aws_request)
+
+        # Return signed headers and body
+        return dict(aws_request.headers), json_body
+
+    def transform_text_to_speech_request(
+        self,
+        model: str,
+        input: str,
+        voice: Optional[str],
+        optional_params: Dict,
+        litellm_params: Dict,
+        headers: dict,
+    ) -> TextToSpeechRequestData:
+        """
+        Transform OpenAI TTS request to AWS Polly SynthesizeSpeech format.
+
+        Supports:
+        - Native Polly voices (Joanna, Matthew, etc.)
+        - OpenAI voice mapping (alloy, echo, etc.)
+ - SSML input (auto-detected via tag) + - Multiple engines (neural, standard, long-form, generative) + + Returns: + TextToSpeechRequestData: Contains signed request for Polly API + """ + # Get voice (already mapped in main.py, or use default) + polly_voice = voice or self.DEFAULT_VOICE + + # Get output format + output_format = optional_params.get("output_format", self.DEFAULT_OUTPUT_FORMAT) + + # Get engine + engine = optional_params.get("engine", self.DEFAULT_ENGINE) + + # Build request body + request_body: Dict[str, Any] = { + "Engine": engine, + "OutputFormat": output_format, + "Text": input, + "VoiceId": polly_voice, + } + + # Auto-detect SSML + if self.is_ssml_input(input): + request_body["TextType"] = "ssml" + else: + request_body["TextType"] = "text" + + # Add optional Polly parameters (already in AWS casing from map_openai_params) + for key in ["LanguageCode", "LexiconNames", "SampleRate"]: + if key in optional_params: + request_body[key] = optional_params[key] + + # Get endpoint URL + endpoint_url = self.get_complete_url( + model=model, + api_base=litellm_params.get("api_base"), + litellm_params=litellm_params, + ) + + # Sign the request with AWS SigV4 + signed_headers, json_body = self._sign_polly_request( + request_body=request_body, + endpoint_url=endpoint_url, + litellm_params=litellm_params, + ) + + # Return as ssml_body so the handler uses data= instead of json= + # This preserves the exact JSON string that was signed + return TextToSpeechRequestData( + ssml_body=json_body, + headers=signed_headers, + ) + + def transform_text_to_speech_response( + self, + model: str, + raw_response: httpx.Response, + logging_obj: "LiteLLMLoggingObj", + ) -> "HttpxBinaryResponseContent": + """ + Transform AWS Polly response to standard format. + + Polly returns the audio data directly in the response body. 
+ """ + from litellm.types.llms.openai import HttpxBinaryResponseContent + + return HttpxBinaryResponseContent(raw_response) + diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py index 7c5b693b453e..cb9fe0aeb300 100644 --- a/litellm/llms/azure/azure.py +++ b/litellm/llms/azure/azure.py @@ -4,7 +4,13 @@ from typing import Any, Callable, Coroutine, Dict, List, Optional, Union import httpx # type: ignore -from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI +from openai import ( + APITimeoutError, + AsyncAzureOpenAI, + AsyncOpenAI, + AzureOpenAI, + OpenAI, +) import litellm from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES @@ -36,6 +42,7 @@ process_azure_headers, select_azure_base_url_or_endpoint, ) +from .image_generation import get_azure_image_generation_config class AzureOpenAIAssistantsAPIConfig: @@ -127,7 +134,7 @@ def __init__(self) -> None: def make_sync_azure_openai_chat_completion_request( self, - azure_client: AzureOpenAI, + azure_client: Union[AzureOpenAI, OpenAI], data: dict, timeout: Union[float, httpx.Timeout], ): @@ -150,7 +157,7 @@ def make_sync_azure_openai_chat_completion_request( @track_llm_api_timing() async def make_azure_openai_chat_completion_request( self, - azure_client: AsyncAzureOpenAI, + azure_client: Union[AsyncAzureOpenAI, AsyncOpenAI], data: dict, timeout: Union[float, httpx.Timeout], logging_obj: LiteLLMLoggingObj, @@ -214,7 +221,7 @@ def completion( # noqa: PLR0915 ### CHECK IF CLOUDFLARE AI GATEWAY ### ### if so - set the model as part of the base url - if "gateway.ai.cloudflare.com" in api_base: + if api_base is not None and "gateway.ai.cloudflare.com" in api_base: client = self._init_azure_client_for_cloudflare_ai_gateway( api_base=api_base, model=model, @@ -327,10 +334,10 @@ def completion( # noqa: PLR0915 _is_async=False, litellm_params=litellm_params, ) - if not isinstance(azure_client, AzureOpenAI): + if not isinstance(azure_client, (AzureOpenAI, OpenAI)): raise AzureOpenAIError( status_code=500, - message="azure_client is not an instance of AzureOpenAI", + message="azure_client is not an instance of AzureOpenAI or OpenAI", ) headers, response = self.make_sync_azure_openai_chat_completion_request( @@ -400,8 +407,8 @@ async def acompletion( _is_async=True, litellm_params=litellm_params, ) - if not isinstance(azure_client, AsyncAzureOpenAI): - raise ValueError("Azure client is not an instance of AsyncAzureOpenAI") + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): + raise ValueError("Azure client is not an instance of AsyncAzureOpenAI or AsyncOpenAI") ## LOGGING logging_obj.pre_call( input=data["messages"], @@ -411,7 +418,7 @@ async def acompletion( "api_key": api_key, "azure_ad_token": azure_ad_token, }, - "api_base": azure_client._base_url._uri_reference, + "api_base": api_base, "acompletion": True, "complete_input_dict": data, }, @@ -519,10 +526,10 @@ def streaming( _is_async=False, litellm_params=litellm_params, ) - if not isinstance(azure_client, AzureOpenAI): + if not isinstance(azure_client, (AzureOpenAI, OpenAI)): raise AzureOpenAIError( status_code=500, - message="azure_client is not an instance of AzureOpenAI", + message="azure_client is not an instance of AzureOpenAI or OpenAI", ) ## LOGGING logging_obj.pre_call( @@ -533,7 +540,7 @@ def streaming( "api_key": api_key, "azure_ad_token": azure_ad_token, }, - "api_base": azure_client._base_url._uri_reference, + "api_base": api_base, "acompletion": True, "complete_input_dict": data, }, @@ -577,8 +584,8 @@ async def 
async_streaming( _is_async=True, litellm_params=litellm_params, ) - if not isinstance(azure_client, AsyncAzureOpenAI): - raise ValueError("Azure client is not an instance of AsyncAzureOpenAI") + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): + raise ValueError("Azure client is not an instance of AsyncAzureOpenAI or AsyncOpenAI") ## LOGGING logging_obj.pre_call( @@ -589,7 +596,7 @@ async def async_streaming( "api_key": api_key, "azure_ad_token": azure_ad_token, }, - "api_base": azure_client._base_url._uri_reference, + "api_base": api_base, "acompletion": True, "complete_input_dict": data, }, @@ -656,15 +663,36 @@ async def aembedding( client=client, litellm_params=litellm_params, ) - if not isinstance(openai_aclient, AsyncAzureOpenAI): - raise ValueError("Azure client is not an instance of AsyncAzureOpenAI") + if not isinstance(openai_aclient, (AsyncAzureOpenAI, AsyncOpenAI)): + raise ValueError("Azure client is not an instance of AsyncAzureOpenAI or AsyncOpenAI") raw_response = await openai_aclient.embeddings.with_raw_response.create( **data, timeout=timeout ) headers = dict(raw_response.headers) - response = raw_response.parse() + + # Convert json.JSONDecodeError to AzureOpenAIError for two critical reasons: + # + # 1. ROUTER BEHAVIOR: The router relies on exception.status_code to determine cooldown logic: + # - JSONDecodeError has no status_code → router skips cooldown evaluation + # - AzureOpenAIError has status_code → router properly evaluates for cooldown + # + # 2. CONNECTION CLEANUP: When response.parse() throws JSONDecodeError, the response + # body may not be fully consumed, preventing httpx from properly returning the + # connection to the pool. By catching the exception and accessing raw_response.status_code, + # we trigger httpx's internal cleanup logic. Without this: + # - parse() fails → JSONDecodeError bubbles up → httpx never knows response was acknowledged → connection leak + # This completely eliminates "Unclosed connection" warnings during high load. 
+ try: + response = raw_response.parse() + except json.JSONDecodeError as json_error: + raise AzureOpenAIError( + status_code=raw_response.status_code or 500, + message=f"Failed to parse raw Azure embedding response: {str(json_error)}" + ) from json_error + stringified_response = response.model_dump() + ## LOGGING logging_obj.post_call( input=input, @@ -754,10 +782,10 @@ def embedding( client=client, litellm_params=litellm_params, ) - if not isinstance(azure_client, AzureOpenAI): + if not isinstance(azure_client, (AzureOpenAI, OpenAI)): raise AzureOpenAIError( status_code=500, - message="azure_client is not an instance of AzureOpenAI", + message="azure_client is not an instance of AzureOpenAI or OpenAI", ) ## COMPLETION CALL @@ -989,6 +1017,10 @@ def make_sync_azure_httpx_request( def create_azure_base_url( self, azure_client_params: dict, model: Optional[str] ) -> str: + from litellm.llms.azure_ai.image_generation import ( + AzureFoundryFluxImageGenerationConfig, + ) + api_base: str = azure_client_params.get( "azure_endpoint", "" ) # "https://example-endpoint.openai.azure.com" @@ -998,6 +1030,15 @@ def create_azure_base_url( if model is None: model = "" + # Handle FLUX 2 models on Azure AI which use a different URL pattern + # e.g., /providers/blackforestlabs/v1/flux-2-pro instead of /openai/deployments/{model}/images/generations + if AzureFoundryFluxImageGenerationConfig.is_flux2_model(model): + return AzureFoundryFluxImageGenerationConfig.get_flux2_image_generation_url( + api_base=api_base, + model=model, + api_version=api_version, + ) + if "/openai/deployments/" in api_base: base_url_with_deployment = api_base else: @@ -1011,7 +1052,7 @@ def create_azure_base_url( async def aimage_generation( self, data: dict, - model_response: ModelResponse, + model_response: Optional[ImageResponse], azure_client_params: dict, api_key: str, input: list, @@ -1019,7 +1060,8 @@ async def aimage_generation( headers: dict, client=None, timeout=None, - ) -> litellm.ImageResponse: + ) -> ImageResponse: + response: Optional[dict] = None try: # response = await azure_client.images.generate(**data, timeout=timeout) @@ -1052,21 +1094,38 @@ async def aimage_generation( data=data, headers=headers, ) - response = httpx_response.json() - stringified_response = response - ## LOGGING - logging_obj.post_call( - input=input, - api_key=api_key, - additional_args={"complete_input_dict": data}, - original_response=stringified_response, - ) - return convert_to_model_response_object( # type: ignore - response_object=stringified_response, - model_response_object=model_response, - response_type="image_generation", + provider_config = get_azure_image_generation_config( + data.get("model", "dall-e-2") ) + if provider_config is not None: + return provider_config.transform_image_generation_response( + model=data.get("model", "dall-e-2"), + raw_response=httpx_response, + model_response=model_response or ImageResponse(), + logging_obj=logging_obj, + request_data=data, + optional_params=data, + litellm_params=data, + encoding=litellm.encoding, + ) + + else: + response = httpx_response.json() + + stringified_response = response + ## LOGGING + logging_obj.post_call( + input=input, + api_key=api_key, + additional_args={"complete_input_dict": data}, + original_response=stringified_response, + ) + return convert_to_model_response_object( # type: ignore + response_object=stringified_response, + model_response_object=model_response, + response_type="image_generation", + ) except Exception as e: ## LOGGING logging_obj.post_call( @@ -1110,7 
+1169,11 @@ def image_generation( "base_model" ) - data = {"model": model, "prompt": prompt, **optional_params} + # Azure image generation API doesn't support extra_body parameter + extra_body = optional_params.pop("extra_body", {}) + flattened_params = {**optional_params, **extra_body} + + data = {"model": model, "prompt": prompt, **flattened_params} max_retries = data.pop("max_retries", 2) if not isinstance(max_retries, int): raise AzureOpenAIError( @@ -1120,9 +1183,7 @@ def image_generation( if api_key is None and azure_ad_token_provider is not None: azure_ad_token = azure_ad_token_provider() if azure_ad_token: - headers.pop( - "api-key", None - ) + headers.pop("api-key", None) headers["Authorization"] = f"Bearer {azure_ad_token}" # init AzureOpenAI Client @@ -1283,7 +1344,7 @@ def get_headers( prompt: Optional[str] = None, ) -> dict: client_session = litellm.client_session or httpx.Client() - if "gateway.ai.cloudflare.com" in api_base: + if api_base is not None and "gateway.ai.cloudflare.com" in api_base: ## build base url - assume api base includes resource name if not api_base.endswith("/"): api_base += "/" diff --git a/litellm/llms/azure/batches/handler.py b/litellm/llms/azure/batches/handler.py index 7fc6388ba874..aaefe8016878 100644 --- a/litellm/llms/azure/batches/handler.py +++ b/litellm/llms/azure/batches/handler.py @@ -5,10 +5,10 @@ from typing import Any, Coroutine, Optional, Union, cast import httpx +from openai import AsyncOpenAI, OpenAI from litellm.llms.azure.azure import AsyncAzureOpenAI, AzureOpenAI from litellm.types.llms.openai import ( - Batch, CancelBatchRequest, CreateBatchRequest, RetrieveBatchRequest, @@ -33,7 +33,7 @@ def __init__(self) -> None: async def acreate_batch( self, create_batch_data: CreateBatchRequest, - azure_client: AsyncAzureOpenAI, + azure_client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> LiteLLMBatch: response = await azure_client.batches.create(**create_batch_data) return LiteLLMBatch(**response.model_dump()) @@ -47,11 +47,11 @@ def create_batch( api_version: Optional[str], timeout: Union[float, httpx.Timeout], max_retries: Optional[int], - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ) -> Union[LiteLLMBatch, Coroutine[Any, Any, LiteLLMBatch]]: azure_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( api_key=api_key, api_base=api_base, @@ -66,20 +66,20 @@ def create_batch( ) if _is_async is True: - if not isinstance(azure_client, AsyncAzureOpenAI): + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client." 
) return self.acreate_batch( # type: ignore create_batch_data=create_batch_data, azure_client=azure_client ) - response = cast(AzureOpenAI, azure_client).batches.create(**create_batch_data) + response = cast(Union[AzureOpenAI, OpenAI], azure_client).batches.create(**create_batch_data) return LiteLLMBatch(**response.model_dump()) async def aretrieve_batch( self, retrieve_batch_data: RetrieveBatchRequest, - client: AsyncAzureOpenAI, + client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> LiteLLMBatch: response = await client.batches.retrieve(**retrieve_batch_data) return LiteLLMBatch(**response.model_dump()) @@ -93,11 +93,11 @@ def retrieve_batch( api_version: Optional[str], timeout: Union[float, httpx.Timeout], max_retries: Optional[int], - client: Optional[AzureOpenAI] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): azure_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( api_key=api_key, api_base=api_base, @@ -112,14 +112,14 @@ def retrieve_batch( ) if _is_async is True: - if not isinstance(azure_client, AsyncAzureOpenAI): + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client." ) return self.aretrieve_batch( # type: ignore retrieve_batch_data=retrieve_batch_data, client=azure_client ) - response = cast(AzureOpenAI, azure_client).batches.retrieve( + response = cast(Union[AzureOpenAI, OpenAI], azure_client).batches.retrieve( **retrieve_batch_data ) return LiteLLMBatch(**response.model_dump()) @@ -127,10 +127,10 @@ def retrieve_batch( async def acancel_batch( self, cancel_batch_data: CancelBatchRequest, - client: AsyncAzureOpenAI, - ) -> Batch: + client: Union[AsyncAzureOpenAI, AsyncOpenAI], + ) -> LiteLLMBatch: response = await client.batches.cancel(**cancel_batch_data) - return response + return LiteLLMBatch(**response.model_dump()) def cancel_batch( self, @@ -141,11 +141,11 @@ def cancel_batch( api_version: Optional[str], timeout: Union[float, httpx.Timeout], max_retries: Optional[int], - client: Optional[AzureOpenAI] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): azure_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( api_key=api_key, api_base=api_base, @@ -158,12 +158,27 @@ def cancel_batch( raise ValueError( "OpenAI client is not initialized. Make sure api_key is passed or OPENAI_API_KEY is set in the environment." ) + + if _is_async is True: + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): + raise ValueError( + "Azure client is not an instance of AsyncAzureOpenAI or AsyncOpenAI. Make sure you passed an async client." + ) + return self.acancel_batch( # type: ignore + cancel_batch_data=cancel_batch_data, client=azure_client + ) + + # At this point, azure_client is guaranteed to be a sync client + if not isinstance(azure_client, (AzureOpenAI, OpenAI)): + raise ValueError( + "Azure client is not an instance of AzureOpenAI or OpenAI. Make sure you passed a sync client." 
+ ) response = azure_client.batches.cancel(**cancel_batch_data) - return response + return LiteLLMBatch(**response.model_dump()) async def alist_batches( self, - client: AsyncAzureOpenAI, + client: Union[AsyncAzureOpenAI, AsyncOpenAI], after: Optional[str] = None, limit: Optional[int] = None, ): @@ -180,11 +195,11 @@ def list_batches( max_retries: Optional[int], after: Optional[str] = None, limit: Optional[int] = None, - client: Optional[AzureOpenAI] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): azure_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( api_key=api_key, api_base=api_base, @@ -199,7 +214,7 @@ def list_batches( ) if _is_async is True: - if not isinstance(azure_client, AsyncAzureOpenAI): + if not isinstance(azure_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "OpenAI client is not an instance of AsyncOpenAI. Make sure you passed an AsyncOpenAI client." ) diff --git a/litellm/llms/azure/chat/gpt_5_transformation.py b/litellm/llms/azure/chat/gpt_5_transformation.py index d563a2889ca6..eeb55911ecfa 100644 --- a/litellm/llms/azure/chat/gpt_5_transformation.py +++ b/litellm/llms/azure/chat/gpt_5_transformation.py @@ -2,6 +2,8 @@ from typing import List +import litellm +from litellm.exceptions import UnsupportedParamsError from litellm.llms.openai.chat.gpt_5_transformation import OpenAIGPT5Config from litellm.types.llms.openai import AllMessageValues @@ -20,10 +22,33 @@ def is_model_gpt_5_model(cls, model: str) -> bool: Accepts both explicit gpt-5 model names and the ``gpt5_series/`` prefix used for manual routing. """ - return "gpt-5" in model or "gpt5_series" in model + # gpt-5-chat* is a chat model and shouldn't go through GPT-5 reasoning restrictions. + return ("gpt-5" in model and "gpt-5-chat" not in model) or "gpt5_series" in model def get_supported_openai_params(self, model: str) -> List[str]: - return OpenAIGPT5Config.get_supported_openai_params(self, model=model) + """Get supported parameters for Azure OpenAI GPT-5 models. + + Azure OpenAI GPT-5.2 models support logprobs, unlike OpenAI's GPT-5. + This overrides the parent class to add logprobs support back for gpt-5.2. + + Reference: + - Tested with Azure OpenAI GPT-5.2 (api-version: 2025-01-01-preview) + - Azure returns logprobs successfully despite Microsoft's general + documentation stating reasoning models don't support it. + """ + params = OpenAIGPT5Config.get_supported_openai_params(self, model=model) + + # Azure supports tool_choice for GPT-5 deployments, but the base GPT-5 config + # can drop it when the deployment name isn't in the OpenAI model registry. 
+ if "tool_choice" not in params: + params.append("tool_choice") + + # Only gpt-5.2 has been verified to support logprobs on Azure + if self.is_model_gpt_5_2_model(model): + azure_supported_params = ["logprobs", "top_logprobs"] + params.extend(azure_supported_params) + + return params def map_openai_params( self, @@ -33,7 +58,38 @@ def map_openai_params( drop_params: bool, api_version: str = "", ) -> dict: - return OpenAIGPT5Config.map_openai_params( + reasoning_effort_value = ( + non_default_params.get("reasoning_effort") + or optional_params.get("reasoning_effort") + ) + + # gpt-5.1 supports reasoning_effort='none', but other gpt-5 models don't + # See: https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/reasoning + is_gpt_5_1 = self.is_model_gpt_5_1_model(model) + + if reasoning_effort_value == "none" and not is_gpt_5_1: + if litellm.drop_params is True or ( + drop_params is not None and drop_params is True + ): + non_default_params = non_default_params.copy() + optional_params = optional_params.copy() + if non_default_params.get("reasoning_effort") == "none": + non_default_params.pop("reasoning_effort") + if optional_params.get("reasoning_effort") == "none": + optional_params.pop("reasoning_effort") + else: + raise UnsupportedParamsError( + status_code=400, + message=( + "Azure OpenAI does not support reasoning_effort='none' for this model. " + "Supported values are: 'low', 'medium', and 'high'. " + "To drop this parameter, set `litellm.drop_params=True` or for proxy:\n\n" + "`litellm_settings:\n drop_params: true`\n" + "Issue: https://github.com/BerriAI/litellm/issues/16704" + ), + ) + + result = OpenAIGPT5Config.map_openai_params( self, non_default_params=non_default_params, optional_params=optional_params, @@ -41,6 +97,12 @@ def map_openai_params( drop_params=drop_params, ) + # Only drop reasoning_effort='none' for non-gpt-5.1 models + if result.get("reasoning_effort") == "none" and not is_gpt_5_1: + result.pop("reasoning_effort") + + return result + def transform_request( self, model: str, diff --git a/litellm/llms/azure/common_utils.py b/litellm/llms/azure/common_utils.py index 4f5daca4564a..c06760f4ae5a 100644 --- a/litellm/llms/azure/common_utils.py +++ b/litellm/llms/azure/common_utils.py @@ -4,7 +4,7 @@ from typing import Any, Callable, Dict, Literal, Optional, Union, cast import httpx -from openai import AsyncAzureOpenAI, AzureOpenAI +from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI import litellm from litellm._logging import verbose_logger @@ -316,20 +316,18 @@ def get_azure_ad_token( Azure AD token as string if successful, None otherwise """ # Extract parameters + # Use `or` instead of default parameter to handle cases where key exists but value is None azure_ad_token_provider = litellm_params.get("azure_ad_token_provider") - azure_ad_token = litellm_params.get("azure_ad_token", None) or get_secret_str( + azure_ad_token = litellm_params.get("azure_ad_token") or get_secret_str( "AZURE_AD_TOKEN" ) - tenant_id = litellm_params.get("tenant_id", os.getenv("AZURE_TENANT_ID")) - client_id = litellm_params.get("client_id", os.getenv("AZURE_CLIENT_ID")) - client_secret = litellm_params.get( - "client_secret", os.getenv("AZURE_CLIENT_SECRET") - ) - azure_username = litellm_params.get("azure_username", os.getenv("AZURE_USERNAME")) - azure_password = litellm_params.get("azure_password", os.getenv("AZURE_PASSWORD")) - scope = litellm_params.get( - "azure_scope", - os.getenv("AZURE_SCOPE", "https://cognitiveservices.azure.com/.default"), + tenant_id = 
litellm_params.get("tenant_id") or os.getenv("AZURE_TENANT_ID") + client_id = litellm_params.get("client_id") or os.getenv("AZURE_CLIENT_ID") + client_secret = litellm_params.get("client_secret") or os.getenv("AZURE_CLIENT_SECRET") + azure_username = litellm_params.get("azure_username") or os.getenv("AZURE_USERNAME") + azure_password = litellm_params.get("azure_password") or os.getenv("AZURE_PASSWORD") + scope = litellm_params.get("azure_scope") or os.getenv( + "AZURE_SCOPE", "https://cognitiveservices.azure.com/.default" ) if scope is None: scope = "https://cognitiveservices.azure.com/.default" @@ -463,12 +461,12 @@ def get_azure_openai_client( api_key: Optional[str], api_base: Optional[str], api_version: Optional[str] = None, - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, _is_async: bool = False, model: Optional[str] = None, - ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI]]: - openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None + ) -> Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]]: + openai_client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None client_initialization_params: dict = locals() client_initialization_params["is_async"] = _is_async if client is None: @@ -477,9 +475,7 @@ def get_azure_openai_client( client_type="azure", ) if cached_client: - if isinstance(cached_client, AzureOpenAI) or isinstance( - cached_client, AsyncAzureOpenAI - ): + if isinstance(cached_client, (AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI)): return cached_client azure_client_params = self.initialize_azure_sdk_client( @@ -490,15 +486,40 @@ def get_azure_openai_client( api_version=api_version, is_async=_is_async, ) - if _is_async is True: - openai_client = AsyncAzureOpenAI(**azure_client_params) + + # For Azure v1 API, use standard OpenAI client instead of AzureOpenAI + # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#api-specs + if self._is_azure_v1_api_version(api_version): + # Extract only params that OpenAI client accepts + # Always use /openai/v1/ regardless of whether user passed "v1", "latest", or "preview" + v1_params = { + "api_key": azure_client_params.get("api_key"), + "base_url": f"{api_base}/openai/v1/", + } + if "timeout" in azure_client_params: + v1_params["timeout"] = azure_client_params["timeout"] + if "max_retries" in azure_client_params: + v1_params["max_retries"] = azure_client_params["max_retries"] + if "http_client" in azure_client_params: + v1_params["http_client"] = azure_client_params["http_client"] + + verbose_logger.debug(f"Using Azure v1 API with base_url: {v1_params['base_url']}") + + if _is_async is True: + openai_client = AsyncOpenAI(**v1_params) # type: ignore + else: + openai_client = OpenAI(**v1_params) # type: ignore else: - openai_client = AzureOpenAI(**azure_client_params) # type: ignore + # Traditional Azure API uses AzureOpenAI client + if _is_async is True: + openai_client = AsyncAzureOpenAI(**azure_client_params) + else: + openai_client = AzureOpenAI(**azure_client_params) # type: ignore else: openai_client = client if api_version is not None and isinstance( - openai_client._custom_query, dict - ): + openai_client, (AzureOpenAI, AsyncAzureOpenAI) + ) and isinstance(openai_client._custom_query, dict): # set api_version to version passed by user openai_client._custom_query.setdefault("api-version", api_version) @@ -522,23 
+543,18 @@ def initialize_azure_sdk_client( azure_ad_token_provider = litellm_params.get("azure_ad_token_provider") # If we have api_key, then we have higher priority azure_ad_token = litellm_params.get("azure_ad_token") - tenant_id = litellm_params.get("tenant_id", os.getenv("AZURE_TENANT_ID")) - client_id = litellm_params.get("client_id", os.getenv("AZURE_CLIENT_ID")) - client_secret = litellm_params.get( - "client_secret", os.getenv("AZURE_CLIENT_SECRET") - ) - azure_username = litellm_params.get( - "azure_username", os.getenv("AZURE_USERNAME") - ) - azure_password = litellm_params.get( - "azure_password", os.getenv("AZURE_PASSWORD") - ) - scope = litellm_params.get( - "azure_scope", - os.getenv("AZURE_SCOPE", "https://cognitiveservices.azure.com/.default"), - ) + + # litellm_params sometimes contains the key, but the value is None + # We should respect environment variables in this case + tenant_id = self._resolve_env_var(litellm_params, "tenant_id", "AZURE_TENANT_ID") + client_id = self._resolve_env_var(litellm_params, "client_id", "AZURE_CLIENT_ID") + client_secret = self._resolve_env_var(litellm_params, "client_secret", "AZURE_CLIENT_SECRET") + azure_username = self._resolve_env_var(litellm_params, "azure_username", "AZURE_USERNAME") + azure_password = self._resolve_env_var(litellm_params, "azure_password", "AZURE_PASSWORD") + scope = self._resolve_env_var(litellm_params, "azure_scope", "AZURE_SCOPE") if scope is None: scope = "https://cognitiveservices.azure.com/.default" + max_retries = litellm_params.get("max_retries") timeout = litellm_params.get("timeout") if ( @@ -782,3 +798,16 @@ def _is_azure_v1_api_version(api_version: Optional[str]) -> bool: if api_version is None: return False return api_version in {"preview", "latest", "v1"} + + def _resolve_env_var(self, litellm_params: Dict[str, Any], param_key: str, env_var_key: str) -> Optional[str]: + """Resolve the environment variable for a given parameter key. + + The logic here is different from `params.get(key, os.getenv(env_var))` because + litellm_params may contain the key with a None value, in which case we want + to fallback to the environment variable. 
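+
+        Example (illustrative):
+            # litellm_params = {"tenant_id": None} -> falls back to os.getenv("AZURE_TENANT_ID")
+            tenant_id = self._resolve_env_var(litellm_params, "tenant_id", "AZURE_TENANT_ID")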
+ """ + param_value = litellm_params.get(param_key) + if param_value is not None: + return param_value + return os.getenv(env_var_key) + diff --git a/litellm/llms/azure/cost_calculation.py b/litellm/llms/azure/cost_calculation.py index 96c58d95ff22..5b411095ea15 100644 --- a/litellm/llms/azure/cost_calculation.py +++ b/litellm/llms/azure/cost_calculation.py @@ -1,11 +1,12 @@ """ Helper util for handling azure openai-specific cost calculation -- e.g.: prompt caching +- e.g.: prompt caching, audio tokens """ from typing import Optional, Tuple from litellm._logging import verbose_logger +from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token from litellm.types.utils import Usage from litellm.utils import get_model_info @@ -18,34 +19,15 @@ def cost_per_token( Input: - model: str, the model name without provider prefix - - usage: LiteLLM Usage block, containing anthropic caching information + - usage: LiteLLM Usage block, containing caching and audio token information Returns: Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd """ ## GET MODEL INFO model_info = get_model_info(model=model, custom_llm_provider="azure") - cached_tokens: Optional[int] = None - ## CALCULATE INPUT COST - non_cached_text_tokens = usage.prompt_tokens - if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens: - cached_tokens = usage.prompt_tokens_details.cached_tokens - non_cached_text_tokens = non_cached_text_tokens - cached_tokens - prompt_cost: float = non_cached_text_tokens * model_info["input_cost_per_token"] - - ## CALCULATE OUTPUT COST - completion_cost: float = ( - usage["completion_tokens"] * model_info["output_cost_per_token"] - ) - - ## Prompt Caching cost calculation - if model_info.get("cache_read_input_token_cost") is not None and cached_tokens: - # Note: We read ._cache_read_input_tokens from the Usage - since cost_calculator.py standardizes the cache read tokens on usage._cache_read_input_tokens - prompt_cost += cached_tokens * ( - model_info.get("cache_read_input_token_cost", 0) or 0 - ) - ## Speech / Audio cost calculation + ## Speech / Audio cost calculation (cost per second for TTS models) if ( "output_cost_per_second" in model_info and model_info["output_cost_per_second"] is not None @@ -55,7 +37,14 @@ def cost_per_token( f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}" ) ## COST PER SECOND ## - prompt_cost = 0 + prompt_cost = 0.0 completion_cost = model_info["output_cost_per_second"] * response_time_ms / 1000 - - return prompt_cost, completion_cost + return prompt_cost, completion_cost + + ## Use generic cost calculator for all other cases + ## This properly handles: text tokens, audio tokens, cached tokens, reasoning tokens, etc. 
+ return generic_cost_per_token( + model=model, + usage=usage, + custom_llm_provider="azure", + ) diff --git a/litellm/llms/azure/exception_mapping.py b/litellm/llms/azure/exception_mapping.py new file mode 100644 index 000000000000..bcccad9352f5 --- /dev/null +++ b/litellm/llms/azure/exception_mapping.py @@ -0,0 +1,93 @@ +from typing import Any, Dict, Optional, Tuple + +from litellm.exceptions import ContentPolicyViolationError + + +class AzureOpenAIExceptionMapping: + """ + Class for creating Azure OpenAI specific exceptions + """ + + @staticmethod + def create_content_policy_violation_error( + message: str, + model: str, + extra_information: str, + original_exception: Exception, + ) -> ContentPolicyViolationError: + """ + Create a content policy violation error + """ + azure_error, inner_error = AzureOpenAIExceptionMapping._extract_azure_error( + original_exception + ) + + # Prefer the provider message/type/code when present. + provider_message = ( + azure_error.get("message") + if isinstance(azure_error, dict) + else None + ) or message + provider_type = ( + azure_error.get("type") if isinstance(azure_error, dict) else None + ) + provider_code = ( + azure_error.get("code") if isinstance(azure_error, dict) else None + ) + + # Keep the OpenAI-style body fields populated so downstream (proxy + SDK) + # can surface `type` / `code` correctly. + openai_style_body: Dict[str, Any] = { + "message": provider_message, + "type": provider_type or "invalid_request_error", + "code": provider_code or "content_policy_violation", + "param": None, + } + + raise ContentPolicyViolationError( + message=provider_message, + llm_provider="azure", + model=model, + litellm_debug_info=extra_information, + response=getattr(original_exception, "response", None), + provider_specific_fields={ + # Preserve legacy key for backward compatibility. + "innererror": inner_error, + # Prefer Azure's current naming. + "inner_error": inner_error, + # Include the full Azure error object for clients that want it. + "azure_error": azure_error or None, + }, + body=openai_style_body, + ) + + @staticmethod + def _extract_azure_error( + original_exception: Exception, + ) -> Tuple[Dict[str, Any], Optional[dict]]: + """Extract Azure OpenAI error payload and inner error details. + + Azure error formats can vary by endpoint/version. Common shapes: + - {"innererror": {...}} (legacy) + - {"error": {"code": "...", "message": "...", "type": "...", "inner_error": {...}}} + - {"code": "...", "message": "...", "type": "..."} (already flattened) + """ + body_dict = getattr(original_exception, "body", None) or {} + if not isinstance(body_dict, dict): + return {}, None + + # Some SDKs place the payload under "error". 
+ azure_error: Dict[str, Any] + if isinstance(body_dict.get("error"), dict): + azure_error = body_dict.get("error", {}) # type: ignore[assignment] + else: + azure_error = body_dict + + inner_error = ( + azure_error.get("inner_error") + or azure_error.get("innererror") + or body_dict.get("innererror") + or body_dict.get("inner_error") + ) + + return azure_error, inner_error diff --git a/litellm/llms/azure/files/handler.py b/litellm/llms/azure/files/handler.py index 50c122ccf2c1..e53ced6b0e2e 100644 --- a/litellm/llms/azure/files/handler.py +++ b/litellm/llms/azure/files/handler.py @@ -1,7 +1,7 @@ from typing import Any, Coroutine, Optional, Union, cast import httpx -from openai import AsyncAzureOpenAI, AzureOpenAI +from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI from openai.types.file_deleted import FileDeleted from litellm._logging import verbose_logger @@ -24,13 +24,26 @@ class AzureOpenAIFilesAPI(BaseAzureLLM): def __init__(self) -> None: super().__init__() + @staticmethod + def _prepare_create_file_data(create_file_data: CreateFileRequest) -> dict[str, Any]: + """ + Prepare create_file_data for OpenAI SDK. + + Removes expires_after if None to match SDK's Omit pattern. + SDK expects file_create_params.ExpiresAfter | Omit, but FileExpiresAfter works at runtime. + """ + data = dict(create_file_data) + if data.get("expires_after") is None: + data.pop("expires_after", None) + return data + async def acreate_file( self, create_file_data: CreateFileRequest, - openai_client: AsyncAzureOpenAI, + openai_client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> OpenAIFileObject: verbose_logger.debug("create_file_data=%s", create_file_data) - response = await openai_client.files.create(**create_file_data) + response = await openai_client.files.create(**self._prepare_create_file_data(create_file_data)) # type: ignore[arg-type] verbose_logger.debug("create_file_response=%s", response) return OpenAIFileObject(**response.model_dump()) @@ -43,11 +56,11 @@ def create_file( api_version: Optional[str], timeout: Union[float, httpx.Timeout], max_retries: Optional[int], - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ) -> Union[OpenAIFileObject, Coroutine[Any, Any, OpenAIFileObject]]: openai_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( litellm_params=litellm_params or {}, api_key=api_key, @@ -62,20 +75,20 @@ def create_file( ) if _is_async is True: - if not isinstance(openai_client, AsyncAzureOpenAI): + if not isinstance(openai_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client." 
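The extraction below has to cope with the several Azure error payload shapes listed in the docstring. A self-contained sketch of how the same flattening logic behaves on each shape (the sample payloads are invented for illustration):

```python
from typing import Any, Dict, Optional, Tuple


def extract_azure_error(body: Any) -> Tuple[Dict[str, Any], Optional[dict]]:
    """Unwrap the 'error' object if present, then look for inner error details under either key."""
    if not isinstance(body, dict):
        return {}, None
    azure_error = body["error"] if isinstance(body.get("error"), dict) else body
    inner_error = (
        azure_error.get("inner_error")
        or azure_error.get("innererror")
        or body.get("innererror")
        or body.get("inner_error")
    )
    return azure_error, inner_error


legacy = {"innererror": {"code": "ResponsibleAIPolicyViolation"}}
wrapped = {"error": {"code": "content_filter", "message": "filtered", "inner_error": {"code": "jailbreak"}}}
flat = {"code": "content_filter", "message": "filtered"}

for payload in (legacy, wrapped, flat):
    print(extract_azure_error(payload))
```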
) return self.acreate_file( create_file_data=create_file_data, openai_client=openai_client ) - response = cast(AzureOpenAI, openai_client).files.create(**create_file_data) + response = cast(Union[AzureOpenAI, OpenAI], openai_client).files.create(**self._prepare_create_file_data(create_file_data)) # type: ignore[arg-type] return OpenAIFileObject(**response.model_dump()) async def afile_content( self, file_content_request: FileContentRequest, - openai_client: AsyncAzureOpenAI, + openai_client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> HttpxBinaryResponseContent: response = await openai_client.files.content(**file_content_request) return HttpxBinaryResponseContent(response=response.response) @@ -89,13 +102,13 @@ def file_content( timeout: Union[float, httpx.Timeout], max_retries: Optional[int], api_version: Optional[str] = None, - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ) -> Union[ HttpxBinaryResponseContent, Coroutine[Any, Any, HttpxBinaryResponseContent] ]: openai_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( litellm_params=litellm_params or {}, api_key=api_key, @@ -110,7 +123,7 @@ def file_content( ) if _is_async is True: - if not isinstance(openai_client, AsyncAzureOpenAI): + if not isinstance(openai_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client." ) @@ -118,7 +131,7 @@ def file_content( file_content_request=file_content_request, openai_client=openai_client, ) - response = cast(AzureOpenAI, openai_client).files.content( + response = cast(Union[AzureOpenAI, OpenAI], openai_client).files.content( **file_content_request ) @@ -127,7 +140,7 @@ def file_content( async def aretrieve_file( self, file_id: str, - openai_client: AsyncAzureOpenAI, + openai_client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> FileObject: response = await openai_client.files.retrieve(file_id=file_id) return response @@ -141,11 +154,11 @@ def retrieve_file( timeout: Union[float, httpx.Timeout], max_retries: Optional[int], api_version: Optional[str] = None, - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): openai_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( litellm_params=litellm_params or {}, api_key=api_key, @@ -160,7 +173,7 @@ def retrieve_file( ) if _is_async is True: - if not isinstance(openai_client, AsyncAzureOpenAI): + if not isinstance(openai_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client." 
) @@ -175,7 +188,7 @@ def retrieve_file( async def adelete_file( self, file_id: str, - openai_client: AsyncAzureOpenAI, + openai_client: Union[AsyncAzureOpenAI, AsyncOpenAI], ) -> FileDeleted: response = await openai_client.files.delete(file_id=file_id) @@ -193,11 +206,11 @@ def delete_file( max_retries: Optional[int], organization: Optional[str] = None, api_version: Optional[str] = None, - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): openai_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( litellm_params=litellm_params or {}, api_key=api_key, @@ -212,7 +225,7 @@ def delete_file( ) if _is_async is True: - if not isinstance(openai_client, AsyncAzureOpenAI): + if not isinstance(openai_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client." ) @@ -229,7 +242,7 @@ def delete_file( async def alist_files( self, - openai_client: AsyncAzureOpenAI, + openai_client: Union[AsyncAzureOpenAI, AsyncOpenAI], purpose: Optional[str] = None, ): if isinstance(purpose, str): @@ -247,11 +260,11 @@ def list_files( max_retries: Optional[int], purpose: Optional[str] = None, api_version: Optional[str] = None, - client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI]] = None, + client: Optional[Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI]] = None, litellm_params: Optional[dict] = None, ): openai_client: Optional[ - Union[AzureOpenAI, AsyncAzureOpenAI] + Union[AzureOpenAI, AsyncAzureOpenAI, OpenAI, AsyncOpenAI] ] = self.get_azure_openai_client( litellm_params=litellm_params or {}, api_key=api_key, @@ -266,7 +279,7 @@ def list_files( ) if _is_async is True: - if not isinstance(openai_client, AsyncAzureOpenAI): + if not isinstance(openai_client, (AsyncAzureOpenAI, AsyncOpenAI)): raise ValueError( "AzureOpenAI client is not an instance of AsyncAzureOpenAI. Make sure you passed an AsyncAzureOpenAI client." ) diff --git a/litellm/llms/azure/realtime/handler.py b/litellm/llms/azure/realtime/handler.py index 23c04e640c43..e533978e07a5 100644 --- a/litellm/llms/azure/realtime/handler.py +++ b/litellm/llms/azure/realtime/handler.py @@ -10,7 +10,9 @@ from ....litellm_core_utils.litellm_logging import Logging as LiteLLMLogging from ....litellm_core_utils.realtime_streaming import RealTimeStreaming +from ....llms.custom_httpx.http_handler import get_shared_realtime_ssl_context from ..azure import AzureChatCompletion +from litellm._logging import verbose_proxy_logger # BACKEND_WS_URL = "ws://localhost:8080/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01" @@ -27,16 +29,41 @@ async def forward_messages(client_ws: Any, backend_ws: Any): class AzureOpenAIRealtime(AzureChatCompletion): - def _construct_url(self, api_base: str, model: str, api_version: str) -> str: + def _construct_url( + self, + api_base: str, + model: str, + api_version: str, + realtime_protocol: Optional[str] = None, + ) -> str: """ - Example output: - "wss://my-endpoint-sweden-berri992.openai.azure.com/openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview"; - + Construct Azure realtime WebSocket URL. 
+ + Args: + api_base: Azure API base URL (will be converted from https:// to wss://) + model: Model deployment name + api_version: Azure API version + realtime_protocol: Protocol version to use: + - "GA" or "v1": Uses /openai/v1/realtime (GA path) + - "beta" or None: Uses /openai/realtime (beta path, default) + + Returns: + WebSocket URL string + + Examples: + beta/default: "wss://.../openai/realtime?api-version=2024-10-01-preview&deployment=gpt-4o-realtime-preview" + GA/v1: "wss://.../openai/v1/realtime?model=gpt-realtime-deployment" """ api_base = api_base.replace("https://", "wss://") - return ( - f"{api_base}/openai/realtime?api-version={api_version}&deployment={model}" - ) + + # Determine path based on realtime_protocol + if realtime_protocol in ("GA", "v1"): + path = "/openai/v1/realtime" + return f"{api_base}{path}?model={model}" + else: + # Default to beta path for backwards compatibility + path = "/openai/realtime" + return f"{api_base}{path}?api-version={api_version}&deployment={model}" async def async_realtime( self, @@ -49,6 +76,7 @@ async def async_realtime( azure_ad_token: Optional[str] = None, client: Optional[Any] = None, timeout: Optional[float] = None, + realtime_protocol: Optional[str] = None, ): import websockets from websockets.asyncio.client import ClientConnection @@ -58,15 +86,19 @@ async def async_realtime( if api_version is None: raise ValueError("api_version is required for Azure OpenAI calls") - url = self._construct_url(api_base, model, api_version) + url = self._construct_url( + api_base, model, api_version, realtime_protocol=realtime_protocol + ) try: + ssl_context = get_shared_realtime_ssl_context() async with websockets.connect( # type: ignore url, - extra_headers={ + additional_headers={ "api-key": api_key, # type: ignore }, max_size=REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES, + ssl=ssl_context, ) as backend_ws: realtime_streaming = RealTimeStreaming( websocket, cast(ClientConnection, backend_ws), logging_obj @@ -76,4 +108,5 @@ async def async_realtime( except websockets.exceptions.InvalidStatusCode as e: # type: ignore await websocket.close(code=e.status_code, reason=str(e)) except Exception: + verbose_proxy_logger.exception("Error in AzureOpenAIRealtime.async_realtime") pass diff --git a/litellm/llms/azure/responses/transformation.py b/litellm/llms/azure/responses/transformation.py index f8d44044973c..44ce368fd490 100644 --- a/litellm/llms/azure/responses/transformation.py +++ b/litellm/llms/azure/responses/transformation.py @@ -1,4 +1,5 @@ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union +from copy import deepcopy import httpx from openai.types.responses import ResponseReasoningItem @@ -43,7 +44,7 @@ def _handle_reasoning_item(self, item: Dict[str, Any]) -> Dict[str, Any]: """ Handle reasoning items to filter out the status field. Issue: https://github.com/BerriAI/litellm/issues/13484 - + Azure OpenAI API does not accept 'status' field in reasoning input items. 
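The filtering described above amounts to dropping the `status` key from reasoning input items before forwarding them to Azure. A simplified sketch of the idea, using an invented item (the real method rebuilds the item field by field):

```python
def strip_status_from_reasoning_item(item: dict) -> dict:
    """Drop 'status' from reasoning items; leave all other items untouched."""
    if item.get("type") == "reasoning":
        return {k: v for k, v in item.items() if k != "status"}
    return item


item = {"type": "reasoning", "id": "rs_123", "summary": [], "status": "completed"}
print(strip_status_from_reasoning_item(item))
# -> {'type': 'reasoning', 'id': 'rs_123', 'summary': []}
```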
""" if item.get("type") == "reasoning": @@ -78,7 +79,7 @@ def _handle_reasoning_item(self, item: Dict[str, Any]) -> Dict[str, Any]: } return filtered_item return item - + def _validate_input_param( self, input: Union[str, ResponseInputParam] ) -> Union[str, ResponseInputParam]: @@ -90,7 +91,7 @@ def _validate_input_param( # First call parent's validation validated_input = super()._validate_input_param(input) - + # Then filter out status from message items if isinstance(validated_input, list): filtered_input: List[Any] = [] @@ -102,7 +103,7 @@ def _validate_input_param( else: filtered_input.append(item) return cast(ResponseInputParam, filtered_input) - + return validated_input def transform_responses_api_request( @@ -116,6 +117,21 @@ def transform_responses_api_request( """No transform applied since inputs are in OpenAI spec already""" stripped_model_name = self.get_stripped_model_name(model) + # Azure Responses API requires flattened tools (params at top level, not nested in 'function') + if "tools" in response_api_optional_request_params and isinstance( + response_api_optional_request_params["tools"], list + ): + new_tools: List[Dict[str, Any]] = [] + for tool in response_api_optional_request_params["tools"]: + if isinstance(tool, dict) and "function" in tool: + new_tool: Dict[str, Any] = deepcopy(tool) + function_data = new_tool.pop("function") + new_tool.update(function_data) + new_tools.append(new_tool) + else: + new_tools.append(tool) + response_api_optional_request_params["tools"] = new_tools + return super().transform_responses_api_request( model=stripped_model_name, input=input, @@ -137,31 +153,16 @@ def get_complete_url( "https://litellm8397336933.openai.azure.com" OR "https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview" - OR (configured for Chat Completions - will be sanitized) - "https://litellm8397336933.openai.azure.com/openai/deployments/gpt-4o/chat/completions" - - litellm_params: LiteLLM parameters including api_version. + - model: Model name. + - optional_params: Additional query parameters, including "api_version". + - stream: If streaming is required (optional). Returns: - A complete URL string, e.g., "https://litellm8397336933.openai.azure.com/openai/responses?api-version=2024-05-01-preview" - - Note: Unlike Chat Completions API which uses /openai/deployments/{deployment-id}/chat/completions, - the Responses API uses /openai/responses and the model is specified in the request body. - This method strips any deployment-specific paths from api_base. """ - import re - from litellm.constants import AZURE_DEFAULT_RESPONSES_API_VERSION - # Sanitize api_base: strip deployment-specific paths - # The Responses API uses /openai/responses, not /openai/deployments/{name}/... 
- # The model is specified in the request body, not in the URL path - if api_base: - # Pattern matches /openai/deployments/{name} with optional suffix - # e.g., /openai/deployments/gpt-4o or /openai/deployments/gpt-4o/chat/completions - api_base = re.sub(r"/openai/deployments/[^/]+(/.*)?$", "", api_base) - api_base = api_base.rstrip("/") - return BaseAzureLLM._get_base_azure_url( api_base=api_base, litellm_params=litellm_params, diff --git a/litellm/llms/azure/text_to_speech/transformation.py b/litellm/llms/azure/text_to_speech/transformation.py index 0f8911ac2b8a..df582c3c09b8 100644 --- a/litellm/llms/azure/text_to_speech/transformation.py +++ b/litellm/llms/azure/text_to_speech/transformation.py @@ -382,6 +382,15 @@ def _build_tts_url(self, region: str) -> str: return f"https://{region}.{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}" return f"https://{self.TTS_SPEECH_DOMAIN}{self.TTS_ENDPOINT_PATH}" + + def is_ssml_input(self, input: str) -> bool: + """ + Returns True if input is SSML, False otherwise + + Based on https://www.w3.org/TR/speech-synthesis/ all SSML must start with the <speak> element + """ + return "<speak>" in input or "<speak " in input + If input is SSML (starts with <speak>), it's passed through as-is without transformation + Returns: TextToSpeechRequestData: Contains SSML body and Azure-specific headers """ @@ -414,7 +426,15 @@ def transform_text_to_speech_request( ) headers["X-Microsoft-OutputFormat"] = output_format - # Build SSML + # Auto-detect SSML: if input contains <speak>, pass it through as-is + # Similar to Vertex AI behavior - check if input looks like SSML + if self.is_ssml_input(input=input): + return TextToSpeechRequestData( + ssml_body=input, + headers=headers, + ) + + # Build SSML from plain text rate = optional_params.get("rate", "0%") style = optional_params.get("style") styledegree = optional_params.get("styledegree") diff --git a/litellm/llms/azure/videos/transformation.py b/litellm/llms/azure/videos/transformation.py index 3af9e0778bc5..a6fbd8cef8b5 100644 --- a/litellm/llms/azure/videos/transformation.py +++ b/litellm/llms/azure/videos/transformation.py @@ -1,9 +1,8 @@ from typing import TYPE_CHECKING, Any, Dict, Optional from litellm.types.videos.main import VideoCreateOptionalRequestParams -from litellm.secret_managers.main import get_secret_str +from litellm.types.router import GenericLiteLLMParams from litellm.llms.azure.common_utils import BaseAzureLLM -import litellm from litellm.llms.openai.videos.transformation import OpenAIVideoConfig if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj @@ -56,21 +55,26 @@ def validate_environment( headers: dict, model: str, api_key: Optional[str] = None, + litellm_params: Optional[GenericLiteLLMParams] = None, ) -> dict: - api_key = ( - api_key - or litellm.api_key - or litellm.azure_key - or get_secret_str("AZURE_OPENAI_API_KEY") - or get_secret_str("AZURE_API_KEY") - ) - - headers.update( - { - "Authorization": f"Bearer {api_key}", - } + """ + Validate Azure environment and set up authentication headers. + Uses _base_validate_azure_environment to properly handle credentials from litellm_credential_name. + """ + # If litellm_params is provided, use it; otherwise create a new one + if litellm_params is None: + litellm_params = GenericLiteLLMParams() + + if api_key and not litellm_params.api_key: + litellm_params.api_key = api_key + + # Use the base Azure validation method which properly handles: + # 1. Credentials from litellm_credential_name via litellm_params + # 2.
Sets the correct "api-key" header (not "Authorization: Bearer") + return BaseAzureLLM._base_validate_azure_environment( + headers=headers, + litellm_params=litellm_params ) - return headers def get_complete_url( self, diff --git a/litellm/llms/azure_ai/agents/__init__.py b/litellm/llms/azure_ai/agents/__init__.py new file mode 100644 index 000000000000..2553c21723c9 --- /dev/null +++ b/litellm/llms/azure_ai/agents/__init__.py @@ -0,0 +1,11 @@ +from litellm.llms.azure_ai.agents.handler import azure_ai_agents_handler +from litellm.llms.azure_ai.agents.transformation import ( + AzureAIAgentsConfig, + AzureAIAgentsError, +) + +__all__ = [ + "AzureAIAgentsConfig", + "AzureAIAgentsError", + "azure_ai_agents_handler", +] diff --git a/litellm/llms/azure_ai/agents/handler.py b/litellm/llms/azure_ai/agents/handler.py new file mode 100644 index 000000000000..379dc1e1c55a --- /dev/null +++ b/litellm/llms/azure_ai/agents/handler.py @@ -0,0 +1,558 @@ +""" +Handler for Azure Foundry Agent Service API. + +This handler executes the multi-step agent flow: +1. Create thread (or use existing) +2. Add messages to thread +3. Create and poll a run +4. Retrieve the assistant's response messages + +Model format: azure_ai/agents/ +API Base format: https://.services.ai.azure.com/api/projects/ + +Authentication: Uses Azure AD Bearer tokens (not API keys) + Get token via: az account get-access-token --resource 'https://ai.azure.com' + +Supports both polling-based and native streaming (SSE) modes. + +See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart +""" + +import asyncio +import json +import time +import uuid +from typing import ( + TYPE_CHECKING, + Any, + AsyncIterator, + Callable, + Dict, + List, + Optional, + Tuple, +) + +import httpx + +from litellm._logging import verbose_logger +from litellm.llms.azure_ai.agents.transformation import ( + AzureAIAgentsConfig, + AzureAIAgentsError, +) +from litellm.types.utils import ModelResponse + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + HTTPHandler = Any + AsyncHTTPHandler = Any + + +class AzureAIAgentsHandler: + """ + Handler for Azure AI Agent Service. + + Executes the complete agent flow which requires multiple API calls. + """ + + def __init__(self): + self.config = AzureAIAgentsConfig() + + # ------------------------------------------------------------------------- + # URL Builders + # ------------------------------------------------------------------------- + # Azure Foundry Agents API uses /assistants, /threads, etc. 
directly + # See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart + # ------------------------------------------------------------------------- + def _build_thread_url(self, api_base: str, api_version: str) -> str: + return f"{api_base}/threads?api-version={api_version}" + + def _build_messages_url(self, api_base: str, thread_id: str, api_version: str) -> str: + return f"{api_base}/threads/{thread_id}/messages?api-version={api_version}" + + def _build_runs_url(self, api_base: str, thread_id: str, api_version: str) -> str: + return f"{api_base}/threads/{thread_id}/runs?api-version={api_version}" + + def _build_run_status_url(self, api_base: str, thread_id: str, run_id: str, api_version: str) -> str: + return f"{api_base}/threads/{thread_id}/runs/{run_id}?api-version={api_version}" + + def _build_list_messages_url(self, api_base: str, thread_id: str, api_version: str) -> str: + return f"{api_base}/threads/{thread_id}/messages?api-version={api_version}" + + def _build_create_thread_and_run_url(self, api_base: str, api_version: str) -> str: + """URL for the create-thread-and-run endpoint (supports streaming).""" + return f"{api_base}/threads/runs?api-version={api_version}" + + # ------------------------------------------------------------------------- + # Response Helpers + # ------------------------------------------------------------------------- + def _extract_content_from_messages(self, messages_data: dict) -> str: + """Extract assistant content from the messages response.""" + for msg in messages_data.get("data", []): + if msg.get("role") == "assistant": + for content_item in msg.get("content", []): + if content_item.get("type") == "text": + return content_item.get("text", {}).get("value", "") + return "" + + def _build_model_response( + self, + model: str, + content: str, + model_response: ModelResponse, + thread_id: str, + messages: List[Dict[str, Any]], + ) -> ModelResponse: + """Build the ModelResponse from agent output.""" + from litellm.types.utils import Choices, Message, Usage + + model_response.choices = [ + Choices(finish_reason="stop", index=0, message=Message(content=content, role="assistant")) + ] + model_response.model = model + + # Store thread_id for conversation continuity + if not hasattr(model_response, "_hidden_params") or model_response._hidden_params is None: + model_response._hidden_params = {} + model_response._hidden_params["thread_id"] = thread_id + + # Estimate token usage + try: + from litellm.utils import token_counter + + prompt_tokens = token_counter(model="gpt-3.5-turbo", messages=messages) + completion_tokens = token_counter(model="gpt-3.5-turbo", text=content, count_response_tokens=True) + setattr( + model_response, + "usage", + Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ), + ) + except Exception as e: + verbose_logger.warning(f"Failed to calculate token usage: {str(e)}") + + return model_response + + def _prepare_completion_params( + self, + model: str, + api_base: str, + api_key: str, + optional_params: dict, + headers: Optional[dict], + ) -> tuple: + """Prepare common parameters for completion. 
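The URL builders above all follow the same pattern against the project endpoint. A quick sketch of the URLs they produce, using an invented project base URL and invented thread/run ids:

```python
api_base = "https://example-resource.services.ai.azure.com/api/projects/example-project"  # invented
api_version = "2025-05-01"
thread_id, run_id = "thread_abc", "run_xyz"

print(f"{api_base}/threads?api-version={api_version}")                            # create thread
print(f"{api_base}/threads/{thread_id}/messages?api-version={api_version}")       # add / list messages
print(f"{api_base}/threads/{thread_id}/runs?api-version={api_version}")           # create run
print(f"{api_base}/threads/{thread_id}/runs/{run_id}?api-version={api_version}")  # poll run status
print(f"{api_base}/threads/runs?api-version={api_version}")                       # create-thread-and-run (streaming)
```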
+ + Azure Foundry Agents API uses Bearer token authentication: + - Authorization: Bearer (Azure AD token from 'az account get-access-token --resource https://ai.azure.com') + + See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart + """ + if headers is None: + headers = {} + headers["Content-Type"] = "application/json" + + # Azure Foundry Agents uses Bearer token authentication + # The api_key here is expected to be an Azure AD token + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + api_version = optional_params.get("api_version", self.config.DEFAULT_API_VERSION) + agent_id = self.config._get_agent_id(model, optional_params) + thread_id = optional_params.get("thread_id") + api_base = api_base.rstrip("/") + + verbose_logger.debug(f"Azure AI Agents completion - api_base: {api_base}, agent_id: {agent_id}") + + return headers, api_version, agent_id, thread_id, api_base + + def _check_response(self, response: httpx.Response, expected_codes: List[int], error_msg: str): + """Check response status and raise error if not expected.""" + if response.status_code not in expected_codes: + raise AzureAIAgentsError(status_code=response.status_code, message=f"{error_msg}: {response.text}") + + # ------------------------------------------------------------------------- + # Sync Completion + # ------------------------------------------------------------------------- + def completion( + self, + model: str, + messages: List[Dict[str, Any]], + api_base: str, + api_key: str, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + optional_params: dict, + litellm_params: dict, + timeout: float, + client: Optional[HTTPHandler] = None, + headers: Optional[dict] = None, + ) -> ModelResponse: + """Execute synchronous completion using Azure Agent Service.""" + from litellm.llms.custom_httpx.http_handler import _get_httpx_client + + if client is None: + client = _get_httpx_client(params={"ssl_verify": litellm_params.get("ssl_verify", None)}) + + headers, api_version, agent_id, thread_id, api_base = self._prepare_completion_params( + model, api_base, api_key, optional_params, headers + ) + + def make_request(method: str, url: str, json_data: Optional[dict] = None) -> httpx.Response: + if method == "GET": + return client.get(url=url, headers=headers) + return client.post(url=url, headers=headers, data=json.dumps(json_data) if json_data else None) + + # Execute the agent flow + thread_id, content = self._execute_agent_flow_sync( + make_request=make_request, + api_base=api_base, + api_version=api_version, + agent_id=agent_id, + thread_id=thread_id, + messages=messages, + optional_params=optional_params, + ) + + return self._build_model_response(model, content, model_response, thread_id, messages) + + def _execute_agent_flow_sync( + self, + make_request: Callable, + api_base: str, + api_version: str, + agent_id: str, + thread_id: Optional[str], + messages: List[Dict[str, Any]], + optional_params: dict, + ) -> Tuple[str, str]: + """Execute the agent flow synchronously. 
Returns (thread_id, content).""" + + # Step 1: Create thread if not provided + if not thread_id: + verbose_logger.debug(f"Creating thread at: {self._build_thread_url(api_base, api_version)}") + response = make_request("POST", self._build_thread_url(api_base, api_version), {}) + self._check_response(response, [200, 201], "Failed to create thread") + thread_id = response.json()["id"] + verbose_logger.debug(f"Created thread: {thread_id}") + + # At this point thread_id is guaranteed to be a string + assert thread_id is not None + + # Step 2: Add messages to thread + for msg in messages: + if msg.get("role") in ["user", "system"]: + url = self._build_messages_url(api_base, thread_id, api_version) + response = make_request("POST", url, {"role": "user", "content": msg.get("content", "")}) + self._check_response(response, [200, 201], "Failed to add message") + + # Step 3: Create run + run_payload = {"assistant_id": agent_id} + if "instructions" in optional_params: + run_payload["instructions"] = optional_params["instructions"] + + response = make_request("POST", self._build_runs_url(api_base, thread_id, api_version), run_payload) + self._check_response(response, [200, 201], "Failed to create run") + run_id = response.json()["id"] + verbose_logger.debug(f"Created run: {run_id}") + + # Step 4: Poll for completion + status_url = self._build_run_status_url(api_base, thread_id, run_id, api_version) + for _ in range(self.config.MAX_POLL_ATTEMPTS): + response = make_request("GET", status_url) + self._check_response(response, [200], "Failed to get run status") + + status = response.json().get("status") + verbose_logger.debug(f"Run status: {status}") + + if status == "completed": + break + elif status in ["failed", "cancelled", "expired"]: + error_msg = response.json().get("last_error", {}).get("message", "Unknown error") + raise AzureAIAgentsError(status_code=500, message=f"Run {status}: {error_msg}") + + time.sleep(self.config.POLL_INTERVAL_SECONDS) + else: + raise AzureAIAgentsError(status_code=408, message="Run timed out waiting for completion") + + # Step 5: Get messages + response = make_request("GET", self._build_list_messages_url(api_base, thread_id, api_version)) + self._check_response(response, [200], "Failed to get messages") + + content = self._extract_content_from_messages(response.json()) + return thread_id, content + + # ------------------------------------------------------------------------- + # Async Completion + # ------------------------------------------------------------------------- + async def acompletion( + self, + model: str, + messages: List[Dict[str, Any]], + api_base: str, + api_key: str, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + optional_params: dict, + litellm_params: dict, + timeout: float, + client: Optional[AsyncHTTPHandler] = None, + headers: Optional[dict] = None, + ) -> ModelResponse: + """Execute asynchronous completion using Azure Agent Service.""" + import litellm + from litellm.llms.custom_httpx.http_handler import get_async_httpx_client + + if client is None: + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.AZURE_AI, + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + + headers, api_version, agent_id, thread_id, api_base = self._prepare_completion_params( + model, api_base, api_key, optional_params, headers + ) + + async def make_request(method: str, url: str, json_data: Optional[dict] = None) -> httpx.Response: + if method == "GET": + return await client.get(url=url, headers=headers) + return 
await client.post(url=url, headers=headers, data=json.dumps(json_data) if json_data else None) + + # Execute the agent flow + thread_id, content = await self._execute_agent_flow_async( + make_request=make_request, + api_base=api_base, + api_version=api_version, + agent_id=agent_id, + thread_id=thread_id, + messages=messages, + optional_params=optional_params, + ) + + return self._build_model_response(model, content, model_response, thread_id, messages) + + async def _execute_agent_flow_async( + self, + make_request: Callable, + api_base: str, + api_version: str, + agent_id: str, + thread_id: Optional[str], + messages: List[Dict[str, Any]], + optional_params: dict, + ) -> Tuple[str, str]: + """Execute the agent flow asynchronously. Returns (thread_id, content).""" + + # Step 1: Create thread if not provided + if not thread_id: + verbose_logger.debug(f"Creating thread at: {self._build_thread_url(api_base, api_version)}") + response = await make_request("POST", self._build_thread_url(api_base, api_version), {}) + self._check_response(response, [200, 201], "Failed to create thread") + thread_id = response.json()["id"] + verbose_logger.debug(f"Created thread: {thread_id}") + + # At this point thread_id is guaranteed to be a string + assert thread_id is not None + + # Step 2: Add messages to thread + for msg in messages: + if msg.get("role") in ["user", "system"]: + url = self._build_messages_url(api_base, thread_id, api_version) + response = await make_request("POST", url, {"role": "user", "content": msg.get("content", "")}) + self._check_response(response, [200, 201], "Failed to add message") + + # Step 3: Create run + run_payload = {"assistant_id": agent_id} + if "instructions" in optional_params: + run_payload["instructions"] = optional_params["instructions"] + + response = await make_request("POST", self._build_runs_url(api_base, thread_id, api_version), run_payload) + self._check_response(response, [200, 201], "Failed to create run") + run_id = response.json()["id"] + verbose_logger.debug(f"Created run: {run_id}") + + # Step 4: Poll for completion + status_url = self._build_run_status_url(api_base, thread_id, run_id, api_version) + for _ in range(self.config.MAX_POLL_ATTEMPTS): + response = await make_request("GET", status_url) + self._check_response(response, [200], "Failed to get run status") + + status = response.json().get("status") + verbose_logger.debug(f"Run status: {status}") + + if status == "completed": + break + elif status in ["failed", "cancelled", "expired"]: + error_msg = response.json().get("last_error", {}).get("message", "Unknown error") + raise AzureAIAgentsError(status_code=500, message=f"Run {status}: {error_msg}") + + await asyncio.sleep(self.config.POLL_INTERVAL_SECONDS) + else: + raise AzureAIAgentsError(status_code=408, message="Run timed out waiting for completion") + + # Step 5: Get messages + response = await make_request("GET", self._build_list_messages_url(api_base, thread_id, api_version)) + self._check_response(response, [200], "Failed to get messages") + + content = self._extract_content_from_messages(response.json()) + return thread_id, content + + # ------------------------------------------------------------------------- + # Streaming Completion (Native SSE) + # ------------------------------------------------------------------------- + async def acompletion_stream( + self, + model: str, + messages: List[Dict[str, Any]], + api_base: str, + api_key: str, + logging_obj: LiteLLMLoggingObj, + optional_params: dict, + litellm_params: dict, + timeout: float, + 
headers: Optional[dict] = None, + ) -> AsyncIterator: + """Execute async streaming completion using Azure Agent Service with native SSE.""" + import litellm + from litellm.llms.custom_httpx.http_handler import get_async_httpx_client + + headers, api_version, agent_id, thread_id, api_base = self._prepare_completion_params( + model, api_base, api_key, optional_params, headers + ) + + # Build payload for create-thread-and-run with streaming + thread_messages = [] + for msg in messages: + if msg.get("role") in ["user", "system"]: + thread_messages.append({ + "role": "user", + "content": msg.get("content", "") + }) + + payload: Dict[str, Any] = { + "assistant_id": agent_id, + "stream": True, + } + + # Add thread with messages if we don't have an existing thread + if not thread_id: + payload["thread"] = {"messages": thread_messages} + + if "instructions" in optional_params: + payload["instructions"] = optional_params["instructions"] + + url = self._build_create_thread_and_run_url(api_base, api_version) + verbose_logger.debug(f"Azure AI Agents streaming - URL: {url}") + + # Use LiteLLM's async HTTP client for streaming + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.AZURE_AI, + params={"ssl_verify": litellm_params.get("ssl_verify", None)}, + ) + + response = await client.post( + url=url, + headers=headers, + data=json.dumps(payload), + stream=True, + ) + + if response.status_code not in [200, 201]: + error_text = await response.aread() + raise AzureAIAgentsError( + status_code=response.status_code, + message=f"Streaming request failed: {error_text.decode()}" + ) + + async for chunk in self._process_sse_stream(response, model): + yield chunk + + async def _process_sse_stream( + self, + response: httpx.Response, + model: str, + ) -> AsyncIterator: + """Process SSE stream and yield OpenAI-compatible streaming chunks.""" + from litellm.types.utils import Delta, ModelResponseStream, StreamingChoices + + response_id = f"chatcmpl-{uuid.uuid4().hex[:8]}" + created = int(time.time()) + thread_id = None + + current_event = None + + async for line in response.aiter_lines(): + line = line.strip() + + if line.startswith("event:"): + current_event = line[6:].strip() + continue + + if line.startswith("data:"): + data_str = line[5:].strip() + + if data_str == "[DONE]": + # Send final chunk with finish_reason + final_chunk = ModelResponseStream( + id=response_id, + created=created, + model=model, + object="chat.completion.chunk", + choices=[ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(content=None), + ) + ], + ) + if thread_id: + final_chunk._hidden_params = {"thread_id": thread_id} + yield final_chunk + return + + try: + data = json.loads(data_str) + except json.JSONDecodeError: + continue + + # Extract thread_id from thread.created event + if current_event == "thread.created" and "id" in data: + thread_id = data["id"] + verbose_logger.debug(f"Stream created thread: {thread_id}") + + # Process message deltas - this is where the actual content comes + if current_event == "thread.message.delta": + delta_content = data.get("delta", {}).get("content", []) + for content_item in delta_content: + if content_item.get("type") == "text": + text_value = content_item.get("text", {}).get("value", "") + if text_value: + chunk = ModelResponseStream( + id=response_id, + created=created, + model=model, + object="chat.completion.chunk", + choices=[ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content=text_value, role="assistant"), + ) + ], + ) + if thread_id: + 
chunk._hidden_params = {"thread_id": thread_id} + yield chunk + + +# Singleton instance +azure_ai_agents_handler = AzureAIAgentsHandler() diff --git a/litellm/llms/azure_ai/agents/transformation.py b/litellm/llms/azure_ai/agents/transformation.py new file mode 100644 index 000000000000..01945aad3231 --- /dev/null +++ b/litellm/llms/azure_ai/agents/transformation.py @@ -0,0 +1,400 @@ +""" +Transformation for Azure Foundry Agent Service API. + +Azure Foundry Agent Service provides an Assistants-like API for running agents. +This follows the OpenAI Assistants pattern: create thread -> add messages -> create/poll run. + +Model format: azure_ai/agents/ + +API Base format: https://.services.ai.azure.com/api/projects/ + +Authentication: Uses Azure AD Bearer tokens (not API keys) + Get token via: az account get-access-token --resource 'https://ai.azure.com' + +The API uses these endpoints: +- POST /threads - Create a thread +- POST /threads/{thread_id}/messages - Add message to thread +- POST /threads/{thread_id}/runs - Create a run +- GET /threads/{thread_id}/runs/{run_id} - Poll run status +- GET /threads/{thread_id}/messages - List messages in thread + +See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart +""" + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import httpx + +from litellm._logging import verbose_logger +from litellm.litellm_core_utils.prompt_templates.common_utils import ( + convert_content_list_to_str, +) +from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import ModelResponse + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + HTTPHandler = Any + AsyncHTTPHandler = Any + + +class AzureAIAgentsError(BaseLLMException): + """Exception class for Azure AI Agent Service API errors.""" + + pass + + +class AzureAIAgentsConfig(BaseConfig): + """ + Configuration for Azure AI Agent Service API. + + Azure AI Agent Service is a fully managed service for building AI agents + that can understand natural language and perform tasks. + + Model format: azure_ai/agents/ + + The flow is: + 1. Create a thread + 2. Add user messages to the thread + 3. Create and poll a run + 4. Retrieve the assistant's response messages + """ + + # Default API version for Azure Foundry Agent Service + # GA version: 2025-05-01, Preview: 2025-05-15-preview + # See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart + DEFAULT_API_VERSION = "2025-05-01" + + # Polling configuration + MAX_POLL_ATTEMPTS = 60 + POLL_INTERVAL_SECONDS = 1.0 + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @staticmethod + def is_azure_ai_agents_route(model: str) -> bool: + """ + Check if the model is an Azure AI Agents route. + + Model format: azure_ai/agents/ + """ + return "agents/" in model + + @staticmethod + def get_agent_id_from_model(model: str) -> str: + """ + Extract agent ID from the model string. 
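The extraction described below simply splits the model string on "agents/". A standalone sketch that mirrors that splitting rule on a few invented model strings:

```python
def get_agent_id_from_model(model: str) -> str:
    """Return the part after 'agents/' if present, else the model string unchanged."""
    if "agents/" in model:
        parts = model.split("agents/", 1)
        if len(parts) == 2:
            return parts[1]
    return model


for model in ("azure_ai/agents/asst_abc123", "agents/asst_abc123", "asst_abc123"):
    print(get_agent_id_from_model(model))
# all three print: asst_abc123
```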
+ + Model format: azure_ai/agents/ -> + or: agents/ -> + """ + if "agents/" in model: + # Split on "agents/" and take the part after it + parts = model.split("agents/", 1) + if len(parts) == 2: + return parts[1] + return model + + def _get_openai_compatible_provider_info( + self, + api_base: Optional[str], + api_key: Optional[str], + ) -> Tuple[Optional[str], Optional[str]]: + """ + Get Azure AI Agent Service API base and key from params or environment. + + Returns: + Tuple of (api_base, api_key) + """ + from litellm.secret_managers.main import get_secret_str + + api_base = api_base or get_secret_str("AZURE_AI_API_BASE") + api_key = api_key or get_secret_str("AZURE_AI_API_KEY") + + return api_base, api_key + + def get_supported_openai_params(self, model: str) -> List[str]: + """ + Azure Agents supports minimal OpenAI params since it's an agent runtime. + """ + return ["stream"] + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + """ + Map OpenAI params to Azure Agents params. + """ + return optional_params + + def _get_api_version(self, optional_params: dict) -> str: + """Get API version from optional params or use default.""" + return optional_params.get("api_version", self.DEFAULT_API_VERSION) + + def get_complete_url( + self, + api_base: Optional[str], + api_key: Optional[str], + model: str, + optional_params: dict, + litellm_params: dict, + stream: Optional[bool] = None, + ) -> str: + """ + Get the base URL for Azure AI Agent Service. + + The actual endpoint will vary based on the operation: + - /openai/threads for creating threads + - /openai/threads/{thread_id}/messages for adding messages + - /openai/threads/{thread_id}/runs for creating runs + + This returns the base URL that will be modified for each operation. + """ + if api_base is None: + raise ValueError( + "api_base is required for Azure AI Agents. Set it via AZURE_AI_API_BASE env var or api_base parameter." + ) + + # Remove trailing slash if present + api_base = api_base.rstrip("/") + + # Return base URL - actual endpoints will be constructed during request + return api_base + + def _get_agent_id(self, model: str, optional_params: dict) -> str: + """ + Get the agent ID from model or optional_params. + + model format: "azure_ai/agents/" or "agents/" or just "" + """ + agent_id = optional_params.get("agent_id") or optional_params.get("assistant_id") + if agent_id: + return agent_id + + # Extract from model name using the static method + return self.get_agent_id_from_model(model) + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform the request for Azure Agents. + + This stores the necessary data for the multi-step agent flow. + The actual API calls happen in the custom handler. 
+ """ + agent_id = self._get_agent_id(model, optional_params) + + # Convert messages to a format we can use + converted_messages = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + + # Handle content that might be a list + if isinstance(content, list): + content = convert_content_list_to_str(msg) + + # Ensure content is a string + if not isinstance(content, str): + content = str(content) + + converted_messages.append({"role": role, "content": content}) + + payload: Dict[str, Any] = { + "agent_id": agent_id, + "messages": converted_messages, + "api_version": self._get_api_version(optional_params), + } + + # Pass through thread_id if provided (for continuing conversations) + if "thread_id" in optional_params: + payload["thread_id"] = optional_params["thread_id"] + + # Pass through any additional instructions + if "instructions" in optional_params: + payload["instructions"] = optional_params["instructions"] + + verbose_logger.debug(f"Azure AI Agents request payload: {payload}") + return payload + + def validate_environment( + self, + headers: dict, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> dict: + """ + Validate and set up environment for Azure Foundry Agents requests. + + Azure Foundry Agents uses Bearer token authentication with Azure AD tokens. + Get token via: az account get-access-token --resource 'https://ai.azure.com' + + See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart + """ + headers["Content-Type"] = "application/json" + + # Azure Foundry Agents uses Bearer token authentication + # The api_key here is expected to be an Azure AD token + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + return headers + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] + ) -> BaseLLMException: + return AzureAIAgentsError(status_code=status_code, message=error_message) + + def should_fake_stream( + self, + model: Optional[str], + stream: Optional[bool], + custom_llm_provider: Optional[str] = None, + ) -> bool: + """ + Azure Agents uses polling, so we fake stream by returning the final response. + """ + return True + + @property + def has_custom_stream_wrapper(self) -> bool: + """Azure Agents doesn't have native streaming - uses fake stream.""" + return False + + @property + def supports_stream_param_in_request_body(self) -> bool: + """ + Azure Agents does not use a stream param in request body. + """ + return False + + def transform_response( + self, + model: str, + raw_response: httpx.Response, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> ModelResponse: + """ + Transform the Azure Agents response to LiteLLM ModelResponse format. 
+ """ + # This is not used since we have a custom handler + return model_response + + @staticmethod + def completion( + model: str, + messages: List, + api_base: str, + api_key: Optional[str], + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + optional_params: dict, + litellm_params: dict, + timeout: Union[float, int, Any], + acompletion: bool, + stream: Optional[bool] = False, + headers: Optional[dict] = None, + ) -> Any: + """ + Dispatch method for Azure Foundry Agents completion. + + Routes to sync or async completion based on acompletion flag. + Supports native streaming via SSE when stream=True and acompletion=True. + + Authentication: Uses Azure AD Bearer tokens. + - Pass api_key directly as an Azure AD token + - Or set up Azure AD credentials via environment variables for automatic token retrieval: + - AZURE_TENANT_ID, AZURE_CLIENT_ID, AZURE_CLIENT_SECRET (Service Principal) + + See: https://learn.microsoft.com/en-us/azure/ai-foundry/agents/quickstart + """ + from litellm.llms.azure.common_utils import get_azure_ad_token + from litellm.llms.azure_ai.agents.handler import azure_ai_agents_handler + from litellm.types.router import GenericLiteLLMParams + + # If no api_key is provided, try to get Azure AD token + if api_key is None: + # Try to get Azure AD token using the existing Azure auth mechanisms + # This uses the scope for Azure AI (ai.azure.com) instead of cognitive services + # Create a GenericLiteLLMParams with the scope override for Azure Foundry Agents + azure_auth_params = dict(litellm_params) if litellm_params else {} + azure_auth_params["azure_scope"] = "https://ai.azure.com/.default" + api_key = get_azure_ad_token(GenericLiteLLMParams(**azure_auth_params)) + + if api_key is None: + raise ValueError( + "api_key (Azure AD token) is required for Azure Foundry Agents. " + "Either pass api_key directly, or set AZURE_TENANT_ID, AZURE_CLIENT_ID, " + "and AZURE_CLIENT_SECRET environment variables for Service Principal auth. 
" + "Manual token: az account get-access-token --resource 'https://ai.azure.com'" + ) + if acompletion: + if stream: + # Native async streaming via SSE - return the async generator directly + return azure_ai_agents_handler.acompletion_stream( + model=model, + messages=messages, + api_base=api_base, + api_key=api_key, + logging_obj=logging_obj, + optional_params=optional_params, + litellm_params=litellm_params, + timeout=timeout, + headers=headers, + ) + else: + return azure_ai_agents_handler.acompletion( + model=model, + messages=messages, + api_base=api_base, + api_key=api_key, + model_response=model_response, + logging_obj=logging_obj, + optional_params=optional_params, + litellm_params=litellm_params, + timeout=timeout, + headers=headers, + ) + else: + # Sync completion - streaming not supported for sync + return azure_ai_agents_handler.completion( + model=model, + messages=messages, + api_base=api_base, + api_key=api_key, + model_response=model_response, + logging_obj=logging_obj, + optional_params=optional_params, + litellm_params=litellm_params, + timeout=timeout, + headers=headers, + ) diff --git a/litellm/llms/azure_ai/anthropic/__init__.py b/litellm/llms/azure_ai/anthropic/__init__.py new file mode 100644 index 000000000000..233f22999f06 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/__init__.py @@ -0,0 +1,12 @@ +""" +Azure Anthropic provider - supports Claude models via Azure Foundry +""" +from .handler import AzureAnthropicChatCompletion +from .transformation import AzureAnthropicConfig + +try: + from .messages_transformation import AzureAnthropicMessagesConfig + __all__ = ["AzureAnthropicChatCompletion", "AzureAnthropicConfig", "AzureAnthropicMessagesConfig"] +except ImportError: + __all__ = ["AzureAnthropicChatCompletion", "AzureAnthropicConfig"] + diff --git a/litellm/llms/azure_ai/anthropic/count_tokens/__init__.py b/litellm/llms/azure_ai/anthropic/count_tokens/__init__.py new file mode 100644 index 000000000000..9605d401f8e0 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/count_tokens/__init__.py @@ -0,0 +1,19 @@ +""" +Azure AI Anthropic CountTokens API implementation. +""" + +from litellm.llms.azure_ai.anthropic.count_tokens.handler import ( + AzureAIAnthropicCountTokensHandler, +) +from litellm.llms.azure_ai.anthropic.count_tokens.token_counter import ( + AzureAIAnthropicTokenCounter, +) +from litellm.llms.azure_ai.anthropic.count_tokens.transformation import ( + AzureAIAnthropicCountTokensConfig, +) + +__all__ = [ + "AzureAIAnthropicCountTokensHandler", + "AzureAIAnthropicCountTokensConfig", + "AzureAIAnthropicTokenCounter", +] diff --git a/litellm/llms/azure_ai/anthropic/count_tokens/handler.py b/litellm/llms/azure_ai/anthropic/count_tokens/handler.py new file mode 100644 index 000000000000..52a0bb8bb095 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/count_tokens/handler.py @@ -0,0 +1,127 @@ +""" +Azure AI Anthropic CountTokens API handler. + +Uses httpx for HTTP requests with Azure authentication. +""" + +from typing import Any, Dict, List, Optional, Union + +import httpx + +import litellm +from litellm._logging import verbose_logger +from litellm.llms.anthropic.common_utils import AnthropicError +from litellm.llms.azure_ai.anthropic.count_tokens.transformation import ( + AzureAIAnthropicCountTokensConfig, +) +from litellm.llms.custom_httpx.http_handler import get_async_httpx_client + + +class AzureAIAnthropicCountTokensHandler(AzureAIAnthropicCountTokensConfig): + """ + Handler for Azure AI Anthropic CountTokens API requests. 
+ + Uses httpx for HTTP requests with Azure authentication. + """ + + async def handle_count_tokens_request( + self, + model: str, + messages: List[Dict[str, Any]], + api_key: str, + api_base: str, + litellm_params: Optional[Dict[str, Any]] = None, + timeout: Optional[Union[float, httpx.Timeout]] = None, + ) -> Dict[str, Any]: + """ + Handle a CountTokens request using httpx with Azure authentication. + + Args: + model: The model identifier (e.g., "claude-3-5-sonnet") + messages: The messages to count tokens for + api_key: The Azure AI API key + api_base: The Azure AI API base URL + litellm_params: Optional LiteLLM parameters + timeout: Optional timeout for the request (defaults to litellm.request_timeout) + + Returns: + Dictionary containing token count response + + Raises: + AnthropicError: If the API request fails + """ + try: + # Validate the request + self.validate_request(model, messages) + + verbose_logger.debug( + f"Processing Azure AI Anthropic CountTokens request for model: {model}" + ) + + # Transform request to Anthropic format + request_body = self.transform_request_to_count_tokens( + model=model, + messages=messages, + ) + + verbose_logger.debug(f"Transformed request: {request_body}") + + # Get endpoint URL + endpoint_url = self.get_count_tokens_endpoint(api_base) + + verbose_logger.debug(f"Making request to: {endpoint_url}") + + # Get required headers with Azure authentication + headers = self.get_required_headers( + api_key=api_key, + litellm_params=litellm_params, + ) + + # Use LiteLLM's async httpx client + async_client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.AZURE_AI + ) + + # Use provided timeout or fall back to litellm.request_timeout + request_timeout = timeout if timeout is not None else litellm.request_timeout + + response = await async_client.post( + endpoint_url, + headers=headers, + json=request_body, + timeout=request_timeout, + ) + + verbose_logger.debug(f"Response status: {response.status_code}") + + if response.status_code != 200: + error_text = response.text + verbose_logger.error(f"Azure AI Anthropic API error: {error_text}") + raise AnthropicError( + status_code=response.status_code, + message=error_text, + ) + + azure_response = response.json() + + verbose_logger.debug(f"Azure AI Anthropic response: {azure_response}") + + # Return Anthropic-compatible response directly - no transformation needed + return azure_response + + except AnthropicError: + # Re-raise Anthropic exceptions as-is + raise + except httpx.HTTPStatusError as e: + # HTTP errors - preserve the actual status code + verbose_logger.error(f"HTTP error in CountTokens handler: {str(e)}") + raise AnthropicError( + status_code=e.response.status_code, + message=e.response.text, + ) + except Exception as e: + verbose_logger.error(f"Error in CountTokens handler: {str(e)}") + raise AnthropicError( + status_code=500, + message=f"CountTokens processing error: {str(e)}", + ) diff --git a/litellm/llms/azure_ai/anthropic/count_tokens/token_counter.py b/litellm/llms/azure_ai/anthropic/count_tokens/token_counter.py new file mode 100644 index 000000000000..14f928000794 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/count_tokens/token_counter.py @@ -0,0 +1,119 @@ +""" +Azure AI Anthropic Token Counter implementation using the CountTokens API. 
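The handler above posts an Anthropic-style count-tokens body and returns the provider response unchanged; the token counter then reads `input_tokens` from it. A hedged sketch of the payloads involved (the request shape follows Anthropic's count-tokens format as produced by the inherited transform; all values are illustrative):

```python
# Likely request body produced by transform_request_to_count_tokens:
request_body = {
    "model": "claude-3-5-sonnet",
    "messages": [{"role": "user", "content": "Hello, world"}],
}

# A successful response is passed through as-is; the counter reads "input_tokens":
azure_response = {"input_tokens": 12}
total_tokens = azure_response.get("input_tokens", 0)
print(total_tokens)  # 12
```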
+""" + +import os +from typing import Any, Dict, List, Optional + +from litellm._logging import verbose_logger +from litellm.llms.azure_ai.anthropic.count_tokens.handler import ( + AzureAIAnthropicCountTokensHandler, +) +from litellm.llms.base_llm.base_utils import BaseTokenCounter +from litellm.types.utils import LlmProviders, TokenCountResponse + +# Global handler instance - reuse across all token counting requests +azure_ai_anthropic_count_tokens_handler = AzureAIAnthropicCountTokensHandler() + + +class AzureAIAnthropicTokenCounter(BaseTokenCounter): + """Token counter implementation for Azure AI Anthropic provider using the CountTokens API.""" + + def should_use_token_counting_api( + self, + custom_llm_provider: Optional[str] = None, + ) -> bool: + return custom_llm_provider == LlmProviders.AZURE_AI.value + + async def count_tokens( + self, + model_to_use: str, + messages: Optional[List[Dict[str, Any]]], + contents: Optional[List[Dict[str, Any]]], + deployment: Optional[Dict[str, Any]] = None, + request_model: str = "", + ) -> Optional[TokenCountResponse]: + """ + Count tokens using Azure AI Anthropic's CountTokens API. + + Args: + model_to_use: The model identifier + messages: The messages to count tokens for + contents: Alternative content format (not used for Anthropic) + deployment: Deployment configuration containing litellm_params + request_model: The original request model name + + Returns: + TokenCountResponse with token count, or None if counting fails + """ + from litellm.llms.anthropic.common_utils import AnthropicError + + if not messages: + return None + + deployment = deployment or {} + litellm_params = deployment.get("litellm_params", {}) + + # Get Azure AI API key from deployment config or environment + api_key = litellm_params.get("api_key") + if not api_key: + api_key = os.getenv("AZURE_AI_API_KEY") + + # Get API base from deployment config or environment + api_base = litellm_params.get("api_base") + if not api_base: + api_base = os.getenv("AZURE_AI_API_BASE") + + if not api_key: + verbose_logger.warning("No Azure AI API key found for token counting") + return None + + if not api_base: + verbose_logger.warning("No Azure AI API base found for token counting") + return None + + try: + result = await azure_ai_anthropic_count_tokens_handler.handle_count_tokens_request( + model=model_to_use, + messages=messages, + api_key=api_key, + api_base=api_base, + litellm_params=litellm_params, + ) + + if result is not None: + return TokenCountResponse( + total_tokens=result.get("input_tokens", 0), + request_model=request_model, + model_used=model_to_use, + tokenizer_type="azure_ai_anthropic_api", + original_response=result, + ) + except AnthropicError as e: + verbose_logger.warning( + f"Azure AI Anthropic CountTokens API error: status={e.status_code}, message={e.message}" + ) + return TokenCountResponse( + total_tokens=0, + request_model=request_model, + model_used=model_to_use, + tokenizer_type="azure_ai_anthropic_api", + error=True, + error_message=e.message, + status_code=e.status_code, + ) + except Exception as e: + verbose_logger.warning( + f"Error calling Azure AI Anthropic CountTokens API: {e}" + ) + return TokenCountResponse( + total_tokens=0, + request_model=request_model, + model_used=model_to_use, + tokenizer_type="azure_ai_anthropic_api", + error=True, + error_message=str(e), + status_code=500, + ) + + return None diff --git a/litellm/llms/azure_ai/anthropic/count_tokens/transformation.py b/litellm/llms/azure_ai/anthropic/count_tokens/transformation.py new file mode 
100644 index 000000000000..e284595cc8aa --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/count_tokens/transformation.py @@ -0,0 +1,88 @@ +""" +Azure AI Anthropic CountTokens API transformation logic. + +Extends the base Anthropic CountTokens transformation with Azure authentication. +""" + +from typing import Any, Dict, Optional + +from litellm.constants import ANTHROPIC_TOKEN_COUNTING_BETA_VERSION +from litellm.llms.anthropic.count_tokens.transformation import ( + AnthropicCountTokensConfig, +) +from litellm.llms.azure.common_utils import BaseAzureLLM +from litellm.types.router import GenericLiteLLMParams + + +class AzureAIAnthropicCountTokensConfig(AnthropicCountTokensConfig): + """ + Configuration and transformation logic for Azure AI Anthropic CountTokens API. + + Extends AnthropicCountTokensConfig with Azure authentication. + Azure AI Anthropic uses the same endpoint format but with Azure auth headers. + """ + + def get_required_headers( + self, + api_key: str, + litellm_params: Optional[Dict[str, Any]] = None, + ) -> Dict[str, str]: + """ + Get the required headers for the Azure AI Anthropic CountTokens API. + + Uses Azure authentication (api-key header) instead of Anthropic's x-api-key. + + Args: + api_key: The Azure AI API key + litellm_params: Optional LiteLLM parameters for additional auth config + + Returns: + Dictionary of required headers with Azure authentication + """ + # Start with base headers + headers = { + "Content-Type": "application/json", + "anthropic-version": "2023-06-01", + "anthropic-beta": ANTHROPIC_TOKEN_COUNTING_BETA_VERSION, + } + + # Use Azure authentication + litellm_params = litellm_params or {} + if "api_key" not in litellm_params: + litellm_params["api_key"] = api_key + + litellm_params_obj = GenericLiteLLMParams(**litellm_params) + + # Get Azure auth headers + azure_headers = BaseAzureLLM._base_validate_azure_environment( + headers={}, litellm_params=litellm_params_obj + ) + + # Merge Azure auth headers + headers.update(azure_headers) + + return headers + + def get_count_tokens_endpoint(self, api_base: str) -> str: + """ + Get the Azure AI Anthropic CountTokens API endpoint. 
+ + Args: + api_base: The Azure AI API base URL + (e.g., https://my-resource.services.ai.azure.com or + https://my-resource.services.ai.azure.com/anthropic) + + Returns: + The endpoint URL for the CountTokens API + """ + # Azure AI Anthropic endpoint format: + # https://.services.ai.azure.com/anthropic/v1/messages/count_tokens + api_base = api_base.rstrip("/") + + # Ensure the URL has /anthropic path + if not api_base.endswith("/anthropic"): + if "/anthropic" not in api_base: + api_base = f"{api_base}/anthropic" + + # Add the count_tokens path + return f"{api_base}/v1/messages/count_tokens" diff --git a/litellm/llms/azure_ai/anthropic/handler.py b/litellm/llms/azure_ai/anthropic/handler.py new file mode 100644 index 000000000000..fe4524fd5be8 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/handler.py @@ -0,0 +1,227 @@ +""" +Azure Anthropic handler - reuses AnthropicChatCompletion logic with Azure authentication +""" +import copy +import json +from typing import TYPE_CHECKING, Callable, Union + +import httpx + +from litellm.llms.anthropic.chat.handler import AnthropicChatCompletion +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, +) +from litellm.types.utils import ModelResponse +from litellm.utils import CustomStreamWrapper + +from .transformation import AzureAnthropicConfig + +if TYPE_CHECKING: + pass + + +class AzureAnthropicChatCompletion(AnthropicChatCompletion): + """ + Azure Anthropic chat completion handler. + Reuses all Anthropic logic but with Azure authentication. + """ + + def __init__(self) -> None: + super().__init__() + + def completion( + self, + model: str, + messages: list, + api_base: str, + custom_llm_provider: str, + custom_prompt_dict: dict, + model_response: ModelResponse, + print_verbose: Callable, + encoding, + api_key, + logging_obj, + optional_params: dict, + timeout: Union[float, httpx.Timeout], + litellm_params: dict, + acompletion=None, + logger_fn=None, + headers={}, + client=None, + ): + """ + Completion method that uses Azure authentication instead of Anthropic's x-api-key. + All other logic is the same as AnthropicChatCompletion. 
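# --- Illustrative sketch (annotation, not part of this diff) ---
# The get_count_tokens_endpoint normalization above, mirrored as a standalone
# function so the resulting URL shape is easy to see. Example host is assumed.
def count_tokens_endpoint(api_base: str) -> str:
    api_base = api_base.rstrip("/")
    if not api_base.endswith("/anthropic") and "/anthropic" not in api_base:
        api_base = f"{api_base}/anthropic"
    return f"{api_base}/v1/messages/count_tokens"


assert count_tokens_endpoint("https://my-resource.services.ai.azure.com") == (
    "https://my-resource.services.ai.azure.com/anthropic/v1/messages/count_tokens"
)
assert count_tokens_endpoint("https://my-resource.services.ai.azure.com/anthropic") == (
    "https://my-resource.services.ai.azure.com/anthropic/v1/messages/count_tokens"
)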
+ """ + + optional_params = copy.deepcopy(optional_params) + stream = optional_params.pop("stream", None) + json_mode: bool = optional_params.pop("json_mode", False) + is_vertex_request: bool = optional_params.pop("is_vertex_request", False) + _is_function_call = False + messages = copy.deepcopy(messages) + + # Use AzureAnthropicConfig for both azure_anthropic and azure_ai Claude models + config = AzureAnthropicConfig() + + headers = config.validate_environment( + api_key=api_key, + headers=headers, + model=model, + messages=messages, + optional_params={**optional_params, "is_vertex_request": is_vertex_request}, + litellm_params=litellm_params, + ) + + data = config.transform_request( + model=model, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + headers=headers, + ) + + ## LOGGING + logging_obj.pre_call( + input=messages, + api_key=api_key, + additional_args={ + "complete_input_dict": data, + "api_base": api_base, + "headers": headers, + }, + ) + print_verbose(f"_is_function_call: {_is_function_call}") + if acompletion is True: + if ( + stream is True + ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) + print_verbose("makes async azure anthropic streaming POST request") + data["stream"] = stream + return self.acompletion_stream_function( + model=model, + messages=messages, + data=data, + api_base=api_base, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + encoding=encoding, + api_key=api_key, + logging_obj=logging_obj, + optional_params=optional_params, + stream=stream, + _is_function_call=_is_function_call, + json_mode=json_mode, + litellm_params=litellm_params, + logger_fn=logger_fn, + headers=headers, + timeout=timeout, + client=( + client + if client is not None and isinstance(client, AsyncHTTPHandler) + else None + ), + ) + else: + return self.acompletion_function( + model=model, + messages=messages, + data=data, + api_base=api_base, + custom_prompt_dict=custom_prompt_dict, + model_response=model_response, + print_verbose=print_verbose, + encoding=encoding, + api_key=api_key, + provider_config=config, + logging_obj=logging_obj, + optional_params=optional_params, + stream=stream, + _is_function_call=_is_function_call, + litellm_params=litellm_params, + logger_fn=logger_fn, + headers=headers, + client=client, + json_mode=json_mode, + timeout=timeout, + ) + else: + ## COMPLETION CALL + if ( + stream is True + ): # if function call - fake the streaming (need complete blocks for output parsing in openai format) + data["stream"] = stream + # Import the make_sync_call from parent + from litellm.llms.anthropic.chat.handler import make_sync_call + + completion_stream, response_headers = make_sync_call( + client=client, + api_base=api_base, + headers=headers, # type: ignore + data=json.dumps(data), + model=model, + messages=messages, + logging_obj=logging_obj, + timeout=timeout, + json_mode=json_mode, + ) + from litellm.llms.anthropic.common_utils import ( + process_anthropic_headers, + ) + + return CustomStreamWrapper( + completion_stream=completion_stream, + model=model, + custom_llm_provider="azure_ai", + logging_obj=logging_obj, + _response_headers=process_anthropic_headers(response_headers), + ) + + else: + if client is None or not isinstance(client, HTTPHandler): + from litellm.llms.custom_httpx.http_handler import _get_httpx_client + + client = _get_httpx_client(params={"timeout": timeout}) + else: + client = client + + try: + response 
= client.post( + api_base, + headers=headers, + data=json.dumps(data), + timeout=timeout, + ) + except Exception as e: + from litellm.llms.anthropic.common_utils import AnthropicError + + status_code = getattr(e, "status_code", 500) + error_headers = getattr(e, "headers", None) + error_text = getattr(e, "text", str(e)) + error_response = getattr(e, "response", None) + if error_headers is None and error_response: + error_headers = getattr(error_response, "headers", None) + if error_response and hasattr(error_response, "text"): + error_text = getattr(error_response, "text", error_text) + raise AnthropicError( + message=error_text, + status_code=status_code, + headers=error_headers, + ) + + return config.transform_response( + model=model, + raw_response=response, + model_response=model_response, + logging_obj=logging_obj, + api_key=api_key, + request_data=data, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + encoding=encoding, + json_mode=json_mode, + ) + diff --git a/litellm/llms/azure_ai/anthropic/messages_transformation.py b/litellm/llms/azure_ai/anthropic/messages_transformation.py new file mode 100644 index 000000000000..0d00c907031d --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/messages_transformation.py @@ -0,0 +1,117 @@ +""" +Azure Anthropic messages transformation config - extends AnthropicMessagesConfig with Azure authentication +""" +from typing import TYPE_CHECKING, Any, List, Optional, Tuple + +from litellm.llms.anthropic.experimental_pass_through.messages.transformation import ( + AnthropicMessagesConfig, +) +from litellm.llms.azure.common_utils import BaseAzureLLM +from litellm.types.router import GenericLiteLLMParams + +if TYPE_CHECKING: + pass + + +class AzureAnthropicMessagesConfig(AnthropicMessagesConfig): + """ + Azure Anthropic messages configuration that extends AnthropicMessagesConfig. + The only difference is authentication - Azure uses x-api-key header (not api-key) + and Azure endpoint format. + """ + + def validate_anthropic_messages_environment( + self, + headers: dict, + model: str, + messages: List[Any], + optional_params: dict, + litellm_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> Tuple[dict, Optional[str]]: + """ + Validate environment and set up Azure authentication headers for /v1/messages endpoint. + Azure Anthropic uses x-api-key header (not api-key). + """ + # Convert dict to GenericLiteLLMParams if needed + if isinstance(litellm_params, dict): + if api_key and "api_key" not in litellm_params: + litellm_params = {**litellm_params, "api_key": api_key} + litellm_params_obj = GenericLiteLLMParams(**litellm_params) + else: + litellm_params_obj = litellm_params or GenericLiteLLMParams() + if api_key and not litellm_params_obj.api_key: + litellm_params_obj.api_key = api_key + + # Use Azure authentication logic + headers = BaseAzureLLM._base_validate_azure_environment( + headers=headers, litellm_params=litellm_params_obj + ) + + # Azure Anthropic uses x-api-key header (not api-key) + # Convert api-key to x-api-key if present + if "api-key" in headers and "x-api-key" not in headers: + headers["x-api-key"] = headers.pop("api-key") + + # Set anthropic-version header + if "anthropic-version" not in headers: + headers["anthropic-version"] = "2023-06-01" + + # Set content-type header + if "content-type" not in headers: + headers["content-type"] = "application/json" + + # Update headers with anthropic beta features (context management, tool search, etc.) 
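# --- Illustrative sketch (annotation, not part of this diff) ---
# The header fix-ups performed by validate_anthropic_messages_environment above,
# isolated as a standalone function: Azure auth yields an "api-key" header, while
# the Anthropic /v1/messages route expects "x-api-key". Beta-feature headers are
# handled separately by _update_headers_with_anthropic_beta.
def fix_messages_headers(headers: dict) -> dict:
    headers = dict(headers)
    if "api-key" in headers and "x-api-key" not in headers:
        headers["x-api-key"] = headers.pop("api-key")
    headers.setdefault("anthropic-version", "2023-06-01")
    headers.setdefault("content-type", "application/json")
    return headers


assert fix_messages_headers({"api-key": "secret"}) == {
    "x-api-key": "secret",
    "anthropic-version": "2023-06-01",
    "content-type": "application/json",
}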
+ headers = self._update_headers_with_anthropic_beta( + headers=headers, + optional_params=optional_params, + ) + + return headers, api_base + + def get_complete_url( + self, + api_base: Optional[str], + api_key: Optional[str], + model: str, + optional_params: dict, + litellm_params: dict, + stream: Optional[bool] = None, + ) -> str: + """ + Get the complete URL for Azure Anthropic /v1/messages endpoint. + Azure Foundry endpoint format: https://.services.ai.azure.com/anthropic/v1/messages + """ + from litellm.secret_managers.main import get_secret_str + + api_base = api_base or get_secret_str("AZURE_API_BASE") + if api_base is None: + raise ValueError( + "Missing Azure API Base - Please set `api_base` or `AZURE_API_BASE` environment variable. " + "Expected format: https://.services.ai.azure.com/anthropic" + ) + + # Ensure the URL ends with /v1/messages + api_base = api_base.rstrip("/") + if api_base.endswith("/v1/messages"): + # Already correct + pass + elif api_base.endswith("/anthropic/v1/messages"): + # Already correct + pass + else: + # Check if /anthropic is already in the path + if "/anthropic" in api_base: + # /anthropic exists, ensure we end with /anthropic/v1/messages + # Extract the base URL up to and including /anthropic + parts = api_base.split("/anthropic", 1) + api_base = parts[0] + "/anthropic" + else: + # /anthropic not in path, add it + api_base = api_base + "/anthropic" + # Add /v1/messages + api_base = api_base + "/v1/messages" + + return api_base + diff --git a/litellm/llms/azure_ai/anthropic/transformation.py b/litellm/llms/azure_ai/anthropic/transformation.py new file mode 100644 index 000000000000..2d8d3b987c73 --- /dev/null +++ b/litellm/llms/azure_ai/anthropic/transformation.py @@ -0,0 +1,119 @@ +""" +Azure Anthropic transformation config - extends AnthropicConfig with Azure authentication +""" +from typing import TYPE_CHECKING, Dict, List, Optional, Union + +from litellm.llms.anthropic.chat.transformation import AnthropicConfig +from litellm.llms.azure.common_utils import BaseAzureLLM +from litellm.types.llms.openai import AllMessageValues +from litellm.types.router import GenericLiteLLMParams + +if TYPE_CHECKING: + pass + + +class AzureAnthropicConfig(AnthropicConfig): + """ + Azure Anthropic configuration that extends AnthropicConfig. + The only difference is authentication - Azure uses api-key header or Azure AD token + instead of x-api-key header. + """ + + @property + def custom_llm_provider(self) -> Optional[str]: + return "azure_ai" + + def validate_environment( + self, + headers: dict, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: Union[dict, GenericLiteLLMParams], + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> Dict: + """ + Validate environment and set up Azure authentication headers. + Azure supports: + 1. API key via 'api-key' header + 2. 
Azure AD token via 'Authorization: Bearer ' header + """ + # Convert dict to GenericLiteLLMParams if needed + if isinstance(litellm_params, dict): + # Ensure api_key is included if provided + if api_key and "api_key" not in litellm_params: + litellm_params = {**litellm_params, "api_key": api_key} + litellm_params_obj = GenericLiteLLMParams(**litellm_params) + else: + litellm_params_obj = litellm_params or GenericLiteLLMParams() + # Set api_key if provided and not already set + if api_key and not litellm_params_obj.api_key: + litellm_params_obj.api_key = api_key + + # Use Azure authentication logic + headers = BaseAzureLLM._base_validate_azure_environment( + headers=headers, litellm_params=litellm_params_obj + ) + + # Get tools and other anthropic-specific setup + tools = optional_params.get("tools") + prompt_caching_set = self.is_cache_control_set(messages=messages) + computer_tool_used = self.is_computer_tool_used(tools=tools) + mcp_server_used = self.is_mcp_server_used( + mcp_servers=optional_params.get("mcp_servers") + ) + pdf_used = self.is_pdf_used(messages=messages) + file_id_used = self.is_file_id_used(messages=messages) + user_anthropic_beta_headers = self._get_user_anthropic_beta_headers( + anthropic_beta_header=headers.get("anthropic-beta") + ) + + # Get anthropic headers (but we'll replace x-api-key with Azure auth) + anthropic_headers = self.get_anthropic_headers( + computer_tool_used=computer_tool_used, + prompt_caching_set=prompt_caching_set, + pdf_used=pdf_used, + api_key=api_key or "", # Azure auth is already in headers + file_id_used=file_id_used, + is_vertex_request=optional_params.get("is_vertex_request", False), + user_anthropic_beta_headers=user_anthropic_beta_headers, + mcp_server_used=mcp_server_used, + ) + # Merge headers - Azure auth (api-key or Authorization) takes precedence + headers = {**anthropic_headers, **headers} + + # Ensure anthropic-version header is set + if "anthropic-version" not in headers: + headers["anthropic-version"] = "2023-06-01" + + return headers + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform request using parent AnthropicConfig, then remove unsupported params. + Azure Anthropic doesn't support extra_body, max_retries, or stream_options parameters. + """ + # Call parent transform_request + data = super().transform_request( + model=model, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + headers=headers, + ) + + # Remove unsupported parameters for Azure AI Anthropic + data.pop("extra_body", None) + data.pop("max_retries", None) + data.pop("stream_options", None) + + return data + diff --git a/litellm/llms/azure_ai/azure_model_router/__init__.py b/litellm/llms/azure_ai/azure_model_router/__init__.py new file mode 100644 index 000000000000..0165d60b6431 --- /dev/null +++ b/litellm/llms/azure_ai/azure_model_router/__init__.py @@ -0,0 +1,4 @@ +"""Azure AI Foundry Model Router support.""" +from .transformation import AzureModelRouterConfig + +__all__ = ["AzureModelRouterConfig"] diff --git a/litellm/llms/azure_ai/azure_model_router/transformation.py b/litellm/llms/azure_ai/azure_model_router/transformation.py new file mode 100644 index 000000000000..3d6dc53c5155 --- /dev/null +++ b/litellm/llms/azure_ai/azure_model_router/transformation.py @@ -0,0 +1,125 @@ +""" +Transformation for Azure AI Foundry Model Router. 
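# --- Illustrative sketch (annotation, not part of this diff) ---
# The unsupported-parameter stripping done in AzureAnthropicConfig.transform_request
# above, isolated as a standalone helper for clarity.
def strip_unsupported_azure_anthropic_params(data: dict) -> dict:
    data = dict(data)
    for key in ("extra_body", "max_retries", "stream_options"):
        data.pop(key, None)
    return data


assert strip_unsupported_azure_anthropic_params(
    {"model": "claude-3-5-sonnet", "max_retries": 2}
) == {"model": "claude-3-5-sonnet"}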
+ +The Model Router is a special Azure AI deployment that automatically routes requests +to the best available model. It has specific cost tracking requirements. +""" +from typing import Any, List, Optional + +from httpx import Response + +from litellm.llms.azure_ai.chat.transformation import AzureAIStudioConfig +from litellm.llms.base_llm.chat.transformation import LiteLLMLoggingObj +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import ModelResponse + + +class AzureModelRouterConfig(AzureAIStudioConfig): + """ + Configuration for Azure AI Foundry Model Router. + + Handles: + - Stripping model_router prefix before sending to Azure API + - Preserving full model path in responses for cost tracking + - Calculating flat infrastructure costs for Model Router + """ + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform request for Model Router. + + Strips the model_router/ prefix so only the deployment name is sent to Azure. + Example: model_router/azure-model-router -> azure-model-router + """ + from litellm.llms.azure_ai.common_utils import AzureFoundryModelInfo + + # Get base model name (strips routing prefixes like model_router/) + base_model: str = AzureFoundryModelInfo.get_base_model(model) + + return super().transform_request( + base_model, messages, optional_params, litellm_params, headers + ) + + def transform_response( + self, + model: str, + raw_response: Response, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> ModelResponse: + """ + Transform response for Model Router. + + Preserves the original model path (including model_router/ prefix) in the response + for proper cost tracking and logging. + """ + from litellm.llms.azure_ai.common_utils import AzureFoundryModelInfo + + # Preserve the original model from litellm_params (includes routing prefixes like model_router/) + # This ensures cost tracking and logging use the full model path + original_model: str = litellm_params.get("model") or model + if not original_model.startswith("azure_ai/"): + # Add provider prefix if not already present + model_response.model = f"azure_ai/{original_model}" + else: + model_response.model = original_model + + # Get base model for the parent call (strips routing prefixes for API compatibility) + base_model: str = AzureFoundryModelInfo.get_base_model(model) + + return super().transform_response( + model=base_model, + raw_response=raw_response, + model_response=model_response, + logging_obj=logging_obj, + request_data=request_data, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + encoding=encoding, + api_key=api_key, + json_mode=json_mode, + ) + + def calculate_additional_costs( + self, model: str, prompt_tokens: int, completion_tokens: int + ) -> Optional[dict]: + """ + Calculate additional costs for Azure Model Router. + + Adds a flat infrastructure cost of $0.14 per M input tokens for using the Model Router. + + Args: + model: The model name (should be a model router model) + prompt_tokens: Number of prompt tokens + completion_tokens: Number of completion tokens + + Returns: + Dictionary with additional costs, or None if not applicable. 
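# --- Illustrative sketch (annotation, not part of this diff) ---
# The Model Router flat-cost arithmetic described above: $0.14 per 1M input
# tokens on top of the routed model's own cost. The per-token rate is normally
# read from model_prices_and_context_window.json; it is hard-coded here only
# for illustration.
ROUTER_FLAT_COST_PER_TOKEN = 0.14 / 1_000_000


def router_flat_cost(prompt_tokens: int) -> float:
    return prompt_tokens * ROUTER_FLAT_COST_PER_TOKEN


assert abs(router_flat_cost(1_000_000) - 0.14) < 1e-12
assert abs(router_flat_cost(10_000) - 0.0014) < 1e-12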
+ """ + from litellm.llms.azure_ai.cost_calculator import ( + calculate_azure_model_router_flat_cost, + ) + + flat_cost = calculate_azure_model_router_flat_cost( + model=model, prompt_tokens=prompt_tokens + ) + + if flat_cost > 0: + return {"Azure Model Router Flat Cost": flat_cost} + + return None diff --git a/litellm/llms/azure_ai/common_utils.py b/litellm/llms/azure_ai/common_utils.py index dcc9335e42d1..47d397d6e98b 100644 --- a/litellm/llms/azure_ai/common_utils.py +++ b/litellm/llms/azure_ai/common_utils.py @@ -1,46 +1,161 @@ -from typing import List, Optional +from typing import List, Literal, Optional import litellm -from litellm.llms.base_llm.base_utils import BaseLLMModelInfo +from litellm.llms.base_llm.base_utils import BaseLLMModelInfo, BaseTokenCounter from litellm.secret_managers.main import get_secret_str from litellm.types.llms.openai import AllMessageValues class AzureFoundryModelInfo(BaseLLMModelInfo): + """Model info for Azure AI / Azure Foundry models.""" + + def __init__(self, model: Optional[str] = None): + self._model = model + + @staticmethod + def get_azure_ai_route(model: str) -> Literal["agents", "model_router", "default"]: + """ + Get the Azure AI route for the given model. + + Similar to BedrockModelInfo.get_bedrock_route(). + + Supported routes: + - agents: azure_ai/agents/ + - model_router: azure_ai/model_router/ or models with "model-router"/"model_router" in name + - default: standard models + """ + if "agents/" in model: + return "agents" + # Detect model router by prefix (model_router/) or by name containing "model-router"/"model_router" + model_lower = model.lower() + if ( + "model_router/" in model_lower + or "model-router/" in model_lower + or "model-router" in model_lower + or "model_router" in model_lower + ): + return "model_router" + return "default" + @staticmethod def get_api_base(api_base: Optional[str] = None) -> Optional[str]: - return ( - api_base - or litellm.api_base - or get_secret_str("AZURE_AI_API_BASE") - ) - + return api_base or litellm.api_base or get_secret_str("AZURE_AI_API_BASE") + @staticmethod def get_api_key(api_key: Optional[str] = None) -> Optional[str]: return ( - api_key - or litellm.api_key - or litellm.openai_key - or get_secret_str("AZURE_AI_API_KEY") - ) - + api_key + or litellm.api_key + or litellm.openai_key + or get_secret_str("AZURE_AI_API_KEY") + ) + @property def api_version(self, api_version: Optional[str] = None) -> Optional[str]: api_version = ( - api_version - or litellm.api_version - or get_secret_str("AZURE_API_VERSION") + api_version or litellm.api_version or get_secret_str("AZURE_API_VERSION") ) return api_version - + + def get_token_counter(self) -> Optional[BaseTokenCounter]: + """ + Factory method to create a token counter for Azure AI. + + Returns: + AzureAIAnthropicTokenCounter for Claude models, None otherwise. + """ + # Only return token counter for Claude models + if self._model and "claude" in self._model.lower(): + from litellm.llms.azure_ai.anthropic.count_tokens.token_counter import ( + AzureAIAnthropicTokenCounter, + ) + + return AzureAIAnthropicTokenCounter() + return None + + def get_models( + self, api_key: Optional[str] = None, api_base: Optional[str] = None + ) -> List[str]: + """ + Returns a list of models supported by Azure AI. + + Azure AI doesn't have a standard model listing endpoint, + so this returns an empty list. 
+ """ + return [] + ######################################################### # Not implemented methods ######################################################### + + @staticmethod + def strip_model_router_prefix(model: str) -> str: + """ + Strip the model_router prefix from model name. + Examples: + - "model_router/gpt-4o" -> "gpt-4o" + - "model-router/gpt-4o" -> "gpt-4o" + - "gpt-4o" -> "gpt-4o" + + Args: + model: Model name potentially with model_router prefix + + Returns: + Model name without the prefix + """ + if "model_router/" in model: + return model.split("model_router/", 1)[1] + if "model-router/" in model: + return model.split("model-router/", 1)[1] + return model + + @staticmethod + def get_base_model(model: str) -> str: + """ + Get the base model name, stripping any Azure AI routing prefixes. + + Args: + model: Model name potentially with routing prefixes + + Returns: + Base model name + """ + # Strip model_router prefix if present + model = AzureFoundryModelInfo.strip_model_router_prefix(model) + return model @staticmethod - def get_base_model(model: str) -> Optional[str]: - raise NotImplementedError("Azure Foundry does not support base model") + def get_azure_ai_config_for_model(model: str): + """ + Get the appropriate Azure AI config class for the given model. + + Routes to specialized configs based on model type: + - Model Router: AzureModelRouterConfig + - Claude models: AzureAnthropicConfig + - Default: AzureAIStudioConfig + + Args: + model: The model name + + Returns: + The appropriate config instance + """ + azure_ai_route = AzureFoundryModelInfo.get_azure_ai_route(model) + + if azure_ai_route == "model_router": + from litellm.llms.azure_ai.azure_model_router.transformation import ( + AzureModelRouterConfig, + ) + return AzureModelRouterConfig() + elif "claude" in model.lower(): + from litellm.llms.azure_ai.anthropic.transformation import ( + AzureAnthropicConfig, + ) + return AzureAnthropicConfig() + else: + from litellm.llms.azure_ai.chat.transformation import AzureAIStudioConfig + return AzureAIStudioConfig() def validate_environment( self, @@ -53,4 +168,6 @@ def validate_environment( api_base: Optional[str] = None, ) -> dict: """Azure Foundry sends api key in query params""" - raise NotImplementedError("Azure Foundry does not support environment validation") + raise NotImplementedError( + "Azure Foundry does not support environment validation" + ) diff --git a/litellm/llms/azure_ai/cost_calculator.py b/litellm/llms/azure_ai/cost_calculator.py new file mode 100644 index 000000000000..999f94da182a --- /dev/null +++ b/litellm/llms/azure_ai/cost_calculator.py @@ -0,0 +1,121 @@ +""" +Azure AI cost calculation helper. +Handles Azure AI Foundry Model Router flat cost and other Azure AI specific pricing. +""" + +from typing import Optional, Tuple + +from litellm._logging import verbose_logger +from litellm.litellm_core_utils.llm_cost_calc.utils import generic_cost_per_token +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def _is_azure_model_router(model: str) -> bool: + """ + Check if the model is Azure AI Foundry Model Router. 
+ + Detects patterns like: + - "azure-model-router" + - "model-router" + - "model_router/" + - "model-router/" + + Args: + model: The model name + + Returns: + bool: True if this is a model router model + """ + model_lower = model.lower() + return ( + "model-router" in model_lower + or "model_router" in model_lower + or model_lower == "azure-model-router" + ) + + +def calculate_azure_model_router_flat_cost(model: str, prompt_tokens: int) -> float: + """ + Calculate the flat cost for Azure AI Foundry Model Router. + + Args: + model: The model name (should be a model router model) + prompt_tokens: Number of prompt tokens + + Returns: + float: The flat cost in USD, or 0.0 if not applicable + """ + if not _is_azure_model_router(model): + return 0.0 + + # Get the model router pricing from model_prices_and_context_window.json + # Use "model_router" as the key (without actual model name suffix) + model_info = get_model_info(model="model_router", custom_llm_provider="azure_ai") + router_flat_cost_per_token = model_info.get("input_cost_per_token", 0) + + if router_flat_cost_per_token > 0: + return prompt_tokens * router_flat_cost_per_token + + return 0.0 + + +def cost_per_token( + model: str, usage: Usage, response_time_ms: Optional[float] = 0.0 +) -> Tuple[float, float]: + """ + Calculate the cost per token for Azure AI models. + + For Azure AI Foundry Model Router: + - Adds a flat cost of $0.14 per million input tokens (from model_prices_and_context_window.json) + - Plus the cost of the actual model used (handled by generic_cost_per_token) + + Args: + model: str, the model name without provider prefix + usage: LiteLLM Usage block + response_time_ms: Optional response time in milliseconds + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + + Raises: + ValueError: If the model is not found in the cost map and cost cannot be calculated + (except for Model Router models where we return just the routing flat cost) + """ + prompt_cost = 0.0 + completion_cost = 0.0 + + # Calculate base cost using generic cost calculator + # This may raise an exception if the model is not in the cost map + try: + prompt_cost, completion_cost = generic_cost_per_token( + model=model, + usage=usage, + custom_llm_provider="azure_ai", + ) + except Exception as e: + # For Model Router, the model name (e.g., "azure-model-router") may not be in the cost map + # because it's a routing service, not an actual model. In this case, we continue + # to calculate just the routing flat cost. + if not _is_azure_model_router(model): + # Re-raise for non-router models - they should have pricing defined + raise + verbose_logger.debug( + f"Azure AI Model Router: model '{model}' not in cost map, calculating routing flat cost only. 
Error: {e}" + ) + + # Add flat cost for Azure Model Router + # The flat cost is defined in model_prices_and_context_window.json for azure_ai/model_router + if _is_azure_model_router(model): + router_flat_cost = calculate_azure_model_router_flat_cost(model, usage.prompt_tokens) + + if router_flat_cost > 0: + verbose_logger.debug( + f"Azure AI Model Router flat cost: ${router_flat_cost:.6f} " + f"({usage.prompt_tokens} tokens × ${router_flat_cost / usage.prompt_tokens:.9f}/token)" + ) + + # Add flat cost to prompt cost + prompt_cost += router_flat_cost + + return prompt_cost, completion_cost diff --git a/litellm/llms/azure_ai/embed/handler.py b/litellm/llms/azure_ai/embed/handler.py index 13b8cc4cf29c..67733d1ccb59 100644 --- a/litellm/llms/azure_ai/embed/handler.py +++ b/litellm/llms/azure_ai/embed/handler.py @@ -58,7 +58,7 @@ async def async_image_embedding( data: ImageEmbeddingRequest, timeout: float, logging_obj, - model_response: litellm.EmbeddingResponse, + model_response: EmbeddingResponse, optional_params: dict, api_key: Optional[str], api_base: Optional[str], @@ -138,7 +138,7 @@ async def async_embedding( input: List, timeout: float, logging_obj, - model_response: litellm.EmbeddingResponse, + model_response: EmbeddingResponse, optional_params: dict, api_key: Optional[str] = None, api_base: Optional[str] = None, diff --git a/litellm/llms/azure_ai/image_edit/__init__.py b/litellm/llms/azure_ai/image_edit/__init__.py index e0e57bec403e..e3acd610446a 100644 --- a/litellm/llms/azure_ai/image_edit/__init__.py +++ b/litellm/llms/azure_ai/image_edit/__init__.py @@ -1,15 +1,28 @@ +from litellm.llms.azure_ai.image_generation.flux_transformation import ( + AzureFoundryFluxImageGenerationConfig, +) from litellm.llms.base_llm.image_edit.transformation import BaseImageEditConfig +from .flux2_transformation import AzureFoundryFlux2ImageEditConfig from .transformation import AzureFoundryFluxImageEditConfig -__all__ = ["AzureFoundryFluxImageEditConfig"] +__all__ = ["AzureFoundryFluxImageEditConfig", "AzureFoundryFlux2ImageEditConfig"] def get_azure_ai_image_edit_config(model: str) -> BaseImageEditConfig: - model = model.lower() - model = model.replace("-", "") - model = model.replace("_", "") - if model == "" or "flux" in model: # empty model is flux + """ + Get the appropriate image edit config for an Azure AI model. 
+ + - FLUX 2 models use JSON with base64 image + - FLUX 1 models use multipart/form-data + """ + # Check if it's a FLUX 2 model + if AzureFoundryFluxImageGenerationConfig.is_flux2_model(model): + return AzureFoundryFlux2ImageEditConfig() + + # Default to FLUX 1 config for other FLUX models + model_normalized = model.lower().replace("-", "").replace("_", "") + if model_normalized == "" or "flux" in model_normalized: return AzureFoundryFluxImageEditConfig() - else: - raise ValueError(f"Model {model} is not supported for Azure AI image editing.") + + raise ValueError(f"Model {model} is not supported for Azure AI image editing.") diff --git a/litellm/llms/azure_ai/image_edit/flux2_transformation.py b/litellm/llms/azure_ai/image_edit/flux2_transformation.py new file mode 100644 index 000000000000..77d46ff91795 --- /dev/null +++ b/litellm/llms/azure_ai/image_edit/flux2_transformation.py @@ -0,0 +1,173 @@ +import base64 +from io import BufferedReader +from typing import Any, Dict, Optional, Tuple + +from httpx._types import RequestFiles + +import litellm +from litellm.llms.azure_ai.common_utils import AzureFoundryModelInfo +from litellm.llms.azure_ai.image_generation.flux_transformation import ( + AzureFoundryFluxImageGenerationConfig, +) +from litellm.llms.openai.image_edit.transformation import OpenAIImageEditConfig +from litellm.secret_managers.main import get_secret_str +from litellm.types.images.main import ImageEditOptionalRequestParams +from litellm.types.llms.openai import FileTypes +from litellm.types.router import GenericLiteLLMParams + + +class AzureFoundryFlux2ImageEditConfig(OpenAIImageEditConfig): + """ + Azure AI Foundry FLUX 2 image edit config + + Supports FLUX 2 models (e.g., flux.2-pro) for image editing. + Uses the same /providers/blackforestlabs/v1/flux-2-pro endpoint as image generation, + with the image passed as base64 in JSON body. + """ + + def get_supported_openai_params(self, model: str) -> list: + """ + FLUX 2 supports a subset of OpenAI image edit params + """ + return [ + "prompt", + "image", + "model", + "n", + "size", + ] + + def map_openai_params( + self, + image_edit_optional_params: ImageEditOptionalRequestParams, + model: str, + drop_params: bool, + ) -> Dict: + """ + Map OpenAI params to FLUX 2 params. + FLUX 2 uses the same param names as OpenAI for supported params. + """ + mapped_params: Dict[str, Any] = {} + supported_params = self.get_supported_openai_params(model) + + for key, value in dict(image_edit_optional_params).items(): + if key in supported_params and value is not None: + mapped_params[key] = value + + return mapped_params + + def use_multipart_form_data(self) -> bool: + """FLUX 2 uses JSON requests, not multipart/form-data.""" + return False + + def validate_environment( + self, + headers: dict, + model: str, + api_key: Optional[str] = None, + ) -> dict: + """ + Validate Azure AI Foundry environment and set up authentication + """ + api_key = AzureFoundryModelInfo.get_api_key(api_key) + + if not api_key: + raise ValueError( + f"Azure AI API key is required for model {model}. Set AZURE_AI_API_KEY environment variable or pass api_key parameter." 
+ ) + + headers.update( + { + "Api-Key": api_key, + "Content-Type": "application/json", + } + ) + return headers + + def transform_image_edit_request( + self, + model: str, + prompt: Optional[str], + image: Optional[FileTypes], + image_edit_optional_request_params: Dict, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[Dict, RequestFiles]: + """ + Transform image edit request for FLUX 2. + + FLUX 2 uses the same endpoint for generation and editing, + with the image passed as base64 in the JSON body. + """ + if prompt is None: + raise ValueError("FLUX 2 image edit requires a prompt.") + + if image is None: + raise ValueError("FLUX 2 image edit requires an image.") + + image_b64 = self._convert_image_to_base64(image) + + # Build request body with required params + request_body: Dict[str, Any] = { + "prompt": prompt, + "image": image_b64, + "model": model, + } + + # Add mapped optional params (already filtered by map_openai_params) + request_body.update(image_edit_optional_request_params) + + # Return JSON body and empty files list (FLUX 2 doesn't use multipart) + return request_body, [] + + def _convert_image_to_base64(self, image: Any) -> str: + """Convert image file to base64 string""" + # Handle list of images (take first one) + if isinstance(image, list): + if len(image) == 0: + raise ValueError("Empty image list provided") + image = image[0] + + if isinstance(image, BufferedReader): + image_bytes = image.read() + image.seek(0) # Reset file pointer for potential reuse + elif isinstance(image, bytes): + image_bytes = image + elif hasattr(image, "read"): + image_bytes = image.read() # type: ignore + else: + raise ValueError(f"Unsupported image type: {type(image)}") + + return base64.b64encode(image_bytes).decode("utf-8") + + def get_complete_url( + self, + model: str, + api_base: Optional[str], + litellm_params: dict, + ) -> str: + """ + Constructs a complete URL for Azure AI Foundry FLUX 2 image edits. + + Uses the same /providers/blackforestlabs/v1/flux-2-pro endpoint as image generation. + """ + api_base = AzureFoundryModelInfo.get_api_base(api_base) + + if api_base is None: + raise ValueError( + "Azure AI API base is required. Set AZURE_AI_API_BASE environment variable or pass api_base parameter." + ) + + api_version = ( + litellm_params.get("api_version") + or litellm.api_version + or get_secret_str("AZURE_AI_API_VERSION") + or "preview" + ) + + return AzureFoundryFluxImageGenerationConfig.get_flux2_image_generation_url( + api_base=api_base, + model=model, + api_version=api_version, + ) + diff --git a/litellm/llms/azure_ai/image_edit/transformation.py b/litellm/llms/azure_ai/image_edit/transformation.py index 47f612912ce4..930b6d4db906 100644 --- a/litellm/llms/azure_ai/image_edit/transformation.py +++ b/litellm/llms/azure_ai/image_edit/transformation.py @@ -71,9 +71,11 @@ def get_complete_url( "Azure AI API base is required. Set AZURE_AI_API_BASE environment variable or pass api_base parameter." 
) - api_version = (litellm_params.get("api_version") or litellm.api_version - or get_secret_str("AZURE_AI_API_VERSION") - ) + api_version = ( + litellm_params.get("api_version") + or litellm.api_version + or get_secret_str("AZURE_AI_API_VERSION") + ) if api_version is None: # API version is mandatory for Azure AI Foundry raise ValueError( diff --git a/litellm/llms/azure_ai/image_generation/flux_transformation.py b/litellm/llms/azure_ai/image_generation/flux_transformation.py index 5325f32ef636..6a1868d94cce 100644 --- a/litellm/llms/azure_ai/image_generation/flux_transformation.py +++ b/litellm/llms/azure_ai/image_generation/flux_transformation.py @@ -1,3 +1,5 @@ +from typing import Optional + from litellm.llms.openai.image_generation import GPTImageGenerationConfig @@ -11,4 +13,56 @@ class AzureFoundryFluxImageGenerationConfig(GPTImageGenerationConfig): From our test suite - following GPTImageGenerationConfig is working for this model """ - pass + + @staticmethod + def get_flux2_image_generation_url( + api_base: Optional[str], + model: str, + api_version: Optional[str], + ) -> str: + """ + Constructs the complete URL for Azure AI FLUX 2 image generation. + + FLUX 2 models on Azure AI use a different URL pattern than standard Azure OpenAI: + - Standard: /openai/deployments/{model}/images/generations + - FLUX 2: /providers/blackforestlabs/v1/flux-2-pro + + Args: + api_base: Base URL (e.g., https://litellm-ci-cd-prod.services.ai.azure.com) + model: Model name (e.g., flux.2-pro) + api_version: API version (e.g., preview) + + Returns: + Complete URL for the FLUX 2 image generation endpoint + """ + if api_base is None: + raise ValueError( + "api_base is required for Azure AI FLUX 2 image generation" + ) + + api_base = api_base.rstrip("/") + api_version = api_version or "preview" + + # If the api_base already contains /providers/, it's already a complete path + if "/providers/" in api_base: + if "?" in api_base: + return api_base + return f"{api_base}?api-version={api_version}" + + # Construct the FLUX 2 provider path + # Model name flux.2-pro maps to endpoint flux-2-pro + return f"{api_base}/providers/blackforestlabs/v1/flux-2-pro?api-version={api_version}" + + @staticmethod + def is_flux2_model(model: str) -> bool: + """ + Check if the model is an Azure AI FLUX 2 model. + + Args: + model: Model name (e.g., flux.2-pro, azure_ai/flux.2-pro) + + Returns: + True if the model is a FLUX 2 model + """ + model_lower = model.lower().replace(".", "-").replace("_", "-") + return "flux-2" in model_lower or "flux2" in model_lower diff --git a/litellm/llms/azure_ai/ocr/__init__.py b/litellm/llms/azure_ai/ocr/__init__.py index 86f7e53d60b0..7182a750b450 100644 --- a/litellm/llms/azure_ai/ocr/__init__.py +++ b/litellm/llms/azure_ai/ocr/__init__.py @@ -1,5 +1,13 @@ """Azure AI OCR module.""" +from .common_utils import get_azure_ai_ocr_config +from .document_intelligence.transformation import ( + AzureDocumentIntelligenceOCRConfig, +) from .transformation import AzureAIOCRConfig -__all__ = ["AzureAIOCRConfig"] +__all__ = [ + "AzureAIOCRConfig", + "AzureDocumentIntelligenceOCRConfig", + "get_azure_ai_ocr_config", +] diff --git a/litellm/llms/azure_ai/ocr/common_utils.py b/litellm/llms/azure_ai/ocr/common_utils.py new file mode 100644 index 000000000000..ef470c749237 --- /dev/null +++ b/litellm/llms/azure_ai/ocr/common_utils.py @@ -0,0 +1,53 @@ +""" +Common utilities for Azure AI OCR providers. + +This module provides routing logic to determine which OCR configuration to use +based on the model name. 
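# --- Illustrative sketch (annotation, not part of this diff) ---
# Exercising the FLUX 2 helpers defined above with the example values from their
# docstrings. Assumes litellm is importable.
from litellm.llms.azure_ai.image_generation.flux_transformation import (
    AzureFoundryFluxImageGenerationConfig,
)

assert AzureFoundryFluxImageGenerationConfig.is_flux2_model("azure_ai/flux.2-pro")
assert not AzureFoundryFluxImageGenerationConfig.is_flux2_model("flux-1.1-pro")

url = AzureFoundryFluxImageGenerationConfig.get_flux2_image_generation_url(
    api_base="https://litellm-ci-cd-prod.services.ai.azure.com",
    model="flux.2-pro",
    api_version="preview",
)
assert url == (
    "https://litellm-ci-cd-prod.services.ai.azure.com"
    "/providers/blackforestlabs/v1/flux-2-pro?api-version=preview"
)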
+""" + +from typing import TYPE_CHECKING, Optional + +from litellm._logging import verbose_logger + +if TYPE_CHECKING: + from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig + + +def get_azure_ai_ocr_config(model: str) -> Optional["BaseOCRConfig"]: + """ + Determine which Azure AI OCR configuration to use based on the model name. + + Azure AI supports multiple OCR services: + - Azure Document Intelligence: azure_ai/doc-intelligence/ + - Mistral OCR (via Azure AI): azure_ai/ + + Args: + model: The model name (e.g., "azure_ai/doc-intelligence/prebuilt-read", + "azure_ai/pixtral-12b-2409") + + Returns: + OCR configuration instance for the specified model + + Examples: + >>> get_azure_ai_ocr_config("azure_ai/doc-intelligence/prebuilt-read") + + + >>> get_azure_ai_ocr_config("azure_ai/pixtral-12b-2409") + + """ + from litellm.llms.azure_ai.ocr.document_intelligence.transformation import ( + AzureDocumentIntelligenceOCRConfig, + ) + from litellm.llms.azure_ai.ocr.transformation import AzureAIOCRConfig + + # Check for Azure Document Intelligence models + if "doc-intelligence" in model or "documentintelligence" in model: + verbose_logger.debug( + f"Routing {model} to Azure Document Intelligence OCR config" + ) + return AzureDocumentIntelligenceOCRConfig() + + # Default to Mistral-based OCR for other azure_ai models + verbose_logger.debug(f"Routing {model} to Azure AI (Mistral) OCR config") + return AzureAIOCRConfig() + diff --git a/litellm/llms/azure_ai/ocr/document_intelligence/__init__.py b/litellm/llms/azure_ai/ocr/document_intelligence/__init__.py new file mode 100644 index 000000000000..372a6a8d7617 --- /dev/null +++ b/litellm/llms/azure_ai/ocr/document_intelligence/__init__.py @@ -0,0 +1,5 @@ +"""Azure Document Intelligence OCR module.""" +from .transformation import AzureDocumentIntelligenceOCRConfig + +__all__ = ["AzureDocumentIntelligenceOCRConfig"] + diff --git a/litellm/llms/azure_ai/ocr/document_intelligence/transformation.py b/litellm/llms/azure_ai/ocr/document_intelligence/transformation.py new file mode 100644 index 000000000000..b1ccfc36d0db --- /dev/null +++ b/litellm/llms/azure_ai/ocr/document_intelligence/transformation.py @@ -0,0 +1,696 @@ +""" +Azure Document Intelligence OCR transformation implementation. + +Azure Document Intelligence (formerly Form Recognizer) provides advanced document analysis capabilities. +This implementation transforms between Mistral OCR format and Azure Document Intelligence API v4.0. + +Note: Azure Document Intelligence API is async - POST returns 202 Accepted with Operation-Location header. +The operation location must be polled until the analysis completes. +""" +import asyncio +import re +import time +from typing import Any, Dict, Optional + +import httpx + +from litellm._logging import verbose_logger +from litellm.constants import ( + AZURE_DOCUMENT_INTELLIGENCE_API_VERSION, + AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI, + AZURE_OPERATION_POLLING_TIMEOUT, +) +from litellm.llms.base_llm.ocr.transformation import ( + BaseOCRConfig, + DocumentType, + OCRPage, + OCRPageDimensions, + OCRRequestData, + OCRResponse, + OCRUsageInfo, +) +from litellm.secret_managers.main import get_secret_str + + +class AzureDocumentIntelligenceOCRConfig(BaseOCRConfig): + """ + Azure Document Intelligence OCR transformation configuration. + + Supports Azure Document Intelligence v4.0 (2024-11-30) API. 
+ Model route: azure_ai/doc-intelligence/ + + Supported models: + - prebuilt-layout: Extracts text with markdown, tables, and structure (closest to Mistral OCR) + - prebuilt-read: Basic text extraction optimized for reading + - prebuilt-document: General document analysis + + Reference: https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/ + """ + + def __init__(self) -> None: + super().__init__() + + def get_supported_ocr_params(self, model: str) -> list: + """ + Get supported OCR parameters for Azure Document Intelligence. + + Azure DI has minimal optional parameters compared to Mistral OCR. + Most Mistral-specific params are ignored during transformation. + """ + return [] + + def validate_environment( + self, + headers: Dict, + model: str, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + litellm_params: Optional[dict] = None, + **kwargs, + ) -> Dict: + """ + Validate environment and return headers for Azure Document Intelligence. + + Authentication uses Ocp-Apim-Subscription-Key header. + """ + # Get API key from environment if not provided + if api_key is None: + api_key = get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_API_KEY") + + if api_key is None: + raise ValueError( + "Missing Azure Document Intelligence API Key - Set AZURE_DOCUMENT_INTELLIGENCE_API_KEY environment variable or pass api_key parameter" + ) + + # Validate API base/endpoint is provided + if api_base is None: + api_base = get_secret_str("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT") + + if api_base is None: + raise ValueError( + "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter" + ) + + headers = { + "Ocp-Apim-Subscription-Key": api_key, + "Content-Type": "application/json", + **headers, + } + + return headers + + def get_complete_url( + self, + api_base: Optional[str], + model: str, + optional_params: dict, + litellm_params: Optional[dict] = None, + **kwargs, + ) -> str: + """ + Get complete URL for Azure Document Intelligence endpoint. + + Format: {endpoint}/documentintelligence/documentModels/{modelId}:analyze?api-version=2024-11-30 + + Note: API version 2024-11-30 uses /documentintelligence/ path (not /formrecognizer/) + + Args: + api_base: Azure Document Intelligence endpoint (e.g., https://your-resource.cognitiveservices.azure.com) + model: Model ID (e.g., "prebuilt-layout", "prebuilt-read") + optional_params: Optional parameters + + Returns: Complete URL for Azure DI analyze endpoint + """ + if api_base is None: + raise ValueError( + "Missing Azure Document Intelligence Endpoint - Set AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT environment variable or pass api_base parameter" + ) + + # Ensure no trailing slash + api_base = api_base.rstrip("/") + + # Extract model ID from full model path if needed + # Model can be "prebuilt-layout" or "azure_ai/doc-intelligence/prebuilt-layout" + model_id = model + if "/" in model: + # Extract the last part after the last slash + model_id = model.split("/")[-1] + + # Azure Document Intelligence analyze endpoint + # Note: API version 2024-11-30+ uses /documentintelligence/ (not /formrecognizer/) + return f"{api_base}/documentintelligence/documentModels/{model_id}:analyze?api-version={AZURE_DOCUMENT_INTELLIGENCE_API_VERSION}" + + def _extract_base64_from_data_uri(self, data_uri: str) -> str: + """ + Extract base64 content from a data URI. + + Args: + data_uri: Data URI like "data:application/pdf;base64,..." 
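# --- Illustrative sketch (annotation, not part of this diff) ---
# The analyze-endpoint URL shape produced by get_complete_url above, mirrored as
# a standalone function. The default api_version mirrors the 2024-11-30 version
# documented above; the resource name is a placeholder.
def di_analyze_url(endpoint: str, model: str, api_version: str = "2024-11-30") -> str:
    endpoint = endpoint.rstrip("/")
    model_id = model.split("/")[-1]  # "azure_ai/doc-intelligence/prebuilt-layout" -> "prebuilt-layout"
    return (
        f"{endpoint}/documentintelligence/documentModels/{model_id}:analyze"
        f"?api-version={api_version}"
    )


assert di_analyze_url(
    "https://your-resource.cognitiveservices.azure.com",
    "azure_ai/doc-intelligence/prebuilt-layout",
) == (
    "https://your-resource.cognitiveservices.azure.com/documentintelligence"
    "/documentModels/prebuilt-layout:analyze?api-version=2024-11-30"
)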
+ + Returns: + Base64 string without the data URI prefix + """ + # Match pattern: data:[][;base64], + match = re.match(r"data:([^;]+)(?:;base64)?,(.+)", data_uri) + if match: + return match.group(2) + return data_uri + + def transform_ocr_request( + self, + model: str, + document: DocumentType, + optional_params: dict, + headers: dict, + **kwargs, + ) -> OCRRequestData: + """ + Transform OCR request to Azure Document Intelligence format. + + Mistral OCR format: + { + "document": { + "type": "document_url", + "document_url": "https://example.com/doc.pdf" + } + } + + Azure DI format: + { + "urlSource": "https://example.com/doc.pdf" + } + OR + { + "base64Source": "base64_encoded_content" + } + + Args: + model: Model name + document: Document dict from user (Mistral format) + optional_params: Already mapped optional parameters + headers: Request headers + + Returns: + OCRRequestData with JSON data + """ + verbose_logger.debug( + f"Azure Document Intelligence transform_ocr_request - model: {model}" + ) + + if not isinstance(document, dict): + raise ValueError(f"Expected document dict, got {type(document)}") + + # Extract document URL from Mistral format + doc_type = document.get("type") + document_url = None + + if doc_type == "document_url": + document_url = document.get("document_url", "") + elif doc_type == "image_url": + document_url = document.get("image_url", "") + else: + raise ValueError( + f"Invalid document type: {doc_type}. Must be 'document_url' or 'image_url'" + ) + + if not document_url: + raise ValueError("Document URL is required") + + # Build Azure DI request + data: Dict[str, Any] = {} + + # Check if it's a data URI (base64) + if document_url.startswith("data:"): + # Extract base64 content + base64_content = self._extract_base64_from_data_uri(document_url) + data["base64Source"] = base64_content + verbose_logger.debug("Using base64Source for Azure Document Intelligence") + else: + # Regular URL + data["urlSource"] = document_url + verbose_logger.debug("Using urlSource for Azure Document Intelligence") + + # Azure DI doesn't support most Mistral-specific params + # Ignore pages, include_image_base64, etc. + + return OCRRequestData(data=data, files=None) + + def _extract_page_markdown(self, page_data: Dict[str, Any]) -> str: + """ + Extract text from Azure DI page and format as markdown. + + Azure DI provides text in 'lines' array. We concatenate them with newlines. + + Args: + page_data: Azure DI page object + + Returns: + Markdown-formatted text + """ + lines = page_data.get("lines", []) + if not lines: + return "" + + # Extract text content from each line + text_lines = [line.get("content", "") for line in lines] + + # Join with newlines to preserve structure + return "\n".join(text_lines) + + def _convert_dimensions( + self, width: float, height: float, unit: str + ) -> OCRPageDimensions: + """ + Convert Azure DI dimensions to pixels. + + Azure DI provides dimensions in inches. We convert to pixels using configured DPI. 
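# --- Illustrative sketch (annotation, not part of this diff) ---
# The data-URI handling and the Mistral -> Azure DI request shapes described in
# _extract_base64_from_data_uri and transform_ocr_request above.
import re


def extract_base64(data_uri: str) -> str:
    match = re.match(r"data:([^;]+)(?:;base64)?,(.+)", data_uri)
    return match.group(2) if match else data_uri


assert extract_base64("data:application/pdf;base64,JVBERi0=") == "JVBERi0="

# Mistral-style document input                                    -> Azure DI request body
# {"type": "document_url", "document_url": "https://example.com/doc.pdf"} -> {"urlSource": "https://example.com/doc.pdf"}
# {"type": "document_url", "document_url": "data:application/pdf;base64,..."} -> {"base64Source": "..."}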
+ + Args: + width: Width in specified unit + height: Height in specified unit + unit: Unit of measurement (e.g., "inch") + + Returns: + OCRPageDimensions with pixel values + """ + # Convert to pixels using configured DPI + dpi = AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI + if unit == "inch": + width_px = int(width * dpi) + height_px = int(height * dpi) + else: + # If unit is not inches, assume it's already in pixels + width_px = int(width) + height_px = int(height) + + return OCRPageDimensions(width=width_px, height=height_px, dpi=dpi) + + @staticmethod + def _check_timeout(start_time: float, timeout_secs: int) -> None: + """ + Check if operation has timed out. + + Args: + start_time: Start time of the operation + timeout_secs: Timeout duration in seconds + + Raises: + TimeoutError: If operation has exceeded timeout + """ + if time.time() - start_time > timeout_secs: + raise TimeoutError( + f"Azure Document Intelligence operation polling timed out after {timeout_secs} seconds" + ) + + @staticmethod + def _get_retry_after(response: httpx.Response) -> int: + """ + Get retry-after duration from response headers. + + Args: + response: HTTP response + + Returns: + Retry-after duration in seconds (default: 2) + """ + retry_after = int(response.headers.get("retry-after", "2")) + verbose_logger.debug(f"Retry polling after: {retry_after} seconds") + return retry_after + + @staticmethod + def _check_operation_status(response: httpx.Response) -> str: + """ + Check Azure DI operation status from response. + + Args: + response: HTTP response from operation endpoint + + Returns: + Operation status string + + Raises: + ValueError: If operation failed or status is unknown + """ + try: + result = response.json() + status = result.get("status") + + verbose_logger.debug(f"Azure DI operation status: {status}") + + if status == "succeeded": + return "succeeded" + elif status == "failed": + error_msg = result.get("error", {}).get("message", "Unknown error") + raise ValueError( + f"Azure Document Intelligence analysis failed: {error_msg}" + ) + elif status in ["running", "notStarted"]: + return "running" + else: + raise ValueError(f"Unknown operation status: {status}") + + except Exception as e: + if "succeeded" in str(e) or "failed" in str(e): + raise + # If we can't parse JSON, something went wrong + raise ValueError(f"Failed to parse Azure DI operation response: {e}") + + def _poll_operation_sync( + self, + operation_url: str, + headers: Dict[str, str], + timeout_secs: int, + ) -> httpx.Response: + """ + Poll Azure Document Intelligence operation until completion (sync). + + Azure DI POST returns 202 with Operation-Location header. + We need to poll that URL until status is "succeeded" or "failed". 
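# --- Illustrative sketch (annotation, not part of this diff) ---
# The inch-to-pixel conversion in _convert_dimensions above, using 96 DPI to
# match the example in transform_ocr_response's docstring; the real default
# comes from AZURE_DOCUMENT_INTELLIGENCE_DEFAULT_DPI in litellm.constants.
def to_pixels(width_in: float, height_in: float, dpi: int = 96) -> tuple:
    return int(width_in * dpi), int(height_in * dpi)


# A US-letter page reported by Azure DI as 8.5 x 11 inches
assert to_pixels(8.5, 11) == (816, 1056)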
+ + Args: + operation_url: The Operation-Location URL to poll + headers: Request headers (including auth) + timeout_secs: Total timeout in seconds + + Returns: + Final response with completed analysis + """ + from litellm.llms.custom_httpx.http_handler import _get_httpx_client + + client = _get_httpx_client() + start_time = time.time() + + verbose_logger.debug(f"Polling Azure DI operation: {operation_url}") + + while True: + self._check_timeout(start_time=start_time, timeout_secs=timeout_secs) + + # Poll the operation status + response = client.get(url=operation_url, headers=headers) + + # Check operation status + status = self._check_operation_status(response=response) + + if status == "succeeded": + return response + elif status == "running": + # Wait before polling again + retry_after = self._get_retry_after(response=response) + time.sleep(retry_after) + + async def _poll_operation_async( + self, + operation_url: str, + headers: Dict[str, str], + timeout_secs: int, + ) -> httpx.Response: + """ + Poll Azure Document Intelligence operation until completion (async). + + Args: + operation_url: The Operation-Location URL to poll + headers: Request headers (including auth) + timeout_secs: Total timeout in seconds + + Returns: + Final response with completed analysis + """ + import litellm + from litellm.llms.custom_httpx.http_handler import get_async_httpx_client + + client = get_async_httpx_client(llm_provider=litellm.LlmProviders.AZURE_AI) + start_time = time.time() + + verbose_logger.debug(f"Polling Azure DI operation (async): {operation_url}") + + while True: + self._check_timeout(start_time=start_time, timeout_secs=timeout_secs) + + # Poll the operation status + response = await client.get(url=operation_url, headers=headers) + + # Check operation status + status = self._check_operation_status(response=response) + + if status == "succeeded": + return response + elif status == "running": + # Wait before polling again + retry_after = self._get_retry_after(response=response) + await asyncio.sleep(retry_after) + + def transform_ocr_response( + self, + model: str, + raw_response: httpx.Response, + logging_obj: Any, + **kwargs, + ) -> OCRResponse: + """ + Transform Azure Document Intelligence response to Mistral OCR format. + + Handles async operation polling: If response is 202 Accepted, polls Operation-Location + until analysis completes. + + Azure DI response (after polling): + { + "status": "succeeded", + "analyzeResult": { + "content": "Full document text...", + "pages": [ + { + "pageNumber": 1, + "width": 8.5, + "height": 11, + "unit": "inch", + "lines": [{"content": "text", "boundingBox": [...]}] + } + ] + } + } + + Mistral OCR format: + { + "pages": [ + { + "index": 0, + "markdown": "extracted text", + "dimensions": {"width": 816, "height": 1056, "dpi": 96} + } + ], + "model": "azure_ai/doc-intelligence/prebuilt-layout", + "usage_info": {"pages_processed": 1}, + "object": "ocr" + } + + Args: + model: Model name + raw_response: Raw HTTP response from Azure DI (may be 202 Accepted) + logging_obj: Logging object + + Returns: + OCRResponse in Mistral format + """ + try: + # Check if we got 202 Accepted (async operation started) + if raw_response.status_code == 202: + verbose_logger.debug( + "Azure DI returned 202 Accepted, polling operation..." 
+ ) + + # Get Operation-Location header + operation_url = raw_response.headers.get("Operation-Location") + if not operation_url: + raise ValueError( + "Azure Document Intelligence returned 202 but no Operation-Location header found" + ) + + # Get headers for polling (need auth) + poll_headers = { + "Ocp-Apim-Subscription-Key": raw_response.request.headers.get( + "Ocp-Apim-Subscription-Key", "" + ) + } + + # Get timeout from kwargs or use default + timeout_secs = AZURE_OPERATION_POLLING_TIMEOUT + + # Poll until operation completes + raw_response = self._poll_operation_sync( + operation_url=operation_url, + headers=poll_headers, + timeout_secs=timeout_secs, + ) + + # Now parse the completed response + response_json = raw_response.json() + + verbose_logger.debug( + f"Azure Document Intelligence response status: {response_json.get('status')}" + ) + + # Check if request succeeded + status = response_json.get("status") + if status != "succeeded": + raise ValueError( + f"Azure Document Intelligence analysis failed with status: {status}" + ) + + # Extract analyze result + analyze_result = response_json.get("analyzeResult", {}) + azure_pages = analyze_result.get("pages", []) + + # Transform pages to Mistral format + mistral_pages = [] + for azure_page in azure_pages: + page_number = azure_page.get("pageNumber", 1) + index = page_number - 1 # Convert to 0-based index + + # Extract markdown text + markdown = self._extract_page_markdown(azure_page) + + # Convert dimensions + width = azure_page.get("width", 8.5) + height = azure_page.get("height", 11) + unit = azure_page.get("unit", "inch") + dimensions = self._convert_dimensions( + width=width, height=height, unit=unit + ) + + # Build OCR page + ocr_page = OCRPage( + index=index, markdown=markdown, dimensions=dimensions + ) + mistral_pages.append(ocr_page) + + # Build usage info + usage_info = OCRUsageInfo( + pages_processed=len(mistral_pages), doc_size_bytes=None + ) + + # Return Mistral OCR response + return OCRResponse( + pages=mistral_pages, + model=model, + usage_info=usage_info, + object="ocr", + ) + + except Exception as e: + verbose_logger.error( + f"Error parsing Azure Document Intelligence response: {e}" + ) + raise e + + async def async_transform_ocr_response( + self, + model: str, + raw_response: httpx.Response, + logging_obj: Any, + **kwargs, + ) -> OCRResponse: + """ + Async transform Azure Document Intelligence response to Mistral OCR format. + + Handles async operation polling: If response is 202 Accepted, polls Operation-Location + until analysis completes using async polling. + + Args: + model: Model name + raw_response: Raw HTTP response from Azure DI (may be 202 Accepted) + logging_obj: Logging object + + Returns: + OCRResponse in Mistral format + """ + try: + # Check if we got 202 Accepted (async operation started) + if raw_response.status_code == 202: + verbose_logger.debug( + "Azure DI returned 202 Accepted, polling operation (async)..." 
+ ) + + # Get Operation-Location header + operation_url = raw_response.headers.get("Operation-Location") + if not operation_url: + raise ValueError( + "Azure Document Intelligence returned 202 but no Operation-Location header found" + ) + + # Get headers for polling (need auth) + poll_headers = { + "Ocp-Apim-Subscription-Key": raw_response.request.headers.get( + "Ocp-Apim-Subscription-Key", "" + ) + } + + # Get timeout from kwargs or use default + timeout_secs = AZURE_OPERATION_POLLING_TIMEOUT + + # Poll until operation completes (async) + raw_response = await self._poll_operation_async( + operation_url=operation_url, + headers=poll_headers, + timeout_secs=timeout_secs, + ) + + # Now parse the completed response + response_json = raw_response.json() + + verbose_logger.debug( + f"Azure Document Intelligence response status: {response_json.get('status')}" + ) + + # Check if request succeeded + status = response_json.get("status") + if status != "succeeded": + raise ValueError( + f"Azure Document Intelligence analysis failed with status: {status}" + ) + + # Extract analyze result + analyze_result = response_json.get("analyzeResult", {}) + azure_pages = analyze_result.get("pages", []) + + # Transform pages to Mistral format + mistral_pages = [] + for azure_page in azure_pages: + page_number = azure_page.get("pageNumber", 1) + index = page_number - 1 # Convert to 0-based index + + # Extract markdown text + markdown = self._extract_page_markdown(azure_page) + + # Convert dimensions + width = azure_page.get("width", 8.5) + height = azure_page.get("height", 11) + unit = azure_page.get("unit", "inch") + dimensions = self._convert_dimensions( + width=width, height=height, unit=unit + ) + + # Build OCR page + ocr_page = OCRPage( + index=index, markdown=markdown, dimensions=dimensions + ) + mistral_pages.append(ocr_page) + + # Build usage info + usage_info = OCRUsageInfo( + pages_processed=len(mistral_pages), doc_size_bytes=None + ) + + # Return Mistral OCR response + return OCRResponse( + pages=mistral_pages, + model=model, + usage_info=usage_info, + object="ocr", + ) + + except Exception as e: + verbose_logger.error( + f"Error parsing Azure Document Intelligence response (async): {e}" + ) + raise e + diff --git a/litellm/llms/azure_ai/ocr/transformation.py b/litellm/llms/azure_ai/ocr/transformation.py index eade2dd765f9..24fc9e86134f 100644 --- a/litellm/llms/azure_ai/ocr/transformation.py +++ b/litellm/llms/azure_ai/ocr/transformation.py @@ -35,6 +35,7 @@ def validate_environment( model: str, api_key: Optional[str] = None, api_base: Optional[str] = None, + litellm_params: Optional[dict] = None, **kwargs, ) -> Dict: """ @@ -73,6 +74,7 @@ def get_complete_url( api_base: Optional[str], model: str, optional_params: dict, + litellm_params: Optional[dict] = None, **kwargs, ) -> str: """ diff --git a/litellm/llms/azure_ai/rerank/transformation.py b/litellm/llms/azure_ai/rerank/transformation.py index 4465e0d70a29..a47b6082c371 100644 --- a/litellm/llms/azure_ai/rerank/transformation.py +++ b/litellm/llms/azure_ai/rerank/transformation.py @@ -18,7 +18,12 @@ class AzureAIRerankConfig(CohereRerankConfig): Azure AI Rerank - Follows the same Spec as Cohere Rerank """ - def get_complete_url(self, api_base: Optional[str], model: str) -> str: + def get_complete_url( + self, + api_base: Optional[str], + model: str, + optional_params: Optional[dict] = None, + ) -> str: if api_base is None: raise ValueError( "Azure AI API Base is required. api_base=None. Set in call or via `AZURE_AI_API_BASE` env var." 
@@ -32,6 +37,7 @@ def validate_environment( headers: dict, model: str, api_key: Optional[str] = None, + optional_params: Optional[dict] = None, ) -> dict: if api_key is None: api_key = get_secret_str("AZURE_AI_API_KEY") or litellm.azure_key diff --git a/litellm/llms/azure_ai/vector_stores/transformation.py b/litellm/llms/azure_ai/vector_stores/transformation.py index f99d2c4c4b28..96cea064ce19 100644 --- a/litellm/llms/azure_ai/vector_stores/transformation.py +++ b/litellm/llms/azure_ai/vector_stores/transformation.py @@ -7,8 +7,10 @@ from litellm.llms.base_llm.vector_store.transformation import BaseVectorStoreConfig from litellm.types.router import GenericLiteLLMParams from litellm.types.vector_stores import ( + BaseVectorStoreAuthCredentials, VectorStoreCreateOptionalRequestParams, VectorStoreCreateResponse, + VectorStoreIndexEndpoints, VectorStoreResultContent, VectorStoreSearchOptionalRequestParams, VectorStoreSearchResponse, @@ -34,6 +36,25 @@ class AzureAIVectorStoreConfig(BaseVectorStoreConfig, BaseAzureLLM): def __init__(self): super().__init__() + def get_vector_store_endpoints_by_type(self) -> VectorStoreIndexEndpoints: + return { + "read": [("GET", "/docs/search"), ("POST", "/docs/search")], + "write": [("PUT", "/docs")], + } + + def get_auth_credentials( + self, litellm_params: dict + ) -> BaseVectorStoreAuthCredentials: + api_key = litellm_params.get("api_key") + if api_key is None: + raise ValueError("api_key is required") + + return { + "headers": { + "api-key": api_key, + } + } + def validate_environment( self, headers: dict, litellm_params: Optional[GenericLiteLLMParams] ) -> dict: diff --git a/litellm/llms/base_llm/chat/transformation.py b/litellm/llms/base_llm/chat/transformation.py index 1867abde3108..ac209904e6e0 100644 --- a/litellm/llms/base_llm/chat/transformation.py +++ b/litellm/llms/base_llm/chat/transformation.py @@ -101,6 +101,7 @@ def get_config(cls): ), ) and v is not None + and not callable(v) # Filter out any callable objects including mocks } def get_json_schema_from_pydantic_object( @@ -131,10 +132,10 @@ def update_optional_params_with_thinking_tokens( Checks 'non_default_params' for 'thinking' and 'max_tokens' - if 'thinking' is enabled and 'max_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS + if 'thinking' is enabled and 'max_tokens' or 'max_completion_tokens' is not specified, set 'max_tokens' to the thinking token budget + DEFAULT_MAX_TOKENS """ is_thinking_enabled = self.is_thinking_enabled(optional_params) - if is_thinking_enabled and "max_tokens" not in non_default_params: + if is_thinking_enabled and ("max_tokens" not in non_default_params and "max_completion_tokens" not in non_default_params): thinking_token_budget = cast(dict, optional_params["thinking"]).get( "budget_tokens", None ) @@ -436,3 +437,23 @@ def supports_stream_param_in_request_body(self) -> bool: By default, this is true for almost all providers. """ return True + + def calculate_additional_costs( + self, model: str, prompt_tokens: int, completion_tokens: int + ) -> Optional[dict]: + """ + Calculate any additional costs beyond standard token costs. + + This is used for provider-specific infrastructure costs, routing fees, etc. + + Args: + model: The model name + prompt_tokens: Number of prompt tokens + completion_tokens: Number of completion tokens + + Returns: + Optional dictionary with cost names and amounts, e.g.: + {"Infrastructure Fee": 0.001, "Routing Cost": 0.0005} + Returns None if no additional costs apply. 
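+
+        Example override (sketch for a hypothetical provider charging a flat routing fee):
+            def calculate_additional_costs(self, model, prompt_tokens, completion_tokens):
+                return {"Routing Cost": 0.0005}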
+ """ + return None diff --git a/litellm/llms/base_llm/containers/transformation.py b/litellm/llms/base_llm/containers/transformation.py new file mode 100644 index 000000000000..5ce374c77344 --- /dev/null +++ b/litellm/llms/base_llm/containers/transformation.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import types +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any + +import httpx + +from litellm.types.containers.main import ContainerCreateOptionalRequestParams +from litellm.types.router import GenericLiteLLMParams + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + from litellm.types.containers.main import ( + ContainerFileListResponse as _ContainerFileListResponse, + ) + from litellm.types.containers.main import ( + ContainerListResponse as _ContainerListResponse, + ) + from litellm.types.containers.main import ContainerObject as _ContainerObject + from litellm.types.containers.main import ( + DeleteContainerResult as _DeleteContainerResult, + ) + + from ..chat.transformation import BaseLLMException as _BaseLLMException + + LiteLLMLoggingObj = _LiteLLMLoggingObj + BaseLLMException = _BaseLLMException + ContainerObject = _ContainerObject + DeleteContainerResult = _DeleteContainerResult + ContainerListResponse = _ContainerListResponse + ContainerFileListResponse = _ContainerFileListResponse +else: + LiteLLMLoggingObj = Any + BaseLLMException = Any + ContainerObject = Any + DeleteContainerResult = Any + ContainerListResponse = Any + ContainerFileListResponse = Any + + +class BaseContainerConfig(ABC): + def __init__(self): + pass + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not k.startswith("_abc") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + @abstractmethod + def get_supported_openai_params(self) -> list: + pass + + @abstractmethod + def map_openai_params( + self, + container_create_optional_params: ContainerCreateOptionalRequestParams, + drop_params: bool, + ) -> dict: + pass + + @abstractmethod + def validate_environment( + self, + headers: dict, + api_key: str | None = None, + ) -> dict: + return {} + + @abstractmethod + def get_complete_url( + self, + api_base: str | None, + litellm_params: dict, + ) -> str: + """Get the complete url for the request. + + OPTIONAL - Some providers need `model` in `api_base`. + """ + if api_base is None: + msg = "api_base is required" + raise ValueError(msg) + return api_base + + @abstractmethod + def transform_container_create_request( + self, + name: str, + container_create_optional_request_params: dict, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> dict: + """Transform the container creation request. + + Returns: + dict: Request data for container creation. + """ + ... + + @abstractmethod + def transform_container_create_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ContainerObject: + """Transform the container creation response.""" + ... + + @abstractmethod + def transform_container_list_request( + self, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + after: str | None = None, + limit: int | None = None, + order: str | None = None, + extra_query: dict[str, Any] | None = None, + ) -> tuple[str, dict]: + """Transform the container list request into a URL and params. 
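+
+        Example return value (the URL shown is illustrative):
+            ("https://api.openai.com/v1/containers", {"limit": 20, "order": "desc"})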
+ + Returns: + tuple[str, dict]: (url, params) for the container list request. + """ + ... + + @abstractmethod + def transform_container_list_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ContainerListResponse: + """Transform the container list response.""" + ... + + @abstractmethod + def transform_container_retrieve_request( + self, + container_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> tuple[str, dict]: + """Transform the container retrieve request into a URL and data/params. + + Returns: + tuple[str, dict]: (url, params) for the container retrieve request. + """ + ... + + @abstractmethod + def transform_container_retrieve_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ContainerObject: + """Transform the container retrieve response.""" + ... + + @abstractmethod + def transform_container_delete_request( + self, + container_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> tuple[str, dict]: + """Transform the container delete request into a URL and data. + + Returns: + tuple[str, dict]: (url, data) for the container delete request. + """ + ... + + @abstractmethod + def transform_container_delete_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> DeleteContainerResult: + """Transform the container delete response.""" + ... + + @abstractmethod + def transform_container_file_list_request( + self, + container_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + after: str | None = None, + limit: int | None = None, + order: str | None = None, + extra_query: dict[str, Any] | None = None, + ) -> tuple[str, dict]: + """Transform the container file list request into a URL and params. + + Returns: + tuple[str, dict]: (url, params) for the container file list request. + """ + ... + + @abstractmethod + def transform_container_file_list_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ContainerFileListResponse: + """Transform the container file list response.""" + ... + + @abstractmethod + def transform_container_file_content_request( + self, + container_id: str, + file_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> tuple[str, dict]: + """Transform the container file content request into a URL and params. + + Returns: + tuple[str, dict]: (url, params) for the container file content request. + """ + ... + + @abstractmethod + def transform_container_file_content_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> bytes: + """Transform the container file content response. + + Returns: + bytes: The raw file content. + """ + ... + + def get_error_class( + self, + error_message: str, + status_code: int, + headers: dict | httpx.Headers, + ) -> BaseLLMException: + from ..chat.transformation import BaseLLMException + + raise BaseLLMException( + status_code=status_code, + message=error_message, + headers=headers, + ) + diff --git a/litellm/llms/base_llm/files/azure_blob_storage_backend.py b/litellm/llms/base_llm/files/azure_blob_storage_backend.py new file mode 100644 index 000000000000..db3aa50d89a5 --- /dev/null +++ b/litellm/llms/base_llm/files/azure_blob_storage_backend.py @@ -0,0 +1,312 @@ +""" +Azure Blob Storage backend implementation for file storage. 
+ +This module implements the Azure Blob Storage backend for storing files +in Azure Data Lake Storage Gen2. It inherits from AzureBlobStorageLogger +to reuse all authentication and Azure Storage operations. +""" + +import time +from typing import Optional +from urllib.parse import quote + +from litellm._logging import verbose_logger +from litellm._uuid import uuid + +from .storage_backend import BaseFileStorageBackend +from litellm.integrations.azure_storage.azure_storage import AzureBlobStorageLogger + + +class AzureBlobStorageBackend(BaseFileStorageBackend, AzureBlobStorageLogger): + """ + Azure Blob Storage backend implementation. + + Inherits from AzureBlobStorageLogger to reuse: + - Authentication (account key and Azure AD) + - Service client management + - Token management + - All Azure Storage helper methods + + Reads configuration from the same environment variables as AzureBlobStorageLogger. + """ + + def __init__(self, **kwargs): + """ + Initialize Azure Blob Storage backend. + + Inherits all functionality from AzureBlobStorageLogger which handles: + - Reading environment variables + - Authentication (account key and Azure AD) + - Service client management + - Token management + + Environment variables (same as AzureBlobStorageLogger): + - AZURE_STORAGE_ACCOUNT_NAME (required) + - AZURE_STORAGE_FILE_SYSTEM (required) + - AZURE_STORAGE_ACCOUNT_KEY (optional, if using account key auth) + - AZURE_STORAGE_TENANT_ID (optional, if using Azure AD) + - AZURE_STORAGE_CLIENT_ID (optional, if using Azure AD) + - AZURE_STORAGE_CLIENT_SECRET (optional, if using Azure AD) + + Note: We skip periodic_flush since we're not using this as a logger. + """ + # Initialize AzureBlobStorageLogger (handles all auth and config) + AzureBlobStorageLogger.__init__(self, **kwargs) + + # Disable logging functionality - we're only using this for file storage + # The periodic_flush task will be created but will do nothing since we override it + + async def periodic_flush(self): + """ + Override to do nothing - we're not using this as a logger. + This prevents the periodic flush task from doing any work. + """ + # Do nothing - this class is used for file storage, not logging + return + + async def async_log_success_event(self, *args, **kwargs): + """ + Override to do nothing - we're not using this as a logger. + """ + # Do nothing - this class is used for file storage, not logging + pass + + async def async_log_failure_event(self, *args, **kwargs): + """ + Override to do nothing - we're not using this as a logger. + """ + # Do nothing - this class is used for file storage, not logging + pass + + def _generate_file_name( + self, original_filename: str, file_naming_strategy: str + ) -> str: + """Generate file name based on naming strategy.""" + if file_naming_strategy == "original_filename": + # Use original filename, but sanitize it + return quote(original_filename, safe="") + elif file_naming_strategy == "timestamp": + # Use timestamp + extension = original_filename.split(".")[-1] if "." in original_filename else "" + timestamp = int(time.time() * 1000) # milliseconds + return f"{timestamp}.{extension}" if extension else str(timestamp) + else: # default to "uuid" + # Use UUID + extension = original_filename.split(".")[-1] if "." 
in original_filename else "" + file_uuid = str(uuid.uuid4()) + return f"{file_uuid}.{extension}" if extension else file_uuid + + async def upload_file( + self, + file_content: bytes, + filename: str, + content_type: str, + path_prefix: Optional[str] = None, + file_naming_strategy: str = "uuid", + ) -> str: + """ + Upload a file to Azure Blob Storage. + + Returns the blob URL in format: https://{account}.blob.core.windows.net/{container}/{path} + """ + try: + # Generate file name + file_name = self._generate_file_name(filename, file_naming_strategy) + + # Build full path + if path_prefix: + # Remove leading/trailing slashes and normalize + prefix = path_prefix.strip("/") + full_path = f"{prefix}/{file_name}" + else: + full_path = file_name + + if self.azure_storage_account_key: + # Use Azure SDK with account key (reuse logger's method) + storage_url = await self._upload_file_with_account_key( + file_content=file_content, + full_path=full_path, + ) + else: + # Use REST API with Azure AD token (reuse logger's methods) + storage_url = await self._upload_file_with_azure_ad( + file_content=file_content, + full_path=full_path, + ) + + verbose_logger.debug( + f"Successfully uploaded file to Azure Blob Storage: {storage_url}" + ) + return storage_url + + except Exception as e: + verbose_logger.exception(f"Error uploading file to Azure Blob Storage: {str(e)}") + raise + + async def _upload_file_with_account_key( + self, file_content: bytes, full_path: str + ) -> str: + """Upload file using Azure SDK with account key authentication.""" + # Reuse the logger's service client method + service_client = await self.get_service_client() + file_system_client = service_client.get_file_system_client( + file_system=self.azure_storage_file_system + ) + + # Create filesystem (container) if it doesn't exist + if not await file_system_client.exists(): + await file_system_client.create_file_system() + verbose_logger.debug(f"Created filesystem: {self.azure_storage_file_system}") + + # Extract directory and filename (similar to logger's pattern) + path_parts = full_path.split("/") + if len(path_parts) > 1: + directory_path = "/".join(path_parts[:-1]) + file_name = path_parts[-1] + + # Create directory if needed (like logger does) + directory_client = file_system_client.get_directory_client(directory_path) + if not await directory_client.exists(): + await directory_client.create_directory() + verbose_logger.debug(f"Created directory: {directory_path}") + + # Get file client from directory (same pattern as logger) + file_client = directory_client.get_file_client(file_name) + else: + # No directory, create file directly in root + file_client = file_system_client.get_file_client(full_path) + + # Create, append, and flush (same pattern as logger's upload_to_azure_data_lake_with_azure_account_key) + await file_client.create_file() + await file_client.append_data(data=file_content, offset=0, length=len(file_content)) + await file_client.flush_data(position=len(file_content), offset=0) + + # Return blob URL (not DFS URL) + blob_url = f"https://{self.azure_storage_account_name}.blob.core.windows.net/{self.azure_storage_file_system}/{full_path}" + return blob_url + + async def _upload_file_with_azure_ad( + self, file_content: bytes, full_path: str + ) -> str: + """Upload file using REST API with Azure AD authentication.""" + # Reuse the logger's token management + await self.set_valid_azure_ad_token() + + from litellm.llms.custom_httpx.http_handler import ( + get_async_httpx_client, + httpxSpecialProvider, + ) + + async_client 
= get_async_httpx_client( + llm_provider=httpxSpecialProvider.LoggingCallback + ) + + # Use DFS endpoint for upload + base_url = f"https://{self.azure_storage_account_name}.dfs.core.windows.net/{self.azure_storage_file_system}/{full_path}" + + # Execute 3-step upload process: create, append, flush + # Reuse the logger's helper methods + await self._create_file(async_client, base_url) + # Append data - logger's _append_data expects string, so we create our own for bytes + await self._append_data_bytes(async_client, base_url, file_content) + await self._flush_data(async_client, base_url, len(file_content)) + + # Return blob URL (not DFS URL) + blob_url = f"https://{self.azure_storage_account_name}.blob.core.windows.net/{self.azure_storage_file_system}/{full_path}" + return blob_url + + async def _append_data_bytes( + self, client, base_url: str, file_content: bytes + ): + """Append binary data to file using REST API.""" + from litellm.constants import AZURE_STORAGE_MSFT_VERSION + + headers = { + "x-ms-version": AZURE_STORAGE_MSFT_VERSION, + "Content-Type": "application/octet-stream", + "Authorization": f"Bearer {self.azure_auth_token}", + } + response = await client.patch( + f"{base_url}?action=append&position=0", + headers=headers, + content=file_content, + ) + response.raise_for_status() + + async def download_file(self, storage_url: str) -> bytes: + """ + Download a file from Azure Blob Storage. + + Args: + storage_url: Blob URL in format: https://{account}.blob.core.windows.net/{container}/{path} + + Returns: + bytes: File content + """ + try: + # Parse blob URL to extract path + # URL format: https://{account}.blob.core.windows.net/{container}/{path} + if ".blob.core.windows.net/" not in storage_url: + raise ValueError(f"Invalid Azure Blob Storage URL: {storage_url}") + + # Extract path after container name + container_and_path = storage_url.split(".blob.core.windows.net/", 1)[1] + path_parts = container_and_path.split("/", 1) + if len(path_parts) < 2: + raise ValueError(f"Invalid Azure Blob Storage URL format: {storage_url}") + file_path = path_parts[1] # Path after container name + + if self.azure_storage_account_key: + # Use Azure SDK (reuse logger's service client) + return await self._download_file_with_account_key(file_path) + else: + # Use REST API (reuse logger's token management) + return await self._download_file_with_azure_ad(file_path) + + except Exception as e: + verbose_logger.exception(f"Error downloading file from Azure Blob Storage: {str(e)}") + raise + + async def _download_file_with_account_key(self, file_path: str) -> bytes: + """Download file using Azure SDK with account key.""" + # Reuse the logger's service client method + service_client = await self.get_service_client() + file_system_client = service_client.get_file_system_client( + file_system=self.azure_storage_file_system + ) + # Ensure filesystem exists (should already exist, but check for safety) + if not await file_system_client.exists(): + raise ValueError(f"Filesystem {self.azure_storage_file_system} does not exist") + file_client = file_system_client.get_file_client(file_path) + # Download file + download_response = await file_client.download_file() + file_content = await download_response.readall() + return file_content + + async def _download_file_with_azure_ad(self, file_path: str) -> bytes: + """Download file using REST API with Azure AD token.""" + # Reuse the logger's token management + await self.set_valid_azure_ad_token() + + from litellm.llms.custom_httpx.http_handler import ( + 
get_async_httpx_client, + httpxSpecialProvider, + ) + from litellm.constants import AZURE_STORAGE_MSFT_VERSION + + async_client = get_async_httpx_client( + llm_provider=httpxSpecialProvider.LoggingCallback + ) + + # Use blob endpoint for download (simpler than DFS) + blob_url = f"https://{self.azure_storage_account_name}.blob.core.windows.net/{self.azure_storage_file_system}/{file_path}" + + headers = { + "x-ms-version": AZURE_STORAGE_MSFT_VERSION, + "Authorization": f"Bearer {self.azure_auth_token}", + } + + response = await async_client.get(blob_url, headers=headers) + response.raise_for_status() + return response.content + diff --git a/litellm/llms/base_llm/files/storage_backend.py b/litellm/llms/base_llm/files/storage_backend.py new file mode 100644 index 000000000000..d95704529505 --- /dev/null +++ b/litellm/llms/base_llm/files/storage_backend.py @@ -0,0 +1,79 @@ +""" +Base storage backend interface for file storage backends. + +This module defines the abstract base class that all file storage backends +(e.g., Azure Blob Storage, S3, GCS) must implement. +""" + +from abc import ABC, abstractmethod +from typing import Optional + + +class BaseFileStorageBackend(ABC): + """ + Abstract base class for file storage backends. + + All storage backends (Azure Blob Storage, S3, GCS, etc.) must implement + these methods to provide a consistent interface for file operations. + """ + + @abstractmethod + async def upload_file( + self, + file_content: bytes, + filename: str, + content_type: str, + path_prefix: Optional[str] = None, + file_naming_strategy: str = "uuid", + ) -> str: + """ + Upload a file to the storage backend. + + Args: + file_content: The file content as bytes + filename: Original filename (may be used for naming strategy) + content_type: MIME type of the file + path_prefix: Optional path prefix for organizing files + file_naming_strategy: Strategy for naming files ("uuid", "timestamp", "original_filename") + + Returns: + str: The storage URL where the file can be accessed/downloaded + + Raises: + Exception: If upload fails + """ + pass + + @abstractmethod + async def download_file(self, storage_url: str) -> bytes: + """ + Download a file from the storage backend. + + Args: + storage_url: The storage URL returned from upload_file + + Returns: + bytes: The file content + + Raises: + Exception: If download fails + """ + pass + + async def delete_file(self, storage_url: str) -> None: + """ + Delete a file from the storage backend. + + This is optional and can be overridden by backends that support deletion. + Default implementation does nothing. + + Args: + storage_url: The storage URL of the file to delete + + Raises: + Exception: If deletion fails + """ + # Default implementation: no-op + # Backends can override if they support deletion + pass + diff --git a/litellm/llms/base_llm/files/storage_backend_factory.py b/litellm/llms/base_llm/files/storage_backend_factory.py new file mode 100644 index 000000000000..1685f3fbd26d --- /dev/null +++ b/litellm/llms/base_llm/files/storage_backend_factory.py @@ -0,0 +1,41 @@ +""" +Factory for creating storage backend instances. + +This module provides a factory function to instantiate the correct storage backend +based on the backend type. Backends use the same configuration as their corresponding +callbacks (e.g., azure_storage uses the same env vars as AzureBlobStorageLogger). 
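+
+Illustrative usage (assumes the Azure Storage environment variables are set):
+
+    backend = get_storage_backend("azure_storage")
+    storage_url = await backend.upload_file(
+        file_content=b"...",
+        filename="report.pdf",
+        content_type="application/pdf",
+    )
+    file_bytes = await backend.download_file(storage_url)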
+""" + +from litellm._logging import verbose_logger + +from .azure_blob_storage_backend import AzureBlobStorageBackend +from .storage_backend import BaseFileStorageBackend + + +def get_storage_backend(backend_type: str) -> BaseFileStorageBackend: + """ + Factory function to create a storage backend instance. + + Backends are configured using the same environment variables as their + corresponding callbacks. For example, "azure_storage" uses the same + env vars as AzureBlobStorageLogger. + + Args: + backend_type: Backend type identifier (e.g., "azure_storage") + + Returns: + BaseFileStorageBackend: Instance of the appropriate storage backend + + Raises: + ValueError: If backend_type is not supported + """ + verbose_logger.debug(f"Creating storage backend: type={backend_type}") + + if backend_type == "azure_storage": + return AzureBlobStorageBackend() + else: + raise ValueError( + f"Unsupported storage backend type: {backend_type}. " + f"Supported types: azure_storage" + ) + diff --git a/litellm/llms/base_llm/files/transformation.py b/litellm/llms/base_llm/files/transformation.py index 35b76479cdca..58df15f0c46c 100644 --- a/litellm/llms/base_llm/files/transformation.py +++ b/litellm/llms/base_llm/files/transformation.py @@ -2,11 +2,14 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union import httpx +from openai.types.file_deleted import FileDeleted from litellm.proxy._types import UserAPIKeyAuth +from litellm.types.files import TwoStepFileUploadConfig from litellm.types.llms.openai import ( AllMessageValues, CreateFileRequest, + FileContentRequest, OpenAICreateFileRequestOptionalParams, OpenAIFileObject, OpenAIFilesPurpose, @@ -75,7 +78,15 @@ def transform_create_file_request( create_file_data: CreateFileRequest, optional_params: dict, litellm_params: dict, - ) -> Union[dict, str, bytes]: + ) -> Union[dict, str, bytes, "TwoStepFileUploadConfig"]: + """ + Transform OpenAI-style file creation request into provider-specific format. 
+ + Returns: + - dict: For pre-signed single-step uploads (e.g., Bedrock S3) + - str/bytes: For traditional file uploads + - TwoStepFileUploadConfig: For two-step upload process (e.g., Manus, GCS) + """ pass @abstractmethod @@ -88,6 +99,86 @@ def transform_create_file_response( ) -> OpenAIFileObject: pass + @abstractmethod + def transform_retrieve_file_request( + self, + file_id: str, + optional_params: dict, + litellm_params: dict, + ) -> tuple[str, dict]: + """Transform file retrieve request into provider-specific format.""" + pass + + @abstractmethod + def transform_retrieve_file_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + litellm_params: dict, + ) -> OpenAIFileObject: + """Transform file retrieve response into OpenAI format.""" + pass + + @abstractmethod + def transform_delete_file_request( + self, + file_id: str, + optional_params: dict, + litellm_params: dict, + ) -> tuple[str, dict]: + """Transform file delete request into provider-specific format.""" + pass + + @abstractmethod + def transform_delete_file_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + litellm_params: dict, + ) -> "FileDeleted": + """Transform file delete response into OpenAI format.""" + pass + + @abstractmethod + def transform_list_files_request( + self, + purpose: Optional[str], + optional_params: dict, + litellm_params: dict, + ) -> tuple[str, dict]: + """Transform file list request into provider-specific format.""" + pass + + @abstractmethod + def transform_list_files_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + litellm_params: dict, + ) -> List[OpenAIFileObject]: + """Transform file list response into OpenAI format.""" + pass + + @abstractmethod + def transform_file_content_request( + self, + file_content_request: "FileContentRequest", + optional_params: dict, + litellm_params: dict, + ) -> tuple[str, dict]: + """Transform file content request into provider-specific format.""" + pass + + @abstractmethod + def transform_file_content_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + litellm_params: dict, + ) -> "HttpxBinaryResponseContent": + """Transform file content response into OpenAI format.""" + pass + def transform_request( self, model: str, @@ -136,6 +227,7 @@ async def afile_retrieve( self, file_id: str, litellm_parent_otel_span: Optional[Span], + llm_router: Optional[Router] = None, ) -> OpenAIFileObject: pass diff --git a/litellm/llms/base_llm/google_genai/transformation.py b/litellm/llms/base_llm/google_genai/transformation.py index 6dbccaada9a3..0a85e127bd7e 100644 --- a/litellm/llms/base_llm/google_genai/transformation.py +++ b/litellm/llms/base_llm/google_genai/transformation.py @@ -149,6 +149,7 @@ def transform_generate_content_request( contents: GenerateContentContentListUnionDict, tools: Optional[ToolConfigDict], generate_content_config_dict: Dict, + system_instruction: Optional[Any] = None, ) -> dict: """ Transform the request parameters for the generate content API. 
@@ -157,9 +158,8 @@ def transform_generate_content_request( model: The model name contents: Input contents tools: Tools - generate_content_request_params: Request parameters - litellm_params: LiteLLM parameters - headers: Request headers + generate_content_config_dict: Generation config parameters + system_instruction: Optional system instruction Returns: Transformed request data diff --git a/litellm/llms/base_llm/guardrail_translation/base_translation.py b/litellm/llms/base_llm/guardrail_translation/base_translation.py index 4599af1b745d..7106c207bd6f 100644 --- a/litellm/llms/base_llm/guardrail_translation/base_translation.py +++ b/litellm/llms/base_llm/guardrail_translation/base_translation.py @@ -1,17 +1,69 @@ from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Dict, List, Optional if TYPE_CHECKING: from litellm.integrations.custom_guardrail import CustomGuardrail + from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj + from litellm.proxy._types import UserAPIKeyAuth class BaseTranslation(ABC): + @staticmethod + def transform_user_api_key_dict_to_metadata( + user_api_key_dict: Optional[Any], + ) -> Dict[str, Any]: + """ + Transform user_api_key_dict to a metadata dict with prefixed keys. + + Converts keys like 'user_id' to 'user_api_key_user_id' to clearly indicate + the source of the metadata. + + Args: + user_api_key_dict: UserAPIKeyAuth object or dict with user information + + Returns: + Dict with keys prefixed with 'user_api_key_' + """ + if user_api_key_dict is None: + return {} + + # Convert to dict if it's a Pydantic object + user_dict = ( + user_api_key_dict.model_dump() + if hasattr(user_api_key_dict, "model_dump") + else user_api_key_dict + ) + + if not isinstance(user_dict, dict): + return {} + + # Transform keys to be prefixed with 'user_api_key_' + transformed = {} + for key, value in user_dict.items(): + # Skip None values and internal fields + if value is None or key.startswith("_"): + continue + + # If key already has the prefix, use as-is, otherwise add prefix + if key.startswith("user_api_key_"): + transformed[key] = value + else: + transformed[f"user_api_key_{key}"] = value + + return transformed + @abstractmethod async def process_input_messages( self, data: dict, guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional["LiteLLMLoggingObj"] = None, ) -> Any: + """ + Process input messages with guardrails. + + Note: user_api_key_dict metadata should be available in the data dict. + """ pass @abstractmethod @@ -19,5 +71,30 @@ async def process_output_response( self, response: Any, guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional["LiteLLMLoggingObj"] = None, + user_api_key_dict: Optional["UserAPIKeyAuth"] = None, ) -> Any: + """ + Process output response with guardrails. + + Args: + response: The response object from the LLM + guardrail_to_apply: The guardrail instance to apply + litellm_logging_obj: Optional logging object + user_api_key_dict: User API key metadata (passed separately since response doesn't contain it) + """ pass + + async def process_output_streaming_response( + self, + responses_so_far: List[Any], + guardrail_to_apply: "CustomGuardrail", + litellm_logging_obj: Optional["LiteLLMLoggingObj"] = None, + user_api_key_dict: Optional["UserAPIKeyAuth"] = None, + ) -> Any: + """ + Process output streaming response with guardrails. + + Optional to override in subclasses. 
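+
+        A subclass could, for example (sketch; assumes OpenAI-style streaming chunks):
+
+            text_so_far = "".join(
+                (chunk.choices[0].delta.content or "") for chunk in responses_so_far
+            )
+            # run guardrail_to_apply's check on text_so_far, then return responses_so_far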
+ """ + return responses_so_far diff --git a/litellm/llms/base_llm/image_edit/transformation.py b/litellm/llms/base_llm/image_edit/transformation.py index f3ae2d32eaa2..b088cdf37f65 100644 --- a/litellm/llms/base_llm/image_edit/transformation.py +++ b/litellm/llms/base_llm/image_edit/transformation.py @@ -92,8 +92,8 @@ def get_complete_url( def transform_image_edit_request( self, model: str, - prompt: str, - image: FileTypes, + prompt: Optional[str], + image: Optional[FileTypes], image_edit_optional_request_params: Dict, litellm_params: GenericLiteLLMParams, headers: dict, @@ -109,6 +109,15 @@ def transform_image_edit_response( ) -> ImageResponse: pass + def use_multipart_form_data(self) -> bool: + """ + Return True if the provider uses multipart/form-data for image edit requests. + Return False if the provider uses JSON requests. + + Default is True for backwards compatibility with OpenAI-style providers. + """ + return True + def get_error_class( self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] ) -> BaseLLMException: diff --git a/litellm/llms/base_llm/image_generation/transformation.py b/litellm/llms/base_llm/image_generation/transformation.py index fc8db8c65c76..151e2893d1c8 100644 --- a/litellm/llms/base_llm/image_generation/transformation.py +++ b/litellm/llms/base_llm/image_generation/transformation.py @@ -103,3 +103,11 @@ def transform_image_generation_response( raise NotImplementedError( "ImageVariationConfig implements 'transform_response_image_variation' for image variation models" ) + + def use_multipart_form_data(self) -> bool: + """ + Returns True if this provider requires multipart/form-data instead of JSON. + + Override this method in subclasses that need form-data (e.g., Stability AI). + """ + return False diff --git a/litellm/llms/base_llm/interactions/__init__.py b/litellm/llms/base_llm/interactions/__init__.py new file mode 100644 index 000000000000..2bec120f597a --- /dev/null +++ b/litellm/llms/base_llm/interactions/__init__.py @@ -0,0 +1,5 @@ +"""Base classes for Interactions API implementations.""" + +from litellm.llms.base_llm.interactions.transformation import BaseInteractionsAPIConfig + +__all__ = ["BaseInteractionsAPIConfig"] diff --git a/litellm/llms/base_llm/interactions/transformation.py b/litellm/llms/base_llm/interactions/transformation.py new file mode 100644 index 000000000000..4ceb3f5387b0 --- /dev/null +++ b/litellm/llms/base_llm/interactions/transformation.py @@ -0,0 +1,313 @@ +""" +Base transformation class for Interactions API implementations. + +This follows the same pattern as BaseResponsesAPIConfig for the Responses API. 
+ +Per OpenAPI spec (https://ai.google.dev/static/api/interactions.openapi.json): +- Create: POST /{api_version}/interactions +- Get: GET /{api_version}/interactions/{interaction_id} +- Delete: DELETE /{api_version}/interactions/{interaction_id} +""" + +import types +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import httpx + +from litellm.types.interactions import ( + CancelInteractionResult, + DeleteInteractionResult, + InteractionInput, + InteractionsAPIOptionalRequestParams, + InteractionsAPIResponse, + InteractionsAPIStreamingResponse, +) +from litellm.types.router import GenericLiteLLMParams +from litellm.types.utils import LlmProviders + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + + from ..chat.transformation import BaseLLMException as _BaseLLMException + + LiteLLMLoggingObj = _LiteLLMLoggingObj + BaseLLMException = _BaseLLMException +else: + LiteLLMLoggingObj = Any + BaseLLMException = Any + + +class BaseInteractionsAPIConfig(ABC): + """ + Base configuration class for Google Interactions API implementations. + + Per OpenAPI spec, the Interactions API supports two types of interactions: + - Model interactions (with model parameter) + - Agent interactions (with agent parameter) + + Implementations should override the abstract methods to provide + provider-specific transformations for requests and responses. + """ + + def __init__(self): + pass + + @property + @abstractmethod + def custom_llm_provider(self) -> LlmProviders: + """Return the LLM provider identifier.""" + pass + + @classmethod + def get_config(cls): + return { + k: v + for k, v in cls.__dict__.items() + if not k.startswith("__") + and not k.startswith("_abc") + and not isinstance( + v, + ( + types.FunctionType, + types.BuiltinFunctionType, + classmethod, + staticmethod, + ), + ) + and v is not None + } + + @abstractmethod + def get_supported_params(self, model: str) -> List[str]: + """ + Return the list of supported parameters for the given model. + """ + pass + + @abstractmethod + def validate_environment( + self, + headers: dict, + model: str, + litellm_params: Optional[GenericLiteLLMParams] + ) -> dict: + """ + Validate and prepare environment settings including headers. + """ + return {} + + @abstractmethod + def get_complete_url( + self, + api_base: Optional[str], + model: Optional[str], + agent: Optional[str] = None, + litellm_params: Optional[dict] = None, + stream: Optional[bool] = None, + ) -> str: + """ + Get the complete URL for the interaction request. + + Per OpenAPI spec: POST /{api_version}/interactions + + Args: + api_base: Base URL for the API + model: The model name (for model interactions) + agent: The agent name (for agent interactions) + litellm_params: LiteLLM parameters + stream: Whether this is a streaming request + + Returns: + The complete URL for the request + """ + if api_base is None: + raise ValueError("api_base is required") + return api_base + + @abstractmethod + def transform_request( + self, + model: Optional[str], + agent: Optional[str], + input: Optional[InteractionInput], + optional_params: InteractionsAPIOptionalRequestParams, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Dict: + """ + Transform the input request into the provider's expected format. 
+ + Per OpenAPI spec, the request body should be either: + - CreateModelInteractionParams (with model) + - CreateAgentInteractionParams (with agent) + + Args: + model: The model name (for model interactions) + agent: The agent name (for agent interactions) + input: The input content (string, content object, or list) + optional_params: Optional parameters for the request + litellm_params: LiteLLM-specific parameters + headers: Request headers + + Returns: + The transformed request body as a dictionary + """ + pass + + @abstractmethod + def transform_response( + self, + model: Optional[str], + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> InteractionsAPIResponse: + """ + Transform the raw HTTP response into an InteractionsAPIResponse. + + Per OpenAPI spec, the response is an Interaction object. + """ + pass + + @abstractmethod + def transform_streaming_response( + self, + model: Optional[str], + parsed_chunk: dict, + logging_obj: LiteLLMLoggingObj, + ) -> InteractionsAPIStreamingResponse: + """ + Transform a parsed streaming response chunk into an InteractionsAPIStreamingResponse. + + Per OpenAPI spec, streaming uses SSE with various event types. + """ + pass + + # ========================================================= + # GET INTERACTION TRANSFORMATION + # ========================================================= + + @abstractmethod + def transform_get_interaction_request( + self, + interaction_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """ + Transform the get interaction request into URL and query params. + + Per OpenAPI spec: GET /{api_version}/interactions/{interaction_id} + + Returns: + Tuple of (URL, query_params) + """ + pass + + @abstractmethod + def transform_get_interaction_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> InteractionsAPIResponse: + """ + Transform the get interaction response. + """ + pass + + # ========================================================= + # DELETE INTERACTION TRANSFORMATION + # ========================================================= + + @abstractmethod + def transform_delete_interaction_request( + self, + interaction_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """ + Transform the delete interaction request into URL and body. + + Per OpenAPI spec: DELETE /{api_version}/interactions/{interaction_id} + + Returns: + Tuple of (URL, request_body) + """ + pass + + @abstractmethod + def transform_delete_interaction_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + interaction_id: str, + ) -> DeleteInteractionResult: + """ + Transform the delete interaction response. + """ + pass + + # ========================================================= + # CANCEL INTERACTION TRANSFORMATION + # ========================================================= + + @abstractmethod + def transform_cancel_interaction_request( + self, + interaction_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """ + Transform the cancel interaction request into URL and body. + + Returns: + Tuple of (URL, request_body) + """ + pass + + @abstractmethod + def transform_cancel_interaction_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> CancelInteractionResult: + """ + Transform the cancel interaction response. 
+ """ + pass + + # ========================================================= + # ERROR HANDLING + # ========================================================= + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] + ) -> BaseLLMException: + """ + Get the appropriate exception class for an error. + """ + from ..chat.transformation import BaseLLMException + + raise BaseLLMException( + status_code=status_code, + message=error_message, + headers=headers, + ) + + def should_fake_stream( + self, + model: Optional[str], + stream: Optional[bool], + custom_llm_provider: Optional[str] = None, + ) -> bool: + """ + Returns True if litellm should fake a stream for the given model. + + Override in subclasses if the provider doesn't support native streaming. + """ + return False diff --git a/litellm/llms/base_llm/ocr/transformation.py b/litellm/llms/base_llm/ocr/transformation.py index 2fe8f3def757..fb13332c464f 100644 --- a/litellm/llms/base_llm/ocr/transformation.py +++ b/litellm/llms/base_llm/ocr/transformation.py @@ -106,6 +106,7 @@ def validate_environment( model: str, api_key: Optional[str] = None, api_base: Optional[str] = None, + litellm_params: Optional[dict] = None, **kwargs, ) -> Dict: """ @@ -119,6 +120,7 @@ def get_complete_url( api_base: Optional[str], model: str, optional_params: dict, + litellm_params: Optional[dict] = None, **kwargs, ) -> str: """ @@ -196,6 +198,36 @@ def transform_ocr_response( """ raise NotImplementedError("transform_ocr_response must be implemented by provider") + async def async_transform_ocr_response( + self, + model: str, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + **kwargs, + ) -> OCRResponse: + """ + Async transform provider-specific OCR response to standard format. + Optional method - providers can override if they need async transformations + (e.g., Azure Document Intelligence for async operation polling). + + Default implementation falls back to sync transform_ocr_response. 
+ + Args: + model: Model name + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + OCRResponse in standard format + """ + # Default implementation: call sync version + return self.transform_ocr_response( + model=model, + raw_response=raw_response, + logging_obj=logging_obj, + **kwargs, + ) + def get_error_class( self, error_message: str, diff --git a/litellm/llms/base_llm/rerank/transformation.py b/litellm/llms/base_llm/rerank/transformation.py index 6e9c03dee89a..b22d85e82be5 100644 --- a/litellm/llms/base_llm/rerank/transformation.py +++ b/litellm/llms/base_llm/rerank/transformation.py @@ -23,6 +23,7 @@ def validate_environment( headers: dict, model: str, api_key: Optional[str] = None, + optional_params: Optional[dict] = None, ) -> dict: pass @@ -50,7 +51,12 @@ def transform_rerank_response( return model_response @abstractmethod - def get_complete_url(self, api_base: Optional[str], model: str) -> str: + def get_complete_url( + self, + api_base: Optional[str], + model: str, + optional_params: Optional[dict] = None, + ) -> str: """ OPTIONAL diff --git a/litellm/llms/base_llm/responses/transformation.py b/litellm/llms/base_llm/responses/transformation.py index facabbda72a3..7a4da985528e 100644 --- a/litellm/llms/base_llm/responses/transformation.py +++ b/litellm/llms/base_llm/responses/transformation.py @@ -242,3 +242,30 @@ def transform_cancel_response_api_response( ######################################################### ########## END CANCEL RESPONSE API TRANSFORMATION ####### ######################################################### + + ######################################################### + ########## COMPACT RESPONSE API TRANSFORMATION ########## + ######################################################### + @abstractmethod + def transform_compact_response_api_request( + self, + model: str, + input: Union[str, ResponseInputParam], + response_api_optional_request_params: Dict, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + pass + + @abstractmethod + def transform_compact_response_api_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ResponsesAPIResponse: + pass + + ######################################################### + ########## END COMPACT RESPONSE API TRANSFORMATION ###### + ######################################################### diff --git a/litellm/llms/base_llm/skills/__init__.py b/litellm/llms/base_llm/skills/__init__.py new file mode 100644 index 000000000000..3c523a0d1286 --- /dev/null +++ b/litellm/llms/base_llm/skills/__init__.py @@ -0,0 +1,6 @@ +"""Base Skills API configuration""" + +from .transformation import BaseSkillsAPIConfig + +__all__ = ["BaseSkillsAPIConfig"] + diff --git a/litellm/llms/base_llm/skills/transformation.py b/litellm/llms/base_llm/skills/transformation.py new file mode 100644 index 000000000000..7c2ebc35298c --- /dev/null +++ b/litellm/llms/base_llm/skills/transformation.py @@ -0,0 +1,246 @@ +""" +Base configuration class for Skills API +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple + +import httpx + +from litellm.llms.base_llm.chat.transformation import BaseLLMException +from litellm.types.llms.anthropic_skills import ( + CreateSkillRequest, + DeleteSkillResponse, + ListSkillsParams, + ListSkillsResponse, + Skill, +) +from litellm.types.router import GenericLiteLLMParams +from litellm.types.utils import LlmProviders + +if TYPE_CHECKING: + from 
litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + + +class BaseSkillsAPIConfig(ABC): + """Base configuration for Skills API providers""" + + def __init__(self): + pass + + @property + @abstractmethod + def custom_llm_provider(self) -> LlmProviders: + pass + + @abstractmethod + def validate_environment( + self, headers: dict, litellm_params: Optional[GenericLiteLLMParams] + ) -> dict: + """ + Validate and update headers with provider-specific requirements + + Args: + headers: Base headers dictionary + litellm_params: LiteLLM parameters + + Returns: + Updated headers dictionary + """ + return headers + + @abstractmethod + def get_complete_url( + self, + api_base: Optional[str], + endpoint: str, + skill_id: Optional[str] = None, + ) -> str: + """ + Get the complete URL for the API request + + Args: + api_base: Base API URL + endpoint: API endpoint (e.g., 'skills', 'skills/{id}') + skill_id: Optional skill ID for specific skill operations + + Returns: + Complete URL + """ + if api_base is None: + raise ValueError("api_base is required") + return f"{api_base}/v1/{endpoint}" + + @abstractmethod + def transform_create_skill_request( + self, + create_request: CreateSkillRequest, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Dict: + """ + Transform create skill request to provider-specific format + + Args: + create_request: Skill creation parameters + litellm_params: LiteLLM parameters + headers: Request headers + + Returns: + Provider-specific request body + """ + pass + + @abstractmethod + def transform_create_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> Skill: + """ + Transform provider response to Skill object + + Args: + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + Skill object + """ + pass + + @abstractmethod + def transform_list_skills_request( + self, + list_params: ListSkillsParams, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """ + Transform list skills request parameters + + Args: + list_params: List parameters (pagination, filters) + litellm_params: LiteLLM parameters + headers: Request headers + + Returns: + Tuple of (url, query_params) + """ + pass + + @abstractmethod + def transform_list_skills_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> ListSkillsResponse: + """ + Transform provider response to ListSkillsResponse + + Args: + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + ListSkillsResponse object + """ + pass + + @abstractmethod + def transform_get_skill_request( + self, + skill_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> Tuple[str, Dict]: + """ + Transform get skill request + + Args: + skill_id: Skill ID + api_base: Base API URL + litellm_params: LiteLLM parameters + headers: Request headers + + Returns: + Tuple of (url, headers) + """ + pass + + @abstractmethod + def transform_get_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> Skill: + """ + Transform provider response to Skill object + + Args: + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + Skill object + """ + pass + + @abstractmethod + def transform_delete_skill_request( + self, + skill_id: str, + api_base: str, + litellm_params: GenericLiteLLMParams, + headers: dict, + ) -> 
Tuple[str, Dict]: + """ + Transform delete skill request + + Args: + skill_id: Skill ID + api_base: Base API URL + litellm_params: LiteLLM parameters + headers: Request headers + + Returns: + Tuple of (url, headers) + """ + pass + + @abstractmethod + def transform_delete_skill_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> DeleteSkillResponse: + """ + Transform provider response to DeleteSkillResponse + + Args: + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + DeleteSkillResponse object + """ + pass + + def get_error_class( + self, + error_message: str, + status_code: int, + headers: dict, + ) -> Exception: + """Get appropriate error class for the provider.""" + return BaseLLMException( + status_code=status_code, + message=error_message, + headers=headers, + ) + diff --git a/litellm/llms/base_llm/vector_store/transformation.py b/litellm/llms/base_llm/vector_store/transformation.py index 9d7ba7d61a8e..935fd53c1993 100644 --- a/litellm/llms/base_llm/vector_store/transformation.py +++ b/litellm/llms/base_llm/vector_store/transformation.py @@ -5,8 +5,11 @@ from litellm.types.router import GenericLiteLLMParams from litellm.types.vector_stores import ( + VECTOR_STORE_OPENAI_PARAMS, + BaseVectorStoreAuthCredentials, VectorStoreCreateOptionalRequestParams, VectorStoreCreateResponse, + VectorStoreIndexEndpoints, VectorStoreSearchOptionalRequestParams, VectorStoreSearchResponse, ) @@ -24,6 +27,30 @@ class BaseVectorStoreConfig: + + def get_supported_openai_params( + self, model: str + ) -> List[VECTOR_STORE_OPENAI_PARAMS]: + return [] + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + drop_params: bool, + ) -> dict: + return optional_params + + @abstractmethod + def get_auth_credentials( + self, litellm_params: dict + ) -> BaseVectorStoreAuthCredentials: + pass + + @abstractmethod + def get_vector_store_endpoints_by_type(self) -> VectorStoreIndexEndpoints: + pass + @abstractmethod def transform_search_vector_store_request( self, @@ -34,8 +61,33 @@ def transform_search_vector_store_request( litellm_logging_obj: LiteLLMLoggingObj, litellm_params: dict, ) -> Tuple[str, Dict]: + pass + async def atransform_search_vector_store_request( + self, + vector_store_id: str, + query: Union[str, List[str]], + vector_store_search_optional_params: VectorStoreSearchOptionalRequestParams, + api_base: str, + litellm_logging_obj: LiteLLMLoggingObj, + litellm_params: dict, + ) -> Tuple[str, Dict]: + """ + Optional async version of transform_search_vector_store_request. + If not implemented, the handler will fall back to the sync version. + Providers that need to make async calls (e.g., generating embeddings) should override this. 
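+
+        Illustrative override sketch (``_aembed_query`` is a hypothetical helper;
+        the body field names are provider-specific):
+
+            async def atransform_search_vector_store_request(self, vector_store_id, query, **kwargs):
+                query_vector = await self._aembed_query(query)
+                url, body = self.transform_search_vector_store_request(
+                    vector_store_id=vector_store_id, query=query, **kwargs
+                )
+                body["query_vector"] = query_vector
+                return url, body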
+ """ + # Default implementation: call the sync version + return self.transform_search_vector_store_request( + vector_store_id=vector_store_id, + query=query, + vector_store_search_optional_params=vector_store_search_optional_params, + api_base=api_base, + litellm_logging_obj=litellm_logging_obj, + litellm_params=litellm_params, + ) + @abstractmethod def transform_search_vector_store_response( self, response: httpx.Response, litellm_logging_obj: LiteLLMLoggingObj diff --git a/litellm/llms/base_llm/vector_store_files/transformation.py b/litellm/llms/base_llm/vector_store_files/transformation.py new file mode 100644 index 000000000000..f751022faaf7 --- /dev/null +++ b/litellm/llms/base_llm/vector_store_files/transformation.py @@ -0,0 +1,226 @@ +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +import httpx + +from litellm.types.router import GenericLiteLLMParams +from litellm.types.vector_store_files import ( + VectorStoreFileAuthCredentials, + VectorStoreFileChunkingStrategy, + VectorStoreFileContentResponse, + VectorStoreFileCreateRequest, + VectorStoreFileDeleteResponse, + VectorStoreFileListQueryParams, + VectorStoreFileListResponse, + VectorStoreFileObject, + VectorStoreFileUpdateRequest, +) + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + + from ..chat.transformation import BaseLLMException as _BaseLLMException + + LiteLLMLoggingObj = _LiteLLMLoggingObj + BaseLLMException = _BaseLLMException +else: + LiteLLMLoggingObj = Any + BaseLLMException = Any + + +class BaseVectorStoreFilesConfig(ABC): + """Base configuration contract for provider-specific vector store file implementations.""" + + def get_supported_openai_params( + self, + operation: str, + ) -> Tuple[str, ...]: + """Return the set of OpenAI params supported for the given operation.""" + + return tuple() + + def map_openai_params( + self, + *, + operation: str, + non_default_params: Dict[str, Any], + optional_params: Dict[str, Any], + drop_params: bool, + ) -> Dict[str, Any]: + """Map non-default OpenAI params to provider-specific params.""" + + return optional_params + + @abstractmethod + def get_auth_credentials( + self, litellm_params: Dict[str, Any] + ) -> VectorStoreFileAuthCredentials: + ... + + @abstractmethod + def get_vector_store_file_endpoints_by_type(self) -> Dict[ + str, Tuple[Tuple[str, str], ...] + ]: + ... + + @abstractmethod + def validate_environment( + self, + *, + headers: Dict[str, str], + litellm_params: Optional[GenericLiteLLMParams], + ) -> Dict[str, str]: + return {} + + @abstractmethod + def get_complete_url( + self, + *, + api_base: Optional[str], + vector_store_id: str, + litellm_params: Dict[str, Any], + ) -> str: + if api_base is None: + raise ValueError("api_base is required") + return api_base + + @abstractmethod + def transform_create_vector_store_file_request( + self, + *, + vector_store_id: str, + create_request: VectorStoreFileCreateRequest, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... + + @abstractmethod + def transform_create_vector_store_file_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileObject: + ... + + @abstractmethod + def transform_list_vector_store_files_request( + self, + *, + vector_store_id: str, + query_params: VectorStoreFileListQueryParams, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... 
+ + @abstractmethod + def transform_list_vector_store_files_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileListResponse: + ... + + @abstractmethod + def transform_retrieve_vector_store_file_request( + self, + *, + vector_store_id: str, + file_id: str, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... + + @abstractmethod + def transform_retrieve_vector_store_file_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileObject: + ... + + @abstractmethod + def transform_retrieve_vector_store_file_content_request( + self, + *, + vector_store_id: str, + file_id: str, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... + + @abstractmethod + def transform_retrieve_vector_store_file_content_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileContentResponse: + ... + + @abstractmethod + def transform_update_vector_store_file_request( + self, + *, + vector_store_id: str, + file_id: str, + update_request: VectorStoreFileUpdateRequest, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... + + @abstractmethod + def transform_update_vector_store_file_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileObject: + ... + + @abstractmethod + def transform_delete_vector_store_file_request( + self, + *, + vector_store_id: str, + file_id: str, + api_base: str, + ) -> Tuple[str, Dict[str, Any]]: + ... + + @abstractmethod + def transform_delete_vector_store_file_response( + self, + *, + response: httpx.Response, + ) -> VectorStoreFileDeleteResponse: + ... + + def get_error_class( + self, + *, + error_message: str, + status_code: int, + headers: Union[Dict[str, Any], httpx.Headers], + ) -> BaseLLMException: + from ..chat.transformation import BaseLLMException + + raise BaseLLMException( + status_code=status_code, + message=error_message, + headers=headers, + ) + + def sign_request( + self, + *, + headers: Dict[str, str], + optional_params: Dict[str, Any], + request_data: Dict[str, Any], + api_base: str, + api_key: Optional[str] = None, + ) -> Tuple[Dict[str, str], Optional[bytes]]: + return headers, None + + def prepare_chunking_strategy( + self, + chunking_strategy: Optional[VectorStoreFileChunkingStrategy], + ) -> Optional[VectorStoreFileChunkingStrategy]: + return chunking_strategy diff --git a/litellm/llms/base_llm/videos/transformation.py b/litellm/llms/base_llm/videos/transformation.py index 223b308dc08e..50cada42b87f 100644 --- a/litellm/llms/base_llm/videos/transformation.py +++ b/litellm/llms/base_llm/videos/transformation.py @@ -5,9 +5,9 @@ import httpx from httpx._types import RequestFiles -from litellm.types.videos.main import VideoCreateOptionalRequestParams from litellm.types.responses.main import * from litellm.types.router import GenericLiteLLMParams +from litellm.types.videos.main import VideoCreateOptionalRequestParams if TYPE_CHECKING: from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj @@ -66,6 +66,7 @@ def validate_environment( headers: dict, model: str, api_key: Optional[str] = None, + litellm_params: Optional[GenericLiteLLMParams] = None, ) -> dict: return {} @@ -92,10 +93,11 @@ def transform_video_create_request( self, model: str, prompt: str, + api_base: str, video_create_optional_request_params: Dict, litellm_params: GenericLiteLLMParams, headers: dict, - ) -> Tuple[Dict, RequestFiles]: + ) -> Tuple[Dict, RequestFiles, str]: pass @abstractmethod @@ -104,6 +106,8 @@ def transform_video_create_response( model: str, raw_response: httpx.Response, logging_obj: 
LiteLLMLoggingObj, + custom_llm_provider: Optional[str] = None, + request_data: Optional[Dict] = None, ) -> VideoObject: pass @@ -111,7 +115,6 @@ def transform_video_create_response( def transform_video_content_request( self, video_id: str, - model: str, api_base: str, litellm_params: GenericLiteLLMParams, headers: dict, @@ -127,18 +130,41 @@ def transform_video_content_request( @abstractmethod def transform_video_content_response( self, - model: str, raw_response: httpx.Response, logging_obj: LiteLLMLoggingObj, ) -> bytes: pass + async def async_transform_video_content_response( + self, + raw_response: httpx.Response, + logging_obj: LiteLLMLoggingObj, + ) -> bytes: + """ + Async transform video content download response to bytes. + Optional method - providers can override if they need async transformations + (e.g., RunwayML for downloading video from CloudFront URL). + + Default implementation falls back to sync transform_video_content_response. + + Args: + raw_response: Raw HTTP response + logging_obj: Logging object + + Returns: + Video content as bytes + """ + # Default implementation: call sync version + return self.transform_video_content_response( + raw_response=raw_response, + logging_obj=logging_obj, + ) + @abstractmethod def transform_video_remix_request( self, video_id: str, prompt: str, - model: str, api_base: str, litellm_params: GenericLiteLLMParams, headers: dict, @@ -155,16 +181,15 @@ def transform_video_remix_request( @abstractmethod def transform_video_remix_response( self, - model: str, raw_response: httpx.Response, logging_obj: LiteLLMLoggingObj, + custom_llm_provider: Optional[str] = None, ) -> VideoObject: pass @abstractmethod def transform_video_list_request( self, - model: str, api_base: str, litellm_params: GenericLiteLLMParams, headers: dict, @@ -184,9 +209,9 @@ def transform_video_list_request( @abstractmethod def transform_video_list_response( self, - model: str, raw_response: httpx.Response, logging_obj: LiteLLMLoggingObj, + custom_llm_provider: Optional[str] = None, ) -> Dict[str,str]: pass @@ -194,7 +219,6 @@ def transform_video_list_response( def transform_video_delete_request( self, video_id: str, - model: str, api_base: str, litellm_params: GenericLiteLLMParams, headers: dict, @@ -210,7 +234,6 @@ def transform_video_delete_request( @abstractmethod def transform_video_delete_response( self, - model: str, raw_response: httpx.Response, logging_obj: LiteLLMLoggingObj, ) -> VideoObject: @@ -220,7 +243,6 @@ def transform_video_delete_response( def transform_video_status_retrieve_request( self, video_id: str, - model: str, api_base: str, litellm_params: GenericLiteLLMParams, headers: dict, @@ -236,9 +258,9 @@ def transform_video_status_retrieve_request( @abstractmethod def transform_video_status_retrieve_response( self, - model: str, raw_response: httpx.Response, logging_obj: LiteLLMLoggingObj, + custom_llm_provider: Optional[str] = None, ) -> VideoObject: pass diff --git a/litellm/llms/bedrock/base_aws_llm.py b/litellm/llms/bedrock/base_aws_llm.py index 4c8544375445..1de1c40c4384 100644 --- a/litellm/llms/bedrock/base_aws_llm.py +++ b/litellm/llms/bedrock/base_aws_llm.py @@ -74,6 +74,21 @@ def __init__(self) -> None: "aws_external_id", ] + def _get_ssl_verify(self, ssl_verify: Optional[Union[bool, str]] = None): + """ + Get SSL verification setting for boto3 clients. + + This ensures that custom CA certificates are properly used for all AWS API calls, + including STS and Bedrock services. 
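+
+        Illustrative usage (mirrors how the STS clients below are constructed;
+        the CA-bundle path is a placeholder):
+
+            verify = self._get_ssl_verify("/etc/ssl/certs/custom-ca.pem")
+            sts_client = boto3.client("sts", region_name="us-east-1", verify=verify)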
+ + Returns: + Union[bool, str]: SSL verification setting - False to disable, True to enable, + or a string path to a CA bundle file + """ + from litellm.llms.custom_httpx.http_handler import get_ssl_verify + + return get_ssl_verify(ssl_verify=ssl_verify) + def get_cache_key(self, credential_args: Dict[str, Optional[str]]) -> str: """ Generate a unique cache key based on the credential arguments. @@ -95,6 +110,7 @@ def get_credentials( aws_web_identity_token: Optional[str] = None, aws_sts_endpoint: Optional[str] = None, aws_external_id: Optional[str] = None, + ssl_verify: Optional[Union[bool, str]] = None, ): """ Return a boto3.Credentials object @@ -163,7 +179,11 @@ def get_credentials( ) # create cache key for non-expiring auth flows - args = {k: v for k, v in locals().items() if k.startswith("aws_")} + args = { + k: v + for k, v in locals().items() + if k.startswith("aws_") or k == "ssl_verify" + } cache_key = self.get_cache_key(args) _cached_credentials = self.iam_cache.get_cache(cache_key) @@ -227,6 +247,7 @@ def get_credentials( aws_role_name=aws_role_name, aws_session_name=aws_session_name, aws_external_id=aws_external_id, + ssl_verify=ssl_verify, ) elif aws_profile_name is not None: ### CHECK SESSION ### @@ -314,6 +335,12 @@ def get_bedrock_invoke_provider( if model.startswith("invoke/"): model = model.replace("invoke/", "", 1) + # Special case: Check for "nova" in model name first (before "amazon") + # This handles amazon.nova-* models which would otherwise match "amazon" (Titan) + if "nova" in model.lower(): + if "nova" in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL): + return cast(BEDROCK_INVOKE_PROVIDERS_LITERAL, "nova") + _split_model = model.split(".")[0] if _split_model in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL): return cast(BEDROCK_INVOKE_PROVIDERS_LITERAL, _split_model) @@ -323,13 +350,9 @@ def get_bedrock_invoke_provider( if provider is not None: return provider - # check if provider == "nova" - if "nova" in model: - return "nova" - else: - for provider in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL): - if provider in model: - return provider + for provider in get_args(BEDROCK_INVOKE_PROVIDERS_LITERAL): + if provider in model: + return provider return None @staticmethod @@ -353,6 +376,26 @@ def get_bedrock_model_id( model_id = BaseAWSLLM._get_model_id_from_model_with_spec( model_id, spec="deepseek_r1" ) + elif provider == "openai" and "openai/" in model_id: + model_id = BaseAWSLLM._get_model_id_from_model_with_spec( + model_id, spec="openai" + ) + elif provider == "qwen2" and "qwen2/" in model_id: + model_id = BaseAWSLLM._get_model_id_from_model_with_spec( + model_id, spec="qwen2" + ) + elif provider == "qwen3" and "qwen3/" in model_id: + model_id = BaseAWSLLM._get_model_id_from_model_with_spec( + model_id, spec="qwen3" + ) + elif provider == "stability" and "stability/" in model_id: + model_id = BaseAWSLLM._get_model_id_from_model_with_spec( + model_id, spec="stability" + ) + elif provider == "moonshot" and "moonshot/" in model_id: + model_id = BaseAWSLLM._get_model_id_from_model_with_spec( + model_id, spec="moonshot" + ) return model_id @staticmethod @@ -387,9 +430,16 @@ def get_bedrock_embedding_provider( Handles scenarios like: 1. model=cohere.embed-english-v3:0 -> Returns `cohere` 2. model=amazon.titan-embed-text-v1 -> Returns `amazon` - 3. model=us.twelvelabs.marengo-embed-2-7-v1:0 -> Returns `twelvelabs` - 4. model=twelvelabs.marengo-embed-2-7-v1:0 -> Returns `twelvelabs` + 3. model=amazon.nova-2-multimodal-embeddings-v1:0 -> Returns `nova` + 4. 
model=us.twelvelabs.marengo-embed-2-7-v1:0 -> Returns `twelvelabs` + 5. model=twelvelabs.marengo-embed-2-7-v1:0 -> Returns `twelvelabs` """ + # Special case: Check for "nova" in model name first (before "amazon") + # This handles amazon.nova-* models + if "nova" in model.lower(): + if "nova" in get_args(BEDROCK_EMBEDDING_PROVIDERS_LITERAL): + return cast(BEDROCK_EMBEDDING_PROVIDERS_LITERAL, "nova") + # Handle regional models like us.twelvelabs.marengo-embed-2-7-v1:0 if "." in model: parts = model.split(".") @@ -512,6 +562,7 @@ def _auth_with_web_identity_token( aws_region_name: Optional[str], aws_sts_endpoint: Optional[str], aws_external_id: Optional[str] = None, + ssl_verify: Optional[Union[bool, str]] = None, ) -> Tuple[Credentials, Optional[int]]: """ Authenticate with AWS Web Identity Token @@ -540,6 +591,7 @@ def _auth_with_web_identity_token( "sts", region_name=aws_region_name, endpoint_url=sts_endpoint, + verify=self._get_ssl_verify(ssl_verify), ) # https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRoleWithWebIdentity.html @@ -584,6 +636,7 @@ def _handle_irsa_cross_account( region: str, web_identity_token_file: str, aws_external_id: Optional[str] = None, + ssl_verify: Optional[Union[bool, str]] = None, ) -> dict: """Handle cross-account role assumption for IRSA.""" import boto3 @@ -596,7 +649,9 @@ def _handle_irsa_cross_account( # Create an STS client without credentials with tracer.trace("boto3.client(sts) for manual IRSA"): - sts_client = boto3.client("sts", region_name=region) + sts_client = boto3.client( + "sts", region_name=region, verify=self._get_ssl_verify(ssl_verify) + ) # Manually assume the IRSA role with the session name verbose_logger.debug( @@ -619,6 +674,7 @@ def _handle_irsa_cross_account( aws_access_key_id=irsa_creds["AccessKeyId"], aws_secret_access_key=irsa_creds["SecretAccessKey"], aws_session_token=irsa_creds["SessionToken"], + verify=self._get_ssl_verify(ssl_verify), ) # Get current caller identity for debugging @@ -651,13 +707,16 @@ def _handle_irsa_same_account( aws_session_name: str, region: str, aws_external_id: Optional[str] = None, + ssl_verify: Optional[Union[bool, str]] = None, ) -> dict: """Handle same-account role assumption for IRSA.""" import boto3 verbose_logger.debug("Same account role assumption, using automatic IRSA") with tracer.trace("boto3.client(sts) with automatic IRSA"): - sts_client = boto3.client("sts", region_name=region) + sts_client = boto3.client( + "sts", region_name=region, verify=self._get_ssl_verify(ssl_verify) + ) # Get current caller identity for debugging try: @@ -712,6 +771,7 @@ def _auth_with_aws_role( aws_role_name: str, aws_session_name: str, aws_external_id: Optional[str] = None, + ssl_verify: Optional[Union[bool, str]] = None, ) -> Tuple[Credentials, Optional[int]]: """ Authenticate with AWS Role @@ -754,10 +814,15 @@ def _auth_with_aws_role( region, web_identity_token_file, aws_external_id, + ssl_verify=ssl_verify, ) else: sts_response = self._handle_irsa_same_account( - aws_role_name, aws_session_name, region, aws_external_id + aws_role_name, + aws_session_name, + region, + aws_external_id, + ssl_verify=ssl_verify, ) return self._extract_credentials_and_ttl(sts_response) @@ -780,7 +845,9 @@ def _auth_with_aws_role( # This allows the web identity token to work automatically if aws_access_key_id is None and aws_secret_access_key is None: with tracer.trace("boto3.client(sts)"): - sts_client = boto3.client("sts") + sts_client = boto3.client( + "sts", verify=self._get_ssl_verify(ssl_verify) + ) else: with 
tracer.trace("boto3.client(sts)"): sts_client = boto3.client( @@ -788,6 +855,7 @@ def _auth_with_aws_role( aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, aws_session_token=aws_session_token, + verify=self._get_ssl_verify(ssl_verify), ) assume_role_params = { @@ -901,7 +969,7 @@ def get_runtime_endpoint( api_base: Optional[str], aws_bedrock_runtime_endpoint: Optional[str], aws_region_name: str, - endpoint_type: Optional[Literal["runtime", "agent"]] = "runtime", + endpoint_type: Optional[Literal["runtime", "agent", "agentcore"]] = "runtime", ) -> Tuple[str, str]: env_aws_bedrock_runtime_endpoint = get_secret("AWS_BEDROCK_RUNTIME_ENDPOINT") if api_base is not None: @@ -935,7 +1003,9 @@ def get_runtime_endpoint( return endpoint_url, proxy_endpoint_url def _select_default_endpoint_url( - self, endpoint_type: Optional[Literal["runtime", "agent"]], aws_region_name: str + self, + endpoint_type: Optional[Literal["runtime", "agent", "agentcore"]], + aws_region_name: str, ) -> str: """ Select the default endpoint url based on the endpoint type @@ -944,6 +1014,8 @@ def _select_default_endpoint_url( """ if endpoint_type == "agent": return f"https://bedrock-agent-runtime.{aws_region_name}.amazonaws.com" + elif endpoint_type == "agentcore": + return f"https://bedrock-agentcore.{aws_region_name}.amazonaws.com" else: return f"https://bedrock-runtime.{aws_region_name}.amazonaws.com" @@ -1091,7 +1163,7 @@ def _filter_headers_for_aws_signature(self, headers: dict) -> dict: def _sign_request( self, - service_name: Literal["bedrock", "sagemaker"], + service_name: Literal["bedrock", "sagemaker", "bedrock-agentcore", "s3vectors"], headers: dict, optional_params: dict, request_data: dict, @@ -1161,15 +1233,20 @@ def _sign_request( else: headers = {"Content-Type": "application/json"} + aws_signature_headers = self._filter_headers_for_aws_signature(headers) request = AWSRequest( method="POST", url=api_base, data=json.dumps(request_data), - headers=headers, + headers=aws_signature_headers, ) sigv4.add_auth(request) request_headers_dict = dict(request.headers) + # Add back original headers after signing. Only headers in SignedHeaders + # are integrity-protected; forwarded headers (x-forwarded-*) must remain unsigned. + for header_name, header_value in headers.items(): + request_headers_dict[header_name] = header_value if ( headers is not None and "Authorization" in headers ): # prevent sigv4 from overwriting the auth header diff --git a/litellm/llms/bedrock/batches/handler.py b/litellm/llms/bedrock/batches/handler.py new file mode 100644 index 000000000000..4a26bd433482 --- /dev/null +++ b/litellm/llms/bedrock/batches/handler.py @@ -0,0 +1,96 @@ +from openai.types.batch import BatchRequestCounts +from openai.types.batch import Metadata as OpenAIBatchMetadata + +from litellm.types.utils import LiteLLMBatch + + +class BedrockBatchesHandler: + """ + Handler for Bedrock Batches. + + Specific providers/models needed some special handling. + + E.g. Twelve Labs Embedding Async Invoke + """ + @staticmethod + def _handle_async_invoke_status( + batch_id: str, aws_region_name: str, logging_obj=None, **kwargs + ) -> "LiteLLMBatch": + """ + Handle async invoke status check for AWS Bedrock. + + This is for Twelve Labs Embedding Async Invoke. + + Args: + batch_id: The async invoke ARN + aws_region_name: AWS region name + **kwargs: Additional parameters + + Returns: + dict: Status information including status, output_file_id (S3 URL), etc. 
+ """ + import asyncio + + from litellm.llms.bedrock.embed.embedding import BedrockEmbedding + + async def _async_get_status(): + # Create embedding handler instance + embedding_handler = BedrockEmbedding() + + # Get the status of the async invoke job + status_response = await embedding_handler._get_async_invoke_status( + invocation_arn=batch_id, + aws_region_name=aws_region_name, + logging_obj=logging_obj, + **kwargs, + ) + + # Transform response to a LiteLLMBatch object + from litellm.types.utils import LiteLLMBatch + + openai_batch_metadata: OpenAIBatchMetadata = { + "output_file_id": status_response["outputDataConfig"][ + "s3OutputDataConfig" + ]["s3Uri"], + "failure_message": status_response.get("failureMessage") or "", + "model_arn": status_response["modelArn"], + } + + result = LiteLLMBatch( + id=status_response["invocationArn"], + object="batch", + status=status_response["status"], + created_at=status_response["submitTime"], + in_progress_at=status_response["lastModifiedTime"], + completed_at=status_response.get("endTime"), + failed_at=status_response.get("endTime") + if status_response["status"] == "failed" + else None, + request_counts=BatchRequestCounts( + total=1, + completed=1 if status_response["status"] == "completed" else 0, + failed=1 if status_response["status"] == "failed" else 0, + ), + metadata=openai_batch_metadata, + completion_window="24h", + endpoint="/v1/embeddings", + input_file_id="", + ) + + return result + + # Since this function is called from within an async context via run_in_executor, + # we need to create a new event loop in a thread to avoid conflicts + import concurrent.futures + + def run_in_thread(): + new_loop = asyncio.new_event_loop() + asyncio.set_event_loop(new_loop) + try: + return new_loop.run_until_complete(_async_get_status()) + finally: + new_loop.close() + + with concurrent.futures.ThreadPoolExecutor() as executor: + future = executor.submit(run_in_thread) + return future.result() diff --git a/litellm/llms/bedrock/batches/transformation.py b/litellm/llms/bedrock/batches/transformation.py index 2f3d00dddda4..a9bc1b26c88c 100644 --- a/litellm/llms/bedrock/batches/transformation.py +++ b/litellm/llms/bedrock/batches/transformation.py @@ -6,6 +6,7 @@ from litellm.llms.base_llm.batches.transformation import BaseBatchesConfig from litellm.llms.base_llm.chat.transformation import BaseLLMException +from litellm.secret_managers.main import get_secret_str from litellm.types.llms.bedrock import ( BedrockCreateBatchRequest, BedrockCreateBatchResponse, @@ -140,10 +141,20 @@ def transform_create_batch_request( } # Build output data config + s3_output_config: BedrockS3OutputDataConfig = BedrockS3OutputDataConfig( + s3Uri=f"s3://{output_bucket}/{output_key}" + ) + + # Add optional KMS encryption key ID if provided + s3_encryption_key_id = ( + litellm_params.get("s3_encryption_key_id") + or get_secret_str("AWS_S3_ENCRYPTION_KEY_ID") + ) + if s3_encryption_key_id: + s3_output_config["s3EncryptionKeyId"] = s3_encryption_key_id + output_data_config: BedrockOutputDataConfig = { - "s3OutputDataConfig": BedrockS3OutputDataConfig( - s3Uri=f"s3://{output_bucket}/{output_key}" - ) + "s3OutputDataConfig": s3_output_config } # Create Bedrock batch request with proper typing diff --git a/litellm/llms/bedrock/chat/agentcore/__init__.py b/litellm/llms/bedrock/chat/agentcore/__init__.py new file mode 100644 index 000000000000..a2f138762035 --- /dev/null +++ b/litellm/llms/bedrock/chat/agentcore/__init__.py @@ -0,0 +1,4 @@ +from .transformation import AmazonAgentCoreConfig 
+ +__all__ = ["AmazonAgentCoreConfig"] + diff --git a/litellm/llms/bedrock/chat/agentcore/transformation.py b/litellm/llms/bedrock/chat/agentcore/transformation.py new file mode 100644 index 000000000000..94e845e30958 --- /dev/null +++ b/litellm/llms/bedrock/chat/agentcore/transformation.py @@ -0,0 +1,859 @@ +""" +Transformation for Bedrock AgentCore + +https://docs.aws.amazon.com/bedrock/latest/APIReference/API_agentcore_InvokeAgentRuntime.html +""" + +import json +from collections.abc import AsyncGenerator +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast +from urllib.parse import quote + +import httpx + +from litellm._logging import verbose_logger +from litellm._uuid import uuid +from litellm.litellm_core_utils.prompt_templates.common_utils import ( + convert_content_list_to_str, +) +from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper +from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException +from litellm.llms.bedrock.base_aws_llm import BaseAWSLLM +from litellm.llms.bedrock.common_utils import BedrockError +from litellm.types.llms.bedrock_agentcore import ( + AgentCoreMessage, + AgentCoreParsedResponse, + AgentCoreUsage, +) +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import Choices, Delta, Message, ModelResponse, StreamingChoices, Usage + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + HTTPHandler = Any + AsyncHTTPHandler = Any + + +class AmazonAgentCoreConfig(BaseConfig, BaseAWSLLM): + def __init__(self, **kwargs): + BaseConfig.__init__(self, **kwargs) + BaseAWSLLM.__init__(self, **kwargs) + + def get_supported_openai_params(self, model: str) -> List[str]: + """ + Bedrock AgentCore has 0 OpenAI compatible params + """ + return [] + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + """ + Map OpenAI params to AgentCore params + """ + return optional_params + + def get_complete_url( + self, + api_base: Optional[str], + api_key: Optional[str], + model: str, + optional_params: dict, + litellm_params: dict, + stream: Optional[bool] = None, + ) -> str: + """ + Get the complete url for the request + """ + ### SET RUNTIME ENDPOINT ### + aws_bedrock_runtime_endpoint = optional_params.get( + "aws_bedrock_runtime_endpoint", None + ) + + # Extract ARN from model string + agent_runtime_arn = self._get_agent_runtime_arn(model) + + # Parse ARN to get region + region = self._extract_region_from_arn(agent_runtime_arn) + + # Build the base endpoint URL for AgentCore + # Note: We don't use get_runtime_endpoint as AgentCore has its own endpoint structure + if aws_bedrock_runtime_endpoint: + base_url = aws_bedrock_runtime_endpoint + else: + base_url = f"https://bedrock-agentcore.{region}.amazonaws.com" + + # Based on boto3 client.invoke_agent_runtime, the path is: + # /runtimes/{URL-ENCODED-ARN}/invocations?qualifier= + encoded_arn = quote(agent_runtime_arn, safe="") + endpoint_url = f"{base_url}/runtimes/{encoded_arn}/invocations" + + # Add qualifier as query parameter if provided + if "qualifier" in optional_params: + endpoint_url = f"{endpoint_url}?qualifier={optional_params['qualifier']}" + + return endpoint_url + + def sign_request( + self, + headers: dict, + optional_params: dict, + 
request_data: dict, + api_base: str, + api_key: Optional[str] = None, + model: Optional[str] = None, + stream: Optional[bool] = None, + fake_stream: Optional[bool] = None, + ) -> Tuple[dict, Optional[bytes]]: + # Check if api_key (bearer token) is provided for Cognito authentication + # Priority: api_key parameter first, then optional_params + jwt_token = api_key or optional_params.get("api_key") + if jwt_token: + verbose_logger.debug( + f"AgentCore: Using Bearer token authentication (Cognito/JWT) - token: {jwt_token[:50]}..." + ) + headers["Content-Type"] = "application/json" + headers["Authorization"] = f"Bearer {jwt_token}" + # Return headers with bearer token and JSON-encoded body (not SigV4 signed) + return headers, json.dumps(request_data).encode() + + # Otherwise, use AWS SigV4 authentication + verbose_logger.debug("AgentCore: Using AWS SigV4 authentication (IAM)") + return self._sign_request( + service_name="bedrock-agentcore", + headers=headers, + optional_params=optional_params, + request_data=request_data, + api_base=api_base, + model=model, + stream=stream, + fake_stream=fake_stream, + api_key=api_key, + ) + + def _get_agent_runtime_arn(self, model: str) -> str: + """ + Extract ARN from model string + model = "agentcore/arn:aws:bedrock-agentcore:us-west-2:888602223428:runtime/hosted_agent_r9jvp-3ySZuRHjLC" + returns: "arn:aws:bedrock-agentcore:us-west-2:888602223428:runtime/hosted_agent_r9jvp-3ySZuRHjLC" + """ + parts = model.split("/", 1) + if len(parts) != 2 or parts[0] != "agentcore": + raise ValueError( + "Invalid model format. Expected format: 'model=bedrock/agentcore/arn:aws:bedrock-agentcore:region:account:runtime/runtime_id'" + ) + return parts[1] + + def _extract_region_from_arn(self, arn: str) -> str: + """ + Extract region from ARN + arn:aws:bedrock-agentcore:us-west-2:888602223428:runtime/hosted_agent_r9jvp-3ySZuRHjLC + returns: us-west-2 + """ + parts = arn.split(":") + if len(parts) >= 4: + return parts[3] + raise ValueError(f"Invalid ARN format: {arn}") + + def _get_runtime_session_id(self, optional_params: dict) -> str: + """ + Get or generate runtime session ID (must be 33+ chars) + """ + session_id = optional_params.get("runtimeSessionId", None) + if session_id: + verbose_logger.debug(f"Using provided runtimeSessionId: {session_id}") + return session_id + + # Generate a session ID with 33+ characters + generated_id = f"litellm-session-{str(uuid.uuid4())}" + verbose_logger.debug(f"Generated new session ID: {generated_id}") + return generated_id + + def _get_runtime_user_id(self, optional_params: dict) -> Optional[str]: + """ + Get runtime user ID if provided + """ + user_id = optional_params.get("runtimeUserId", None) + if user_id: + verbose_logger.debug(f"Using provided runtimeUserId: {user_id}") + return user_id + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform the request to AgentCore format. 
+ + Based on boto3's implementation: + - Session ID goes in header: X-Amzn-Bedrock-AgentCore-Runtime-Session-Id + - User ID goes in header: X-Amzn-Bedrock-AgentCore-Runtime-User-Id + - Qualifier goes as query parameter + - Only the payload goes in the request body + + Returns: + dict: Payload dict containing the prompt + """ + verbose_logger.debug( + f"AgentCore transform_request - optional_params keys: {list(optional_params.keys())}" + ) + + # Use the last message content as the prompt + prompt = convert_content_list_to_str(messages[-1]) + + # Create the payload - this is what goes in the body (raw JSON) + payload: dict = {"prompt": prompt} + + # Get or generate session ID - this goes in the header + runtime_session_id = self._get_runtime_session_id(optional_params) + headers["X-Amzn-Bedrock-AgentCore-Runtime-Session-Id"] = runtime_session_id + + # Get user ID if provided - this goes in the header + runtime_user_id = self._get_runtime_user_id(optional_params) + if runtime_user_id: + headers["X-Amzn-Bedrock-AgentCore-Runtime-User-Id"] = runtime_user_id + + # The request data is the payload dict (will be JSON encoded by the HTTP handler) + # Qualifier will be handled as a query parameter in get_complete_url + + verbose_logger.debug(f"PAYLOAD: {payload}") + return payload + + def _extract_sse_json(self, line: str) -> Optional[Dict]: + """Extract and parse JSON from an SSE data line.""" + if not line.startswith("data:"): + return None + + json_str = line[5:].strip() + if not json_str: + return None + + try: + data = json.loads(json_str) + # Skip non-dict data (some lines contain JSON strings) + return data if isinstance(data, dict) else None + except json.JSONDecodeError: + verbose_logger.debug(f"Skipping non-JSON line: {line[:100]}") + return None + + def _extract_usage_from_event(self, event_data: Dict) -> Optional[AgentCoreUsage]: + """Extract usage information from event metadata.""" + event_payload = event_data.get("event") + if not event_payload: + return None + + metadata = event_payload.get("metadata") + if metadata and "usage" in metadata: + return metadata["usage"] # type: ignore + + return None + + def _extract_content_delta(self, event_data: Dict) -> Optional[str]: + """Extract text content from contentBlockDelta event.""" + event_payload = event_data.get("event") + if not event_payload: + return None + + content_block_delta = event_payload.get("contentBlockDelta") + if not content_block_delta: + return None + + delta = content_block_delta.get("delta", {}) + return delta.get("text") + + def _extract_content_from_message(self, message: AgentCoreMessage) -> str: + """ + Extract text content from message content blocks. + This works for both SSE messages and JSON responses. + """ + content_list = message.get("content", []) + if not isinstance(content_list, list): + return "" + + return "".join( + block["text"] + for block in content_list + if isinstance(block, dict) and "text" in block + ) + + def _calculate_usage( + self, model: str, messages: List[AllMessageValues], content: str + ) -> Optional[Usage]: + """ + Calculate token usage using LiteLLM's token counter. 
+ + Args: + model: The model name + messages: Input messages + content: Response content + + Returns: + Usage object with calculated tokens, or None if calculation fails + """ + try: + from litellm.utils import token_counter + + prompt_tokens = token_counter(model=model, messages=messages) + completion_tokens = token_counter( + model=model, text=content, count_response_tokens=True + ) + total_tokens = prompt_tokens + completion_tokens + + verbose_logger.debug( + f"Calculated usage - prompt: {prompt_tokens}, completion: {completion_tokens}, total: {total_tokens}" + ) + + return Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + ) + except Exception as e: + verbose_logger.warning(f"Failed to calculate token usage: {str(e)}") + return None + + def _parse_json_response(self, response_json: dict) -> AgentCoreParsedResponse: + """ + Parse direct JSON response (non-streaming). + + JSON response structure: + { + "result": { + "role": "assistant", + "content": [{"text": "..."}] + } + } + """ + result = response_json.get("result", {}) + + # Extract content using the same helper as SSE parsing + content = self._extract_content_from_message(result) # type: ignore + + # JSON responses don't include usage data + return AgentCoreParsedResponse( + content=content, + usage=None, + final_message=result, # type: ignore + ) + + def _get_parsed_response( + self, raw_response: httpx.Response + ) -> AgentCoreParsedResponse: + """ + Parse AgentCore response based on content type. + + Args: + raw_response: Raw HTTP response from AgentCore + + Returns: + AgentCoreParsedResponse: Parsed response data + """ + content_type = raw_response.headers.get("content-type", "").lower() + verbose_logger.debug(f"AgentCore response Content-Type: {content_type}") + + # Parse response based on content type + if "application/json" in content_type: + # Direct JSON response + verbose_logger.debug("Parsing JSON response") + response_json = raw_response.json() + verbose_logger.debug(f"Response JSON: {response_json}") + return self._parse_json_response(response_json) + else: + # SSE stream response (text/event-stream or default) + verbose_logger.debug("Parsing SSE stream response") + response_text = raw_response.text + verbose_logger.debug( + f"AgentCore response (first 500 chars): {response_text[:500]}" + ) + return self._parse_sse_stream(response_text) + + def _parse_sse_stream(self, response_text: str) -> AgentCoreParsedResponse: + """ + Parse Server-Sent Events (SSE) stream format. + Each line starts with 'data:' followed by JSON. 
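+
+        Illustrative input (event shapes inferred from the parsing helpers above;
+        values are made up):
+
+            data: {"event": {"contentBlockDelta": {"delta": {"text": "Hello"}}}}
+            data: {"event": {"metadata": {"usage": {"inputTokens": 12, "outputTokens": 4, "totalTokens": 16}}}}
+            data: {"message": {"role": "assistant", "content": [{"text": "Hello"}]}}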
+ + Returns: + AgentCoreParsedResponse: Parsed response with content, usage, and message + """ + final_message: Optional[AgentCoreMessage] = None + usage_data: Optional[AgentCoreUsage] = None + content_blocks: List[str] = [] + + for line in response_text.strip().split("\n"): + line = line.strip() + if not line: + continue + + data = self._extract_sse_json(line) + if not data: + continue + + verbose_logger.debug(f"SSE event keys: {list(data.keys())}") + + # Check for final complete message + if "message" in data and isinstance(data["message"], dict): + final_message = data["message"] # type: ignore + verbose_logger.debug("Found final message") + + # Process event data + if "event" in data and isinstance(data["event"], dict): + event_payload = data["event"] + verbose_logger.debug( + f"Event payload keys: {list(event_payload.keys())}" + ) + + # Extract usage metadata + if usage := self._extract_usage_from_event(data): + usage_data = usage + verbose_logger.debug(f"Found usage data: {usage_data}") + + # Collect content deltas + if text := self._extract_content_delta(data): + content_blocks.append(text) + + # Build final content + content = ( + self._extract_content_from_message(final_message) + if final_message + else "".join(content_blocks) + ) + + verbose_logger.debug(f"Final usage_data: {usage_data}") + + return AgentCoreParsedResponse( + content=content, usage=usage_data, final_message=final_message + ) + + def _stream_agentcore_response_sync( + self, + response: httpx.Response, + model: str, + ): + """ + Internal sync generator that parses SSE and yields ModelResponse chunks. + """ + buffer = "" + for text_chunk in response.iter_text(): + buffer += text_chunk + + # Process complete lines + while '\n' in buffer: + line, buffer = buffer.split('\n', 1) + line = line.strip() + + if not line or not line.startswith('data:'): + continue + + json_str = line[5:].strip() + if not json_str: + continue + + try: + data_obj = json.loads(json_str) + if not isinstance(data_obj, dict): + continue + + # Process contentBlockDelta events + if "event" in data_obj and isinstance(data_obj["event"], dict): + event_payload = data_obj["event"] + content_block_delta = event_payload.get("contentBlockDelta") + + if content_block_delta: + delta = content_block_delta.get("delta", {}) + text = delta.get("text", "") + + if text: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content=text, role="assistant"), + ) + ] + yield chunk + + # Process metadata/usage + metadata = event_payload.get("metadata") + if metadata and "usage" in metadata: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + usage_data: AgentCoreUsage = metadata["usage"] # type: ignore + setattr(chunk, "usage", Usage( + prompt_tokens=usage_data.get("inputTokens", 0), + completion_tokens=usage_data.get("outputTokens", 0), + total_tokens=usage_data.get("totalTokens", 0), + )) + yield chunk + + # Process final message + if "message" in data_obj and isinstance(data_obj["message"], dict): + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + yield chunk + + 
except json.JSONDecodeError: + verbose_logger.debug(f"Skipping non-JSON SSE line: {line[:100]}") + continue + + def get_sync_custom_stream_wrapper( + self, + model: str, + custom_llm_provider: str, + logging_obj: LiteLLMLoggingObj, + api_base: str, + headers: dict, + data: dict, + messages: list, + client: Optional[Union[HTTPHandler, "AsyncHTTPHandler"]] = None, + json_mode: Optional[bool] = None, + signed_json_body: Optional[bytes] = None, + ) -> "CustomStreamWrapper": + """ + Simplified sync streaming - returns a generator that yields ModelResponse chunks. + """ + from litellm.llms.custom_httpx.http_handler import ( + HTTPHandler, + _get_httpx_client, + ) + + if client is None or not isinstance(client, HTTPHandler): + client = _get_httpx_client(params={}) + + verbose_logger.debug(f"Making sync streaming request to: {api_base}") + + # Make streaming request + response = client.post( + api_base, + headers=headers, + data=signed_json_body if signed_json_body else json.dumps(data), + stream=True, + logging_obj=logging_obj, + ) + + if response.status_code != 200: + raise BedrockError( + status_code=response.status_code, message=str(response.read()) + ) + + # LOGGING + logging_obj.post_call( + input=messages, + api_key="", + original_response="first stream response received", + additional_args={"complete_input_dict": data}, + ) + + # Wrap the generator in CustomStreamWrapper + return CustomStreamWrapper( + completion_stream=self._stream_agentcore_response_sync(response, model), + model=model, + custom_llm_provider="bedrock", + logging_obj=logging_obj, + ) + + async def _stream_agentcore_response( + self, + response: httpx.Response, + model: str, + ) -> AsyncGenerator[ModelResponse, None]: + """ + Internal async generator that parses SSE and yields ModelResponse chunks. 
+ """ + buffer = "" + async for text_chunk in response.aiter_text(): + buffer += text_chunk + + # Process complete lines + while '\n' in buffer: + line, buffer = buffer.split('\n', 1) + line = line.strip() + + if not line or not line.startswith('data:'): + continue + + json_str = line[5:].strip() + if not json_str: + continue + + try: + data_obj = json.loads(json_str) + if not isinstance(data_obj, dict): + continue + + # Process contentBlockDelta events + if "event" in data_obj and isinstance(data_obj["event"], dict): + event_payload = data_obj["event"] + content_block_delta = event_payload.get("contentBlockDelta") + + if content_block_delta: + delta = content_block_delta.get("delta", {}) + text = delta.get("text", "") + + if text: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason=None, + index=0, + delta=Delta(content=text, role="assistant"), + ) + ] + yield chunk + + # Process metadata/usage + metadata = event_payload.get("metadata") + if metadata and "usage" in metadata: + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + usage_data: AgentCoreUsage = metadata["usage"] # type: ignore + setattr(chunk, "usage", Usage( + prompt_tokens=usage_data.get("inputTokens", 0), + completion_tokens=usage_data.get("outputTokens", 0), + total_tokens=usage_data.get("totalTokens", 0), + )) + yield chunk + + # Process final message + if "message" in data_obj and isinstance(data_obj["message"], dict): + chunk = ModelResponse( + id=f"chatcmpl-{uuid.uuid4()}", + created=0, + model=model, + object="chat.completion.chunk", + ) + chunk.choices = [ + StreamingChoices( + finish_reason="stop", + index=0, + delta=Delta(), + ) + ] + yield chunk + + except json.JSONDecodeError: + verbose_logger.debug(f"Skipping non-JSON SSE line: {line[:100]}") + continue + + async def get_async_custom_stream_wrapper( + self, + model: str, + custom_llm_provider: str, + logging_obj: LiteLLMLoggingObj, + api_base: str, + headers: dict, + data: dict, + messages: list, + client: Optional["AsyncHTTPHandler"] = None, + json_mode: Optional[bool] = None, + signed_json_body: Optional[bytes] = None, + ) -> "CustomStreamWrapper": + """ + Simplified async streaming - returns an async generator that yields ModelResponse chunks. 
+ """ + from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + get_async_httpx_client, + ) + + if client is None or not isinstance(client, AsyncHTTPHandler): + client = get_async_httpx_client( + llm_provider=cast(Any, "bedrock"), params={} + ) + + verbose_logger.debug(f"Making async streaming request to: {api_base}") + + # Make async streaming request + response = await client.post( + api_base, + headers=headers, + data=signed_json_body if signed_json_body else json.dumps(data), + stream=True, + logging_obj=logging_obj, + ) + + if response.status_code != 200: + raise BedrockError( + status_code=response.status_code, message=str(await response.aread()) + ) + + # LOGGING + logging_obj.post_call( + input=messages, + api_key="", + original_response="first stream response received", + additional_args={"complete_input_dict": data}, + ) + + # Wrap the async generator in CustomStreamWrapper + return CustomStreamWrapper( + completion_stream=self._stream_agentcore_response(response, model), + model=model, + custom_llm_provider="bedrock", + logging_obj=logging_obj, + ) + + @property + def has_custom_stream_wrapper(self) -> bool: + """Indicates that this config has custom streaming support.""" + return True + + @property + def supports_stream_param_in_request_body(self) -> bool: + """ + AgentCore does not allow passing `stream` in the request body. + Streaming is automatic based on the response format. + """ + return False + + def transform_response( + self, + model: str, + raw_response: httpx.Response, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> ModelResponse: + """ + Transform the AgentCore response to LiteLLM ModelResponse format. + AgentCore can return either JSON or SSE (Server-Sent Events) stream responses. + + Note: For streaming responses, use get_streaming_response() instead. 
+ """ + try: + # Parse the response based on content type (JSON or SSE) + parsed_data = self._get_parsed_response(raw_response) + + content = parsed_data["content"] + usage_data = parsed_data["usage"] + + verbose_logger.debug(f"Parsed content length: {len(content)}") + verbose_logger.debug(f"Usage data: {usage_data}") + + # Create the message + message = Message(content=content, role="assistant") + + # Create choices + choice = Choices(finish_reason="stop", index=0, message=message) + + # Update model response + model_response.choices = [choice] + model_response.model = model + + # Add usage information if available + # Note: AgentCore JSON responses don't include usage data + # SSE responses may include usage in metadata events + if usage_data: + usage = Usage( + prompt_tokens=usage_data.get("inputTokens", 0), + completion_tokens=usage_data.get("outputTokens", 0), + total_tokens=usage_data.get("totalTokens", 0), + ) + setattr(model_response, "usage", usage) + else: + # Calculate token usage using LiteLLM's token counter + verbose_logger.debug( + "No usage data from AgentCore - calculating tokens" + ) + calculated_usage = self._calculate_usage(model, messages, content) + if calculated_usage: + setattr(model_response, "usage", calculated_usage) + + return model_response + + except Exception as e: + verbose_logger.error( + f"Error processing Bedrock AgentCore response: {str(e)}" + ) + raise BedrockError( + message=f"Error processing response: {str(e)}", + status_code=raw_response.status_code, + ) + + def validate_environment( + self, + headers: dict, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> dict: + return headers + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] + ) -> BaseLLMException: + return BedrockError(status_code=status_code, message=error_message) + + def should_fake_stream( + self, + model: Optional[str], + stream: Optional[bool], + custom_llm_provider: Optional[str] = None, + ) -> bool: + # AgentCore supports true streaming - don't buffer + return False diff --git a/litellm/llms/bedrock/chat/converse_handler.py b/litellm/llms/bedrock/chat/converse_handler.py index fd1f6f0c8934..d5bd054118d1 100644 --- a/litellm/llms/bedrock/chat/converse_handler.py +++ b/litellm/llms/bedrock/chat/converse_handler.py @@ -29,6 +29,7 @@ def make_sync_call( logging_obj: LiteLLMLoggingObject, json_mode: Optional[bool] = False, fake_stream: bool = False, + stream_chunk_size: int = 1024, ): if client is None: client = _get_httpx_client() # Create a new client if none provided @@ -66,7 +67,7 @@ def make_sync_call( ) else: decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=stream_chunk_size)) # LOGGING logging_obj.post_call( @@ -102,6 +103,7 @@ async def async_streaming( fake_stream: bool = False, json_mode: Optional[bool] = False, api_key: Optional[str] = None, + stream_chunk_size: int = 1024, ) -> CustomStreamWrapper: request_data = await litellm.AmazonConverseConfig()._async_transform_request( model=model, @@ -143,6 +145,7 @@ async def async_streaming( logging_obj=logging_obj, fake_stream=fake_stream, json_mode=json_mode, + stream_chunk_size=stream_chunk_size, ) streaming_response = CustomStreamWrapper( completion_stream=completion_stream, @@ -260,6 +263,7 @@ def completion( # 
noqa: PLR0915 ): ## SETUP ## stream = optional_params.pop("stream", None) + stream_chunk_size = optional_params.pop("stream_chunk_size", 1024) unencoded_model_id = optional_params.pop("model_id", None) fake_stream = optional_params.pop("fake_stream", False) json_mode = optional_params.get("json_mode", False) @@ -356,7 +360,8 @@ def completion( # noqa: PLR0915 json_mode=json_mode, fake_stream=fake_stream, credentials=credentials, - api_key=api_key + api_key=api_key, + stream_chunk_size=stream_chunk_size, ) # type: ignore ### ASYNC COMPLETION return self.async_completion( @@ -433,6 +438,7 @@ def completion( # noqa: PLR0915 logging_obj=logging_obj, json_mode=json_mode, fake_stream=fake_stream, + stream_chunk_size=stream_chunk_size, ) streaming_response = CustomStreamWrapper( completion_stream=completion_stream, diff --git a/litellm/llms/bedrock/chat/converse_transformation.py b/litellm/llms/bedrock/chat/converse_transformation.py index d76a3c31b514..d4e4d3591ba1 100644 --- a/litellm/llms/bedrock/chat/converse_transformation.py +++ b/litellm/llms/bedrock/chat/converse_transformation.py @@ -12,7 +12,12 @@ import litellm from litellm._logging import verbose_logger from litellm.constants import RESPONSE_FORMAT_TOOL_NAME -from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.litellm_core_utils.core_helpers import ( + filter_exceptions_from_params, + filter_internal_params, + map_finish_reason, + safe_deep_copy, +) from litellm.litellm_core_utils.litellm_logging import Logging from litellm.litellm_core_utils.prompt_templates.common_utils import ( _parse_content_for_reasoning, @@ -48,7 +53,13 @@ PromptTokensDetailsWrapper, Usage, ) -from litellm.utils import add_dummy_tool, has_tool_call_blocks, supports_reasoning +from litellm.utils import ( + add_dummy_tool, + any_assistant_message_has_thinking_blocks, + has_tool_call_blocks, + last_assistant_with_tool_calls_has_no_thinking_blocks, + supports_reasoning, +) from ..common_utils import ( BedrockError, @@ -65,6 +76,13 @@ "text_editor_", ] +# Beta header patterns that are not supported by Bedrock Converse API +# These will be filtered out to prevent errors +UNSUPPORTED_BEDROCK_CONVERSE_BETA_PATTERNS = [ + "advanced-tool-use", # Bedrock Converse doesn't support advanced-tool-use beta headers + "prompt-caching", # Prompt caching not supported in Converse API +] + class AmazonConverseConfig(BaseConfig): """ @@ -100,6 +118,7 @@ def get_config_blocks(cls) -> dict: return { "guardrailConfig": GuardrailConfigBlock, "performanceConfig": PerformanceConfigBlock, + "serviceTier": ServiceTierBlock, } @staticmethod @@ -246,6 +265,175 @@ def _validate_request_metadata(self, metadata: dict) -> None: llm_provider="bedrock", ) + def _is_nova_lite_2_model(self, model: str) -> bool: + """ + Check if the model is a Nova Lite 2 model that supports reasoningConfig. + + Nova Lite 2 models use a different reasoning configuration structure compared to + Anthropic's thinking parameter and GPT-OSS's reasoning_effort parameter. 
+ + Supported models: + - amazon.nova-2-lite-v1:0 + - us.amazon.nova-2-lite-v1:0 + - eu.amazon.nova-2-lite-v1:0 + - apac.amazon.nova-2-lite-v1:0 + + Args: + model: The model identifier + + Returns: + True if the model is a Nova Lite 2 model, False otherwise + + Examples: + >>> config = AmazonConverseConfig() + >>> config._is_nova_lite_2_model("amazon.nova-2-lite-v1:0") + True + >>> config._is_nova_lite_2_model("us.amazon.nova-2-lite-v1:0") + True + >>> config._is_nova_lite_2_model("amazon.nova-pro-1-5-v1:0") + False + >>> config._is_nova_lite_2_model("amazon.nova-pro-v1:0") + False + """ + # Remove regional prefix if present (us., eu., apac.) + model_without_region = model + for prefix in ["us.", "eu.", "apac."]: + if model.startswith(prefix): + model_without_region = model[len(prefix) :] + break + + # Check if the model is specifically Nova Lite 2 + return "nova-2-lite" in model_without_region + + def _map_web_search_options( + self, + web_search_options: dict, + model: str + ) -> Optional[BedrockToolBlock]: + """ + Map web_search_options to Nova grounding systemTool. + + Nova grounding (web search) is only supported on Amazon Nova models. + Returns None for non-Nova models. + + Args: + web_search_options: The web_search_options dict from the request + model: The model identifier string + + Returns: + BedrockToolBlock with systemTool for Nova models, None otherwise + + Reference: https://docs.aws.amazon.com/nova/latest/userguide/grounding.html + """ + # Only Nova models support nova_grounding + # Model strings can be like: "amazon.nova-pro-v1:0", "us.amazon.nova-pro-v1:0", etc. + if "nova" not in model.lower(): + verbose_logger.debug( + f"web_search_options passed but model {model} is not a Nova model. " + "Nova grounding is only supported on Amazon Nova models." + ) + return None + + # Nova doesn't support search_context_size or user_location params + # (unlike Anthropic), so we just enable grounding with no options + return BedrockToolBlock(systemTool={"name": "nova_grounding"}) + + def _transform_reasoning_effort_to_reasoning_config( + self, reasoning_effort: str + ) -> dict: + """ + Transform reasoning_effort parameter to Nova 2 reasoningConfig structure. + + Nova 2 models use a reasoningConfig structure in additionalModelRequestFields + that differs from both Anthropic's thinking parameter and GPT-OSS's reasoning_effort. + + Args: + reasoning_effort: The reasoning effort level, must be "low" or "high" + + Returns: + dict: A dictionary containing the reasoningConfig structure: + { + "reasoningConfig": { + "type": "enabled", + "maxReasoningEffort": "low" | "medium" |"high" + } + } + + Raises: + BadRequestError: If reasoning_effort is not "low", "medium" or "high" + + Examples: + >>> config = AmazonConverseConfig() + >>> config._transform_reasoning_effort_to_reasoning_config("high") + {'reasoningConfig': {'type': 'enabled', 'maxReasoningEffort': 'high'}} + >>> config._transform_reasoning_effort_to_reasoning_config("low") + {'reasoningConfig': {'type': 'enabled', 'maxReasoningEffort': 'low'}} + """ + valid_values = ["low", "medium", "high"] + if reasoning_effort not in valid_values: + raise litellm.exceptions.BadRequestError( + message=f"Invalid reasoning_effort value '{reasoning_effort}' for Nova 2 models. 
" + f"Supported values: {valid_values}", + model="amazon.nova-2-lite-v1:0", + llm_provider="bedrock_converse", + ) + + return { + "reasoningConfig": { + "type": "enabled", + "maxReasoningEffort": reasoning_effort, + } + } + + def _handle_reasoning_effort_parameter( + self, model: str, reasoning_effort: str, optional_params: dict + ) -> None: + """ + Handle the reasoning_effort parameter based on the model type. + + Different model families handle reasoning effort differently: + - GPT-OSS models: Keep reasoning_effort as-is (passed to additionalModelRequestFields) + - Nova Lite 2 models: Transform to reasoningConfig structure + - Other models (Anthropic, etc.): Convert to thinking parameter + + Args: + model: The model identifier + reasoning_effort: The reasoning effort value + optional_params: Dictionary of optional parameters to update in-place + + Examples: + >>> config = AmazonConverseConfig() + >>> params = {} + >>> config._handle_reasoning_effort_parameter("gpt-oss-model", "high", params) + >>> params + {'reasoning_effort': 'high'} + + >>> params = {} + >>> config._handle_reasoning_effort_parameter("amazon.nova-2-lite-v1:0", "high", params) + >>> params + {'reasoningConfig': {'type': 'enabled', 'maxReasoningEffort': 'high'}} + + >>> params = {} + >>> config._handle_reasoning_effort_parameter("anthropic.claude-3", "high", params) + >>> params + {'thinking': {'type': 'enabled', 'budget_tokens': 10000}} + """ + if "gpt-oss" in model: + # GPT-OSS models: keep reasoning_effort as-is + # It will be passed through to additionalModelRequestFields + optional_params["reasoning_effort"] = reasoning_effort + elif self._is_nova_lite_2_model(model): + # Nova Lite 2 models: transform to reasoningConfig + reasoning_config = self._transform_reasoning_effort_to_reasoning_config( + reasoning_effort + ) + optional_params.update(reasoning_config) + else: + # Anthropic and other models: convert to thinking parameter + optional_params["thinking"] = AnthropicConfig._map_reasoning_effort( + reasoning_effort + ) + def get_supported_openai_params(self, model: str) -> List[str]: from litellm.utils import supports_function_calling @@ -260,6 +448,7 @@ def get_supported_openai_params(self, model: str) -> List[str]: "extra_headers", "response_format", "requestMetadata", + "service_tier", ] if ( @@ -289,6 +478,10 @@ def get_supported_openai_params(self, model: str) -> List[str]: ): supported_params.append("tools") + # Nova models support web_search_options (mapped to nova_grounding systemTool) + if base_model.startswith("amazon.nova"): + supported_params.append("web_search_options") + if litellm.utils.supports_tool_choice( model=model, custom_llm_provider=self.custom_llm_provider ) or litellm.utils.supports_tool_choice( @@ -299,6 +492,10 @@ def get_supported_openai_params(self, model: str) -> List[str]: if "gpt-oss" in model: supported_params.append("reasoning_effort") + elif self._is_nova_lite_2_model(model): + # Nova Lite 2 models support reasoning_effort (transformed to reasoningConfig) + # These models use a different reasoning structure than Anthropic's thinking parameter + supported_params.append("reasoning_effort") elif ( "claude-3-7" in model or "claude-sonnet-4" in model @@ -420,6 +617,37 @@ def _transform_computer_use_tools( return transformed_tools + def _filter_unsupported_beta_headers_for_bedrock( + self, model: str, beta_list: list + ) -> list: + """ + Remove beta headers that are not supported on Bedrock Converse API for the given model. 
+ + Extended thinking beta headers are only supported on specific Claude 4+ models. + Some beta headers are universally unsupported on Bedrock Converse API. + + Args: + model: The model name + beta_list: The list of beta headers to filter + + Returns: + Filtered list of beta headers + """ + filtered_betas = [] + + # 1. Filter out beta headers that are universally unsupported on Bedrock Converse + for beta in beta_list: + should_keep = True + for unsupported_pattern in UNSUPPORTED_BEDROCK_CONVERSE_BETA_PATTERNS: + if unsupported_pattern in beta.lower(): + should_keep = False + break + + if should_keep: + filtered_betas.append(beta) + + return filtered_betas + def _separate_computer_use_tools( self, tools: List[OpenAIChatCompletionToolParam], model: str ) -> Tuple[ @@ -560,26 +788,50 @@ def map_openai_params( if param == "thinking": optional_params["thinking"] = value elif param == "reasoning_effort" and isinstance(value, str): - if "gpt-oss" in model: - # GPT-OSS models: keep reasoning_effort as-is - # It will be passed through to additionalModelRequestFields - optional_params["reasoning_effort"] = value - else: - # Anthropic and other models: convert to thinking parameter - optional_params["thinking"] = AnthropicConfig._map_reasoning_effort( - value - ) + self._handle_reasoning_effort_parameter( + model=model, reasoning_effort=value, optional_params=optional_params + ) if param == "requestMetadata": if value is not None and isinstance(value, dict): self._validate_request_metadata(value) # type: ignore optional_params["requestMetadata"] = value - - # Only update thinking tokens for non-GPT-OSS models - if "gpt-oss" not in model: + if param == "service_tier" and isinstance(value, str): + # Map OpenAI service_tier (string) to Bedrock serviceTier (object) + # OpenAI values: "auto", "default", "flex", "priority" + # Bedrock values: "default", "flex", "priority" (no "auto") + bedrock_tier = value + if value == "auto": + bedrock_tier = "default" # Bedrock doesn't support "auto" + if bedrock_tier in ("default", "flex", "priority"): + optional_params["serviceTier"] = {"type": bedrock_tier} + + if param == "web_search_options" and isinstance(value, dict): + # Note: we use `isinstance(value, dict)` instead of `value and isinstance(value, dict)` + # because empty dict {} is falsy but is a valid way to enable Nova grounding + grounding_tool = self._map_web_search_options(value, model) + if grounding_tool is not None: + optional_params = self._add_tools_to_optional_params( + optional_params=optional_params, tools=[grounding_tool] + ) + + # Only update thinking tokens for non-GPT-OSS models and non-Nova-Lite-2 models + # Nova Lite 2 handles token budgeting differently through reasoningConfig + if "gpt-oss" not in model and not self._is_nova_lite_2_model(model): self.update_optional_params_with_thinking_tokens( non_default_params=non_default_params, optional_params=optional_params ) + final_is_thinking_enabled = self.is_thinking_enabled(optional_params) + if final_is_thinking_enabled and "tool_choice" in optional_params: + tool_choice_block = optional_params["tool_choice"] + if isinstance(tool_choice_block, dict): + if "any" in tool_choice_block or "tool" in tool_choice_block: + verbose_logger.info( + f"{model} does not support forced tool use (tool_choice='required' or specific tool) " + f"when reasoning is enabled. Changing tool_choice to 'auto'." 
+ ) + optional_params["tool_choice"] = ToolChoiceValuesBlock(auto={}) + return optional_params def _translate_response_format_param( @@ -611,7 +863,7 @@ def _translate_response_format_param( return optional_params """ - Follow similar approach to anthropic - translate to a single tool call. + Follow similar approach to anthropic - translate to a single tool call. When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode - You usually want to provide a single tool @@ -766,7 +1018,10 @@ def _prepare_request_params( self, optional_params: dict, model: str ) -> Tuple[dict, dict, dict]: """Prepare and separate request parameters.""" - inference_params = copy.deepcopy(optional_params) + # Filter out exception objects before deepcopy to prevent deepcopy failures + # Exceptions should not be stored in optional_params (this is a defensive fix) + cleaned_params = filter_exceptions_from_params(optional_params) + inference_params = safe_deep_copy(cleaned_params) supported_converse_params = list( AmazonConverseConfig.__annotations__.keys() ) + ["top_k"] @@ -797,6 +1052,17 @@ def _prepare_request_params( self._handle_top_k_value(model, inference_params) ) + # Filter out internal/MCP-related parameters that shouldn't be sent to the API + # These are LiteLLM internal parameters, not API parameters + additional_request_params = filter_internal_params(additional_request_params) + + # Filter out non-serializable objects (exceptions, callables, logging objects, etc.) + # from additional_request_params to prevent JSON serialization errors + # This filters: Exception objects, callable objects (functions), Logging objects, etc. + additional_request_params = filter_exceptions_from_params( + additional_request_params + ) + return inference_params, additional_request_params, request_metadata def _process_tools_and_beta( @@ -815,11 +1081,24 @@ def _process_tools_and_beta( user_betas = get_anthropic_beta_from_headers(headers) anthropic_beta_list.extend(user_betas) + # Filter out tool search tools - Bedrock Converse API doesn't support them + filtered_tools = [] + if original_tools: + for tool in original_tools: + tool_type = tool.get("type", "") + if tool_type in ( + "tool_search_tool_regex_20251119", + "tool_search_tool_bm25_20251119", + ): + # Tool search not supported in Converse API - skip it + continue + filtered_tools.append(tool) + # Only separate tools if computer use tools are actually present - if original_tools and self.is_computer_use_tool_used(original_tools, model): + if filtered_tools and self.is_computer_use_tool_used(filtered_tools, model): # Separate computer use tools from regular function tools computer_use_tools, regular_tools = self._separate_computer_use_tools( - original_tools, model + filtered_tools, model ) # Process regular function tools using existing logic @@ -835,10 +1114,13 @@ def _process_tools_and_beta( additional_request_params["tools"] = transformed_computer_tools else: # No computer use tools, process all tools as regular tools - bedrock_tools = _bedrock_tools_pt(original_tools) + bedrock_tools = _bedrock_tools_pt(filtered_tools) # Set anthropic_beta in additional_request_params if we have any beta features - if anthropic_beta_list: + # ONLY apply to Anthropic/Claude models - other models (e.g., Qwen, Llama) don't support this field + # and will error with "unknown variant anthropic_beta" if included + base_model = BedrockModelInfo.get_base_model(model) + if anthropic_beta_list and base_model.startswith("anthropic"): # Remove 
duplicates while preserving order unique_betas = [] seen = set() @@ -846,7 +1128,14 @@ def _process_tools_and_beta( if beta not in seen: unique_betas.append(beta) seen.add(beta) - additional_request_params["anthropic_beta"] = unique_betas + + # Filter out unsupported beta headers for Bedrock Converse API + filtered_betas = self._filter_unsupported_beta_headers_for_bedrock( + model=model, + beta_list=unique_betas, + ) + + additional_request_params["anthropic_beta"] = filtered_betas return bedrock_tools, anthropic_beta_list @@ -878,9 +1167,28 @@ def _transform_request_helper( llm_provider="bedrock", ) + # Drop thinking param if thinking is enabled but thinking_blocks are missing + # This prevents the error: "Expected thinking or redacted_thinking, but found tool_use" + # + # IMPORTANT: Only drop thinking if NO assistant messages have thinking_blocks. + # If any message has thinking_blocks, we must keep thinking enabled, otherwise + # Related issues: https://github.com/BerriAI/litellm/issues/14194 + if ( + optional_params.get("thinking") is not None + and messages is not None + and last_assistant_with_tool_calls_has_no_thinking_blocks(messages) + and not any_assistant_message_has_thinking_blocks(messages) + ): + if litellm.modify_params: + optional_params.pop("thinking", None) + litellm.verbose_logger.warning( + "Dropping 'thinking' param because the last assistant message with tool_calls " + "has no thinking_blocks. The model won't use extended thinking for this turn." + ) + # Prepare and separate parameters - inference_params, additional_request_params, request_metadata = ( - self._prepare_request_params(optional_params, model) + inference_params, additional_request_params, request_metadata = self._prepare_request_params( + optional_params, model ) original_tools = inference_params.pop("tools", []) @@ -1171,20 +1479,23 @@ def _translate_message_content(self, content_blocks: List[ContentBlock]) -> Tupl str, List[ChatCompletionToolCallChunk], Optional[List[BedrockConverseReasoningContentBlock]], + Optional[List[CitationsContentBlock]], ]: """ - Translate the message content to a string and a list of tool calls and reasoning content blocks + Translate the message content to a string and a list of tool calls, reasoning content blocks, and citations. 
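# Illustrative shape of the 4-tuple this method is documented to return, for a hypothetical
# assistant content list. The top-level keys match the ones handled by the method; the
# nested values are invented for the example.
content_blocks = [
    {"text": "The capital of France is Paris."},
    {"reasoningContent": {"reasoningText": {"text": "Recall geography facts..."}}},
    {"citationsContent": {"citations": [{"title": "Example source"}]}},
]
# Expected translation, roughly:
#   content_str            -> "The capital of France is Paris."
#   tools                  -> []
#   reasoningContentBlocks -> [content_blocks[1]["reasoningContent"]]
#   citationsContentBlocks -> [content_blocks[2]["citationsContent"]]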
Returns: content_str: str tools: List[ChatCompletionToolCallChunk] reasoningContentBlocks: Optional[List[BedrockConverseReasoningContentBlock]] + citationsContentBlocks: Optional[List[CitationsContentBlock]] - Citations from Nova grounding """ content_str = "" tools: List[ChatCompletionToolCallChunk] = [] reasoningContentBlocks: Optional[List[BedrockConverseReasoningContentBlock]] = ( None ) + citationsContentBlocks: Optional[List[CitationsContentBlock]] = None for idx, content in enumerate(content_blocks): """ - Content is either a tool response or text @@ -1229,10 +1540,15 @@ def _translate_message_content(self, content_blocks: List[ContentBlock]) -> Tupl if reasoningContentBlocks is None: reasoningContentBlocks = [] reasoningContentBlocks.append(content["reasoningContent"]) + # Handle Nova grounding citations content + if "citationsContent" in content: + if citationsContentBlocks is None: + citationsContentBlocks = [] + citationsContentBlocks.append(content["citationsContent"]) - return content_str, tools, reasoningContentBlocks + return content_str, tools, reasoningContentBlocks, citationsContentBlocks - def _transform_response( + def _transform_response( # noqa: PLR0915 self, model: str, response: httpx.Response, @@ -1267,11 +1583,11 @@ def _transform_response( ) """ - Bedrock Response Object has optional message block + Bedrock Response Object has optional message block completion_response["output"].get("message", None) - A message block looks like this (Example 1): + A message block looks like this (Example 1): "output": { "message": { "role": "assistant", @@ -1308,18 +1624,27 @@ def _transform_response( reasoningContentBlocks: Optional[List[BedrockConverseReasoningContentBlock]] = ( None ) + citationsContentBlocks: Optional[List[CitationsContentBlock]] = None if message is not None: ( content_str, tools, reasoningContentBlocks, + citationsContentBlocks, ) = self._translate_message_content(message["content"]) + # Initialize provider_specific_fields if we have any special content blocks + provider_specific_fields: dict = {} + if reasoningContentBlocks is not None: + provider_specific_fields["reasoningContentBlocks"] = reasoningContentBlocks + if citationsContentBlocks is not None: + provider_specific_fields["citationsContent"] = citationsContentBlocks + + if provider_specific_fields: + chat_completion_message["provider_specific_fields"] = provider_specific_fields + if reasoningContentBlocks is not None: - chat_completion_message["provider_specific_fields"] = { - "reasoningContentBlocks": reasoningContentBlocks, - } chat_completion_message["reasoning_content"] = ( self._transform_reasoning_content(reasoningContentBlocks) ) @@ -1392,6 +1717,13 @@ def _transform_response( if "trace" in completion_response: setattr(model_response, "trace", completion_response["trace"]) + # Add service_tier if present in Bedrock response + # Map Bedrock serviceTier (object) to OpenAI service_tier (string) + if "serviceTier" in completion_response: + service_tier_block = completion_response["serviceTier"] + if isinstance(service_tier_block, dict) and "type" in service_tier_block: + setattr(model_response, "service_tier", service_tier_block["type"]) + return model_response def get_error_class( diff --git a/litellm/llms/bedrock/chat/invoke_handler.py b/litellm/llms/bedrock/chat/invoke_handler.py index 53cbafcbe6a0..1c58a11eebe7 100644 --- a/litellm/llms/bedrock/chat/invoke_handler.py +++ b/litellm/llms/bedrock/chat/invoke_handler.py @@ -51,7 +51,11 @@ ChatCompletionToolCallFunctionChunk, 
ChatCompletionUsageBlock, ) -from litellm.types.utils import ChatCompletionMessageToolCall, Choices, Delta +from litellm.types.utils import ( + ChatCompletionMessageToolCall, + Choices, + Delta, +) from litellm.types.utils import GenericStreamingChunk as GChunk from litellm.types.utils import ( ModelResponse, @@ -69,6 +73,9 @@ max_size_in_memory=50, default_ttl=600 ) from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig +from litellm.llms.bedrock.chat.invoke_transformations.amazon_openai_transformation import ( + AmazonBedrockOpenAIConfig, +) converse_config = AmazonConverseConfig() @@ -185,11 +192,17 @@ async def make_call( fake_stream: bool = False, json_mode: Optional[bool] = False, bedrock_invoke_provider: Optional[litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL] = None, + stream_chunk_size: int = 1024, ): try: if client is None: client = get_async_httpx_client( - llm_provider=litellm.LlmProviders.BEDROCK + llm_provider=litellm.LlmProviders.BEDROCK, + params={"ssl_verify": logging_obj.litellm_params.get("ssl_verify")} + if logging_obj + and logging_obj.litellm_params + and logging_obj.litellm_params.get("ssl_verify") + else None, ) # Create a new client if none provided response = await client.post( @@ -228,7 +241,7 @@ async def make_call( json_mode=json_mode, ) completion_stream = decoder.aiter_bytes( - response.aiter_bytes(chunk_size=1024) + response.aiter_bytes(chunk_size=stream_chunk_size) ) elif bedrock_invoke_provider == "deepseek_r1": decoder = AmazonDeepSeekR1StreamDecoder( @@ -236,12 +249,12 @@ async def make_call( sync_stream=False, ) completion_stream = decoder.aiter_bytes( - response.aiter_bytes(chunk_size=1024) + response.aiter_bytes(chunk_size=stream_chunk_size) ) else: decoder = AWSEventStreamDecoder(model=model) completion_stream = decoder.aiter_bytes( - response.aiter_bytes(chunk_size=1024) + response.aiter_bytes(chunk_size=stream_chunk_size) ) # LOGGING @@ -274,10 +287,17 @@ def make_sync_call( fake_stream: bool = False, json_mode: Optional[bool] = False, bedrock_invoke_provider: Optional[litellm.BEDROCK_INVOKE_PROVIDERS_LITERAL] = None, + stream_chunk_size: int = 1024, ): try: if client is None: - client = _get_httpx_client(params={}) + client = _get_httpx_client( + params={"ssl_verify": logging_obj.litellm_params.get("ssl_verify")} + if logging_obj + and logging_obj.litellm_params + and logging_obj.litellm_params.get("ssl_verify") + else None + ) response = client.post( api_base, @@ -314,16 +334,22 @@ def make_sync_call( sync_stream=True, json_mode=json_mode, ) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + completion_stream = decoder.iter_bytes( + response.iter_bytes(chunk_size=stream_chunk_size) + ) elif bedrock_invoke_provider == "deepseek_r1": decoder = AmazonDeepSeekR1StreamDecoder( model=model, sync_stream=True, ) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + completion_stream = decoder.iter_bytes( + response.iter_bytes(chunk_size=stream_chunk_size) + ) else: decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + completion_stream = decoder.iter_bytes( + response.iter_bytes(chunk_size=stream_chunk_size) + ) # LOGGING logging_obj.post_call( @@ -365,6 +391,29 @@ class BedrockLLM(BaseAWSLLM): def __init__(self) -> None: super().__init__() + @staticmethod + def is_claude_messages_api_model(model: str) -> bool: + """ + Check if the model uses the Claude Messages API (Claude 3+). 
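# Quick checks for the helper described here; the model IDs are examples of the
# patterns it matches, not an exhaustive list.
from litellm.llms.bedrock.chat.invoke_handler import BedrockLLM

assert BedrockLLM.is_claude_messages_api_model("anthropic.claude-3-haiku-20240307-v1:0")
assert BedrockLLM.is_claude_messages_api_model("eu.anthropic.claude-sonnet-4-20250514-v1:0")
assert not BedrockLLM.is_claude_messages_api_model("anthropic.claude-v2:1")  # legacy text-completions model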
+ + Handles: + - Regional prefixes: eu.anthropic.claude-*, us.anthropic.claude-* + - Claude 3 models: claude-3-haiku, claude-3-sonnet, claude-3-opus, claude-3-5-*, claude-3-7-* + - Claude 4 models: claude-opus-4, claude-sonnet-4, claude-haiku-4 + """ + # Normalize model string to lowercase for matching + model_lower = model.lower() + + # Claude 3+ indicators (all use Messages API) + messages_api_indicators = [ + "claude-3", # Claude 3.x models + "claude-opus-4", # Claude Opus 4 + "claude-sonnet-4", # Claude Sonnet 4 + "claude-haiku-4", # Claude Haiku 4 + ] + + return any(indicator in model_lower for indicator in messages_api_indicators) + def convert_messages_to_prompt( self, model, messages, provider, custom_prompt_dict ) -> Tuple[str, Optional[list]]: @@ -397,6 +446,10 @@ def convert_messages_to_prompt( prompt = prompt_factory( model=model, messages=messages, custom_llm_provider="bedrock" ) + elif provider == "openai": + # OpenAI uses messages directly, no prompt conversion needed + # Return empty prompt as it won't be used + prompt = "" elif provider == "cohere": prompt, chat_history = cohere_message_pt(messages=messages) else: @@ -452,7 +505,7 @@ def process_response( # noqa: PLR0915 completion_response["generations"][0]["finish_reason"] ) elif provider == "anthropic": - if model.startswith("anthropic.claude-3"): + if self.is_claude_messages_api_model(model): json_schemas: dict = {} _is_function_call = False ## Handle Tool Calling @@ -493,9 +546,9 @@ def process_response( # noqa: PLR0915 content=None, ) model_response.choices[0].message = _message # type: ignore - model_response._hidden_params["original_response"] = ( - outputText # allow user to access raw anthropic tool calling response - ) + model_response._hidden_params[ + "original_response" + ] = outputText # allow user to access raw anthropic tool calling response if ( _is_function_call is True and stream is not None @@ -574,6 +627,33 @@ def process_response( # noqa: PLR0915 ) elif provider == "meta" or provider == "llama": outputText = completion_response["generation"] + elif provider == "openai": + # OpenAI imported models use OpenAI Chat Completions format + if ( + "choices" in completion_response + and len(completion_response["choices"]) > 0 + ): + choice = completion_response["choices"][0] + if "message" in choice: + outputText = choice["message"].get("content") + elif "text" in choice: # fallback for completion format + outputText = choice["text"] + + # Set finish reason + if "finish_reason" in choice: + model_response.choices[0].finish_reason = map_finish_reason( + choice["finish_reason"] + ) + + # Set usage if available + if "usage" in completion_response: + usage = completion_response["usage"] + _usage = litellm.Usage( + prompt_tokens=usage.get("prompt_tokens", 0), + completion_tokens=usage.get("completion_tokens", 0), + total_tokens=usage.get("total_tokens", 0), + ) + setattr(model_response, "usage", _usage) elif provider == "mistral": outputText = completion_response["outputs"][0]["text"] model_response.choices[0].finish_reason = completion_response[ @@ -637,33 +717,42 @@ def process_response( # noqa: PLR0915 ) ## CALCULATING USAGE - bedrock returns usage in the headers - bedrock_input_tokens = response.headers.get( - "x-amzn-bedrock-input-token-count", None - ) - bedrock_output_tokens = response.headers.get( - "x-amzn-bedrock-output-token-count", None - ) + # Skip if usage was already set (e.g., from JSON response for OpenAI provider) + if ( + not hasattr(model_response, "usage") + or getattr(model_response, "usage", 
None) is None + ): + bedrock_input_tokens = response.headers.get( + "x-amzn-bedrock-input-token-count", None + ) + bedrock_output_tokens = response.headers.get( + "x-amzn-bedrock-output-token-count", None + ) - prompt_tokens = int( - bedrock_input_tokens or litellm.token_counter(messages=messages) - ) + prompt_tokens = int( + bedrock_input_tokens or litellm.token_counter(messages=messages) + ) - completion_tokens = int( - bedrock_output_tokens - or litellm.token_counter( - text=model_response.choices[0].message.content, # type: ignore - count_response_tokens=True, + completion_tokens = int( + bedrock_output_tokens + or litellm.token_counter( + text=model_response.choices[0].message.content, # type: ignore + count_response_tokens=True, + ) ) - ) - model_response.created = int(time.time()) - model_response.model = model - usage = Usage( - prompt_tokens=prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - setattr(model_response, "usage", usage) + model_response.created = int(time.time()) + model_response.model = model + usage = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) + setattr(model_response, "usage", usage) + else: + # Ensure created and model are set even if usage was already set + model_response.created = int(time.time()) + model_response.model = model return model_response @@ -686,14 +775,13 @@ def completion( # noqa: PLR0915 client: Optional[Union[AsyncHTTPHandler, HTTPHandler]] = None, ) -> Union[ModelResponse, CustomStreamWrapper]: try: - from botocore.auth import SigV4Auth - from botocore.awsrequest import AWSRequest from botocore.credentials import Credentials except ImportError: raise ImportError("Missing boto3 to call bedrock. 
Run 'pip install boto3'.") ## SETUP ## stream = optional_params.pop("stream", None) + stream_chunk_size = optional_params.pop("stream_chunk_size", 1024) provider = self.get_bedrock_invoke_provider(model) modelId = self.get_bedrock_model_id( @@ -716,6 +804,7 @@ def completion( # noqa: PLR0915 ) # https://bedrock-runtime.{region_name}.amazonaws.com aws_web_identity_token = optional_params.pop("aws_web_identity_token", None) aws_sts_endpoint = optional_params.pop("aws_sts_endpoint", None) + ssl_verify = optional_params.pop("ssl_verify", None) ### SET REGION NAME ### if aws_region_name is None: @@ -746,6 +835,7 @@ def completion( # noqa: PLR0915 aws_role_name=aws_role_name, aws_web_identity_token=aws_web_identity_token, aws_sts_endpoint=aws_sts_endpoint, + ssl_verify=ssl_verify, ) ### SET RUNTIME ENDPOINT ### @@ -764,8 +854,6 @@ def completion( # noqa: PLR0915 endpoint_url = f"{endpoint_url}/model/{modelId}/invoke" proxy_endpoint_url = f"{proxy_endpoint_url}/model/{modelId}/invoke" - sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name) - prompt, chat_history = self.convert_messages_to_prompt( model, messages, provider, custom_prompt_dict ) @@ -793,12 +881,12 @@ def completion( # noqa: PLR0915 ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in inference_params[k] = v if stream is True: - inference_params["stream"] = ( - True # cohere requires stream = True in inference params - ) + inference_params[ + "stream" + ] = True # cohere requires stream = True in inference params data = json.dumps({"prompt": prompt, **inference_params}) elif provider == "anthropic": - if model.startswith("anthropic.claude-3"): + if self.is_claude_messages_api_model(model): # Separate system prompt from rest of message system_prompt_idx: list[int] = [] system_messages: list[str] = [] @@ -891,6 +979,19 @@ def completion( # noqa: PLR0915 ): # completion(top_k=3) > anthropic_config(top_k=3) <- allows for dynamic variables to be passed in inference_params[k] = v data = json.dumps({"prompt": prompt, **inference_params}) + elif provider == "openai": + ## OpenAI imported models use OpenAI Chat Completions format (messages-based) + # Use AmazonBedrockOpenAIConfig for proper OpenAI transformation + openai_config = AmazonBedrockOpenAIConfig() + supported_params = openai_config.get_supported_openai_params(model=model) + + # Filter to only supported OpenAI params + filtered_params = { + k: v for k, v in inference_params.items() if k in supported_params + } + + # OpenAI uses messages format, not prompt + data = json.dumps({"messages": messages, **filtered_params}) else: ## LOGGING logging_obj.pre_call( @@ -912,15 +1013,14 @@ def completion( # noqa: PLR0915 headers = {"Content-Type": "application/json"} if extra_headers is not None: headers = {"Content-Type": "application/json", **extra_headers} - request = AWSRequest( - method="POST", url=endpoint_url, data=data, headers=headers + prepped = self.get_request_headers( + credentials=credentials, + aws_region_name=aws_region_name, + extra_headers=extra_headers, + endpoint_url=endpoint_url, + data=data, + headers=headers, ) - sigv4.add_auth(request) - if ( - extra_headers is not None and "Authorization" in extra_headers - ): # prevent sigv4 from overwriting the auth header - request.headers["Authorization"] = extra_headers["Authorization"] - prepped = request.prepare() ## LOGGING logging_obj.pre_call( @@ -954,6 +1054,7 @@ def completion( # noqa: PLR0915 headers=prepped.headers, timeout=timeout, client=client, + 
stream_chunk_size=stream_chunk_size, ) # type: ignore ### ASYNC COMPLETION return self.async_completion( @@ -999,7 +1100,9 @@ def completion( # noqa: PLR0915 decoder = AWSEventStreamDecoder(model=model) - completion_stream = decoder.iter_bytes(response.iter_bytes(chunk_size=1024)) + completion_stream = decoder.iter_bytes( + response.iter_bytes(chunk_size=stream_chunk_size) + ) streaming_response = CustomStreamWrapper( completion_stream=completion_stream, model=model, @@ -1119,6 +1222,7 @@ async def async_streaming( logger_fn=None, headers={}, client: Optional[AsyncHTTPHandler] = None, + stream_chunk_size: int = 1024, ) -> CustomStreamWrapper: # The call is not made here; instead, we prepare the necessary objects for the stream. @@ -1134,6 +1238,7 @@ async def async_streaming( messages=messages, logging_obj=logging_obj, fake_stream=True if "ai21" in api_base else False, + stream_chunk_size=stream_chunk_size, ), model=model, custom_llm_provider="bedrock", @@ -1184,6 +1289,7 @@ def __init__(self, model: str) -> None: self.parser = EventStreamJSONParser() self.content_blocks: List[ContentBlockDeltaEvent] = [] self.tool_calls_index: Optional[int] = None + self.response_id: Optional[str] = None def check_empty_tool_call_args(self) -> bool: """ @@ -1245,8 +1351,172 @@ def translate_thinking_blocks( thinking_blocks_list.append(_thinking_block) return thinking_blocks_list + def _initialize_converse_response_id(self, chunk_data: dict): + """Initialize response_id from chunk data if not already set.""" + if self.response_id is None: + if "messageStart" in chunk_data: + conversation_id = chunk_data["messageStart"].get("conversationId") + if conversation_id: + self.response_id = f"chatcmpl-{conversation_id}" + else: + # Fallback to generating a UUID if the first chunk is not messageStart + self.response_id = f"chatcmpl-{uuid.uuid4()}" + + def _handle_converse_start_event( + self, + start_obj: ContentBlockStartEvent, + ) -> Tuple[ + Optional[ChatCompletionToolCallChunk], + dict, + Optional[ + List[ + Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock] + ] + ], + ]: + """Handle 'start' event in converse chunk parsing.""" + tool_use: Optional[ChatCompletionToolCallChunk] = None + provider_specific_fields: dict = {} + thinking_blocks: Optional[ + List[ + Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock] + ] + ] = None + + self.content_blocks = [] # reset + if start_obj is not None: + if "toolUse" in start_obj and start_obj["toolUse"] is not None: + ## check tool name was formatted by litellm + _response_tool_name = start_obj["toolUse"]["name"] + response_tool_name = get_bedrock_tool_name( + response_tool_name=_response_tool_name + ) + self.tool_calls_index = ( + 0 if self.tool_calls_index is None else self.tool_calls_index + 1 + ) + tool_use = { + "id": start_obj["toolUse"]["toolUseId"], + "type": "function", + "function": { + "name": response_tool_name, + "arguments": "", + }, + "index": self.tool_calls_index, + } + elif ( + "reasoningContent" in start_obj + and start_obj["reasoningContent"] is not None + ): # redacted thinking can be in start object + thinking_blocks = self.translate_thinking_blocks( + start_obj["reasoningContent"] + ) + provider_specific_fields = { + "reasoningContent": start_obj["reasoningContent"], + } + return tool_use, provider_specific_fields, thinking_blocks + + def _handle_converse_delta_event( + self, + delta_obj: ContentBlockDeltaEvent, + index: int, + ) -> Tuple[ + str, + Optional[ChatCompletionToolCallChunk], + dict, + Optional[str], 
+ Optional[ + List[ + Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock] + ] + ], + ]: + """Handle 'delta' event in converse chunk parsing.""" + text = "" + tool_use: Optional[ChatCompletionToolCallChunk] = None + provider_specific_fields: dict = {} + reasoning_content: Optional[str] = None + thinking_blocks: Optional[ + List[ + Union[ChatCompletionThinkingBlock, ChatCompletionRedactedThinkingBlock] + ] + ] = None + + self.content_blocks.append(delta_obj) + if "text" in delta_obj: + text = delta_obj["text"] + elif "toolUse" in delta_obj: + tool_use = { + "id": None, + "type": "function", + "function": { + "name": None, + "arguments": delta_obj["toolUse"]["input"], + }, + "index": ( + self.tool_calls_index + if self.tool_calls_index is not None + else index + ), + } + elif "reasoningContent" in delta_obj: + provider_specific_fields = { + "reasoningContent": delta_obj["reasoningContent"], + } + reasoning_content = self.extract_reasoning_content_str( + delta_obj["reasoningContent"] + ) + thinking_blocks = self.translate_thinking_blocks( + delta_obj["reasoningContent"] + ) + if ( + thinking_blocks + and len(thinking_blocks) > 0 + and reasoning_content is None + ): + reasoning_content = ( + "" # set to non-empty string to ensure consistency with Anthropic + ) + elif "citationsContent" in delta_obj: + # Handle Nova grounding citations in streaming responses + provider_specific_fields = { + "citationsContent": delta_obj["citationsContent"], + } + return ( + text, + tool_use, + provider_specific_fields, + reasoning_content, + thinking_blocks, + ) + + def _handle_converse_stop_event( + self, index: int + ) -> Optional[ChatCompletionToolCallChunk]: + """Handle stop/contentBlockIndex event in converse chunk parsing.""" + tool_use: Optional[ChatCompletionToolCallChunk] = None + is_empty = self.check_empty_tool_call_args() + if is_empty: + tool_use = { + "id": None, + "type": "function", + "function": { + "name": None, + "arguments": "{}", + }, + "index": ( + self.tool_calls_index + if self.tool_calls_index is not None + else index + ), + } + return tool_use + def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream: try: + # Capture the conversationId from the first messageStart event + # and use it as the consistent ID for all subsequent chunks. 
+ self._initialize_converse_response_id(chunk_data) + verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data)) text = "" tool_use: Optional[ChatCompletionToolCallChunk] = None @@ -1262,94 +1532,27 @@ def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream: ] ] = None - index = int(chunk_data.get("contentBlockIndex", 0)) + content_block_index = int(chunk_data.get("contentBlockIndex", 0)) if "start" in chunk_data: start_obj = ContentBlockStartEvent(**chunk_data["start"]) - self.content_blocks = [] # reset - if start_obj is not None: - if "toolUse" in start_obj and start_obj["toolUse"] is not None: - ## check tool name was formatted by litellm - _response_tool_name = start_obj["toolUse"]["name"] - response_tool_name = get_bedrock_tool_name( - response_tool_name=_response_tool_name - ) - self.tool_calls_index = ( - 0 - if self.tool_calls_index is None - else self.tool_calls_index + 1 - ) - tool_use = { - "id": start_obj["toolUse"]["toolUseId"], - "type": "function", - "function": { - "name": response_tool_name, - "arguments": "", - }, - "index": self.tool_calls_index, - } - elif ( - "reasoningContent" in start_obj - and start_obj["reasoningContent"] is not None - ): # redacted thinking can be in start object - thinking_blocks = self.translate_thinking_blocks( - start_obj["reasoningContent"] - ) - provider_specific_fields = { - "reasoningContent": start_obj["reasoningContent"], - } + ( + tool_use, + provider_specific_fields, + thinking_blocks, + ) = self._handle_converse_start_event(start_obj) elif "delta" in chunk_data: delta_obj = ContentBlockDeltaEvent(**chunk_data["delta"]) - self.content_blocks.append(delta_obj) - if "text" in delta_obj: - text = delta_obj["text"] - elif "toolUse" in delta_obj: - tool_use = { - "id": None, - "type": "function", - "function": { - "name": None, - "arguments": delta_obj["toolUse"]["input"], - }, - "index": ( - self.tool_calls_index - if self.tool_calls_index is not None - else index - ), - } - elif "reasoningContent" in delta_obj: - provider_specific_fields = { - "reasoningContent": delta_obj["reasoningContent"], - } - reasoning_content = self.extract_reasoning_content_str( - delta_obj["reasoningContent"] - ) - thinking_blocks = self.translate_thinking_blocks( - delta_obj["reasoningContent"] - ) - if ( - thinking_blocks - and len(thinking_blocks) > 0 - and reasoning_content is None - ): - reasoning_content = "" # set to non-empty string to ensure consistency with Anthropic + ( + text, + tool_use, + provider_specific_fields, + reasoning_content, + thinking_blocks, + ) = self._handle_converse_delta_event(delta_obj, content_block_index) elif ( "contentBlockIndex" in chunk_data ): # stop block, no 'start' or 'delta' object - is_empty = self.check_empty_tool_call_args() - if is_empty: - tool_use = { - "id": None, - "type": "function", - "function": { - "name": None, - "arguments": "{}", - }, - "index": ( - self.tool_calls_index - if self.tool_calls_index is not None - else index - ), - } + tool_use = self._handle_converse_stop_event(content_block_index) elif "stopReason" in chunk_data: finish_reason = map_finish_reason(chunk_data.get("stopReason", "stop")) elif "usage" in chunk_data: @@ -1363,7 +1566,7 @@ def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream: choices=[ StreamingChoices( finish_reason=finish_reason, - index=index, + index=0, # Always 0 - Bedrock never returns multiple choices delta=Delta( content=text, role="assistant", @@ -1378,6 +1581,8 @@ def converse_chunk_parser(self, chunk_data: dict) -> 
ModelResponseStream: ), ) ], + id=self.response_id, + model=self.model, usage=usage, provider_specific_fields=model_response_provider_specific_fields, ) diff --git a/litellm/llms/bedrock/chat/invoke_transformations/amazon_moonshot_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/amazon_moonshot_transformation.py new file mode 100644 index 000000000000..e53410760dd4 --- /dev/null +++ b/litellm/llms/bedrock/chat/invoke_transformations/amazon_moonshot_transformation.py @@ -0,0 +1,256 @@ +""" +Transformation for Bedrock Moonshot AI (Kimi K2) models. + +Supports the Kimi K2 Thinking model available on Amazon Bedrock. +Model format: bedrock/moonshot.kimi-k2-thinking-v1:0 + +Reference: https://aws.amazon.com/about-aws/whats-new/2025/12/amazon-bedrock-fully-managed-open-weight-models/ +""" + +from typing import TYPE_CHECKING, Any, List, Optional, Union +import re + +import httpx + +from litellm.llms.bedrock.chat.invoke_transformations.base_invoke_transformation import ( + AmazonInvokeConfig, +) +from litellm.llms.bedrock.common_utils import BedrockError +from litellm.llms.moonshot.chat.transformation import MoonshotChatConfig +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import Choices + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + from litellm.types.utils import ModelResponse + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + + +class AmazonMoonshotConfig(AmazonInvokeConfig, MoonshotChatConfig): + """ + Configuration for Bedrock Moonshot AI (Kimi K2) models. + + Reference: + https://aws.amazon.com/about-aws/whats-new/2025/12/amazon-bedrock-fully-managed-open-weight-models/ + https://platform.moonshot.ai/docs/api/chat + + Supported Params for the Amazon / Moonshot models: + - `max_tokens` (integer) max tokens + - `temperature` (float) temperature for model (0-1 for Moonshot) + - `top_p` (float) top p for model + - `stream` (bool) whether to stream responses + - `tools` (list) tool definitions (supported on kimi-k2-thinking) + - `tool_choice` (str|dict) tool choice specification (supported on kimi-k2-thinking) + + NOT Supported on Bedrock: + - `stop` sequences (Bedrock doesn't support stopSequences field for this model) + + Note: The kimi-k2-thinking model DOES support tool calls, unlike kimi-thinking-preview. + """ + + def __init__(self, **kwargs): + AmazonInvokeConfig.__init__(self, **kwargs) + MoonshotChatConfig.__init__(self, **kwargs) + + @property + def custom_llm_provider(self) -> Optional[str]: + return "bedrock" + + def _get_model_id(self, model: str) -> str: + """ + Extract the actual model ID from the LiteLLM model name. + + Removes routing prefixes like: + - bedrock/invoke/moonshot.kimi-k2-thinking -> moonshot.kimi-k2-thinking + - invoke/moonshot.kimi-k2-thinking -> moonshot.kimi-k2-thinking + - moonshot.kimi-k2-thinking -> moonshot.kimi-k2-thinking + """ + # Remove bedrock/ prefix if present + if model.startswith("bedrock/"): + model = model[8:] + + # Remove invoke/ prefix if present + if model.startswith("invoke/"): + model = model[7:] + + # Remove any provider prefix (e.g., moonshot/) + if "/" in model and not model.startswith("arn:"): + parts = model.split("/", 1) + if len(parts) == 2: + model = parts[1] + + return model + + def get_supported_openai_params(self, model: str) -> List[str]: + """ + Get the supported OpenAI params for Moonshot AI models on Bedrock. 
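# Illustrative call for the Kimi K2 Thinking route handled by this config. The model
# string follows the "Model format" note in the module docstring; the kwargs are examples.
import litellm

response = litellm.completion(
    model="bedrock/moonshot.kimi-k2-thinking-v1:0",
    messages=[{"role": "user", "content": "Explain eventual consistency in one paragraph."}],
    max_tokens=512,    # supported
    temperature=0.6,   # Moonshot expects a [0, 1] range
    # stop=["\n\n"],   # not supported on Bedrock for this model (see the limitations below)
)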
+ + Bedrock-specific limitations: + - stopSequences field is not supported on Bedrock (unlike native Moonshot API) + - functions parameter is not supported (use tools instead) + - tool_choice doesn't support "required" value + + Note: kimi-k2-thinking DOES support tool calls (unlike kimi-thinking-preview) + The parent MoonshotChatConfig class handles the kimi-thinking-preview exclusion. + """ + excluded_params: List[str] = ["functions", "stop"] # Bedrock doesn't support stopSequences + + base_openai_params = super(MoonshotChatConfig, self).get_supported_openai_params(model=model) + final_params: List[str] = [] + for param in base_openai_params: + if param not in excluded_params: + final_params.append(param) + + return final_params + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + """ + Map OpenAI parameters to Moonshot AI parameters for Bedrock. + + Handles Moonshot AI specific limitations: + - tool_choice doesn't support "required" value + - Temperature <0.3 limitation for n>1 + - Temperature range is [0, 1] (not [0, 2] like OpenAI) + """ + return MoonshotChatConfig.map_openai_params( + self, + non_default_params=non_default_params, + optional_params=optional_params, + model=model, + drop_params=drop_params, + ) + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform the request for Bedrock Moonshot AI models. + + Uses the Moonshot transformation logic which handles: + - Converting content lists to strings (Moonshot doesn't support list format) + - Adding tool_choice="required" message if needed + - Temperature and parameter validation + + """ + # Filter out AWS credentials using the existing method from BaseAWSLLM + self._get_boto_credentials_from_optional_params(optional_params, model) + + # Strip routing prefixes to get the actual model ID + clean_model_id = self._get_model_id(model) + + # Use Moonshot's transform_request which handles message transformation + # and tool_choice="required" workaround + return MoonshotChatConfig.transform_request( + self, + model=clean_model_id, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + headers=headers, + ) + + def _extract_reasoning_from_content(self, content: str) -> tuple[Optional[str], str]: + """ + Extract reasoning content from tags in the response. + + Moonshot AI's Kimi K2 Thinking model returns reasoning in tags. + This method extracts that content and returns it separately. + + Args: + content: The full content string from the API response + + Returns: + tuple: (reasoning_content, main_content) + """ + if not content: + return None, content + + # Match ... tags + reasoning_match = re.match( + r"(.*?)\s*(.*)", + content, + re.DOTALL + ) + + if reasoning_match: + reasoning_content = reasoning_match.group(1).strip() + main_content = reasoning_match.group(2).strip() + return reasoning_content, main_content + + return None, content + + def transform_response( + self, + model: str, + raw_response: httpx.Response, + model_response: "ModelResponse", + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> "ModelResponse": + """ + Transform the response from Bedrock Moonshot AI models. 
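# Minimal standalone sketch of the "split reasoning from visible content" step referenced
# here. It assumes the reasoning is wrapped in <think>...</think> tags; the exact tag
# literal emitted by the model is an assumption, not something this patch confirms.
import re
from typing import Optional, Tuple

def split_reasoning(content: str) -> Tuple[Optional[str], str]:
    match = re.match(r"<think>(.*?)</think>\s*(.*)", content, re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    return None, content

reasoning, answer = split_reasoning("<think>Compare both options first.</think>Option B is cheaper.")
# reasoning -> "Compare both options first."; answer -> "Option B is cheaper."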
+ + Moonshot AI uses OpenAI-compatible response format, but returns reasoning + content in tags. This method: + 1. Calls parent class transformation + 2. Extracts reasoning content from tags + 3. Sets reasoning_content on the message object + """ + # First, get the standard transformation + model_response = MoonshotChatConfig.transform_response( + self, + model=model, + raw_response=raw_response, + model_response=model_response, + logging_obj=logging_obj, + request_data=request_data, + messages=messages, + optional_params=optional_params, + litellm_params=litellm_params, + encoding=encoding, + api_key=api_key, + json_mode=json_mode, + ) + + # Extract reasoning content from tags + if model_response.choices and len(model_response.choices) > 0: + for choice in model_response.choices: + # Only process Choices (not StreamingChoices) which have message attribute + if isinstance(choice, Choices) and choice.message and choice.message.content: + reasoning_content, main_content = self._extract_reasoning_from_content( + choice.message.content + ) + + if reasoning_content: + # Set the reasoning_content field + choice.message.reasoning_content = reasoning_content + # Update the main content without reasoning tags + choice.message.content = main_content + + return model_response + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] + ) -> BedrockError: + """Return the appropriate error class for Bedrock.""" + return BedrockError(status_code=status_code, message=error_message) diff --git a/litellm/llms/bedrock/chat/invoke_transformations/amazon_nova_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/amazon_nova_transformation.py index a81d55f0ad22..3506c8f1cc07 100644 --- a/litellm/llms/bedrock/chat/invoke_transformations/amazon_nova_transformation.py +++ b/litellm/llms/bedrock/chat/invoke_transformations/amazon_nova_transformation.py @@ -10,7 +10,6 @@ import httpx -import litellm from litellm.litellm_core_utils.litellm_logging import Logging from litellm.types.llms.bedrock import BedrockInvokeNovaRequest from litellm.types.llms.openai import AllMessageValues @@ -80,7 +79,7 @@ def transform_response( encoding: Any, api_key: Optional[str] = None, json_mode: Optional[bool] = None, - ) -> litellm.ModelResponse: + ) -> ModelResponse: return AmazonConverseConfig.transform_response( self, model, diff --git a/litellm/llms/bedrock/chat/invoke_transformations/amazon_openai_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/amazon_openai_transformation.py new file mode 100644 index 000000000000..ee07b71ef154 --- /dev/null +++ b/litellm/llms/bedrock/chat/invoke_transformations/amazon_openai_transformation.py @@ -0,0 +1,186 @@ +""" +Transformation for Bedrock imported models that use OpenAI Chat Completions format. + +Use this for models imported into Bedrock that accept the OpenAI API format. 
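# Illustrative usage of the routing prefix documented in this module. The ARN is the
# placeholder from the docstring above; the kwargs are ordinary OpenAI-style params.
import litellm

response = litellm.completion(
    model="bedrock/openai/arn:aws:bedrock:us-east-1:123456789012:imported-model/abc123",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=128,
)
print(response.choices[0].message.content)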
+Model format: bedrock/openai/ + +Example: bedrock/openai/arn:aws:bedrock:us-east-1:123456789012:imported-model/abc123 +""" + +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import httpx + +from litellm.llms.bedrock.base_aws_llm import BaseAWSLLM +from litellm.llms.bedrock.common_utils import BedrockError +from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig +from litellm.types.llms.openai import AllMessageValues + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + + +class AmazonBedrockOpenAIConfig(OpenAIGPTConfig, BaseAWSLLM): + """ + Configuration for Bedrock imported models that use OpenAI Chat Completions format. + + This class handles the transformation of requests and responses for Bedrock + imported models that accept the OpenAI API format directly. + + Inherits from OpenAIGPTConfig to leverage standard OpenAI parameter handling + and response transformation, while adding Bedrock-specific URL generation + and AWS request signing. + + Usage: + model = "bedrock/openai/arn:aws:bedrock:us-east-1:123456789012:imported-model/abc123" + """ + + def __init__(self, **kwargs): + OpenAIGPTConfig.__init__(self, **kwargs) + BaseAWSLLM.__init__(self, **kwargs) + + @property + def custom_llm_provider(self) -> Optional[str]: + return "bedrock" + + def _get_openai_model_id(self, model: str) -> str: + """ + Extract the actual model ID from the LiteLLM model name. + + Input format: bedrock/openai/ + Returns: + """ + # Remove bedrock/ prefix if present + if model.startswith("bedrock/"): + model = model[8:] + + # Remove openai/ prefix + if model.startswith("openai/"): + model = model[7:] + + return model + + def get_complete_url( + self, + api_base: Optional[str], + api_key: Optional[str], + model: str, + optional_params: dict, + litellm_params: dict, + stream: Optional[bool] = None, + ) -> str: + """ + Get the complete URL for the Bedrock invoke endpoint. + + Uses the standard Bedrock invoke endpoint format. + """ + model_id = self._get_openai_model_id(model) + + # Get AWS region + aws_region_name = self._get_aws_region_name( + optional_params=optional_params, model=model + ) + + # Get runtime endpoint + aws_bedrock_runtime_endpoint = optional_params.get( + "aws_bedrock_runtime_endpoint", None + ) + endpoint_url, proxy_endpoint_url = self.get_runtime_endpoint( + api_base=api_base, + aws_bedrock_runtime_endpoint=aws_bedrock_runtime_endpoint, + aws_region_name=aws_region_name, + ) + + # Build the invoke URL + if stream: + endpoint_url = f"{endpoint_url}/model/{model_id}/invoke-with-response-stream" + else: + endpoint_url = f"{endpoint_url}/model/{model_id}/invoke" + + return endpoint_url + + def sign_request( + self, + headers: dict, + optional_params: dict, + request_data: dict, + api_base: str, + api_key: Optional[str] = None, + model: Optional[str] = None, + stream: Optional[bool] = None, + fake_stream: Optional[bool] = None, + ) -> Tuple[dict, Optional[bytes]]: + """ + Sign the request using AWS Signature Version 4. 
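# Standalone sketch of the endpoint selection implemented in get_complete_url above
# (region, runtime endpoint, and model ID are placeholders).
def build_invoke_url(runtime_endpoint: str, model_id: str, stream: bool) -> str:
    suffix = "invoke-with-response-stream" if stream else "invoke"
    return f"{runtime_endpoint}/model/{model_id}/{suffix}"

print(build_invoke_url(
    "https://bedrock-runtime.us-east-1.amazonaws.com",
    "arn:aws:bedrock:us-east-1:123456789012:imported-model/abc123",
    stream=True,
))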
+ """ + return self._sign_request( + service_name="bedrock", + headers=headers, + optional_params=optional_params, + request_data=request_data, + api_base=api_base, + api_key=api_key, + model=model, + stream=stream, + fake_stream=fake_stream, + ) + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + """ + Transform the request to OpenAI Chat Completions format for Bedrock imported models. + + Removes AWS-specific params and stream param (handled separately in URL), + then delegates to parent class for standard OpenAI request transformation. + """ + # Remove stream from optional_params as it's handled separately in URL + optional_params.pop("stream", None) + + # Remove AWS-specific params that shouldn't be in the request body + inference_params = { + k: v + for k, v in optional_params.items() + if k not in self.aws_authentication_params + } + + # Use parent class transform_request for OpenAI format + return super().transform_request( + model=self._get_openai_model_id(model), + messages=messages, + optional_params=inference_params, + litellm_params=litellm_params, + headers=headers, + ) + + def validate_environment( + self, + headers: dict, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + ) -> dict: + """ + Validate the environment and return headers. + + For Bedrock, we don't need Bearer token auth since we use AWS SigV4. + """ + return headers + + def get_error_class( + self, error_message: str, status_code: int, headers: Union[dict, httpx.Headers] + ) -> BedrockError: + """Return the appropriate error class for Bedrock.""" + return BedrockError(status_code=status_code, message=error_message) diff --git a/litellm/llms/bedrock/chat/invoke_transformations/amazon_qwen2_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/amazon_qwen2_transformation.py new file mode 100644 index 000000000000..c532d8ea27cf --- /dev/null +++ b/litellm/llms/bedrock/chat/invoke_transformations/amazon_qwen2_transformation.py @@ -0,0 +1,98 @@ +""" +Handles transforming requests for `bedrock/invoke/{qwen2} models` + +Inherits from `AmazonQwen3Config` since Qwen2 and Qwen3 architectures are mostly similar. +The main difference is in the response format: Qwen2 uses "text" field while Qwen3 uses "generation" field. + +Qwen2 + Invoke API Tutorial: https://docs.aws.amazon.com/bedrock/latest/userguide/invoke-imported-model.html +""" + +from typing import Any, List, Optional + +import httpx + +from litellm.llms.bedrock.chat.invoke_transformations.amazon_qwen3_transformation import ( + AmazonQwen3Config, +) +from litellm.llms.bedrock.chat.invoke_transformations.base_invoke_transformation import ( + LiteLLMLoggingObj, +) +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import ModelResponse + + +class AmazonQwen2Config(AmazonQwen3Config): + """ + Config for sending `qwen2` requests to `/bedrock/invoke/` + + Inherits from AmazonQwen3Config since Qwen2 and Qwen3 architectures are mostly similar. + The main difference is in the response format: Qwen2 uses "text" field while Qwen3 uses "generation" field. 
+ + Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/invoke-imported-model.html + """ + + def transform_response( + self, + model: str, + raw_response: httpx.Response, + model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, + request_data: dict, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + encoding: Any, + api_key: Optional[str] = None, + json_mode: Optional[bool] = None, + ) -> ModelResponse: + """ + Transform Qwen2 Bedrock response to OpenAI format + + Qwen2 uses "text" field, but we also support "generation" field for compatibility. + """ + try: + if hasattr(raw_response, 'json'): + response_data = raw_response.json() + else: + response_data = raw_response + + # Extract the generated text - Qwen2 uses "text" field, but also support "generation" for compatibility + generated_text = response_data.get("generation", "") or response_data.get("text", "") + + # Clean up the response (remove assistant start token if present) + if generated_text.startswith("<|im_start|>assistant\n"): + generated_text = generated_text[len("<|im_start|>assistant\n"):] + if generated_text.endswith("<|im_end|>"): + generated_text = generated_text[:-len("<|im_end|>")] + + # Set the content in the existing model_response structure + if hasattr(model_response, 'choices') and len(model_response.choices) > 0: + choice = model_response.choices[0] + if hasattr(choice, 'message'): + choice.message.content = generated_text + choice.finish_reason = "stop" + else: + # Handle streaming choices + choice.delta.content = generated_text + choice.finish_reason = "stop" + + # Set usage information if available in response + if "usage" in response_data: + usage_data = response_data["usage"] + if hasattr(model_response, 'usage'): + model_response.usage.prompt_tokens = usage_data.get("prompt_tokens", 0) + model_response.usage.completion_tokens = usage_data.get("completion_tokens", 0) + model_response.usage.total_tokens = usage_data.get("total_tokens", 0) + + return model_response + + except Exception as e: + if logging_obj: + logging_obj.post_call( + input=messages, + api_key=api_key, + original_response=raw_response, + additional_args={"error": str(e)}, + ) + raise e + diff --git a/litellm/llms/bedrock/chat/invoke_transformations/amazon_twelvelabs_pegasus_transformation.py b/litellm/llms/bedrock/chat/invoke_transformations/amazon_twelvelabs_pegasus_transformation.py new file mode 100644 index 000000000000..62e98f7472fa --- /dev/null +++ b/litellm/llms/bedrock/chat/invoke_transformations/amazon_twelvelabs_pegasus_transformation.py @@ -0,0 +1,280 @@ +""" +Transforms OpenAI-style requests into TwelveLabs Pegasus 1.2 requests for Bedrock. 
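+
+Illustrative transformation performed by this module (all names and values below are
+examples, not a fixed contract):
+
+    # OpenAI-style inputs
+    messages = [{"role": "user", "content": "Summarize the clip"}]
+    optional_params = {"maxOutputTokens": 256, "video_s3_uri": "s3://my-bucket/clip.mp4"}
+
+    # Bedrock InvokeModel body built by transform_request()
+    {
+        "inputPrompt": "...prompt text flattened from the chat messages...",
+        "mediaSource": {"s3Location": {"uri": "s3://my-bucket/clip.mp4"}},
+        "maxOutputTokens": 256,
+    }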
+ +Reference: +https://docs.twelvelabs.io/docs/models/pegasus +""" + +import json +import time +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +import httpx + +import litellm +from litellm._logging import verbose_logger +from litellm.litellm_core_utils.core_helpers import map_finish_reason +from litellm.llms.base_llm.base_utils import type_to_response_format_param +from litellm.llms.base_llm.chat.transformation import BaseConfig +from litellm.llms.bedrock.chat.invoke_transformations.base_invoke_transformation import ( + AmazonInvokeConfig, +) +from litellm.llms.bedrock.common_utils import BedrockError +from litellm.types.llms.openai import AllMessageValues +from litellm.types.utils import ModelResponse, Usage +from litellm.utils import get_base64_str + +if TYPE_CHECKING: + from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj + + LiteLLMLoggingObj = _LiteLLMLoggingObj +else: + LiteLLMLoggingObj = Any + + +class AmazonTwelveLabsPegasusConfig(AmazonInvokeConfig, BaseConfig): + """ + Handles transforming OpenAI-style requests into Bedrock InvokeModel requests for + `twelvelabs.pegasus-1-2-v1:0`. + + Pegasus 1.2 requires an `inputPrompt` and a `mediaSource` that either references + an S3 object or a base64-encoded clip. Optional OpenAI params (temperature, + response_format, max_tokens) are translated to the TwelveLabs schema. + """ + + def get_supported_openai_params(self, model: str) -> List[str]: + return [ + "max_tokens", + "max_completion_tokens", + "temperature", + "response_format", + ] + + def map_openai_params( + self, + non_default_params: dict, + optional_params: dict, + model: str, + drop_params: bool, + ) -> dict: + for param, value in non_default_params.items(): + if param in {"max_tokens", "max_completion_tokens"}: + optional_params["maxOutputTokens"] = value + if param == "temperature": + optional_params["temperature"] = value + if param == "response_format": + optional_params["responseFormat"] = self._normalize_response_format( + value + ) + return optional_params + + def _normalize_response_format(self, value: Any) -> Any: + """Normalize response_format to TwelveLabs format. 
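+
+        A worked example (the schema contents are illustrative only): an OpenAI-style value
+
+            {"type": "json_schema", "json_schema": {"name": "clip_summary", "schema": {"type": "object"}}}
+
+        is rewritten to
+
+            {"jsonSchema": {"type": "object"}}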
+ + TwelveLabs expects: + { + "jsonSchema": {...} + } + + But OpenAI format is: + { + "type": "json_schema", + "json_schema": { + "name": "...", + "schema": {...} + } + } + """ + if isinstance(value, dict): + # If it has json_schema field, extract and transform it + if "json_schema" in value: + json_schema = value["json_schema"] + # Extract the schema if nested + if isinstance(json_schema, dict) and "schema" in json_schema: + return {"jsonSchema": json_schema["schema"]} + # Otherwise use json_schema directly + return {"jsonSchema": json_schema} + # If it already has jsonSchema, return as is + if "jsonSchema" in value: + return value + # Otherwise return the dict as is + return value + return type_to_response_format_param(response_format=value) or value + + def transform_request( + self, + model: str, + messages: List[AllMessageValues], + optional_params: dict, + litellm_params: dict, + headers: dict, + ) -> dict: + input_prompt = self._convert_messages_to_prompt(messages=messages) + request_data: Dict[str, Any] = {"inputPrompt": input_prompt} + + media_source = self._build_media_source(optional_params) + if media_source is not None: + request_data["mediaSource"] = media_source + + # Handle temperature and maxOutputTokens + for key in ("temperature", "maxOutputTokens"): + if key in optional_params: + request_data[key] = optional_params.get(key) + + # Handle responseFormat - transform to TwelveLabs format + if "responseFormat" in optional_params: + response_format = optional_params["responseFormat"] + transformed_format = self._normalize_response_format(response_format) + if transformed_format: + request_data["responseFormat"] = transformed_format + + return request_data + + def _build_media_source(self, optional_params: dict) -> Optional[dict]: + direct_source = optional_params.get("mediaSource") or optional_params.get( + "media_source" + ) + if isinstance(direct_source, dict): + return direct_source + + base64_input = optional_params.get("video_base64") or optional_params.get( + "base64_string" + ) + if base64_input: + return {"base64String": get_base64_str(base64_input)} + + s3_uri = ( + optional_params.get("video_s3_uri") + or optional_params.get("s3_uri") + or optional_params.get("media_source_s3_uri") + ) + if s3_uri: + s3_location = {"uri": s3_uri} + bucket_owner = ( + optional_params.get("video_s3_bucket_owner") + or optional_params.get("s3_bucket_owner") + or optional_params.get("media_source_bucket_owner") + ) + if bucket_owner: + s3_location["bucketOwner"] = bucket_owner + return {"s3Location": s3_location} + return None + + def _convert_messages_to_prompt(self, messages: List[AllMessageValues]) -> str: + prompt_parts: List[str] = [] + for message in messages: + role = message.get("role", "user") + content = message.get("content", "") + if isinstance(content, list): + text_fragments = [] + for item in content: + if isinstance(item, dict): + item_type = item.get("type") + if item_type == "text": + text_fragments.append(item.get("text", "")) + elif item_type == "image_url": + text_fragments.append("") + elif item_type == "video_url": + text_fragments.append("