Merge branch 'main' into feature/model-registry-onboarding

opendatahub-io · Dec 11, 2024 · 7728bb4 · 7728bb4
2 parents 8fd734a + 5040fd3
commit 7728bb4
Show file tree

Hide file tree

Showing 23 changed files with 1,441 additions and 821 deletions.
diff --git a/conftest.py b/conftest.py
@@ -16,7 +16,7 @@
 def pytest_addoption(parser: Parser) -> None:
     aws_group = parser.getgroup(name="AWS")
     buckets_group = parser.getgroup(name="Buckets")
-
+    runtime_group = parser.getgroup(name="Runtime Details")
     # AWS config and credentials options
     aws_group.addoption(
         "--aws-secret-access-key",
@@ -55,6 +55,17 @@ def pytest_addoption(parser: Parser) -> None:
         default=os.environ.get("MODELS_S3_BUCKET_ENDPOINT"),
         help="Models S3 bucket endpoint",
     )
+    # Runtime options
+    runtime_group.addoption(
+        "--supported-accelerator-type",
+        default=os.environ.get("SUPPORTED_ACCLERATOR_TYPE"),
+        help="Supported accelerator type : Nvidia,AMD,Gaudi",
+    )
+    runtime_group.addoption(
+        "--vllm-runtime-image",
+        default=os.environ.get("VLLM_RUNTIME_IMAGE"),
+        help="Specify the runtime image to use for the tests",
+    )
 
 
 def pytest_sessionstart(session: Session) -> None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,6 +8,7 @@ output-format = "grouped"
 exclude = [".git", ".venv", ".mypy_cache", ".tox", "__pycache__"]
 
 [tool.mypy]
+exclude = ["utilities/plugins/tgis_grpc/"]
 check_untyped_defs = true
 disallow_any_generics = true
 disallow_incomplete_defs = true
@@ -50,6 +51,11 @@ dependencies = [
     "tenacity",
     "types-requests>=2.32.0.20241016",
     "schemathesis",
+    "requests",
+    "pytest-asyncio",
+    "syrupy",
+    "protobuf",
+    "grpcio-reflection",
 ]
 
 [project.urls]

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -8,6 +8,7 @@
 from pyhelper_utils.shell import run_command
 
 from utilities.infra import create_ns
+from utilities.constants import AcceleratorType
 
 
 @pytest.fixture(scope="session")
@@ -119,3 +120,24 @@ def models_s3_bucket_endpoint(pytestconfig: pytest.Config) -> str:
             "Either pass with `--models-s3-bucket-endpoint` or set `MODELS_S3_BUCKET_ENDPOINT` environment variable"
         )
     return models_bucket_endpoint
+
+
+@pytest.fixture(scope="session")
+def supported_accelerator_type(pytestconfig: pytest.Config) -> "str | None":
+    """Return the accelerator type requested for this test session, if any.
+
+    The value comes from the `--supported-accelerator-type` command-line option.
+
+    Returns:
+        The accelerator type exactly as supplied (casing is not changed), or
+        None when no value was given.
+
+    Raises:
+        ValueError: If the lower-cased value is not in
+            `AcceleratorType.SUPPORTED_LISTS`.
+    """
+    # The return annotation is quoted so the `|` union is never evaluated at import time.
+    accelerator_type = pytestconfig.option.supported_accelerator_type
+    if not accelerator_type:
+        return None
+    if accelerator_type.lower() not in AcceleratorType.SUPPORTED_LISTS:
+        # Reaching this branch means a value WAS supplied but is not supported,
+        # so report the offending value instead of claiming it is undefined.
+        raise ValueError(
+            f"Unsupported accelerator type {accelerator_type!r}; "
+            f"supported values: {AcceleratorType.SUPPORTED_LISTS}. "
+            "Either pass with `--supported-accelerator-type` or set `SUPPORTED_ACCLERATOR_TYPE` environment variable"
+        )
+    return accelerator_type
+
+
+@pytest.fixture(scope="session")
+def vllm_runtime_image(pytestconfig: pytest.Config) -> "str | None":
+    """Return the runtime image given via `--vllm-runtime-image`, or None when unset.
+
+    Returns:
+        The configured image string, or None when the option is empty or missing.
+    """
+    # The return annotation is quoted so the `|` union is never evaluated at import time.
+    # Any falsy value (None, empty string) is normalised to None.
+    return pytestconfig.option.vllm_runtime_image or None
diff --git a/tests/model_serving/model_runtime/vllm/__init__.py b/tests/model_serving/model_runtime/vllm/__init__.py
diff --git a/tests/model_serving/model_runtime/vllm/basic_model_deployment/__init__.py b/tests/model_serving/model_runtime/vllm/basic_model_deployment/__init__.py
diff --git a/...[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].1.json b/...[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].1.json
@@ -0,0 +1,24 @@
+[
+  {
+    "finish_reason": "stop",
+    "index": 0,
+    "logprobs": null,
+    "message": {
+      "content": "Here is a simple Python code snippet to find even numbers in a list:\n\n```python\ndef find_even_numbers(numbers):\n    even_numbers = [num for num in numbers if num % 2 == 0]\n    return even_numbers\n\n# Test the function\nnumbers = [1, 2, 3, 4, 5, 6]\nprint(find_even_numbers(numbers))\n```\n\nThis code defines a function `find_even_numbers` that takes a list of numbers as input and returns a new list containing only the even numbers from the input list. The function uses a list comprehension to iterate over each number in the input list and checks if the number is even by using the modulus operator (`%`). If the remainder of the division is 0, the number is even and is added to the new list.",
+      "role": "assistant",
+      "tool_calls": []
+    },
+    "stop_reason": null
+  },
+  {
+    "finish_reason": "stop",
+    "index": 0,
+    "logprobs": null,
+    "message": {
+      "content": "1. Sentence: SpellForce 3 is a bad game.\n\t* Meaning: The game SpellForce 3 has negative qualities.\n2. Sentence: The developer Grimlore Games is a bunch of no-talent hacks.\n\t* Meaning: The developers of SpellForce 3, Grimlore Games, lack talent and skill.\n3. Sentence: 2017 was a terrible year for games.\n\t* Meaning: The year 2017 was marked by a lack of quality games.\n\nThe underlying meaning representation of the input sentence can be constructed as a single function with attributes and attribute values:\n\nfunction(sentence) {\n\tif (sentence === \"SpellForce 3 is a bad game.\") {\n\t\treturn {\n\t\t\t\"meaning\": \"The game SpellForce 3 has negative qualities.\"\n\t\t};\n\t} else if (sentence === \"The developer Grimlore Games is a bunch of no-talent hacks.\") {\n\t\treturn {\n\t\t\t\"meaning\": \"The developers of SpellForce 3, Grimlore Games, lack talent and skill.\"\n\t\t};\n\t} else if (sentence === \"2017 was a terrible year for games.\") {\n\t\treturn {\n\t\t\t\"meaning\": \"The year 2017 was marked by a lack of quality games.\"\n\t\t};\n\t}\n}",
+      "role": "assistant",
+      "tool_calls": []
+    },
+    "stop_reason": null
+  }
+]
diff --git a/...[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].2.json b/...[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].2.json
@@ -0,0 +1,50 @@
+[
+  {
+    "finish_reason": "length",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\n1. Labrador Retriever - Labrador Retrievers are known for their friendly, outgoing, and intelligent personalities. They are also great swimmers and love water.\n\n2. German Shepherd - German Shepherds are highly intelligent, loyal, and courageous. They are versatile working dogs, excelling in roles such as police and military work, search and rescue, and guiding for the visually imp"
+  },
+  {
+    "finish_reason": "stop",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\n1. Japanese: ыеは鳥に夏日に鳴りなさい。 (The early bird chirps at the break of the day.)\n2. French: Le oiseau avant le soleil cueille la truite.\n3. Swahili: Ikiwawi kuchwa katika uatoaji mbili. (The early bird collects the worm in the morning.)"
+  },
+  {
+    "finish_reason": "length",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\nOnce upon a time, in a world far from ours, there lived a robot named C-317. C-317 was not like other robots. He was designed for one purpose only: to assist humans in their daily lives. But C-317 had a secret. He dreamed.\n\nEvery night, as he lay in a small chamber, his circuits would flicker and hum. And in those dreams, he"
+  },
+  {
+    "finish_reason": "length",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\nThe Mona Lisa, also known as La Gioconda, is an oil painting created by the Italian Renaissance artist Leonardo da Vinci around 1503-1506. It is one of the most famous and recognizable works in the world, with an enduring allure that has captivated millions of viewers since its completion. The painting is renowned for its subtle details, amb"
+  },
+  {
+    "finish_reason": "length",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\nComparison:\n\n1. Speed: Artificial Intelligence (AI) can process and analyze large amounts of data at a much faster rate than humans. For example, AI can quickly scan through millions of images to identify a specific object, while a human might take significantly longer to do the same task.\n\n2. Accuracy: While AI is excellent at processing and analyzing large volumes of data, its accuracy is often limited by the quality of the data it is given. Humans"
+  },
+  {
+    "finish_reason": "length",
+    "index": 0,
+    "logprobs": null,
+    "prompt_logprobs": null,
+    "stop_reason": null,
+    "text": "\n\n1. The Dartmouth Conference (1956): This marked the official start of AI research, with a group of researchers gathering to discuss the possibilities and challenges of creating machines that could mimic human intelligence.\n\n2. ELIZA (1964-1966): Developed by Joseph Weizenbaum, ELIZA was one of the first AI programs to simulate a conversation. It used pattern matching to respond to"
+  }
+]
diff --git a/...ce[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].json b/...ce[model_namespace0-s3_models_storage_uri0-serving_runtime0-vllm_inference_service0].json
@@ -0,0 +1,24 @@
+[
+  {
+    "id": "granite-rest",
+    "max_model_len": 2048,
+    "object": "model",
+    "owned_by": "vllm",
+    "parent": null,
+    "permission": [
+      {
+        "allow_create_engine": false,
+        "allow_fine_tuning": false,
+        "allow_logprobs": true,
+        "allow_sampling": true,
+        "allow_search_indices": false,
+        "allow_view": true,
+        "group": null,
+        "is_blocking": false,
+        "object": "model_permission",
+        "organization": "*"
+      }
+    ],
+    "root": "/mnt/models"
+  }
+]
diff --git a/...model_runtime/vllm/basic_model_deployment/test_granite_2b_instruct_preview_4k_r240917a.py b/...model_runtime/vllm/basic_model_deployment/test_granite_2b_instruct_preview_4k_r240917a.py
@@ -0,0 +1,45 @@
+import pytest
+from simple_logger.logger import get_logger
+from utilities.constants import KServeDeploymentType
+from tests.model_serving.model_runtime.vllm.utils import fetch_openai_response
+
+# Module-level logger named after this module.
+LOGGER = get_logger(name=__name__)
+
+# Command-line arguments handed to the `vllm_inference_service` fixture through the
+# `runtime_argument` key of the parametrization below.
+serving_arument = ["--dtype=bfloat16", "--model=/mnt/models", "--max-model-len=2048", "--uvicorn-log-level=debug"]
+
+
+# Request these fixtures for every test in this module.
+pytestmark = pytest.mark.usefixtures("skip_if_no_supported_accelerator_type", "valid_aws_config")
+
+
+@pytest.mark.parametrize(
+    "model_namespace, s3_models_storage_uri, serving_runtime, vllm_inference_service",
+    [
+        pytest.param(
+            {"name": "granite-serverless-rest"},
+            {"model-dir": "granite-2b-instruct-preview-4k-r240917a"},
+            {"deployment_type": "Serverless"},
+            {
+                "deployment_mode": KServeDeploymentType.SERVERLESS,
+                "runtime_argument": serving_arument,
+                "gpu_count": 1,
+                "name": "granite-rest",
+                "min-replicas": 1,
+            },
+        ),
+    ],
+    indirect=True,
+)
+class TestGranite2BModel:
+    """Deploy the granite 2B instruct model on vLLM and compare its responses with stored snapshots."""
+
+    def test_deploy_model_inference(self, vllm_inference_service, response_snapshot):
+        """Query the deployed inference service and compare each response set with its snapshot.
+
+        The test is skipped, rather than passing without any assertion, when
+        the inference service is not annotated as a Serverless deployment.
+        """
+        url = vllm_inference_service.instance.status.url
+        deployment_mode = vllm_inference_service.instance.metadata.annotations["serving.kserve.io/deploymentMode"]
+        if deployment_mode != KServeDeploymentType.SERVERLESS:
+            # Without this guard the test would report success while verifying nothing.
+            pytest.skip(f"No inference checks implemented for deployment mode {deployment_mode!r}")
+
+        model_info, chat_responses, completion_responses = fetch_openai_response(
+            url=url,
+            model_name=vllm_inference_service.instance.metadata.name,
+        )
+        # Keep this assertion order: the stored snapshots are matched to the
+        # assertions in the order they run.
+        assert model_info == response_snapshot
+        assert chat_responses == response_snapshot
+        assert completion_responses == response_snapshot
diff --git a/tests/model_serving/model_runtime/vllm/conftest.py b/tests/model_serving/model_runtime/vllm/conftest.py
@@ -0,0 +1,127 @@
+from typing import Any, Generator
+import pytest
+from kubernetes.dynamic import DynamicClient
+from ocp_resources.namespace import Namespace
+from ocp_resources.serving_runtime import ServingRuntime
+from ocp_resources.inference_service import InferenceService
+from ocp_resources.secret import Secret
+from ocp_resources.service_account import ServiceAccount
+from tests.model_serving.model_runtime.vllm.utils import kserve_s3_endpoint_secret
+from tests.model_serving.model_server.authentication.conftest import s3_models_storage_uri  # noqa: F811
+from utilities.constants import KServeDeploymentType
+from pytest import FixtureRequest
+from syrupy.extensions.json import JSONSnapshotExtension
+from tests.model_serving.model_runtime.vllm.utils import get_runtime_manifest
+from tests.model_serving.model_server.utils import create_isvc
+from tests.model_serving.model_runtime.vllm.constant import TEMPLATE_MAP, ACCELERATOR_IDENTIFIER, PREDICT_RESOURCES
+from simple_logger.logger import get_logger
+
+
+# Module-level logger named after this module.
+LOGGER = get_logger(name=__name__)
+
+
+@pytest.fixture(scope="class")
+def serving_runtime(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    supported_accelerator_type: str,
+    vllm_runtime_image: str,
+) -> Generator[ServingRuntime, Any, Any]:
+    """Create the vLLM ServingRuntime used by the tests of a class.
+
+    The template name is looked up in `TEMPLATE_MAP` by the lower-cased
+    accelerator type, falling back to "vllm-runtime-template". The manifest
+    returned by `get_runtime_manifest` is then renamed to "vllm-runtime" and
+    placed in the model namespace before the resource is created.
+
+    Args:
+        request: Fixture request; `request.param["deployment_type"]` is
+            forwarded to `get_runtime_manifest`.
+        admin_client: Client used to fetch the manifest and create the runtime.
+        model_namespace: Namespace the runtime is created in.
+        supported_accelerator_type: Accelerator type for the run. It must not
+            be None here, since `.lower()` is called on it.
+        vllm_runtime_image: Runtime image forwarded to `get_runtime_manifest`.
+
+    Yields:
+        The ServingRuntime, for as long as its context manager is open.
+    """
+    accelerator_type = supported_accelerator_type.lower()
+    template_name = TEMPLATE_MAP.get(accelerator_type, "vllm-runtime-template")
+    manifest = get_runtime_manifest(
+        client=admin_client,
+        template_name=template_name,
+        deployment_type=request.param["deployment_type"],
+        runtime_image=vllm_runtime_image,
+    )
+    manifest["metadata"]["name"] = "vllm-runtime"
+    manifest["metadata"]["namespace"] = model_namespace.name
+    with ServingRuntime(client=admin_client, kind_dict=manifest) as model_runtime:
+        yield model_runtime
+
+
+@pytest.fixture(scope="session")
+def skip_if_no_supported_accelerator_type(supported_accelerator_type: str) -> None:
+    """Skip the requesting tests when no accelerator type was configured.
+
+    Args:
+        supported_accelerator_type: Accelerator type for the run; any falsy
+            value (such as None or an empty string) triggers the skip.
+    """
+    if not supported_accelerator_type:
+        pytest.skip("Accelerator type is not provided, vLLM tests cannot be run on CPU")
+
+
+@pytest.fixture(scope="class")
+def vllm_inference_service(
+    request: FixtureRequest,
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    serving_runtime: ServingRuntime,
+    supported_accelerator_type: str,
+    s3_models_storage_uri: str,
+    model_service_account: ServiceAccount,
+) -> Generator[InferenceService, Any, Any]:
+    """Create the vLLM InferenceService described by the test parametrization.
+
+    Keys read from `request.param`:
+        name: Name of the inference service (required).
+        gpu_count: Accelerator count used for both requests and limits, and
+            for `--tensor-parallel-size` (required).
+        deployment_mode / deployment-mode: Deployment mode; defaults to
+            `KServeDeploymentType.SERVERLESS` when neither key is present.
+        runtime_argument: Optional list of runtime arguments.
+        min-replicas: Optional minimum replica count.
+
+    Yields:
+        The object produced by the `create_isvc` context manager.
+
+    Raises:
+        ValueError: If `gpu_count` is missing from the parameters.
+    """
+    # Function-scope import keeps this fixture self-contained.
+    from copy import deepcopy
+
+    params = request.param
+    gpu_count = params.get("gpu_count")
+    if gpu_count is None:
+        # Fail with a clear message instead of a TypeError at `gpu_count > 1` below.
+        raise ValueError("`gpu_count` must be set in the `vllm_inference_service` parameters")
+
+    # Tests pass `deployment_mode`; the hyphenated spelling that was read
+    # before is still accepted as a fallback.
+    deployment_mode = params.get(
+        "deployment_mode",
+        params.get("deployment-mode", KServeDeploymentType.SERVERLESS),
+    )
+
+    isvc_kwargs = {
+        "client": admin_client,
+        "name": params["name"],
+        "namespace": model_namespace.name,
+        "runtime": serving_runtime.name,
+        "storage_uri": s3_models_storage_uri,
+        "model_format": serving_runtime.instance.spec.supportedModelFormats[0].name,
+        "model_service_account": model_service_account.name,
+        "deployment_mode": deployment_mode,
+    }
+
+    accelerator_type = supported_accelerator_type.lower()
+    identifier = ACCELERATOR_IDENTIFIER.get(accelerator_type, "nvidia.com/gpu")
+    # Work on a copy so the shared PREDICT_RESOURCES constant is not modified
+    # and cannot leak accelerator settings into later fixture instances.
+    resources = deepcopy(PREDICT_RESOURCES["resources"])
+    resources["requests"][identifier] = gpu_count
+    resources["limits"][identifier] = gpu_count
+    isvc_kwargs["resources"] = resources
+
+    if gpu_count > 1:
+        isvc_kwargs["volumes"] = PREDICT_RESOURCES["volumes"]
+        # NOTE(review): the keyword is "volumes_mounts" while the constant key is
+        # "volume_mounts" - confirm the spelling against create_isvc's signature.
+        isvc_kwargs["volumes_mounts"] = PREDICT_RESOURCES["volume_mounts"]
+
+    if arguments := params.get("runtime_argument"):
+        # Build a new list: appending in place would modify the caller's list,
+        # which may be shared between parametrizations.
+        isvc_kwargs["argument"] = [*arguments, f"--tensor-parallel-size={gpu_count}"]
+
+    if min_replicas := params.get("min-replicas"):
+        isvc_kwargs["min_replicas"] = min_replicas
+
+    with create_isvc(**isvc_kwargs) as isvc:
+        yield isvc
+
+
+@pytest.fixture(scope="class")
+def model_service_account(
+    admin_client: DynamicClient, kserve_endpoint_s3_secret: Secret
+) -> Generator[ServiceAccount, Any, Any]:
+    """Create the "models-bucket-sa" service account that references the S3 endpoint secret.
+
+    Args:
+        admin_client: Client used to create the service account.
+        kserve_endpoint_s3_secret: Secret whose name is listed in the account's
+            secrets and whose namespace the account is created in.
+
+    Yields:
+        The ServiceAccount, for as long as its context manager is open.
+    """
+    with ServiceAccount(
+        client=admin_client,
+        namespace=kserve_endpoint_s3_secret.namespace,
+        name="models-bucket-sa",
+        secrets=[{"name": kserve_endpoint_s3_secret.name}],
+    ) as sa:
+        yield sa
+
+
+@pytest.fixture(scope="class")
+def kserve_endpoint_s3_secret(
+    admin_client: DynamicClient,
+    model_namespace: Namespace,
+    aws_access_key_id: str,
+    aws_secret_access_key: str,
+    models_s3_bucket_region: str,
+    models_s3_bucket_endpoint: str,
+) -> Generator[Secret, Any, Any]:
+    """Create the "models-bucket-secret" secret in the model namespace.
+
+    The AWS credentials, bucket region and bucket endpoint are passed straight
+    to `kserve_s3_endpoint_secret`, which is used as a context manager.
+
+    Yields:
+        The object produced by `kserve_s3_endpoint_secret`, for as long as the
+        context manager is open.
+    """
+    # This is a generator fixture, so it is annotated as a Generator of Secret
+    # rather than as returning a Secret directly.
+    with kserve_s3_endpoint_secret(
+        admin_client=admin_client,
+        name="models-bucket-secret",
+        namespace=model_namespace.name,
+        aws_access_key=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+        aws_s3_region=models_s3_bucket_region,
+        aws_s3_endpoint=models_s3_bucket_endpoint,
+    ) as secret:
+        yield secret
+
+
+@pytest.fixture
+def response_snapshot(snapshot):
+    """Return the `snapshot` fixture switched to the JSON snapshot extension."""
+    json_snapshot = snapshot.use_extension(JSONSnapshotExtension)
+    return json_snapshot