diff --git a/tests/model_registry/utils.py b/tests/model_registry/utils.py index e9d2211f..557df8b0 100644 --- a/tests/model_registry/utils.py +++ b/tests/model_registry/utils.py @@ -4,7 +4,7 @@ from ocp_resources.model_registry import ModelRegistry from kubernetes.dynamic.exceptions import ResourceNotFoundError -from utilities.exceptions import ProtocolNotSupported, TooManyServices +from utilities.exceptions import ProtocolNotSupportedError, TooManyServicesError from utilities.constants import Protocols, HTTPRequest from tests.model_registry.constants import ModelRegistryEndpoints @@ -35,7 +35,7 @@ def get_mr_service_by_label(client: DynamicClient, ns: Namespace, mr_instance: M ]: if len(svc) == 1: return svc[0] - raise TooManyServices(svc) + raise TooManyServicesError(svc) raise ResourceNotFoundError(f"{mr_instance.name} has no Service") @@ -43,7 +43,7 @@ def get_endpoint_from_mr_service(client: DynamicClient, svc: Service, protocol: if protocol in (Protocols.REST, Protocols.GRPC): return svc.instance.metadata.annotations[f"{ADDRESS_ANNOTATION_PREFIX}{protocol}"] else: - raise ProtocolNotSupported(protocol) + raise ProtocolNotSupportedError(protocol) def generate_register_model_command(endpoint: str, token: str) -> str: diff --git a/tests/model_serving/model_server/private_endpoint/utils.py b/tests/model_serving/model_server/private_endpoint/utils.py index 6fa5468d..7ab0f37d 100644 --- a/tests/model_serving/model_server/private_endpoint/utils.py +++ b/tests/model_serving/model_server/private_endpoint/utils.py @@ -9,7 +9,7 @@ from simple_logger.logger import get_logger from utilities.constants import Protocols -from utilities.exceptions import ProtocolNotSupported +from utilities.exceptions import ProtocolNotSupportedError LOGGER = get_logger(name=__name__) @@ -21,7 +21,7 @@ def curl_from_pod( protocol: str = Protocols.HTTP, ) -> str: if protocol not in (Protocols.HTTPS, Protocols.HTTP): - raise ProtocolNotSupported(protocol) + raise ProtocolNotSupportedError(protocol) host = isvc.instance.status.address.url if protocol == "http": parsed = urlparse(host) diff --git a/tests/model_serving/model_server/utils.py b/tests/model_serving/model_server/utils.py index 6c0a1add..4692a40e 100644 --- a/tests/model_serving/model_server/utils.py +++ b/tests/model_serving/model_server/utils.py @@ -7,15 +7,52 @@ from kubernetes.dynamic import DynamicClient from ocp_resources.inference_service import InferenceService from simple_logger.logger import get_logger +from timeout_sampler import TimeoutSampler from utilities.constants import KServeDeploymentType -from utilities.exceptions import InferenceResponseError, InvalidStorageArgument +from utilities.exceptions import FailedPodsError, InferenceResponseError, InvalidStorageArgumentError from utilities.inference_utils import UserInference -from utilities.infra import wait_for_inference_deployment_replicas +from utilities.infra import ( + get_pods_by_isvc_label, + wait_for_inference_deployment_replicas, +) LOGGER = get_logger(name=__name__) +def verify_no_failed_pods(client: DynamicClient, isvc: InferenceService) -> None: + failed_pods: dict[str, Any] = {} + + for pods in TimeoutSampler( + wait_timeout=5 * 60, + sleep=10, + func=get_pods_by_isvc_label, + client=client, + isvc=isvc, + ): + if pods: + if all([pod.instance.status.phase == pod.Status.RUNNING for pod in pods]): + return + + for pod in pods: + pod_status = pod.instance.status + if init_container_status := pod_status.initContainerStatuses: + if container_terminated := init_container_status[0].lastState.terminated: + if container_terminated.reason == "Error": + failed_pods[pod.name] = pod_status + + elif pod_status.phase in ( + pod.Status.CRASH_LOOPBACK_OFF, + pod.Status.FAILED, + pod.Status.IMAGE_PULL_BACK_OFF, + pod.Status.ERR_IMAGE_PULL, + ): + failed_pods[pod.name] = pod_status + + if failed_pods: + raise FailedPodsError(pods=failed_pods) + + @contextmanager def create_isvc( client: DynamicClient, @@ -113,6 +150,12 @@ def create_isvc( predictor=predictor_dict, label=labels, ) as inference_service: + if wait_for_predictor_pods: + verify_no_failed_pods(client=client, isvc=inference_service) + wait_for_inference_deployment_replicas( + client=client, isvc=inference_service, deployment_mode=deployment_mode + ) + if wait: inference_service.wait_for_condition( condition=inference_service.Condition.READY, @@ -120,11 +163,6 @@ def create_isvc( timeout=15 * 60, ) - if wait_for_predictor_pods: - wait_for_inference_deployment_replicas( - client=client, isvc=inference_service, deployment_mode=deployment_mode - ) - yield inference_service @@ -134,7 +172,7 @@ def _check_storage_arguments( storage_path: Optional[str], ) -> None: if (storage_uri and storage_path) or (not storage_uri and not storage_key) or (storage_key and not storage_path): - raise InvalidStorageArgument(storage_uri, storage_key, storage_path) + raise InvalidStorageArgumentError(storage_uri=storage_uri, storage_key=storage_key, storage_path=storage_path) def verify_inference_response( diff --git a/utilities/constants.py b/utilities/constants.py index cd788c39..9e61bd47 100644 --- a/utilities/constants.py +++ b/utilities/constants.py @@ -90,8 +90,8 @@ class Protocols: HTTPS: str = "https" GRPC: str = "grpc" REST: str = "rest" - TCP_PROTOCOLS: set[str] = {"HTTP", "HTTPS"} - ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({"GRPC"}) + TCP_PROTOCOLS: set[str] = {HTTP, HTTPS} + ALL_SUPPORTED_PROTOCOLS: set[str] = TCP_PROTOCOLS.union({GRPC}) class HTTPRequest: diff --git a/utilities/exceptions.py b/utilities/exceptions.py index 2a632541..dfcfff57 100644 --- a/utilities/exceptions.py +++ b/utilities/exceptions.py @@ -3,7 +3,7 @@ from ocp_resources.service import Service -class ProtocolNotSupported(Exception): +class ProtocolNotSupportedError(Exception): def __init__(self, protocol: str): self.protocol = protocol @@ -11,7 +11,7 @@ def __str__(self) -> str: return f"Protocol {self.protocol} is not supported" -class TooManyServices(Exception): +class TooManyServicesError(Exception): def __init__(self, services: list[Service]): self.services = services @@ -23,24 +23,24 @@ class InferenceResponseError(Exception): pass -class InvalidStorageArgument(Exception): +class InvalidStorageArgumentError(Exception): def __init__( self, - storageUri: Optional[str], + storage_uri: Optional[str], storage_key: Optional[str], storage_path: Optional[str], ): - self.storageUri = storageUri + self.storage_uri = storage_uri self.storage_key = storage_key self.storage_path = storage_path def __str__(self) -> str: msg = f""" You've passed the following parameters: - "storageUri": {self.storageUri} + "storage_uri": {self.storage_uri} "storage_key": {self.storage_key} "storage_path: {self.storage_path} - In order to create a valid ISVC you need to specify either a storageUri value + In order to create a valid ISVC you need to specify either a storage_uri value or both a storage key and a storage path. """ return msg @@ -48,3 +48,11 @@ def __str__(self) -> str: class MetricValidationError(Exception): pass + + +class FailedPodsError(Exception): + def __init__(self, pods: dict[str, str]): + self.pods = pods + + def __str__(self) -> str: + return f"The following pods are not running: {self.pods}"