"""Pod-readiness helpers for the workbenches tests.

Recovered from the patch "fixup, implement everything so that test runs to
completion" (commit 330e5e2a8b0345eae66225f5581bec78b47ebb32), which adds the
classes below to ``tests/workbenches/conftest.py`` and re-annotates the
``function_resource_manager`` fixture.  The same patch makes
``testCreateSimpleNotebook`` in
``tests/workbenches/notebook-controller/test_spawning.py`` request the
``admin_client`` fixture and wait for the notebook pod with::

    PodUtils.waitForPodsReady(admin_client, self.NTB_NAMESPACE, f"app={self.NTB_NAME}", 1)
"""

import logging
import time
import traceback
from typing import TYPE_CHECKING, Any, Callable, Generator, Optional

import pytest

if TYPE_CHECKING:
    # Only needed for annotations.  The cluster libraries themselves are
    # imported lazily by the functions that talk to the API server, so the
    # generic helpers (Wait, Utils) stay importable without them.
    from kubernetes.dynamic import DynamicClient


@pytest.fixture(scope="function")
def function_resource_manager(admin_client: "DynamicClient") -> Generator["KubeResourceManager", None, None]:
    """Yield a ``KubeResourceManager`` for one test and destroy it on teardown.

    NOTE(review): ``KubeResourceManager`` is not defined by the patch this
    module was recovered from; it has to be provided by the rest of
    ``conftest.py``.
    """
    resource_manager = KubeResourceManager(admin_client)  # noqa: F821 - defined elsewhere in conftest.py
    yield resource_manager
    resource_manager.destroy()


class PodUtils:
    """Helpers for waiting on Pods selected by a label selector."""

    # Passed to Wait.until as ``timeout``, i.e. seconds (10 minutes).
    READINESS_TIMEOUT = 10 * 60

    # consider using timeout_sampler
    @staticmethod
    def waitForPodsReady(client: "DynamicClient", namespaceName: str, label_selector: str,
                         expectPodsCount: int) -> None:
        """Wait until exactly ``expectPodsCount`` matching Pods are ready.

        A Pod counts as ready when ``Readiness.isPodReady`` or
        ``Readiness.isPodSucceeded`` holds for it and each of its container
        statuses is either ``ready`` or terminated with reason ``Completed``.
        With ``expectPodsCount == 0`` the wait succeeds as soon as no Pod
        matches the selector.

        :param client: dynamic client used to list the Pods
        :param namespaceName: name of the namespace to look in
        :param label_selector: label selector string, e.g. ``"app=my-notebook"``
        :param expectPodsCount: number of Pods that must match the selector
        :raises WaitException: if the Pods are not ready within
            ``PodUtils.READINESS_TIMEOUT`` seconds
        """
        import ocp_resources.pod  # deferred: only needed when talking to a cluster

        # ``client.resources.get`` returns the Pod API already bound to ``client``,
        # so the object below is queried directly with ``.get(namespace=..., ...)``.
        resource = client.resources.get(
            kind=ocp_resources.pod.Pod.kind,
            api_version=ocp_resources.pod.Pod.api_version,
        )

        def ready() -> bool:
            pods = resource.get(namespace=namespaceName, label_selector=label_selector).items
            if not pods and expectPodsCount == 0:
                logging.debug("All expected Pods %s in Namespace %s are ready", label_selector, namespaceName)
                return True
            if not pods:
                logging.debug("Pods matching %s/%s are not ready", namespaceName, label_selector)
                return False
            if len(pods) != expectPodsCount:
                logging.debug("Expected Pods %s/%s are not ready", namespaceName, label_selector)
                return False
            for pod in pods:
                if not Readiness.isPodReady(pod) and not Readiness.isPodSucceeded(pod):
                    logging.debug("Pod is not ready: %s/%s", namespaceName, pod.metadata.name)
                    return False
                # check all containers in pods are ready
                for cs in pod.status.containerStatuses or []:
                    completed = cs.state.get('terminated', {}).get('reason', '') == "Completed"
                    if not (cs.ready or completed):
                        logging.debug("Container %s of Pod %s/%s not ready",
                                      cs.name, namespaceName, pod.metadata.name)
                        return False
            logging.info("Pods matching %s/%s are ready", namespaceName, label_selector)
            return True

        Wait.until(f"readiness of all Pods matching {label_selector} in Namespace {namespaceName}",
                   TestFrameConstants.GLOBAL_POLL_INTERVAL_MEDIUM, PodUtils.READINESS_TIMEOUT, ready)


class Wait:
    """Generic polling helper."""

    @staticmethod
    def until(description: str, pollInterval: float, timeout: float, ready: Callable[[], bool],
              onTimeout: Optional[Callable[[], Any]] = None) -> None:
        """Poll ``ready`` until it returns a true value or ``timeout`` expires.

        ``ready`` is called immediately and then again after every sleep of
        ``pollInterval`` seconds (the final sleep is shortened to the time that
        is left).  An exception raised by ``ready`` is treated as "not ready
        yet"; its message is remembered so it can be reported on timeout.

        Once the deadline has passed without success, the last exception
        message is logged (if more than one poll raised), ``onTimeout`` is
        run when given (e.g. to print logs or the value that was checked),
        and ``WaitException`` is raised.

        :param description: what is being waited for; used in log messages
        :param pollInterval: seconds between two polls, must be positive
        :param timeout: seconds after which the wait gives up
        :param ready: callable executed on each poll to verify readiness
        :param onTimeout: optional callable executed once the timeout is
            reached, before ``WaitException`` is raised
        :raises ValueError: if ``pollInterval`` is not positive
        :raises WaitException: if ``ready`` did not succeed within ``timeout``
        """
        if pollInterval <= 0:
            # would otherwise surface as ZeroDivisionError / a failing sleep below
            raise ValueError(f"pollInterval must be positive, got {pollInterval}")

        logging.info("Waiting for: %s", description)
        deadline = time.monotonic() + timeout

        exceptionMessage: Optional[str] = None
        previousExceptionMessage: Optional[str] = None

        # in case we are polling every 1s, we want to print exception after x tries, not on the first try
        # for minutes poll interval will 2 be enough
        exceptionAppearanceCount = 2 if (pollInterval // 60) > 0 else max(timeout // pollInterval // 4, 2)
        exceptionCount = 0
        newExceptionAppearance = 0

        stackTraceError: Optional[str] = None

        while True:
            try:
                result = ready()
            except Exception as e:  # any failure of the probe just means "not ready yet"
                exceptionMessage = str(e)

                exceptionCount += 1
                newExceptionAppearance += 1
                if (exceptionCount == exceptionAppearanceCount
                        and exceptionMessage == previousExceptionMessage):
                    logging.info("While waiting for: %s exception occurred: %s", description, exceptionMessage)
                    # keep the stacktrace so it can be logged if the wait times out
                    stackTraceError = traceback.format_exc()
                elif exceptionMessage != previousExceptionMessage and newExceptionAppearance == 2:
                    previousExceptionMessage = exceptionMessage

                result = False

            timeLeft = deadline - time.monotonic()
            if result:
                return
            if timeLeft <= 0:
                if exceptionCount > 1:
                    logging.error("Exception waiting for: %s, %s", description, exceptionMessage)
                    if stackTraceError is not None:
                        # printing handled stacktrace
                        logging.error(stackTraceError)
                if onTimeout is not None:
                    onTimeout()
                waitException = WaitException(f"Timeout after {timeout} s waiting for {description}")
                logging.error(waitException)
                raise waitException

            time.sleep(min(pollInterval, timeLeft))


class WaitException(Exception):
    """Raised by ``Wait.until`` when the awaited condition is not met in time."""


class Readiness:
    """Predicates on the status of a Pod object."""

    @staticmethod
    def isPodReady(pod) -> bool:
        """Return True if ``pod`` has a ``Ready`` condition whose status is true.

        The condition type and status value are taken from
        ``ocp_resources.pod.Pod.Condition``; the status is compared
        case-insensitively.

        :raises ValueError: if ``pod`` is None
        """
        Utils.checkNotNull(pod, "Pod can't be null.")

        import ocp_resources.pod  # deferred: only needed when inspecting real Pods

        condition = ocp_resources.pod.Pod.Condition.READY
        status = ocp_resources.pod.Pod.Condition.Status.TRUE
        # a Pod that was just created may not report status/conditions yet
        conditions = (pod.get("status", {}) or {}).get("conditions", []) or []
        return any(
            cond["type"] == condition and cond["status"].casefold() == status.casefold()
            for cond in conditions
        )

    @staticmethod
    def isPodSucceeded(pod) -> bool:
        """Return True if ``pod.status.phase`` is ``"Succeeded"``.

        :raises ValueError: if ``pod`` is None
        """
        Utils.checkNotNull(pod, "Pod can't be null.")
        return pod.status is not None and "Succeeded" == pod.status.phase


class Utils:
    """Small argument-checking helpers."""

    @staticmethod
    def checkNotNull(value, message) -> None:
        """Raise ``ValueError(message)`` if ``value`` is None."""
        if value is None:
            raise ValueError(message)


class TestFrameConstants:
    """Shared timing constants."""

    # Used by PodUtils as the ``pollInterval`` of Wait.until, i.e. seconds.
    GLOBAL_POLL_INTERVAL_MEDIUM = 10