From c500ceaa1fa311f03a97df41ffd967acdb8f9e0c Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 18 Nov 2024 18:05:57 +0100 Subject: [PATCH 01/44] add jobs to DSS validation for setup and test on NVIDIA GPUs For the moment we lump it together in the validate-intel-gpu launcher... more refactoring coming --- .../checkbox-provider-dss/bin/check_cuda.sh | 93 +++++++++++++++++++ .../checkbox-provider-dss/bin/check_dss.sh | 38 ++++++++ .../bin/tensorflow_can_use_cuda.py | 10 ++ .../checkbox-provider-dss/units/jobs.pxu | 79 ++++++++++++++++ .../checkbox-provider-dss/units/test-plan.pxu | 10 +- 5 files changed, 229 insertions(+), 1 deletion(-) create mode 100755 contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh create mode 100755 contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh new file mode 100755 index 0000000000..c61a1ed7bb --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +check_host_has_nvidia_gpus() { + result="$(lspci | grep -ci 'nvidia')" + if [[ "$result" -ge 1 ]]; then + echo "Test success; NVIDIA GPU available on host: count = ${result}" + else + >&2 echo "Test failed: 'lspci' does not report any NVIDIA GPUs" + exit 1 + fi +} + +check_nvidia_gpu_addon_can_be_enabled() { + # TODO: enable changing GPU_OPERATOR_VERSION + GPU_OPERATOR_VERSION=24.6.2 + echo "[INFO]: enabling the NVIDIA GPU addon" + sudo microk8s enable gpu --driver=operator --version="$GPU_OPERATOR_VERSION" + SLEEP_SECS=15 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking rollout status." + sleep ${SLEEP_SECS} + microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-device-plugin-daemonset + echo "Test success: NVIDIA GPU addon enabled." +} + + +check_nvidia_gpu_validations_succeed() { + SLEEP_SECS=30 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations were successful." + sleep ${SLEEP_SECS} + result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) + if [ "${result}" = "all validations are successful" ]; then + echo "Test success: NVIDIA GPU validations were successful!" + else + >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" + exit 1 + fi +} + +check_pytorch_can_use_cuda() { + echo "Starting PyTorch CUDA test" + pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'pytorch-cuda\S*') + echo "Found PyTorch CUDA pod: ${pod}" + script="import torch; assert torch.cuda.is_available(), 'CUDA is not available'" + if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then + echo "PASS: PyTorch can use CUDA" + exit 0 + else + >&2 echo "FAIL: PyTorch can't use CUDA" + exit 1 + fi +} + +check_tensorflow_can_use_cuda() { + echo "Starting Tensorflow CUDA test" + pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'tensorflow-cuda\S*') + echo "Found Tensorflow CUDA pod: ${pod}" + script="$(cat "$SCRIPT_DIR/tensorflow_can_use_cuda.py")" + if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then + echo "PASS: Tensorflow can use CUDA" + exit 0 + else + >&2 echo "FAIL: Tensorflow can't use CUDA" + exit 1 + fi +} + +help_function() { + echo "This script is used for tests related to CUDA" + echo "Usage: check_dss.sh " + echo + echo "Test cases currently implemented:" + echo -e "\t: check_nvidia_gpu_addon_can_be_enabled" + echo -e "\t: check_nvidia_gpu_validations_succeed" + echo -e "\t: check_pytorch_can_use_cuda" + echo -e "\t: check_tensorflow_can_use_cuda" +} + +main() { + case ${1} in + host_has_nvidia_gpus) check_host_has_nvidia_gpus ;; + gpu_addon_can_be_enabled) check_nvidia_gpu_addon_can_be_enabled ;; + gpu_validations_succeed) check_nvidia_gpu_validations_succeed ;; + pytorch_can_use_cuda) check_pytorch_can_use_cuda ;; + tensorflow_can_use_cuda) check_tensorflow_can_use_cuda ;; + *) help_function ;; + esac +} + +main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh index f3530a39fc..5d82a52675 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh @@ -80,6 +80,38 @@ check_dss_can_create_ipex_2120_notebook() { fi } +check_dss_has_nvidia_gpu_acceleration_enabled() { + cd "${HOME}" + result=$(dss status) # save result to shell var to avoid broken pipe error + if echo "${result}" | grep -q "NVIDIA GPU acceleration: Enabled.*"; then + echo "Test success: 'dss status' correctly reports NVIDIA GPU status." + else + >&2 echo "Test failure: 'dss status' does not report that NVIDIA GPU acceleration is enabled." + exit 1 + fi +} + +check_dss_can_create_pytorch_cuda_notebook() { + cd "${HOME}" + if dss create pytorch-cuda --image=pytorch-cuda; then + echo "Test success: successfully created pytorch-cuda notebook." + else + >&2 echo "Test failure: failed to create pytorch-cuda notebook." + exit 1 + fi +} + +check_dss_can_create_tensorflow_cuda_notebook() { + cd "${HOME}" + if dss create tensorflow-cuda --image=tensorflow-cuda; then + echo "Test success: successfully created tensorflow-cuda notebook." + else + >&2 echo "Test failure: failed to create tensorflow-cuda notebook." + exit 1 + fi +} + + help_function() { echo "This script is used for generic tests related to DSS" echo "Usage: check_dss.sh " @@ -92,6 +124,9 @@ help_function() { echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" echo -e "\t: check_dss_can_create_itex_215_notebook" echo -e "\t: check_dss_can_create_ipex_2120_notebook" + echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" + echo -e "\t: check_dss_can_create_pytorch_cuda_notebook" + echo -e "\t: check_dss_can_create_tensorflow_cuda_notebook" } main() { @@ -103,6 +138,9 @@ main() { intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;; can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;; + nvidia_gpu_acceleration_is_enabled) check_dss_has_nvidia_gpu_acceleration_enabled ;; + can_create_pytorch_cuda_notebook) check_dss_can_create_pytorch_cuda_notebook ;; + can_create_tensorflow_cuda_notebook) check_dss_can_create_tensorflow_cuda_notebook ;; *) help_function ;; esac } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py new file mode 100755 index 0000000000..d6183a0f5e --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + +import tensorflow as tf + +devices = tf.config.experimental.list_physical_devices() +for device_str in devices: + if "CUDA" in device_str: + break +else: + raise AssertionError("CUDA device not found") diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 606d45053e..7d171c93a6 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -198,3 +198,82 @@ depends: ipex/ipex_2.1.20_import _summary: Check IPEX 2.1.20 GPU availability estimated_duration: 1m command: check_ipex.sh pytorch_can_use_xpu + +id: nvidia_gpu/host_gpu_avail +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'lspci' +_summary: Verify that an NVIDIA GPU is available on the host +estimated_duration: 5s +command: check_cuda.sh host_has_nvidia_gpus + +id: nvidia_gpu_addon/enable +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: dss/initialize +_summary: Enable NVIDIA GPU addon +estimated_duration: 2m +command: check_cuda.sh gpu_addon_can_be_enabled + +id: nvidia_gpu_addon/validations_succeed +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: nvidia_gpu_addon/enable +_summary: NVIDIA GPU validations should succeed +estimated_duration: 1m +command: check_cuda.sh gpu_validations_succeed + +id: dss/status_nvidia_gpu +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: intel_gpu_plugin/node_gpu_allocatable +_summary: Check that dss status reports that NVIDIA GPU acceleration is enabled +estimated_duration: 5s +command: check_dss.sh nvidia_gpu_acceleration_is_enabled + +id: dss/create_pytorch_cuda_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/status_nvidia_gpu +_summary: Check that an PyTorch CUDA notebook can be successfully created +estimated_duration: 3m +command: check_dss.sh can_create_pytorch_cuda_notebook + +id: cuda/pytorch_can_use_cuda +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: dss/create_pytorch_cuda_notebook +_summary: Check PyTorch can use CUDA +estimated_duration: 1m +command: check_cuda.sh pytorch_can_use_cuda + +id: dss/create_tensorflow_cuda_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/status_nvidia_gpu +_summary: Check that an Tensorflow CUDA notebook can be successfully created +estimated_duration: 3m +command: check_dss.sh can_create_tensorflow_cuda_notebook + +id: cuda/tensorflow_can_use_cuda +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: dss/create_tensorflow_cuda_notebook +_summary: Check PyTorch can use CUDA +estimated_duration: 1m +command: check_cuda.sh tensorflow_can_use_cuda diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 2627c6af31..82e28336ce 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -1,6 +1,6 @@ id: dss-validation unit: test plan -_name: DSS validation of Intel-supported Docker images and k8s Intel GPU support +_name: DSS validation on Intel and NVIDIA GPUs include: intel_gpu/host_gpu_avail dss/initialize @@ -22,6 +22,14 @@ include: dss/create_ipex_2.1.20_notebook ipex/ipex_2.1.20_import ipex/ipex_2.1.20_gpu_avail + nvidia_gpu/host_gpu_avail + nvidia_gpu_addon/enable + nvidia_gpu_addon/validations_succeed + dss/status_nvidia_gpu + dss/create_pytorch_cuda_notebook + cuda/pytorch_can_use_cuda + dss/create_tensorflow_cuda_notebook + cuda/tensorflow_can_use_cuda bootstrap_include: com.canonical.certification::executable com.canonical.certification::snap From 14979c93b16959a784f1d98b4bfa749e65cd63a9 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 18 Nov 2024 20:22:37 +0100 Subject: [PATCH 02/44] fix cuda test for tensorflow and give more time for things to settle --- .../checkbox-provider-dss/bin/check_cuda.sh | 14 +++++++++++--- .../bin/tensorflow_can_use_cuda.py | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh index c61a1ed7bb..697bdeb50b 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -28,15 +28,23 @@ check_nvidia_gpu_addon_can_be_enabled() { check_nvidia_gpu_validations_succeed() { - SLEEP_SECS=30 + SLEEP_SECS=60 echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations were successful." sleep ${SLEEP_SECS} result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) if [ "${result}" = "all validations are successful" ]; then echo "Test success: NVIDIA GPU validations were successful!" else - >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" - exit 1 + SLEEP_SECS=60 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations again." + sleep ${SLEEP_SECS} + result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) + if [ "${result}" = "all validations are successful" ]; then + echo "Test success: NVIDIA GPU validations were successful!" + else + >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" + exit 1 + fi fi } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py index d6183a0f5e..dc5aee12eb 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/tensorflow_can_use_cuda.py @@ -2,9 +2,9 @@ import tensorflow as tf -devices = tf.config.experimental.list_physical_devices() +devices = tf.config.experimental.list_physical_devices("GPU") for device_str in devices: - if "CUDA" in device_str: + if "GPU" in device_str: break else: raise AssertionError("CUDA device not found") From c8a4afbfc312b1c639134871cd1cda1dd8b7bb57 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 18 Nov 2024 20:30:13 +0100 Subject: [PATCH 03/44] fix dependency of nvidia_gpu_addon/enable job --- .../checkbox-provider-dss/units/jobs.pxu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 7d171c93a6..33d2e79bf3 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -213,7 +213,7 @@ category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'microk8s' -depends: dss/initialize +depends: dss/initialize nvidia_gpu/host_gpu_avail _summary: Enable NVIDIA GPU addon estimated_duration: 2m command: check_cuda.sh gpu_addon_can_be_enabled From 6786061115173f59eb08139f01d9b1a9ad006e98 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 10:36:11 +0000 Subject: [PATCH 04/44] fix wrong dependency for cuda jobs and make validation more reliable --- .../checkbox-provider-dss/bin/check_cuda.sh | 21 ++++++------------- .../checkbox-provider-dss/units/jobs.pxu | 6 +++--- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh index 697bdeb50b..4bd63e537f 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -19,32 +19,23 @@ check_nvidia_gpu_addon_can_be_enabled() { GPU_OPERATOR_VERSION=24.6.2 echo "[INFO]: enabling the NVIDIA GPU addon" sudo microk8s enable gpu --driver=operator --version="$GPU_OPERATOR_VERSION" - SLEEP_SECS=15 - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking rollout status." - sleep ${SLEEP_SECS} microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-device-plugin-daemonset + echo "[INFO]: Waiting for the GPU validations to rollout" + microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-operator-validator echo "Test success: NVIDIA GPU addon enabled." } check_nvidia_gpu_validations_succeed() { - SLEEP_SECS=60 - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations were successful." + SLEEP_SECS=5 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if GPU validations were successful." sleep ${SLEEP_SECS} result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) if [ "${result}" = "all validations are successful" ]; then echo "Test success: NVIDIA GPU validations were successful!" else - SLEEP_SECS=60 - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations again." - sleep ${SLEEP_SECS} - result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator) - if [ "${result}" = "all validations are successful" ]; then - echo "Test success: NVIDIA GPU validations were successful!" - else - >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" - exit 1 - fi + >&2 echo "Test failure: NVIDIA GPU validations were not successful, got ${result}" + exit 1 fi } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 33d2e79bf3..56a31be972 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -215,7 +215,7 @@ imports: from com.canonical.certification import executable requires: executable.name == 'microk8s' depends: dss/initialize nvidia_gpu/host_gpu_avail _summary: Enable NVIDIA GPU addon -estimated_duration: 2m +estimated_duration: 5m command: check_cuda.sh gpu_addon_can_be_enabled id: nvidia_gpu_addon/validations_succeed @@ -225,7 +225,7 @@ imports: from com.canonical.certification import executable requires: executable.name == 'microk8s' depends: nvidia_gpu_addon/enable _summary: NVIDIA GPU validations should succeed -estimated_duration: 1m +estimated_duration: 10s command: check_cuda.sh gpu_validations_succeed id: dss/status_nvidia_gpu @@ -233,7 +233,7 @@ category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'dss' -depends: intel_gpu_plugin/node_gpu_allocatable +depends: nvidia_gpu_addon/validations_succeed _summary: Check that dss status reports that NVIDIA GPU acceleration is enabled estimated_duration: 5s command: check_dss.sh nvidia_gpu_acceleration_is_enabled From 50601d19c6917b168ec924c3016d865a0b089b07 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 11:39:27 +0100 Subject: [PATCH 05/44] fix shebang to use control instead of remote in launcher script --- contrib/checkbox-dss-validation/bin/validate-intel-gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/checkbox-dss-validation/bin/validate-intel-gpu b/contrib/checkbox-dss-validation/bin/validate-intel-gpu index cfd23ddbb9..5d026f1c1b 100755 --- a/contrib/checkbox-dss-validation/bin/validate-intel-gpu +++ b/contrib/checkbox-dss-validation/bin/validate-intel-gpu @@ -1,4 +1,4 @@ -#!/usr/bin/env -S checkbox-cli-wrapper remote 127.0.0.1 +#!/usr/bin/env -S checkbox-cli-wrapper control 127.0.0.1 [launcher] app_id = com.canonical.contrib.dss-validation:checkbox launcher_version = 1 From 45d427c342c5dc6f823cb1b2bb1f42ab2c706001 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 12:39:59 +0100 Subject: [PATCH 06/44] fix flaky gpu addon rollout checking in better order and more sleep --- .../checkbox-provider-dss/bin/check_cuda.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh index 4bd63e537f..d3ac85c922 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -19,7 +19,15 @@ check_nvidia_gpu_addon_can_be_enabled() { GPU_OPERATOR_VERSION=24.6.2 echo "[INFO]: enabling the NVIDIA GPU addon" sudo microk8s enable gpu --driver=operator --version="$GPU_OPERATOR_VERSION" + SLEEP_SECS=30 + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU feature discovery has rolled out." + sleep ${SLEEP_SECS} + microk8s.kubectl -n gpu-operator-resources rollout status ds/gpu-operator-node-feature-discovery-worker + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out." + sleep ${SLEEP_SECS} microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-device-plugin-daemonset + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out." + sleep ${SLEEP_SECS} echo "[INFO]: Waiting for the GPU validations to rollout" microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-operator-validator echo "Test success: NVIDIA GPU addon enabled." From 9da8078bea5114a1c0f2b8b777b492ad936092ea Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 13:42:53 +0100 Subject: [PATCH 07/44] make the GPU checking into resources to control GPU tests are run --- .../checkbox-provider-dss/bin/check_cuda.sh | 12 -------- .../checkbox-provider-dss/bin/check_intel.sh | 12 -------- .../checkbox-provider-dss/units/jobs.pxu | 25 ++-------------- .../checkbox-provider-dss/units/resource.pxu | 29 +++++++++++++++++++ .../checkbox-provider-dss/units/test-plan.pxu | 6 ++-- 5 files changed, 35 insertions(+), 49 deletions(-) create mode 100644 contrib/checkbox-dss-validation/checkbox-provider-dss/units/resource.pxu diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh index d3ac85c922..20dbc635ca 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh @@ -4,16 +4,6 @@ set -euxo pipefail SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) -check_host_has_nvidia_gpus() { - result="$(lspci | grep -ci 'nvidia')" - if [[ "$result" -ge 1 ]]; then - echo "Test success; NVIDIA GPU available on host: count = ${result}" - else - >&2 echo "Test failed: 'lspci' does not report any NVIDIA GPUs" - exit 1 - fi -} - check_nvidia_gpu_addon_can_be_enabled() { # TODO: enable changing GPU_OPERATOR_VERSION GPU_OPERATOR_VERSION=24.6.2 @@ -33,7 +23,6 @@ check_nvidia_gpu_addon_can_be_enabled() { echo "Test success: NVIDIA GPU addon enabled." } - check_nvidia_gpu_validations_succeed() { SLEEP_SECS=5 echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if GPU validations were successful." @@ -88,7 +77,6 @@ help_function() { main() { case ${1} in - host_has_nvidia_gpus) check_host_has_nvidia_gpus ;; gpu_addon_can_be_enabled) check_nvidia_gpu_addon_can_be_enabled ;; gpu_validations_succeed) check_nvidia_gpu_validations_succeed ;; pytorch_can_use_cuda) check_pytorch_can_use_cuda ;; diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh index c43f87445f..1501d09b22 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh @@ -2,16 +2,6 @@ set -euxo pipefail -check_host_has_intel_gpus() { - result=$(intel_gpu_top -L) - if [[ ${result} == *"pci:vendor=8086"* ]]; then - echo "Test success: Intel GPU available on host: ${result}" - else - >&2 echo "Test failure: "intel_gpu_top -L" reports no Intel GPUs: ${result}" - exit 1 - fi -} - check_intel_gpu_plugin_can_be_installed() { # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 @@ -119,7 +109,6 @@ help_function() { echo "Usage: check.sh " echo echo "Test cases currently implemented:" - echo -e "\t: check_host_has_intel_gpus" echo -e "\t: check_intel_gpu_plugin_can_be_installed" echo -e "\t: check_intel_gpu_plugin_daemonset_is_deployed" echo -e "\t: check_one_intel_gpu_plugin_daemonset_is_available" @@ -132,7 +121,6 @@ help_function() { main() { case ${1} in - host_has_intel_gpus) check_host_has_intel_gpus ;; gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;; gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;; one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;; diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 56a31be972..b29ab964c6 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -1,12 +1,3 @@ -id: intel_gpu/host_gpu_avail -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'intel_gpu_top' -_summary: Verify that an Intel GPU is available on the host -estimated_duration: 5s -command: check_intel.sh host_has_intel_gpus - id: dss/initialize category_id: dss-regress flags: simple @@ -14,7 +5,6 @@ imports: from com.canonical.certification import executable requires: executable.name == 'dss' executable.name == 'microk8s' -depends: intel_gpu/host_gpu_avail _summary: Check that the DSS environment initializes estimated_duration: 2m command: check_dss.sh dss_can_be_initialized @@ -53,7 +43,7 @@ id: intel_gpu_plugin/install category_id: dss-regress flags: simple imports: from com.canonical.certification import executable -requires: executable.name == 'kubectl' +requires: (host_has_intel_gpus.available == 'true' and executable.name == 'kubectl') depends: dss/initialize _summary: Install Intel K8s GPU Device Plugin estimated_duration: 2m @@ -199,21 +189,12 @@ _summary: Check IPEX 2.1.20 GPU availability estimated_duration: 1m command: check_ipex.sh pytorch_can_use_xpu -id: nvidia_gpu/host_gpu_avail -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'lspci' -_summary: Verify that an NVIDIA GPU is available on the host -estimated_duration: 5s -command: check_cuda.sh host_has_nvidia_gpus - id: nvidia_gpu_addon/enable category_id: dss-regress flags: simple imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: dss/initialize nvidia_gpu/host_gpu_avail +requires: (host_has_nvidia_gpus.available == 'true' and executable.name == 'microk8s') +depends: dss/initialize _summary: Enable NVIDIA GPU addon estimated_duration: 5m command: check_cuda.sh gpu_addon_can_be_enabled diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/resource.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/resource.pxu new file mode 100644 index 0000000000..45b15b7ae9 --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/resource.pxu @@ -0,0 +1,29 @@ +id: host_has_intel_gpus +category_id: dss-regress +plugin: resource +imports: from com.canonical.certification import executable +requires: executable.name == 'intel_gpu_top' +_summary: Verify that an Intel GPU is available on the host +estimated_duration: 5s +command: + result=$(intel_gpu_top -L) + if [[ ${result} == *"pci:vendor=8086"* ]]; then + echo "available: true" + else + echo "available: false" + fi + +id: host_has_nvidia_gpus +category_id: dss-regress +plugin: resource +imports: from com.canonical.certification import executable +requires: executable.name == 'lspci' +_summary: Verify that an NVIDIA GPU is available on the host +estimated_duration: 5s +command: + result="$(lspci | grep -ci 'nvidia')" + if [[ "$result" -ge 1 ]]; then + echo "available: true" + else + echo "available: false" + fi diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 82e28336ce..e1c0028ace 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -1,8 +1,7 @@ id: dss-validation unit: test plan -_name: DSS validation on Intel and NVIDIA GPUs +_name: DSS validations with Intel and NVIDIA GPUs if available include: - intel_gpu/host_gpu_avail dss/initialize dss/namespace dss/status_mlflow @@ -22,7 +21,6 @@ include: dss/create_ipex_2.1.20_notebook ipex/ipex_2.1.20_import ipex/ipex_2.1.20_gpu_avail - nvidia_gpu/host_gpu_avail nvidia_gpu_addon/enable nvidia_gpu_addon/validations_succeed dss/status_nvidia_gpu @@ -34,6 +32,8 @@ bootstrap_include: com.canonical.certification::executable com.canonical.certification::snap com.canonical.certification::graphics_card + host_has_intel_gpus + host_has_nvidia_gpus id: ipex-validation unit: test plan From 37a9b6461395afdb83d725d37e9395c6399a5092 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 14:25:36 +0100 Subject: [PATCH 08/44] remove flaky mlflow deployed test This is covered by checking that DSS's status says 'MLFlow deployment: Ready'. The way the removed test was implemented assumed position of the service's name in the output and made it flaky, especially when re-running the tests. --- .../checkbox-provider-dss/bin/check_dss.sh | 13 ------------- .../checkbox-provider-dss/units/jobs.pxu | 10 ---------- .../checkbox-provider-dss/units/test-plan.pxu | 1 - 3 files changed, 24 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh index 5d82a52675..a71a6b0757 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh @@ -38,17 +38,6 @@ check_mlflow_status_is_ready() { fi } -check_mlflow_is_deployed_as_first_service() { - # TODO: enable mlflow to be a service in any position - result=$(microk8s.kubectl get service -n dss -o jsonpath='{.items[0].metadata.name}') - if [ "${result}" = "mlflow" ]; then - echo "Test success: 'mlflow' service is deployed!" - else - >&2 echo "Test failure: expected service name 'mlflow' but got ${result}" - exit 1 - fi -} - check_dss_has_intel_gpu_acceleration_enabled() { cd "${HOME}" result=$(dss status) # save result to shell var to avoid broken pipe error @@ -120,7 +109,6 @@ help_function() { echo -e "\t: check_dss_can_be_initialized" echo -e "\t: check_dss_namespace_is_deployed" echo -e "\t: check_mlflow_status_is_ready" - echo -e "\t: check_mlflow_is_deployed_as_first_service" echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" echo -e "\t: check_dss_can_create_itex_215_notebook" echo -e "\t: check_dss_can_create_ipex_2120_notebook" @@ -134,7 +122,6 @@ main() { dss_can_be_initialized) check_dss_can_be_initialized ;; dss_namespace_is_deployed) check_dss_namespace_is_deployed ;; mlflow_status_is_ready) check_mlflow_status_is_ready ;; - mlflow_is_deployed_as_first_service) check_mlflow_is_deployed_as_first_service ;; intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;; can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;; diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index b29ab964c6..9af515e915 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -29,16 +29,6 @@ _summary: Check that the dss mlflow is deployed estimated_duration: 5s command: check_dss.sh mlflow_status_is_ready -id: dss/mlflow_deployed -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: dss/namespace -_summary: Check that the first service name is mlflow -estimated_duration: 5s -command: check_dss.sh mlflow_is_deployed_as_first_service - id: intel_gpu_plugin/install category_id: dss-regress flags: simple diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index e1c0028ace..13f0f09f79 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -5,7 +5,6 @@ include: dss/initialize dss/namespace dss/status_mlflow - dss/mlflow_deployed intel_gpu_plugin/install intel_gpu_plugin/daemonset_name intel_gpu_plugin/daemonset_number_available From e180c9a1d62183f9a31b140390df93921c759639 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 15:08:58 +0100 Subject: [PATCH 09/44] update other dss test-plans to use the GPU as resources --- .../checkbox-provider-dss/units/test-plan.pxu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 13f0f09f79..d2054bb8af 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -39,7 +39,6 @@ unit: test plan _name: IPEX validation testing plan include: opencl/ocl_device_check.* - intel_gpu/host_gpu_avail dss/initialize dss/namespace dss/status_mlflow @@ -60,13 +59,13 @@ bootstrap_include: com.canonical.certification::executable com.canonical.certification::snap com.canonical.certification::graphics_card + host_has_intel_gpus id: itex-validation unit: test plan _name: ITEX validation testing plan include: opencl/ocl_device_check.* - intel_gpu/host_gpu_avail dss/initialize dss/namespace dss/status_mlflow @@ -87,3 +86,4 @@ bootstrap_include: com.canonical.certification::executable com.canonical.certification::snap com.canonical.certification::graphics_card + host_has_intel_gpus From ac912620adf725e75684eea0ae1eca13370dfafe Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 15:11:05 +0100 Subject: [PATCH 10/44] reduce max_attempts for retry to 2 Since many tests here depend on some resources to be available, specifically: GPUs from Intel or NVIDIA, not all tests are expected to pass on a given machine and hence we should not waste our time too much retrying these tests. --- contrib/checkbox-dss-validation/bin/validate-intel-gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/checkbox-dss-validation/bin/validate-intel-gpu b/contrib/checkbox-dss-validation/bin/validate-intel-gpu index 5d026f1c1b..08f50af6b9 100755 --- a/contrib/checkbox-dss-validation/bin/validate-intel-gpu +++ b/contrib/checkbox-dss-validation/bin/validate-intel-gpu @@ -14,5 +14,5 @@ forced = yes [ui] type = silent auto_retry = yes -max_attempts = 10 +max_attempts = 2 delay_before_retry = 10 From 6d046e03c8330c6a2aab64e4b65c9b93e9511f43 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 15:30:22 +0100 Subject: [PATCH 11/44] add cpu-only tests for dss --- .../checkbox-provider-dss/bin/check_cpu.sh | 50 +++++++++++++++++++ .../checkbox-provider-dss/bin/check_dss.sh | 24 +++++++++ .../checkbox-provider-dss/units/jobs.pxu | 40 +++++++++++++++ .../checkbox-provider-dss/units/test-plan.pxu | 4 ++ 4 files changed, 118 insertions(+) create mode 100755 contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cpu.sh diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cpu.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cpu.sh new file mode 100755 index 0000000000..0e6ae72ba3 --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cpu.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +check_pytorch_can_use_cpu() { + echo "Starting PyTorch CPU test" + pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'pytorch-cpu\S*') + echo "Found PyTorch CPU pod: ${pod}" + script="import torch; print(torch.__version__)" + if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then + echo "PASS: PyTorch can use CPU" + exit 0 + else + >&2 echo "FAIL: PyTorch can't use CPU" + exit 1 + fi +} + +check_tensorflow_can_use_cpu() { + echo "Starting Tensorflow CPU test" + pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'tensorflow-cpu\S*') + echo "Found Tensorflow CPU pod: ${pod}" + script="import tensorflow as tf; print(tf.config.experimental.list_physical_devices())" + if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then + echo "PASS: Tensorflow can use CPU" + exit 0 + else + >&2 echo "FAIL: Tensorflow can't use CPU" + exit 1 + fi +} + +help_function() { + echo "This script is used for tests related to CUDA" + echo "Usage: check_dss.sh " + echo + echo "Test cases currently implemented:" + echo -e "\t: check_pytorch_can_use_cpu" + echo -e "\t: check_tensorflow_can_use_cpu" +} + +main() { + case ${1} in + pytorch_can_use_cpu) check_pytorch_can_use_cpu ;; + tensorflow_can_use_cpu) check_tensorflow_can_use_cpu ;; + *) help_function ;; + esac +} + +main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh index a71a6b0757..d70fe04c45 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh @@ -100,6 +100,26 @@ check_dss_can_create_tensorflow_cuda_notebook() { fi } +check_dss_can_create_pytorch_cpu_notebook() { + cd "${HOME}" + if dss create pytorch-cpu --image=pytorch; then + echo "Test success: successfully created pytorch-cpu notebook." + else + >&2 echo "Test failure: failed to create pytorch-cpu notebook." + exit 1 + fi +} + +check_dss_can_create_tensorflow_cpu_notebook() { + cd "${HOME}" + if dss create tensorflow-cpu --image=tensorflow; then + echo "Test success: successfully created tensorflow-cpu notebook." + else + >&2 echo "Test failure: failed to create tensorflow-cpu notebook." + exit 1 + fi +} + help_function() { echo "This script is used for generic tests related to DSS" @@ -115,6 +135,8 @@ help_function() { echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" echo -e "\t: check_dss_can_create_pytorch_cuda_notebook" echo -e "\t: check_dss_can_create_tensorflow_cuda_notebook" + echo -e "\t: check_dss_can_create_pytorch_cpu_notebook" + echo -e "\t: check_dss_can_create_tensorflow_cpu_notebook" } main() { @@ -128,6 +150,8 @@ main() { nvidia_gpu_acceleration_is_enabled) check_dss_has_nvidia_gpu_acceleration_enabled ;; can_create_pytorch_cuda_notebook) check_dss_can_create_pytorch_cuda_notebook ;; can_create_tensorflow_cuda_notebook) check_dss_can_create_tensorflow_cuda_notebook ;; + can_create_pytorch_cpu_notebook) check_dss_can_create_pytorch_cpu_notebook ;; + can_create_tensorflow_cpu_notebook) check_dss_can_create_tensorflow_cpu_notebook ;; *) help_function ;; esac } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 9af515e915..715ac5e984 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -29,6 +29,46 @@ _summary: Check that the dss mlflow is deployed estimated_duration: 5s command: check_dss.sh mlflow_status_is_ready +id: dss/create_pytorch_cpu_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/initialize +_summary: Check that an PyTorch CPU notebook can be successfully created +estimated_duration: 3m +command: check_dss.sh can_create_pytorch_cpu_notebook + +id: cpu/pytorch_can_use_cpu +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: dss/create_pytorch_cpu_notebook +_summary: Check PyTorch can use CPU +estimated_duration: 1m +command: check_cpu.sh pytorch_can_use_cpu + +id: dss/create_tensorflow_cpu_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/initialize +_summary: Check that an Tensorflow CPU notebook can be successfully created +estimated_duration: 3m +command: check_dss.sh can_create_tensorflow_cpu_notebook + +id: cpu/tensorflow_can_use_cpu +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'microk8s' +depends: dss/create_tensorflow_cpu_notebook +_summary: Check PyTorch can use CPU +estimated_duration: 1m +command: check_cpu.sh tensorflow_can_use_cpu + id: intel_gpu_plugin/install category_id: dss-regress flags: simple diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index d2054bb8af..8be8f1f46a 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -5,6 +5,10 @@ include: dss/initialize dss/namespace dss/status_mlflow + dss/create_pytorch_cpu_notebook + cpu/pytorch_can_use_cpu + dss/create_tensorflow_cpu_notebook + cpu/tensorflow_can_use_cpu intel_gpu_plugin/install intel_gpu_plugin/daemonset_name intel_gpu_plugin/daemonset_number_available From d722754860407107b970f7c9009da8e5a9cf93fa Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 19 Nov 2024 15:36:14 +0100 Subject: [PATCH 12/44] rename validate script to not contain intel and bump snap's version --- contrib/checkbox-dss-validation/README.md | 8 +++++--- .../bin/{validate-intel-gpu => validate-with-gpu} | 0 contrib/checkbox-dss-validation/snap/snapcraft.yaml | 7 +++---- contrib/checkbox-dss-validation/testflinger/job-def.yaml | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) rename contrib/checkbox-dss-validation/bin/{validate-intel-gpu => validate-with-gpu} (100%) diff --git a/contrib/checkbox-dss-validation/README.md b/contrib/checkbox-dss-validation/README.md index d2d06111c8..01ed79268a 100644 --- a/contrib/checkbox-dss-validation/README.md +++ b/contrib/checkbox-dss-validation/README.md @@ -1,12 +1,14 @@ # Welcome to the Checkbox DSS project! -This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap. +This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel and NVIDIA GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap. # Requirements - Ubuntu Jammy (22.04) - Supported hardware platforms: + - No GPUs - Intel platforms with recent GPU (>= Broadwell) + - Recent NVIDIA GPU # Installation @@ -19,7 +21,7 @@ lxd init --auto git clone https://github.com/canonical/checkbox cd checkbox/contrib/checkbox-dss-validation snapcraft -sudo snap install --dangerous --classic ./checkbox-dss_2.0_amd64.snap +sudo snap install --dangerous --classic ./checkbox-dss_3.0_amd64.snap ``` Make sure that the provider service is running and active: @@ -48,7 +50,7 @@ checkbox-dss.install-deps --dss-snap-channel=latest/edge To run the test plans: ```shell -checkbox-dss.validate-intel-gpu +checkbox-dss.validate-with-gpu ``` # Cleanup diff --git a/contrib/checkbox-dss-validation/bin/validate-intel-gpu b/contrib/checkbox-dss-validation/bin/validate-with-gpu similarity index 100% rename from contrib/checkbox-dss-validation/bin/validate-intel-gpu rename to contrib/checkbox-dss-validation/bin/validate-with-gpu diff --git a/contrib/checkbox-dss-validation/snap/snapcraft.yaml b/contrib/checkbox-dss-validation/snap/snapcraft.yaml index 9624696101..fffbe1ed18 100644 --- a/contrib/checkbox-dss-validation/snap/snapcraft.yaml +++ b/contrib/checkbox-dss-validation/snap/snapcraft.yaml @@ -2,7 +2,7 @@ name: checkbox-dss summary: Checkbox tests for validating the Data Science Stack description: | Collection of tests to be run on devices that are part of the dss-validation project -version: '2.0' +version: '3.0' confinement: classic grade: stable @@ -35,9 +35,9 @@ apps: configure: command-chain: [bin/wrapper_local] command: bin/configure - validate-intel-gpu: + validate-with-gpu: command-chain: [bin/wrapper_local] - command: bin/validate-intel-gpu + command: bin/validate-with-gpu remote-slave: command-chain: [bin/wrapper_local] command: bin/checkbox-cli-wrapper slave @@ -82,4 +82,3 @@ parts: config-variables: plugin: dump source: config/ - diff --git a/contrib/checkbox-dss-validation/testflinger/job-def.yaml b/contrib/checkbox-dss-validation/testflinger/job-def.yaml index 1534061d8b..a39f343d66 100644 --- a/contrib/checkbox-dss-validation/testflinger/job-def.yaml +++ b/contrib/checkbox-dss-validation/testflinger/job-def.yaml @@ -57,5 +57,5 @@ test_data: # Run tests ssh ubuntu@$DEVICE_IP ' - checkbox-dss.validate-intel-gpu + checkbox-dss.validate-with-gpu ' From a74159679a9b1b3278101a91fc77427a635ed43c Mon Sep 17 00:00:00 2001 From: Abdullah Date: Wed, 20 Nov 2024 10:20:34 +0100 Subject: [PATCH 13/44] refactor testflinger job file builder to unify into one re-usable one --- .../testflinger-contrib-dss-regression.yaml | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/.github/workflows/testflinger-contrib-dss-regression.yaml b/.github/workflows/testflinger-contrib-dss-regression.yaml index 217c2088f2..175252e808 100644 --- a/.github/workflows/testflinger-contrib-dss-regression.yaml +++ b/.github/workflows/testflinger-contrib-dss-regression.yaml @@ -26,30 +26,18 @@ jobs: - latest/stable - latest/edge queue: - - dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU - - dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU + - name: dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU + provision_data: "distro: jammy" + - name: dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU + provision_data: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso" steps: - name: Check out code uses: actions/checkout@v4 - - name: Build job file from template with maas2 provisioning - if: ${{ matrix.queue == 'dell-precision-3470-c30322' }} - env: - PROVISION_DATA: "distro: jammy" + - name: Build job file from template run: | sed -e "s|REPLACE_BRANCH|${BRANCH}|" \ - -e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \ - -e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \ - -e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \ - ${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \ - ${GITHUB_WORKSPACE}/job.yaml - - name: Build job file from template with oemscript provisioning - if: ${{ matrix.queue == 'dell-precision-5680-c31665' }} - env: - PROVISION_DATA: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso" - run: | - sed -e "s|REPLACE_BRANCH|${BRANCH}|" \ - -e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \ - -e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \ + -e "s|REPLACE_QUEUE|${{ matrix.queue.name }}|" \ + -e "s|REPLACE_PROVISION_DATA|${{ matrix.queue.provision_data }}|" \ -e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \ ${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \ ${GITHUB_WORKSPACE}/job.yaml From 18b591e0dc9387356945c88ea7d2fb455c336231 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Wed, 20 Nov 2024 11:11:50 +0100 Subject: [PATCH 14/44] add nvidia dgx as target machine for DSS testflinger jobs --- .github/workflows/testflinger-contrib-dss-regression.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/testflinger-contrib-dss-regression.yaml b/.github/workflows/testflinger-contrib-dss-regression.yaml index 175252e808..50d3204932 100644 --- a/.github/workflows/testflinger-contrib-dss-regression.yaml +++ b/.github/workflows/testflinger-contrib-dss-regression.yaml @@ -30,6 +30,8 @@ jobs: provision_data: "distro: jammy" - name: dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU provision_data: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso" + - name: nvidia-dgx-station-c25989 # NO iGPU + NVIDIA GPU + provision_data: "distro: jammy" steps: - name: Check out code uses: actions/checkout@v4 From 2d435815c27182daa51f87e5010409496070d37f Mon Sep 17 00:00:00 2001 From: Abdullah Date: Wed, 20 Nov 2024 13:28:57 +0100 Subject: [PATCH 15/44] allow other workflow jobs in matrix to continue running if one fails --- .github/workflows/testflinger-contrib-dss-regression.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/testflinger-contrib-dss-regression.yaml b/.github/workflows/testflinger-contrib-dss-regression.yaml index 50d3204932..f971611b0b 100644 --- a/.github/workflows/testflinger-contrib-dss-regression.yaml +++ b/.github/workflows/testflinger-contrib-dss-regression.yaml @@ -21,6 +21,7 @@ jobs: run: working-directory: contrib/checkbox-dss-validation strategy: + fail-fast: false matrix: dss_channel: - latest/stable From 8d290aeab8238c42f3a96e9615ef97d3542dc1b9 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 11:32:36 +0100 Subject: [PATCH 16/44] add notebook removal tests and rename cases to be consistent Notebook removal is part of the CLI of DSS anyway, and makes sense to be tested. Nevertheless, the main reason to add these tests is so that the entire checkbox test plan can be repeated without having to uninstall everything; removing notebook resets DSS into a re-testable state. --- .../checkbox-provider-dss/bin/check_dss.sh | 13 +++- .../checkbox-provider-dss/units/jobs.pxu | 68 +++++++++++++++++-- .../checkbox-provider-dss/units/test-plan.pxu | 10 ++- 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh index d70fe04c45..9c77f1a851 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh @@ -120,10 +120,19 @@ check_dss_can_create_tensorflow_cpu_notebook() { fi } +check_dss_can_remove_notebook() { + cd "${HOME}" + if dss remove "$1"; then + echo "Test success: successfully removed '$1' notebook." + else + >&2 echo "Test failure: failed to remove '$1' notebook." + exit 1 + fi +} help_function() { echo "This script is used for generic tests related to DSS" - echo "Usage: check_dss.sh " + echo "Usage: check_dss.sh [args]..." echo echo "Test cases currently implemented:" echo -e "\t: check_dss_can_be_initialized" @@ -137,6 +146,7 @@ help_function() { echo -e "\t: check_dss_can_create_tensorflow_cuda_notebook" echo -e "\t: check_dss_can_create_pytorch_cpu_notebook" echo -e "\t: check_dss_can_create_tensorflow_cpu_notebook" + echo -e "\t: check_dss_can_remove_notebook " } main() { @@ -152,6 +162,7 @@ main() { can_create_tensorflow_cuda_notebook) check_dss_can_create_tensorflow_cuda_notebook ;; can_create_pytorch_cpu_notebook) check_dss_can_create_pytorch_cpu_notebook ;; can_create_tensorflow_cpu_notebook) check_dss_can_create_tensorflow_cpu_notebook ;; + can_remove_notebook) check_dss_can_remove_notebook "$2" ;; *) help_function ;; esac } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 715ac5e984..5631f7b6bf 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -49,6 +49,16 @@ _summary: Check PyTorch can use CPU estimated_duration: 1m command: check_cpu.sh pytorch_can_use_cpu +id: dss/remove_pytorch_cpu_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_pytorch_cpu_notebook +_summary: Check that the PyTorch CPU notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "pytorch-cpu" + id: dss/create_tensorflow_cpu_notebook category_id: dss-regress flags: simple @@ -69,6 +79,16 @@ _summary: Check PyTorch can use CPU estimated_duration: 1m command: check_cpu.sh tensorflow_can_use_cpu +id: dss/remove_tensorflow_cpu_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_tensorflow_cpu_notebook +_summary: Check that the Tensorflow CPU notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "tensorflow-cpu" + id: intel_gpu_plugin/install category_id: dss-regress flags: simple @@ -159,7 +179,7 @@ _summary: Check that dss status reports that Intel GPU acceleration is enabled estimated_duration: 5s command: check_dss.sh intel_gpu_acceleration_is_enabled -id: dss/create_itex_2.15_notebook +id: dss/create_tensorflow_intel_notebook category_id: dss-regress flags: simple imports: from com.canonical.certification import executable @@ -174,7 +194,7 @@ category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'microk8s' -depends: dss/create_itex_2.15_notebook +depends: dss/create_tensorflow_intel_notebook _summary: Check to see if ITEX 2.15 can be imported estimated_duration: 1m command: check_itex.sh can_be_imported @@ -189,7 +209,17 @@ _summary: Check ITEX 2.15 GPU Availability estimated_duration: 1m command: check_itex.sh tensorflow_can_use_xpu -id: dss/create_ipex_2.1.20_notebook +id: dss/remove_tensorflow_intel_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_tensorflow_intel_notebook +_summary: Check that the Tensorflow Intel notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "itex-215-notebook" + +id: dss/create_pytorch_intel_notebook category_id: dss-regress flags: simple imports: from com.canonical.certification import executable @@ -204,7 +234,7 @@ category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'microk8s' -depends: dss/create_ipex_2.1.20_notebook +depends: dss/create_pytorch_intel_notebook _summary: Check to see if IPEX 2.1.20 can be imported estimated_duration: 1m command: check_ipex.sh can_be_imported @@ -219,6 +249,16 @@ _summary: Check IPEX 2.1.20 GPU availability estimated_duration: 1m command: check_ipex.sh pytorch_can_use_xpu +id: dss/remove_pytorch_intel_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_pytorch_intel_notebook +_summary: Check that the PyTorch Intel notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "ipex-2120-notebook" + id: nvidia_gpu_addon/enable category_id: dss-regress flags: simple @@ -269,6 +309,16 @@ _summary: Check PyTorch can use CUDA estimated_duration: 1m command: check_cuda.sh pytorch_can_use_cuda +id: dss/remove_pytorch_cuda_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_pytorch_cuda_notebook +_summary: Check that the PyTorch CUDA notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "pytorch-cuda" + id: dss/create_tensorflow_cuda_notebook category_id: dss-regress flags: simple @@ -288,3 +338,13 @@ depends: dss/create_tensorflow_cuda_notebook _summary: Check PyTorch can use CUDA estimated_duration: 1m command: check_cuda.sh tensorflow_can_use_cuda + +id: dss/remove_tensorflow_cuda_notebook +category_id: dss-regress +flags: simple +imports: from com.canonical.certification import executable +requires: executable.name == 'dss' +depends: dss/create_tensorflow_cuda_notebook +_summary: Check that the Tensorflow CUDA notebook can be removed +estimated_duration: 1m +command: check_dss.sh can_remove_notebook "tensorflow-cuda" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 8be8f1f46a..8f69658e5f 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -7,8 +7,10 @@ include: dss/status_mlflow dss/create_pytorch_cpu_notebook cpu/pytorch_can_use_cpu + dss/remove_pytorch_cpu_notebook dss/create_tensorflow_cpu_notebook cpu/tensorflow_can_use_cpu + dss/remove_tensorflow_cpu_notebook intel_gpu_plugin/install intel_gpu_plugin/daemonset_name intel_gpu_plugin/daemonset_number_available @@ -18,19 +20,23 @@ include: intel_gpu_plugin/node_gpu_capacity intel_gpu_plugin/node_gpu_allocatable dss/status_intel_gpu - dss/create_itex_2.15_notebook + dss/create_tensorflow_intel_notebook itex/itex_2.15_import itex/itex_2.15_gpu_avail - dss/create_ipex_2.1.20_notebook + dss/remove_tensorflow_intel_notebook + dss/create_pytorch_intel_notebook ipex/ipex_2.1.20_import ipex/ipex_2.1.20_gpu_avail + dss/remove_pytorch_intel_notebook nvidia_gpu_addon/enable nvidia_gpu_addon/validations_succeed dss/status_nvidia_gpu dss/create_pytorch_cuda_notebook cuda/pytorch_can_use_cuda + dss/remove_pytorch_cuda_notebook dss/create_tensorflow_cuda_notebook cuda/tensorflow_can_use_cuda + dss/remove_tensorflow_cuda_notebook bootstrap_include: com.canonical.certification::executable com.canonical.certification::snap From eb4c09eff0d9b8b5d94fd29e9b8390f4bd7cc43d Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 12:44:47 +0100 Subject: [PATCH 17/44] skip installing intel gpu plugin if it is already there --- .../checkbox-provider-dss/bin/check_intel.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh index 1501d09b22..961ff8237a 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh @@ -3,7 +3,12 @@ set -euxo pipefail check_intel_gpu_plugin_can_be_installed() { - # Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 + if microk8s.kubectl get daemonset.apps | grep -q "intel-gpu-plugin"; then + echo "Test success: 'intel-gpu-plugin' daemonset is already deployed!" + exit 0 + fi + + # NOTE: Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 # TODO: make version a param VERSION=v0.30.0 From b6202e1d840b690dbb112364085acfa2738aa590 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 13:23:02 +0100 Subject: [PATCH 18/44] remove unused itex- and ipex-only test plans --- .../checkbox-provider-dss/units/test-plan.pxu | 54 ------------------- 1 file changed, 54 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 8f69658e5f..49df936b7f 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -43,57 +43,3 @@ bootstrap_include: com.canonical.certification::graphics_card host_has_intel_gpus host_has_nvidia_gpus - -id: ipex-validation -unit: test plan -_name: IPEX validation testing plan -include: - opencl/ocl_device_check.* - dss/initialize - dss/namespace - dss/status_mlflow - dss/mlflow_deployed - intel_gpu_plugin/install - intel_gpu_plugin/daemonset_name - intel_gpu_plugin/daemonset_number_available - intel_gpu_plugin/daemonset_number_ready - intel_gpu_plugin/labels - intel_gpu_plugin/gpu_count - intel_gpu_plugin/node_gpu_capacity - intel_gpu_plugin/node_gpu_allocatable - dss/status_intel_gpu - dss/create_ipex_2.1.20_notebook - ipex/ipex_2.1.20_import - ipex/ipex_2.1.20_gpu_avail -bootstrap_include: - com.canonical.certification::executable - com.canonical.certification::snap - com.canonical.certification::graphics_card - host_has_intel_gpus - -id: itex-validation -unit: test plan -_name: ITEX validation testing plan -include: - opencl/ocl_device_check.* - dss/initialize - dss/namespace - dss/status_mlflow - dss/mlflow_deployed - intel_gpu_plugin/install - intel_gpu_plugin/daemonset_name - intel_gpu_plugin/daemonset_number_available - intel_gpu_plugin/daemonset_number_ready - intel_gpu_plugin/labels - intel_gpu_plugin/gpu_count - intel_gpu_plugin/node_gpu_capacity - intel_gpu_plugin/node_gpu_allocatable - dss/status_intel_gpu - dss/create_itex_2.15_notebook - itex/itex_2.15_import - itex/itex_2.15_gpu_avail -bootstrap_include: - com.canonical.certification::executable - com.canonical.certification::snap - com.canonical.certification::graphics_card - host_has_intel_gpus From 98359c54687dc32ed526798300e7731cf306de2e Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 13:29:55 +0100 Subject: [PATCH 19/44] rename check_dss.sh to check_dss for pseudo-fluent usage --- .../bin/{check_dss.sh => check_dss} | 0 .../checkbox-provider-dss/units/jobs.pxu | 34 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) rename contrib/checkbox-dss-validation/checkbox-provider-dss/bin/{check_dss.sh => check_dss} (100%) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss similarity index 100% rename from contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss.sh rename to contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 5631f7b6bf..31a1215405 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -7,7 +7,7 @@ requires: executable.name == 'microk8s' _summary: Check that the DSS environment initializes estimated_duration: 2m -command: check_dss.sh dss_can_be_initialized +command: check_dss dss_can_be_initialized id: dss/namespace category_id: dss-regress @@ -17,7 +17,7 @@ requires: executable.name == 'microk8s' depends: dss/initialize _summary: Check that the dss namespace is deployed estimated_duration: 5s -command: check_dss.sh dss_namespace_is_deployed +command: check_dss dss_namespace_is_deployed id: dss/status_mlflow category_id: dss-regress @@ -27,7 +27,7 @@ requires: executable.name == 'dss' depends: dss/namespace _summary: Check that the dss mlflow is deployed estimated_duration: 5s -command: check_dss.sh mlflow_status_is_ready +command: check_dss mlflow_status_is_ready id: dss/create_pytorch_cpu_notebook category_id: dss-regress @@ -37,7 +37,7 @@ requires: executable.name == 'dss' depends: dss/initialize _summary: Check that an PyTorch CPU notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_pytorch_cpu_notebook +command: check_dss can_create_pytorch_cpu_notebook id: cpu/pytorch_can_use_cpu category_id: dss-regress @@ -57,7 +57,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_cpu_notebook _summary: Check that the PyTorch CPU notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "pytorch-cpu" +command: check_dss can_remove_notebook "pytorch-cpu" id: dss/create_tensorflow_cpu_notebook category_id: dss-regress @@ -67,7 +67,7 @@ requires: executable.name == 'dss' depends: dss/initialize _summary: Check that an Tensorflow CPU notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_tensorflow_cpu_notebook +command: check_dss can_create_tensorflow_cpu_notebook id: cpu/tensorflow_can_use_cpu category_id: dss-regress @@ -87,7 +87,7 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_cpu_notebook _summary: Check that the Tensorflow CPU notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "tensorflow-cpu" +command: check_dss can_remove_notebook "tensorflow-cpu" id: intel_gpu_plugin/install category_id: dss-regress @@ -177,7 +177,7 @@ requires: executable.name == 'dss' depends: intel_gpu_plugin/node_gpu_allocatable _summary: Check that dss status reports that Intel GPU acceleration is enabled estimated_duration: 5s -command: check_dss.sh intel_gpu_acceleration_is_enabled +command: check_dss intel_gpu_acceleration_is_enabled id: dss/create_tensorflow_intel_notebook category_id: dss-regress @@ -187,7 +187,7 @@ requires: executable.name == 'dss' depends: dss/status_intel_gpu _summary: Check that an ITEX 2.15 notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_itex_215_notebook +command: check_dss can_create_itex_215_notebook id: itex/itex_2.15_import category_id: dss-regress @@ -217,7 +217,7 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_intel_notebook _summary: Check that the Tensorflow Intel notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "itex-215-notebook" +command: check_dss can_remove_notebook "itex-215-notebook" id: dss/create_pytorch_intel_notebook category_id: dss-regress @@ -227,7 +227,7 @@ requires: executable.name == 'dss' depends: dss/status_intel_gpu _summary: Check that an IPEX 2.1.20 notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_ipex_2120_notebook +command: check_dss can_create_ipex_2120_notebook id: ipex/ipex_2.1.20_import category_id: dss-regress @@ -257,7 +257,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_intel_notebook _summary: Check that the PyTorch Intel notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "ipex-2120-notebook" +command: check_dss can_remove_notebook "ipex-2120-notebook" id: nvidia_gpu_addon/enable category_id: dss-regress @@ -287,7 +287,7 @@ requires: executable.name == 'dss' depends: nvidia_gpu_addon/validations_succeed _summary: Check that dss status reports that NVIDIA GPU acceleration is enabled estimated_duration: 5s -command: check_dss.sh nvidia_gpu_acceleration_is_enabled +command: check_dss nvidia_gpu_acceleration_is_enabled id: dss/create_pytorch_cuda_notebook category_id: dss-regress @@ -297,7 +297,7 @@ requires: executable.name == 'dss' depends: dss/status_nvidia_gpu _summary: Check that an PyTorch CUDA notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_pytorch_cuda_notebook +command: check_dss can_create_pytorch_cuda_notebook id: cuda/pytorch_can_use_cuda category_id: dss-regress @@ -317,7 +317,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_cuda_notebook _summary: Check that the PyTorch CUDA notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "pytorch-cuda" +command: check_dss can_remove_notebook "pytorch-cuda" id: dss/create_tensorflow_cuda_notebook category_id: dss-regress @@ -327,7 +327,7 @@ requires: executable.name == 'dss' depends: dss/status_nvidia_gpu _summary: Check that an Tensorflow CUDA notebook can be successfully created estimated_duration: 3m -command: check_dss.sh can_create_tensorflow_cuda_notebook +command: check_dss can_create_tensorflow_cuda_notebook id: cuda/tensorflow_can_use_cuda category_id: dss-regress @@ -347,4 +347,4 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_cuda_notebook _summary: Check that the Tensorflow CUDA notebook can be removed estimated_duration: 1m -command: check_dss.sh can_remove_notebook "tensorflow-cuda" +command: check_dss can_remove_notebook "tensorflow-cuda" From 2971237b3ad0ba51eecb583c4416bd985bdfc8fc Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 13:55:47 +0100 Subject: [PATCH 20/44] refactor remove notebook test to accept multiple arguments --- .../checkbox-provider-dss/bin/check_dss | 8 ++++---- .../checkbox-provider-dss/units/jobs.pxu | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index 9c77f1a851..ac9d14a359 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -122,10 +122,10 @@ check_dss_can_create_tensorflow_cpu_notebook() { check_dss_can_remove_notebook() { cd "${HOME}" - if dss remove "$1"; then - echo "Test success: successfully removed '$1' notebook." + if dss remove "$@"; then + echo "Test success: successfully removed notebook with '$*'." else - >&2 echo "Test failure: failed to remove '$1' notebook." + >&2 echo "Test failure: failed to remove notebook with '$*'." exit 1 fi } @@ -162,7 +162,7 @@ main() { can_create_tensorflow_cuda_notebook) check_dss_can_create_tensorflow_cuda_notebook ;; can_create_pytorch_cpu_notebook) check_dss_can_create_pytorch_cpu_notebook ;; can_create_tensorflow_cpu_notebook) check_dss_can_create_tensorflow_cpu_notebook ;; - can_remove_notebook) check_dss_can_remove_notebook "$2" ;; + can_remove_notebook) check_dss_can_remove_notebook "${@:2}" ;; *) help_function ;; esac } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 31a1215405..a20e652ef9 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -57,7 +57,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_cpu_notebook _summary: Check that the PyTorch CPU notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "pytorch-cpu" +command: check_dss can_remove_notebook pytorch-cpu id: dss/create_tensorflow_cpu_notebook category_id: dss-regress @@ -87,7 +87,7 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_cpu_notebook _summary: Check that the Tensorflow CPU notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "tensorflow-cpu" +command: check_dss can_remove_notebook tensorflow-cpu id: intel_gpu_plugin/install category_id: dss-regress @@ -217,7 +217,7 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_intel_notebook _summary: Check that the Tensorflow Intel notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "itex-215-notebook" +command: check_dss can_remove_notebook itex-215-notebook id: dss/create_pytorch_intel_notebook category_id: dss-regress @@ -257,7 +257,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_intel_notebook _summary: Check that the PyTorch Intel notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "ipex-2120-notebook" +command: check_dss can_remove_notebook ipex-2120-notebook id: nvidia_gpu_addon/enable category_id: dss-regress @@ -317,7 +317,7 @@ requires: executable.name == 'dss' depends: dss/create_pytorch_cuda_notebook _summary: Check that the PyTorch CUDA notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "pytorch-cuda" +command: check_dss can_remove_notebook pytorch-cuda id: dss/create_tensorflow_cuda_notebook category_id: dss-regress @@ -347,4 +347,4 @@ requires: executable.name == 'dss' depends: dss/create_tensorflow_cuda_notebook _summary: Check that the Tensorflow CUDA notebook can be removed estimated_duration: 1m -command: check_dss can_remove_notebook "tensorflow-cuda" +command: check_dss can_remove_notebook tensorflow-cuda From b73a2818cd27e8b76bdc8dbfe5350d804ea047ba Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 13:59:21 +0100 Subject: [PATCH 21/44] extract out notebook creation to reused function --- .../checkbox-provider-dss/bin/check_dss | 71 ++----------------- .../checkbox-provider-dss/units/jobs.pxu | 12 ++-- 2 files changed, 11 insertions(+), 72 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index ac9d14a359..d3cf774668 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -49,22 +49,12 @@ check_dss_has_intel_gpu_acceleration_enabled() { fi } -check_dss_can_create_itex_215_notebook() { +check_dss_can_create_notebook() { cd "${HOME}" - if dss create itex-215-notebook --image=intel/intel-extension-for-tensorflow:2.15.0-xpu-idp-jupyter; then - echo "Test success: successfully created an ITEX 2.15 notebook." + if dss create "${@}"; then + echo "Test success: successfully created notebook with '$*'." else - >&2 echo "Test failure: failed to create an ITEX 2.15 notebook." - exit 1 - fi -} - -check_dss_can_create_ipex_2120_notebook() { - cd "${HOME}" - if dss create ipex-2120-notebook --image=intel/intel-extension-for-pytorch:2.1.20-xpu-idp-jupyter; then - echo "Test success: successfully created an IPEX 2.1.20 notebook." - else - >&2 echo "Test failure: failed to create an IPEX 2.1.20 notebook." + >&2 echo "Test failure: failed to create notebook with '$*'." exit 1 fi } @@ -80,46 +70,6 @@ check_dss_has_nvidia_gpu_acceleration_enabled() { fi } -check_dss_can_create_pytorch_cuda_notebook() { - cd "${HOME}" - if dss create pytorch-cuda --image=pytorch-cuda; then - echo "Test success: successfully created pytorch-cuda notebook." - else - >&2 echo "Test failure: failed to create pytorch-cuda notebook." - exit 1 - fi -} - -check_dss_can_create_tensorflow_cuda_notebook() { - cd "${HOME}" - if dss create tensorflow-cuda --image=tensorflow-cuda; then - echo "Test success: successfully created tensorflow-cuda notebook." - else - >&2 echo "Test failure: failed to create tensorflow-cuda notebook." - exit 1 - fi -} - -check_dss_can_create_pytorch_cpu_notebook() { - cd "${HOME}" - if dss create pytorch-cpu --image=pytorch; then - echo "Test success: successfully created pytorch-cpu notebook." - else - >&2 echo "Test failure: failed to create pytorch-cpu notebook." - exit 1 - fi -} - -check_dss_can_create_tensorflow_cpu_notebook() { - cd "${HOME}" - if dss create tensorflow-cpu --image=tensorflow; then - echo "Test success: successfully created tensorflow-cpu notebook." - else - >&2 echo "Test failure: failed to create tensorflow-cpu notebook." - exit 1 - fi -} - check_dss_can_remove_notebook() { cd "${HOME}" if dss remove "$@"; then @@ -139,13 +89,7 @@ help_function() { echo -e "\t: check_dss_namespace_is_deployed" echo -e "\t: check_mlflow_status_is_ready" echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" - echo -e "\t: check_dss_can_create_itex_215_notebook" - echo -e "\t: check_dss_can_create_ipex_2120_notebook" echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" - echo -e "\t: check_dss_can_create_pytorch_cuda_notebook" - echo -e "\t: check_dss_can_create_tensorflow_cuda_notebook" - echo -e "\t: check_dss_can_create_pytorch_cpu_notebook" - echo -e "\t: check_dss_can_create_tensorflow_cpu_notebook" echo -e "\t: check_dss_can_remove_notebook " } @@ -155,13 +99,8 @@ main() { dss_namespace_is_deployed) check_dss_namespace_is_deployed ;; mlflow_status_is_ready) check_mlflow_status_is_ready ;; intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; - can_create_itex_215_notebook) check_dss_can_create_itex_215_notebook ;; - can_create_ipex_2120_notebook) check_dss_can_create_ipex_2120_notebook ;; nvidia_gpu_acceleration_is_enabled) check_dss_has_nvidia_gpu_acceleration_enabled ;; - can_create_pytorch_cuda_notebook) check_dss_can_create_pytorch_cuda_notebook ;; - can_create_tensorflow_cuda_notebook) check_dss_can_create_tensorflow_cuda_notebook ;; - can_create_pytorch_cpu_notebook) check_dss_can_create_pytorch_cpu_notebook ;; - can_create_tensorflow_cpu_notebook) check_dss_can_create_tensorflow_cpu_notebook ;; + can_create_notebook) check_dss_can_create_notebook "${@:2}" ;; can_remove_notebook) check_dss_can_remove_notebook "${@:2}" ;; *) help_function ;; esac diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index a20e652ef9..8ca41b4f93 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -37,7 +37,7 @@ requires: executable.name == 'dss' depends: dss/initialize _summary: Check that an PyTorch CPU notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_pytorch_cpu_notebook +command: check_dss can_create_notebook pytorch-cpu --image=pytorch id: cpu/pytorch_can_use_cpu category_id: dss-regress @@ -67,7 +67,7 @@ requires: executable.name == 'dss' depends: dss/initialize _summary: Check that an Tensorflow CPU notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_tensorflow_cpu_notebook +command: check_dss can_create_notebook tensorflow-cpu --image=tensorflow id: cpu/tensorflow_can_use_cpu category_id: dss-regress @@ -187,7 +187,7 @@ requires: executable.name == 'dss' depends: dss/status_intel_gpu _summary: Check that an ITEX 2.15 notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_itex_215_notebook +command: check_dss can_create_notebook itex-215-notebook --image tensorflow-intel id: itex/itex_2.15_import category_id: dss-regress @@ -227,7 +227,7 @@ requires: executable.name == 'dss' depends: dss/status_intel_gpu _summary: Check that an IPEX 2.1.20 notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_ipex_2120_notebook +command: check_dss can_create_notebook ipex-2120-notebook --image pytorch-intel id: ipex/ipex_2.1.20_import category_id: dss-regress @@ -297,7 +297,7 @@ requires: executable.name == 'dss' depends: dss/status_nvidia_gpu _summary: Check that an PyTorch CUDA notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_pytorch_cuda_notebook +command: check_dss can_create_notebook pytorch-cuda --image=pytorch-cuda id: cuda/pytorch_can_use_cuda category_id: dss-regress @@ -327,7 +327,7 @@ requires: executable.name == 'dss' depends: dss/status_nvidia_gpu _summary: Check that an Tensorflow CUDA notebook can be successfully created estimated_duration: 3m -command: check_dss can_create_tensorflow_cuda_notebook +command: check_dss can_create_notebook tensorflow-cuda --image=tensorflow-cuda id: cuda/tensorflow_can_use_cuda category_id: dss-regress From d13de27e422222bdd215adc0d146a992d031bc95 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 26 Nov 2024 17:05:02 +0100 Subject: [PATCH 22/44] disable intel gpu capacity tests temporarily the tests fail on re-runs because they start counting nvidia gpus too --- .../checkbox-provider-dss/bin/check_intel.sh | 2 ++ .../checkbox-provider-dss/units/jobs.pxu | 5 ++++- .../checkbox-provider-dss/units/test-plan.pxu | 5 +++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh index 961ff8237a..5eee7ba96e 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh @@ -72,6 +72,7 @@ check_intel_gpu_node_label_is_attached() { } check_at_least_one_intel_gpu_is_available() { + # FIXME: this test also counts NVIDIA GPUs result=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') if [ "${result}" -ge 1 ]; then echo "Test success: Found ${result} GPUs on system." @@ -82,6 +83,7 @@ check_at_least_one_intel_gpu_is_available() { } check_capacity_slots_for_intel_gpus_match() { + # FIXME: this test fails because it also counts NVIDIA GPUs once their plugin is enabled num_gpus=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') result=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}') # IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 8ca41b4f93..88745728d9 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -174,7 +174,10 @@ category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'dss' -depends: intel_gpu_plugin/node_gpu_allocatable +# FIXME: change back after re-enabling the tests for intel gpu counts +# FIXME: revisit dependency, it must also depend on dss/initialize +# depends: intel_gpu_plugin/node_gpu_allocatable +depends: intel_gpu_plugin/gpu_count _summary: Check that dss status reports that Intel GPU acceleration is enabled estimated_duration: 5s command: check_dss intel_gpu_acceleration_is_enabled diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 49df936b7f..6458024e8f 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -17,8 +17,9 @@ include: intel_gpu_plugin/daemonset_number_ready intel_gpu_plugin/labels intel_gpu_plugin/gpu_count - intel_gpu_plugin/node_gpu_capacity - intel_gpu_plugin/node_gpu_allocatable + # FIXME: re-enable after fixing the tests for re-runs + # intel_gpu_plugin/node_gpu_capacity + # intel_gpu_plugin/node_gpu_allocatable dss/status_intel_gpu dss/create_tensorflow_intel_notebook itex/itex_2.15_import From 99bb957f7d36331a5614526c0657beb112cd28cb Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 15:14:11 +0100 Subject: [PATCH 23/44] rename test case for dss to be more fluid --- .../checkbox-provider-dss/bin/check_dss | 8 ++++---- .../checkbox-provider-dss/units/jobs.pxu | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index d3cf774668..3c85bcae44 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -85,8 +85,8 @@ help_function() { echo "Usage: check_dss.sh [args]..." echo echo "Test cases currently implemented:" - echo -e "\t: check_dss_can_be_initialized" - echo -e "\t: check_dss_namespace_is_deployed" + echo -e "\t: check_dss_can_be_initialized" + echo -e "\t: check_dss_namespace_is_deployed" echo -e "\t: check_mlflow_status_is_ready" echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" @@ -95,8 +95,8 @@ help_function() { main() { case ${1} in - dss_can_be_initialized) check_dss_can_be_initialized ;; - dss_namespace_is_deployed) check_dss_namespace_is_deployed ;; + can_be_initialized) check_dss_can_be_initialized ;; + namespace_is_deployed) check_dss_namespace_is_deployed ;; mlflow_status_is_ready) check_mlflow_status_is_ready ;; intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; nvidia_gpu_acceleration_is_enabled) check_dss_has_nvidia_gpu_acceleration_enabled ;; diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 88745728d9..7d23f9adb8 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -7,7 +7,7 @@ requires: executable.name == 'microk8s' _summary: Check that the DSS environment initializes estimated_duration: 2m -command: check_dss dss_can_be_initialized +command: check_dss can_be_initialized id: dss/namespace category_id: dss-regress @@ -17,7 +17,7 @@ requires: executable.name == 'microk8s' depends: dss/initialize _summary: Check that the dss namespace is deployed estimated_duration: 5s -command: check_dss dss_namespace_is_deployed +command: check_dss namespace_is_deployed id: dss/status_mlflow category_id: dss-regress From adfe2cdb82796fb9d1bf62244840fa52ea128ec7 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 15:28:34 +0100 Subject: [PATCH 24/44] refactor checking dss status into reusable function --- .../checkbox-provider-dss/bin/check_dss | 36 ++++--------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index 3c85bcae44..ba4df331fb 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -27,24 +27,13 @@ check_dss_namespace_is_deployed() { fi } -check_mlflow_status_is_ready() { +check_dss_status_contains() { cd "${HOME}" result=$(dss status) # save result to shell var to avoid broken pipe error - if echo "${result}" | grep -q "MLflow deployment: Ready"; then - echo "Test success: 'dss status' shows ready status for mlflow." + if echo "${result}" | grep -q "${1}"; then + echo "Test success: 'dss status' shows '$1'." else - >&2 echo "Test failure: 'dss status' does not show ready status for mlflow." - exit 1 - fi -} - -check_dss_has_intel_gpu_acceleration_enabled() { - cd "${HOME}" - result=$(dss status) # save result to shell var to avoid broken pipe error - if echo "${result}" | grep -q "Intel GPU acceleration: Enabled"; then - echo "Test success: 'dss status' correctly reports Intel GPU status." - else - >&2 echo "Test failure: 'dss status' does not report that Intel GPU acceleration is enabled." + >&2 echo "Test failure: 'dss status' does not show '$1'." exit 1 fi } @@ -59,17 +48,6 @@ check_dss_can_create_notebook() { fi } -check_dss_has_nvidia_gpu_acceleration_enabled() { - cd "${HOME}" - result=$(dss status) # save result to shell var to avoid broken pipe error - if echo "${result}" | grep -q "NVIDIA GPU acceleration: Enabled.*"; then - echo "Test success: 'dss status' correctly reports NVIDIA GPU status." - else - >&2 echo "Test failure: 'dss status' does not report that NVIDIA GPU acceleration is enabled." - exit 1 - fi -} - check_dss_can_remove_notebook() { cd "${HOME}" if dss remove "$@"; then @@ -97,9 +75,9 @@ main() { case ${1} in can_be_initialized) check_dss_can_be_initialized ;; namespace_is_deployed) check_dss_namespace_is_deployed ;; - mlflow_status_is_ready) check_mlflow_status_is_ready ;; - intel_gpu_acceleration_is_enabled) check_dss_has_intel_gpu_acceleration_enabled ;; - nvidia_gpu_acceleration_is_enabled) check_dss_has_nvidia_gpu_acceleration_enabled ;; + mlflow_status_is_ready) check_dss_status_contains "MLflow deployment: Ready" ;; + intel_gpu_acceleration_is_enabled) check_dss_status_contains "Intel GPU acceleration: Enabled.*" ;; + nvidia_gpu_acceleration_is_enabled) check_dss_status_contains "NVIDIA GPU acceleration: Enabled.*" ;; can_create_notebook) check_dss_can_create_notebook "${@:2}" ;; can_remove_notebook) check_dss_can_remove_notebook "${@:2}" ;; *) help_function ;; From 89bfdca14795663beed9daaee23660304e496bad Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 15:29:53 +0100 Subject: [PATCH 25/44] add missing usage string for dss create notebook function --- .../checkbox-dss-validation/checkbox-provider-dss/bin/check_dss | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index ba4df331fb..93807fd9e8 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -68,6 +68,7 @@ help_function() { echo -e "\t: check_mlflow_status_is_ready" echo -e "\t: check_dss_has_intel_gpu_acceleration_enabled" echo -e "\t: check_dss_has_nvidia_gpu_acceleration_enabled" + echo -e "\t: check_dss_can_create_notebook [args]" echo -e "\t: check_dss_can_remove_notebook " } From 12fca77987f98df0b3b6a10745b9f4d3246bdd62 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Mon, 25 Nov 2024 15:33:30 +0100 Subject: [PATCH 26/44] use pushd popd instead of cd-ing to HOME in check dss --- .../checkbox-provider-dss/bin/check_dss | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss index 93807fd9e8..8a75be22a0 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_dss @@ -13,7 +13,6 @@ export -n PYTHONHOME PYTHONPATH PYTHONUSERBASE check_dss_can_be_initialized() { # TODO: we actually seem to initialize dss here; maybe split it out - cd "${HOME}" dss initialize --kubeconfig="$(sudo microk8s config)" echo "Test success: dss initialized." } @@ -28,7 +27,6 @@ check_dss_namespace_is_deployed() { } check_dss_status_contains() { - cd "${HOME}" result=$(dss status) # save result to shell var to avoid broken pipe error if echo "${result}" | grep -q "${1}"; then echo "Test success: 'dss status' shows '$1'." @@ -39,7 +37,6 @@ check_dss_status_contains() { } check_dss_can_create_notebook() { - cd "${HOME}" if dss create "${@}"; then echo "Test success: successfully created notebook with '$*'." else @@ -49,7 +46,6 @@ check_dss_can_create_notebook() { } check_dss_can_remove_notebook() { - cd "${HOME}" if dss remove "$@"; then echo "Test success: successfully removed notebook with '$*'." else @@ -73,6 +69,7 @@ help_function() { } main() { + pushd "${HOME}" case ${1} in can_be_initialized) check_dss_can_be_initialized ;; namespace_is_deployed) check_dss_namespace_is_deployed ;; @@ -83,6 +80,7 @@ main() { can_remove_notebook) check_dss_can_remove_notebook "${@:2}" ;; *) help_function ;; esac + popd } main "$@" From 6e051f30c2157408d300d87b5bd278219eeb976c Mon Sep 17 00:00:00 2001 From: Abdullah Date: Tue, 26 Nov 2024 16:14:21 +0100 Subject: [PATCH 27/44] rename check_cuda.sh to check_cuda to have a pseudo-fluent usage --- .../bin/{check_cuda.sh => check_cuda} | 2 +- .../checkbox-provider-dss/units/jobs.pxu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename contrib/checkbox-dss-validation/checkbox-provider-dss/bin/{check_cuda.sh => check_cuda} (98%) diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda similarity index 98% rename from contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh rename to contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda index 20dbc635ca..64995c52d6 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda.sh +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda @@ -16,7 +16,7 @@ check_nvidia_gpu_addon_can_be_enabled() { echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out." sleep ${SLEEP_SECS} microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-device-plugin-daemonset - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out." + echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations have rolled out." sleep ${SLEEP_SECS} echo "[INFO]: Waiting for the GPU validations to rollout" microk8s.kubectl -n gpu-operator-resources rollout status ds/nvidia-operator-validator diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 7d23f9adb8..80d6d7aae6 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -270,7 +270,7 @@ requires: (host_has_nvidia_gpus.available == 'true' and executable.name == 'micr depends: dss/initialize _summary: Enable NVIDIA GPU addon estimated_duration: 5m -command: check_cuda.sh gpu_addon_can_be_enabled +command: check_cuda gpu_addon_can_be_enabled id: nvidia_gpu_addon/validations_succeed category_id: dss-regress @@ -280,7 +280,7 @@ requires: executable.name == 'microk8s' depends: nvidia_gpu_addon/enable _summary: NVIDIA GPU validations should succeed estimated_duration: 10s -command: check_cuda.sh gpu_validations_succeed +command: check_cuda gpu_validations_succeed id: dss/status_nvidia_gpu category_id: dss-regress @@ -310,7 +310,7 @@ requires: executable.name == 'microk8s' depends: dss/create_pytorch_cuda_notebook _summary: Check PyTorch can use CUDA estimated_duration: 1m -command: check_cuda.sh pytorch_can_use_cuda +command: check_cuda pytorch_can_use_cuda id: dss/remove_pytorch_cuda_notebook category_id: dss-regress @@ -340,7 +340,7 @@ requires: executable.name == 'microk8s' depends: dss/create_tensorflow_cuda_notebook _summary: Check PyTorch can use CUDA estimated_duration: 1m -command: check_cuda.sh tensorflow_can_use_cuda +command: check_cuda tensorflow_can_use_cuda id: dss/remove_tensorflow_cuda_notebook category_id: dss-regress From 8e1f358b6c12d83dd4bff245d1368073faefbe69 Mon Sep 17 00:00:00 2001 From: Abdullah Date: Wed, 27 Nov 2024 12:45:38 +0100 Subject: [PATCH 28/44] refactor cuda notebook tests to reusable script --- .../checkbox-provider-dss/bin/check_cuda | 34 -------------- .../checkbox-provider-dss/bin/check_notebook | 47 +++++++++++++++++++ .../checkbox-provider-dss/units/jobs.pxu | 6 +-- 3 files changed, 50 insertions(+), 37 deletions(-) create mode 100755 contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda index 64995c52d6..adf913f557 100755 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_cuda @@ -2,8 +2,6 @@ set -euxo pipefail -SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) - check_nvidia_gpu_addon_can_be_enabled() { # TODO: enable changing GPU_OPERATOR_VERSION GPU_OPERATOR_VERSION=24.6.2 @@ -36,34 +34,6 @@ check_nvidia_gpu_validations_succeed() { fi } -check_pytorch_can_use_cuda() { - echo "Starting PyTorch CUDA test" - pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'pytorch-cuda\S*') - echo "Found PyTorch CUDA pod: ${pod}" - script="import torch; assert torch.cuda.is_available(), 'CUDA is not available'" - if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then - echo "PASS: PyTorch can use CUDA" - exit 0 - else - >&2 echo "FAIL: PyTorch can't use CUDA" - exit 1 - fi -} - -check_tensorflow_can_use_cuda() { - echo "Starting Tensorflow CUDA test" - pod=$(microk8s.kubectl get pods -n dss --field-selector=status.phase==Running -o=jsonpath='{.items..metadata.name}' | grep -o 'tensorflow-cuda\S*') - echo "Found Tensorflow CUDA pod: ${pod}" - script="$(cat "$SCRIPT_DIR/tensorflow_can_use_cuda.py")" - if microk8s.kubectl -n dss exec "$pod" -- python3 -c "$script"; then - echo "PASS: Tensorflow can use CUDA" - exit 0 - else - >&2 echo "FAIL: Tensorflow can't use CUDA" - exit 1 - fi -} - help_function() { echo "This script is used for tests related to CUDA" echo "Usage: check_dss.sh " @@ -71,16 +41,12 @@ help_function() { echo "Test cases currently implemented:" echo -e "\t: check_nvidia_gpu_addon_can_be_enabled" echo -e "\t: check_nvidia_gpu_validations_succeed" - echo -e "\t: check_pytorch_can_use_cuda" - echo -e "\t: check_tensorflow_can_use_cuda" } main() { case ${1} in gpu_addon_can_be_enabled) check_nvidia_gpu_addon_can_be_enabled ;; gpu_validations_succeed) check_nvidia_gpu_validations_succeed ;; - pytorch_can_use_cuda) check_pytorch_can_use_cuda ;; - tensorflow_can_use_cuda) check_tensorflow_can_use_cuda ;; *) help_function ;; esac } diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook new file mode 100755 index 0000000000..42d15d031e --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_notebook @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +pytorch_can_use_cuda_script="import torch; assert torch.cuda.is_available(), 'CUDA is not available'" +tensorflow_can_use_cuda_script="$(cat "$SCRIPT_DIR/tensorflow_can_use_cuda.py")" + +check_notebook_can_run_python_script_in_pod() { + if microk8s.kubectl -n dss exec "$1" -- python -c "$2"; then + echo "Test success: in pod $1" + else + err_code=$? + >&2 echo "Test failed: in pod $1 with error code ${err_code}" + exit $err_code + fi +} + +help_function() { + echo "This script is used for tests related to CUDA" + echo "Usage: check_notebook [args]..." + echo + echo "Test cases currently implemented:" + echo -e "\t: check_notebook_can_run_python_script_in_pod