RHOAIENG-16076: tests(gha): run Makefile tests on opendatahub-io/notebooks GitHub Actions (#775)

* RHOAIENG-16076: tests(gha): run Makefile tests in GitHub Actions
* fixup, looks like I lost the second changed line from #761 (comment) when merging the work
* fixup, linter wants space in the comments; IntelliJ is ok with it, so let's do that
* fixup, add reference to OpenShift CI for the source of the make invocations
* fixup, the IfNotPresent pull policy (for PR checks without image registry) and the symbolic links apparently needed to deploy rocm stuff
opendatahub-io · Nov 28, 2024 · b3d8af0 · b3d8af0
1 parent fb6e1b9
commit b3d8af0
Show file tree

Hide file tree

Showing 7 changed files with 309 additions and 11 deletions.
diff --git a/.github/workflows/build-notebooks-TEMPLATE.yaml b/.github/workflows/build-notebooks-TEMPLATE.yaml
@@ -46,6 +46,8 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
+      # region Free up disk space
+
       - name: Free up additional disk space
         # https://docs.github.com/en/actions/learn-github-actions/expressions
         if: "${{ contains(inputs.target, 'rocm') || contains(inputs.target, 'cuda') || contains(inputs.target, 'intel') ||
@@ -86,6 +88,10 @@ jobs:
           df -h
           free -h
 
+      # endregion
+
+      # region Podman setup
+
       # https://github.com/containers/buildah/issues/2521#issuecomment-884779112
       - name: Workaround https://github.com/containers/podman/issues/22152#issuecomment-2027705598
         run: sudo apt-get -qq remove podman crun
@@ -156,6 +162,10 @@ jobs:
           echo "IMAGE_TAG=${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
           echo "OUTPUT_IMAGE=${{ env.IMAGE_REGISTRY}}:${{ inputs.target }}-${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
 
+      # endregion
+
+      # region Trivy init & DB pre-pull
+
       - name: "pull_request|schedule: resolve target if Trivy scan should run"
         id: resolve-target
         if: ${{ fromJson(inputs.github).event_name == 'pull_request' || fromJson(inputs.github).event_name == 'schedule' }}
@@ -210,6 +220,10 @@ jobs:
               image \
               --download-java-db-only
 
+      # endregion
+
+      # region Image build
+
       # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#push
       - name: "push|schedule: make ${{ inputs.target }}"
         run: |
@@ -235,10 +249,34 @@ jobs:
       - name: "Show podman images information"
         run: podman images --digests
 
+      # endregion
+
+      # region Makefile image tests
+
       - name: "Check if we have tests or not"
         id: have-tests
         run: "ci/cached-builds/has_tests.py --target ${{ inputs.target }}"
 
+      # Rewrites every statefulset.yaml and pod.yaml found under the checkout so that
+      # `imagePullPolicy: Always` becomes `imagePullPolicy: IfNotPresent`, then prints
+      # the resulting modifications into the job log with `git diff`.
+      - name: "Change pull policy to IfNotPresent"
+        run: |
+          set -Eeuxo pipefail
+
+          find . \( -name "statefulset.yaml" -o -name "pod.yaml" \) -type f -exec \
+            sed -i'' 's/imagePullPolicy: Always/imagePullPolicy: IfNotPresent/g' {} \;
+          git diff
+
+      # The rocm deploy targets look for manifests under runtimes/rocm/<framework>/...,
+      # as the failing log excerpt below shows. The step creates symbolic links so that
+      # runtimes/rocm/tensorflow and runtimes/rocm/pytorch resolve to the sibling
+      # runtimes/rocm-tensorflow and runtimes/rocm-pytorch directories.
+      #
+      # [INFO] Running command (('make deploy9-runtimes-rocm-tensorflow-ubi9-python-3.11',), {'shell': True})
+      # Deploying notebook from runtimes/rocm/tensorflow/ubi9-python-3.11/kustomize/base directory...
+      # sed: can't read runtimes/rocm/tensorflow/ubi9-python-3.11/kustomize/base/kustomization.yaml: No such file or directory
+      - name: "Fixup paths that prevent us from running rocm tests"
+        if: ${{ steps.have-tests.outputs.tests == 'true' }}
+        run: |
+          set -Eeuxo pipefail
+
+          mkdir -p runtimes/rocm
+          ln -s ../rocm-tensorflow runtimes/rocm/tensorflow
+          ln -s ../rocm-pytorch runtimes/rocm/pytorch
+
       # https://cri-o.io/
       - name: Install cri-o
         if: ${{ steps.have-tests.outputs.tests == 'true' }}
@@ -288,11 +326,11 @@ jobs:
 
       # do this early, it's a good check that cri-o is not completely broken
       - name: "Show crio images information"
-        if: ${{ steps.have-tests.outputs.tests == 'true'  }}
+        if: ${{ steps.have-tests.outputs.tests == 'true' }}
         run: sudo crictl images
 
       - name: Install Kubernetes cluster
-        if: ${{ steps.have-tests.outputs.tests == 'true'  }}
+        if: ${{ steps.have-tests.outputs.tests == 'true' }}
         run: |
           set -Eeuxo pipefail
 
@@ -350,6 +388,18 @@ jobs:
           kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
           kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
 
+      # Runs ci/cached-builds/make_test.py for the current target. The step is skipped
+      # unless the have-tests step reported tests == 'true'. Both tags passed through
+      # the environment come from the calculated_vars step outputs.
+      - name: "Run image tests"
+        if: ${{ steps.have-tests.outputs.tests == 'true' }}
+        run: python3 ci/cached-builds/make_test.py --target ${{ inputs.target }}
+        env:
+          IMAGE_TAG: "${{ steps.calculated_vars.outputs.IMAGE_TAG }}"
+          # for make deploy, mandatory to specify for the more exotic cases
+          NOTEBOOK_TAG: "${{ inputs.target }}-${{ steps.calculated_vars.outputs.IMAGE_TAG }}"
+
+      # endregion
+
+      # region Trivy vulnerability scan
+
       - name: Run Trivy vulnerability scanner
         if: ${{ steps.resolve-target.outputs.target }}
         run: |
@@ -391,6 +441,10 @@ jobs:
 
           cat $REPORT_FOLDER/$REPORT_FILE >> $GITHUB_STEP_SUMMARY
 
+      # endregion
+
+      # region Typescript (browser) image tests
+
       # https://playwright.dev/docs/ci
       # https://playwright.dev/docs/docker
       # we leave little free disk space after we mount LVM for podman storage
@@ -436,5 +490,7 @@ jobs:
           path: tests/browser/playwright-report/
           retention-days: 30
 
+      # endregion
+
       - run: df -h
         if: "${{ !cancelled() }}"
diff --git a/Makefile b/Makefile
@@ -553,7 +553,7 @@ validate-runtime-image: bin/kubectl
 		fi; \
 		if [ $$cmd == "python3" ]; then \
 			echo "=> Checking notebook execution..." ; \
-			$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "curl https://raw.githubusercontent.com/opendatahub-io/elyra/refs/heads/main/etc/generic/requirements-elyra.txt --output req.txt && \ 
+			$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "curl https://raw.githubusercontent.com/opendatahub-io/elyra/refs/heads/main/etc/generic/requirements-elyra.txt --output req.txt && \
 				python3 -m pip install -r req.txt > /dev/null && \
 				curl https://raw.githubusercontent.com/nteract/papermill/main/papermill/tests/notebooks/simple_execute.ipynb --output simple_execute.ipynb && \
 				python3 -m papermill simple_execute.ipynb output.ipynb > /dev/null" ; \

diff --git a/ci/cached-builds/make_test.py b/ci/cached-builds/make_test.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python3
+import argparse
+import contextlib
+import functools
+import re
+import subprocess
+import sys
+import time
+import typing
+import unittest
+import unittest.mock
+
+"""Runs the make commands used to deploy, test, and undeploy image in Kubernetes
+
+The make commands this runs are intended to reproduce the commands we define in our OpenShift CI config at
+https://github.com/openshift/release/blob/master/ci-operator/config/opendatahub-io/notebooks/opendatahub-io-notebooks-main.yaml#L1485
+"""
+
+
+class Args(argparse.Namespace):
+    """Typed view of the parsed command line, so editors can autocomplete attributes on ``args``."""
+
+    # value given to the --target option
+    target: str
+
+
+def main() -> None:
+    """Command-line entry point: parses ``--target`` and runs the image tests for it.
+
+    Exits with the usual argparse usage error (status 2) when ``--target`` is missing.
+    """
+    parser = argparse.ArgumentParser("make_test.py")
+    # required: without it args.target would be None and run_tests would fail
+    # with an AttributeError instead of a readable usage message
+    parser.add_argument("--target", type=str, required=True,
+                        help="name of the image build target to deploy and test")
+    args = typing.cast(Args, parser.parse_args())
+
+    run_tests(args.target)
+
+
+def _deploy_command_parts(target: str) -> tuple[str, str]:
+    """Maps a build target to the ``(deploy, deploy_target)`` pair used as ``make {deploy}-{deploy_target}``.
+
+    :raises ValueError: for an rstudio target with no ``<os>-`` component after ``rstudio-``
+    """
+    if target.startswith("runtime-"):
+        return "deploy9", target.replace("runtime-", "runtimes-")
+    if target.startswith("intel-runtime-"):
+        return "deploy9", target.replace("intel-runtime-", "intel-runtimes-")
+    if target.startswith("rocm-runtime-"):
+        return "deploy9", target.replace("rocm-runtime-", "runtimes-rocm-")
+    if target.startswith("rocm-jupyter-"):
+        return "deploy9", target.replace("rocm-jupyter-", "jupyter-rocm-")
+    if target.startswith(("cuda-rstudio-", "rstudio-")):
+        # the cuda and non-cuda rstudio images share the same deploy target
+        rstudio_target = target.removeprefix("cuda-")
+        os_match = re.match(r"^rstudio-([^-]+-).*", rstudio_target)
+        if os_match is None:
+            raise ValueError(f"Cannot determine the OS component of rstudio target {target!r}")
+        # e.g. rstudio-c9s-python-3.11 -> c9s-rstudio-c9s-python-3.11
+        return "deploy", os_match.group(1) + rstudio_target
+    return "deploy9", target
+
+
+def _test_command(target: str) -> str:
+    """Returns the make command line that validates the deployed image for ``target``."""
+    if target.startswith(("runtime-", "intel-runtime-")):
+        return f"make validate-runtime-image image={target}"
+    if target.startswith("rocm-runtime-"):
+        # computed outside the f-string: reusing the same quote type inside a
+        # replacement field is only accepted from Python 3.12 on
+        image = target.replace("rocm-runtime-", "runtime-rocm-")
+        return f"make validate-runtime-image image={image}"
+    if target.startswith(("rstudio-", "cuda-rstudio-")):
+        return f"make validate-rstudio-image image={target}"
+    if target.startswith("codeserver-"):
+        return f"make validate-codeserver-image image={target}"
+    if target.startswith("rocm-jupyter-"):
+        make_target = target.replace("rocm-jupyter-", "jupyter-rocm-")
+        return f"make test-{make_target}"
+    return f"make test-{target}"
+
+
+def run_tests(target: str) -> None:
+    """Deploys the image for ``target`` into a fresh namespace, runs its make test, and undeploys it.
+
+    Diagnostic information (pods, statefulsets, events, logs) is always dumped after the
+    test command, whether it passed or failed.
+
+    :param target: image build target name, e.g. ``jupyter-minimal-ubi9-python-3.11``
+    :raises subprocess.CalledProcessError: when a namespace setup, deploy, test, or undeploy command fails
+    :raises ValueError: when the deploy target cannot be derived from an rstudio target name
+    """
+    prefix = target.translate(str.maketrans(".", "-"))
+    # this is a pod name in statefulset, some tests deploy individual unmanaged pods, though
+    pod = prefix + "-notebook-0"  # `$(kubectl get statefulset -o name | head -n 1)` would work too
+    namespace = "ns-" + prefix
+
+    deploy, deploy_target = _deploy_command_parts(target)
+
+    check_call(f"kubectl create namespace {namespace}", shell=True)
+    check_call(f"kubectl config set-context --current --namespace={namespace}", shell=True)
+    check_call(f"kubectl label namespace {namespace} fake-scc=fake-restricted-v2", shell=True)
+
+    # wait for service account to be created, otherwise pod is refused to be created
+    # $ bin/kubectl apply -k runtimes/minimal/ubi9-python-3.9/kustomize/base
+    # configmap/runtime-req-config-9hhb2bhhmd created
+    # Error from server (Forbidden): error when creating "runtimes/minimal/ubi9-python-3.9/kustomize/base": pods "runtime-pod" is forbidden: error looking up service account ns-runtime-minimal-ubi9-python-3-9/default: serviceaccount "default" not found
+    # See https://github.com/kubernetes/kubernetes/issues/66689
+    check_call("timeout 10s bash -c 'until kubectl get serviceaccount/default; do sleep 1; done'", shell=True)
+
+    check_call(f"make {deploy}-{deploy_target}", shell=True)
+    wait_for_stability(pod)
+
+    try:
+        check_call(_test_command(target), shell=True)
+    finally:
+        # dump a lot of info to the GHA logs
+        with gha_log_group("pod and statefulset info"):
+            call("kubectl get statefulsets", shell=True)
+            call("kubectl describe statefulsets", shell=True)
+            call("kubectl get pods", shell=True)
+            call("kubectl describe pods", shell=True)
+            # describe does not show everything about the pod
+            call("kubectl get pods -o yaml", shell=True)
+
+        with gha_log_group("kubernetes namespace events"):
+            # events aren't all that useful, but it can tell what was happening in the current namespace
+            call("kubectl get events", shell=True)
+
+        with gha_log_group("previous pod logs"):
+            # relevant if the pod is crashlooping, this shows the final lines
+            # use the negative label selector as a trick to match all pods (as we don't have any pods with nosuchlabel)
+            call("kubectl logs --selector=nosuchlabel!=nosuchvalue --all-pods --timestamps --previous", shell=True)
+        with gha_log_group("current pod logs"):
+            # regular logs from a running (or finished) pod
+            call("kubectl logs --selector=nosuchlabel!=nosuchvalue --all-pods --timestamps", shell=True)
+
+    # only reached when the test command succeeded; a failure propagates out of the try block above
+    check_call(f"make un{deploy}-{deploy_target}", shell=True)
+
+    print(f"[INFO] Finished testing {target}")
+
+
+@functools.wraps(subprocess.check_call)
+def check_call(*args, **kwargs) -> int:
+    # logging variant of subprocess.check_call: same arguments, routed through execute()
+    runner = subprocess.check_call
+    return execute(runner, args, kwargs)
+
+
+@functools.wraps(subprocess.call)
+def call(*args, **kwargs) -> int:
+    # logging variant of subprocess.call: same arguments, routed through execute()
+    runner = subprocess.call
+    return execute(runner, args, kwargs)
+
+
+def execute(executor: typing.Callable, args: tuple, kwargs: dict) -> int:
+    """Calls ``executor(*args, **kwargs)``, printing a log line before and after the call.
+
+    Standard output is flushed after each log line.
+
+    :param executor: callable to invoke
+    :param args: positional arguments for ``executor``
+    :param kwargs: keyword arguments for ``executor``
+    :return: whatever ``executor`` returned
+    """
+    print(f"[INFO] Running command {args, kwargs}", flush=True)
+    outcome = executor(*args, **kwargs)
+    print(f"\tDONE running command {args, kwargs}", flush=True)
+    return outcome
+
+
+# TODO(jdanek) this is a dumb impl, needs to be improved
+def wait_for_stability(pod: str) -> None:
+    """Polls three times until all pods in the current namespace report Ready.
+
+    Needed because the probes often fail initially, with errors such as
+    > error: Internal error occurred: error executing command in container: container is not created or running
+    > error: unable to upgrade connection: container not found ("notebook")
+
+    The first round is given 100 seconds, the two following rounds 50 seconds each, and
+    every round is followed by a 3 second pause. The result of each round is not checked.
+
+    :param pod: pod name; currently unused, the wait covers all pods
+    """
+    for timeout in (100, 50, 50):
+        wait_command = f"timeout {timeout}s bash -c 'until kubectl wait --for=condition=Ready pods --all --timeout 5s; do sleep 1; done'"
+        call(wait_command, shell=True)
+        time.sleep(3)
+
+
+# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#grouping-log-lines
+@contextlib.contextmanager
+def gha_log_group(title):
+    """Context manager wrapping its body's output in a GitHub Actions log group.
+
+    Prints ``::group::{title}`` on entry and ``::endgroup::`` on exit (also when the
+    body raises), flushing standard output after each marker.
+    """
+    print(f"::group::{title}", file=sys.stdout, flush=True)
+    try:
+        yield
+    finally:
+        print("::endgroup::", file=sys.stdout, flush=True)
+
+
+# https://docs.python.org/3/library/unittest.mock-examples.html#patch-decorators
+@unittest.mock.patch("time.sleep", unittest.mock.Mock())
+class TestMakeTest(unittest.TestCase):
+    """Checks the make commands issued by run_tests for each family of targets.
+
+    Nothing is executed for real: make_test.execute is replaced by a mock and the
+    command strings passed to it are inspected.
+    """
+
+    @staticmethod
+    def _recorded_commands(target: str) -> list[str]:
+        """Runs run_tests(target) with make_test.execute mocked and returns the commands it was given."""
+        with unittest.mock.patch("make_test.execute") as mock_execute:
+            run_tests(target)
+        # execute() is called positionally as (executor, args, kwargs); args[0] is the command line
+        return [c[0][1][0] for c in mock_execute.call_args_list]
+
+    def test_make_commands_jupyter(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("jupyter-minimal-ubi9-python-3.11")
+        assert "make deploy9-jupyter-minimal-ubi9-python-3.11" in issued
+        assert "make test-jupyter-minimal-ubi9-python-3.11" in issued
+        assert "make undeploy9-jupyter-minimal-ubi9-python-3.11" in issued
+
+    def test_make_commands_jupyter_rocm(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("rocm-jupyter-tensorflow-ubi9-python-3.11")
+        assert "make deploy9-jupyter-rocm-tensorflow-ubi9-python-3.11" in issued
+        assert "make test-jupyter-rocm-tensorflow-ubi9-python-3.11" in issued
+        assert "make undeploy9-jupyter-rocm-tensorflow-ubi9-python-3.11" in issued
+
+    def test_make_commands_codeserver(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("codeserver-ubi9-python-3.11")
+        assert "make deploy9-codeserver-ubi9-python-3.11" in issued
+        assert "make validate-codeserver-image image=codeserver-ubi9-python-3.11" in issued
+        assert "make undeploy9-codeserver-ubi9-python-3.11" in issued
+
+    def test_make_commands_rstudio(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("rstudio-c9s-python-3.11")
+        assert "make deploy-c9s-rstudio-c9s-python-3.11" in issued
+        assert "make validate-rstudio-image image=rstudio-c9s-python-3.11" in issued
+        assert "make undeploy-c9s-rstudio-c9s-python-3.11" in issued
+
+    def test_make_commands_cuda_rstudio(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("cuda-rstudio-c9s-python-3.11")
+        assert "make deploy-c9s-rstudio-c9s-python-3.11" in issued
+        assert "make validate-rstudio-image image=cuda-rstudio-c9s-python-3.11" in issued
+        assert "make undeploy-c9s-rstudio-c9s-python-3.11" in issued
+
+    def test_make_commands_runtime(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("runtime-datascience-ubi9-python-3.11")
+        assert "make deploy9-runtimes-datascience-ubi9-python-3.11" in issued
+        assert "make validate-runtime-image image=runtime-datascience-ubi9-python-3.11" in issued
+        assert "make undeploy9-runtimes-datascience-ubi9-python-3.11" in issued
+
+    def test_make_commands_intel_runtime(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("intel-runtime-ml-ubi9-python-3.11")
+        assert "make deploy9-intel-runtimes-ml-ubi9-python-3.11" in issued
+        assert "make validate-runtime-image image=intel-runtime-ml-ubi9-python-3.11" in issued
+        assert "make undeploy9-intel-runtimes-ml-ubi9-python-3.11" in issued
+
+    def test_make_commands_rocm_runtime(self) -> None:
+        """Compares the commands with what we had in the openshift/release yaml"""
+        issued = self._recorded_commands("rocm-runtime-pytorch-ubi9-python-3.11")
+        assert "make deploy9-runtimes-rocm-pytorch-ubi9-python-3.11" in issued
+        assert "make validate-runtime-image image=runtime-rocm-pytorch-ubi9-python-3.11" in issued
+        assert "make undeploy9-runtimes-rocm-pytorch-ubi9-python-3.11" in issued
+
+
+# run the command-line entry point only when executed as a script, not when imported
+if __name__ == "__main__":
+    main()
diff --git a/jupyter/rocm/pytorch/ubi9-python-3.11/kustomize/base/kustomization.yaml b/jupyter/rocm/pytorch/ubi9-python-3.11/kustomize/base/kustomization.yaml
@@ -1,9 +1,9 @@
 ---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
-namePrefix: rocm-jupyter-pytorch-ubi9-python-3-11-
+namePrefix: jupyter-rocm-pytorch-ubi9-python-3-11-
 commonLabels:
-  app: rocm-jupyter-pytorch-ubi9-python-3-11
+  app: jupyter-rocm-pytorch-ubi9-python-3-11
 resources:
   - service.yaml
   - statefulset.yaml