Skip to content

Commit

Permalink
RHOAIENG-16076: tests(gha): run Makefile tests on opendatahub-io/not…
Browse files Browse the repository at this point in the history
…ebooks Github Actions (#775)

* RHOAIENG-16076: tests(gha): run Makefile tests in GitHub Actions

* fixup, looks like I lost the second changed line from #761 (comment) when merging the work

* fixup, linter wants space in the comments; IntelliJ is ok with it, so let's do that

* fixup, add reference to OpenShift CI for the source of the make invocations

* fixup, add the IfNotPresent pull policy (for PR checks without an image registry) and the symbolic links apparently needed to deploy rocm stuff
  • Loading branch information
jiridanek authored Nov 28, 2024
1 parent fb6e1b9 commit b3d8af0
Show file tree
Hide file tree
Showing 7 changed files with 309 additions and 11 deletions.
60 changes: 58 additions & 2 deletions .github/workflows/build-notebooks-TEMPLATE.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

# region Free up disk space

- name: Free up additional disk space
# https://docs.github.com/en/actions/learn-github-actions/expressions
if: "${{ contains(inputs.target, 'rocm') || contains(inputs.target, 'cuda') || contains(inputs.target, 'intel') ||
Expand Down Expand Up @@ -86,6 +88,10 @@ jobs:
df -h
free -h
# endregion

# region Podman setup

# https://github.com/containers/buildah/issues/2521#issuecomment-884779112
- name: Workaround https://github.com/containers/podman/issues/22152#issuecomment-2027705598
run: sudo apt-get -qq remove podman crun
Expand Down Expand Up @@ -156,6 +162,10 @@ jobs:
echo "IMAGE_TAG=${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
echo "OUTPUT_IMAGE=${{ env.IMAGE_REGISTRY}}:${{ inputs.target }}-${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
# endregion

# region Trivy init & DB pre-pull

- name: "pull_request|schedule: resolve target if Trivy scan should run"
id: resolve-target
if: ${{ fromJson(inputs.github).event_name == 'pull_request' || fromJson(inputs.github).event_name == 'schedule' }}
Expand Down Expand Up @@ -210,6 +220,10 @@ jobs:
image \
--download-java-db-only
# endregion

# region Image build

# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#push
- name: "push|schedule: make ${{ inputs.target }}"
run: |
Expand All @@ -235,10 +249,34 @@ jobs:
- name: "Show podman images information"
run: podman images --digests

# endregion

# region Makefile image tests

# Runs has_tests.py for the target being built. Later steps in this region are gated on
# `steps.have-tests.outputs.tests == 'true'`, so the script is presumably what writes the
# `tests` step output - confirm in ci/cached-builds/has_tests.py.
- name: "Check if we have tests or not"
  id: have-tests
  run: "ci/cached-builds/has_tests.py --target ${{ inputs.target }}"

# Rewrites `imagePullPolicy: Always` to `imagePullPolicy: IfNotPresent` in every
# statefulset.yaml and pod.yaml found under the checkout, then prints the resulting
# `git diff` to the job log.
# NOTE(review): this step has no `if:` on the have-tests output, so it runs for every
# target, unlike the neighbouring test steps - confirm that is intended.
- name: "Change pull policy to IfNotPresent"
  run: |
    set -Eeuxo pipefail
    find . \( -name "statefulset.yaml" -o -name "pod.yaml" \) -type f -exec \
    sed -i'' 's/imagePullPolicy: Always/imagePullPolicy: IfNotPresent/g' {} \;
    git diff
# Creates runtimes/rocm/ and symlinks runtimes/rocm/tensorflow and runtimes/rocm/pytorch to
# the sibling runtimes/rocm-tensorflow and runtimes/rocm-pytorch directories (presumably where
# the kustomize files actually live). Without the links the deploy fails as in this log excerpt:
# [INFO] Running command (('make deploy9-runtimes-rocm-tensorflow-ubi9-python-3.11',), {'shell': True})
# Deploying notebook from runtimes/rocm/tensorflow/ubi9-python-3.11/kustomize/base directory...
# sed: can't read runtimes/rocm/tensorflow/ubi9-python-3.11/kustomize/base/kustomization.yaml: No such file or directory
- name: "Fixup paths that prevent us from running rocm tests"
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: |
    set -Eeuxo pipefail
    mkdir -p runtimes/rocm
    ln -s ../rocm-tensorflow runtimes/rocm/tensorflow
    ln -s ../rocm-pytorch runtimes/rocm/pytorch
# https://cri-o.io/
- name: Install cri-o
if: ${{ steps.have-tests.outputs.tests == 'true' }}
Expand Down Expand Up @@ -288,11 +326,11 @@ jobs:
# do this early, it's a good check that cri-o is not completely broken
- name: "Show crio images information"
if: ${{ steps.have-tests.outputs.tests == 'true' }}
if: ${{ steps.have-tests.outputs.tests == 'true' }}
run: sudo crictl images

- name: Install Kubernetes cluster
if: ${{ steps.have-tests.outputs.tests == 'true' }}
if: ${{ steps.have-tests.outputs.tests == 'true' }}
run: |
set -Eeuxo pipefail
Expand Down Expand Up @@ -350,6 +388,18 @@ jobs:
kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
# Runs make_test.py for the target being built, but only when the have-tests step
# reported `tests == 'true'`. The image tag computed by the calculated_vars step is handed
# to the script's environment as IMAGE_TAG and, prefixed with the target name, as NOTEBOOK_TAG.
- name: "Run image tests"
  if: ${{ steps.have-tests.outputs.tests == 'true' }}
  run: python3 ci/cached-builds/make_test.py --target ${{ inputs.target }}
  env:
    IMAGE_TAG: "${{ steps.calculated_vars.outputs.IMAGE_TAG }}"
    # for make deploy, mandatory to specify for the more exotic cases
    NOTEBOOK_TAG: "${{ inputs.target }}-${{ steps.calculated_vars.outputs.IMAGE_TAG }}"

# endregion

# region Trivy vulnerability scan

- name: Run Trivy vulnerability scanner
if: ${{ steps.resolve-target.outputs.target }}
run: |
Expand Down Expand Up @@ -391,6 +441,10 @@ jobs:
cat $REPORT_FOLDER/$REPORT_FILE >> $GITHUB_STEP_SUMMARY
# endregion

# region Typescript (browser) image tests

# https://playwright.dev/docs/ci
# https://playwright.dev/docs/docker
# we leave little free disk space after we mount LVM for podman storage
Expand Down Expand Up @@ -436,5 +490,7 @@ jobs:
path: tests/browser/playwright-report/
retention-days: 30

# endregion

- run: df -h
if: "${{ !cancelled() }}"
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -553,7 +553,7 @@ validate-runtime-image: bin/kubectl
fi; \
if [ $$cmd == "python3" ]; then \
echo "=> Checking notebook execution..." ; \
$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "curl https://raw.githubusercontent.com/opendatahub-io/elyra/refs/heads/main/etc/generic/requirements-elyra.txt --output req.txt && \
$(KUBECTL_BIN) exec runtime-pod -- /bin/sh -c "curl https://raw.githubusercontent.com/opendatahub-io/elyra/refs/heads/main/etc/generic/requirements-elyra.txt --output req.txt && \
python3 -m pip install -r req.txt > /dev/null && \
curl https://raw.githubusercontent.com/nteract/papermill/main/papermill/tests/notebooks/simple_execute.ipynb --output simple_execute.ipynb && \
python3 -m papermill simple_execute.ipynb output.ipynb > /dev/null" ; \
Expand Down
242 changes: 242 additions & 0 deletions ci/cached-builds/make_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
#!/usr/bin/env python3
"""Runs the make commands used to deploy, test, and undeploy an image in Kubernetes.

The make commands this runs are intended to reproduce the commands we define in our OpenShift CI config at
https://github.com/openshift/release/blob/master/ci-operator/config/opendatahub-io/notebooks/opendatahub-io-notebooks-main.yaml#L1485
"""
# The docstring above has to be the first statement of the module (before the imports),
# otherwise Python treats it as a throwaway string expression and `__doc__` stays None.
import argparse
import contextlib
import functools
import re
import subprocess
import sys
import time
import typing
import unittest
import unittest.mock


class Args(argparse.Namespace):
    """Type annotation to have autocompletion for args.

    ``main()`` casts the namespace returned by ``parse_args()`` to this class; the class adds
    no behaviour of its own, it only declares which attributes the parser defines.
    """
    # value of the --target command-line option
    target: str


def main() -> None:
parser = argparse.ArgumentParser("make_test.py")
parser.add_argument("--target", type=str)
args = typing.cast(Args, parser.parse_args())

run_tests(args.target)


def _rstudio_os_prefix(target: str, pattern: str) -> str:
    """Returns the dash-terminated component that ``pattern`` captures from an rstudio target name.

    :raises ValueError: if ``pattern`` does not match ``target``
    """
    found = re.match(pattern, target)
    if found is None:
        raise ValueError(f"cannot determine the OS part of rstudio target {target!r}")
    return found.group(1)


def _deploy_command_parts(target: str) -> tuple[str, str]:
    """Maps a build target onto the ``(deploy, deploy_target)`` pair used as ``make {deploy}-{deploy_target}``.

    The same pair is used for the undeploy command (``make un{deploy}-{deploy_target}``).

    :raises ValueError: for an rstudio target whose name has no further dash-terminated component
    """
    # (prefix of the build target, prefix of the corresponding Makefile target)
    renamed_prefixes = (
        ("runtime-", "runtimes-"),
        ("intel-runtime-", "intel-runtimes-"),
        ("rocm-runtime-", "runtimes-rocm-"),
        ("rocm-jupyter-", "jupyter-rocm-"),
    )
    for old, new in renamed_prefixes:
        if target.startswith(old):
            # count=1: rewrite the prefix only, never a later repetition of the same text
            return "deploy9", target.replace(old, new, 1)

    # rstudio Makefile targets carry the captured component up front, and have no cuda- prefix
    if target.startswith("cuda-rstudio-"):
        os_prefix = _rstudio_os_prefix(target, r"^cuda-rstudio-([^-]+-).*")
        return "deploy", os_prefix + target.removeprefix("cuda-")
    if target.startswith("rstudio-"):
        os_prefix = _rstudio_os_prefix(target, r"^rstudio-([^-]+-).*")
        return "deploy", os_prefix + target

    return "deploy9", target


def _test_command(target: str) -> str:
    """Returns the make command line that tests the deployed image for ``target``."""
    if target.startswith(("runtime-", "intel-runtime-")):
        return f"make validate-runtime-image image={target}"
    if target.startswith("rocm-runtime-"):
        # computed outside the f-string: reusing the enclosing quote character inside a
        # replacement field is only accepted by Python 3.12 and newer
        image = target.replace("rocm-runtime-", "runtime-rocm-", 1)
        return f"make validate-runtime-image image={image}"
    if target.startswith(("rstudio-", "cuda-rstudio-")):
        return f"make validate-rstudio-image image={target}"
    if target.startswith("codeserver-"):
        return f"make validate-codeserver-image image={target}"
    if target.startswith("rocm-jupyter"):
        test_target = target.replace("rocm-jupyter-", "jupyter-rocm-")
        return f"make test-{test_target}"
    return f"make test-{target}"


def _dump_diagnostics() -> None:
    """Prints the state of the current namespace into collapsible GitHub Actions log groups.

    Uses ``call`` rather than ``check_call`` so that a failing diagnostic command does not raise.
    """
    # dump a lot of info to the GHA logs
    with gha_log_group("pod and statefulset info"):
        call("kubectl get statefulsets", shell=True)
        call("kubectl describe statefulsets", shell=True)
        call("kubectl get pods", shell=True)
        call("kubectl describe pods", shell=True)
        # describe does not show everything about the pod
        call("kubectl get pods -o yaml", shell=True)

    with gha_log_group("kubernetes namespace events"):
        # events aren't all that useful, but it can tell what was happening in the current namespace
        call("kubectl get events", shell=True)

    with gha_log_group("previous pod logs"):
        # relevant if the pod is crashlooping, this shows the final lines
        # use the negative label selector as a trick to match all pods (as we don't have any pods with nosuchlabel)
        call("kubectl logs --selector=nosuchlabel!=nosuchvalue --all-pods --timestamps --previous", shell=True)
    with gha_log_group("current pod logs"):
        # regular logs from a running (or finished) pod
        call("kubectl logs --selector=nosuchlabel!=nosuchvalue --all-pods --timestamps", shell=True)


def run_tests(target: str) -> None:
    """Deploys the image for ``target`` into a fresh namespace, runs its make test target, and undeploys it.

    Steps, in order: create and select the namespace ``ns-<target with dots replaced by dashes>``,
    wait for its default service account, ``make`` the deploy target, wait for the pods, run the
    test/validate make target, dump diagnostics (whether or not the test passed), then ``make``
    the undeploy target. The undeploy only happens when the test command succeeded; a failing
    test propagates its exception right after the diagnostics.

    :param target: name of the image target, e.g. ``jupyter-minimal-ubi9-python-3.11``
    :raises ValueError: if the deploy target cannot be derived from an rstudio target name
    :raises subprocess.CalledProcessError: if any of the mandatory commands exits with non-zero status
    """
    prefix = target.translate(str.maketrans(".", "-"))
    # this is a pod name in statefulset, some tests deploy individual unmanaged pods, though
    pod = prefix + "-notebook-0"  # `$(kubectl get statefulset -o name | head -n 1)` would work too
    namespace = "ns-" + prefix

    # resolved before touching the cluster, so that a malformed target name fails early
    deploy, deploy_target = _deploy_command_parts(target)

    check_call(f"kubectl create namespace {namespace}", shell=True)
    check_call(f"kubectl config set-context --current --namespace={namespace}", shell=True)
    check_call(f"kubectl label namespace {namespace} fake-scc=fake-restricted-v2", shell=True)

    # wait for service account to be created, otherwise pod is refused to be created
    # $ bin/kubectl apply -k runtimes/minimal/ubi9-python-3.9/kustomize/base
    # configmap/runtime-req-config-9hhb2bhhmd created
    # Error from server (Forbidden): error when creating "runtimes/minimal/ubi9-python-3.9/kustomize/base": pods "runtime-pod" is forbidden: error looking up service account ns-runtime-minimal-ubi9-python-3-9/default: serviceaccount "default" not found
    # See https://github.com/kubernetes/kubernetes/issues/66689
    check_call("timeout 10s bash -c 'until kubectl get serviceaccount/default; do sleep 1; done'", shell=True)

    check_call(f"make {deploy}-{deploy_target}", shell=True)
    wait_for_stability(pod)

    try:
        check_call(_test_command(target), shell=True)
    finally:
        _dump_diagnostics()

    check_call(f"make un{deploy}-{deploy_target}", shell=True)

    print(f"[INFO] Finished testing {target}")


# Logging variant of subprocess.check_call: takes the same arguments, but runs the command
# through execute(), which prints it before and after it runs.
# functools.wraps copies subprocess.check_call's __doc__ onto this function, so a docstring
# written here would be replaced; that is why this is a comment.
# The executor, args and kwargs are handed to execute() positionally, and the unit tests in
# this file index into those recorded positional arguments, so keep this call shape.
@functools.wraps(subprocess.check_call)
def check_call(*args, **kwargs) -> int:
    return execute(subprocess.check_call, args, kwargs)


# Logging variant of subprocess.call: takes the same arguments, but runs the command
# through execute(), which prints it before and after it runs.
# functools.wraps copies subprocess.call's __doc__ onto this function, so a docstring
# written here would be replaced; that is why this is a comment.
# The executor, args and kwargs are handed to execute() positionally, and the unit tests in
# this file index into those recorded positional arguments, so keep this call shape.
@functools.wraps(subprocess.call)
def call(*args, **kwargs) -> int:
    return execute(subprocess.call, args, kwargs)


def execute(executor: typing.Callable, args: tuple, kwargs: dict) -> int:
    """Calls ``executor(*args, **kwargs)`` and announces the invocation on stdout before and after.

    Both messages are flushed immediately. If ``executor`` raises, the exception propagates
    and the "DONE" line is not printed.

    :param executor: the callable to invoke
    :param args: positional arguments for ``executor``
    :param kwargs: keyword arguments for ``executor``
    :return: whatever ``executor`` returned
    """
    invocation = (args, kwargs)
    print(f"[INFO] Running command {invocation}", flush=True)
    outcome = executor(*args, **kwargs)
    print(f"\tDONE running command {invocation}", flush=True)
    return outcome


# TODO(jdanek) this is a dumb impl, needs to be improved
def wait_for_stability(pod: str) -> None:
    """Gives the deployed pods time to settle, because the probes often fail at first, e.g. with

    > error: Internal error occurred: error executing command in container: container is not created or running
    > error: unable to upgrade connection: container not found ("notebook")

    Runs three rounds; each one retries ``kubectl wait --for=condition=Ready pods --all`` under a
    ``timeout`` (100s for the first round, 50s for the following two) and then sleeps 3 seconds.
    The exit status of the wait is ignored, so this function does not fail when pods never get ready.

    :param pod: currently unused; the wait is for all pods, not for this one
    """
    for budget in (100, 50, 50):
        call(
            f"timeout {budget}s bash -c 'until kubectl wait --for=condition=Ready pods --all --timeout 5s; do sleep 1; done'",
            shell=True)
        time.sleep(3)


# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#grouping-log-lines
@contextlib.contextmanager
def gha_log_group(title):
    """Context manager wrapping its body's output in a GitHub Actions log group.

    Writes ``::group::<title>`` to stdout on entry and ``::endgroup::`` on exit, flushing after
    each; the closing marker is written even when the body raises.
    """
    print(f"::group::{title}", flush=True)
    try:
        yield
    finally:
        print("::endgroup::", flush=True)


# https://docs.python.org/3/library/unittest.mock-examples.html#patch-decorators
@unittest.mock.patch("time.sleep", unittest.mock.Mock())
class TestMakeTest(unittest.TestCase):
    """Compares the make commands issued by ``run_tests`` with what we had in the openshift/release yaml.

    Nothing is executed for real: ``execute`` is replaced by a mock that only records its
    arguments, and ``time.sleep`` is patched out for every test method by the class decorator.
    """

    def _assert_make_commands(self, target: str, *expected: str) -> None:
        """Runs ``run_tests(target)`` against a mocked ``execute`` and checks each expected command was issued."""
        mock_execute = unittest.mock.MagicMock()
        # Patch the name in this module's own globals rather than by dotted path, so the test
        # does not depend on the module being importable under the name "make_test".
        with unittest.mock.patch.dict(globals(), {"execute": mock_execute}):
            run_tests(target)
        # execute(executor, args, kwargs): positional argument 1 is the args tuple of the
        # wrapped subprocess call, and its first item is the command line
        commands: list[str] = [c.args[1][0] for c in mock_execute.call_args_list]
        for command in expected:
            # assertIn instead of a bare assert: not stripped by `python -O`, and it reports
            # the recorded commands on failure
            self.assertIn(command, commands)

    def test_make_commands_jupyter(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "jupyter-minimal-ubi9-python-3.11",
            "make deploy9-jupyter-minimal-ubi9-python-3.11",
            "make test-jupyter-minimal-ubi9-python-3.11",
            "make undeploy9-jupyter-minimal-ubi9-python-3.11",
        )

    def test_make_commands_jupyter_rocm(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "rocm-jupyter-tensorflow-ubi9-python-3.11",
            "make deploy9-jupyter-rocm-tensorflow-ubi9-python-3.11",
            "make test-jupyter-rocm-tensorflow-ubi9-python-3.11",
            "make undeploy9-jupyter-rocm-tensorflow-ubi9-python-3.11",
        )

    def test_make_commands_codeserver(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "codeserver-ubi9-python-3.11",
            "make deploy9-codeserver-ubi9-python-3.11",
            "make validate-codeserver-image image=codeserver-ubi9-python-3.11",
            "make undeploy9-codeserver-ubi9-python-3.11",
        )

    def test_make_commands_rstudio(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "rstudio-c9s-python-3.11",
            "make deploy-c9s-rstudio-c9s-python-3.11",
            "make validate-rstudio-image image=rstudio-c9s-python-3.11",
            "make undeploy-c9s-rstudio-c9s-python-3.11",
        )

    def test_make_commands_cuda_rstudio(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "cuda-rstudio-c9s-python-3.11",
            "make deploy-c9s-rstudio-c9s-python-3.11",
            "make validate-rstudio-image image=cuda-rstudio-c9s-python-3.11",
            "make undeploy-c9s-rstudio-c9s-python-3.11",
        )

    def test_make_commands_runtime(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "runtime-datascience-ubi9-python-3.11",
            "make deploy9-runtimes-datascience-ubi9-python-3.11",
            "make validate-runtime-image image=runtime-datascience-ubi9-python-3.11",
            "make undeploy9-runtimes-datascience-ubi9-python-3.11",
        )

    def test_make_commands_intel_runtime(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "intel-runtime-ml-ubi9-python-3.11",
            "make deploy9-intel-runtimes-ml-ubi9-python-3.11",
            "make validate-runtime-image image=intel-runtime-ml-ubi9-python-3.11",
            "make undeploy9-intel-runtimes-ml-ubi9-python-3.11",
        )

    def test_make_commands_rocm_runtime(self) -> None:
        """Compares the commands with what we had in the openshift/release yaml"""
        self._assert_make_commands(
            "rocm-runtime-pytorch-ubi9-python-3.11",
            "make deploy9-runtimes-rocm-pytorch-ubi9-python-3.11",
            "make validate-runtime-image image=runtime-rocm-pytorch-ubi9-python-3.11",
            "make undeploy9-runtimes-rocm-pytorch-ubi9-python-3.11",
        )


# Run the command-line entry point only when this file is executed as a script;
# importing the module does not start a test run against the cluster.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namePrefix: rocm-jupyter-pytorch-ubi9-python-3-11-
namePrefix: jupyter-rocm-pytorch-ubi9-python-3-11-
commonLabels:
app: rocm-jupyter-pytorch-ubi9-python-3-11
app: jupyter-rocm-pytorch-ubi9-python-3-11
resources:
- service.yaml
- statefulset.yaml
Expand Down
Loading

0 comments on commit b3d8af0

Please sign in to comment.