
Commit 117a479

jiridanek and jstourac authored
RHOAIENG-16076: tests(gha): install a cri-o backed kubernetes for running Makefile tests (#783)
* RHOAIENG-16076: tests(gha): install a cri-o backed kubernetes for running tests

* Update .github/workflows/build-notebooks-TEMPLATE.yaml

Co-authored-by: Jan Stourac <[email protected]>

---------

Co-authored-by: Jan Stourac <[email protected]>
1 parent eff0195 · commit 117a479

File tree

.github/workflows/build-notebooks-TEMPLATE.yaml
ci/cached-builds/11-crio-ipv4-bridge.conflist
ci/cached-builds/crio.conf
ci/cached-builds/has_tests.py
ci/cached-builds/kubeadm.yaml

5 files changed: +257 -0 lines changed


.github/workflows/build-notebooks-TEMPLATE.yaml

+115
@@ -235,6 +235,121 @@ jobs:
      - name: "Show podman images information"
        run: podman images --digests

      - name: "Check if we have tests or not"
        id: have-tests
        run: "ci/cached-builds/has_tests.py --target ${{ inputs.target }}"

      # https://cri-o.io/
      - name: Install cri-o
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: |
          set -Eeuxo pipefail

          sudo apt-get update
          sudo apt-get install -y software-properties-common curl

          curl -fsSL https://pkgs.k8s.io/core:/stable:/$KUBERNETES_VERSION/deb/Release.key | \
            sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg

          echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/$KUBERNETES_VERSION/deb/ /" | \
            sudo tee /etc/apt/sources.list.d/kubernetes.list

          curl -fsSL https://pkgs.k8s.io/addons:/cri-o:/stable:/$CRIO_VERSION/deb/Release.key | \
            sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/cri-o-apt-keyring.gpg

          echo "deb [signed-by=/etc/apt/keyrings/cri-o-apt-keyring.gpg] https://pkgs.k8s.io/addons:/cri-o:/stable:/$CRIO_VERSION/deb/ /" | \
            sudo tee /etc/apt/sources.list.d/cri-o.list

          sudo apt-get update
          sudo apt-get install -y cri-o kubelet kubeadm kubectl

          # make use of /etc/cni/net.d/11-crio-ipv4-bridge.conflist so we don't
          # need a pod network and just use the default bridge
          sudo rm -rf /etc/cni/net.d/*
          # cat /etc/cni/net.d/11-crio-ipv4-bridge.conflist
          # https://github.com/containerd/containerd/blob/main/script%2Fsetup%2Finstall-cni
          # https://www.cni.dev/plugins/current/main/bridge/
          sudo cp ci/cached-builds/11-crio-ipv4-bridge.conflist /etc/cni/net.d/11-crio-ipv4-bridge.conflist

          sudo cp ci/cached-builds/crio.conf /etc/crio/crio.conf.d/

          sudo systemctl start crio.service
        env:
          CRIO_VERSION: v1.30
          KUBERNETES_VERSION: v1.30

      - name: Show crio debug data (on failure)
        if: ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
        run: |
          set -Eeuxo pipefail

          sudo systemctl status crio.service || true
          sudo journalctl -xeu crio.service

      # do this early, it's a good check that cri-o is not completely broken
      - name: "Show crio images information"
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: sudo crictl images

      - name: Install Kubernetes cluster
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: |
          set -Eeuxo pipefail

          sudo swapoff -a
          sudo modprobe br_netfilter
          sudo sysctl -w net.ipv4.ip_forward=1

          # Was getting strange DNS resolution errors from pods that don't seem to want to go away sometimes:
          # Resolving raw.githubusercontent.com (raw.githubusercontent.com)... failed: Name or service not known.
          # wget: unable to resolve host address ‘raw.githubusercontent.com’
          # Here's what helped:
          # https://kubernetes.io/docs/tasks/administer-cluster/dns-debugging-resolution/#known-issues
          # https://github.com/kubernetes/kubernetes/blob/e4c1f980b76fecece30c2f77885a7117192170a6/CHANGELOG/CHANGELOG-1.30.md?plain=1#L1454
          # https://github.com/canonical/microk8s/issues/68#issuecomment-404923563
          sudo ufw allow in on cni0
          sudo ufw allow out on cni0
          sudo ufw default allow routed
          sudo iptables -P FORWARD ACCEPT
          sudo iptables -t nat -A POSTROUTING -s 10.85.0.0/16 -o eth0 -j MASQUERADE

          # https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm
          sudo kubeadm init --config=ci/cached-builds/kubeadm.yaml

          mkdir -p $HOME/.kube
          sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
          sudo chown $(id -u):$(id -g) $HOME/.kube/config

      - name: Show kubelet debug data (on failure)
        if: ${{ failure() && steps.have-tests.outputs.tests == 'true' }}
        run: |
          set -Eeuxo pipefail

          sudo systemctl status kubelet || true
          sudo journalctl -xeu kubelet

          # Here is one example how you may list all running Kubernetes containers by using crictl:
          sudo crictl --runtime-endpoint unix:///var/run/crio/crio.sock ps -a | grep kube | grep -v pause
          # Once you have found the failing container, you can inspect its logs with:
          # crictl --runtime-endpoint unix:///var/run/crio/crio.sock logs CONTAINERID

      - name: Untaint the master
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: kubectl taint nodes --all node-role.kubernetes.io/control-plane-

      - name: Show nodes status and wait for readiness
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: |
          kubectl describe nodes
          kubectl wait --for=condition=Ready nodes --all --timeout=100s || (kubectl describe nodes && false)

      - name: Wait for pods to be running
        if: ${{ steps.have-tests.outputs.tests == 'true' }}
        run: |
          set -Eeuxo pipefail
          kubectl wait deployments --all --all-namespaces --for=condition=Available --timeout=100s
          kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s

      - name: Run Trivy vulnerability scanner
        if: ${{ steps.resolve-target.outputs.target }}
        run: |
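
The steps added above gate everything on the have-tests output and, when tests exist, bring up a single-node cri-o backed cluster directly on the runner. Outside of CI, roughly the same sanity checks the workflow performs can be repeated by hand; a minimal sketch, assuming cri-o and kubeadm were installed and initialized as in the steps above:

    sudo crictl images                  # cri-o answers on its socket and sees the shared image store
    kubectl get nodes -o wide           # the single node should go Ready once the cni0 bridge is up
    kubectl get pods --all-namespaces   # coredns pods are usually the first to surface CNI/DNS problems
    kubectl wait pods --all --all-namespaces --for=condition=Ready --timeout=100s
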
ci/cached-builds/11-crio-ipv4-bridge.conflist

+22
@@ -0,0 +1,22 @@
{
  "cniVersion": "1.0.0",
  "name": "crio",
  "plugins": [
    {
      "type": "bridge",
      "bridge": "cni0",
      "isGateway": true,
      "ipMasq": true,
      "hairpinMode": true,
      "ipam": {
        "type": "host-local",
        "routes": [
          { "dst": "0.0.0.0/0" }
        ],
        "ranges": [
          [{ "subnet": "10.85.0.0/16" }]
        ]
      }
    }
  ]
}
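
With this conflist, cri-o's bridge plugin creates the cni0 bridge itself and the host-local IPAM hands out pod addresses from 10.85.0.0/16, which is why the workflow can skip installing a separate pod-network addon. A sketch of inspecting the result on the node (interface and sandbox names will vary):

    ip addr show cni0    # bridge created by the CNI bridge plugin; isGateway makes it the pods' default gateway
    sudo crictl pods     # each pod sandbox gets an address from the 10.85.0.0/16 range
    # ipMasq=true installs a masquerade rule so pod traffic leaving the node is NATed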

ci/cached-builds/crio.conf

+17
@@ -0,0 +1,17 @@
# https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md

[crio]
storage_driver = "overlay"
# storage_option = [ "overlay.mountopt=nodev,metacopy=on" ]

# reuse podman's container storage because we have huge images that don't fit on disk twice
root = "/home/runner/.local/share/containers/storage"
# has to be the same as root!
runroot = "/home/runner/.local/share/containers/storage"

# https://stackoverflow.com/questions/62408028/kubelet-failed-to-createpodsandbox-for-coredns-failed-to-set-bridge-addr-c
[crio.network]
# the /etc/cni/net.d/11-crio-ipv4-bridge.conflist default IPs conflict with flannel;
# in older versions of kubernetes the kubelet was touching the cni, now only the container runtime touches it
# c.f. https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/network-plugins/#installation
#network_dir = "/etc/cni/net.d-kube/"
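
cri-o reads every file under /etc/crio/crio.conf.d/ on top of its main configuration, so the workflow only has to copy this drop-in before starting the service. The point of pointing root/runroot at podman's storage is that images built by podman earlier in the job are visible to the CRI without being pulled or duplicated; a sketch of verifying that, assuming both tools are present on the runner:

    podman images --digests   # images built by podman earlier in the job
    sudo crictl images        # the same images as seen through cri-o, because root/runroot reuse podman's storage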

ci/cached-builds/has_tests.py

+58
@@ -0,0 +1,58 @@
#!/usr/bin/env python3
import argparse
import json
import os
import pathlib
import typing
import unittest

import gha_pr_changed_files

"""Determines whether we have deploy Makefile tests for this target or not

https://github.com/openshift/release/blob/master/ci-operator/config/opendatahub-io/notebooks/opendatahub-io-notebooks-main.yaml#L1485
"""


class Args(argparse.Namespace):
    """Type annotation to have autocompletion for args"""
    target: str


def main() -> None:
    parser = argparse.ArgumentParser("make_test.py")
    parser.add_argument("--target", type=str)
    args = typing.cast(Args, parser.parse_args())

    has_tests = check_tests(args.target)

    if "GITHUB_ACTIONS" in os.environ:
        with open(os.environ["GITHUB_OUTPUT"], "at") as f:
            print(f"tests={json.dumps(has_tests)}", file=f)

    print(f"{has_tests=}")


def check_tests(target: str) -> bool:
    if target.startswith("rocm-jupyter-minimal-") or target.startswith("rocm-jupyter-datascience-"):
        return False  # we don't have specific tests for -minimal-, ... in ci-operator/config
    if '-intel-' in target:
        return False  # RHOAIENG-8388: Intel tensorflow notebook failed to get tested on OCP-CI

    has_tests = False
    dirs = gha_pr_changed_files.analyze_build_directories(target)
    for d in reversed(dirs):  # (!)
        kustomization = pathlib.Path(gha_pr_changed_files.PROJECT_ROOT) / d / "kustomize/base/kustomization.yaml"
        has_tests = has_tests or kustomization.is_file()
        break  # TODO: check only the last directory (the top level layer) for now
    return has_tests


class TestCheckTests(unittest.TestCase):
    def test_has_tests(self):
        assert check_tests("base-c9s-python-3.11") is False
        assert check_tests("jupyter-minimal-ubi9-python-3.9") is True


if __name__ == "__main__":
    main()
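
The script doubles as its own unittest module, and in CI it appends a tests=true/false line to the file that $GITHUB_OUTPUT points at, which the workflow then compares against the string 'true'. A sketch of exercising both paths locally, assuming a repository checkout and running from ci/cached-builds/ so that gha_pr_changed_files is importable (the mktemp file merely stands in for the output file GitHub Actions provides):

    cd ci/cached-builds
    python3 -m unittest has_tests                        # runs the embedded TestCheckTests
    export GITHUB_ACTIONS=true GITHUB_OUTPUT=$(mktemp)   # both are normally set by GitHub Actions
    ./has_tests.py --target jupyter-minimal-ubi9-python-3.9
    cat "$GITHUB_OUTPUT"   # -> tests=true; json.dumps writes lowercase true/false, matching the workflow's == 'true'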

ci/cached-builds/kubeadm.yaml

+45
@@ -0,0 +1,45 @@
---
# kubeadm config print init-defaults > kubeadm.yaml
# kubeadm init --cri-socket=/var/run/crio/crio.sock

# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta3/
# https://kubernetes.io/docs/reference/config-api/kubeadm-config.v1beta4/
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
  - groups:
      - system:bootstrappers:kubeadm:default-node-token
    token: abcdef.0123456789abcdef
    ttl: 24h0m0s
    usages:
      - signing
      - authentication
kind: InitConfiguration
localAPIEndpoint:
  bindPort: 6443
nodeRegistration:
  kubeletExtraArgs:
    # Need to have enough disk space for Kubelet, so move root-dir onto the LVM volume
    # Note: the internets discourage changing the default because storage plugins may then struggle
    # https://cep.dev/posts/adventure-trying-change-kubelet-rootdir/
    root-dir: "/home/runner/.local/share/containers/kubelet-root-dir"
  criSocket: unix:///var/run/crio/crio.sock
  imagePullPolicy: IfNotPresent
  taints: null
---
apiServer:
  timeoutForControlPlane: 4m0s
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controllerManager: {}
dns: {}
etcd:
  local:
    dataDir: /var/lib/etcd
imageRepository: registry.k8s.io
kind: ClusterConfiguration
networking:
  dnsDomain: cluster.local
  # this matches the default in /etc/cni/net.d/11-crio-ipv4-bridge.conflist
  podSubnet: 10.85.0.0/16
scheduler: {}
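
After the workflow runs sudo kubeadm init --config=ci/cached-builds/kubeadm.yaml, two quick checks confirm the non-default settings took effect; a sketch, with paths taken from the files above:

    # kubeadm stores the ClusterConfiguration in the kubeadm-config ConfigMap
    kubectl -n kube-system get configmap kubeadm-config -o yaml | grep podSubnet
    # -> podSubnet: 10.85.0.0/16, the same range as in 11-crio-ipv4-bridge.conflist

    # kubeadm passes kubeletExtraArgs (here --root-dir) to the kubelet through this env file
    cat /var/lib/kubelet/kubeadm-flags.env
    sudo ls /home/runner/.local/share/containers/kubelet-root-dir   # kubelet state now lives on the larger volume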
