From f0174bc152e4633025bb28516cbca409daa4015a Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Thu, 28 Oct 2021 18:58:08 +0700 Subject: [PATCH 1/6] Add scripts for Calico CNI setup Signed-off-by: Vladimir Popov --- .github/workflows/ci.yaml | 1 + cloudtest/packet.yaml | 8 +- main_test.go | 14 ++ scripts/calico/deploy-calico.sh | 12 ++ scripts/calico/kustomization.yaml | 9 ++ scripts/calico/patch.yaml | 8 + scripts/calico/setup-interfaces.sh | 8 + scripts/calico/setup-node-ip.sh | 8 + scripts/create-kubernetes-cluster.sh | 152 ++++++++++++------ scripts/destroy-old-clusters.sh | 4 +- scripts/download-postmortem-data.sh | 2 + scripts/include/wait-pids.sh | 15 ++ scripts/include/wait-start.sh | 26 +++ .../{docker-ulimit.sh => config-docker.sh} | 7 +- scripts/k8s/download-worker-images.sh | 5 +- scripts/k8s/install-kubernetes.sh | 8 +- scripts/k8s/start-master.sh | 40 ++++- scripts/setup-sshd.sh | 9 ++ scripts/sriov/config-SRIOV.sh | 4 +- scripts/sriov/enable-SRIOV.sh | 4 +- scripts/sriov/enable-VFIO.sh | 2 +- scripts/sriov/setup-SRIOV.sh | 45 +----- 22 files changed, 283 insertions(+), 108 deletions(-) create mode 100755 scripts/calico/deploy-calico.sh create mode 100644 scripts/calico/kustomization.yaml create mode 100644 scripts/calico/patch.yaml create mode 100755 scripts/calico/setup-interfaces.sh create mode 100755 scripts/calico/setup-node-ip.sh create mode 100755 scripts/include/wait-pids.sh create mode 100755 scripts/include/wait-start.sh rename scripts/k8s/{docker-ulimit.sh => config-docker.sh} (73%) create mode 100755 scripts/setup-sshd.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f3005e01..534206bf 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -99,6 +99,7 @@ jobs: PACKET_AUTH_TOKEN: ${{ secrets.PACKET_AUTH_TOKEN }} PACKET_PROJECT_ID: 383890d0-f5d1-4de1-881a-4d1ede549d18 KUBERNETES_VERSION: ${{ secrets.NSM_KUBERNETES_VERSION }} + CALICO: on - name: Publish test report # 7. 
Publish test report uses: mikepenz/action-junit-report@v2.1.0 if: ${{ always() }} diff --git a/cloudtest/packet.yaml b/cloudtest/packet.yaml index 7a6acac4..348cae98 100644 --- a/cloudtest/packet.yaml +++ b/cloudtest/packet.yaml @@ -4,7 +4,7 @@ providers: - name: "packet" kind: "packet" instances: 1 - retry: 5 + retry: 0 node-count: 2 enabled: true timeout: 2400 # 40 minutes to start cluster @@ -25,13 +25,15 @@ providers: os: "ubuntu_20_04" billing-cycle: "hourly" port-vlans: - eth3: 1044 + eth1: 3000 # calico VLAN + eth3: 1044 # SR-IOV VLAN - name: "Worker" host-name: "SR-IOV-Worker-${CLUSTER_NAME}" os: "ubuntu_20_04" billing-cycle: "hourly" port-vlans: - eth3: 1044 + eth1: 3000 # calico VLAN + eth3: 1044 # SR-IOV VLAN hardware-reservations: - 2cf78481-53b0-46c8-a084-6e9815acdb0b - 2361d3c2-f694-4fa7-a683-a9f69e2abe7c diff --git a/main_test.go b/main_test.go index 790e2266..1b6c3fdf 100644 --- a/main_test.go +++ b/main_test.go @@ -17,6 +17,7 @@ package main_test import ( + "os" "testing" "github.com/stretchr/testify/suite" @@ -27,7 +28,14 @@ import ( "github.com/networkservicemesh/integration-tests/suites/sriov" ) +func isCalico() bool { + return os.Getenv("CALICO") != "" +} + func TestMemory(t *testing.T) { + if isCalico() { + t.Skip("not available with Calico") + } suite.Run(t, new(memory.Suite)) } @@ -36,9 +44,15 @@ func TestSRIOV(t *testing.T) { } func TestMultiForwarder(t *testing.T) { + if isCalico() { + t.Skip("not available with Calico") + } suite.Run(t, new(multiforwarder.Suite)) } func TestHeal(t *testing.T) { + if isCalico() { + t.Skip("not available with Calico") + } suite.Run(t, new(heal.Suite)) } diff --git a/scripts/calico/deploy-calico.sh b/scripts/calico/deploy-calico.sh new file mode 100755 index 00000000..b1753164 --- /dev/null +++ b/scripts/calico/deploy-calico.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +function on_error() { + kubectl describe pods --all-namespaces + exit 1 +} +trap 'on_error' ERR + +kubectl apply -k scripts/calico + +kubectl -n 
calico-vpp-dataplane rollout status daemonset/calico-vpp-node --timeout=5m +kubectl -n kube-system rollout status deployment/calico-kube-controllers --timeout=5m diff --git a/scripts/calico/kustomization.yaml b/scripts/calico/kustomization.yaml new file mode 100644 index 00000000..446d8683 --- /dev/null +++ b/scripts/calico/kustomization.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - https://raw.githubusercontent.com/projectcalico/vpp-dataplane/v0.17.0-calicov3.20.2/yaml/generated/calico-vpp-nohuge.yaml + +patchesStrategicMerge: + - patch.yaml diff --git a/scripts/calico/patch.yaml b/scripts/calico/patch.yaml new file mode 100644 index 00000000..adf9f207 --- /dev/null +++ b/scripts/calico/patch.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: calico-vpp-config + namespace: calico-vpp-dataplane +data: + vpp_dataplane_interface: eno2 diff --git a/scripts/calico/setup-interfaces.sh b/scripts/calico/setup-interfaces.sh new file mode 100755 index 00000000..c093f56b --- /dev/null +++ b/scripts/calico/setup-interfaces.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +ip="$1" + +ip addr add "${ip}" dev eno2 +ip link set up dev eno2 diff --git a/scripts/calico/setup-node-ip.sh b/scripts/calico/setup-node-ip.sh new file mode 100755 index 00000000..ee9664bd --- /dev/null +++ b/scripts/calico/setup-node-ip.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e + +ip="$1" + +sed -Ei "s/(.*)\"/\1 --node-ip=${ip}\"/g" /var/lib/kubelet/kubeadm-flags.env +systemctl restart kubelet diff --git a/scripts/create-kubernetes-cluster.sh b/scripts/create-kubernetes-cluster.sh index 9fd84e99..5cdfc73d 100755 --- a/scripts/create-kubernetes-cluster.sh +++ b/scripts/create-kubernetes-cluster.sh @@ -1,82 +1,138 @@ #!/bin/bash -x -# shellcheck disable=SC2086 +# shellcheck disable=SC2086,SC2029 master_ip=$1 worker_ip=$2 sshkey=$3 -SSH_OPTS="-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes 
-i ${sshkey}" - -function wait_pids() { - pids="$1" - message="$2" - for pid in ${pids}; do - echo "waiting for PID ${pid}" - wait ${pid} - code=$? - if test $code -ne 0; then - echo "${message}: process exited with code $code, aborting..." && return 1 - fi - done - return 0 -} - -# Setup SR-IOV +SSH_CONFIG="ssh_config" +SSH_OPTS="-F ${SSH_CONFIG} -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i ${sshkey}" + +if [[ -n "$CALICO" ]]; then # calico + CALICO_MASTER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 ))" + CALICO_WORKER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 + 1 ))" + CALICO_SUBNET_MASK="30" +fi + +ENVS="KUBERNETES_VERSION CALICO" + +# wait_pids pid_1 ... pid_n +source scripts/include/wait-pids.sh +# wait_start ip_1 ... ip_n +source scripts/include/wait-start.sh + +wait_start ${master_ip} ${worker_ip} + +# 0. Setup SendEnv on the local side. +cp /etc/ssh/ssh_config ${SSH_CONFIG} || exit 1 +echo "Host * + SendEnv ${ENVS}" >> ${SSH_CONFIG} || exit 2 + +# 1. Setup AcceptEnv on the servers sides and wait for sshd to restart. +scp ${SSH_OPTS} scripts/setup-sshd.sh root@${master_ip}:setup-sshd.sh || exit 11 +scp ${SSH_OPTS} scripts/setup-sshd.sh root@${worker_ip}:setup-sshd.sh || exit 12 + +pids="" +ssh ${SSH_OPTS} root@${master_ip} ./setup-sshd.sh "${ENVS}" & +pids+=" $!" +ssh ${SSH_OPTS} root@${worker_ip} ./setup-sshd.sh "${ENVS}" & +pids+=" $!" +wait_pids "${pids}" "sshd config failed" || exit 13 + +wait_start ${master_ip} ${worker_ip} || exit 14 + +## 2. Setup SR-IOV. pids="" /bin/bash scripts/sriov/setup-SRIOV.sh "${master_ip}" "${worker_ip}" "${SSH_OPTS}" & pids+=" $!" -wait_pids "${pids}" "SR-IOV config failed" || exit 1 +wait_pids "${pids}" "SR-IOV config failed" || exit 21 + +if [[ -n "$CALICO" ]]; then # calico + # 3. Create Calico scripts directory on nodes. + ssh ${SSH_OPTS} root@${master_ip} mkdir calico || exit 31 + ssh ${SSH_OPTS} root@${worker_ip} mkdir calico || exit 32 + + # 4. Setup Calico interfaces. 
+ scp ${SSH_OPTS} scripts/calico/setup-interfaces.sh root@${master_ip}:calico/setup-interfaces.sh || exit 41 + scp ${SSH_OPTS} scripts/calico/setup-interfaces.sh root@${worker_ip}:calico/setup-interfaces.sh || exit 42 + + pids="" + ssh ${SSH_OPTS} root@${master_ip} ./calico/setup-interfaces.sh "${CALICO_MASTER_IP}/${CALICO_SUBNET_MASK}" & + pids+=" $!" + ssh ${SSH_OPTS} root@${worker_ip} ./calico/setup-interfaces.sh "${CALICO_WORKER_IP}/${CALICO_SUBNET_MASK}" & + pids+=" $!" + wait_pids "${pids}" "setup Calico interfaces failed" || exit 43 +fi -# Create k8s scripts directory on nodes -ssh ${SSH_OPTS} root@${master_ip} mkdir k8s -ssh ${SSH_OPTS} root@${worker_ip} mkdir k8s +# 5. Create k8s scripts directory on nodes. +ssh ${SSH_OPTS} root@${master_ip} mkdir k8s || exit 51 +ssh ${SSH_OPTS} root@${worker_ip} mkdir k8s || exit 52 -# Setup docker ulimit -scp ${SSH_OPTS} scripts/k8s/docker-ulimit.sh root@${master_ip}:k8s/docker-ulimit.sh || exit 2 -scp ${SSH_OPTS} scripts/k8s/docker-ulimit.sh root@${worker_ip}:k8s/docker-ulimit.sh || exit 3 +# 6. Config docker. +scp ${SSH_OPTS} scripts/k8s/config-docker.sh root@${master_ip}:k8s/config-docker.sh || exit 61 +scp ${SSH_OPTS} scripts/k8s/config-docker.sh root@${worker_ip}:k8s/config-docker.sh || exit 62 pids="" -ssh ${SSH_OPTS} root@${master_ip} ./k8s/docker-ulimit.sh & +ssh ${SSH_OPTS} root@${master_ip} ./k8s/config-docker.sh & pids+=" $!" -ssh ${SSH_OPTS} root@${worker_ip} ./k8s/docker-ulimit.sh & +ssh ${SSH_OPTS} root@${worker_ip} ./k8s/config-docker.sh & pids+=" $!" -wait_pids "${pids}" "kubernetes install failed" || exit 4 +wait_pids "${pids}" "docker config failed" || exit 63 -# Install kubeadm, kubelet and kubectl -scp ${SSH_OPTS} scripts/k8s/install-kubernetes.sh root@${master_ip}:k8s/install-kubernetes.sh || exit 5 -scp ${SSH_OPTS} scripts/k8s/install-kubernetes.sh root@${worker_ip}:k8s/install-kubernetes.sh || exit 6 +# 7. Install kubeadm, kubelet and kubectl. 
+scp ${SSH_OPTS} scripts/k8s/install-kubernetes.sh root@${master_ip}:k8s/install-kubernetes.sh || exit 71 +scp ${SSH_OPTS} scripts/k8s/install-kubernetes.sh root@${worker_ip}:k8s/install-kubernetes.sh || exit 72 pids="" -ssh ${SSH_OPTS} root@${master_ip} ./k8s/install-kubernetes.sh ${KUBERNETES_VERSION} & +ssh ${SSH_OPTS} root@${master_ip} ./k8s/install-kubernetes.sh & pids+=" $!" -ssh ${SSH_OPTS} root@${worker_ip} ./k8s/install-kubernetes.sh ${KUBERNETES_VERSION} & +ssh ${SSH_OPTS} root@${worker_ip} ./k8s/install-kubernetes.sh & pids+=" $!" -wait_pids "${pids}" "kubernetes install failed" || exit 7 +wait_pids "${pids}" "kubernetes install failed" || exit 73 -# master: start kubernetes and create join script -# worker: download kubernetes images -scp ${SSH_OPTS} scripts/k8s/start-master.sh root@${master_ip}:k8s/start-master.sh || exit 8 -scp ${SSH_OPTS} scripts/k8s/download-worker-images.sh root@${worker_ip}:k8s/download-worker-images.sh || exit 9 +# 8. +# master: start kubernetes and create join script. +# worker: download kubernetes images. +scp ${SSH_OPTS} scripts/k8s/start-master.sh root@${master_ip}:k8s/start-master.sh || exit 81 +scp ${SSH_OPTS} scripts/k8s/download-worker-images.sh root@${worker_ip}:k8s/download-worker-images.sh || exit 82 pids="" -ssh ${SSH_OPTS} root@${master_ip} ./k8s/start-master.sh ${KUBERNETES_VERSION} & +ssh ${SSH_OPTS} root@${master_ip} ./k8s/start-master.sh ${master_ip} ${CALICO_MASTER_IP} & pids+=" $!" ssh ${SSH_OPTS} root@${worker_ip} ./k8s/download-worker-images.sh & pids+=" $!" -wait_pids "${pids}" "node setup failed" || exit 10 +wait_pids "${pids}" "nodes setup failed" || exit 83 -# Download worker join script +# 9. Download, upload and run worker join script. 
mkdir -p /tmp/${master_ip} -scp ${SSH_OPTS} root@${master_ip}:k8s/join-cluster.sh /tmp/${master_ip}/join-cluster.sh || exit 11 -chmod +x /tmp/${master_ip}/join-cluster.sh || exit 12 +scp ${SSH_OPTS} root@${master_ip}:k8s/join-cluster.sh /tmp/${master_ip}/join-cluster.sh || exit 91 +chmod +x /tmp/${master_ip}/join-cluster.sh || exit 92 -# Upload and run worker join script -scp ${SSH_OPTS} /tmp/${master_ip}/join-cluster.sh root@${worker_ip}:k8s/join-cluster.sh || exit 13 +scp ${SSH_OPTS} /tmp/${master_ip}/join-cluster.sh root@${worker_ip}:k8s/join-cluster.sh || exit 93 pids="" ssh ${SSH_OPTS} root@${worker_ip} ./k8s/join-cluster.sh & pids+=" $!" -wait_pids "${pids}" "worker join failed" || exit 14 +wait_pids "${pids}" "worker join failed" || exit 94 + +# 10. Save KUBECONFIG to file. +scp ${SSH_OPTS} root@${master_ip}:.kube/config ${KUBECONFIG} || exit 101 + +if [[ -n "$CALICO" ]]; then # calico + # 11. Setup cluster nodes IPs. + scp ${SSH_OPTS} scripts/calico/setup-node-ip.sh root@${master_ip}:calico/setup-node-ip.sh || exit 111 + scp ${SSH_OPTS} scripts/calico/setup-node-ip.sh root@${worker_ip}:calico/setup-node-ip.sh || exit 112 + + pids="" + ssh ${SSH_OPTS} root@${master_ip} ./calico/setup-node-ip.sh "${CALICO_MASTER_IP}" & + pids+=" $!" + ssh ${SSH_OPTS} root@${worker_ip} ./calico/setup-node-ip.sh "${CALICO_WORKER_IP}" & + pids+=" $!" + wait_pids "${pids}" "nodes IPs setup failed" || exit 113 + + # 12. Deploy Calico CNI. 
+ /bin/bash scripts/calico/deploy-calico.sh || exit 121 +fi -echo "Save KUBECONFIG to file" -scp ${SSH_OPTS} root@${master_ip}:.kube/config ${KUBECONFIG} || exit 15 +# Get pods +kubectl get pods --all-namespaces \ No newline at end of file diff --git a/scripts/destroy-old-clusters.sh b/scripts/destroy-old-clusters.sh index 3b698f12..ba18e21d 100755 --- a/scripts/destroy-old-clusters.sh +++ b/scripts/destroy-old-clusters.sh @@ -1,3 +1,5 @@ -#!/bin/bash +#!/bin/bash -x + +set -e go run github.com/networkservicemesh/cloudtest/pkg/providers/packet/packet_cleanup -k y -c y diff --git a/scripts/download-postmortem-data.sh b/scripts/download-postmortem-data.sh index a5c18f79..a099f6f2 100755 --- a/scripts/download-postmortem-data.sh +++ b/scripts/download-postmortem-data.sh @@ -1,6 +1,8 @@ #!/bin/bash -x # shellcheck disable=SC2086 +set -e + master_ip=$1 worker_ip=$2 cluster_id=$3 diff --git a/scripts/include/wait-pids.sh b/scripts/include/wait-pids.sh new file mode 100755 index 00000000..fbd3ed50 --- /dev/null +++ b/scripts/include/wait-pids.sh @@ -0,0 +1,15 @@ +function wait_pids() { + pids="$1" + message="$2" + for pid in ${pids}; do + echo "waiting for PID ${pid}" + # shellcheck disable=SC2086 + wait ${pid} + code=$? + if test $code -ne 0; then + echo "${message}: process exited with code $code, aborting..." + return 1 + fi + done + return 0 +} diff --git a/scripts/include/wait-start.sh b/scripts/include/wait-start.sh new file mode 100755 index 00000000..a6c612af --- /dev/null +++ b/scripts/include/wait-start.sh @@ -0,0 +1,26 @@ +function wait_start() { + for ip in "$@"; do + success_attempts=0 + # ~15 minutes to start + for i in {1..60}; do + if [[ ${i} == 60 ]]; then + echo "timeout waiting for the ${ip} to start, aborting..." 
+ return 1 + fi + + # shellcheck disable=SC2086 + if ssh ${SSH_OPTS} -o ConnectTimeout=1 -o BatchMode=yes root@${ip} true; then + ((success_attempts++)) + else + success_attempts=0 + fi + + if [[ ${success_attempts} == 3 ]]; then + break + fi + + sleep 15 + done + done + return 0 +} diff --git a/scripts/k8s/docker-ulimit.sh b/scripts/k8s/config-docker.sh similarity index 73% rename from scripts/k8s/docker-ulimit.sh rename to scripts/k8s/config-docker.sh index 723885b7..e9887f0b 100755 --- a/scripts/k8s/docker-ulimit.sh +++ b/scripts/k8s/config-docker.sh @@ -1,4 +1,6 @@ -#!/bin/bash +#!/bin/bash -x + +set -e mkdir -p /etc/docker @@ -11,5 +13,6 @@ echo \ "soft": 67108864, "hard": 67108864 } - } + }, + "exec-opts": ["native.cgroupdriver=systemd"] }' >/etc/docker/daemon.json diff --git a/scripts/k8s/download-worker-images.sh b/scripts/k8s/download-worker-images.sh index 2c962726..9dc13d5d 100755 --- a/scripts/k8s/download-worker-images.sh +++ b/scripts/k8s/download-worker-images.sh @@ -1,2 +1,5 @@ -#!/bin/sh +#!/bin/bash -x + +set -e + kubeadm config images pull diff --git a/scripts/k8s/install-kubernetes.sh b/scripts/k8s/install-kubernetes.sh index beb0e472..3a056349 100755 --- a/scripts/k8s/install-kubernetes.sh +++ b/scripts/k8s/install-kubernetes.sh @@ -1,6 +1,8 @@ -#!/bin/sh +#!/bin/bash -x -KUBERNETES_VERSION="$1-00" +set -e + +VERSION="${KUBERNETES_VERSION}-00" curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - cat </etc/apt/sources.list.d/kubernetes.list @@ -9,7 +11,7 @@ EOF apt-get update apt-get install -y docker.io -apt-get install -qy kubelet="${KUBERNETES_VERSION}" kubectl="${KUBERNETES_VERSION}" kubeadm="${KUBERNETES_VERSION}" +apt-get install -qy kubelet="${VERSION}" kubectl="${VERSION}" kubeadm="${VERSION}" systemctl daemon-reload systemctl restart kubelet diff --git a/scripts/k8s/start-master.sh b/scripts/k8s/start-master.sh index 7d4628c5..02a821c7 100755 --- a/scripts/k8s/start-master.sh +++ b/scripts/k8s/start-master.sh @@ 
-1,19 +1,43 @@ -#!/bin/sh - -KUBERNETES_VERSION="$1" +#!/bin/bash -x +# shellcheck disable=SC2086 set -e +public_ip="$1" +calico_ip="$2" + K8S_DIR=$(dirname "$0") -kubeadm init --kubernetes-version "${KUBERNETES_VERSION}" --pod-network-cidr=192.168.0.0/16 --skip-token-print +if [[ -z "${CALICO}" ]]; then # not calico + ip="${public_ip}" +else + ip="${calico_ip}" +fi + +kubeadm init \ + --kubernetes-version "${KUBERNETES_VERSION}" \ + --pod-network-cidr=192.168.0.0/16 \ + --skip-token-print \ + --apiserver-advertise-address=$ip -mkdir -p "$HOME"/.kube -sudo cp -f /etc/kubernetes/admin.conf "$HOME"/.kube/config -sudo chown "$(id -u):$(id -g)" "$HOME"/.kube/config +mkdir -p ~/.kube +cp -f /etc/kubernetes/admin.conf ~/.kube/config +chown "$(id -u):$(id -g)" ~/.kube/config -kubectl apply -f "https://cloud.weave.works/k8s/net?k8s-version=$(kubectl version | base64 | tr -d '\n')&env.IPALLOC_RANGE=192.168.0.0/16" +if [[ -z "${CALICO}" ]]; then # not calico + kubectl apply -f "https://cloud.weave.works/k8s/net?k8s-version=$(kubectl version | base64 | tr -d '\n')&env.IPALLOC_RANGE=192.168.0.0/16" +fi kubectl taint nodes --all node-role.kubernetes.io/master- +if [[ -n "${CALICO}" ]]; then # calico + kubectl -n kube-system get configmap kubeadm-config -o jsonpath='{.data.ClusterConfiguration}' > kubeadm.yaml + sed -i "/^apiServer:$/a \ \ certSANs:\n - \"${public_ip}\"\n - \"${calico_ip}\"" kubeadm.yaml + + rm /etc/kubernetes/pki/apiserver.{crt,key} + kubeadm init phase certs apiserver --config kubeadm.yaml + + sed -i "s/${calico_ip//./\.}/${public_ip}/g" ~/.kube/config +fi + kubeadm token create --print-join-command > "${K8S_DIR}/join-cluster.sh" diff --git a/scripts/setup-sshd.sh b/scripts/setup-sshd.sh new file mode 100755 index 00000000..f32090b7 --- /dev/null +++ b/scripts/setup-sshd.sh @@ -0,0 +1,9 @@ +#!/bin/bash -x + +set -e + +ENVS="$*" + +echo "AcceptEnv ${ENVS}" >> /etc/ssh/sshd_config + +nohup bash -c "sleep 5; systemctl restart sshd" >/dev/null 2>&1 & \ No newline 
at end of file diff --git a/scripts/sriov/config-SRIOV.sh b/scripts/sriov/config-SRIOV.sh index fe61fce7..3852e2f1 100755 --- a/scripts/sriov/config-SRIOV.sh +++ b/scripts/sriov/config-SRIOV.sh @@ -1,6 +1,8 @@ -#!/bin/bash +#!/bin/bash -x # shellcheck disable=SC2064,SC2129 +set -e + CONFIG_DIRECTORY="/var/lib/networkservicemesh" CONFIG_FILE="${CONFIG_DIRECTORY}/sriov.config" diff --git a/scripts/sriov/enable-SRIOV.sh b/scripts/sriov/enable-SRIOV.sh index 012c097c..964edd30 100755 --- a/scripts/sriov/enable-SRIOV.sh +++ b/scripts/sriov/enable-SRIOV.sh @@ -1,4 +1,6 @@ -#!/bin/bash +#!/bin/bash -x + +set -e sed -Ei "s/(GRUB_CMDLINE_LINUX=.*)'/\1 intel_iommu=on'/" /etc/default/grub grub-mkconfig -o /boot/grub/grub.cfg diff --git a/scripts/sriov/enable-VFIO.sh b/scripts/sriov/enable-VFIO.sh index 0e4de224..7490c808 100755 --- a/scripts/sriov/enable-VFIO.sh +++ b/scripts/sriov/enable-VFIO.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -x # shellcheck disable=SC2002,SC2064 device="/sys/class/net/$1/device" diff --git a/scripts/sriov/setup-SRIOV.sh b/scripts/sriov/setup-SRIOV.sh index bf9ac1e5..ea4aa96e 100755 --- a/scripts/sriov/setup-SRIOV.sh +++ b/scripts/sriov/setup-SRIOV.sh @@ -5,23 +5,13 @@ master_ip="$1" worker_ip="$2" SSH_OPTS="$3" -function wait_pids() { - pids="$1" - message="$2" - for pid in ${pids}; do - echo "waiting for PID ${pid}" - wait ${pid} - code=$? - if test $code -ne 0; then - echo "${message}: process exited with code $code, aborting..." - return 1 - fi - done - return 0 -} - SRIOV_DIR=$(dirname "$0") +# wait_pids pid_1 ... pid_n +source scripts/include/wait-pids.sh +# wait_start ip_1 ... ip_n +source scripts/include/wait-start.sh + # Create SR-IOV scripts directory on nodes ssh ${SSH_OPTS} root@${master_ip} mkdir sriov ssh ${SSH_OPTS} root@${worker_ip} mkdir sriov @@ -37,30 +27,7 @@ ssh ${SSH_OPTS} root@${worker_ip} ./sriov/enable-SRIOV.sh & pids+=" $!" 
wait_pids "${pids}" "SR-IOV setup failed" || exit 3 -sleep 5 - -for ip in ${master_ip} ${worker_ip}; do - success_attempts=0 - # ~15 minutes to start - for i in {1..60}; do - if [[ ${i} == 60 ]]; then - echo "timeout waiting for the ${ip} to start, aborting..." - exit 4 - fi - - if ssh ${SSH_OPTS} -o ConnectTimeout=1 -o BatchMode=yes root@${ip} true; then - ((success_attempts++)) - else - success_attempts=0 - fi - - if [[ ${success_attempts} == 3 ]]; then - break - fi - - sleep 15 - done -done +wait_start ${master_ip} ${worker_ip} # Create SR-IOV config scp ${SSH_OPTS} ${SRIOV_DIR}/config-SRIOV.sh root@${master_ip}:sriov/config-SRIOV.sh || exit 5 From eb7251639f0d49e29429ce87ec2c859731a51faf Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Tue, 2 Nov 2021 12:15:56 +0700 Subject: [PATCH 2/6] Setup actions Signed-off-by: Vladimir Popov --- .cloudtest_calico.yaml | 31 ++++++++++++++++++++++++++++ .github/workflows/ci.yaml | 28 ++++++++++++++++++------- cloudtest/packet.yaml | 2 +- scripts/create-kubernetes-cluster.sh | 6 +++--- scripts/k8s/start-master.sh | 6 +++--- scripts/sriov/setup-SRIOV.sh | 2 +- 6 files changed, 59 insertions(+), 16 deletions(-) create mode 100644 .cloudtest_calico.yaml diff --git a/.cloudtest_calico.yaml b/.cloudtest_calico.yaml new file mode 100644 index 00000000..bb08463b --- /dev/null +++ b/.cloudtest_calico.yaml @@ -0,0 +1,31 @@ +--- +version: 1.0 +root: "./.tests/cloud_test_calico/" +timeout: 7200 # 2 hours total timeout +shuffle-enabled: true +statistics: + enabled: true + interval: 60 # 60 seconds for statistics +import: + - cloudtest/packet.yaml + - cloudtest/tests.yaml + +retest: # Allow a test re-run if certain kinds of failures are detected, like CNI network plugin errors. + count: 1 # Allow 1 restart + warmup-time: 15 # Wait 15 seconds of warmup before the cluster instance is used again. + allowed-retests: 2 # If a cluster instance has several retest requests one after another, we need to restart the cluster.
+ pattern: + - "NetworkPlugin cni failed to set up pod" # Error in AWS due to a leak of IPs or the inability to assign them. + - "etcdserver: request timed out" # Error in any cloud, reason unknown. + - "unable to establish connection to VPP (VPP API socket file /run/vpp/api.sock does not exist)" # VPP has not started; it will generally be restarted, but this causes the test to fail. + # Sometimes (rarely) the docker registry is unavailable for a moment + - "Error response from daemon: Get https://.*docker.io/.*: dial tcp: lookup registry" + - "Error response from daemon: Get https://.*docker.io/.*: net/http: request canceled while waiting for connection" + - "Failed create pod sandbox" +reporting: + junit-report: "results/junit.xml" +health-check: + - message: "Branch is not up to date" + interval: 60 # 1 minute + run: | + echo "Health check!" diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 534206bf..879863a4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -68,6 +68,9 @@ jobs: packet: name: packet runs-on: ubuntu-latest + strategy: + matrix: + calico: ["off", "on"] steps: - name: Set up /bin permissions # 1. Set up /bin permissions run: | @@ -87,32 +90,41 @@ jobs: with: repository: networkservicemesh/deployments-k8s path: networkservicemesh/deployments-k8s - - name: Checkout files # 5. Checkout files + - name: Compute suffix # 5. Compute suffix for cloudtest input and output paths + id: suffix + run: | + if [[ "${CALICO}" == "on" ]]; then + echo ::set-output name=val::_calico + fi + env: + CALICO: ${{ matrix.calico }} + - name: Checkout files # 6. Checkout files uses: actions/checkout@v2 with: path: ${{ github.repository }} - - name: Run tests with cloudtest # 6. Run tests with cloudtest + - name: Run tests with cloudtest # 7.
Run tests with cloudtest working-directory: ${{ github.repository }} run: | - cloudtest + cloudtest --config=.cloudtest${suffix}.yaml env: PACKET_AUTH_TOKEN: ${{ secrets.PACKET_AUTH_TOKEN }} PACKET_PROJECT_ID: 383890d0-f5d1-4de1-881a-4d1ede549d18 KUBERNETES_VERSION: ${{ secrets.NSM_KUBERNETES_VERSION }} - CALICO: on - - name: Publish test report # 7. Publish test report + CALICO: ${{ matrix.calico }} + suffix: ${{ steps.suffix.outputs.val }} + - name: Publish test report # 8. Publish test report uses: mikepenz/action-junit-report@v2.1.0 if: ${{ always() }} with: - report_paths: "**/cloud_test/results/junit.xml" + report_paths: "**/cloud_test${{ steps.suffix.outputs.val }}/results/junit.xml" suite_regex: "Test*" github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Upload logs # 8. Upload logs + - name: Upload logs # 9. Upload logs uses: actions/upload-artifact@v2 if: ${{ always() }} with: name: logs-${{ github.run_number }} - path: ${{ github.repository }}/.tests/cloud_test/ + path: ${{ github.repository }}/.tests/cloud_test${{ steps.suffix.outputs.val }}/ packet-cleanup: name: packet cleanup diff --git a/cloudtest/packet.yaml b/cloudtest/packet.yaml index 348cae98..08d2fbb0 100644 --- a/cloudtest/packet.yaml +++ b/cloudtest/packet.yaml @@ -4,7 +4,7 @@ providers: - name: "packet" kind: "packet" instances: 1 - retry: 0 + retry: 5 node-count: 2 enabled: true timeout: 2400 # 40 minutes to start cluster diff --git a/scripts/create-kubernetes-cluster.sh b/scripts/create-kubernetes-cluster.sh index 5cdfc73d..939994ff 100755 --- a/scripts/create-kubernetes-cluster.sh +++ b/scripts/create-kubernetes-cluster.sh @@ -8,7 +8,7 @@ sshkey=$3 SSH_CONFIG="ssh_config" SSH_OPTS="-F ${SSH_CONFIG} -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i ${sshkey}" -if [[ -n "$CALICO" ]]; then # calico +if [[ "$CALICO" == "on" ]]; then # calico CALICO_MASTER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 ))" CALICO_WORKER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 + 1 ))" 
CALICO_SUBNET_MASK="30" @@ -47,7 +47,7 @@ pids="" pids+=" $!" wait_pids "${pids}" "SR-IOV config failed" || exit 21 -if [[ -n "$CALICO" ]]; then # calico +if [[ "$CALICO" == "on" ]]; then # calico # 3. Create Calico scripts directory on nodes. ssh ${SSH_OPTS} root@${master_ip} mkdir calico || exit 31 ssh ${SSH_OPTS} root@${worker_ip} mkdir calico || exit 32 @@ -118,7 +118,7 @@ wait_pids "${pids}" "worker join failed" || exit 94 # 10. Save KUBECONFIG to file. scp ${SSH_OPTS} root@${master_ip}:.kube/config ${KUBECONFIG} || exit 101 -if [[ -n "$CALICO" ]]; then # calico +if [[ "$CALICO" == "on" ]]; then # calico # 11. Setup cluster nodes IPs. scp ${SSH_OPTS} scripts/calico/setup-node-ip.sh root@${master_ip}:calico/setup-node-ip.sh || exit 111 scp ${SSH_OPTS} scripts/calico/setup-node-ip.sh root@${worker_ip}:calico/setup-node-ip.sh || exit 112 diff --git a/scripts/k8s/start-master.sh b/scripts/k8s/start-master.sh index 02a821c7..42c3ef32 100755 --- a/scripts/k8s/start-master.sh +++ b/scripts/k8s/start-master.sh @@ -8,7 +8,7 @@ calico_ip="$2" K8S_DIR=$(dirname "$0") -if [[ -z "${CALICO}" ]]; then # not calico +if [[ "$CALICO" != "on" ]]; then # not calico ip="${public_ip}" else ip="${calico_ip}" @@ -24,13 +24,13 @@ mkdir -p ~/.kube cp -f /etc/kubernetes/admin.conf ~/.kube/config chown "$(id -u):$(id -g)" ~/.kube/config -if [[ -z "${CALICO}" ]]; then # not calico +if [[ "$CALICO" != "on" ]]; then # not calico kubectl apply -f "https://cloud.weave.works/k8s/net?k8s-version=$(kubectl version | base64 | tr -d '\n')&env.IPALLOC_RANGE=192.168.0.0/16" fi kubectl taint nodes --all node-role.kubernetes.io/master- -if [[ -n "${CALICO}" ]]; then # calico +if [[ "$CALICO" == "on" ]]; then # calico kubectl -n kube-system get configmap kubeadm-config -o jsonpath='{.data.ClusterConfiguration}' > kubeadm.yaml sed -i "/^apiServer:$/a \ \ certSANs:\n - \"${public_ip}\"\n - \"${calico_ip}\"" kubeadm.yaml diff --git a/scripts/sriov/setup-SRIOV.sh b/scripts/sriov/setup-SRIOV.sh index 
ea4aa96e..498dc063 100755 --- a/scripts/sriov/setup-SRIOV.sh +++ b/scripts/sriov/setup-SRIOV.sh @@ -27,7 +27,7 @@ ssh ${SSH_OPTS} root@${worker_ip} ./sriov/enable-SRIOV.sh & pids+=" $!" wait_pids "${pids}" "SR-IOV setup failed" || exit 3 -wait_start ${master_ip} ${worker_ip} +wait_start ${master_ip} ${worker_ip} || exit 4 # Create SR-IOV config scp ${SSH_OPTS} ${SRIOV_DIR}/config-SRIOV.sh root@${master_ip}:sriov/config-SRIOV.sh || exit 5 From 1976fae9dff2ecc80f930f48d47d796d76b2b815 Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Tue, 2 Nov 2021 12:20:02 +0700 Subject: [PATCH 3/6] Fix tests Signed-off-by: Vladimir Popov --- main_test.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/main_test.go b/main_test.go index 1b6c3fdf..af463c55 100644 --- a/main_test.go +++ b/main_test.go @@ -22,6 +22,8 @@ import ( "github.com/stretchr/testify/suite" + "github.com/networkservicemesh/integration-tests/suites/basic" + "github.com/networkservicemesh/integration-tests/suites/features" "github.com/networkservicemesh/integration-tests/suites/heal" "github.com/networkservicemesh/integration-tests/suites/memory" "github.com/networkservicemesh/integration-tests/suites/multiforwarder" @@ -29,7 +31,7 @@ import ( ) func isCalico() bool { - return os.Getenv("CALICO") != "" + return os.Getenv("CALICO") == "on" } func TestMemory(t *testing.T) { @@ -56,3 +58,17 @@ func TestHeal(t *testing.T) { } suite.Run(t, new(heal.Suite)) } + +func TestBasic(t *testing.T) { + if isCalico() { + t.Skip("not available with Calico") + } + suite.Run(t, new(basic.Suite)) +} + +func TestFeatures(t *testing.T) { + if isCalico() { + t.Skip("not available with Calico") + } + suite.Run(t, new(features.Suite)) +} From a3256c7a35651f567737dffa98c775e14e50f111 Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Tue, 2 Nov 2021 13:53:42 +0700 Subject: [PATCH 4/6] Fix issues Signed-off-by: Vladimir Popov --- .github/workflows/ci.yaml | 6 ++++-- 
scripts/create-kubernetes-cluster.sh | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 879863a4..80034573 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -66,9 +66,10 @@ jobs: git diff --name-only --exit-code go.sum || ( echo "Run go tidy" && false ) packet: - name: packet + name: packet (Calico ${{ matrix.calico }}) runs-on: ubuntu-latest strategy: + fail-fast: false matrix: calico: ["off", "on"] steps: @@ -119,12 +120,13 @@ jobs: report_paths: "**/cloud_test${{ steps.suffix.outputs.val }}/results/junit.xml" suite_regex: "Test*" github_token: ${{ secrets.GITHUB_TOKEN }} + check_name: "JUnit Test Report (Calico ${{ matrix.calico }})" - name: Upload logs # 9. Upload logs uses: actions/upload-artifact@v2 if: ${{ always() }} with: name: logs-${{ github.run_number }} - path: ${{ github.repository }}/.tests/cloud_test${{ steps.suffix.outputs.val }}/ + path: ${{ github.repository }}/.tests/ packet-cleanup: name: packet cleanup diff --git a/scripts/create-kubernetes-cluster.sh b/scripts/create-kubernetes-cluster.sh index 939994ff..968a7b18 100755 --- a/scripts/create-kubernetes-cluster.sh +++ b/scripts/create-kubernetes-cluster.sh @@ -21,13 +21,13 @@ source scripts/include/wait-pids.sh # wait_start ip_1 ... ip_n source scripts/include/wait-start.sh -wait_start ${master_ip} ${worker_ip} - # 0. Setup SendEnv on the local side. cp /etc/ssh/ssh_config ${SSH_CONFIG} || exit 1 echo "Host * SendEnv ${ENVS}" >> ${SSH_CONFIG} || exit 2 +wait_start ${master_ip} ${worker_ip} || exit 3 + # 1. Setup AcceptEnv on the servers sides and wait for sshd to restart. 
scp ${SSH_OPTS} scripts/setup-sshd.sh root@${master_ip}:setup-sshd.sh || exit 11 scp ${SSH_OPTS} scripts/setup-sshd.sh root@${worker_ip}:setup-sshd.sh || exit 12 From 0da0e0a62672cdba540d72c9676b574ce726f99d Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Tue, 2 Nov 2021 18:05:40 +0700 Subject: [PATCH 5/6] Fix Calico IP addresses Signed-off-by: Vladimir Popov --- scripts/create-kubernetes-cluster.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/create-kubernetes-cluster.sh b/scripts/create-kubernetes-cluster.sh index 968a7b18..b5529dba 100755 --- a/scripts/create-kubernetes-cluster.sh +++ b/scripts/create-kubernetes-cluster.sh @@ -9,8 +9,12 @@ SSH_CONFIG="ssh_config" SSH_OPTS="-F ${SSH_CONFIG} -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i ${sshkey}" if [[ "$CALICO" == "on" ]]; then # calico - CALICO_MASTER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 ))" - CALICO_WORKER_IP="10.0.0.$(( GITHUB_RUN_NUMBER % 100 + 1 ))" + # Use a new 10.0.0.${base_ip}/30 subnet to prevent IP addresses collisions + # ${base_ip} should be <= 248, because 10.0.0.252/30 subnet is reserved for manual testing + base_ip=$(( GITHUB_RUN_NUMBER % 63 * 4 )) + + CALICO_MASTER_IP="10.0.0.$(( base_ip + 1 ))" + CALICO_WORKER_IP="10.0.0.$(( base_ip + 2 ))" CALICO_SUBNET_MASK="30" fi From 880c916898bd5484063b223db00520a9d2810512 Mon Sep 17 00:00:00 2001 From: Vladimir Popov Date: Wed, 3 Nov 2021 20:40:48 +0700 Subject: [PATCH 6/6] Remove feature tests Signed-off-by: Vladimir Popov --- main_test.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/main_test.go b/main_test.go index af463c55..692d3853 100644 --- a/main_test.go +++ b/main_test.go @@ -23,7 +23,6 @@ import ( "github.com/stretchr/testify/suite" "github.com/networkservicemesh/integration-tests/suites/basic" - "github.com/networkservicemesh/integration-tests/suites/features" "github.com/networkservicemesh/integration-tests/suites/heal" 
"github.com/networkservicemesh/integration-tests/suites/memory" "github.com/networkservicemesh/integration-tests/suites/multiforwarder" @@ -65,10 +64,3 @@ func TestBasic(t *testing.T) { } suite.Run(t, new(basic.Suite)) } - -func TestFeatures(t *testing.T) { - if isCalico() { - t.Skip("not available with Calico") - } - suite.Run(t, new(features.Suite)) -}