Skip to content

Commit

Permalink
feat: add a webhook to prevent eviction of pods on kosmos NotReady nodes
Browse files Browse the repository at this point in the history
Signed-off-by: wangyizhi1 <[email protected]>
  • Loading branch information
wangyizhi1 committed Dec 22, 2023
1 parent 76c7ecb commit ce7a0f4
Show file tree
Hide file tree
Showing 14 changed files with 436 additions and 0 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ TARGETS := clusterlink-controller-manager \
clusterlink-proxy \
clustertree-cluster-manager \
scheduler \
webhook

CTL_TARGETS := kosmosctl

Expand Down
63 changes: 63 additions & 0 deletions cmd/webhook/app/options/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package options

import (
"github.com/spf13/pflag"
"k8s.io/client-go/tools/leaderelection/resourcelock"
componentbaseconfig "k8s.io/component-base/config"

"github.com/kosmos.io/kosmos/pkg/utils"
"github.com/kosmos.io/kosmos/pkg/webhook"
)

// Options aggregates the command-line configuration of the kosmos webhook
// command. Defaults are populated by NewOptions and AddFlags.
type Options struct {
// LeaderElection carries the leader-election settings (enable switch, lock
// name and lock namespace) exposed through the --leader-elect* flags.
LeaderElection componentbaseconfig.LeaderElectionConfiguration
// KubernetesOptions describes how to reach and rate-limit against the kube-apiserver.
KubernetesOptions KubernetesOptions
// WebhookServerOptions describes the listen address and TLS files of the webhook server.
WebhookServerOptions WebhookServerOptions
// PodValidatorOptions is bound to the --usernames-need-to-prevent flag in AddFlags.
PodValidatorOptions webhook.PodValidatorOptions
}

// KubernetesOptions holds the kube-apiserver connection settings bound to the
// --kubeconfig, --master, --kube-qps and --kube-burst flags.
type KubernetesOptions struct {
// KubeConfig is the path of the kubeconfig file; the flag help states that an
// empty value means the in-cluster configuration is used.
KubeConfig string `json:"kubeconfig" yaml:"kubeconfig"`
// Master is bound to the --master flag.
Master string `json:"master,omitempty" yaml:"master,omitempty"`
// QPS is the queries-per-second limit used while talking with kube-apiserver.
QPS float32 `json:"qps,omitempty" yaml:"qps,omitempty"`
// Burst is the burst limit used while talking with kube-apiserver.
Burst int `json:"burst,omitempty" yaml:"burst,omitempty"`
}

// WebhookServerOptions holds the listen address and TLS material location of
// the webhook HTTPS server.
type WebhookServerOptions struct {
// Host is the IP address to listen on (--bind-address).
Host string
// Port is the secure port to serve HTTPS on (--secure-port).
Port int
// CertDir is the directory that contains the server key and certificate (--cert-dir).
CertDir string
// CertName is the file name of the server certificate (--tls-cert-file-name).
CertName string
// KeyName is the file name of the server private key (--tls-private-key-file-name).
KeyName string
}

// NewOptions returns an Options value pre-populated with the leader-election
// defaults of the webhook: leader election enabled, a Lease-based lock in the
// default kosmos namespace, and the lock name "kosmos-webhook".
//
// The lock name matches the default of the --leader-elect-resource-name flag
// registered in AddFlags, so the webhook uses its own lease whether or not
// flags are registered.
func NewOptions() *Options {
	return &Options{
		LeaderElection: componentbaseconfig.LeaderElectionConfiguration{
			LeaderElect:       true,
			ResourceLock:      resourcelock.LeasesResourceLock,
			ResourceNamespace: utils.DefaultNamespace,
			ResourceName:      "kosmos-webhook",
		},
	}
}

// AddFlags binds every field of Options to a command-line flag on the given
// flag set. Calling it on a nil receiver is a no-op.
func (o *Options) AddFlags(flags *pflag.FlagSet) {
	if o == nil {
		return
	}

	// Leader election.
	election := &o.LeaderElection
	flags.BoolVar(&election.LeaderElect, "leader-elect", true, "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.")
	flags.StringVar(&election.ResourceName, "leader-elect-resource-name", "kosmos-webhook", "The name of resource object that is used for locking during leader election.")
	flags.StringVar(&election.ResourceNamespace, "leader-elect-resource-namespace", utils.DefaultNamespace, "The namespace of resource object that is used for locking during leader election.")

	// kube-apiserver client.
	kube := &o.KubernetesOptions
	flags.Float32Var(&kube.QPS, "kube-qps", 40.0, "QPS to use while talking with kube-apiserver.")
	flags.IntVar(&kube.Burst, "kube-burst", 60, "Burst to use while talking with kube-apiserver.")
	flags.StringVar(&kube.KubeConfig, "kubeconfig", "", "Path for kubernetes kubeconfig file, if left blank, will use in cluster way.")
	flags.StringVar(&kube.Master, "master", "", "Used to generate kubeconfig for downloading, if not specified, will use host in kubeconfig.")

	// Webhook HTTPS server.
	server := &o.WebhookServerOptions
	flags.StringVar(&server.Host, "bind-address", "0.0.0.0", "The IP address on which to listen for the --secure-port port.")
	flags.IntVar(&server.Port, "secure-port", 9443, "The secure port on which to serve HTTPS.")
	flags.StringVar(&server.CertDir, "cert-dir", "/etc/certs", "The directory that contains the server key and certificate.")
	flags.StringVar(&server.CertName, "tls-cert-file-name", "tls.crt", "The name of server certificate.")
	flags.StringVar(&server.KeyName, "tls-private-key-file-name", "tls.key", "The name of server key.")

	// Pod validator.
	flags.StringArrayVar(&o.PodValidatorOptions.UsernamesNeedToPrevent, "usernames-need-to-prevent", []string{"system:serviceaccount:kube-system:node-controller"}, "Usernames that need to prevent deleting pods on NotReady kosmos nodes.")
}
10 changes: 10 additions & 0 deletions cmd/webhook/app/options/validation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package options

import "k8s.io/apimachinery/pkg/util/validation/field"

// Validate inspects the Options and returns the list of problems found.
// No checks are implemented yet, so the returned list is always empty
// (and non-nil).
func (o *Options) Validate() field.ErrorList {
	return field.ErrorList{}
}
102 changes: 102 additions & 0 deletions cmd/webhook/app/webhook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package app

import (
"context"
"fmt"
"net/http"

"github.com/spf13/cobra"
"k8s.io/client-go/tools/clientcmd"
cliflag "k8s.io/component-base/cli/flag"
"k8s.io/klog/v2"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/webhook"

"github.com/kosmos.io/kosmos/cmd/webhook/app/options"
"github.com/kosmos.io/kosmos/pkg/scheme"
"github.com/kosmos.io/kosmos/pkg/sharedcli/klogflag"
kosmoswebhook "github.com/kosmos.io/kosmos/pkg/webhook"
)

// NewWebhookCommand builds the "kosmos-webhook" cobra command. The command
// validates its options and then hands control to Run with the supplied
// context; it rejects any non-empty positional argument.
func NewWebhookCommand(ctx context.Context) *cobra.Command {
	opts := options.NewOptions()

	// rejectArgs fails as soon as a non-empty positional argument is seen.
	rejectArgs := func(cmd *cobra.Command, args []string) error {
		for _, arg := range args {
			if arg != "" {
				return fmt.Errorf("%q does not take any arguments, got %q", cmd.CommandPath(), args)
			}
		}
		return nil
	}

	// runWebhook validates the parsed options before starting the server.
	runWebhook := func(cmd *cobra.Command, args []string) error {
		if errs := opts.Validate(); len(errs) != 0 {
			return errs.ToAggregate()
		}
		return Run(ctx, opts)
	}

	cmd := &cobra.Command{
		Use:  "kosmos-webhook",
		Long: `TODO`,
		RunE: runWebhook,
		Args: rejectArgs,
	}

	// Flags are grouped into named sets: component options and logging options.
	fss := cliflag.NamedFlagSets{}

	genericFlagSet := fss.FlagSet("generic")
	opts.AddFlags(genericFlagSet)

	logsFlagSet := fss.FlagSet("logs")
	klogflag.Add(logsFlagSet)

	cmdFlags := cmd.Flags()
	cmdFlags.AddFlagSet(genericFlagSet)
	cmdFlags.AddFlagSet(logsFlagSet)

	return cmd
}

// Run creates a controller-runtime manager hosting the webhook server,
// registers the pod-deletion validator on "/validate-delete-pod" and a health
// handler on "/health", and then starts the manager with ctx.
//
// It returns an error when the client configuration cannot be built, when the
// manager cannot be created, or when mgr.Start returns an error.
func Run(ctx context.Context, opts *options.Options) error {
	config, err := clientcmd.BuildConfigFromFlags(opts.KubernetesOptions.Master, opts.KubernetesOptions.KubeConfig)
	if err != nil {
		// Report the failure through the error return like every other
		// failure path of this function instead of panicking.
		klog.Errorf("failed to build kubeconfig: %v", err)
		return err
	}
	config.QPS, config.Burst = opts.KubernetesOptions.QPS, opts.KubernetesOptions.Burst

	mgr, err := controllerruntime.NewManager(config, controllerruntime.Options{
		Logger: klog.Background(),
		Scheme: scheme.NewSchema(),
		WebhookServer: &webhook.Server{
			Host:     opts.WebhookServerOptions.Host,
			Port:     opts.WebhookServerOptions.Port,
			CertDir:  opts.WebhookServerOptions.CertDir,
			CertName: opts.WebhookServerOptions.CertName,
			KeyName:  opts.WebhookServerOptions.KeyName,
		},
		// "0" turns off the manager's own metrics and health-probe listeners;
		// health is served from the webhook mux below instead.
		MetricsBindAddress:      "0",
		HealthProbeBindAddress:  "0",
		LeaderElection:          opts.LeaderElection.LeaderElect,
		LeaderElectionID:        opts.LeaderElection.ResourceName,
		LeaderElectionNamespace: opts.LeaderElection.ResourceNamespace,
	})
	if err != nil {
		klog.Errorf("failed to build webhook server: %v", err)
		return err
	}

	hookServer := mgr.GetWebhookServer()
	hookServer.Register("/validate-delete-pod", &webhook.Admission{Handler: &kosmoswebhook.PodValidator{
		Client:  mgr.GetClient(),
		Options: opts.PodValidatorOptions,
	}})
	hookServer.WebhookMux.Handle("/health", http.StripPrefix("/health", &healthz.Handler{}))

	if err := mgr.Start(ctx); err != nil {
		klog.Errorf("failed to start webhook manager: %v", err)
		return err
	}

	return nil
}
17 changes: 17 additions & 0 deletions cmd/webhook/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package main

import (
"os"

apiserver "k8s.io/apiserver/pkg/server"
"k8s.io/component-base/cli"

"github.com/kosmos.io/kosmos/cmd/webhook/app"
)

func main() {
ctx := apiserver.SetupSignalContext()
cmd := app.NewWebhookCommand(ctx)
code := cli.Run(cmd)
os.Exit(code)
}
53 changes: 53 additions & 0 deletions deploy/webhook/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Deployment running the kosmos webhook server in the kosmos-system namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
name: kosmos-webhook
namespace: kosmos-system
spec:
selector:
matchLabels:
app: kosmos-webhook
template:
metadata:
labels:
app: kosmos-webhook
spec:
serviceAccountName: kosmos-webhook
affinity:
# Only schedule onto nodes that do NOT carry the kosmos.io/node label.
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kosmos.io/node
operator: DoesNotExist
# Never co-locate two kosmos-webhook pods on the same host.
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- kosmos-webhook
namespaces:
- kosmos-system
topologyKey: kubernetes.io/hostname
containers:
# __VERSION__ is a placeholder; presumably substituted at install time - confirm.
- image: ghcr.io/kosmos-io/webhook:__VERSION__
name: kosmos-webhook
volumeMounts:
# Mount path equals the default of the webhook's --cert-dir flag.
- name: tls
mountPath: "/etc/certs"
command:
- webhook
- --v=4
resources:
limits:
memory: 500Mi
cpu: 500m
requests:
cpu: 500m
memory: 500Mi
volumes:
# Server certificate and key come from the kosmos-webhook-tls secret.
- name: tls
secret:
secretName: kosmos-webhook-tls
27 changes: 27 additions & 0 deletions deploy/webhook/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Service account used by the kosmos-webhook Deployment.
apiVersion: v1
kind: ServiceAccount
metadata:
name: kosmos-webhook
namespace: kosmos-system
---
# Cluster-wide permissions of the webhook: every verb on nodes and pods in any API group.
# NOTE(review): the webhook enables leader election by default, which presumably
# needs access to the lock resource as well; no such rule is visible here - confirm.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kosmos-webhook
rules:
- apiGroups: ['*']
resources: ["nodes", "pods"]
verbs: ["*"]
---
# Binds the ClusterRole above to the kosmos-webhook service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kosmos-webhook
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kosmos-webhook
subjects:
- kind: ServiceAccount
name: kosmos-webhook
namespace: kosmos-system
12 changes: 12 additions & 0 deletions deploy/webhook/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Service exposing the webhook pods (label app=kosmos-webhook) on TCP 9443.
apiVersion: v1
kind: Service
metadata:
name: kosmos-webhook
namespace: kosmos-system
spec:
ports:
# 9443 is the default of the webhook's --secure-port flag.
- port: 9443
protocol: TCP
targetPort: 9443
selector:
app: kosmos-webhook
9 changes: 9 additions & 0 deletions deploy/webhook/tls-secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# TLS secret referenced as secretName kosmos-webhook-tls by the webhook Deployment.
# __BASE64_SERVER_CRT__ and __BASE64_SERVER_KEY__ are placeholders; presumably
# replaced with base64-encoded PEM data at install time - confirm.
apiVersion: v1
data:
tls.crt: __BASE64_SERVER_CRT__
tls.key: __BASE64_SERVER_KEY__
kind: Secret
metadata:
name: kosmos-webhook-tls
namespace: kosmos-system
type: kubernetes.io/tls
26 changes: 26 additions & 0 deletions deploy/webhook/validate-delete-pod-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Registers the kosmos webhook as a validating admission webhook for pod DELETE requests.
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: "validate-delete-pod.kosmos.io"
webhooks:
- name: "validate-delete-pod.kosmos.io"
# Intercept DELETE operations on core/v1 pods in any scope.
rules:
- apiGroups: [""]
apiVersions: ["v1"]
operations: ["DELETE"]
resources: ["pods"]
scope: "*"
admissionReviewVersions: ["v1"]
# FailurePolicy defines how unrecognized errors from the admission endpoint are handled - allowed values are
# Ignore or Fail. Defaults to Fail.
failurePolicy: Ignore
sideEffects: None
timeoutSeconds: 3
# Target the kosmos-webhook Service; the path is the one the webhook server
# registers its pod validator on.
clientConfig:
service:
namespace: kosmos-system
name: kosmos-webhook
path: /validate-delete-pod
port: 9443
# __BASE64_CA_CRT__ is a placeholder for the CA bundle; presumably substituted at install time - confirm.
caBundle: |
__BASE64_CA_CRT__
36 changes: 36 additions & 0 deletions hack/gen-webhook-certs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

# Generates a self-signed CA and a server certificate for the kosmos webhook,
# writes the server pair as a TLS secret manifest to tls-secret.yaml, and prints
# the base64-encoded CA certificate for use as the webhook caBundle.
# All intermediate key material is removed from the working directory on exit.

# Stop at the first failing step so a broken openssl/kubectl call is not
# followed by steps that consume its missing output.
set -o errexit
set -o nounset
set -o pipefail

WEBHOOK_NAME="kosmos-webhook"
NAMESPACE="kosmos-system"
DAYS="36500"

# Remove generated key material whether the script succeeds or fails.
cleanup() {
  rm -f ca.crt ca.key ca.srl server.crt server.csr server.key
}
trap cleanup EXIT

# Certificate authority.
openssl genrsa -out ca.key 2048

openssl req -new -x509 -days "${DAYS}" -key ca.key \
  -subj "/C=CN/CN=${WEBHOOK_NAME}" \
  -out ca.crt

# Server key and signing request.
openssl req -newkey rsa:2048 -nodes -keyout server.key \
  -subj "/C=CN/CN=${WEBHOOK_NAME}" \
  -out server.csr

# Sign the server certificate; the SAN is the in-cluster service DNS name.
openssl x509 -req \
  -extfile <(printf 'subjectAltName=DNS:%s.%s.svc' "${WEBHOOK_NAME}" "${NAMESPACE}") \
  -days "${DAYS}" \
  -in server.csr \
  -CA ca.crt -CAkey ca.key -CAcreateserial \
  -out server.crt

echo
echo ">> Generating kube secrets..."
# Emit the namespace in the manifest so the secret lands next to the webhook.
kubectl create secret tls "${WEBHOOK_NAME}-tls" \
  --namespace "${NAMESPACE}" \
  --cert=server.crt \
  --key=server.key \
  --dry-run=client -o yaml \
  > tls-secret.yaml

echo
echo ">> ValidatingWebhookConfiguration caBundle:"
base64 < ca.crt | fold
1 change: 1 addition & 0 deletions hack/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ CLUSTERLINK_TARGET_SOURCE=(
clusterlink-controller-manager=cmd/clusterlink/controller-manager
clustertree-cluster-manager=cmd/clustertree/cluster-manager
kosmosctl=cmd/kosmosctl
webhook=cmd/webhook
)

#https://textkool.com/en/ascii-art-generator?hl=default&vl=default&font=DOS%20Rebel&text=KOSMOS
Expand Down
9 changes: 9 additions & 0 deletions pkg/utils/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,3 +342,12 @@ func ListResourceClusters(anno map[string]string) []string {
owners := strings.Split(anno[KosmosResourceOwnersAnnotations], ",")
return owners
}

// IsNotReady reports whether the node has no NodeReady condition whose status
// is ConditionTrue. A node without any conditions is therefore reported as
// not ready.
func IsNotReady(node *corev1.Node) bool {
	conditions := node.Status.Conditions
	for i := range conditions {
		if conditions[i].Type != corev1.NodeReady {
			continue
		}
		if conditions[i].Status == corev1.ConditionTrue {
			return false
		}
	}
	return true
}
Loading

0 comments on commit ce7a0f4

Please sign in to comment.