diff --git a/api/apps/v1alpha1/nimcache_types.go b/api/apps/v1alpha1/nimcache_types.go index f16a9b9b..d9dca074 100644 --- a/api/apps/v1alpha1/nimcache_types.go +++ b/api/apps/v1alpha1/nimcache_types.go @@ -258,6 +258,16 @@ func (n *NIMCache) GetGroupID() *int64 { return n.Spec.GroupID } +// GetTolerations returns tolerations configured for the NIMCache Job +func (n *NIMCache) GetTolerations() []corev1.Toleration { + return n.Spec.Tolerations +} + +// GetNodeSelectors returns nodeselectors configured for the NIMCache Job +func (n *NIMCache) GetNodeSelectors() map[string]string { + return n.Spec.NodeSelectors +} + // +kubebuilder:object:root=true // NIMCacheList contains a list of NIMCache diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go index f237aceb..9e829465 100644 --- a/api/apps/v1alpha1/nimservice_types.go +++ b/api/apps/v1alpha1/nimservice_types.go @@ -25,6 +25,7 @@ import ( autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" ) @@ -187,7 +188,7 @@ func (n *NIMService) GetStandardEnv() []corev1.EnvVar { // GetStandardAnnotations returns default annotations to apply to the NIMService instance func (n *NIMService) GetStandardAnnotations() map[string]string { standardAnnotations := map[string]string{ - "openshift.io/scc": "anyuid", + "openshift.io/scc": "nonroot", } return standardAnnotations } @@ -633,7 +634,16 @@ func (n *NIMService) GetRoleParams() *rendertypes.RoleParams { params.Name = n.GetName() params.Namespace = n.GetNamespace() - // TODO: set rules + // Set rules to use SCC + params.Rules = []rbacv1.PolicyRule{ + { + APIGroups: []string{"security.openshift.io"}, + Resources: []string{"securitycontextconstraints"}, + ResourceNames: []string{"nonroot"}, + Verbs: []string{"use"}, + }, + } + return params } diff --git a/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml b/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml index fd67e2d4..9128c4de 100644 --- a/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml +++ b/bundle/manifests/k8s-nim-operator.clusterserviceversion.yaml @@ -470,6 +470,14 @@ spec: - get - list - watch + - apiGroups: + - security.openshift.io + resourceNames: + - nonroot + resources: + - securitycontextconstraints + verbs: + - use - apiGroups: - security.openshift.io resources: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index a2cffc85..e6f51422 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -263,6 +263,14 @@ rules: - patch - update - watch +- apiGroups: + - security.openshift.io + resourceNames: + - nonroot + resources: + - securitycontextconstraints + verbs: + - use - apiGroups: - storage.k8s.io resources: diff --git a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml index 7a1239ee..892ad49e 100644 --- a/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml +++ b/deployments/helm/k8s-nim-operator/templates/manager-rbac.yaml @@ -246,6 +246,14 @@ rules: - get - list - watch +- apiGroups: + - security.openshift.io + resourceNames: + - nonroot + resources: + - securitycontextconstraints + verbs: + - use - apiGroups: - security.openshift.io resources: diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index 29dea0f4..6d5cbd6f 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -37,6 +37,7 @@ import ( "gopkg.in/yaml.v2" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" "k8s.io/apimachinery/pkg/api/errors" apiResource "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -60,6 +61,15 @@ const ( // AllProfiles represents all profiles in the NIM manifest AllProfiles = "all" + + // NIMCacheRole is the name of the role for all NIMCache instances in the namespace + NIMCacheRole = "nim-cache-role" + + // NIMCacheRoleBinding is the name of the rolebinding for all NIMCache instances in the namespace + NIMCacheRoleBinding = "nim-cache-rolebinding" + + // NIMCacheServiceAccount is the name of the serviceaccount for all NIMCache instances in the namespace + NIMCacheServiceAccount = "nim-cache-sa" ) // NIMCacheReconciler reconciles a NIMCache object @@ -87,6 +97,7 @@ func NewNIMCacheReconciler(client client.Client, scheme *runtime.Scheme, log log // +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimcaches,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimcaches/status,verbs=get;update;patch // +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimcaches/finalizers,verbs=update +// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=nonroot // +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;delete // +kubebuilder:rbac:groups="",resources=pods/log,verbs=get @@ -226,6 +237,182 @@ func (r *NIMCacheReconciler) cleanupNIMCache(ctx context.Context, nimCache *apps return nil } +func (r *NIMCacheReconciler) reconcileRole(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + roleName := NIMCacheRole + roleNamespacedName := types.NamespacedName{Name: roleName, Namespace: nimCache.GetNamespace()} + + // Desired Role configuration + desiredRole := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: roleName, + Namespace: nimCache.GetNamespace(), + Labels: map[string]string{ + "app": "k8s-nim-operator", + }, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{"security.openshift.io"}, + Resources: []string{"securitycontextconstraints"}, + ResourceNames: []string{"nonroot"}, + Verbs: []string{"use"}, + }, + }, + } + + // Check if the Role already exists + existingRole := &rbacv1.Role{} + err := r.Get(ctx, roleNamespacedName, existingRole) + if err != nil && client.IgnoreNotFound(err) != nil { + logger.Error(err, "Failed to get Role", "Name", roleName) + return err + } + + if err != nil { + // Role does not exist, create a new one + logger.Info("Creating a new Role", "Name", roleName) + + err = r.Create(ctx, desiredRole) + if err != nil { + logger.Error(err, "Failed to create Role", "Name", roleName) + return err + } + + logger.Info("Successfully created Role", "Name", roleName) + } else { + // Role exists, check if it needs to be updated + if !roleEqual(existingRole, desiredRole) { + logger.Info("Updating existing Role", "Name", roleName) + existingRole.Rules = desiredRole.Rules + + err = r.Update(ctx, existingRole) + if err != nil { + logger.Error(err, "Failed to update Role", "Name", roleName) + return err + } + + logger.Info("Successfully updated Role", "Name", roleName) + } + } + + return nil +} + +// Helper function to check if two Roles are equal +func roleEqual(existing, desired *rbacv1.Role) bool { + return utils.IsEqual(existing, desired, "Rules") +} + +func (r *NIMCacheReconciler) reconcileRoleBinding(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + rbName := NIMCacheRoleBinding + rbNamespacedName := types.NamespacedName{Name: rbName, Namespace: nimCache.GetNamespace()} + + // Desired RoleBinding configuration + desiredRB := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: rbName, + Namespace: nimCache.GetNamespace(), + Labels: map[string]string{ + "app": "k8s-nim-operator", + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "Role", + Name: NIMCacheRole, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: NIMCacheServiceAccount, + Namespace: nimCache.GetNamespace(), + }, + }, + } + + // Check if the RoleBinding already exists + existingRB := &rbacv1.RoleBinding{} + err := r.Get(ctx, rbNamespacedName, existingRB) + if err != nil && client.IgnoreNotFound(err) != nil { + logger.Error(err, "Failed to get RoleBinding", "Name", rbName) + return err + } + + if err != nil { + // RoleBinding does not exist, create a new one + logger.Info("Creating a new RoleBinding", "Name", rbName) + + err = r.Create(ctx, desiredRB) + if err != nil { + logger.Error(err, "Failed to create RoleBinding", "Name", rbName) + return err + } + + logger.Info("Successfully created RoleBinding", "Name", rbName) + } else { + // RoleBinding exists, check if it needs to be updated + if !roleBindingEqual(existingRB, desiredRB) { + logger.Info("Updating existing RoleBinding", "Name", rbName) + existingRB.RoleRef = desiredRB.RoleRef + existingRB.Subjects = desiredRB.Subjects + + err = r.Update(ctx, existingRB) + if err != nil { + logger.Error(err, "Failed to update RoleBinding", "Name", rbName) + return err + } + + logger.Info("Successfully updated RoleBinding", "Name", rbName) + } + } + + return nil +} + +// Helper function to check if two RoleBindings are equal +func roleBindingEqual(existing, desired *rbacv1.RoleBinding) bool { + return utils.IsEqual(existing, desired, "RoleRef", "Subjects") +} + +func (r *NIMCacheReconciler) reconcileServiceAccount(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { + logger := r.GetLogger() + saName := NIMCacheServiceAccount + saNamespacedName := types.NamespacedName{Name: saName, Namespace: nimCache.GetNamespace()} + + sa := &corev1.ServiceAccount{} + err := r.Get(ctx, saNamespacedName, sa) + if err != nil && client.IgnoreNotFound(err) != nil { + return err + } + + // If ServiceAccount does not exist, create a new one + if err != nil { + logger.Info("Creating a new ServiceAccount", "Name", saName) + + newSA := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: saName, + Namespace: nimCache.GetNamespace(), + Labels: map[string]string{"app": "k8s-nim-operator"}, + }, + } + + // Create the ServiceAccount + err = r.Create(ctx, newSA) + if err != nil { + logger.Error(err, "Failed to create ServiceAccount", "Name", saName) + return err + } + + logger.Info("Successfully created ServiceAccount", "Name", saName) + } + + // If the ServiceAccount already exists, no action is needed + return nil +} + func (r *NIMCacheReconciler) reconcilePVC(ctx context.Context, nimCache *appsv1alpha1.NIMCache) error { logger := r.GetLogger() pvcName := getPvcName(nimCache, nimCache.Spec.Storage.PVC) @@ -540,8 +727,29 @@ func (r *NIMCacheReconciler) createPod(ctx context.Context, pod *corev1.Pod) err func (r *NIMCacheReconciler) reconcileNIMCache(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (ctrl.Result, error) { logger := r.GetLogger() + // Reconcile ServiceAccount + err := r.reconcileServiceAccount(ctx, nimCache) + if err != nil { + logger.Error(err, "reconciliation of serviceaccount failed") + return ctrl.Result{}, err + } + + // Reconcile Role + err = r.reconcileRole(ctx, nimCache) + if err != nil { + logger.Error(err, "reconciliation of role failed") + return ctrl.Result{}, err + } + + // Reconcile RoleBinding + err = r.reconcileRoleBinding(ctx, nimCache) + if err != nil { + logger.Error(err, "reconciliation of rolebinding failed") + return ctrl.Result{}, err + } + // Reconcile PVC - err := r.reconcilePVC(ctx, nimCache) + err = r.reconcilePVC(ctx, nimCache) if err != nil { logger.Error(err, "reconciliation of pvc failed", "pvc", getPvcName(nimCache, nimCache.Spec.Storage.PVC)) return ctrl.Result{}, err @@ -597,7 +805,7 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod { } annotations := map[string]string{ - "openshift.io/scc": "anyuid", + "openshift.io/scc": "nonroot", } pod := &corev1.Pod{ @@ -629,6 +837,9 @@ func constructPodSpec(nimCache *appsv1alpha1.NIMCache) *corev1.Pod { FSGroup: nimCache.GetGroupID(), RunAsNonRoot: ptr.To[bool](true), }, + ServiceAccountName: nimCache.GetName(), + Tolerations: nimCache.GetTolerations(), + NodeSelector: nimCache.GetNodeSelectors(), }, } @@ -710,7 +921,10 @@ func constructJob(nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) { }, }, }, - ImagePullSecrets: []corev1.LocalObjectReference{}, + ImagePullSecrets: []corev1.LocalObjectReference{}, + ServiceAccountName: nimCache.GetName(), + Tolerations: nimCache.GetTolerations(), + NodeSelector: nimCache.GetNodeSelectors(), }, }, BackoffLimit: ptr.To[int32](5), // retry max 5 times on failure diff --git a/internal/controller/nimcache_controller_test.go b/internal/controller/nimcache_controller_test.go index 217b3100..d71c33ff 100644 --- a/internal/controller/nimcache_controller_test.go +++ b/internal/controller/nimcache_controller_test.go @@ -26,6 +26,7 @@ import ( . "github.com/onsi/gomega" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -48,6 +49,7 @@ var _ = Describe("NIMCache Controller", func() { Expect(appsv1alpha1.AddToScheme(scheme)).To(Succeed()) Expect(batchv1.AddToScheme(scheme)).To(Succeed()) Expect(corev1.AddToScheme(scheme)).To(Succeed()) + Expect(rbacv1.AddToScheme(scheme)).To(Succeed()) client = fake.NewClientBuilder().WithScheme(scheme). WithStatusSubresource(&appsv1alpha1.NIMCache{}). @@ -231,6 +233,71 @@ var _ = Describe("NIMCache Controller", func() { }) Context("when creating a NIMCache resource", func() { + It("should create a Role with SCC rules", func() { + ctx := context.TODO() + nimCache := &appsv1alpha1.NIMCache{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimcache", + Namespace: "default", + }, + Spec: appsv1alpha1.NIMCacheSpec{ + Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret"}}, + }, + } + + err := reconciler.reconcileRole(ctx, nimCache) + Expect(err).NotTo(HaveOccurred()) + + // Check if the Role was created + role := &rbacv1.Role{} + roleName := types.NamespacedName{Name: NIMCacheRole, Namespace: "default"} + + err = client.Get(ctx, roleName, role) + Expect(err).NotTo(HaveOccurred()) + Expect(role.Rules).To(HaveLen(1)) + + // Check the Role has the expected SCC rules + expectedRule := rbacv1.PolicyRule{ + APIGroups: []string{"security.openshift.io"}, + Resources: []string{"securitycontextconstraints"}, + ResourceNames: []string{"nonroot"}, + Verbs: []string{"use"}, + } + Expect(role.Rules[0]).To(Equal(expectedRule)) + }) + + It("should create a RoleBinding for the Role", func() { + ctx := context.TODO() + nimCache := &appsv1alpha1.NIMCache{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimcache", + Namespace: "default", + }, + Spec: appsv1alpha1.NIMCacheSpec{ + Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret"}}, + }, + } + + err := reconciler.reconcileRole(ctx, nimCache) + Expect(err).NotTo(HaveOccurred()) + + err = reconciler.reconcileRoleBinding(ctx, nimCache) + Expect(err).NotTo(HaveOccurred()) + + // Check if the RoleBinding was created + rb := &rbacv1.RoleBinding{} + rbName := types.NamespacedName{Name: NIMCacheRoleBinding, Namespace: "default"} + err = client.Get(ctx, rbName, rb) + Expect(err).NotTo(HaveOccurred()) + + // Check that the RoleBinding is bound to the correct Role + Expect(rb.RoleRef.Name).To(Equal(NIMCacheRole)) + Expect(rb.Subjects).To(HaveLen(1)) + Expect(rb.Subjects[0].Kind).To(Equal("ServiceAccount")) + Expect(rb.Subjects[0].Name).To(Equal(NIMCacheServiceAccount)) + Expect(rb.Subjects[0].Namespace).To(Equal(nimCache.GetNamespace())) + }) + It("should construct a pod with right specifications", func() { nimCache := &appsv1alpha1.NIMCache{ ObjectMeta: metav1.ObjectMeta{ diff --git a/internal/controller/nimservice_controller.go b/internal/controller/nimservice_controller.go index e9c6f2d5..160785a7 100644 --- a/internal/controller/nimservice_controller.go +++ b/internal/controller/nimservice_controller.go @@ -69,6 +69,7 @@ func NewNIMServiceReconciler(client client.Client, scheme *runtime.Scheme, updat // +kubebuilder:rbac:groups=apps.nvidia.com,resources=nimcaches,verbs=get;list;watch; // +kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions;proxies,verbs=get;list;watch // +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=use,resourceNames=nonroot // +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles;rolebindings,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=serviceaccounts;pods;pods/eviction;services;services/finalizers;endpoints,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="",resources=persistentvolumeclaims;configmaps;secrets,verbs=get;list;watch;create;update;patch;delete diff --git a/internal/render/types/types.go b/internal/render/types/types.go index f16d8db2..fd1823c3 100644 --- a/internal/render/types/types.go +++ b/internal/render/types/types.go @@ -20,6 +20,7 @@ import ( autoscalingv2 "k8s.io/api/autoscaling/v2" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" ) // DaemonsetParams holds the parameters for rendering a Daemonset template @@ -134,7 +135,7 @@ type ServiceAccountParams struct { type RoleParams struct { Name string Namespace string - Rules []string + Rules []rbacv1.PolicyRule } // RoleBindingParams holds the parameters for rendering a RoleBinding template diff --git a/internal/utils/utils.go b/internal/utils/utils.go index e0b18b37..2b1622eb 100644 --- a/internal/utils/utils.go +++ b/internal/utils/utils.go @@ -28,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/rand" + "reflect" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -227,3 +228,17 @@ func ContainsElement[T comparable](slice []T, element T) bool { } return false } + +// IsEqual compares two Kubernetes objects based on their relevant fields. +func IsEqual[T client.Object](existing, desired T, fieldsToCompare ...string) bool { + for _, field := range fieldsToCompare { + existingValue := reflect.ValueOf(existing).Elem().FieldByName(field) + desiredValue := reflect.ValueOf(desired).Elem().FieldByName(field) + + if !reflect.DeepEqual(existingValue.Interface(), desiredValue.Interface()) { + return false + } + } + + return true +} diff --git a/manifests/deployment.yaml b/manifests/deployment.yaml index 8c517b7e..5364d554 100644 --- a/manifests/deployment.yaml +++ b/manifests/deployment.yaml @@ -28,6 +28,7 @@ spec: labels: app: {{ .Name }} spec: + serviceAccountName: {{ .ServiceAccountName }} containers: - name: {{ .ContainerName }} image: {{ .Image }} diff --git a/manifests/role.yaml b/manifests/role.yaml index a06afd52..5e7a8a9b 100644 --- a/manifests/role.yaml +++ b/manifests/role.yaml @@ -4,6 +4,6 @@ metadata: name: {{ .Name }} namespace: {{ .Namespace }} rules: - {{- range .Rules }} - - {{ . | yaml | nindent 2 }} + {{- if .Rules }} + {{- .Rules | yaml | nindent 2 }} {{- end }} diff --git a/manifests/rolebinding.yaml b/manifests/rolebinding.yaml index 6ebe738c..ef11af60 100644 --- a/manifests/rolebinding.yaml +++ b/manifests/rolebinding.yaml @@ -10,4 +10,4 @@ roleRef: subjects: - kind: ServiceAccount name: {{ .ServiceAccountName }} - namespace: {{ .Namespace }} \ No newline at end of file + namespace: {{ .Namespace }}