diff --git a/api/apps/v1alpha1/common_types.go b/api/apps/v1alpha1/common_types.go index 859536e2..ca7219da 100644 --- a/api/apps/v1alpha1/common_types.go +++ b/api/apps/v1alpha1/common_types.go @@ -58,6 +58,7 @@ type Autoscaling struct { HPA HorizontalPodAutoscalerSpec `json:"hpa,omitempty"` } +// HorizontalPodAutoscalerSpec defines the parameters required to setup HPA type HorizontalPodAutoscalerSpec struct { MinReplicas *int32 `json:"minReplicas,omitempty"` MaxReplicas int32 `json:"maxReplicas"` @@ -93,7 +94,16 @@ type IngressPath struct { ServiceType string `json:"serviceType,omitempty"` } +// Probe defines attributes for startup/liveness/readiness probes type Probe struct { Enabled *bool `json:"enabled,omitempty"` Probe *corev1.Probe `json:"probe,omitempty"` } + +// CertConfig defines the configuration for custom certificates. +type CertConfig struct { + // Name of the ConfigMap containing the certificate data. + Name string `json:"name"` + // MountPath is the path where the certificates should be mounted in the container. + MountPath string `json:"mountPath"` +} diff --git a/api/apps/v1alpha1/nimcache_types.go b/api/apps/v1alpha1/nimcache_types.go index 462d4fd1..4098c2de 100644 --- a/api/apps/v1alpha1/nimcache_types.go +++ b/api/apps/v1alpha1/nimcache_types.go @@ -41,8 +41,13 @@ type NIMCacheSpec struct { Tolerations []corev1.Toleration `json:"tolerations,omitempty"` // NodeSelectors are the node selector labels to schedule the caching job. NodeSelectors map[string]string `json:"gpuSelectors,omitempty"` - UserID *int64 `json:"userID,omitempty"` - GroupID *int64 `json:"groupID,omitempty"` + // UserID is the user ID for the caching job + UserID *int64 `json:"userID,omitempty"` + // GroupID is the group ID for the caching job + GroupID *int64 `json:"groupID,omitempty"` + // CertConfig is the name of the ConfigMap containing the custom certificates. + // for secure communication. 
+ CertConfig *CertConfig `json:"certConfig,omitempty"` } // NIMSource defines the source for caching NIM model diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go index ee653177..f184ff7b 100644 --- a/api/apps/v1alpha1/zz_generated.deepcopy.go +++ b/api/apps/v1alpha1/zz_generated.deepcopy.go @@ -48,6 +48,21 @@ func (in *Autoscaling) DeepCopy() *Autoscaling { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CertConfig) DeepCopyInto(out *CertConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CertConfig. +func (in *CertConfig) DeepCopy() *CertConfig { + if in == nil { + return nil + } + out := new(CertConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DataStoreSource) DeepCopyInto(out *DataStoreSource) { *out = *in @@ -381,6 +396,11 @@ func (in *NIMCacheSpec) DeepCopyInto(out *NIMCacheSpec) { *out = new(int64) **out = **in } + if in.CertConfig != nil { + in, out := &in.CertConfig, &out.CertConfig + *out = new(CertConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMCacheSpec. diff --git a/bundle/manifests/apps.nvidia.com_nimcaches.yaml b/bundle/manifests/apps.nvidia.com_nimcaches.yaml index a1fcf553..a06c7e30 100644 --- a/bundle/manifests/apps.nvidia.com_nimcaches.yaml +++ b/bundle/manifests/apps.nvidia.com_nimcaches.yaml @@ -49,6 +49,23 @@ spec: spec: description: NIMCacheSpec defines the desired state of NIMCache properties: + certConfig: + description: |- + CertConfig is the name of the ConfigMap containing the custom certificates. + for secure communication. 
+ properties: + mountPath: + description: MountPath is the path where the certificates should + be mounted in the container. + type: string + name: + description: Name of the ConfigMap containing the certificate + data. + type: string + required: + - mountPath + - name + type: object gpuSelectors: additionalProperties: type: string @@ -56,6 +73,7 @@ spec: the caching job. type: object groupID: + description: GroupID is the group ID for the caching job format: int64 type: integer resources: @@ -259,6 +277,7 @@ spec: type: object type: array userID: + description: UserID is the user ID for the caching job format: int64 type: integer required: diff --git a/bundle/manifests/apps.nvidia.com_nimpipelines.yaml b/bundle/manifests/apps.nvidia.com_nimpipelines.yaml index ff1b6e05..a2aa7a37 100644 --- a/bundle/manifests/apps.nvidia.com_nimpipelines.yaml +++ b/bundle/manifests/apps.nvidia.com_nimpipelines.yaml @@ -552,6 +552,8 @@ spec: type: string type: object livenessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1108,6 +1110,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1335,6 +1339,8 @@ spec: enabled: type: boolean hpa: + description: HorizontalPodAutoscalerSpec defines the + parameters required to setup HPA properties: behavior: description: |- @@ -1947,6 +1953,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/bundle/manifests/apps.nvidia.com_nimservices.yaml b/bundle/manifests/apps.nvidia.com_nimservices.yaml index 5d503f75..ad864295 100644 --- a/bundle/manifests/apps.nvidia.com_nimservices.yaml +++ b/bundle/manifests/apps.nvidia.com_nimservices.yaml @@ -502,6 +502,8 @@ spec: type: string type: object livenessProbe: + 
description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1050,6 +1052,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1273,6 +1277,8 @@ spec: enabled: type: boolean hpa: + description: HorizontalPodAutoscalerSpec defines the parameters + required to setup HPA properties: behavior: description: |- @@ -1867,6 +1873,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/config/crd/bases/apps.nvidia.com_nimcaches.yaml b/config/crd/bases/apps.nvidia.com_nimcaches.yaml index a1fcf553..a06c7e30 100644 --- a/config/crd/bases/apps.nvidia.com_nimcaches.yaml +++ b/config/crd/bases/apps.nvidia.com_nimcaches.yaml @@ -49,6 +49,23 @@ spec: spec: description: NIMCacheSpec defines the desired state of NIMCache properties: + certConfig: + description: |- + CertConfig is the name of the ConfigMap containing the custom certificates. + for secure communication. + properties: + mountPath: + description: MountPath is the path where the certificates should + be mounted in the container. + type: string + name: + description: Name of the ConfigMap containing the certificate + data. + type: string + required: + - mountPath + - name + type: object gpuSelectors: additionalProperties: type: string @@ -56,6 +73,7 @@ spec: the caching job. 
type: object groupID: + description: GroupID is the group ID for the caching job format: int64 type: integer resources: @@ -259,6 +277,7 @@ spec: type: object type: array userID: + description: UserID is the user ID for the caching job format: int64 type: integer required: diff --git a/config/crd/bases/apps.nvidia.com_nimpipelines.yaml b/config/crd/bases/apps.nvidia.com_nimpipelines.yaml index ff1b6e05..a2aa7a37 100644 --- a/config/crd/bases/apps.nvidia.com_nimpipelines.yaml +++ b/config/crd/bases/apps.nvidia.com_nimpipelines.yaml @@ -552,6 +552,8 @@ spec: type: string type: object livenessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1108,6 +1110,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1335,6 +1339,8 @@ spec: enabled: type: boolean hpa: + description: HorizontalPodAutoscalerSpec defines the + parameters required to setup HPA properties: behavior: description: |- @@ -1947,6 +1953,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/config/crd/bases/apps.nvidia.com_nimservices.yaml b/config/crd/bases/apps.nvidia.com_nimservices.yaml index 5d503f75..ad864295 100644 --- a/config/crd/bases/apps.nvidia.com_nimservices.yaml +++ b/config/crd/bases/apps.nvidia.com_nimservices.yaml @@ -502,6 +502,8 @@ spec: type: string type: object livenessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1050,6 +1052,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1273,6 +1277,8 @@ spec: enabled: type: boolean hpa: + 
description: HorizontalPodAutoscalerSpec defines the parameters + required to setup HPA properties: behavior: description: |- @@ -1867,6 +1873,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml index a1fcf553..a06c7e30 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimcaches.yaml @@ -49,6 +49,23 @@ spec: spec: description: NIMCacheSpec defines the desired state of NIMCache properties: + certConfig: + description: |- + CertConfig is the name of the ConfigMap containing the custom certificates. + for secure communication. + properties: + mountPath: + description: MountPath is the path where the certificates should + be mounted in the container. + type: string + name: + description: Name of the ConfigMap containing the certificate + data. + type: string + required: + - mountPath + - name + type: object gpuSelectors: additionalProperties: type: string @@ -56,6 +73,7 @@ spec: the caching job. 
type: object groupID: + description: GroupID is the group ID for the caching job format: int64 type: integer resources: @@ -259,6 +277,7 @@ spec: type: object type: array userID: + description: UserID is the user ID for the caching job format: int64 type: integer required: diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml index ff1b6e05..a2aa7a37 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml @@ -552,6 +552,8 @@ spec: type: string type: object livenessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1108,6 +1110,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1335,6 +1339,8 @@ spec: enabled: type: boolean hpa: + description: HorizontalPodAutoscalerSpec defines the + parameters required to setup HPA properties: behavior: description: |- @@ -1947,6 +1953,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml index 5d503f75..ad864295 100644 --- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml +++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml @@ -502,6 +502,8 @@ spec: type: string type: object livenessProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1050,6 +1052,8 @@ spec: x-kubernetes-list-type: atomic type: object readinessProbe: + 
description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean @@ -1273,6 +1277,8 @@ spec: enabled: type: boolean hpa: + description: HorizontalPodAutoscalerSpec defines the parameters + required to setup HPA properties: behavior: description: |- @@ -1867,6 +1873,8 @@ spec: type: object type: object startupProbe: + description: Probe defines attributes for startup/liveness/readiness + probes properties: enabled: type: boolean diff --git a/internal/controller/nimcache_controller.go b/internal/controller/nimcache_controller.go index 970ec90d..8bcf2c7a 100644 --- a/internal/controller/nimcache_controller.go +++ b/internal/controller/nimcache_controller.go @@ -652,7 +652,7 @@ func (r *NIMCacheReconciler) reconcileJob(ctx context.Context, nimCache *appsv1a // If Job does not exist and caching is not complete, create a new one if err != nil && nimCache.Status.State != appsv1alpha1.NimCacheStatusReady { - job, err := constructJob(nimCache) + job, err := r.constructJob(ctx, nimCache) if err != nil { logger.Error(err, "Failed to construct job") return err @@ -936,7 +936,8 @@ func (r *NIMCacheReconciler) getPodLogs(ctx context.Context, pod *corev1.Pod) (s return buf.String(), nil } -func constructJob(nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) { +func (r *NIMCacheReconciler) constructJob(ctx context.Context, nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) { + logger := r.GetLogger() pvcName := getPvcName(nimCache, nimCache.Spec.Storage.PVC) labels := map[string]string{ "app": "k8s-nim-operator", @@ -1090,6 +1091,7 @@ func constructJob(nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) { // Pass specific profiles to download based on user selection or auto-selection selectedProfiles, err := getSelectedProfiles(nimCache) if err != nil { + logger.Error(err, "failed to get selected profiles for caching") return nil, err } @@ -1105,6 +1107,40 @@ func constructJob(nimCache *appsv1alpha1.NIMCache) 
(*batchv1.Job, error) { job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...) } } + + // Inject custom CA certificates when running in a proxy environment + if nimCache.Spec.CertConfig != nil { + certConfig, err := r.getConfigMap(ctx, nimCache.Spec.CertConfig.Name, nimCache.Namespace) + if err != nil { + logger.Error(err, "Failed to get configmap for custom certificates") + return nil, err + } + + // Prepare the volume that references the ConfigMap + volume := corev1.Volume{ + Name: "cert-volume", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: nimCache.Spec.CertConfig.Name, + }, + }, + }, + } + + // Create individual volume mounts for each key in the ConfigMap + volumeMounts := []corev1.VolumeMount{} + for key := range certConfig.Data { + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: "cert-volume", + MountPath: fmt.Sprintf("%s/%s", nimCache.Spec.CertConfig.MountPath, key), + SubPath: key, + }) + } + + job.Spec.Template.Spec.Volumes = append(job.Spec.Template.Spec.Volumes, volume) + job.Spec.Template.Spec.Containers[0].VolumeMounts = append(job.Spec.Template.Spec.Containers[0].VolumeMounts, volumeMounts...) 
+ } } return job, nil } diff --git a/internal/controller/nimcache_controller_test.go b/internal/controller/nimcache_controller_test.go index 742bcd96..b7d4e05c 100644 --- a/internal/controller/nimcache_controller_test.go +++ b/internal/controller/nimcache_controller_test.go @@ -385,7 +385,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := constructJob(nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -416,7 +416,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := constructJob(nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -443,7 +443,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := constructJob(nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache) Expect(err).ToNot(HaveOccurred()) Expect(job.Name).To(Equal(getJobName(nimCache))) @@ -469,7 +469,7 @@ var _ = Describe("NIMCache Controller", func() { }, } - job, err := constructJob(nimCache) + job, err := reconciler.constructJob(context.TODO(), nimCache) Expect(err).ToNot(HaveOccurred()) err = cli.Create(context.TODO(), job) @@ -482,6 +482,80 @@ var _ = Describe("NIMCache Controller", func() { }, time.Second*10).Should(Succeed()) }) + It("should create a job with the right custom CA certificate volumes", func() { + ctx := context.TODO() + profiles := []string{AllProfiles} + nimCache := &appsv1alpha1.NIMCache{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-nimcache", + Namespace: "default", + }, + Spec: appsv1alpha1.NIMCacheSpec{ + Source: appsv1alpha1.NIMSource{NGC: &appsv1alpha1.NGCSource{ModelPuller: "nvcr.io/nim:test", PullSecret: "my-secret", Model: appsv1alpha1.ModelSpec{Profiles: profiles}}}, + CertConfig: &appsv1alpha1.CertConfig{ + Name: "custom-ca-configmap", + MountPath: "/usr/share/ssl/certs", + }, + }, 
+ } + + // Create a sample ConfigMap with certificate files + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "custom-ca-configmap", + Namespace: "default", + }, + Data: map[string]string{ + "custom-ca-cert.pem": "fake-cert-data", + "another-cert.pem": "fake-cert-data-2", + }, + } + + err := reconciler.Create(context.TODO(), configMap) + Expect(err).ToNot(HaveOccurred()) + + job, err := reconciler.constructJob(context.TODO(), nimCache) + Expect(err).ToNot(HaveOccurred()) + + err = cli.Create(context.TODO(), job) + Expect(err).ToNot(HaveOccurred()) + + job = &batchv1.Job{} + jobName := types.NamespacedName{Name: getJobName(nimCache), Namespace: "default"} + err = cli.Get(ctx, jobName, job) + Expect(err).ToNot(HaveOccurred()) + + // Verify CertConfig volume and mounts + Expect(job.Spec.Template.Spec.Volumes).To(ContainElement( + corev1.Volume{ + Name: "cert-volume", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "custom-ca-configmap", + }, + }, + }, + }, + )) + + Expect(job.Spec.Template.Spec.Containers[0].VolumeMounts).To(ContainElement( + corev1.VolumeMount{ + Name: "cert-volume", + MountPath: "/usr/share/ssl/certs/custom-ca-cert.pem", + SubPath: "custom-ca-cert.pem", + }, + )) + + Expect(job.Spec.Template.Spec.Containers[0].VolumeMounts).To(ContainElement( + corev1.VolumeMount{ + Name: "cert-volume", + MountPath: "/usr/share/ssl/certs/another-cert.pem", + SubPath: "another-cert.pem", + }, + )) + }) + It("should create a ConfigMap with the given model manifest data", func() { ctx := context.TODO() nimCache := &appsv1alpha1.NIMCache{