From 5c0b946ac19d783568996df6cc917cf135cb6c10 Mon Sep 17 00:00:00 2001
From: Vishesh Tanksale
Date: Thu, 17 Oct 2024 19:56:53 +0000
Subject: [PATCH] Updating NIMService CRD for multinode inferencing

Signed-off-by: Vishesh Tanksale
---
Review notes (commentary placed after the "---" separator; it is not part of
the commit message and contains no diff content):

- This copy of the patch arrived with its line breaks and indentation
  collapsed. The line structure below has been restored, but the exact
  leading whitespace and column alignment (especially of unchanged context
  lines) is a best-effort reconstruction. TODO: regenerate the patch with
  `git format-patch` from the original commit before applying it.
- api/apps/v1alpha1/nimservice_types.go adds the MultiNodeSpec and
  LeaderWorkerSet types and a new NIMServiceSpec.MultiNode field. The
  Replicas/UserID/GroupID lines are removed and re-added with the same
  tokens, i.e. they differ only in whitespace.
- api/apps/v1alpha1/zz_generated.deepcopy.go adds DeepCopyInto/DeepCopy for
  both new types and a MultiNode.DeepCopyInto call inside
  NIMServiceSpec.DeepCopyInto.
- The same 20-line `multiNode` schema is added to the nimpipelines and
  nimservices CRDs in each of bundle/manifests, config/crd/bases and
  deployments/helm/k8s-nim-operator/crds.
- NOTE(review): the new struct fields have no doc comments, so in the CRD
  schema only `multiNode` and `leaderWorkerSet` carry a description;
  clusterStartTimeout, enabled, gpusPerNode and workers have none.
- NOTE(review): unlike Replicas, the new integer fields have no kubebuilder
  validation or default markers and are emitted as a bare `type: integer`.
  Confirm whether bounds or defaults are intended.
- NOTE(review): the unit of clusterStartTimeout is not stated anywhere in
  this patch -- confirm and document it.
- NOTE(review): both MultiNodeSpec and LeaderWorkerSet have their own
  `enabled` flag; how the two interact is not shown here.

 api/apps/v1alpha1/nimservice_types.go         | 21 ++++++++--
 api/apps/v1alpha1/zz_generated.deepcopy.go    | 42 +++++++++++++++++++
 .../apps.nvidia.com_nimpipelines.yaml         | 20 +++++++++
 .../apps.nvidia.com_nimservices.yaml          | 20 +++++++++
 .../bases/apps.nvidia.com_nimpipelines.yaml   | 20 +++++++++
 .../bases/apps.nvidia.com_nimservices.yaml    | 20 +++++++++
 .../crds/apps.nvidia.com_nimpipelines.yaml    | 20 +++++++++
 .../crds/apps.nvidia.com_nimservices.yaml     | 20 +++++++++
 8 files changed, 180 insertions(+), 3 deletions(-)

diff --git a/api/apps/v1alpha1/nimservice_types.go b/api/apps/v1alpha1/nimservice_types.go
index eb7f9017..7b51f559 100644
--- a/api/apps/v1alpha1/nimservice_types.go
+++ b/api/apps/v1alpha1/nimservice_types.go
@@ -51,6 +51,20 @@ const (
 	NIMServiceStatusFailed = "Failed"
 )
 
+// MultiNodeSpec defines the parameters to render a multi-node NIMService deployment.
+type MultiNodeSpec struct {
+	ClusterStartTimeout int             `json:"clusterStartTimeout,omitempty"`
+	Enabled             *bool           `json:"enabled,omitempty"`
+	Workers             int             `json:"workers,omitempty"`
+	GpusPerNode         int             `json:"gpusPerNode,omitempty"`
+	LeaderWorkerSet     LeaderWorkerSet `json:"leaderWorkerSet,omitempty"`
+}
+
+// LeaderWorkerSet defines the details of leader-worker set CRD for the NIMService deployment.
+type LeaderWorkerSet struct {
+	Enabled *bool `json:"enabled,omitempty"`
+}
+
 // NIMServiceSpec defines the desired state of NIMService
 type NIMServiceSpec struct {
 	Image   Image           `json:"image,omitempty"`
@@ -75,9 +89,10 @@ type NIMServiceSpec struct {
 	Metrics        Metrics                      `json:"metrics,omitempty"`
 	// +kubebuilder:validation:Minimum=1
 	// +kubebuilder:default:=1
-	Replicas int    `json:"replicas,omitempty"`
-	UserID   *int64 `json:"userID,omitempty"`
-	GroupID  *int64 `json:"groupID,omitempty"`
+	Replicas  int           `json:"replicas,omitempty"`
+	UserID    *int64        `json:"userID,omitempty"`
+	GroupID   *int64        `json:"groupID,omitempty"`
+	MultiNode MultiNodeSpec `json:"multiNode,omitempty"`
 }
 
 // NIMCacheVolSpec defines the spec to use NIMCache volume
diff --git a/api/apps/v1alpha1/zz_generated.deepcopy.go b/api/apps/v1alpha1/zz_generated.deepcopy.go
index dd005f75..8a6e6fdd 100644
--- a/api/apps/v1alpha1/zz_generated.deepcopy.go
+++ b/api/apps/v1alpha1/zz_generated.deepcopy.go
@@ -272,6 +272,26 @@ func (in *IngressPath) DeepCopy() *IngressPath {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *LeaderWorkerSet) DeepCopyInto(out *LeaderWorkerSet) {
+	*out = *in
+	if in.Enabled != nil {
+		in, out := &in.Enabled, &out.Enabled
+		*out = new(bool)
+		**out = **in
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LeaderWorkerSet.
+func (in *LeaderWorkerSet) DeepCopy() *LeaderWorkerSet {
+	if in == nil {
+		return nil
+	}
+	out := new(LeaderWorkerSet)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Metrics) DeepCopyInto(out *Metrics) {
 	*out = *in
@@ -325,6 +345,27 @@ func (in *ModelSpec) DeepCopy() *ModelSpec {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *MultiNodeSpec) DeepCopyInto(out *MultiNodeSpec) {
+	*out = *in
+	if in.Enabled != nil {
+		in, out := &in.Enabled, &out.Enabled
+		*out = new(bool)
+		**out = **in
+	}
+	in.LeaderWorkerSet.DeepCopyInto(&out.LeaderWorkerSet)
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MultiNodeSpec.
+func (in *MultiNodeSpec) DeepCopy() *MultiNodeSpec {
+	if in == nil {
+		return nil
+	}
+	out := new(MultiNodeSpec)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *NGCSource) DeepCopyInto(out *NGCSource) {
 	*out = *in
@@ -805,6 +846,7 @@ func (in *NIMServiceSpec) DeepCopyInto(out *NIMServiceSpec) {
 		*out = new(int64)
 		**out = **in
 	}
+	in.MultiNode.DeepCopyInto(&out.MultiNode)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NIMServiceSpec.
diff --git a/bundle/manifests/apps.nvidia.com_nimpipelines.yaml b/bundle/manifests/apps.nvidia.com_nimpipelines.yaml
index 76fb5b49..afaf9a55 100644
--- a/bundle/manifests/apps.nvidia.com_nimpipelines.yaml
+++ b/bundle/manifests/apps.nvidia.com_nimpipelines.yaml
@@ -755,6 +755,26 @@ spec:
                                   type: string
                               type: object
                           type: object
+                        multiNode:
+                          description: MultiNodeSpec defines the parameters to render
+                            a multi-node NIMService deployment.
+                          properties:
+                            clusterStartTimeout:
+                              type: integer
+                            enabled:
+                              type: boolean
+                            gpusPerNode:
+                              type: integer
+                            leaderWorkerSet:
+                              description: LeaderWorkerSet defines the details of
+                                leader-worker set CRD for the NIMService deployment.
+                              properties:
+                                enabled:
+                                  type: boolean
+                              type: object
+                            workers:
+                              type: integer
+                          type: object
                         nodeSelector:
                           additionalProperties:
                             type: string
diff --git a/bundle/manifests/apps.nvidia.com_nimservices.yaml b/bundle/manifests/apps.nvidia.com_nimservices.yaml
index 78643707..5af657c8 100644
--- a/bundle/manifests/apps.nvidia.com_nimservices.yaml
+++ b/bundle/manifests/apps.nvidia.com_nimservices.yaml
@@ -702,6 +702,26 @@ spec:
                         type: string
                     type: object
                 type: object
+              multiNode:
+                description: MultiNodeSpec defines the parameters to render a multi-node
+                  NIMService deployment.
+                properties:
+                  clusterStartTimeout:
+                    type: integer
+                  enabled:
+                    type: boolean
+                  gpusPerNode:
+                    type: integer
+                  leaderWorkerSet:
+                    description: LeaderWorkerSet defines the details of leader-worker
+                      set CRD for the NIMService deployment.
+                    properties:
+                      enabled:
+                        type: boolean
+                    type: object
+                  workers:
+                    type: integer
+                type: object
               nodeSelector:
                 additionalProperties:
                   type: string
diff --git a/config/crd/bases/apps.nvidia.com_nimpipelines.yaml b/config/crd/bases/apps.nvidia.com_nimpipelines.yaml
index 76fb5b49..afaf9a55 100644
--- a/config/crd/bases/apps.nvidia.com_nimpipelines.yaml
+++ b/config/crd/bases/apps.nvidia.com_nimpipelines.yaml
@@ -755,6 +755,26 @@ spec:
                                   type: string
                               type: object
                           type: object
+                        multiNode:
+                          description: MultiNodeSpec defines the parameters to render
+                            a multi-node NIMService deployment.
+                          properties:
+                            clusterStartTimeout:
+                              type: integer
+                            enabled:
+                              type: boolean
+                            gpusPerNode:
+                              type: integer
+                            leaderWorkerSet:
+                              description: LeaderWorkerSet defines the details of
+                                leader-worker set CRD for the NIMService deployment.
+                              properties:
+                                enabled:
+                                  type: boolean
+                              type: object
+                            workers:
+                              type: integer
+                          type: object
                         nodeSelector:
                           additionalProperties:
                             type: string
diff --git a/config/crd/bases/apps.nvidia.com_nimservices.yaml b/config/crd/bases/apps.nvidia.com_nimservices.yaml
index 78643707..5af657c8 100644
--- a/config/crd/bases/apps.nvidia.com_nimservices.yaml
+++ b/config/crd/bases/apps.nvidia.com_nimservices.yaml
@@ -702,6 +702,26 @@ spec:
                         type: string
                     type: object
                 type: object
+              multiNode:
+                description: MultiNodeSpec defines the parameters to render a multi-node
+                  NIMService deployment.
+                properties:
+                  clusterStartTimeout:
+                    type: integer
+                  enabled:
+                    type: boolean
+                  gpusPerNode:
+                    type: integer
+                  leaderWorkerSet:
+                    description: LeaderWorkerSet defines the details of leader-worker
+                      set CRD for the NIMService deployment.
+                    properties:
+                      enabled:
+                        type: boolean
+                    type: object
+                  workers:
+                    type: integer
+                type: object
               nodeSelector:
                 additionalProperties:
                   type: string
diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml
index 76fb5b49..afaf9a55 100644
--- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml
+++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimpipelines.yaml
@@ -755,6 +755,26 @@ spec:
                                   type: string
                               type: object
                           type: object
+                        multiNode:
+                          description: MultiNodeSpec defines the parameters to render
+                            a multi-node NIMService deployment.
+                          properties:
+                            clusterStartTimeout:
+                              type: integer
+                            enabled:
+                              type: boolean
+                            gpusPerNode:
+                              type: integer
+                            leaderWorkerSet:
+                              description: LeaderWorkerSet defines the details of
+                                leader-worker set CRD for the NIMService deployment.
+                              properties:
+                                enabled:
+                                  type: boolean
+                              type: object
+                            workers:
+                              type: integer
+                          type: object
                         nodeSelector:
                           additionalProperties:
                             type: string
diff --git a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml
index 78643707..5af657c8 100644
--- a/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml
+++ b/deployments/helm/k8s-nim-operator/crds/apps.nvidia.com_nimservices.yaml
@@ -702,6 +702,26 @@ spec:
                         type: string
                     type: object
                 type: object
+              multiNode:
+                description: MultiNodeSpec defines the parameters to render a multi-node
+                  NIMService deployment.
+                properties:
+                  clusterStartTimeout:
+                    type: integer
+                  enabled:
+                    type: boolean
+                  gpusPerNode:
+                    type: integer
+                  leaderWorkerSet:
+                    description: LeaderWorkerSet defines the details of leader-worker
+                      set CRD for the NIMService deployment.
+                    properties:
+                      enabled:
+                        type: boolean
+                    type: object
+                  workers:
+                    type: integer
+                type: object
               nodeSelector:
                 additionalProperties:
                   type: string