Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add git model repo / git image puller logic for nim cache #23

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions api/v1alpha1/nimcache_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ type NIMSource struct {
// NGCSource represents models stored in NGC
NGC *NGCSource `json:"ngc,omitempty"`

// GITSource represents models stored in a git repository
GIT *GITSource `json:"git,omitempty"`

// DataStoreSource represents models stored in NVIDIA DataStore service
DataStore *DataStoreSource `json:"dataStore,omitempty"`
}
Expand All @@ -63,6 +66,20 @@ type NGCSource struct {
Model ModelSpec `json:"model,omitempty"`
}

// GITSource references a model stored in a git repository
type GITSource struct {
// AuthSecret is the name of an existing auth secret for the git repo
AuthSecret string `json:"authSecret"`
// ModelPuller is the container image that can pull the model from the git repo
ModelPuller string `json:"modelPuller"`
// PullSecret is the name of the image pull secret used to pull the model puller image
PullSecret string `json:"pullSecret,omitempty"`
// Model is the spec for caching; its profiles, when set, are the ones selected for download
Model ModelSpec `json:"model,omitempty"`
// Path is the git repo path, e.g. a repository URL such as "https://github.com/modelx"
Path string `json:"path"`
}

// ModelSpec is the spec required to cache selected models
type ModelSpec struct {
// Profiles are the specific model profiles to cache
Expand Down
73 changes: 72 additions & 1 deletion internal/controller/nimcache_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,10 @@ func isModelSelectionDone(nimCache *appsv1alpha1.NIMCache) bool {

func getSelectedProfiles(nimCache *appsv1alpha1.NIMCache) ([]string, error) {
// Return profiles explicitly specified by the user in the spec
if len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
if nimCache.Spec.Source.NGC != nil && len(nimCache.Spec.Source.NGC.Model.Profiles) > 0 {
return nimCache.Spec.Source.NGC.Model.Profiles, nil
} else if nimCache.Spec.Source.GIT != nil && len(nimCache.Spec.Source.GIT.Model.Profiles) > 0 {
return nimCache.Spec.Source.GIT.Model.Profiles, nil
} else if isModelSelectionRequired(nimCache) {
// Retrieve the selected profiles from the annotation
var selectedProfiles []string
Expand Down Expand Up @@ -820,6 +822,75 @@ func constructJob(nimCache *appsv1alpha1.NIMCache) (*batchv1.Job, error) {
job.Spec.Template.Spec.Containers[0].Args = []string{"--profiles"}
job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...)
}
} else if nimCache.Spec.Source.GIT != nil {
job.Spec.Template.Spec.Containers = []corev1.Container{
{
Name: "nim-cache",
Image: nimCache.Spec.Source.GIT.ModelPuller,
// TODO: finalize standard image / command line / config
// to download model from git
Command: []string{"tdb-git-download-to-cache"},
EnvFrom: nimCache.Spec.Source.EnvFromSecrets(),
Env: []corev1.EnvVar{
{
Name: "HF_HOME",
Value: "/model-store", // Need to be set to a writable directory by non-root user
},
{
Name: "NIM_CACHE_PATH", // Note: in the download mode, NIM_CACHE_PATH is not used
Value: "/model-store",
},
{
Name: "NGC_HOME", // Note: NGC_HOME is required and handled as NIM_CACHE_PATH in the download mode
Value: "/model-store",
},
},
VolumeMounts: []corev1.VolumeMount{
{
Name: "nim-cache-volume",
MountPath: "/model-store",
},
},
Resources: corev1.ResourceRequirements{
Limits: map[corev1.ResourceName]apiResource.Quantity{
"cpu": nimCache.Spec.Resources.CPU,
"memory": nimCache.Spec.Resources.Memory,
"nvidia.com/gpu": *apiResource.NewQuantity(int64(nimCache.Spec.Resources.GPUs), apiResource.DecimalExponent),
},
Requests: map[corev1.ResourceName]apiResource.Quantity{
"cpu": nimCache.Spec.Resources.CPU,
"memory": nimCache.Spec.Resources.Memory,
"nvidia.com/gpu": *apiResource.NewQuantity(int64(nimCache.Spec.Resources.GPUs), apiResource.DecimalExponent),
},
},
TerminationMessagePath: "/dev/termination-log",
TerminationMessagePolicy: corev1.TerminationMessageFallbackToLogsOnError,
SecurityContext: &corev1.SecurityContext{
AllowPrivilegeEscalation: ptr.To[bool](false),
Capabilities: &corev1.Capabilities{
Drop: []corev1.Capability{"ALL"},
},
RunAsNonRoot: ptr.To[bool](true),
RunAsGroup: ptr.To[int64](2000),
RunAsUser: ptr.To[int64](1000),
},
},
}
job.Spec.Template.Spec.ImagePullSecrets = []corev1.LocalObjectReference{
{
Name: nimCache.Spec.Source.GIT.PullSecret,
},
}
// Pass specific profiles to download based on user selection or auto-selection
// TODO: See if the logic applies to git model repo
selectedProfiles, err := getSelectedProfiles(nimCache)
if err != nil {
return nil, err
}
if selectedProfiles != nil {
job.Spec.Template.Spec.Containers[0].Args = []string{"--profiles"}
job.Spec.Template.Spec.Containers[0].Args = append(job.Spec.Template.Spec.Containers[0].Args, selectedProfiles...)
}
}
return job, nil
}
Expand Down
22 changes: 22 additions & 0 deletions internal/controller/nimcache_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,28 @@ var _ = Describe("NIMCache Controller", func() {
jobName := types.NamespacedName{Name: getJobName(nimCache), Namespace: "default"}
return client.Get(ctx, jobName, job)
}, time.Second*10).Should(Succeed())

nimCache.Spec.Source.NGC = nil
nimCache.Spec.Source.GIT = &appsv1alpha1.GITSource{
ModelPuller: "nvcr.io/nim:test-git-puller",
PullSecret: "my-secret",
Path: "https://github.com/modelx",
}

err = client.Delete(context.TODO(), job)
Expect(err).ToNot(HaveOccurred())

job, err = constructJob(nimCache)
Expect(err).ToNot(HaveOccurred())

err = client.Create(context.TODO(), job)
Expect(err).ToNot(HaveOccurred())

Eventually(func() error {
job := &batchv1.Job{}
jobName := types.NamespacedName{Name: getJobName(nimCache), Namespace: "default"}
return client.Get(ctx, jobName, job)
}, time.Second*10).Should(Succeed())
})

It("should create a ConfigMap with the given model manifest data", func() {
Expand Down