From 7263d26817e5d16c30775245b97690ceac73219e Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 30 Oct 2024 15:05:28 +0100 Subject: [PATCH 1/2] Add feature gate to require NVIDIA kernel modules This change adds an opt-in feature to the NVIDIA Container Runtime that only uses the NVIDIA runtime if the NVIDIA kernel modules are loaded. Signed-off-by: Evan Lezar --- internal/config/features.go | 3 +++ internal/runtime/runtime_factory.go | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/internal/config/features.go b/internal/config/features.go index 80d3c95ae..d5ce22dfd 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -21,6 +21,9 @@ type features struct { // DisableImexChannelCreation ensures that the implicit creation of // requested IMEX channels is skipped when invoking the nvidia-container-cli. DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"` + // RequireNvidiaKernelModules indicates that the NVIDIA kernel module must be + // loaded for the NVIDIA Container Runtime to perform any OCI spec modifications. 
+ RequireNvidiaKernelModules *feature `toml:"require-nvidia-kernel-modules,omitempty"` } //nolint:unused diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go index 50c19a4f9..436db3fda 100644 --- a/internal/runtime/runtime_factory.go +++ b/internal/runtime/runtime_factory.go @@ -18,6 +18,7 @@ package runtime import ( "fmt" + "os" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -41,6 +42,11 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv return lowLevelRuntime, nil } + if cfg.Features.RequireNvidiaKernelModules.IsEnabled() && !isNvidiaModuleLoaded() { + logger.Tracef("NVIDIA driver modules are not yet loaded; skipping modifer") + return lowLevelRuntime, nil + } + ociSpec, err := oci.NewSpec(logger, argv) if err != nil { return nil, fmt.Errorf("error constructing OCI specification: %v", err) @@ -62,6 +68,19 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv return r, nil } +// isNvidiaModuleLoaded reports whether the NVIDIA kernel module is loaded by +// checking that /proc/driver/nvidia/version can be stat'ed. +func isNvidiaModuleLoaded() bool { + // TODO: This was implemented as: + // cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1 + // if [ "${?}" != "0" ]; then + // echo "nvidia driver modules are not yet loaded, invoking runc directly" + // exec runc "$@" + // fi + _, err := os.Stat("/proc/driver/nvidia/version") + return err == nil +} + // newSpecModifier is a factory method that creates constructs an OCI spec modifer based on the provided config. 
func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, driver *root.Driver) (oci.SpecModifier, error) { rawSpec, err := ociSpec.Load() From ac61306900e24a9905e63da896d11adc991b1e3a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 30 Oct 2024 15:13:34 +0100 Subject: [PATCH 2/2] Use require-nvidia-kernel-modules feature for toolkit installation Signed-off-by: Evan Lezar --- tools/container/toolkit/executable.go | 6 ------ tools/container/toolkit/executable_test.go | 17 ----------------- tools/container/toolkit/runtime.go | 17 +++-------------- tools/container/toolkit/runtime_test.go | 7 ------- tools/container/toolkit/toolkit.go | 2 ++ 5 files changed, 5 insertions(+), 44 deletions(-) diff --git a/tools/container/toolkit/executable.go b/tools/container/toolkit/executable.go index 394ca0076..7fb879b70 100644 --- a/tools/container/toolkit/executable.go +++ b/tools/container/toolkit/executable.go @@ -36,7 +36,6 @@ type executable struct { source string target executableTarget env map[string]string - preLines []string argLines []string } @@ -96,11 +95,6 @@ func (e executable) writeWrapperTo(wrapper io.Writer, destFolder string, dotfile // Add the shebang fmt.Fprintln(wrapper, "#! 
/bin/sh") - // Add the preceding lines if any - for _, line := range e.preLines { - fmt.Fprintf(wrapper, "%s\n", r.apply(line)) - } - // Update the path to include the destination folder var env map[string]string if e.env == nil { diff --git a/tools/container/toolkit/executable_test.go b/tools/container/toolkit/executable_test.go index 8cb47596c..5b3249ea7 100644 --- a/tools/container/toolkit/executable_test.go +++ b/tools/container/toolkit/executable_test.go @@ -59,23 +59,6 @@ func TestWrapper(t *testing.T) { "", }, }, - { - e: executable{ - preLines: []string{ - "preline1", - "preline2", - }, - }, - expectedLines: []string{ - shebang, - "preline1", - "preline2", - "PATH=/dest/folder:$PATH \\", - "source.real \\", - "\t\"$@\"", - "", - }, - }, { e: executable{ argLines: []string{ diff --git a/tools/container/toolkit/runtime.go b/tools/container/toolkit/runtime.go index bdfca9834..1f3ce31bc 100644 --- a/tools/container/toolkit/runtime.go +++ b/tools/container/toolkit/runtime.go @@ -57,16 +57,6 @@ func newNvidiaContainerRuntimeInstaller(source string) *executable { } func newRuntimeInstaller(source string, target executableTarget, env map[string]string) *executable { - preLines := []string{ - "", - "cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1", - "if [ \"${?}\" != \"0\" ]; then", - " echo \"nvidia driver modules are not yet loaded, invoking runc directly\"", - " exec runc \"$@\"", - "fi", - "", - } - runtimeEnv := make(map[string]string) runtimeEnv["XDG_CONFIG_HOME"] = filepath.Join(destDirPattern, ".config") for k, v := range env { @@ -74,10 +64,9 @@ func newRuntimeInstaller(source string, target executableTarget, env map[string] } r := executable{ - source: source, - target: target, - env: runtimeEnv, - preLines: preLines, + source: source, + target: target, + env: runtimeEnv, } return &r diff --git a/tools/container/toolkit/runtime_test.go b/tools/container/toolkit/runtime_test.go index d2841506d..77d84ec7c 100644 --- 
a/tools/container/toolkit/runtime_test.go +++ b/tools/container/toolkit/runtime_test.go @@ -38,13 +38,6 @@ func TestNvidiaContainerRuntimeInstallerWrapper(t *testing.T) { expectedLines := []string{ shebang, - "", - "cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1", - "if [ \"${?}\" != \"0\" ]; then", - " echo \"nvidia driver modules are not yet loaded, invoking runc directly\"", - " exec runc \"$@\"", - "fi", - "", "PATH=/dest/folder:$PATH \\", "XDG_CONFIG_HOME=/dest/folder/.config \\", "source.real \\", diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go index 43e68ca55..2a487a52a 100644 --- a/tools/container/toolkit/toolkit.go +++ b/tools/container/toolkit/toolkit.go @@ -465,6 +465,8 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai configValues["nvidia-container-runtime.runtimes"] = toolkitRuntimeList } + // We require the NVIDIA kernel modules to be loaded. + configValues["features.require-nvidia-kernel-modules"] = true for _, optInFeature := range opts.optInFeatures.Value() { configValues["features."+optInFeature] = true }