Skip to content

Commit

Permalink
[no-relnote] Refactor CDI version extraction
Browse files Browse the repository at this point in the history
Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Dec 6, 2024
1 parent b0a5fb9 commit 108b8c5
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 49 deletions.
4 changes: 2 additions & 2 deletions pkg/nvcdi/common-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (

// newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device.
// This includes driver libraries and meta devices, for example.
func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
func (l *nvmllib) newCommonNVMLDiscoverer(version string) (discover.Discover, error) {
metaDevices := discover.NewCharDeviceDiscoverer(
l.logger,
l.devRoot,
Expand All @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
}

driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib)
driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
}
Expand Down
17 changes: 1 addition & 16 deletions pkg/nvcdi/driver-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"path/filepath"
"strings"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"golang.org/x/sys/unix"

"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
Expand All @@ -34,21 +33,7 @@ import (

// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
// The supplied version is the expected driver version; it is passed through unchanged to the version-specific discoverer.
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) {
if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
}
defer func() {
if r := nvmllib.Shutdown(); r != nvml.SUCCESS {
logger.Warningf("failed to shutdown NVML: %v", r)
}
}()

version, r := nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to determine driver version: %v", r)
}

func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, version string) (discover.Discover, error) {
	// Thin exported entry point: every argument, including the driver version
	// string, is forwarded unchanged to the version-specific constructor.
	driverDiscoverer, err := newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
	return driverDiscoverer, err
}

Expand Down
7 changes: 6 additions & 1 deletion pkg/nvcdi/lib-nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,12 @@ func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) {

// GetCommonEdits generates a CDI specification that can be used for ANY devices
func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) {
common, err := l.newCommonNVMLDiscoverer()
version, err := (*nvcdilib)(l).getDriverVersion()
if err != nil {
return nil, fmt.Errorf("failed to get driver version: %v", err)
}

common, err := l.newCommonNVMLDiscoverer(version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err)
}
Expand Down
35 changes: 28 additions & 7 deletions pkg/nvcdi/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ package nvcdi

import (
"fmt"
"path/filepath"
"strings"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
Expand All @@ -26,6 +28,7 @@ import (

"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
Expand Down Expand Up @@ -228,18 +231,36 @@ func (l *nvcdilib) resolveMode() (rmode string) {
return ModeNvml
}

// getCudaVersion returns the CUDA version of the current system.
func (l *nvcdilib) getCudaVersion() (string, error) {
version, err := l.getCudaVersionNvsandboxutils()
if err == nil {
// getDriverVersion returns the driver version of the current system.
func (l *nvcdilib) getDriverVersion() (string, error) {
if version, err := l.getDriverVersionNvsandboxutils(); err == nil && version != "" {
return version, err
}

// Fallback to NVML
return l.getCudaVersionNvml()
if version, err := l.getDriverVersionNvml(); err == nil && version != "" {
return version, err
}

// Fallback to getting the version from the libcuda.so suffix.
return l.getDriverVersionLibcudaSo()
}

// getDriverVersionLibcudaSo derives the driver version from the file name of a
// located libcuda.so.* library by stripping the "libcuda.so." prefix from the
// base name of the first match.
//
// It returns an error if the lookup fails or yields no candidates.
func (l *nvcdilib) getDriverVersionLibcudaSo() (string, error) {
	libCudaPaths, err := cuda.New(
		l.driver.Libraries(),
	).Locate(".*.*")
	if err != nil {
		return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
	}
	// Guard against a locator that reports success with no matches; indexing
	// an empty slice below would otherwise panic.
	if len(libCudaPaths) == 0 {
		return "", fmt.Errorf("failed to locate libcuda.so: no matching libraries found")
	}

	// Only the first candidate is considered.
	libCudaPath := libCudaPaths[0]

	version := strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")

	return version, nil
}

func (l *nvcdilib) getCudaVersionNvml() (string, error) {
func (l *nvcdilib) getDriverVersionNvml() (string, error) {
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
return "", fmt.Errorf("nvml not detected: %v", reason)
}
Expand All @@ -263,7 +284,7 @@ func (l *nvcdilib) getCudaVersionNvml() (string, error) {
return version, nil
}

func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) {
func (l *nvcdilib) getDriverVersionNvsandboxutils() (string, error) {
if l.nvsandboxutilslib == nil {
return "", fmt.Errorf("libnvsandboxutils is not available")
}
Expand Down
24 changes: 1 addition & 23 deletions pkg/nvcdi/management.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (

"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)
Expand Down Expand Up @@ -75,7 +74,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
}()
}

version, err := m.getCudaVersion()
version, err := (*nvcdilib)(m).getDriverVersion()
if err != nil {
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}
Expand All @@ -93,27 +92,6 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return edits, nil
}

// getCudaVersion returns the CUDA version for use in managementlib containers.
func (m *managementlib) getCudaVersion() (string, error) {
version, err := (*nvcdilib)(m).getCudaVersion()
if err == nil {
return version, nil
}

libCudaPaths, err := cuda.New(
m.driver.Libraries(),
).Locate(".*.*")
if err != nil {
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
}

libCudaPath := libCudaPaths[0]

version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")

return version, nil
}

type managementDiscoverer struct {
discover.Discover
}
Expand Down

0 comments on commit 108b8c5

Please sign in to comment.