From a800bd58332fa9ec8d484c5e75d53502a2cd2103 Mon Sep 17 00:00:00 2001
From: Maciej Strzelczyk
Date: Mon, 20 May 2024 13:21:23 +0000
Subject: [PATCH] New cuda installer tool (#35)

* WIP: Maybe 80% there.

Signed-off-by: Maciej Strzelczyk

* The functionality seems complete.

* Docstrings, license headers and reformat

* Updating the tool and tests.

Signed-off-by: Maciej Strzelczyk

* Fixing test problems and updating the old script.

* Final fixes to the READMEs

---------

Signed-off-by: Maciej Strzelczyk
---
 linux/README.md                               |  55 +-
 linux/cuda_installer/__main__.py              |  76 +++
 linux/cuda_installer/config.py                |  49 ++
 linux/cuda_installer/decorators.py            |  44 ++
 linux/cuda_installer/logger.py                |  40 ++
 .../cuda_installer/os_installers/__init__.py  | 544 ++++++++++++++++++
 linux/cuda_installer/os_installers/debian.py  |  84 +++
 .../os_installers/dnf_system.py               |  94 +++
 linux/cuda_installer/os_installers/rhel.py    |  31 +
 linux/cuda_installer/os_installers/rocky.py   |  18 +
 linux/cuda_installer/os_installers/ubuntu.py  |  59 ++
 linux/install_gpu_driver.py                   |   5 +-
 linux/tests/README.md                         |  11 +-
 linux/tests/requirements.txt                  |   8 +-
 linux/tests/startup_script.sh                 |  11 +
 linux/tests/test_installations.py             | 197 +++++--
 16 files changed, 1225 insertions(+), 101 deletions(-)
 create mode 100644 linux/cuda_installer/__main__.py
 create mode 100644 linux/cuda_installer/config.py
 create mode 100644 linux/cuda_installer/decorators.py
 create mode 100644 linux/cuda_installer/logger.py
 create mode 100644 linux/cuda_installer/os_installers/__init__.py
 create mode 100644 linux/cuda_installer/os_installers/debian.py
 create mode 100644 linux/cuda_installer/os_installers/dnf_system.py
 create mode 100644 linux/cuda_installer/os_installers/rhel.py
 create mode 100644 linux/cuda_installer/os_installers/rocky.py
 create mode 100644 linux/cuda_installer/os_installers/ubuntu.py
 create mode 100644 linux/tests/startup_script.sh

diff --git a/linux/README.md b/linux/README.md
index 71b61e9..7e7c706 100644
--- a/linux/README.md
+++ b/linux/README.md
@@ -1,48 +1,47 @@
-# Installation for Linux.
+# Installation for Linux
 
-In the `install_gpu_driver.py` you can find a script that automates installation
-of newer GPU drivers for NVIDIA GPU drivers available for Google Compute Engine
-instances.
+The recommended way to install NVIDIA GPU drivers and CUDA Toolkit for Google Cloud Compute Engine
+instances is through the cuda_installer tool. Look for the newest version in the
+[releases](https://github.com/GoogleCloudPlatform/compute-gpu-installation/releases)
+section of this repository.
 
-The script support the following operating systems:
+The `install_gpu_driver.py` script remains available so that existing setups keep working,
+but it is considered deprecated and should no longer be used.
 
-* CentOS: versions 7
-* CentOS Stream: version 8
-* Debian: versions 10 and 11
-* RHEL: versions 7 and 8
-* Rocky: version 8
-* Ubuntu: version 20 and 21
+The tool supports the following operating systems (x86_64/amd64 architecture):
 
-Note: Just because an operating system is not supported by this script, doesn't
-mean that it's impossible to install NVIDIA drivers on it. You should check and
-try instructions on
-[NVIDIAs website](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html)
-to discover other ways of installing drivers.
+* Debian: versions 10, 11 and 12
+* RHEL: versions 8 and 9
+* Rocky: versions 8 and 9
+* Ubuntu: versions 20, 22 and 24
+
+Note: Just because an operating system is not listed as supported by this tool
+doesn't mean that it's impossible to install NVIDIA drivers on it. You should check and
+try the instructions on [NVIDIA's website](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) to discover other ways of installing drivers.
 
 ## Requirements
 
 The system on which you want to run the script needs to meet the following
 requirements:
 
-* Python interpreter in version 3.6 installed (by default available in all
-  supported OSes except CentOS 7 and RHEL 7).
-* Access to Internet (the script needs to download the driver).
-* (optional) At least one GPU unit attached.
+* Python interpreter in version 3.6 or newer installed.
+* Access to the Internet (the tool needs to download the driver and CUDA toolkit).
+* At least one GPU unit attached.
 
-## Running the script
+## Running the tool
 
-The `install_gpu_driver.py` script needs to be executed with root privileges
-(for example `sudo python3 install_gpu_driver.py`).
+The `cuda_installer.pyz` tool needs to be executed with root privileges
+(for example `sudo python3 cuda_installer.pyz`).
 
-Note: On some systems the script might trigger system reboot, it
-needs to be restarted after the reboot is done.
+Note: During the installation, the tool will trigger system reboots. After a
+reboot, it needs to be started again to continue the installation process.
 
-After the installation, you should restart your system to make sure everything
-is initialized properly and working.
+After successful installation, the tool will restart your system once more to make
+sure everything is initialized properly and working system-wide.
 
 ## Script output
 
-The installation script logs its outputs to `/opt/google/gpu-installer/` folder.
+The installation tool logs its output to the `/opt/google/cuda-installer/` folder.
 If you are facing any problems with the installation, this should be the first
 place to check for any errors. When asking for support, you will be asked to
 provide the log files from this folder.
diff --git a/linux/cuda_installer/__main__.py b/linux/cuda_installer/__main__.py
new file mode 100644
index 0000000..800331b
--- /dev/null
+++ b/linux/cuda_installer/__main__.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import sys
+
+import config
+from logger import logger
+# Need to import all the subpackages here, or the program fails for Python 3.6
+from os_installers import get_installer, debian, ubuntu, rhel, rocky
+
+
+# Mentioning the packages from import above, so automatic import cleanups don't remove them
+del debian
+del ubuntu
+del rhel
+del rocky
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Manage GPU drivers and CUDA toolkit installation."
+ ) + parser.add_argument( + "command", + choices=[ + "install_driver", + "install_cuda", + "verify_driver", + "verify_cuda", + "uninstall_driver", + ], + help="Install GPU driver or CUDA Toolkit.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + if os.geteuid() != 0: + print("This script needs to be run with root privileges!") + sys.exit(1) + args = parse_args() + logger.info(f"Switching to working directory: {config.INSTALLER_DIR}") + os.chdir(config.INSTALLER_DIR) + installer = get_installer() + + if args.command == "install_driver": + installer.install_driver() + elif args.command == "verify_driver": + if installer.verify_driver(verbose=True): + sys.exit(0) + else: + sys.exit(1) + elif args.command == "uninstall_driver": + installer.uninstall_driver() + elif args.command == "install_cuda": + installer.install_cuda() + elif args.command == "verify_cuda": + if installer.verify_cuda(): + sys.exit(0) + else: + sys.exit(1) diff --git a/linux/cuda_installer/config.py b/linux/cuda_installer/config.py new file mode 100644 index 0000000..dac863b --- /dev/null +++ b/linux/cuda_installer/config.py @@ -0,0 +1,49 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib + +INSTALLER_DIR = pathlib.Path("/opt/google/cuda-installer/") +try: + INSTALLER_DIR.mkdir(parents=True, exist_ok=True) +except PermissionError: + pass + + +K80_DRIVER_VERSION = "470.239.06" +K80_DEVICE_CODE = "10de:102d" +K80_DRIVER_URL = f"https://us.download.nvidia.com/tesla/{K80_DRIVER_VERSION}/NVIDIA-Linux-x86_64-{K80_DRIVER_VERSION}.run" +K80_DRIVER_SHA256_SUM = ( + "7d74caac140a0432d79ebe8e4330dc796f39ba7dd40b3fcd61df760181bf9ccc" +) + +CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run" +CUDA_TOOLKIT_SHA256_SUM = ( + "367d2299b3a4588ab487a6d27276ca5d9ead6e394904f18bccb9e12433b9c4fb" +) + +CUDA_SAMPLES_TARGZ = ( + "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v12.4.1.tar.gz" +) +CUDA_SAMPLES_SHA256_SUM = ( + "01bb311cc8f802a0d243700e4abe6a2d402132c9d97ecf2c64f3fbb1006c304c" +) + +CUDA_PROFILE_FILENAME = pathlib.Path("/etc/profile.d/google_cuda_install.sh") +CUDA_BIN_FOLDER = "/usr/local/cuda-12.4/bin" +CUDA_LIB_FOLDER = "/usr/local/cuda-12.4/lib64" + +NVIDIA_PERSISTANCED_INSTALLER = ( + "/usr/share/doc/NVIDIA_GLX-1.0/samples/nvidia-persistenced-init.tar.bz2" +) diff --git a/linux/cuda_installer/decorators.py b/linux/cuda_installer/decorators.py new file mode 100644 index 0000000..7e8eac6 --- /dev/null +++ b/linux/cuda_installer/decorators.py @@ -0,0 +1,44 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pathlib +from datetime import datetime + +from config import INSTALLER_DIR +from logger import logger + + +def checkpoint_decorator(file_name: str, skip_message: str): + from os_installers import RebootRequired + + def decorator(func): + def wrapper(*args, **kwargs): + if pathlib.Path(INSTALLER_DIR / file_name).exists(): + logger.info(skip_message) + return + try: + func(*args, **kwargs) + except RebootRequired: + reboot_required = True + else: + reboot_required = False + with pathlib.Path(INSTALLER_DIR / file_name).open(mode="w") as flag: + flag.write(str(datetime.now())) + flag.flush() + if reboot_required: + raise RebootRequired + + return wrapper + + return decorator diff --git a/linux/cuda_installer/logger.py b/linux/cuda_installer/logger.py new file mode 100644 index 0000000..d57b31c --- /dev/null +++ b/linux/cuda_installer/logger.py @@ -0,0 +1,40 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import logging.handlers +import sys + +from config import INSTALLER_DIR + + +logger = logging.getLogger("GoogleCUDAInstaller") +_file_handler = logging.FileHandler(INSTALLER_DIR / "installer.log", mode="a") +_file_handler.level = logging.DEBUG +logger.addHandler(_file_handler) +_sys_handler = logging.handlers.SysLogHandler( + "/dev/log", facility=logging.handlers.SysLogHandler.LOG_LOCAL0 +) +_sys_handler.ident = "[GoogleCUDAInstaller] " +_sys_handler.level = logging.INFO +logger.addHandler(_sys_handler) +stdout_handler = logging.StreamHandler(sys.stdout) +stdout_handler.level = logging.INFO +logger.addHandler(stdout_handler) +logger.setLevel(logging.DEBUG) + +formatter = logging.Formatter("[%(asctime)s] %(levelname)s - %(message)s") +_file_handler.setFormatter(formatter) + +__all__ = ["logger"] diff --git a/linux/cuda_installer/os_installers/__init__.py b/linux/cuda_installer/os_installers/__init__.py new file mode 100644 index 0000000..9eeff50 --- /dev/null +++ b/linux/cuda_installer/os_installers/__init__.py @@ -0,0 +1,544 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc +import os +import pathlib +import re +import shlex +import shutil +import subprocess +import sys +import tempfile +import urllib.parse +from contextlib import contextmanager +from enum import Enum, auto +from typing import Optional, Union + +from config import ( + K80_DRIVER_URL, + CUDA_TOOLKIT_URL, + CUDA_TOOLKIT_SHA256_SUM, + K80_DRIVER_SHA256_SUM, + K80_DEVICE_CODE, + CUDA_PROFILE_FILENAME, + CUDA_BIN_FOLDER, + CUDA_LIB_FOLDER, + NVIDIA_PERSISTANCED_INSTALLER, + CUDA_SAMPLES_TARGZ, + CUDA_SAMPLES_SHA256_SUM, +) +from decorators import checkpoint_decorator +from logger import logger + + +class RebootRequired(RuntimeError): + pass + + +class System(Enum): + CentOS = auto() + Debian = auto() + Fedora = auto() + RHEL = auto() + Rocky = auto() + SUSE = auto() + Ubuntu = auto() + + +@contextmanager +def chdir(path: Union[pathlib.Path, str]): + """ + Switch the current working directory for a while. Restore the previous one on context exit. + """ + prev = os.getcwd() + try: + os.chdir(path) + yield + finally: + os.chdir(prev) + + +class LinuxInstaller(metaclass=abc.ABCMeta): + """ + Handles the installation process for both driver and CUDA toolkit. Needs to have couple of methods implemented + in child classes, but contains most of the required logic. + """ + + def __init__(self): + self.kernel_version = self.run("uname -r", silent=True).stdout + self.device_code = self.detect_gpu_device() + self._file_download_verified = set() + + @abc.abstractmethod + def _install_prerequisites(self): + """ + Update kernel to the newest version and install all required packages for the NVIDIA drivers to be installed. + """ + pass + + @abc.abstractmethod + def lock_kernel_updates(self): + """ + Make sure that drivers aren't broken by an automatic kernel update. + """ + pass + + @abc.abstractmethod + def unlock_kernel_updates(self): + """ + Allows the kernel related packages to be upgraded. + """ + pass + + def install_driver(self): + """ + Downloads the installation package and installs the driver. It also handles installation of + drive prerequisites and will trigger a reboot on first run, when those prerequisites are installed. + + On second run, it will proceed to download proper installer and install the driver. When it's done, `nvidia-smi` + should be available in the system and the drivers are installed. + + It also triggers kernel packages lock in the system, so the driver is not broken by auto-updates. + """ + if self.verify_driver(): + logger.info("GPU driver already installed.") + return + + if self.device_code == K80_DEVICE_CODE: + installer_path = self.download_k80_driver_installer() + else: + installer_path = self.download_cuda_toolkit_installer() + + logger.info("Installing prerequisite packages and updating kernel...") + try: + self._install_prerequisites() + except RebootRequired: + self.reboot() + + if self.device_code == K80_DEVICE_CODE: + logger.info("Installing GPU drivers for K80...") + self.run(f"sh {installer_path} -s", check=True) + else: + logger.info("Installing GPU drivers for your device...") + self.run(f"sh {installer_path} --silent --driver", check=True) + + if self.verify_driver(): + self.lock_kernel_updates() + logger.info("GPU driver installation completed!") + else: + logger.error( + "Something went wrong with driver installation. The installation failed :(" + ) + + def uninstall_driver(self): + """ + Uses the Nvidia installers to execute driver uninstallation. It will also unlock the kernel updates in the + system. 
+ """ + if not self.verify_driver(): + logger.info("GPU driver not found.") + return + with tempfile.TemporaryDirectory() as temp_dir: + if self.device_code == K80_DEVICE_CODE: + installer_path = self.download_k80_driver_installer() + else: + installer_path = self.download_cuda_toolkit_installer() + logger.info( + "Extracting NVIDIA driver installer, to complete uninstallation..." + ) + self.run(f"sh {installer_path} --extract={temp_dir}", check=True) + installer_path = pathlib.Path( + f"{temp_dir}/NVIDIA-Linux-x86_64-550.54.15.run" + ) + + logger.info("Starting uninstallation...") + self.run(f"sh {installer_path} -s --uninstall", check=True) + logger.info("Uninstallation completed!") + self.unlock_kernel_updates() + + def verify_driver(self, verbose: bool = False) -> bool: + """ + Checks if the driver is already installed by calling the `nvidia-smi` binary. + If it's available and doesn't produce errors, that means the driver is already installed. + """ + process = self.run("which nvidia-smi", check=False, silent=True) + if process.returncode != 0: + if verbose: + print("Couldn't find nvidia-smi, the driver is not installed.") + return False + process2 = self.run("nvidia-smi -L", check=False, silent=True) + success = process2.returncode == 0 and "UUID" in process2.stdout + if verbose: + print(f"nvidia-smi -L output: {process2.stdout} {process2.stderr}") + return success + + @checkpoint_decorator( + "cuda_installation", "CUDA toolkit already marked as installed." + ) + def _install_cuda(self): + """ + This is the method to install the CUDA Toolkit. It will install the toolkit and execute post-installation + configuration in the operating system, to make it available for all users. + """ + if self.device_code == K80_DEVICE_CODE: + logger.info("CUDA installation is not supported for K80 GPUs.") + return + if not self.verify_driver(): + logger.info( + "CUDA installation requires GPU driver to be installed first. " + "Attempting to install GPU driver now." + ) + self.install_driver() + + installer_path = self.download_cuda_toolkit_installer() + + logger.info("Installing CUDA toolkit...") + self.run(f"sh {installer_path} --silent --toolkit", check=True) + logger.info("CUDA toolkit installation completed!") + logger.info("Executing post-installation actions...") + self.cuda_postinstallation_actions() + logger.info("CUDA post-installation actions completed!") + raise RebootRequired + + def install_cuda(self): + try: + self._install_cuda() + except RebootRequired: + self.reboot() + + def cuda_postinstallation_actions(self): + """ + Perform required and suggested post-installation actions: + * set environment variables + * make persistent changes to environment variables + * configure nvidia-persistanced to auto-start (if exists) + + More info: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions + """ + os.environ["PATH"] = f"{CUDA_BIN_FOLDER}:{os.environ['PATH']}" + if "LD_LIBRARY_PATH" in os.environ: + os.environ["LD_LIBRARY_PATH"] = ( + f"{CUDA_LIB_FOLDER}:{os.environ['LD_LIBRARY_PATH']}" + ) + else: + os.environ["LD_LIBRARY_PATH"] = CUDA_LIB_FOLDER + + with CUDA_PROFILE_FILENAME.open("w") as profile: + profile.write( + "# Configuring CUDA toolkit. 
File created by Google CUDA installation manager.\n" + ) + profile.write("export PATH=" + CUDA_BIN_FOLDER + "${PATH:+:${PATH}}\n") + profile.write( + "export LD_LIBRARY_PATH=" + + CUDA_LIB_FOLDER + + "${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}\n" + ) + + self.configure_persistanced_service() + + def configure_persistanced_service(self): + """ + Configures the nvidia-persistenced daemon to auto-start. It creates a service to be controlled using + `systemctl`. + """ + if not pathlib.Path("/usr/bin/nvidia-persistenced").exists(): + return + + if not pathlib.Path(NVIDIA_PERSISTANCED_INSTALLER).exists(): + return + + with tempfile.TemporaryDirectory() as temp_dir: + shutil.copy(NVIDIA_PERSISTANCED_INSTALLER, temp_dir + "/installer.tar.bz2") + with chdir(temp_dir): + self.run("tar -xf installer.tar.bz2", silent=True) + logger.info("Executing nvidia-persistenced installer...") + self.run("sh nvidia-persistenced-init/install.sh", check=True) + + def verify_cuda(self) -> bool: + """ + Make sure that CUDA Toolkit is properly installed by compiling and executing CUDA code samples. + """ + logger.info("Verifying CUDA installation...") + with tempfile.TemporaryDirectory() as temp_dir: + temp_dir = pathlib.Path(temp_dir) + with chdir(temp_dir): + logger.info( + f"Using {temp_dir} to download, build and execute code samples." + ) + samples_tar = self.download_file( + CUDA_SAMPLES_TARGZ, CUDA_SAMPLES_SHA256_SUM + ) + self.run(f"tar -xf {samples_tar.name}") + with chdir( + temp_dir / "cuda-samples-12.4.1/Samples/1_Utilities/deviceQuery" + ): + self.run("make", check=True) + dev_query = self.run("./deviceQuery", check=True) + if "Result = PASS" not in dev_query.stdout: + logger.error( + "Cuda Toolkit verification failed. DeviceQuery sample failed." + ) + return False + with chdir( + temp_dir / "cuda-samples-12.4.1/Samples/1_Utilities/bandwidthTest" + ): + self.run("make", check=True) + bandwidth = self.run("./bandwidthTest", check=True) + if "Result = PASS" not in bandwidth.stdout: + logger.error( + "Cuda Toolkit verification failed. BandwidthTest sample failed." + ) + return False + logger.info("Cuda Toolkit verification completed!") + return True + + @staticmethod + def run( + command: str, + check=True, + input=None, + cwd=None, + silent=False, + environment=None, + retries=0, + ) -> subprocess.CompletedProcess: + """ + Runs a provided command, streaming its output to the log files. + + :param command: A command to be executed, as a single string. + :param check: If true, will throw exception on failure (exit code != 0) + :param input: Input for the executed command. + :param cwd: Directory in which to execute the command. + :param silent: If set to True, the output of command won't be logged or printed. + :param environment: A set of environment variable for the process to use. If None, the current env is inherited. + :param retries: How many times should the command be repeated if it exits with non-zero code. + + :return: CompletedProcess instance - the result of the command execution. 
+ """ + if not silent: + logger.info(f"Executing: {command}") + + try_count = 0 + stdout = [] + stderr = [] + proc = None + + while try_count <= retries: + stdout.clear() + stderr.clear() + proc = subprocess.Popen( + shlex.split(command), + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE if input else None, + cwd=cwd, + env=environment, + ) + os.set_blocking(proc.stdout.fileno(), False) + os.set_blocking(proc.stderr.fileno(), False) + if input is not None: + proc.stdin.write(input.encode()) + proc.stdin.close() + + def capture_comms(): + for line in proc.stdout.readlines(): + if not silent: + logger.info(line.decode().strip()) + stdout.append(line.decode().strip()) + for line in proc.stderr.readlines(): + if not silent: + logger.warning(line.decode().strip()) + stdout.append(line.decode().strip()) + + while proc.poll() is None: + # While the process is running, we capture the output + capture_comms() + try: + proc.wait(0.1) + except subprocess.TimeoutExpired: + continue + # When the process is finished, we need to capture any output left in buffers + capture_comms() + + if proc.returncode == 0: + break + else: + try_count += 1 + continue + + if check and proc.returncode: + raise subprocess.SubprocessError("Command exited with non-zero code") + + return subprocess.CompletedProcess( + command, proc.returncode, stdout="\n".join(stdout), stderr="\n".join(stderr) + ) + + @classmethod + def check_gpu_present(cls) -> bool: + """ + Checks in `lspci` if there's an NVIDIA device present in the system. + """ + lspci = cls.run("lspci") + return "nvidia" in lspci.stdout.lower() + + @classmethod + def check_driver_installed(cls) -> bool: + """ + Checks if the driver is already installed by calling the `nvidia-smi` binary. + If it's available, that means the driver is already installed. + """ + process = cls.run("which nvidia-smi", check=False) + if process.returncode != 0: + return False + process2 = cls.run("nvidia-smi", check=False) + return process2.returncode == 0 + + @staticmethod + def check_python_version(): + """ + Makes sure that the script is run with Python 3.6 or newer. + """ + if sys.version_info.major == 3 and sys.version_info.minor >= 6: + return + version = "{}.{}".format(sys.version_info.major, sys.version_info.minor) + raise RuntimeError( + "Unsupported Python version {}. " + "Supported versions: 3.6 - 3.12".format(version) + ) + + @classmethod + def reboot(cls): + """ + Reboots the system. + """ + logger.info( + "The system needs to be rebooted to complete the installation process. " + "The process will be continued after the reboot." + ) + logger.info("Rebooting now.") + cls.run("reboot now") + sys.exit(0) + + @classmethod + def detect_gpu_device(cls) -> Optional[str]: + """ + Check if there is an NVIDIA GPU device attached and return its device code. 
+ """ + lspci = cls.run("lspci -n", silent=True) + output = lspci.stdout + dev_re = re.compile(r"10de:[\w\d]{4}") + for line in output.splitlines(): + dev_code = dev_re.findall(line) + if dev_code: + return dev_code[0] + else: + return None + + def download_cuda_toolkit_installer(self) -> pathlib.Path: + logger.info("Downloading CUDA installation kit...") + return self.download_file(CUDA_TOOLKIT_URL, CUDA_TOOLKIT_SHA256_SUM) + + def download_k80_driver_installer(self) -> pathlib.Path: + logger.info("K80 GPU detected, downloading only the driver installer...") + return self.download_file(K80_DRIVER_URL, K80_DRIVER_SHA256_SUM) + + def download_file(self, url: str, sha256sum: str) -> pathlib.Path: + """ + Uses `curl` to download a file pointed by url. It will also execute `sha256sum` on the downloaded file + to verify if it's matching with the expected hash. + + It also keeps track of files already downloaded and checked, so that it doesn't waste time with repeating the + download or check. + """ + filename = urllib.parse.urlparse(url).path.split("/")[-1] + file_path = pathlib.Path(filename) + + if file_path.exists() and url in self._file_download_verified: + return file_path + + if not file_path.exists(): + self.run(f"curl -fSsL -O {url}") + + checksum = self.run(f"sha256sum {file_path}").stdout.strip().split()[0] + if checksum != sha256sum: + raise RuntimeError( + f"The installer file checksum does not match. Won't continue installation." + f"Try deleting {file_path.absolute()} and trying again." + ) + self._file_download_verified.add(url) + return file_path + + +def _detect_linux_distro() -> (System, str): + """ + Checks the /etc/os-release file to figure out what distribution of OS + we're running. + """ + with open("/etc/os-release") as os_release: + lines = [line.strip() for line in os_release.readlines() if line.strip() != ""] + info = { + k: v.strip("'\"") + for k, v in (line.split("=", maxsplit=1) for line in lines) + } + + name = info["NAME"] + + if name.startswith("Debian"): + system = System.Debian + version = info["VERSION"].split()[0] # 11 (rodete) -> 11 + elif name.startswith("CentOS"): + system = System.CentOS + version = info["VERSION_ID"] # 8 + elif name.startswith("Rocky"): + system = System.Rocky + version = info["VERSION_ID"] # 8.4 + elif name.startswith("Ubuntu"): + system = System.Ubuntu + version = info["VERSION_ID"] # 20.04 + elif name.startswith("SLES"): + system = System.SUSE + version = info["VERSION_ID"] # 15.3 + elif name.startswith("Red Hat"): + system = System.RHEL + version = info["VERSION_ID"] # 8.4 + elif name.startswith("Fedora"): + system = System.Fedora + version = info["VERSION_ID"] # 34 + else: + raise RuntimeError("Unrecognized operating system.") + return system, version + + +def get_installer() -> LinuxInstaller: + """ + Retrieve an Installer instance appropriate for the hosting operating system. 
+ """ + system, version = _detect_linux_distro() + + from os_installers.debian import DebianInstaller + from os_installers.ubuntu import UbuntuInstaller + from os_installers.rhel import RHELInstaller + from os_installers.rocky import RockyInstaller + + if system == System.Debian: + return DebianInstaller() + elif system == System.Ubuntu: + return UbuntuInstaller() + elif system == System.RHEL: + return RHELInstaller() + elif system == System.Rocky: + return RockyInstaller() + else: + raise NotImplementedError("Sorry, don't know how to install for this system.") diff --git a/linux/cuda_installer/os_installers/debian.py b/linux/cuda_installer/os_installers/debian.py new file mode 100644 index 0000000..79d3d98 --- /dev/null +++ b/linux/cuda_installer/os_installers/debian.py @@ -0,0 +1,84 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +from decorators import checkpoint_decorator +from logger import logger +from os_installers import LinuxInstaller, RebootRequired + + +class DebianInstaller(LinuxInstaller): + KERNEL_IMAGE_PACKAGE = "linux-image-{version}" + KERNEL_VERSION_FORMAT = "{major}.{minor}.{patch}-{micro}-cloud-amd64" + KERNEL_HEADERS_PACKAGE = "linux-headers-{version}" + KERNEL_PACKAGE_REGEX = r"linux-image-{major}.{minor}.([\d]+)-([\d]+)-cloud-amd64" + + @checkpoint_decorator("prerequisites", "System preparations already done.") + def _install_prerequisites(self): + """ + Installs packages required for the proper driver installation on Debian. + """ + self.run("apt-get update", silent=True) + + major, minor, *_ = self.kernel_version.split(".") + kernel_package_regex = re.compile( + self.KERNEL_PACKAGE_REGEX.format(major=major, minor=minor) + ) + + # Find the newest version of kernel to update to, but staying with the same major version + packages = self.run("apt-cache search linux-image").stdout + patch, micro = max(kernel_package_regex.findall(packages)) + + wanted_kernel_version = self.KERNEL_VERSION_FORMAT.format( + major=major, minor=minor, patch=patch, micro=micro + ) + wanted_kernel_package = self.KERNEL_IMAGE_PACKAGE.format( + version=wanted_kernel_version + ) + wanted_kernel_headers = self.KERNEL_HEADERS_PACKAGE.format( + version=wanted_kernel_version + ) + + self.run( + f"apt-get install -y make gcc {wanted_kernel_package} {wanted_kernel_headers} " + f"software-properties-common pciutils gcc make dkms" + ) + raise RebootRequired + + def lock_kernel_updates(self): + """ + Marks kernel related packages, so they don't get auto-updated. This would cause the driver to stop working. + """ + logger.info("Locking kernel updates...") + self.run( + f"apt-mark hold " + f"linux-image-{self.kernel_version} " + f"linux-headers-{self.kernel_version} " + f"linux-image-cloud-amd64 " + f"linux-headers-cloud-amd64" + ) + + def unlock_kernel_updates(self): + """ + Allows the kernel related packages to be upgraded. 
+ """ + logger.info("Unlocking kernel updates...") + self.run( + f"apt-mark unhold " + f"linux-image-{self.kernel_version} " + f"linux-headers-{self.kernel_version} " + f"linux-image-cloud-amd64 " + f"linux-headers-cloud-amd64" + ) diff --git a/linux/cuda_installer/os_installers/dnf_system.py b/linux/cuda_installer/os_installers/dnf_system.py new file mode 100644 index 0000000..8c83f19 --- /dev/null +++ b/linux/cuda_installer/os_installers/dnf_system.py @@ -0,0 +1,94 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import configparser +import shutil + +from logger import logger +from os_installers import LinuxInstaller + + +class DNFSystemInstaller(LinuxInstaller, metaclass=abc.ABCMeta): + """ + An abstract class providing implementation of DNF kernel locking methods. + """ + + def lock_kernel_updates(self): + """Make sure no kernel updates are installed.""" + logger.info("Attempting to update /etc/dnf/dnf.conf to block kernel updates.") + + conf_parser = configparser.ConfigParser() + conf_parser.read("/etc/dnf/dnf.conf") + if "exclude" in conf_parser["main"]: + value = conf_parser["main"]["exclude"] + if "kernel*" in value: + logger.info("Kernel updates are already blocked in /etc/dnf/dnf.conf") + return + value = [s.strip() for s in value.split(",")] + value.append("kernel*") + else: + value = ["kernel*"] + conf_parser["main"]["exclude"] = ", ".join(value) + + shutil.copyfile("/etc/dnf/dnf.conf", "/etc/dnf/dnf.conf_backup") + try: + with open("/etc/dnf/dnf.conf", mode="w") as dnf_conf_file: + conf_parser.write(dnf_conf_file) + except Exception as e: + logger.error( + "Failed to update /etc/dnf/dnf.conf due to {}. Restoring config file from backup.".format( + e + ) + ) + shutil.copyfile("/etc/dnf/dnf.conf_backup", "/etc/dnf/dnf.conf") + raise e + else: + logger.info( + "Kernel updates blocked by `exclude` entry in /etc/dnf/dnf.conf" + ) + + def unlock_kernel_updates(self): + """Remove `kernel*` from exclusion list in /etc/dnf/dnf.conf""" + logger.info("Attempting to update /etc/dnf/dnf.conf to unblock kernel updates.") + + conf_parser = configparser.ConfigParser() + conf_parser.read("/etc/dnf/dnf.conf") + if "exclude" not in conf_parser["main"]: + logger.info("Kernel updates are not blocked in /etc/dnf/dnf.conf") + return + + value = conf_parser["main"]["exclude"] + value = [s.strip() for s in value.split(",")] + if "kernel*" not in value: + logger.info("Kernel updates are not blocked in /etc/dnf/dnf.conf") + return + value.remove("kernel*") + conf_parser["main"]["exclude"] = ", ".join(value) + + shutil.copyfile("/etc/dnf/dnf.conf", "/etc/dnf/dnf.conf_backup") + + try: + with open("/etc/dnf/dnf.conf", mode="w") as dnf_conf_file: + conf_parser.write(dnf_conf_file) + except Exception as e: + logger.error( + "Failed to update /etc/dnf/dnf.conf due to {}. 
Restoring config file from backup.".format( + e + ) + ) + shutil.copyfile("/etc/dnf/dnf.conf_backup", "/etc/dnf/dnf.conf") + raise e + else: + logger.info("Kernel updates unblocked in /etc/dnf/dnf.conf") diff --git a/linux/cuda_installer/os_installers/rhel.py b/linux/cuda_installer/os_installers/rhel.py new file mode 100644 index 0000000..4f8ddc7 --- /dev/null +++ b/linux/cuda_installer/os_installers/rhel.py @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from decorators import checkpoint_decorator +from os_installers import RebootRequired +from os_installers.dnf_system import DNFSystemInstaller + + +class RHELInstaller(DNFSystemInstaller): + + def __init__(self): + self.run("dnf install -y pciutils", silent=True) + DNFSystemInstaller.__init__(self) + + @checkpoint_decorator("prerequisites", "System preparations already done.") + def _install_prerequisites(self): + self.run( + "dnf --refresh install -y kernel kernel-devel kernel-headers gcc gcc-c++ make bzip2" + ) + raise RebootRequired diff --git a/linux/cuda_installer/os_installers/rocky.py b/linux/cuda_installer/os_installers/rocky.py new file mode 100644 index 0000000..d465f46 --- /dev/null +++ b/linux/cuda_installer/os_installers/rocky.py @@ -0,0 +1,18 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from os_installers.rhel import RHELInstaller + + +RockyInstaller = RHELInstaller diff --git a/linux/cuda_installer/os_installers/ubuntu.py b/linux/cuda_installer/os_installers/ubuntu.py new file mode 100644 index 0000000..34a149d --- /dev/null +++ b/linux/cuda_installer/os_installers/ubuntu.py @@ -0,0 +1,59 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from decorators import checkpoint_decorator
+from logger import logger
+from os_installers import LinuxInstaller, RebootRequired
+
+
+class UbuntuInstaller(LinuxInstaller):
+
+    @checkpoint_decorator("prerequisites", "System preparations already done.")
+    def _install_prerequisites(self):
+        """
+        Installs packages required for the proper driver installation on Ubuntu.
+        """
+        self.run("apt-get update", silent=True)
+
+        self.run(
+            "apt-get install -y linux-image-gcp linux-headers-gcp "
+            "gcc make dkms pciutils software-properties-common"
+        )
+        raise RebootRequired
+
+    def lock_kernel_updates(self):
+        """
+        Marks kernel related packages, so they don't get auto-updated. This would cause the driver to stop working.
+        """
+        logger.info("Locking kernel updates...")
+        self.run(
+            f"apt-mark hold "
+            f"linux-image-gcp "
+            f"linux-headers-gcp "
+            f"linux-image-{self.kernel_version} "
+            f"linux-headers-{self.kernel_version}"
+        )
+
+    def unlock_kernel_updates(self):
+        """
+        Allows the kernel related packages to be upgraded.
+        """
+        logger.info("Unlocking kernel updates...")
+        self.run(
+            f"apt-mark unhold "
+            f"linux-image-gcp "
+            f"linux-headers-gcp "
+            f"linux-image-{self.kernel_version} "
+            f"linux-headers-{self.kernel_version}"
+        )
diff --git a/linux/install_gpu_driver.py b/linux/install_gpu_driver.py
index e8c9c14..9a9ea63 100644
--- a/linux/install_gpu_driver.py
+++ b/linux/install_gpu_driver.py
@@ -19,11 +19,14 @@
 import shlex
 import subprocess
 import sys
+import warnings
 from datetime import datetime
 from enum import Enum, auto
 from typing import Optional
 
-DRIVER_VERSION = "525.125.06"
+warnings.warn("This script is deprecated. Please use cuda_installer as a replacement.", DeprecationWarning)
+
+DRIVER_VERSION = "550.54.15"
 K80_DRIVER_VERSION = "470.199.02"
 
 DRIVER_URL = f"https://us.download.nvidia.com/tesla/{DRIVER_VERSION}/NVIDIA-Linux-x86_64-{DRIVER_VERSION}.run"
diff --git a/linux/tests/README.md b/linux/tests/README.md
index 392f6e6..ad6cfa6 100644
--- a/linux/tests/README.md
+++ b/linux/tests/README.md
@@ -10,13 +10,4 @@ manually on local developer machine. Required steps:
 3. Install required Python packages `pip install -Ur requirements.txt`
 4. Run test using `pytest` command. You can speed up the process by using
    parallel execution with
-   `pytest --workers 1 --tests-per-worker 10`. Remember to use only one
-   process, as the tests use thread semaphores to make sure they don't exceed
-   GPU quota
-
-
-Note: The VMs created for this test don't have external IP
-addresses assigned, so it's required for the project
-they are created in to have Cloud NAT configured for the
-default VPC Network. Without it, the instances won't be
-able to download necessary drivers.
\ No newline at end of file
+   `pytest -n auto`.
diff --git a/linux/tests/requirements.txt b/linux/tests/requirements.txt index 4c3f4c6..4b97c48 100644 --- a/linux/tests/requirements.txt +++ b/linux/tests/requirements.txt @@ -1,3 +1,5 @@ -pytest==7.1.1 -pytest-parallel==0.1.1 -google-cloud-compute==1.1.0 \ No newline at end of file +pytest>=8.2.0 +pytest-xdist>=3.6.1 +google-cloud-compute>=1.18.0 +google-cloud-storage>=2.16.0 +google-cloud-iam>=2.15.0 \ No newline at end of file diff --git a/linux/tests/startup_script.sh b/linux/tests/startup_script.sh new file mode 100644 index 0000000..52422ac --- /dev/null +++ b/linux/tests/startup_script.sh @@ -0,0 +1,11 @@ +#!/bin/bash +if test -f /opt/google/cuda-installer/ +then + exit +fi + +mkdir -p /opt/google/cuda-installer/ +cd /opt/google/cuda-installer/ || exit + +gsutil cp {GS_INSTALLER_PATH} cuda_installer.pyz +python3 cuda_installer.pyz install_cuda \ No newline at end of file diff --git a/linux/tests/test_installations.py b/linux/tests/test_installations.py index c7a957d..b6bb3ab 100644 --- a/linux/tests/test_installations.py +++ b/linux/tests/test_installations.py @@ -17,39 +17,45 @@ import sys import tempfile import time +import random import uuid +import zipapp from pathlib import Path -from threading import BoundedSemaphore +from multiprocessing import BoundedSemaphore from typing import Tuple import google.api_core.exceptions import google.auth import pytest +from google.cloud import iam_admin_v1 from google.cloud import compute_v1 +from google.cloud import storage +from google.cloud.storage.constants import STANDARD_STORAGE_CLASS PROJECT = google.auth.default()[1] INSTALLATION_TIMEOUT = 30*60 # 30 minutes +GS_BUCKET_NAME = f"{PROJECT}-cuda-installer-tests" + # Cloud project and family OPERATING_SYSTEMS = ( - # ("centos-cloud", "centos-7"), - ("centos-cloud", "centos-stream-8"), - # ("debian-cloud", "debian-10"), - # ("debian-cloud", "debian-11"), - # ("rhel-cloud", "rhel-7"), + ("debian-cloud", "debian-10"), + ("debian-cloud", "debian-11"), + ("debian-cloud", "debian-12"), ("rhel-cloud", "rhel-8"), ("rhel-cloud", "rhel-9"), - # ("rocky-linux-cloud", "rocky-linux-8"), + ("rocky-linux-cloud", "rocky-linux-8"), ("rocky-linux-cloud", "rocky-linux-9"), - # ("ubuntu-os-cloud", "ubuntu-2004-lts"), - # ("ubuntu-os-cloud", "ubuntu-2204-lts"), + ("ubuntu-os-cloud", "ubuntu-2004-lts"), + ("ubuntu-os-cloud", "ubuntu-2204-lts"), + ("ubuntu-os-cloud", "ubuntu-2404-lts-amd64"), ) GPUS = { # "L4": "nvidia-l4", # "A100": "nvidia-tesla-a100", - "K80": "nvidia-tesla-k80", + # "K80": "nvidia-tesla-k80", # "P4": "nvidia-tesla-p4", "T4": "nvidia-tesla-t4", # "P100": "nvidia-tesla-p100", @@ -59,21 +65,21 @@ GPU_QUOTA_SEMAPHORES = { "L4": BoundedSemaphore(8), "A100": BoundedSemaphore(8), - "K80": BoundedSemaphore(8), + "K80": BoundedSemaphore(16), "P4": BoundedSemaphore(1), - "T4": BoundedSemaphore(4), + "T4": BoundedSemaphore(8), "P100": BoundedSemaphore(1), "V100": BoundedSemaphore(8), } ZONES = { - "L4": "us-central1-a", - "A100": "us-central1-f", - "K80": "us-central1-a", - "P4": "us-central1-a", - "T4": "us-central1-b", - "P100": "us-central1-c", - "V100": "us-central1-a", + "L4": ("us-central1-a",), + "A100": ("us-central1-f",), + "K80": ("us-central1-a",), + "P4": ("us-central1-a",), + "T4": ("us-central1-b", "europe-west2-a", "us-west1-b", "northamerica-northeast1-c", "europe-west3-b"), + "P100": ("us-central1-c",), + "V100": ("us-central1-a",), } MACHINE_TYPES = { @@ -87,6 +93,61 @@ } +@pytest.fixture(scope="session") +def service_account(): + iam_admin_client = iam_admin_v1.IAMClient() + + 
sa_full_name = f"cuda-tester@{PROJECT}.iam.gserviceaccount.com" + if sa_full_name in (sa.email for sa in iam_admin_client.list_service_accounts(name=f"projects/{PROJECT}")): + yield sa_full_name + return + + request = iam_admin_v1.CreateServiceAccountRequest() + + request.account_id = "cuda-tester" + request.name = f"projects/{PROJECT}" + + service_account = iam_admin_v1.ServiceAccount() + service_account.display_name = "Cuda Installer testing account" + request.service_account = service_account + + account = iam_admin_client.create_service_account(request) + + yield account.email + + +@pytest.fixture(scope="session") +def gs_bucket(): + storage_client = storage.Client() + + if GS_BUCKET_NAME in (b.name for b in storage_client.list_buckets()): + bucket = storage_client.get_bucket(GS_BUCKET_NAME) + yield bucket + return + + # Need to create the bucket + bucket = storage_client.bucket(GS_BUCKET_NAME) + bucket.storage_class = STANDARD_STORAGE_CLASS + yield storage_client.create_bucket(bucket, location="us-central1") + + +@pytest.fixture(scope="session") +def zipapp_gs_url(gs_bucket: storage.Bucket, service_account: str): + """ + Package the cuda_installer to a zipapp file and upload to a GS bucket. + """ + file_name = f"cuda-installer-{uuid.uuid4().hex[:8]}.pyz" + with tempfile.NamedTemporaryFile(mode="wb+", suffix=".pyz") as pyz_file: + zipapp.create_archive("../cuda_installer", pyz_file.file) + pyz_file.seek(0) + blob = gs_bucket.blob(file_name) + blob.upload_from_filename(pyz_file.name, if_generation_match=0) + blob.acl.reload() + blob.acl.user(service_account).grant_read() + blob.acl.save() + yield f"gs://{gs_bucket.name}/{blob.name}" + + @pytest.fixture(scope='module') def ssh_key(): """ @@ -150,21 +211,22 @@ def read_ssh_pubkey(ssh_key: str) -> str: @pytest.mark.parametrize("opsys,gpu", itertools.product(OPERATING_SYSTEMS, GPUS)) -def test_install_driver_for_system(ssh_key: str, opsys: Tuple[str, str], gpu: str): +def test_install_driver_for_system(zipapp_gs_url: str, service_account: str, ssh_key: str, opsys: Tuple[str, str], gpu: str): """ Run the installation test for given operating system and GPU card. """ - zone = ZONES[gpu] + zone = random.choice(ZONES[gpu]) op_sys_image = get_image_from_family(*opsys) disks = [_get_boot_disk(op_sys_image.self_link, zone)] - # We do not configure external IP to save on the billing, - # but the project you try to run this tests in needs to - # have a Cloud NAT configured, so the instances can - # download the drivers. 
network_interface = compute_v1.NetworkInterface() network_interface.name = "global/networks/default" + access = compute_v1.AccessConfig() + access.type_ = compute_v1.AccessConfig.Type.ONE_TO_ONE_NAT.name + access.name = "External NAT" + access.network_tier = access.NetworkTier.PREMIUM.name + network_interface.access_configs = [access] # GPUs accelerator = compute_v1.AcceleratorConfig() @@ -178,6 +240,10 @@ def test_install_driver_for_system(ssh_key: str, opsys: Tuple[str, str], gpu: st instance.disks = disks instance.guest_accelerators = [accelerator] instance.network_interfaces = [network_interface] + compute_sa = compute_v1.ServiceAccount() + compute_sa.email = service_account + compute_sa.scopes = ["https://www.googleapis.com/auth/cloud-platform"] + instance.service_accounts = [compute_sa] # Instance with GPU has to have LiveMigration disabled instance.scheduling = compute_v1.Scheduling() @@ -189,8 +255,8 @@ def test_install_driver_for_system(ssh_key: str, opsys: Tuple[str, str], gpu: st instance.metadata = compute_v1.Metadata() meta_item = compute_v1.Items() meta_item.key = 'startup-script' - with open(Path(__file__).parent / '../startup_script.sh') as script: - meta_item.value = script.read() + with open(Path(__file__).parent / 'startup_script.sh') as script: + meta_item.value = script.read().format(GS_INSTALLER_PATH=zipapp_gs_url) ssh_item = compute_v1.Items() ssh_item.key = 'ssh-keys' ssh_item.value = read_ssh_pubkey(ssh_key) @@ -208,36 +274,33 @@ def test_install_driver_for_system(ssh_key: str, opsys: Tuple[str, str], gpu: st instance_client = compute_v1.InstancesClient() operation_client = compute_v1.ZoneOperationsClient() - with GPU_QUOTA_SEMAPHORES[gpu]: - # Making sure not to exceed the GPU quota while executing the tests - # in multiple threads. + try: + operation = instance_client.insert_unary(request) + operation = operation_client.wait(project=PROJECT, zone=zone, operation=operation.name) + + if operation.error: + print(f"Error during instance {instance_name} creation:", operation.error, file=sys.stderr) + raise RuntimeError(operation.error) + + if operation.warnings: + msgs = [] + for warning in operation.warnings: + if warning.code != 'DISK_SIZE_LARGER_THAN_IMAGE_SIZE': + msgs.append(f" - {warning.code}: {warning.message}") + if msgs: + print(f"Warnings during instance {instance_name} creation:\n", file=sys.stderr) + for msg in msgs: + print(msg, file=sys.stderr) + + _test_body(zone, instance_name, gpu, ssh_key) + finally: try: - operation = instance_client.insert_unary(request) - operation = operation_client.wait(project=PROJECT, zone=zone, operation=operation.name) - - if operation.error: - print(f"Error during instance {instance_name} creation:", operation.error, file=sys.stderr) - raise RuntimeError(operation.error) - - if operation.warnings: - msgs = [] - for warning in operation.warnings: - if warning.code != 'DISK_SIZE_LARGER_THAN_IMAGE_SIZE': - msgs.append(f" - {warning.code}: {warning.message}") - if msgs: - print(f"Warnings during instance {instance_name} creation:\n", file=sys.stderr) - for msg in msgs: - print(msg, file=sys.stderr) - - _test_body(zone, instance_name, gpu, ssh_key) - finally: - try: - # print("This is where I'd delete the instance, but we keep it for debugging.") - operation = instance_client.delete_unary(project=PROJECT, zone=zone, instance=instance_name) - operation_client.wait(project=PROJECT, zone=zone, operation=operation.name) - except google.api_core.exceptions.NotFound: - # The instance was not properly created at all. 
- pass + # print("This is where I'd delete the instance, but we keep it for debugging.") + operation = instance_client.delete_unary(project=PROJECT, zone=zone, instance=instance_name) + operation_client.wait(project=PROJECT, zone=zone, operation=operation.name) + except google.api_core.exceptions.NotFound: + # The instance was not properly created at all. + pass def _test_body(zone: str, instance_name: str, gpu: str, ssh_key: str): @@ -255,7 +318,7 @@ def _test_body(zone: str, instance_name: str, gpu: str, ssh_key: str): process = subprocess.run( ["gcloud", "compute", "ssh", instance_name, "--zone", zone, "--ssh-key-file", ssh_key, - "--command", "ls /opt/google/gpu-installer"], + "--command", "ls /opt/google/cuda-installer"], stderr=subprocess.PIPE, stdout=subprocess.PIPE, text=True, @@ -265,9 +328,25 @@ def _test_body(zone: str, instance_name: str, gpu: str, ssh_key: str): continue else: output = process.stdout, process.stderr - if 'success' in process.stdout: + print("Output:", output) + if 'cuda_installation' in process.stdout: + # Give it some time to reboot, as in some cases it can take a while. + time.sleep(60) # Installation appears to be completed successfully - break + process = subprocess.run( + ["gcloud", "compute", "ssh", instance_name, "--zone", zone, + "--ssh-key-file", ssh_key, + "--command", "sudo python3 /opt/google/cuda-installer/cuda_installer.pyz verify_cuda"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + timeout=600 + ) + print("process.stdout: ", process.stdout) + if "Cuda Toolkit verification completed!" in process.stdout: + # Now we're sure that the installation worked. + break + pytest.fail(f"Cuda verification failed for {instance_name}!") else: print(f"Tried to run SSH connection {tries} times.") print(f"Standard output from {instance_name}:\n" + output[0])