diff --git a/centos_pkg_dirs b/centos_pkg_dirs
index 41a920acf..3c1e83657 100644
--- a/centos_pkg_dirs
+++ b/centos_pkg_dirs
@@ -85,4 +85,3 @@ python/python-webencodings
 python/python-daemon
 base/inih
 base/pf-bb-config
-gpu/gpu-operator
diff --git a/centos_tarball-dl.lst b/centos_tarball-dl.lst
index 3924a01cb..6c0fda732 100644
--- a/centos_tarball-dl.lst
+++ b/centos_tarball-dl.lst
@@ -71,4 +71,3 @@ xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github
 zstd-f4340f46b2387bc8de7d5320c0b83bb1499933ad.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/f4340f46b2387bc8de7d5320c0b83bb1499933ad#https##
 inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
 pf-bb-config-791b4f38d15377d4fbb3c9799a652acbc405b088.tar.gz#pf-bb-config-20.11#https://github.com/intel/pf-bb-config/tarball/791b4f38d15377d4fbb3c9799a652acbc405b088#https##
-gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##
diff --git a/gpu/gpu-operator/centos/build_srpm.data b/gpu/gpu-operator/centos/build_srpm.data
deleted file mode 100644
index 927c712de..000000000
--- a/gpu/gpu-operator/centos/build_srpm.data
+++ /dev/null
@@ -1,8 +0,0 @@
-VERSION=1.6.0
-TAR_NAME=gpu-operator
-TAR="$TAR_NAME-$VERSION.tar.gz"
-COPY_LIST=" \
-    $PKG_BASE/files/* \
-    $STX_BASE/downloads/$TAR"
-
-TIS_PATCH_VER=PKG_GITREVCOUNT
diff --git a/gpu/gpu-operator/centos/gpu-operator.spec b/gpu/gpu-operator/centos/gpu-operator.spec
deleted file mode 100644
index 1db9fc955..000000000
--- a/gpu/gpu-operator/centos/gpu-operator.spec
+++ /dev/null
@@ -1,45 +0,0 @@
-# Build variables
-%global app_folder /usr/local/share/applications/helm
-
-Summary: StarlingX nvidia gpu-operator helm chart
-Name: gpu-operator
-Version: 1.6.0
-Release: 0%{?_tis_dist}.%{tis_patch_ver}
-License: Apache-2.0
-Group: base
-Packager: Wind River
-URL: https://github.com/NVIDIA/gpu-operator/tree/gh-pages
-
-Source0: %{name}-%{version}.tar.gz
-
-BuildArch: noarch
-
-Patch01: deployments-setup-configmap-with-assets-for-volumemo.patch
-Patch02: enablement-support-on-starlingx-cloud-platform.patch
-
-BuildRequires: helm
-
-%define debug_package %{nil}
-%description
-StarlingX port of NVIDIA gpu-operator
-
-%prep
-%setup
-
-%patch01 -p1
-%patch02 -p1
-
-%build
-cp -r assets deployments/gpu-operator/assets
-
-helm lint deployments/gpu-operator
-mkdir build_results
-helm package --version %{version} --app-version %{version} -d build_results deployments/gpu-operator
-
-%install
-install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder}
-install -p -D -m 755 build_results/%{name}-%{version}.tgz ${RPM_BUILD_ROOT}%{helm_folder}
-
-%files
-%defattr(-,root,root,-)
-%{helm_folder}
diff --git a/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
deleted file mode 100644
index 6a7129fba..000000000
--- a/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
+++ /dev/null
@@ -1,137 +0,0 @@
-From de6068e56987960b7f3227dd4747e64b169742df Mon Sep 17 00:00:00 2001
-From: Babak Sarashki
-Date: Sat, 6 Mar 2021 00:22:40 +0000
-Subject: [PATCH] deployments: setup configmap with assets for volumemounts
-
-This feature allows inclusion of assets/ in the helm chart and their
-export to the gpu-operator pod through configmap volumeMounts.
-
-Signed-off-by: Babak Sarashki
----
- .../gpu-operator/templates/operator.yaml      | 45 +++++++++++++++++++
- .../templates/operator_configmap.yaml         | 36 +++++++++++++++
- deployments/gpu-operator/values.yaml          |  2 +
- 3 files changed, 83 insertions(+)
- create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml
-
-diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
-index 50983b20..90aa3874 100644
---- a/deployments/gpu-operator/templates/operator.yaml
-+++ b/deployments/gpu-operator/templates/operator.yaml
-@@ -50,6 +50,45 @@ spec:
-         - name: host-os-release
-           mountPath: "/host-etc/os-release"
-           readOnly: true
-+
-+        {{- if eq .Values.operator.include_assets "include_assets" }}
-+        {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
-+          subPath: {{ printf "gfd_%s" (base $path) }}
-+        {{- end }}
-+
-+        {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
-+          subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
-+        {{- end }}
-+
-+        {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
-+          subPath: {{ printf "state_device_%s" (base $path) }}
-+        {{- end }}
-+
-+        {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
-+          subPath: {{ printf "state_device_validation_%s" (base $path) }}
-+        {{- end }}
-+
-+        {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
-+          subPath: {{ printf "state_driver_%s" (base $path) }}
-+        {{- end }}
-+
-+        {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
-+        - name: assets
-+          mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
-+          subPath: {{ printf "state_monitor_%s" (base $path) }}
-+        {{- end }}
-+        {{- end }}
-+
-         readinessProbe:
-           exec:
-             command: ["stat", "/tmp/operator-sdk-ready"]
-@@ -63,6 +102,12 @@ spec:
-       - name: host-os-release
-         hostPath:
-           path: "/etc/os-release"
-+      {{- if eq .Values.operator.include_assets "include_assets" }}
-+      - name: assets
-+        configMap:
-+          name: operator-configmap
-+      {{- end }}
-+
-     {{- with .Values.operator.nodeSelector }}
-     nodeSelector:
-       {{- toYaml . | nindent 8 }}
-diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml
-new file mode 100644
-index 00000000..61f366e8
---- /dev/null
-+++ b/deployments/gpu-operator/templates/operator_configmap.yaml
-@@ -0,0 +1,36 @@
-+{{- if eq .Values.operator.include_assets "include_assets" }}
-+apiVersion: v1
-+kind: ConfigMap
-+metadata:
-+  name: operator-configmap
-+data:
-+{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }}
-+{{ printf "gfd_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+
-+{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
-+{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+
-+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
-+{{ printf "state_device_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+
-+{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
-+{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+
-+{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
-+{{ printf "state_driver_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+
-+{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
-+{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |-
-+{{ $.Files.Get $path | indent 4 }}
-+{{- end }}
-+{{- end }}
-diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
-index 00d94195..8b43c59f 100644
---- a/deployments/gpu-operator/values.yaml
-+++ b/deployments/gpu-operator/values.yaml
-@@ -39,6 +39,8 @@ operator:
-       values: [""]
-   logging:
-     timeEncoding: epoch
-+  # Set to "include_assets" to include assets/gpu-operator with the helm chart
-+  include_assets: ""
- 
- driver:
-   repository: nvcr.io/nvidia
---
-2.17.1
-
diff --git a/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch
deleted file mode 100644
index 7608a2dd3..000000000
--- a/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch
+++ /dev/null
@@ -1,590 +0,0 @@
-From eeb01daae7a39db2717198e03d2aa1e73c7130d8 Mon Sep 17 00:00:00 2001
-From: Babak Sarashki
-Date: Sun, 7 Mar 2021 17:19:08 +0000
-Subject: [PATCH] enablement: support on starlingx cloud platform
-
-StarlingX is a cloud infrastructure software stack for edge.
-It has an immutable file system and system configuration. For
-instance, changes to set containerd runtime by the gpu-operator
-will be overridden and must be avoided. The default_runtime is
-to remain docker, therefore.
-
-This commit enables gpu-operator on Starlingx (starlingx.io).
-The changes to the gpu-operator include bundling modified assets
-and a modified version of the nvidia-driver with the helm charts.
-
-The modifications to the assets include setting the runtimeClassName
-on the gpu-operator pods that require nvidia-container-runtime and
-host-mounting the kernel headers and build directory. The changes to
-the nvidia-driver account for pre-installed kernel packages.
-
-To load the operator on starlingx, define a runtimeclass with name
-and handler set to nvidia; thereafter:
-
-$ source /etc/platform/openrc
-[...(keystone_admin)]$ system service-parameter-add \
-    platform container_runtime \
-    custom_container_runtime=nvidia:/path/to/nvidia-container-runtime
-
-[...(keystone_admin)]$ system host-lock 1; system host-unlock 1
-
-Signed-off-by: Babak Sarashki
----
- .../gpu-feature-discovery/0500_daemonset.yaml |   1 +
- .../cuda-vector-add.yaml                      |   1 +
- .../0400_device_plugin.yml                    |   1 +
- assets/state-driver/0400_configmap.yaml       | 327 +++++++++++++++++-
- assets/state-driver/0500_daemonset.yaml       |  39 ++-
- assets/state-monitoring/0900_daemonset.yaml   |   1 +
- .../gpu-operator/templates/operator.yaml      |  12 +-
- deployments/gpu-operator/values.yaml          |   8 +-
- 8 files changed, 379 insertions(+), 11 deletions(-)
-
-diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml
-index 9785dc93..1589e710 100644
---- a/assets/gpu-feature-discovery/0500_daemonset.yaml
-+++ b/assets/gpu-feature-discovery/0500_daemonset.yaml
-@@ -18,6 +18,7 @@ spec:
-         app.kubernetes.io/part-of: nvidia-gpu
-     spec:
-       serviceAccount: nvidia-gpu-feature-discovery
-+      runtimeClassName: nvidia
-       containers:
-         - image: "FILLED BY THE OPERATOR"
-           name: gpu-feature-discovery
-diff --git a/assets/state-device-plugin-validation/cuda-vector-add.yaml b/assets/state-device-plugin-validation/cuda-vector-add.yaml
-index cfb547ad..8269adeb 100644
---- a/assets/state-device-plugin-validation/cuda-vector-add.yaml
-+++ b/assets/state-device-plugin-validation/cuda-vector-add.yaml
-@@ -12,6 +12,7 @@ spec:
-       effect: NoSchedule
-   readOnlyRootFilesystem: true
-   restartPolicy: OnFailure
-+  runtimeClassName: nvidia
-   initContainers:
-     - name: device-plugin-validation-init
-       image: "FILLED BY THE OPERATOR"
-diff --git a/assets/state-device-plugin/0400_device_plugin.yml b/assets/state-device-plugin/0400_device_plugin.yml
-index a5cf7fae..84e9c534 100644
---- a/assets/state-device-plugin/0400_device_plugin.yml
-+++ b/assets/state-device-plugin/0400_device_plugin.yml
-@@ -30,6 +30,7 @@ spec:
-           operator: Exists
-           effect: NoSchedule
-       serviceAccount: nvidia-device-plugin
-+      runtimeClassName: nvidia
-       initContainers:
-         - name: toolkit-validation
-           image: "FILLED BY THE OPERATOR"
-diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml
-index 48e9f51e..561adc9f 100644
---- a/assets/state-driver/0400_configmap.yaml
-+++ b/assets/state-driver/0400_configmap.yaml
-@@ -4,7 +4,7 @@ metadata:
-   name: nvidia-driver
-   namespace: gpu-operator-resources
- data:
--  oci-nvidia-hook-json: | 
-+  oci-nvidia-hook-json: |
-     {
-         "version": "1.0.0",
-         "hook": {
-@@ -20,3 +20,328 @@ data:
-         },
-         "stages": ["prestart"]
-     }
-+  nvidia-driver-build-script: |
-+    #! /bin/bash
-+    # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-+    # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier:
-+    # Apache-2.0.
-+    # This script is from: https://gitlab.com/nvidia/container-images/driver.
-+    # It is modified and included under configmap for platforms that require
-+    # pre-installed packages. Such platforms have the option to modify the
-+    # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for
-+    # further customizations.
-+
-+    set -eu
-+
-+    RUN_DIR=/run/nvidia
-+    PID_FILE=${RUN_DIR}/${0##*/}.pid
-+    DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"}
-+    KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
-+    KERNEL_VERSION="$(uname -r)"
-+
-+    # Default to 0 ; 1 is experimental and not supported
-+    export IGNORE_PREEMPT_RT_PRESENCE=0
-+
-+    # Check if the kernel version requires new precompiled driver packages.
-+    _kernel_requires_package() {
-+        local proc_mount_arg=""
-+
-+        echo "Checking NVIDIA driver packages..."
-+        cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
-+
-+        # When the kernel version is latest on host, this check fails and leads to recompilation, even when precompiled modules exist.
-+        #if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then
-+        #Not needed with pre-installed readonly headers, devel and modules
-+        #proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
-+        #fi
-+        for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
-+            is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg})
-+            if [ "${is_match}" == "kernel interface matches." ]; then
-+                echo "Found NVIDIA driver package ${pkg_name##*/}"
-+                return 1
-+            fi
-+        done
-+        return 0
-+    }
-+
-+    # Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer.
-+    _create_driver_package() (
-+        local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}"
-+        local nvidia_sign_args=""
-+        local nvidia_modeset_sign_args=""
-+        local nvidia_uvm_sign_args=""
-+
-+        trap "make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT
-+
-+        echo "Compiling NVIDIA driver kernel modules..."
-+        cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
-+
-+        export IGNORE_CC_MISMATCH=1
-+        make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null
-+
-+        echo "Relinking NVIDIA driver kernel modules..."
-+        rm -f nvidia.ko nvidia-modeset.ko
-+        ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary
-+        ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary
-+
-+        if [ -n "${PRIVATE_KEY}" ]; then
-+            echo "Signing NVIDIA driver kernel modules..."
-+            donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/kernels/$(uname -r)/scripts && \
-+                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \
-+                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \
-+                sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko"
-+            nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign"
-+            nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign"
-+            nvidia_uvm_sign_args="--signed"
-+        fi
-+
-+        echo "Building NVIDIA driver package ${pkg_name}..."
-+        ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \
-+                         --driver-version ${DRIVER_VERSION} \
-+                         --kernel-interface nv-linux.o \
-+                         --linked-module-name nvidia.ko \
-+                         --core-object-name nvidia/nv-kernel.o_binary \
-+                         ${nvidia_sign_args} \
-+                         --target-directory . \
-+                         --kernel-interface nv-modeset-linux.o \
-+                         --linked-module-name nvidia-modeset.ko \
-+                         --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \
-+                         ${nvidia_modeset_sign_args} \
-+                         --target-directory . \
-+                         --kernel-module nvidia-uvm.ko \
-+                         ${nvidia_uvm_sign_args} \
-+                         --target-directory .
-+        mkdir -p precompiled
-+        mv ${pkg_name} precompiled
-+    )
-+
-+    # Load the kernel modules and start persistenced.
-+    _load_driver() {
-+        echo "Loading IPMI kernel module..."
-+        modprobe ipmi_msghandler
-+
-+        echo "Loading NVIDIA driver kernel modules..."
-+        modprobe -a nvidia nvidia-uvm nvidia-modeset
-+
-+        echo "Starting NVIDIA persistence daemon..."
-+        nvidia-persistenced --persistence-mode
-+    }
-+
-+    # Stop persistenced and unload the kernel modules if they are currently loaded.
-+    _unload_driver() {
-+        local rmmod_args=()
-+        local nvidia_deps=0
-+        local nvidia_refs=0
-+        local nvidia_uvm_refs=0
-+        local nvidia_modeset_refs=0
-+
-+        echo "Stopping NVIDIA persistence daemon..."
-+        if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
-+            local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid)
-+
-+            kill -SIGTERM "${pid}"
-+            for i in $(seq 1 10); do
-+                kill -0 "${pid}" 2> /dev/null || break
-+                sleep 0.1
-+            done
-+            if [ $i -eq 10 ]; then
-+                echo "Could not stop NVIDIA persistence daemon" >&2
-+                return 1
-+            fi
-+        fi
-+
-+        echo "Unloading NVIDIA driver kernel modules..."
-+        if [ -f /sys/module/nvidia_modeset/refcnt ]; then
-+            nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
-+            rmmod_args+=("nvidia-modeset")
-+            ((++nvidia_deps))
-+        fi
-+        if [ -f /sys/module/nvidia_uvm/refcnt ]; then
-+            nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
-+            rmmod_args+=("nvidia-uvm")
-+            ((++nvidia_deps))
-+        fi
-+        if [ -f /sys/module/nvidia/refcnt ]; then
-+            nvidia_refs=$(< /sys/module/nvidia/refcnt)
-+            rmmod_args+=("nvidia")
-+        fi
-+        if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
-+            echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
-+            return 1
-+        fi
-+
-+        if [ ${#rmmod_args[@]} -gt 0 ]; then
-+            rmmod ${rmmod_args[@]}
-+        fi
-+        return 0
-+    }
-+
-+    # Link and install the kernel modules from a precompiled package using the nvidia-installer.
-+    _install_driver() {
-+        local install_args=()
-+
-+        echo "Installing NVIDIA driver kernel modules..."
-+        cd /usr/src/nvidia-${DRIVER_VERSION}
-+        rm -rf /lib/modules/${KERNEL_VERSION}/video
-+
-+        if [ "${ACCEPT_LICENSE}" = "yes" ]; then
-+            install_args+=("--accept-license")
-+        fi
-+        nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
-+        # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path
-+        # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point
-+        # TODO: remove the -a flag. it's not needed. in the new driver version, license-acceptance is implicit
-+        #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"}
-+    }
-+
-+    # Mount the driver rootfs into the run directory with the exception of sysfs.
-+    _mount_rootfs() {
-+        echo "Mounting NVIDIA driver rootfs..."
-+        mount --make-runbindable /sys
-+        mount --make-private /sys
-+        mkdir -p ${RUN_DIR}/driver
-+        mount --rbind / ${RUN_DIR}/driver
-+    }
-+
-+    # Unmount the driver rootfs from the run directory.
-+    _unmount_rootfs() {
-+        echo "Unmounting NVIDIA driver rootfs..."
-+        if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then
-+            umount -l -R ${RUN_DIR}/driver
-+        fi
-+    }
-+
-+    # Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS).
-+    _write_kernel_update_hook() {
-+        if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then
-+            return
-+        fi
-+
-+        echo "Writing kernel update hook..."
-+        cat > ${KERNEL_UPDATE_HOOK} <<'EOF'
-+    #!/bin/bash
-+
-+    set -eu
-+    trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR
-+
-+    NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid)
-+
-+    export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)"
-+    nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1"
-+    EOF
-+        chmod +x ${KERNEL_UPDATE_HOOK}
-+    }
-+
-+    _shutdown() {
-+        if _unload_driver; then
-+            _unmount_rootfs
-+            rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK}
-+            return 0
-+        fi
-+        return 1
-+    }
-+
-+    init() {
-+        echo -e "\n========== NVIDIA Software Installer ==========\n"
-+        echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
-+
-+        exec 3> ${PID_FILE}
-+        if ! flock -n 3; then
-+            echo "An instance of the NVIDIA driver is already running, aborting"
-+            exit 1
-+        fi
-+        echo $$ >&3
-+
-+        trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
-+        trap "_shutdown" EXIT
-+
-+        _unload_driver || exit 1
-+        _unmount_rootfs
-+
-+        if _kernel_requires_package; then
-+            _create_driver_package
-+        fi
-+
-+        _install_driver
-+        _load_driver
-+        _mount_rootfs
-+        _write_kernel_update_hook
-+
-+        echo "Done, now waiting for signal"
-+        sleep infinity &
-+        trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM
-+        trap - EXIT
-+        while true; do wait $! || continue; done
-+        exit 0
-+    }
-+
-+    update() {
-+        exec 3>&2
-+        if exec 2> /dev/null 4< ${PID_FILE}; then
-+            if ! flock -n 4 && read pid <&4 && kill -0 "${pid}"; then
-+                exec > >(tee -a "/proc/${pid}/fd/1")
-+                exec 2> >(tee -a "/proc/${pid}/fd/2" >&3)
-+            else
-+                exec 2>&3
-+            fi
-+            exec 4>&-
-+        fi
-+        exec 3>&-
-+
-+        echo -e "\n========== NVIDIA Software Updater ==========\n"
-+        echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
-+
-+        trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM
-+
-+        if _kernel_requires_package; then
-+            _create_driver_package
-+        fi
-+
-+        echo "Done"
-+        exit 0
-+    }
-+
-+    usage() {
-+        cat >&2 <
-+              cat /usr/local/bin/nvidia-driver.22 > /usr/local/bin/nvidia-driver &&
-+              chmod 755 /usr/local/bin/nvidia-driver &&
-+              mkdir -p /usr/src/kernels &&
-+              tar -C /usr/src/host-kernels/ -c $(uname -r) -f - | tar -C /usr/src/kernels/ -xf - &&
-+              rm -rf /lib/modules/ && mkdir -p /lib/modules/ &&
-+              tar -C /lib/host-modules/ -c $(uname -r) -f - | tar -C /lib/modules/ -xf - &&
-+              ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so &&
-+              /usr/local/bin/nvidia-driver init
-         securityContext:
-           privileged: true
-           seLinuxOptions:
-@@ -44,10 +55,23 @@ spec:
-             mountPropagation: Bidirectional
-           - name: config
-             mountPath: /etc/containers/oci/hooks.d
-+            subPath: oci-nvidia-hook-json
-+          - name: config
-+            mountPath: /usr/local/bin/nvidia-driver.22
-+            subPath: nvidia-driver-build-script
-           - name: var-log
-             mountPath: /var/log
-           - name: dev-log
-             mountPath: /dev/log
-+          - name: host-modules
-+            mountPath: /lib/host-modules
-+            readOnly: true
-+          - name: host-include
-+            mountPath: /usr/include
-+            readOnly: true
-+          - name: host-kernel-devel
-+            mountPath: /usr/src/host-kernels
-+            readOnly: true
-         volumes:
-           - name: run-nvidia
-             hostPath:
-@@ -58,11 +82,22 @@ spec:
-           - name: dev-log
-             hostPath:
-               path: /dev/log
-+          - name: host-modules
-+            hostPath:
-+              path: /lib/modules
-+          - name: host-kernel-devel
-+            hostPath:
-+              path: /usr/src/kernels/
-+          - name: host-include
-+            hostPath:
-+              path: /usr/include
-           - name: config
-             configMap:
-               name: nvidia-driver
-               items:
-                 - key: oci-nvidia-hook-json
-                   path: oci-nvidia-hook.json
-+                - key: nvidia-driver-build-script
-+                  path: nvidia-driver-build-script
-       nodeSelector:
-         nvidia.com/gpu.present: "true"
-diff --git a/assets/state-monitoring/0900_daemonset.yaml b/assets/state-monitoring/0900_daemonset.yaml
-index 38c4d63a..aebb4297 100644
---- a/assets/state-monitoring/0900_daemonset.yaml
-+++ b/assets/state-monitoring/0900_daemonset.yaml
-@@ -31,6 +31,7 @@ spec:
-         effect: NoSchedule
-       serviceAccount: nvidia-dcgm-exporter
-       serviceAccountName: nvidia-dcgm-exporter
-+      runtimeClassName: nvidia
-       initContainers:
-         - name: toolkit-validation
-           image: "FILLED BY THE OPERATOR"
-diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml
-index 439b78ba..90aa3874 100644
---- a/deployments/gpu-operator/templates/operator.yaml
-+++ b/deployments/gpu-operator/templates/operator.yaml
-@@ -57,38 +57,38 @@ spec:
-           mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }}
-           subPath: {{ printf "gfd_%s" (base $path) }}
-         {{- end }}
-- 
-+
-         {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }}
-         - name: assets
-           mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }}
-           subPath: {{ printf "state_container_toolkit_%s" (base $path) }}
-         {{- end }}
-- 
-+
-         {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }}
-         - name: assets
-           mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }}
-           subPath: {{ printf "state_device_%s" (base $path) }}
-         {{- end }}
-- 
-+
-         {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }}
-         - name: assets
-           mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }}
-           subPath: {{ printf "state_device_validation_%s" (base $path) }}
-         {{- end }}
-- 
-+
-         {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }}
-         - name: assets
-           mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }}
-           subPath: {{ printf "state_driver_%s" (base $path) }}
-         {{- end }}
-- 
-+
-         {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }}
-         - name: assets
-           mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }}
-           subPath: {{ printf "state_monitor_%s" (base $path) }}
-         {{- end }}
-         {{- end }}
-- 
-+
-         readinessProbe:
-           exec:
-             command: ["stat", "/tmp/operator-sdk-ready"]
-diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml
-index 8b43c59f..17662729 100644
---- a/deployments/gpu-operator/values.yaml
-+++ b/deployments/gpu-operator/values.yaml
-@@ -15,6 +15,10 @@ operator:
-   #version: 1.5.2
-   imagePullPolicy: IfNotPresent
-   imagePullSecrets: []
-+  # We cannot default to containerd because the operator modifies containerd
-+  # configuration by adding itself to it, either as the default runtime or a
-+  # runtimeclass, and restarts the service thereafter.
-+  # defaultRuntime: containerd
-   defaultRuntime: docker
-   validator:
-     image: cuda-sample
-@@ -40,7 +44,7 @@ operator:
-       values: [""]
-   logging:
-     timeEncoding: epoch
-   # Set to "include_assets" to include assets/gpu-operator with the helm chart
---  include_assets: ""
-+  include_assets: "include_assets"
- 
- driver:
-   repository: nvcr.io/nvidia
-@@ -73,7 +77,7 @@ driver:
- toolkit:
-   repository: nvcr.io/nvidia/k8s
-   image: container-toolkit
---  version: 1.4.5-ubuntu18.04
-+  version: 1.4.5-ubi8
-   imagePullPolicy: IfNotPresent
-   imagePullSecrets: []
-   env: []
---
-2.17.1
-
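Verification sketch (an editorial suggestion, not part of the change): since this diff drops gpu-operator from both CentOS build lists and deletes the gpu/gpu-operator package directory, a quick local check after applying it is to confirm that no references remain. The commands below assume a StarlingX repo checkout with the working directory at the repo root; the echo messages are illustrative only.

$ grep -rn "gpu-operator" centos_pkg_dirs centos_tarball-dl.lst || echo "no gpu-operator references remain"
$ test -e gpu/gpu-operator && echo "leftover package directory" || echo "package directory removed"

Both checks should report the removal as clean on a tree with this change applied.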