From 7badc1dad154bd28a8d299d748854dad53606c82 Mon Sep 17 00:00:00 2001
From: Babak Sarashki
Date: Wed, 3 Mar 2021 12:15:52 +0000
Subject: [PATCH] integ: add nvidia gpu-operator helm charts

This commit adds the nvidia gpu-operator helm charts as a use case for
the custom container runtime feature.

To load nvidia-gpu-operator on starlingx:

system service-parameter-add platform container_runtime \
custom_container_runtime=\
nvidia:/usr/local/nvidia/toolkit/nvidia-container-runtime

And define a RuntimeClass for nvidia gpu pods:

kind: RuntimeClass
apiVersion: node.k8s.io/v1beta1
metadata:
  name: nvidia
handler: nvidia

The above directs containerd to run all pods with the nvidia
runtimeClass under nvidia-container-runtime -- where
nvidia-container-runtime is installed by the operator onto a hostMount.

Story: 2008434
Task: 41978

Signed-off-by: Babak Sarashki
Change-Id: Ifea8cdf6eb89a159f446c53566279e72fcf0e45e
---
 centos_pkg_dirs                               |   1 +
 centos_tarball-dl.lst                         |   1 +
 gpu/gpu-operator/centos/build_srpm.data       |   8 +
 gpu/gpu-operator/centos/gpu-operator.spec     |  46 ++
 ...p-configmap-with-assets-for-volumemo.patch | 137 +++++
 ...-support-on-starlingx-cloud-platform.patch | 540 ++++++++++++++++++
 6 files changed, 733 insertions(+)
 create mode 100644 gpu/gpu-operator/centos/build_srpm.data
 create mode 100644 gpu/gpu-operator/centos/gpu-operator.spec
 create mode 100644 gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch
 create mode 100644 gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch

diff --git a/centos_pkg_dirs b/centos_pkg_dirs
index 3c1e83657..41a920acf 100644
--- a/centos_pkg_dirs
+++ b/centos_pkg_dirs
@@ -85,3 +85,4 @@ python/python-webencodings
 python/python-daemon
 base/inih
 base/pf-bb-config
+gpu/gpu-operator
diff --git a/centos_tarball-dl.lst b/centos_tarball-dl.lst
index 98c06b240..7aabbc23d 100644
--- a/centos_tarball-dl.lst
+++ b/centos_tarball-dl.lst
@@ -71,3 +71,4 @@ xxHash-1f40c6511fa8dd9d2e337ca8c9bc18b3e87663c9.tar.gz#xxHash#https://api.github
 zstd-f4340f46b2387bc8de7d5320c0b83bb1499933ad.tar.gz#zstd#https://api.github.com/repos/facebook/zstd/tarball/f4340f46b2387bc8de7d5320c0b83bb1499933ad#https##
 inih-b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69.tar.gz#inih-44#https://github.com/benhoyt/inih/tarball/b1dbff4b0bd1e1f40d237e21011f6dee0ec2fa69#https##
 pf-bb-config-945712e8876be2003f2f31de70353c48501519fa.tar.gz#pf-bb-config-21.3#https://github.com/intel/pf-bb-config/tarball/945712e8876be2003f2f31de70353c48501519fa#https##
+gpu-operator-1.6.0.tar.gz#gpu-operator-1.6.0#https://github.com/NVIDIA/gpu-operator/archive/1.6.0.tar.gz##https##
diff --git a/gpu/gpu-operator/centos/build_srpm.data b/gpu/gpu-operator/centos/build_srpm.data
new file mode 100644
index 000000000..927c712de
--- /dev/null
+++ b/gpu/gpu-operator/centos/build_srpm.data
@@ -0,0 +1,8 @@
+VERSION=1.6.0
+TAR_NAME=gpu-operator
+TAR="$TAR_NAME-$VERSION.tar.gz"
+COPY_LIST=" \
+    $PKG_BASE/files/* \
+    $STX_BASE/downloads/$TAR"
+
+TIS_PATCH_VER=PKG_GITREVCOUNT
diff --git a/gpu/gpu-operator/centos/gpu-operator.spec b/gpu/gpu-operator/centos/gpu-operator.spec
new file mode 100644
index 000000000..cd1300b50
--- /dev/null
+++ b/gpu/gpu-operator/centos/gpu-operator.spec
@@ -0,0 +1,46 @@
+# Build variables
+%global helm_ver v3
+%global helm_folder /usr/lib/helm
+
+Summary: StarlingX nvidia gpu-operator helm chart
+Name: gpu-operator
+Version: 1.6.0
+Release: 0%{?_tis_dist}.%{tis_patch_ver}
+License: Apache-2.0
+Group: base
+Packager: Wind River
+URL: 
https://github.com/NVIDIA/gpu-operator/tree/gh-pages + +Source0: %{name}-%{version}.tar.gz + +BuildArch: noarch + +Patch01: deployments-setup-configmap-with-assets-for-volumemo.patch +Patch02: enablement-support-on-starlingx-cloud-platform.patch + +BuildRequires: helm + +%define debug_package %{nil} +%description +StarlingX port of NVIDIA gpu-operator + +%prep +%setup + +%patch01 -p1 +%patch02 -p1 + +%build +cp -r assets deployments/gpu-operator/assets + +helm lint deployments/gpu-operator +mkdir build_results +helm package --version %{helm_ver}-%{version}.%{tis_patch_ver} --app-version %{version} -d build_results deployments/gpu-operator + +%install +install -d -m 755 ${RPM_BUILD_ROOT}%{helm_folder} +install -p -D -m 755 build_results/%{name}-%{helm_ver}-%{version}.%{tis_patch_ver}.tgz ${RPM_BUILD_ROOT}%{helm_folder} + +%files +%defattr(-,root,root,-) +%{helm_folder} diff --git a/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch new file mode 100644 index 000000000..eeca1a38c --- /dev/null +++ b/gpu/gpu-operator/files/deployments-setup-configmap-with-assets-for-volumemo.patch @@ -0,0 +1,137 @@ +From b968c69971a195aba4e0c03e8a70df074c128f69 Mon Sep 17 00:00:00 2001 +From: Babak Sarashki +Date: Sat, 6 Mar 2021 00:22:40 +0000 +Subject: [PATCH 1/2] deployments: setup configmap with assets for volumemounts + +This feature allows inclusion of assets/ in the helm chart and their +export to the gpu-operator pod through configmap volumeMounts. + +Signed-off-by: Babak Sarashki +--- + .../gpu-operator/templates/operator.yaml | 45 +++++++++++++++++++ + .../templates/operator_configmap.yaml | 36 +++++++++++++++ + deployments/gpu-operator/values.yaml | 2 + + 3 files changed, 83 insertions(+) + create mode 100644 deployments/gpu-operator/templates/operator_configmap.yaml + +diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml +index 50983b20..1dfd9dbc 100644 +--- a/deployments/gpu-operator/templates/operator.yaml ++++ b/deployments/gpu-operator/templates/operator.yaml +@@ -50,6 +50,45 @@ spec: + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true ++ ++ {{- if eq .Values.operator.include_assets "include_assets" }} ++ {{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/gpu-feature-discovery/%s" (base $path) }} ++ subPath: {{ printf "gfd_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-container-toolkit/%s" (base $path) }} ++ subPath: {{ printf "state_container_toolkit_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin/%s" (base $path) }} ++ subPath: {{ printf "state_device_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-device-plugin-validation/%s" (base $path) }} ++ subPath: {{ printf "state_device_validation_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-driver/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-driver/%s" (base $path) }} ++ subPath: {{ printf 
"state_driver_%s" (base $path) }} ++ {{- end }} ++ ++ {{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }} ++ - name: assets ++ mountPath: {{ printf "/opt/gpu-operator/state-monitoring/%s" (base $path) }} ++ subPath: {{ printf "state_monitor_%s" (base $path) }} ++ {{- end }} ++ {{- end }} ++ + readinessProbe: + exec: + command: ["stat", "/tmp/operator-sdk-ready"] +@@ -63,6 +102,12 @@ spec: + - name: host-os-release + hostPath: + path: "/etc/os-release" ++ {{- if eq .Values.operator.include_assets "include_assets" }} ++ - name: assets ++ configMap: ++ name: operator-configmap ++ {{- end }} ++ + {{- with .Values.operator.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} +diff --git a/deployments/gpu-operator/templates/operator_configmap.yaml b/deployments/gpu-operator/templates/operator_configmap.yaml +new file mode 100644 +index 00000000..61f366e8 +--- /dev/null ++++ b/deployments/gpu-operator/templates/operator_configmap.yaml +@@ -0,0 +1,36 @@ ++{{- if eq .Values.operator.include_assets "include_assets" }} ++apiVersion: v1 ++kind: ConfigMap ++metadata: ++ name: operator-configmap ++data: ++{{- range $path, $_ := .Files.Glob "assets/gpu-feature-discovery/*" }} ++{{ printf "gfd_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++ ++{{- range $path, $_ := .Files.Glob "assets/state-container-toolkit/*" }} ++{{ printf "state_container_toolkit_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++ ++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin/*" }} ++{{ printf "state_device_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++ ++{{- range $path, $_ := .Files.Glob "assets/state-device-plugin-validation/*" }} ++{{ printf "state_device_validation_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++ ++{{- range $path, $_ := .Files.Glob "assets/state-driver/*" }} ++{{ printf "state_driver_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++ ++{{- range $path, $_ := .Files.Glob "assets/state-monitoring/*" }} ++{{ printf "state_monitor_%s" (base $path) | indent 2 }}: |- ++{{ $.Files.Get $path | indent 4 }} ++{{- end }} ++{{- end }} +diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml +index 00d94195..8b43c59f 100644 +--- a/deployments/gpu-operator/values.yaml ++++ b/deployments/gpu-operator/values.yaml +@@ -39,6 +39,8 @@ operator: + values: [""] + logging: + timeEncoding: epoch ++ # Set to "include_assets" to include assets/gpu-operator with the helm chart ++ include_assets: "" + + driver: + repository: nvcr.io/nvidia +-- +2.17.1 + diff --git a/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch new file mode 100644 index 000000000..096965e52 --- /dev/null +++ b/gpu/gpu-operator/files/enablement-support-on-starlingx-cloud-platform.patch @@ -0,0 +1,540 @@ +From 74c08e4ce69b80e8c5687d01c6bd1a4752233e20 Mon Sep 17 00:00:00 2001 +From: Babak Sarashki +Date: Sun, 7 Mar 2021 17:19:08 +0000 +Subject: [PATCH 2/2] enablement: support on starlingx cloud platform + +StarlingX is a cloud infrastructure software stack for edge. +It has an immutable file system, and system configruation. For +instance changes to set containerd runtime by the gpu-operator +will be overriden and must be avoided. The default_runtime is +to remain docker, therefore. 
+ +This commit enables gpu-operator on Starlingx (starlingx.io). +The changes to the gpu-operator include bundling modified assets +and a modified version of the nvidia-driver with the helm charts. + +The modficiations to the assets include setting the runtimeClassName +on the gpu-operator pods that require nvidia-container-runtime and +host-mounting the kernel headers and build directory. The changes to +the nvidia-driver account for pre-installed kernel packages. + +To load the operator on starlingx, define a runtimeclass with name +and handler set to nvidia; thereafter: + +$ source /etc/platform/openrc +[...(keystone_admin)]$ system service-parameter-add \ + platform container_runtime \ + custom_container_runtime=nvidia:/path/to/nvidia-container-runtime + +[...(keystone_admin)]$ system host-lock 1; system host-unlock 1 + +Signed-off-by: Babak Sarashki +--- + .../gpu-feature-discovery/0500_daemonset.yaml | 1 + + .../cuda-vector-add.yaml | 1 + + .../0400_device_plugin.yml | 1 + + assets/state-driver/0400_configmap.yaml | 327 +++++++++++++++++- + assets/state-driver/0500_daemonset.yaml | 39 ++- + assets/state-monitoring/0900_daemonset.yaml | 1 + + deployments/gpu-operator/values.yaml | 8 +- + 7 files changed, 373 insertions(+), 5 deletions(-) + +diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml +index 9785dc93..1589e710 100644 +--- a/assets/gpu-feature-discovery/0500_daemonset.yaml ++++ b/assets/gpu-feature-discovery/0500_daemonset.yaml +@@ -18,6 +18,7 @@ spec: + app.kubernetes.io/part-of: nvidia-gpu + spec: + serviceAccount: nvidia-gpu-feature-discovery ++ runtimeClassName: nvidia + containers: + - image: "FILLED BY THE OPERATOR" + name: gpu-feature-discovery +diff --git a/assets/state-device-plugin-validation/cuda-vector-add.yaml b/assets/state-device-plugin-validation/cuda-vector-add.yaml +index cfb547ad..8269adeb 100644 +--- a/assets/state-device-plugin-validation/cuda-vector-add.yaml ++++ b/assets/state-device-plugin-validation/cuda-vector-add.yaml +@@ -12,6 +12,7 @@ spec: + effect: NoSchedule + readOnlyRootFilesystem: true + restartPolicy: OnFailure ++ runtimeClassName: nvidia + initContainers: + - name: device-plugin-validation-init + image: "FILLED BY THE OPERATOR" +diff --git a/assets/state-device-plugin/0400_device_plugin.yml b/assets/state-device-plugin/0400_device_plugin.yml +index a5cf7fae..84e9c534 100644 +--- a/assets/state-device-plugin/0400_device_plugin.yml ++++ b/assets/state-device-plugin/0400_device_plugin.yml +@@ -30,6 +30,7 @@ spec: + operator: Exists + effect: NoSchedule + serviceAccount: nvidia-device-plugin ++ runtimeClassName: nvidia + initContainers: + - name: toolkit-validation + image: "FILLED BY THE OPERATOR" +diff --git a/assets/state-driver/0400_configmap.yaml b/assets/state-driver/0400_configmap.yaml +index 48e9f51e..561adc9f 100644 +--- a/assets/state-driver/0400_configmap.yaml ++++ b/assets/state-driver/0400_configmap.yaml +@@ -4,7 +4,7 @@ metadata: + name: nvidia-driver + namespace: gpu-operator-resources + data: +- oci-nvidia-hook-json: | ++ oci-nvidia-hook-json: | + { + "version": "1.0.0", + "hook": { +@@ -20,3 +20,328 @@ data: + }, + "stages": ["prestart"] + } ++ nvidia-driver-build-script: | ++ #! /bin/bash ++ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. ++ # Copyright (c) 2021 Wind River Systems, Inc. SPDX-License-Identifier: ++ # Apache-2.0. ++ # This script is from: https://gitlab.com/nvidia/container-images/driver. 
++ # It is modified and included under configmap for platforms that require ++ # pre-installed packages. Such platforms have the option to modify the ++ # entrypoint in 0500_daemonset.yaml, or the nvidia-driver script here for ++ # further customizations. ++ ++ set -eu ++ ++ RUN_DIR=/run/nvidia ++ PID_FILE=${RUN_DIR}/${0##*/}.pid ++ DRIVER_VERSION=${DRIVER_VERSION:?"Missing driver version"} ++ KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver ++ KERNEL_VERSION="$(uname -r)" ++ ++ # Default to 0 ; 1 is experimental and not supported ++ export IGNORE_PREEMPT_RT_PRESENCE=0 ++ ++ # Check if the kernel version requires a new precompiled driver packages. ++ _kernel_requires_package() { ++ local proc_mount_arg="" ++ ++ echo "Checking NVIDIA driver packages..." ++ cd /usr/src/nvidia-${DRIVER_VERSION}/kernel ++ ++ # When the kernel version is latest on host, this check fails and lead to recompilation, even when precompiled modules exist. ++ #if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then ++ #Not needed with pre-installed readonly headers, devel and modules ++ #proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc" ++ #fi ++ for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do ++ is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg}) ++ if [ "${is_match}" == "kernel interface matches." ]; then ++ echo "Found NVIDIA driver package ${pkg_name##*/}" ++ return 1 ++ fi ++ done ++ return 0 ++ } ++ ++ # Compile the kernel modules, optionally sign them, and generate a precompiled package for use by the nvidia-installer. ++ _create_driver_package() ( ++ local pkg_name="nvidia-modules-${KERNEL_VERSION%%-*}${PACKAGE_TAG:+-${PACKAGE_TAG}}" ++ local nvidia_sign_args="" ++ local nvidia_modeset_sign_args="" ++ local nvidia_uvm_sign_args="" ++ ++ trap "make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT ++ ++ echo "Compiling NVIDIA driver kernel modules..." ++ cd /usr/src/nvidia-${DRIVER_VERSION}/kernel ++ ++ export IGNORE_CC_MISMATCH=1 ++ make -s -j 4 SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null ++ ++ echo "Relinking NVIDIA driver kernel modules..." ++ rm -f nvidia.ko nvidia-modeset.ko ++ ld -d -r -o nvidia.ko ./nv-linux.o ./nvidia/nv-kernel.o_binary ++ ld -d -r -o nvidia-modeset.ko ./nv-modeset-linux.o ./nvidia-modeset/nv-modeset-kernel.o_binary ++ ++ if [ -n "${PRIVATE_KEY}" ]; then ++ echo "Signing NVIDIA driver kernel modules..." ++ donkey get ${PRIVATE_KEY} sh -c "PATH=${PATH}:/usr/src/kernels/$(uname -r)/scripts && \ ++ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia.ko nvidia.ko.sign && \ ++ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-modeset.ko nvidia-modeset.ko.sign && \ ++ sign-file sha512 \$DONKEY_FILE pubkey.x509 nvidia-uvm.ko" ++ nvidia_sign_args="--linked-module nvidia.ko --signed-module nvidia.ko.sign" ++ nvidia_modeset_sign_args="--linked-module nvidia-modeset.ko --signed-module nvidia-modeset.ko.sign" ++ nvidia_uvm_sign_args="--signed" ++ fi ++ ++ echo "Building NVIDIA driver package ${pkg_name}..." ++ ../mkprecompiled --pack ${pkg_name} --description ${KERNEL_VERSION} \ ++ --driver-version ${DRIVER_VERSION} \ ++ --kernel-interface nv-linux.o \ ++ --linked-module-name nvidia.ko \ ++ --core-object-name nvidia/nv-kernel.o_binary \ ++ ${nvidia_sign_args} \ ++ --target-directory . 
\ ++ --kernel-interface nv-modeset-linux.o \ ++ --linked-module-name nvidia-modeset.ko \ ++ --core-object-name nvidia-modeset/nv-modeset-kernel.o_binary \ ++ ${nvidia_modeset_sign_args} \ ++ --target-directory . \ ++ --kernel-module nvidia-uvm.ko \ ++ ${nvidia_uvm_sign_args} \ ++ --target-directory . ++ mkdir -p precompiled ++ mv ${pkg_name} precompiled ++ ) ++ ++ # Load the kernel modules and start persistenced. ++ _load_driver() { ++ echo "Loading IPMI kernel module..." ++ modprobe ipmi_msghandler ++ ++ echo "Loading NVIDIA driver kernel modules..." ++ modprobe -a nvidia nvidia-uvm nvidia-modeset ++ ++ echo "Starting NVIDIA persistence daemon..." ++ nvidia-persistenced --persistence-mode ++ } ++ ++ # Stop persistenced and unload the kernel modules if they are currently loaded. ++ _unload_driver() { ++ local rmmod_args=() ++ local nvidia_deps=0 ++ local nvidia_refs=0 ++ local nvidia_uvm_refs=0 ++ local nvidia_modeset_refs=0 ++ ++ echo "Stopping NVIDIA persistence daemon..." ++ if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then ++ local pid=$(< /var/run/nvidia-persistenced/nvidia-persistenced.pid) ++ ++ kill -SIGTERM "${pid}" ++ for i in $(seq 1 10); do ++ kill -0 "${pid}" 2> /dev/null || break ++ sleep 0.1 ++ done ++ if [ $i -eq 10 ]; then ++ echo "Could not stop NVIDIA persistence daemon" >&2 ++ return 1 ++ fi ++ fi ++ ++ echo "Unloading NVIDIA driver kernel modules..." ++ if [ -f /sys/module/nvidia_modeset/refcnt ]; then ++ nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt) ++ rmmod_args+=("nvidia-modeset") ++ ((++nvidia_deps)) ++ fi ++ if [ -f /sys/module/nvidia_uvm/refcnt ]; then ++ nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt) ++ rmmod_args+=("nvidia-uvm") ++ ((++nvidia_deps)) ++ fi ++ if [ -f /sys/module/nvidia/refcnt ]; then ++ nvidia_refs=$(< /sys/module/nvidia/refcnt) ++ rmmod_args+=("nvidia") ++ fi ++ if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then ++ echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2 ++ return 1 ++ fi ++ ++ if [ ${#rmmod_args[@]} -gt 0 ]; then ++ rmmod ${rmmod_args[@]} ++ fi ++ return 0 ++ } ++ ++ # Link and install the kernel modules from a precompiled package using the nvidia-installer. ++ _install_driver() { ++ local install_args=() ++ ++ echo "Installing NVIDIA driver kernel modules..." ++ cd /usr/src/nvidia-${DRIVER_VERSION} ++ rm -rf /lib/modules/${KERNEL_VERSION}/video ++ ++ if [ "${ACCEPT_LICENSE}" = "yes" ]; then ++ install_args+=("--accept-license") ++ fi ++ nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"} ++ # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path ++ # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point ++ # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit ++ #nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check --no-cc-version-check --proc-mount-point /lib/modules/${KERNEL_VERSION}/proc ${install_args[@]+"${install_args[@]}"} ++ } ++ ++ # Mount the driver rootfs into the run directory with the exception of sysfs. ++ _mount_rootfs() { ++ echo "Mounting NVIDIA driver rootfs..." ++ mount --make-runbindable /sys ++ mount --make-private /sys ++ mkdir -p ${RUN_DIR}/driver ++ mount --rbind / ${RUN_DIR}/driver ++ } ++ ++ # Unmount the driver rootfs from the run directory. 
++ _unmount_rootfs() { ++ echo "Unmounting NVIDIA driver rootfs..." ++ if findmnt -r -o TARGET | grep "${RUN_DIR}/driver" > /dev/null; then ++ umount -l -R ${RUN_DIR}/driver ++ fi ++ } ++ ++ # Write a kernel postinst.d script to automatically precompile packages on kernel update (similar to DKMS). ++ _write_kernel_update_hook() { ++ if [ ! -d ${KERNEL_UPDATE_HOOK%/*} ]; then ++ return ++ fi ++ ++ echo "Writing kernel update hook..." ++ cat > ${KERNEL_UPDATE_HOOK} <<'EOF' ++ #!/bin/bash ++ ++ set -eu ++ trap 'echo "ERROR: Failed to update the NVIDIA driver" >&2; exit 0' ERR ++ ++ NVIDIA_DRIVER_PID=$(< /run/nvidia/nvidia-driver.pid) ++ ++ export "$(grep -z DRIVER_VERSION /proc/${NVIDIA_DRIVER_PID}/environ)" ++ nsenter -t "${NVIDIA_DRIVER_PID}" -m -- nvidia-driver update --kernel "$1" ++ EOF ++ chmod +x ${KERNEL_UPDATE_HOOK} ++ } ++ ++ _shutdown() { ++ if _unload_driver; then ++ _unmount_rootfs ++ rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} ++ return 0 ++ fi ++ return 1 ++ } ++ ++ init() { ++ echo -e "\n========== NVIDIA Software Installer ==========\n" ++ echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" ++ ++ exec 3> ${PID_FILE} ++ if ! flock -n 3; then ++ echo "An instance of the NVIDIA driver is already running, aborting" ++ exit 1 ++ fi ++ echo $$ >&3 ++ ++ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM ++ trap "_shutdown" EXIT ++ ++ _unload_driver || exit 1 ++ _unmount_rootfs ++ ++ if _kernel_requires_package; then ++ _create_driver_package ++ fi ++ ++ _install_driver ++ _load_driver ++ _mount_rootfs ++ _write_kernel_update_hook ++ ++ echo "Done, now waiting for signal" ++ sleep infinity & ++ trap "echo 'Caught signal'; _shutdown && { kill $!; exit 0; }" HUP INT QUIT PIPE TERM ++ trap - EXIT ++ while true; do wait $! || continue; done ++ exit 0 ++ } ++ ++ update() { ++ exec 3>&2 ++ if exec 2> /dev/null 4< ${PID_FILE}; then ++ if ! 
flock -n 4 && read pid <&4 && kill -0 "${pid}"; then ++ exec > >(tee -a "/proc/${pid}/fd/1") ++ exec 2> >(tee -a "/proc/${pid}/fd/2" >&3) ++ else ++ exec 2>&3 ++ fi ++ exec 4>&- ++ fi ++ exec 3>&- ++ ++ echo -e "\n========== NVIDIA Software Updater ==========\n" ++ echo -e "Starting update of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n" ++ ++ trap "echo 'Caught signal'; exit 1" HUP INT QUIT PIPE TERM ++ ++ if _kernel_requires_package; then ++ _create_driver_package ++ fi ++ ++ echo "Done" ++ exit 0 ++ } ++ ++ usage() { ++ cat >&2 < ++ cat /usr/local/bin/nvidia-driver.22 > /usr/local/bin/nvidia-driver && ++ chmod 755 /usr/local/bin/nvidia-driver && ++ mkdir -p /usr/src/kernels && ++ tar -C /usr/src/host-kernels/ -c $(uname -r) -f - | tar -C /usr/src/kernels/ -xf - && ++ rm -rf /lib/modules/ && mkdir -p /lib/modules/ && ++ tar -C /lib/host-modules/ -c $(uname -r) -f - | tar -C /lib/modules/ -xf - && ++ ln -rfs /usr/lib64/libelf.so.1 /usr/lib/libelf.so && ++ /usr/local/bin/nvidia-driver init + securityContext: + privileged: true + seLinuxOptions: +@@ -44,10 +55,23 @@ spec: + mountPropagation: Bidirectional + - name: config + mountPath: /etc/containers/oci/hooks.d ++ subPath: oci-nvidia-hook-json ++ - name: config ++ mountPath: /usr/local/bin/nvidia-driver.22 ++ subPath: nvidia-driver-build-script + - name: var-log + mountPath: /var/log + - name: dev-log + mountPath: /dev/log ++ - name: host-modules ++ mountPath: /lib/host-modules ++ readOnly: true ++ - name: host-include ++ mountPath: /usr/include ++ readOnly: true ++ - name: host-kernel-devel ++ mountPath: /usr/src/host-kernels ++ readOnly: true + volumes: + - name: run-nvidia + hostPath: +@@ -58,11 +82,22 @@ spec: + - name: dev-log + hostPath: + path: /dev/log ++ - name: host-modules ++ hostPath: ++ path: /lib/modules ++ - name: host-kernel-devel ++ hostPath: ++ path: /usr/src/kernels/ ++ - name: host-include ++ hostPath: ++ path: /usr/include + - name: config + configMap: + name: nvidia-driver + items: + - key: oci-nvidia-hook-json + path: oci-nvidia-hook.json ++ - key: nvidia-driver-build-script ++ path: nvidia-driver-build-script + nodeSelector: + nvidia.com/gpu.present: "true" +diff --git a/assets/state-monitoring/0900_daemonset.yaml b/assets/state-monitoring/0900_daemonset.yaml +index 38c4d63a..aebb4297 100644 +--- a/assets/state-monitoring/0900_daemonset.yaml ++++ b/assets/state-monitoring/0900_daemonset.yaml +@@ -31,6 +31,7 @@ spec: + effect: NoSchedule + serviceAccount: nvidia-dcgm-exporter + serviceAccountName: nvidia-dcgm-exporter ++ runtimeClassName: nvidia + initContainers: + - name: toolkit-validation + image: "FILLED BY THE OPERATOR" +diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml +index 8b43c59f..17662729 100644 +--- a/deployments/gpu-operator/values.yaml ++++ b/deployments/gpu-operator/values.yaml +@@ -15,6 +15,10 @@ operator: + #version: 1.5.2 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] ++ # We cannot default to containerd because the operator modifies containerd ++ # configuration by adding itself to it, either as the default runtime or a ++ # runtimeclass, and restarts the service thereafter. 
++ # defaultRuntime: containerd + defaultRuntime: docker + validator: + image: cuda-sample +@@ -40,7 +44,7 @@ operator: + logging: + timeEncoding: epoch + # Set to "include_assets" to include assets/gpu-operator with the helm chart +- include_assets: "" ++ include_assets: "include_assets" + + driver: + repository: nvcr.io/nvidia +@@ -73,7 +77,7 @@ driver: + toolkit: + repository: nvcr.io/nvidia/k8s + image: container-toolkit +- version: 1.4.5-ubuntu18.04 ++ version: 1.4.5-ubi8 + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + env: [] +-- +2.17.1 +
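
For reviewers trying this change out, the following is a minimal verification sketch and is not part of the patch itself. It assumes the chart has been packaged into /usr/lib/helm by gpu-operator.spec above, that the nvidia RuntimeClass and the custom_container_runtime service parameter from the commit messages are already in place, and that the gpu-operator release name is unused; the exact .tgz suffix depends on TIS_PATCH_VER, so check the actual file name first.

# Sketch only: release name, chart path and namespace below are assumptions,
# not part of this patch.

# Install the chart packaged by gpu-operator.spec (version suffix is
# build-dependent; confirm the file name under /usr/lib/helm).
helm install gpu-operator /usr/lib/helm/gpu-operator-v3-1.6.0.*.tgz

# Confirm the RuntimeClass from the commit message exists and maps to the
# nvidia handler configured through custom_container_runtime.
kubectl get runtimeclass nvidia

# Once the operator has created its daemonsets, confirm the patched assets
# set runtimeClassName: nvidia on the GPU pods.
kubectl get pods -n gpu-operator-resources \
  -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.runtimeClassName}{"\n"}{end}'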