From 821d4fe856645894a9ad34a13831fa910b1489b9 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 16 Dec 2024 11:50:54 -0800 Subject: [PATCH] Rename draDriver to imexDRADriver in Clusterpolicy Signed-off-by: Christopher Desiniotis --- api/nvidia/v1/clusterpolicy_types.go | 26 +-- api/nvidia/v1/zz_generated.deepcopy.go | 82 +++++----- .../0100_service_account.yaml | 2 +- .../0200_clusterrole.yaml | 2 +- .../0300_clusterrolebinding.yaml | 6 +- .../0400_deviceclass-imex.yaml | 0 .../0500_deployment.yaml | 10 +- .../0600_configmap.yaml | 2 +- .../0700_daemonset.yaml | 31 +--- .../crd/bases/nvidia.com_clusterpolicies.yaml | 154 +++++++++--------- controllers/object_controls.go | 36 ++-- controllers/state_manager.go | 6 +- controllers/transforms_test.go | 28 ++-- .../crds/nvidia.com_clusterpolicies.yaml | 154 +++++++++--------- .../gpu-operator/templates/clusterpolicy.yaml | 36 ++-- deployments/gpu-operator/values.yaml | 2 +- 16 files changed, 282 insertions(+), 295 deletions(-) rename assets/{state-dra-driver => state-imex-dra-driver}/0100_service_account.yaml (73%) rename assets/{state-dra-driver => state-imex-dra-driver}/0200_clusterrole.yaml (87%) rename assets/{state-dra-driver => state-imex-dra-driver}/0300_clusterrolebinding.yaml (69%) rename assets/{state-dra-driver => state-imex-dra-driver}/0400_deviceclass-imex.yaml (100%) rename assets/{state-dra-driver => state-imex-dra-driver}/0500_deployment.yaml (81%) rename assets/{state-dra-driver => state-imex-dra-driver}/0600_configmap.yaml (96%) rename assets/{state-dra-driver => state-imex-dra-driver}/0700_daemonset.yaml (77%) diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index b3f45778b..6dff89512 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -54,7 +54,7 @@ type ClusterPolicySpec struct { // DevicePlugin component spec DevicePlugin DevicePluginSpec `json:"devicePlugin"` // DRADriver component spec - DRADriver 
DRADriverSpec `json:"draDriver"` + IMEXDRADriver IMEXDRADriverSpec `json:"imexDRADriver"` // DCGMExporter spec DCGMExporter DCGMExporterSpec `json:"dcgmExporter"` // DCGM component spec @@ -843,24 +843,24 @@ type SandboxDevicePluginSpec struct { Env []EnvVar `json:"env,omitempty"` } -// DRADriverSpec defines the properties for the NVIDIA DRA Driver deployment +// IMEXDRADriverSpec defines the properties for the NVIDIA IMEX DRA Driver deployment // TODO: add 'controller' and 'kubeletPlugin' structs to allow for per-component configuration -type DRADriverSpec struct { - // Enabled indicates if the deployment of NVIDIA DRA Driver through the operator is enabled +type IMEXDRADriverSpec struct { + // Enabled indicates if the deployment of NVIDIA IMEX DRA Driver through the operator is enabled // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true - // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA DRA Driver deployment through GPU Operator" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA IMEX DRA Driver deployment through GPU Operator" // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" Enabled *bool `json:"enabled,omitempty"` - // NVIDIA DRA Driver image repository + // NVIDIA IMEX DRA Driver image repository // +kubebuilder:validation:Optional Repository string `json:"repository,omitempty"` - // NVIDIA DRA Driver image name + // NVIDIA IMEX DRA Driver image name // +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+ Image string `json:"image,omitempty"` - // NVIDIA DRA Driver image tag + // NVIDIA IMEX DRA Driver image tag // +kubebuilder:validation:Optional Version string `json:"version,omitempty"` @@ -1820,9 +1820,9 @@ func ImagePath(spec interface{}) (string, error) { case *SandboxDevicePluginSpec: config := spec.(*SandboxDevicePluginSpec) return imagePath(config.Repository, 
config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE") - case *DRADriverSpec: - config := spec.(*DRADriverSpec) - return imagePath(config.Repository, config.Image, config.Version, "DRA_DRIVER_IMAGE") + case *IMEXDRADriverSpec: + config := spec.(*IMEXDRADriverSpec) + return imagePath(config.Repository, config.Image, config.Version, "IMEX_DRA_DRIVER_IMAGE") case *DCGMExporterSpec: config := spec.(*DCGMExporterSpec) return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE") @@ -1931,8 +1931,8 @@ func (p *DevicePluginSpec) IsEnabled() bool { return *p.Enabled } -// IsEnabled returns true if draDriver is enabled through gpu-operator -func (d *DRADriverSpec) IsEnabled() bool { +// IsEnabled returns true if IMEX DRA Driver is enabled through gpu-operator +func (d *IMEXDRADriverSpec) IsEnabled() bool { if d.Enabled == nil { // default is true if not specified by user return true diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go index 35319d003..fd8e29c1b 100644 --- a/api/nvidia/v1/zz_generated.deepcopy.go +++ b/api/nvidia/v1/zz_generated.deepcopy.go @@ -181,7 +181,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) { in.Driver.DeepCopyInto(&out.Driver) in.Toolkit.DeepCopyInto(&out.Toolkit) in.DevicePlugin.DeepCopyInto(&out.DevicePlugin) - in.DRADriver.DeepCopyInto(&out.DRADriver) + in.IMEXDRADriver.DeepCopyInto(&out.IMEXDRADriver) in.DCGMExporter.DeepCopyInto(&out.DCGMExporter) in.DCGM.DeepCopyInto(&out.DCGM) in.NodeStatusExporter.DeepCopyInto(&out.NodeStatusExporter) @@ -407,46 +407,6 @@ func (in *DCGMSpec) DeepCopy() *DCGMSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *DRADriverSpec) DeepCopyInto(out *DRADriverSpec) { - *out = *in - if in.Enabled != nil { - in, out := &in.Enabled, &out.Enabled - *out = new(bool) - **out = **in - } - if in.ImagePullSecrets != nil { - in, out := &in.ImagePullSecrets, &out.ImagePullSecrets - *out = make([]string, len(*in)) - copy(*out, *in) - } - if in.Resources != nil { - in, out := &in.Resources, &out.Resources - *out = new(ResourceRequirements) - (*in).DeepCopyInto(*out) - } - if in.Args != nil { - in, out := &in.Args, &out.Args - *out = make([]string, len(*in)) - copy(*out, *in) - } - if in.Env != nil { - in, out := &in.Env, &out.Env - *out = make([]EnvVar, len(*in)) - copy(*out, *in) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DRADriverSpec. -func (in *DRADriverSpec) DeepCopy() *DRADriverSpec { - if in == nil { - return nil - } - out := new(DRADriverSpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DaemonsetsSpec) DeepCopyInto(out *DaemonsetsSpec) { *out = *in @@ -919,6 +879,46 @@ func (in *HostPathsSpec) DeepCopy() *HostPathsSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *IMEXDRADriverSpec) DeepCopyInto(out *IMEXDRADriverSpec) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new IMEXDRADriverSpec. +func (in *IMEXDRADriverSpec) DeepCopy() *IMEXDRADriverSpec { + if in == nil { + return nil + } + out := new(IMEXDRADriverSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *InitContainerSpec) DeepCopyInto(out *InitContainerSpec) { *out = *in diff --git a/assets/state-dra-driver/0100_service_account.yaml b/assets/state-imex-dra-driver/0100_service_account.yaml similarity index 73% rename from assets/state-dra-driver/0100_service_account.yaml rename to assets/state-imex-dra-driver/0100_service_account.yaml index 76d6d61af..93dd391da 100644 --- a/assets/state-dra-driver/0100_service_account.yaml +++ b/assets/state-imex-dra-driver/0100_service_account.yaml @@ -1,5 +1,5 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: nvidia-dra-driver + name: nvidia-imex-dra-driver namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0200_clusterrole.yaml b/assets/state-imex-dra-driver/0200_clusterrole.yaml similarity index 87% rename from assets/state-dra-driver/0200_clusterrole.yaml rename to assets/state-imex-dra-driver/0200_clusterrole.yaml index 2c8f523e1..c30d25479 100644 --- a/assets/state-dra-driver/0200_clusterrole.yaml +++ b/assets/state-imex-dra-driver/0200_clusterrole.yaml @@ -1,7 +1,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: nvidia-dra-driver + name: nvidia-imex-dra-driver rules: # TODO: restrict RBAC for DRA driver - apiGroups: diff --git a/assets/state-dra-driver/0300_clusterrolebinding.yaml b/assets/state-imex-dra-driver/0300_clusterrolebinding.yaml similarity index 69% rename from assets/state-dra-driver/0300_clusterrolebinding.yaml rename to assets/state-imex-dra-driver/0300_clusterrolebinding.yaml index 6cabfb52d..535542137 100644 --- a/assets/state-dra-driver/0300_clusterrolebinding.yaml +++ b/assets/state-imex-dra-driver/0300_clusterrolebinding.yaml @@ -1,12 +1,12 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: nvidia-dra-driver + name: nvidia-imex-dra-driver roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: nvidia-dra-driver + name: nvidia-imex-dra-driver subjects: - kind: ServiceAccount - name: 
nvidia-dra-driver + name: nvidia-imex-dra-driver namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-dra-driver/0400_deviceclass-imex.yaml b/assets/state-imex-dra-driver/0400_deviceclass-imex.yaml similarity index 100% rename from assets/state-dra-driver/0400_deviceclass-imex.yaml rename to assets/state-imex-dra-driver/0400_deviceclass-imex.yaml diff --git a/assets/state-dra-driver/0500_deployment.yaml b/assets/state-imex-dra-driver/0500_deployment.yaml similarity index 81% rename from assets/state-dra-driver/0500_deployment.yaml rename to assets/state-imex-dra-driver/0500_deployment.yaml index 082cc4f52..5d502bf7a 100644 --- a/assets/state-dra-driver/0500_deployment.yaml +++ b/assets/state-imex-dra-driver/0500_deployment.yaml @@ -2,21 +2,21 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - app: nvidia-dra-driver-controller - name: nvidia-dra-driver-controller + app: nvidia-imex-dra-driver-controller + name: nvidia-imex-dra-driver-controller namespace: "FILLED BY THE OPERATOR" spec: replicas: 1 selector: matchLabels: - app: nvidia-dra-driver-controller + app: nvidia-imex-dra-driver-controller template: metadata: labels: - app: nvidia-dra-driver-controller + app: nvidia-imex-dra-driver-controller spec: priorityClassName: system-node-critical - serviceAccountName: nvidia-dra-driver + serviceAccountName: nvidia-imex-dra-driver tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master diff --git a/assets/state-dra-driver/0600_configmap.yaml b/assets/state-imex-dra-driver/0600_configmap.yaml similarity index 96% rename from assets/state-dra-driver/0600_configmap.yaml rename to assets/state-imex-dra-driver/0600_configmap.yaml index d910b26e3..9b0310a29 100644 --- a/assets/state-dra-driver/0600_configmap.yaml +++ b/assets/state-imex-dra-driver/0600_configmap.yaml @@ -4,7 +4,7 @@ metadata: name: nvidia-dra-driver-kubelet-plugin-entrypoint namespace: "FILLED BY THE OPERATOR" labels: - app: nvidia-dra-driver-kubelet-plugin + app: 
nvidia-imex-dra-driver-kubelet-plugin data: entrypoint.sh: |- #!/bin/bash diff --git a/assets/state-dra-driver/0700_daemonset.yaml b/assets/state-imex-dra-driver/0700_daemonset.yaml similarity index 77% rename from assets/state-dra-driver/0700_daemonset.yaml rename to assets/state-imex-dra-driver/0700_daemonset.yaml index 7654b604c..59032e69d 100644 --- a/assets/state-dra-driver/0700_daemonset.yaml +++ b/assets/state-imex-dra-driver/0700_daemonset.yaml @@ -2,13 +2,13 @@ apiVersion: apps/v1 kind: DaemonSet metadata: labels: - app: nvidia-dra-driver-kubelet-plugin - name: nvidia-dra-driver-kubelet-plugin + app: nvidia-imex-dra-driver-kubelet-plugin + name: nvidia-imex-dra-driver-kubelet-plugin namespace: "FILLED BY THE OPERATOR" spec: selector: matchLabels: - app: nvidia-dra-driver-kubelet-plugin + app: nvidia-imex-dra-driver-kubelet-plugin updateStrategy: rollingUpdate: maxSurge: 0 @@ -17,30 +17,17 @@ spec: template: metadata: labels: - app: nvidia-dra-driver-kubelet-plugin + app: nvidia-imex-dra-driver-kubelet-plugin spec: priorityClassName: system-node-critical serviceAccountName: nvidia-dra-driver - # TODO: revisit the affinity / nodeSelector for this daemonset affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: feature.node.kubernetes.io/pci-10de.present - operator: In - values: - - "true" - - matchExpressions: - - key: feature.node.kubernetes.io/cpu-model.vendor_id - operator: In - values: - - NVIDIA - - matchExpressions: - - key: nvidia.com/gpu.present - operator: In - values: - - "true" + - key: nvidia.com/gpu.imex-domain + operator: Exists initContainers: - image: "FILLED BY THE OPERATOR" name: driver-validation @@ -83,7 +70,7 @@ spec: securityContext: privileged: true volumeMounts: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint + - name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint readOnly: true mountPath: /bin/entrypoint.sh subPath: entrypoint.sh @@ -104,9 +91,9 @@ spec: - 
mountPath: /var/run/cdi name: cdi volumes: - - name: nvidia-dra-driver-kubelet-plugin-entrypoint + - name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint configMap: - name: nvidia-dra-driver-kubelet-plugin-entrypoint + name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint defaultMode: 448 - name: run-nvidia-validations hostPath: diff --git a/config/crd/bases/nvidia.com_clusterpolicies.yaml b/config/crd/bases/nvidia.com_clusterpolicies.yaml index 13ae16082..c47255cbb 100644 --- a/config/crd/bases/nvidia.com_clusterpolicies.yaml +++ b/config/crd/bases/nvidia.com_clusterpolicies.yaml @@ -596,82 +596,6 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object - draDriver: - description: DRADriver component spec - properties: - args: - description: 'Optional: List of arguments' - items: - type: string - type: array - enabled: - description: Enabled indicates if the deployment of NVIDIA DRA - Driver through the operator is enabled - type: boolean - env: - description: 'Optional: List of environment variables' - items: - description: EnvVar represents an environment variable present - in a Container. - properties: - name: - description: Name of the environment variable. - type: string - value: - description: Value of the environment variable. 
- type: string - required: - - name - type: object - type: array - image: - description: NVIDIA DRA Driver image name - pattern: '[a-zA-Z0-9\-]+' - type: string - imagePullPolicy: - description: Image pull policy - type: string - imagePullSecrets: - description: Image pull secrets - items: - type: string - type: array - repository: - description: NVIDIA DRA Driver image repository - type: string - resources: - description: 'Optional: Define resources requests and limits for - each pod' - properties: - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - version: - description: NVIDIA DRA Driver image tag - type: string - type: object driver: description: Driver component spec properties: @@ -1262,6 +1186,82 @@ spec: stop, start, or restart systemd services. 
type: string type: object + imexDRADriver: + description: DRADriver component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if the deployment of NVIDIA IMEX + DRA Driver through the operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA IMEX DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA IMEX DRA Driver image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA IMEX DRA Driver image tag + type: string + type: object kataManager: description: KataManager component spec properties: @@ -2372,9 +2372,9 @@ spec: - dcgm - dcgmExporter - devicePlugin - - draDriver - driver - gfd + - imexDRADriver - nodeStatusExporter - operator - toolkit diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 288005350..57e6e569f 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -698,7 +698,7 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error "nvidia-vgpu-device-manager": TransformVGPUDeviceManager, "nvidia-vfio-manager": TransformVFIOManager, "nvidia-container-toolkit-daemonset": TransformToolkit, - "nvidia-dra-driver-kubelet-plugin": TransformDRADriverPlugin, + "nvidia-imex-dra-driver-kubelet-plugin": TransformIMEXDRADriverPlugin, "nvidia-device-plugin-daemonset": TransformDevicePlugin, "nvidia-device-plugin-mps-control-daemon": TransformMPSControlDaemon, "nvidia-sandbox-device-plugin-daemonset": TransformSandboxDevicePlugin, @@ -1543,8 +1543,8 @@ func TransformSandboxDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPo return nil } -// TransformDRADriverPlugin transforms nvidia-dra-driver-plugin daemonset with required config as per ClusterPolicy -func TransformDRADriverPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { +// TransformIMEXDRADriverPlugin transforms nvidia-imex-dra-driver-kubelet-plugin daemonset with required config as per ClusterPolicy +func TransformIMEXDRADriverPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { //
update validation container err := transformValidationInitContainer(obj, config) if err != nil { @@ -1552,35 +1552,35 @@ func TransformDRADriverPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy } // update image - image, err := gpuv1.ImagePath(&config.DRADriver) + image, err := gpuv1.ImagePath(&config.IMEXDRADriver) if err != nil { return err } obj.Spec.Template.Spec.Containers[0].Image = image // update image pull policy - obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.DRADriver.ImagePullPolicy) + obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.IMEXDRADriver.ImagePullPolicy) // set image pull secrets - if len(config.DRADriver.ImagePullSecrets) > 0 { - addPullSecrets(&obj.Spec.Template.Spec, config.DRADriver.ImagePullSecrets) + if len(config.IMEXDRADriver.ImagePullSecrets) > 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.IMEXDRADriver.ImagePullSecrets) } // set resource limits - if config.DRADriver.Resources != nil { + if config.IMEXDRADriver.Resources != nil { // apply resource limits to all containers for i := range obj.Spec.Template.Spec.Containers { - obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.DRADriver.Resources.Requests - obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.DRADriver.Resources.Limits + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.IMEXDRADriver.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.IMEXDRADriver.Resources.Limits } } // set arguments if specified for device-plugin container - if len(config.DRADriver.Args) > 0 { - obj.Spec.Template.Spec.Containers[0].Args = config.DRADriver.Args + if len(config.IMEXDRADriver.Args) > 0 { + obj.Spec.Template.Spec.Containers[0].Args = config.IMEXDRADriver.Args } // set/append environment variables for device-plugin container - if len(config.DRADriver.Env) > 0 { - for _, env := range config.DRADriver.Env { + if 
len(config.IMEXDRADriver.Env) > 0 { + for _, env := range config.IMEXDRADriver.Env { setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) } } @@ -3716,8 +3716,8 @@ func getDaemonsetControllerRevisionHash(ctx context.Context, daemonset *appsv1.D return hash, nil } -func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { - config := spec.DRADriver +func TransformIMEXDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPolicySpec) error { + config := spec.IMEXDRADriver // update image image, err := gpuv1.ImagePath(&config) if err != nil { @@ -3739,8 +3739,8 @@ func TransformDRADriverController(obj *appsv1.Deployment, spec *gpuv1.ClusterPol func transformDeployment(obj *appsv1.Deployment, n ClusterPolicyController) error { logger := n.logger.WithValues("Deployment", obj.Name, "Namespace", obj.Namespace) switch obj.Name { - case "nvidia-dra-driver-controller": - return TransformDRADriverController(obj, &n.singleton.Spec) + case "nvidia-imex-dra-driver-controller": + return TransformIMEXDRADriverController(obj, &n.singleton.Spec) default: logger.Info("No transformation for object") return nil diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 6b7d923b0..7fbaa85f9 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -794,7 +794,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-container-toolkit") addState(n, "/opt/gpu-operator/state-operator-validation") addState(n, "/opt/gpu-operator/state-device-plugin") - addState(n, "/opt/gpu-operator/state-dra-driver") + addState(n, "/opt/gpu-operator/state-imex-dra-driver") addState(n, "/opt/gpu-operator/state-mps-control-daemon") addState(n, "/opt/gpu-operator/state-dcgm") addState(n, "/opt/gpu-operator/state-dcgm-exporter") @@ -1028,8 +1028,8 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { return true 
case "state-operator-metrics": return true - case "state-dra-driver": - return clusterPolicySpec.DRADriver.IsEnabled() + case "state-imex-dra-driver": + return clusterPolicySpec.IMEXDRADriver.IsEnabled() default: n.logger.Error(nil, "invalid state passed", "stateName", stateName) return false diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index c30d49677..3929138c3 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -1194,7 +1194,7 @@ func TestTransformSandboxValidator(t *testing.T) { } } -func TestTransformDRADriverPlugin(t *testing.T) { +func TestTransformIMEXDRADriverPlugin(t *testing.T) { testCases := []struct { description string ds Daemonset @@ -1203,23 +1203,23 @@ func TestTransformDRADriverPlugin(t *testing.T) { errorExpected bool }{ { - description: "empty draDriver spec", + description: "empty imexDRADriver spec", ds: NewDaemonset(). WithInitContainer(corev1.Container{Name: "dummy"}). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ - DRADriver: gpuv1.DRADriverSpec{}, + IMEXDRADriver: gpuv1.IMEXDRADriverSpec{}, }, expectedDs: NewDaemonset(), errorExpected: true, }, { - description: "valid draDriver spec, toolkit disabled", + description: "valid imexDRADriver spec, toolkit disabled", ds: NewDaemonset(). WithInitContainer(corev1.Container{Name: "dummy"}). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ - DRADriver: gpuv1.DRADriverSpec{ + IMEXDRADriver: gpuv1.IMEXDRADriverSpec{ Repository: "nvcr.io/nvidia/cloud-native", Image: "k8s-dra-driver", Version: "v1.0.0", @@ -1237,12 +1237,12 @@ func TestTransformDRADriverPlugin(t *testing.T) { }), }, { - description: "valid draDriver spec, toolkit enabled", + description: "valid imexDRADriver spec, toolkit enabled", ds: NewDaemonset(). WithInitContainer(corev1.Container{Name: "dummy"}). 
WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ - DRADriver: gpuv1.DRADriverSpec{ + IMEXDRADriver: gpuv1.IMEXDRADriverSpec{ Repository: "nvcr.io/nvidia/cloud-native", Image: "k8s-dra-driver", Version: "v1.0.0", @@ -1294,7 +1294,7 @@ func TestTransformDRADriverPlugin(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - err := TransformDRADriverPlugin(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{logger: ctrl.Log.WithName("test")}) + err := TransformIMEXDRADriverPlugin(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{logger: ctrl.Log.WithName("test")}) if tc.errorExpected { require.Error(t, err) return @@ -1305,7 +1305,7 @@ func TestTransformDRADriverPlugin(t *testing.T) { } } -func TestTransformDRADriverController(t *testing.T) { +func TestTransformIMEXDRADriverController(t *testing.T) { testCases := []struct { description string deployment deployment @@ -1314,21 +1314,21 @@ func TestTransformDRADriverController(t *testing.T) { errorExpected bool }{ { - description: "empty draDriver spec", + description: "empty imexDRADriver spec", deployment: NewDeployment(). WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ - DRADriver: gpuv1.DRADriverSpec{}, + IMEXDRADriver: gpuv1.IMEXDRADriverSpec{}, }, expectedDeployment: NewDeployment(), errorExpected: true, }, { - description: "valid draDriver spec", + description: "valid imexDRADriver spec", deployment: NewDeployment(). 
WithContainer(corev1.Container{Name: "dummy"}), cpSpec: &gpuv1.ClusterPolicySpec{ - DRADriver: gpuv1.DRADriverSpec{ + IMEXDRADriver: gpuv1.IMEXDRADriverSpec{ Repository: "nvcr.io/nvidia/cloud-native", Image: "k8s-dra-driver", Version: "v1.0.0", @@ -1348,7 +1348,7 @@ func TestTransformDRADriverController(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { - err := TransformDRADriverController(tc.deployment.Deployment, tc.cpSpec) + err := TransformIMEXDRADriverController(tc.deployment.Deployment, tc.cpSpec) if tc.errorExpected { require.Error(t, err) return diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index 13ae16082..c47255cbb 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -596,82 +596,6 @@ spec: description: NVIDIA Device Plugin image tag type: string type: object - draDriver: - description: DRADriver component spec - properties: - args: - description: 'Optional: List of arguments' - items: - type: string - type: array - enabled: - description: Enabled indicates if the deployment of NVIDIA DRA - Driver through the operator is enabled - type: boolean - env: - description: 'Optional: List of environment variables' - items: - description: EnvVar represents an environment variable present - in a Container. - properties: - name: - description: Name of the environment variable. - type: string - value: - description: Value of the environment variable. 
- type: string - required: - - name - type: object - type: array - image: - description: NVIDIA DRA Driver image name - pattern: '[a-zA-Z0-9\-]+' - type: string - imagePullPolicy: - description: Image pull policy - type: string - imagePullSecrets: - description: Image pull secrets - items: - type: string - type: array - repository: - description: NVIDIA DRA Driver image repository - type: string - resources: - description: 'Optional: Define resources requests and limits for - each pod' - properties: - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - version: - description: NVIDIA DRA Driver image tag - type: string - type: object driver: description: Driver component spec properties: @@ -1262,6 +1186,82 @@ spec: stop, start, or restart systemd services. 
type: string type: object + imexDRADriver: + description: DRADriver component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if the deployment of NVIDIA IMEX + DRA Driver through the operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. + type: string + required: + - name + type: object + type: array + image: + description: NVIDIA IMEX DRA Driver image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA IMEX DRA Driver image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. 
+ If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA IMEX DRA Driver image tag + type: string + type: object kataManager: description: KataManager component spec properties: @@ -2372,9 +2372,9 @@ spec: - dcgm - dcgmExporter - devicePlugin - - draDriver - driver - gfd + - imexDRADriver - nodeStatusExporter - operator - toolkit diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index b137433f4..f17fa86c8 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -459,31 +459,31 @@ spec: name: {{ .Values.devicePlugin.config.name }} default: {{ .Values.devicePlugin.config.default }} {{- end }} - draDriver: - enabled: {{ .Values.draDriver.enabled }} - {{- if .Values.draDriver.repository }} - repository: {{ .Values.draDriver.repository }} + imexDRADriver: + enabled: {{ .Values.imexDRADriver.enabled }} + {{- if .Values.imexDRADriver.repository }} + repository: {{ .Values.imexDRADriver.repository }} {{- end }} - {{- if .Values.draDriver.image }} - image: {{ .Values.draDriver.image }} + {{- if .Values.imexDRADriver.image }} + image: {{ .Values.imexDRADriver.image }} {{- end }} - {{- if .Values.draDriver.version }} - version: {{ .Values.draDriver.version | quote }} + {{- if .Values.imexDRADriver.version }} + version: {{ .Values.imexDRADriver.version | quote }} {{- end }} - {{- if .Values.draDriver.imagePullPolicy }} - imagePullPolicy: {{ .Values.draDriver.imagePullPolicy }} + {{- if .Values.imexDRADriver.imagePullPolicy }} + imagePullPolicy: {{ .Values.imexDRADriver.imagePullPolicy }} {{- end }} - {{- if .Values.draDriver.imagePullSecrets }} - imagePullSecrets: 
{{ toYaml .Values.draDriver.imagePullSecrets | nindent 6 }} + {{- if .Values.imexDRADriver.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.imexDRADriver.imagePullSecrets | nindent 6 }} {{- end }} - {{- if .Values.draDriver.resources }} - resources: {{ toYaml .Values.draDriver.resources | nindent 6 }} + {{- if .Values.imexDRADriver.resources }} + resources: {{ toYaml .Values.imexDRADriver.resources | nindent 6 }} {{- end }} - {{- if .Values.draDriver.env }} - env: {{ toYaml .Values.draDriver.env | nindent 6 }} + {{- if .Values.imexDRADriver.env }} + env: {{ toYaml .Values.imexDRADriver.env | nindent 6 }} {{- end }} - {{- if .Values.draDriver.args }} - args: {{ toYaml .Values.draDriver.args | nindent 6 }} + {{- if .Values.imexDRADriver.args }} + args: {{ toYaml .Values.imexDRADriver.args | nindent 6 }} {{- end }} dcgm: enabled: {{ .Values.dcgm.enabled }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 2f25dbdc2..431bfb586 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -294,7 +294,7 @@ devicePlugin: # MPS root path on the host root: "/run/nvidia/mps" -draDriver: +imexDRADriver: enabled: true repository: ghcr.io/nvidia image: k8s-dra-driver