-
Notifications
You must be signed in to change notification settings - Fork 313
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add DRA driver for IMEX #1143
base: main
Are you sure you want to change the base?
Add DRA driver for IMEX #1143
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,8 @@ type ClusterPolicySpec struct { | |
Toolkit ToolkitSpec `json:"toolkit"` | ||
// DevicePlugin component spec | ||
DevicePlugin DevicePluginSpec `json:"devicePlugin"` | ||
// IMEXDRADriver component spec | ||
IMEXDRADriver IMEXDRADriverSpec `json:"imexDRADriver"` | ||
// DCGMExporter spec | ||
DCGMExporter DCGMExporterSpec `json:"dcgmExporter"` | ||
// DCGM component spec | ||
|
@@ -841,6 +843,60 @@ type SandboxDevicePluginSpec struct { | |
Env []EnvVar `json:"env,omitempty"` | ||
} | ||
|
||
// IMEXDRADriverSpec defines the properties for the NVIDIA IMEX DRA Driver deployment | ||
// TODO: add 'controller' and 'kubeletPlugin' structs to allow for per-component configuration | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One question: Should we expose There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't believe we should at this point in time. But I can see where having the ability to control the |
||
type IMEXDRADriverSpec struct { | ||
// Enabled indicates if the deployment of NVIDIA IMEX DRA Driver through the operator is enabled | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA IMEX DRA Driver deployment through GPU Operator" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" | ||
Enabled *bool `json:"enabled,omitempty"` | ||
|
||
// NVIDIA IMEX DRA Driver image repository | ||
// +kubebuilder:validation:Optional | ||
Repository string `json:"repository,omitempty"` | ||
|
||
// NVIDIA IMEX DRA Driver image name | ||
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+ | ||
Image string `json:"image,omitempty"` | ||
|
||
// NVIDIA IMEX DRA Driver image tag | ||
// +kubebuilder:validation:Optional | ||
Version string `json:"version,omitempty"` | ||
|
||
// Image pull policy | ||
// +kubebuilder:validation:Optional | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy" | ||
ImagePullPolicy string `json:"imagePullPolicy,omitempty"` | ||
|
||
// Image pull secrets | ||
// +kubebuilder:validation:Optional | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" | ||
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` | ||
|
||
// Optional: Define resources requests and limits for each pod | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" | ||
Resources *ResourceRequirements `json:"resources,omitempty"` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Question -- should we include the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since the IMEX DRA Driver consists of a controller and kubeletPlugin, it feels as if I should update this so that users can configure each component independently.
|
||
|
||
// Optional: List of arguments | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" | ||
Args []string `json:"args,omitempty"` | ||
|
||
// Optional: List of environment variables | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" | ||
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" | ||
Env []EnvVar `json:"env,omitempty"` | ||
} | ||
|
||
// DCGMExporterSpec defines the properties for NVIDIA DCGM Exporter deployment | ||
type DCGMExporterSpec struct { | ||
// Enabled indicates if deployment of NVIDIA DCGM Exporter through operator is enabled | ||
|
@@ -1764,6 +1820,9 @@ func ImagePath(spec interface{}) (string, error) { | |
case *SandboxDevicePluginSpec: | ||
config := spec.(*SandboxDevicePluginSpec) | ||
return imagePath(config.Repository, config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE") | ||
case *IMEXDRADriverSpec: | ||
config := spec.(*IMEXDRADriverSpec) | ||
return imagePath(config.Repository, config.Image, config.Version, "IMEX_DRA_DRIVER_IMAGE") | ||
case *DCGMExporterSpec: | ||
config := spec.(*DCGMExporterSpec) | ||
return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE") | ||
|
@@ -1872,6 +1931,15 @@ func (p *DevicePluginSpec) IsEnabled() bool { | |
return *p.Enabled | ||
} | ||
|
||
// IsEnabled returns true if IMEX DRA Driver is enabled through gpu-operator | ||
func (d *IMEXDRADriverSpec) IsEnabled() bool { | ||
if d.Enabled == nil { | ||
// default is true if not specified by user | ||
return true | ||
} | ||
return *d.Enabled | ||
} | ||
|
||
// IsEnabled returns true if dcgm-exporter is enabled(default) through gpu-operator | ||
func (e *DCGMExporterSpec) IsEnabled() bool { | ||
if e.Enabled == nil { | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# ServiceAccount named by serviceAccountName in both the IMEX DRA driver
# controller Deployment and the kubelet-plugin DaemonSet.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nvidia-imex-dra-driver
  # Placeholder value; per its text it is filled in by the operator.
  namespace: "FILLED BY THE OPERATOR"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# ClusterRole for the IMEX DRA driver.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nvidia-imex-dra-driver
rules:
  # TODO: restrict RBAC for DRA driver
  # As written, this single rule allows every verb on every resource in the
  # core (""), apps, resource.k8s.io and gpu.nvidia.com API groups.
  - apiGroups:
      - ""
      - apps
      - resource.k8s.io
      - gpu.nvidia.com
    resources:
      - '*'
    verbs:
      - '*'
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Binds the nvidia-imex-dra-driver ClusterRole to the ServiceAccount of the
# same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nvidia-imex-dra-driver
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nvidia-imex-dra-driver
subjects:
  - kind: ServiceAccount
    name: nvidia-imex-dra-driver
    # Placeholder value; per its text it is filled in by the operator.
    namespace: "FILLED BY THE OPERATOR"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# DeviceClass selecting devices published by the gpu.nvidia.com driver whose
# "type" attribute is 'imex-channel'.
apiVersion: resource.k8s.io/v1alpha3
kind: DeviceClass
metadata:
  name: imex.nvidia.com
spec:
  selectors:
    - cel:
        expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Single-replica Deployment running the IMEX DRA driver controller.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: nvidia-imex-dra-driver-controller
  name: nvidia-imex-dra-driver-controller
  # Placeholder value; per its text it is filled in by the operator.
  namespace: "FILLED BY THE OPERATOR"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nvidia-imex-dra-driver-controller
  template:
    metadata:
      labels:
        app: nvidia-imex-dra-driver-controller
    spec:
      priorityClassName: system-node-critical
      serviceAccountName: nvidia-imex-dra-driver
      # Tolerate both the legacy "master" and the current "control-plane"
      # NoSchedule taints so the pod may be scheduled onto such nodes.
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
          operator: Exists
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
          operator: Exists
      containers:
        - name: controller
          # Placeholder value; per its text it is filled in by the operator.
          image: "FILLED BY THE OPERATOR"
          imagePullPolicy: IfNotPresent
          command: ["nvidia-dra-controller", "-v", "6"]
          env:
            - name: DEVICE_CLASSES
              value: imex
            # Downward-API values: the pod's own name and namespace.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.name
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# ConfigMap holding the entrypoint script of the IMEX DRA driver kubelet plugin.
#
# The script:
#   1. waits until /run/nvidia/validations/driver-ready exists;
#   2. sources that file with allexport set, so the variables it defines are
#      exported, and mirrors DRIVER_ROOT_CTR_PATH into CONTAINER_DRIVER_ROOT;
#   3. when MASK_NVIDIA_DRIVER_PARAMS is "true", bind-mounts an edited copy of
#      /proc/driver/nvidia/params with ModifyDeviceFiles set to 0;
#   4. execs nvidia-dra-plugin.
#
# The scratch copy of the params file lives at the absolute path
# /root/gpu-params so the masking step does not depend on the container's
# working directory.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint
  # Placeholder value; per its text it is filled in by the operator.
  namespace: "FILLED BY THE OPERATOR"
  labels:
    app: nvidia-imex-dra-driver-kubelet-plugin
data:
  entrypoint.sh: |-
    #!/bin/bash

    until [[ -f /run/nvidia/validations/driver-ready ]]
    do
      echo "waiting for the driver validations to be ready..."
      sleep 5
    done

    set -o allexport
    cat /run/nvidia/validations/driver-ready
    . /run/nvidia/validations/driver-ready
    # TODO: add an alias for DRIVER_ROOT_CTR_PATH in the k8s-dra-driver and remove the below export
    export CONTAINER_DRIVER_ROOT=$DRIVER_ROOT_CTR_PATH

    # Conditionally mask the params file to prevent this container from
    # recreating any missing GPU device nodes. This is necessary, for
    # example, when running under nvkind to limit the set GPUs governed
    # by the plugin even though it has cgroup access to all of them.
    if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
      cp /proc/driver/nvidia/params /root/gpu-params
      sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' /root/gpu-params
      mount --bind /root/gpu-params /proc/driver/nvidia/params
    fi
    echo "Starting nvidia-dra-plugin"
    exec nvidia-dra-plugin
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
# DaemonSet running the IMEX DRA driver kubelet plugin. Pods are scheduled only
# onto nodes that carry the nvidia.com/gpu.imex-domain label (any value).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nvidia-imex-dra-driver-kubelet-plugin
  name: nvidia-imex-dra-driver-kubelet-plugin
  # Placeholder value; per its text it is filled in by the operator.
  namespace: "FILLED BY THE OPERATOR"
spec:
  selector:
    matchLabels:
      app: nvidia-imex-dra-driver-kubelet-plugin
  updateStrategy:
    rollingUpdate:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: nvidia-imex-dra-driver-kubelet-plugin
    spec:
      priorityClassName: system-node-critical
      serviceAccountName: nvidia-imex-dra-driver
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: nvidia.com/gpu.imex-domain
                    operator: Exists
      initContainers:
        # Blocks pod start-up until the driver-ready marker file exists in the
        # host's /run/nvidia/validations directory.
        - image: "FILLED BY THE OPERATOR"
          name: driver-validation
          command: [ 'sh', '-c' ]
          args: [ "until [ -f /run/nvidia/validations/driver-ready ]; do echo waiting for driver to be setup; sleep 5; done" ]
          securityContext:
            privileged: true
          volumeMounts:
            - name: run-nvidia-validations
              mountPath: /run/nvidia/validations
              mountPropagation: HostToContainer
      containers:
        - name: plugin
          # Placeholder value; per its text it is filled in by the operator.
          image: "FILLED BY THE OPERATOR"
          imagePullPolicy: IfNotPresent
          # Runs the entrypoint script mounted from the ConfigMap volume below.
          command: ["/bin/bash", "-c"]
          args:
            - /bin/entrypoint.sh
          env:
            # Read by the entrypoint script; "true" enables masking of
            # /proc/driver/nvidia/params.
            - name: MASK_NVIDIA_DRIVER_PARAMS
              value: "false"
            - name: NVIDIA_VISIBLE_DEVICES
              value: void
            - name: CDI_ROOT
              value: /var/run/cdi
            - name: NVIDIA_MIG_CONFIG_DEVICES
              value: all
            - name: DEVICE_CLASSES
              value: imex
            # Downward-API values: the node the pod runs on and its namespace.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
          securityContext:
            privileged: true
          volumeMounts:
            # subPath mounts only the entrypoint.sh key of the ConfigMap as a
            # single file at /bin/entrypoint.sh.
            - name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint
              readOnly: true
              mountPath: /bin/entrypoint.sh
              subPath: entrypoint.sh
            - name: run-nvidia-validations
              mountPath: /run/nvidia/validations
            - name: driver-install-dir
              mountPath: /driver-root
              mountPropagation: HostToContainer
              readOnly: true
            - name: host-root
              mountPath: /host
              readOnly: true
            - mountPath: /var/lib/kubelet/plugins_registry
              name: plugins-registry
            - mountPath: /var/lib/kubelet/plugins
              mountPropagation: Bidirectional
              name: plugins
            - mountPath: /var/run/cdi
              name: cdi
      volumes:
        - name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint
          configMap:
            name: nvidia-imex-dra-driver-kubelet-plugin-entrypoint
            # 448 decimal == 0700 octal: read/write/execute for the owner only.
            defaultMode: 448
        - name: run-nvidia-validations
          hostPath:
            path: "/run/nvidia/validations"
            type: DirectoryOrCreate
        - name: driver-install-dir
          hostPath:
            path: "/run/nvidia/driver"
            type: DirectoryOrCreate
        - name: host-root
          hostPath:
            path: /
        - name: plugins-registry
          hostPath:
            path: /var/lib/kubelet/plugins_registry
        - name: plugins
          hostPath:
            path: /var/lib/kubelet/plugins
        - name: cdi
          hostPath:
            path: /var/run/cdi
            type: DirectoryOrCreate
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This approach seems better.