Skip to content

Commit

Permalink
Enable labels for ClusterUUID and CliqueId
Browse files Browse the repository at this point in the history
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Sep 25, 2024
1 parent 71c1fa7 commit cada570
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,10 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
{{- if .Values.imexEnabled }}
- name: imex-nodes-config
mountPath: "/etc/nvidia-imex/nodes_config.cfg"
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +203,11 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
{{- if .Values.imexEnabled }}
- name: imex-nodes-config
hostPath:
path: "/etc/nvidia-imex/nodes_config.cfg"
{{- end }}
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
1 change: 1 addition & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
imexEnabled: false
deviceDiscoveryStrategy: null

nameOverride: ""
Expand Down
71 changes: 71 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,12 @@
package lm

import (
"bufio"
"errors"
"fmt"
"math/rand"
"os"
"sort"
"strconv"
"strings"

Expand All @@ -28,6 +32,7 @@ import (

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
"github.com/google/uuid"
)

var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
Expand Down Expand Up @@ -80,13 +85,22 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

var imexLabeler Labeler
if *config.Flags.GFD.ImexNodesConfig != "" {

Check failure on line 89 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)

Check failure on line 89 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)

Check failure on line 89 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / Build

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)

Check failure on line 89 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / Unit test

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)
imexLabeler, err = newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig)

Check failure on line 90 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)) (typecheck)

Check failure on line 90 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / check

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig) (typecheck)

Check failure on line 90 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / Build

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)

Check failure on line 90 in internal/lm/nvml.go

View workflow job for this annotation

GitHub Actions / Unit test

config.Flags.GFD.ImexNodesConfig undefined (type *"github.com/NVIDIA/k8s-device-plugin/api/config/v1".GFDCommandLineFlags has no field or method ImexNodesConfig)
if err != nil {
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
}
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down Expand Up @@ -218,6 +232,41 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
return labels, nil
}

// newImexDomainLabeler creates a labeler that publishes a single label,
// "nvidia.com/gpu.imex-domain", whose value is a UUID-formatted string derived
// from the contents of configFile.
//
// The file is read line by line, one entry per line (presumably node IP
// addresses; entries are not validated here). Each entry is stripped of
// surrounding whitespace and blank lines are dropped, then the entries are
// sorted before hashing. As a result, two files that list the same entries in
// a different order, or that differ only in blank lines or stray whitespace,
// produce the same label value.
//
// An error is returned if the file cannot be opened or read.
func newImexDomainLabeler(configFile string) (Labeler, error) {
	imexConfig, err := os.Open(configFile)
	if err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}
	defer imexConfig.Close()

	// Collect the normalised, non-empty entries.
	var ips []string
	scanner := bufio.NewScanner(imexConfig)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Blank lines carry no information and must not affect the hash.
			continue
		}
		ips = append(ips, ip)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("failed to read imex config file: %w", err)
	}

	// Sort so that the resulting label does not depend on the order in which
	// entries appear in the file.
	sort.Strings(ips)

	// The joined entries seed the generator; only the first UUID is needed.
	hashedconfig := generateUUIDs(strings.Join(ips, "\n"), 1)[0]

	labels := Labels{
		"nvidia.com/gpu.imex-domain": hashedconfig,
	}

	return labels, nil
}

func getModeForClasses(classes []uint32) string {
if len(classes) == 0 {
return "unknown"
Expand Down Expand Up @@ -254,3 +303,25 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
}
return classes, nil
}

// generateUUIDs returns count UUID-formatted strings derived deterministically
// from seed. The seed string is reduced to an int64 by hash and used to seed a
// math/rand generator, so the same seed always yields the same sequence.
//
// Each UUID is built from 16 generator bytes passed to uuid.FromBytes as-is;
// this function does not itself set any version or variant bits.
func generateUUIDs(seed string, count int) []string {
	rng := rand.New(rand.NewSource(hash(seed)))

	result := make([]string, count)
	// One scratch buffer is enough: it is fully overwritten on every
	// iteration and uuid.FromBytes returns the UUID by value.
	raw := make([]byte, 16)
	for idx := range result {
		rng.Read(raw)
		// FromBytes only reports an error when the input is not exactly 16
		// bytes long, which cannot happen here, so the error is dropped.
		id, _ := uuid.FromBytes(raw)
		result[idx] = id.String()
	}

	return result
}

// hash folds the Unicode code points of s into an int64 using the polynomial
// recurrence acc = 31*acc + codePoint, starting from zero. The empty string
// hashes to 0. Arithmetic wraps on int64 overflow.
func hash(s string) int64 {
	const multiplier = 31

	var acc int64
	// Converting to []rune decodes the string into code points, matching a
	// range loop over the string itself.
	for _, r := range []rune(s) {
		acc = acc*multiplier + int64(r)
	}
	return acc
}

0 comments on commit cada570

Please sign in to comment.