Upgrade to latest NVIDIA NVML package
Harish Senthilkumar committed Jan 8, 2025
1 parent c46c0f4 commit b036ab0
Showing 45 changed files with 32,825 additions and 7,411 deletions.
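
In summary, this commit replaces the older github.com/NVIDIA/gpu-monitoring-tools NVML bindings with the official github.com/NVIDIA/go-nvml module. The new API reports failures as nvml.Return status codes rather than Go errors, so the ecs-init wrappers now compare each call against nvml.SUCCESS and convert failures with nvml.ErrorString; device UUIDs are read through a device handle (DeviceGetHandleByIndex plus DeviceGetUUID) instead of a field on a device struct, and the unit tests are rewritten around a generated gomock Device mock. A consolidated sketch of the new call pattern follows the nvidia_gpu_manager.go diff below.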
4 changes: 2 additions & 2 deletions ecs-init/go.mod
@@ -3,7 +3,7 @@ module github.com/aws/amazon-ecs-agent/ecs-init
go 1.22

require (
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/aws/aws-sdk-go-v2 v1.31.0
github.com/aws/aws-sdk-go-v2/config v1.27.37
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.14
@@ -16,7 +16,7 @@ require (
github.com/fsouza/go-dockerclient v1.10.1
github.com/golang/mock v1.6.0
github.com/pkg/errors v0.9.1
github.com/stretchr/testify v1.8.4
github.com/stretchr/testify v1.9.0
)

require (
8 changes: 4 additions & 4 deletions ecs-init/go.sum
@@ -6,8 +6,8 @@ github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migc
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
github.com/Microsoft/hcsshim v0.9.10 h1:TxXGNmcbQxBKVWvjvTocNb6jrPyeHlk5EiDhhgHgggs=
github.com/Microsoft/hcsshim v0.9.10/go.mod h1:7pLA8lDk46WKDWlVsENo92gC0XFa8rbKfyFRBqxEbCc=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
github.com/aws/aws-sdk-go-v2 v1.31.0 h1:3V05LbxTSItI5kUqNwhJrrrY1BAXxXt0sN0l72QmG5U=
github.com/aws/aws-sdk-go-v2 v1.31.0/go.mod h1:ztolYtaEUtdpf9Wftr31CJfLVjOnD/CVRkKOOYgF8hA=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.5 h1:xDAuZTn4IMm8o1LnBZvmrL8JA1io4o3YWNXgohbf20g=
@@ -113,8 +113,8 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
9,299 changes: 9,299 additions & 0 deletions ecs-init/gpu/mocks/mock_nvml_device.go

Large diffs are not rendered by default.
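The new mocks/mock_nvml_device.go file is a gomock-generated mock of the go-nvml Device interface; it is what lets the rewritten tests stub GetUUID without touching real hardware. The commit does not show how the file was produced, so the directive below is only an assumed sketch of a typical mockgen invocation, with the destination path, package name, and interface name inferred from how the tests import and use the mock.

```go
package gpu

// Assumed generation step only; the exact command is not part of this commit.
// mockgen (from github.com/golang/mock, already in go.mod above) can generate a
// mock of the go-nvml Device interface in reflect mode:
//
//go:generate mockgen -destination=mocks/mock_nvml_device.go -package=mocks github.com/NVIDIA/go-nvml/pkg/nvml Device
```

With a directive like this in place, running go generate ./... would refresh the mock whenever the upstream interface changes.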

51 changes: 32 additions & 19 deletions ecs-init/gpu/nvidia_gpu_manager.go
@@ -18,7 +18,7 @@ import (
"os"
"path/filepath"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/cihub/seelog"
"github.com/pkg/errors"
)
@@ -129,7 +129,11 @@ func (n *NvidiaGPUManager) Initialize() error {
var InitializeNVML = InitNVML

func InitNVML() error {
return nvml.Init()
ret := nvml.Init()
if ret != nvml.SUCCESS {
return errors.New(nvml.ErrorString(ret))
}
return nil
}

// Shutdown is for shutting down nvidia's nvml library
@@ -144,7 +148,11 @@ func (n *NvidiaGPUManager) Shutdown() error {
var ShutdownNVML = ShutdownNVMLib

func ShutdownNVMLib() error {
return nvml.Shutdown()
ret := nvml.Shutdown()
if ret != nvml.SUCCESS {
return errors.New(nvml.ErrorString(ret))
}
return nil
}

// GetDriverVersion is for getting Nvidia driver version on the instance
@@ -159,7 +167,11 @@ func (n *NvidiaGPUManager) GetDriverVersion() (string, error) {
var NvmlGetDriverVersion = GetNvidiaDriverVersion

func GetNvidiaDriverVersion() (string, error) {
return nvml.GetDriverVersion()
version, ret := nvml.SystemGetDriverVersion()
if ret != nvml.SUCCESS {
return "", errors.New(nvml.ErrorString(ret))
}
return version, nil
}

// GetGPUDeviceIDs is for getting the GPU device UUIDs
@@ -169,14 +181,18 @@ func (n *NvidiaGPUManager) GetGPUDeviceIDs() ([]string, error) {
return nil, errors.Wrapf(err, "error getting GPU device count for UUID detection")
}
var gpuIDs []string
var i uint
for i = 0; i < count; i++ {
device, err := NvmlNewDeviceLite(i)
if err != nil {
seelog.Errorf("error initializing device of index %d: %v", i, err)
for i := 0; i < count; i++ {
device, ret := nvml.DeviceGetHandleByIndex(i)
if ret != nvml.SUCCESS {
seelog.Errorf("Error initializing device of index %d: %v", i, nvml.ErrorString(ret))
continue
}
gpuIDs = append(gpuIDs, device.UUID)
uuid, ret := nvml.DeviceGetUUID(device)
if ret != nvml.SUCCESS {
seelog.Errorf("Failed to get UUID for device at index %d: %v", i, nvml.ErrorString(ret))
continue
}
gpuIDs = append(gpuIDs, uuid)
}
if len(gpuIDs) == 0 {
return gpuIDs, errors.New("error initializing GPU devices")
@@ -187,15 +203,12 @@ func (n *NvidiaGPUManager) GetGPUDeviceIDs() ([]string, error) {
var NvmlGetDeviceCount = GetDeviceCount

// GetDeviceCount is for getting the number of GPU devices in the instance
func GetDeviceCount() (uint, error) {
return nvml.GetDeviceCount()
}

var NvmlNewDeviceLite = NewDeviceLite

// NewDeviceLite is for initializing a new GPU device
func NewDeviceLite(idx uint) (*nvml.Device, error) {
return nvml.NewDeviceLite(idx)
func GetDeviceCount() (int, error) {
count, ret := nvml.DeviceGetCount()
if ret != nvml.SUCCESS {
return 0, errors.New(nvml.ErrorString(ret))
}
return int(count), nil
}

// SaveGPUState saves gpu state info on the disk
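Taken together, the hunks above swap the old error-returning helpers (nvml.Init, nvml.GetDriverVersion, nvml.GetDeviceCount, nvml.NewDeviceLite) for go-nvml calls that return an nvml.Return status. The following sketch is not part of the commit; it is a minimal standalone program, assuming a host with the NVIDIA driver present, that chains the same calls the way the rewritten wrappers do.

```go
package main

import (
	"fmt"
	"log"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	// Every go-nvml call returns an nvml.Return; anything other than
	// nvml.SUCCESS is converted to a readable message with nvml.ErrorString.
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		log.Fatalf("nvml init failed: %s", nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	version, ret := nvml.SystemGetDriverVersion()
	if ret != nvml.SUCCESS {
		log.Fatalf("driver version lookup failed: %s", nvml.ErrorString(ret))
	}
	fmt.Println("driver version:", version)

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		log.Fatalf("device count failed: %s", nvml.ErrorString(ret))
	}

	// UUIDs now come from a device handle rather than a struct field.
	for i := 0; i < int(count); i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			log.Printf("skipping device %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		uuid, ret := nvml.DeviceGetUUID(device)
		if ret != nvml.SUCCESS {
			log.Printf("no UUID for device %d: %s", i, nvml.ErrorString(ret))
			continue
		}
		fmt.Println("gpu:", uuid)
	}
}
```

The wrapper functions in nvidia_gpu_manager.go keep their original error-returning signatures and simply translate any non-SUCCESS Return into a Go error, which leaves the rest of ecs-init unchanged.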
152 changes: 89 additions & 63 deletions ecs-init/gpu/nvidia_gpu_manager_test.go
@@ -16,10 +16,12 @@ package gpu
import (
"errors"
"os"
"reflect"
"testing"

"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
"github.com/aws/amazon-ecs-agent/ecs-init/gpu/mocks"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/assert"
)

@@ -48,19 +50,19 @@ func TestNVMLInitializeError(t *testing.T) {
}

func TestDeviceCount(t *testing.T) {
NvmlGetDeviceCount = func() (uint, error) {
NvmlGetDeviceCount = func() (int, error) {
return 1, nil
}
defer func() {
NvmlGetDeviceCount = GetDeviceCount
}()
count, err := NvmlGetDeviceCount()
assert.Equal(t, uint(1), count)
assert.Equal(t, int(1), count)
assert.NoError(t, err)
}

func TestDeviceCountError(t *testing.T) {
NvmlGetDeviceCount = func() (uint, error) {
NvmlGetDeviceCount = func() (int, error) {
return 0, errors.New("device count error")
}
defer func() {
@@ -70,85 +72,95 @@ func TestDeviceCountError(t *testing.T) {
assert.Error(t, err)
}

func TestNewDeviceLite(t *testing.T) {
model := "Tesla-k80"
NvmlNewDeviceLite = func(idx uint) (*nvml.Device, error) {
return &nvml.Device{
UUID: "gpu-0123",
Model: &model,
}, nil
}
defer func() {
NvmlNewDeviceLite = NewDeviceLite
}()
device, err := NvmlNewDeviceLite(4)
assert.NoError(t, err)
assert.Equal(t, "gpu-0123", device.UUID)
assert.Equal(t, model, *device.Model)
}
func TestGetGPUDeviceIDs(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

nvidiaGPUManager := NewNvidiaGPUManager()

func TestNewDeviceLiteError(t *testing.T) {
NvmlNewDeviceLite = func(idx uint) (*nvml.Device, error) {
return nil, errors.New("device error")
// Mock NvmlGetDeviceCount
oldNvmlGetDeviceCount := NvmlGetDeviceCount
NvmlGetDeviceCount = func() (int, error) {
return 2, nil
}
defer func() {
NvmlNewDeviceLite = NewDeviceLite
NvmlGetDeviceCount = oldNvmlGetDeviceCount
}()
device, err := NvmlNewDeviceLite(4)
assert.Error(t, err)
assert.Nil(t, device)
}

func TestGetGPUDeviceIDs(t *testing.T) {
nvidiaGPUManager := NewNvidiaGPUManager()
NvmlGetDeviceCount = func() (uint, error) {
return 2, nil
}
NvmlNewDeviceLite = func(idx uint) (*nvml.Device, error) {
var uuid string
// Mock DeviceGetHandleByIndex and DeviceGetUUID
oldDeviceGetHandleByIndex := nvml.DeviceGetHandleByIndex
oldDeviceGetUUID := nvml.DeviceGetUUID

mockDevice1 := mocks.NewMockDevice(ctrl)
mockDevice2 := mocks.NewMockDevice(ctrl)

nvml.DeviceGetHandleByIndex = func(idx int) (nvml.Device, nvml.Return) {
if idx == 0 {
uuid = "gpu-0123"
} else {
uuid = "gpu-1234"
return mockDevice1, nvml.SUCCESS
}
return &nvml.Device{
UUID: uuid,
}, nil
return mockDevice2, nvml.SUCCESS
}

mockDevice1.EXPECT().GetUUID().Return("gpu-0123", nvml.SUCCESS)
mockDevice2.EXPECT().GetUUID().Return("gpu-1234", nvml.SUCCESS)

defer func() {
NvmlGetDeviceCount = GetDeviceCount
NvmlNewDeviceLite = NewDeviceLite
nvml.DeviceGetHandleByIndex = oldDeviceGetHandleByIndex
nvml.DeviceGetUUID = oldDeviceGetUUID
}()

// Call the function and assert
gpuIDs, err := nvidiaGPUManager.GetGPUDeviceIDs()
assert.NoError(t, err)
assert.True(t, reflect.DeepEqual([]string{"gpu-0123", "gpu-1234"}, gpuIDs))
assert.Equal(t, []string{"gpu-0123", "gpu-1234"}, gpuIDs)
}

func TestGetGPUDeviceIDsCountError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

nvidiaGPUManager := NewNvidiaGPUManager()
NvmlGetDeviceCount = func() (uint, error) {

// Mock NvmlGetDeviceCount
oldNvmlGetDeviceCount := NvmlGetDeviceCount
NvmlGetDeviceCount = func() (int, error) {
return 0, errors.New("device count error")
}
defer func() {
NvmlGetDeviceCount = GetDeviceCount
NvmlGetDeviceCount = oldNvmlGetDeviceCount
}()

// Call the function and assert
gpuIDs, err := nvidiaGPUManager.GetGPUDeviceIDs()
assert.Error(t, err)
assert.Empty(t, gpuIDs)
}

func TestGetGPUDeviceIDsDeviceError(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

nvidiaGPUManager := NewNvidiaGPUManager()
NvmlGetDeviceCount = func() (uint, error) {

// Mock NvmlGetDeviceCount
oldNvmlGetDeviceCount := NvmlGetDeviceCount
NvmlGetDeviceCount = func() (int, error) {
return 1, nil
}
NvmlNewDeviceLite = func(idx uint) (*nvml.Device, error) {
return nil, errors.New("device error")
defer func() {
NvmlGetDeviceCount = oldNvmlGetDeviceCount
}()

// Mock DeviceGetHandleByIndex to return an error
oldDeviceGetHandleByIndex := nvml.DeviceGetHandleByIndex
nvml.DeviceGetHandleByIndex = func(int) (nvml.Device, nvml.Return) {
return nil, nvml.ERROR_UNKNOWN
}
defer func() {
NvmlGetDeviceCount = GetDeviceCount
NvmlNewDeviceLite = NewDeviceLite
nvml.DeviceGetHandleByIndex = oldDeviceGetHandleByIndex
}()

// Call the function and assert
gpuIDs, err := nvidiaGPUManager.GetGPUDeviceIDs()
assert.Error(t, err)
assert.Empty(t, gpuIDs)
@@ -279,50 +291,64 @@ func TestSetupNoGPU(t *testing.T) {
}

func TestGPUSetupSuccessful(t *testing.T) {
ctrl := gomock.NewController(t)
defer ctrl.Finish()

driverVersion := "396.44"
nvidiaGPUManager := NewNvidiaGPUManager()

MatchFilePattern = func(string) ([]string, error) {
return []string{"/dev/nvidia0", "/dev/nvidia1"}, nil
}

InitializeNVML = func() error {
return nil
}

NvmlGetDriverVersion = func() (string, error) {
return driverVersion, nil
}
NvmlGetDeviceCount = func() (uint, error) {

NvmlGetDeviceCount = func() (int, error) {
return 2, nil
}
NvmlNewDeviceLite = func(idx uint) (*nvml.Device, error) {
var uuid string

mockDevice1 := mocks.NewMockDevice(ctrl)
mockDevice2 := mocks.NewMockDevice(ctrl)
mockDevice1.EXPECT().GetUUID().Return("gpu-0123", nvml.SUCCESS)
mockDevice2.EXPECT().GetUUID().Return("gpu-1234", nvml.SUCCESS)

// Mock DeviceGetHandleByIndex
oldDeviceGetHandleByIndex := nvml.DeviceGetHandleByIndex
nvml.DeviceGetHandleByIndex = func(idx int) (nvml.Device, nvml.Return) {
if idx == 0 {
uuid = "gpu-0123"
} else {
uuid = "gpu-1234"
return mockDevice1, nvml.SUCCESS
}
return &nvml.Device{
UUID: uuid,
}, nil
return mockDevice2, nvml.SUCCESS
}

WriteContentToFile = func(string, []byte, os.FileMode) error {
return nil
}

ShutdownNVML = func() error {
return nil
}

defer func() {
MatchFilePattern = FilePatternMatch
InitializeNVML = InitNVML
NvmlGetDriverVersion = GetNvidiaDriverVersion
NvmlGetDeviceCount = GetDeviceCount
NvmlNewDeviceLite = NewDeviceLite
nvml.DeviceGetHandleByIndex = oldDeviceGetHandleByIndex
WriteContentToFile = WriteToFile
ShutdownNVML = ShutdownNVMLib
}()

err := nvidiaGPUManager.Setup()
assert.NoError(t, err)
assert.Equal(t, driverVersion, nvidiaGPUManager.(*NvidiaGPUManager).DriverVersion)
assert.True(t, reflect.DeepEqual([]string{"gpu-0123", "gpu-1234"}, nvidiaGPUManager.(*NvidiaGPUManager).GPUIDs))
assert.Equal(t, []string{"gpu-0123", "gpu-1234"}, nvidiaGPUManager.(*NvidiaGPUManager).GPUIDs)
}

func TestSetupNVMLError(t *testing.T) {
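The rewritten tests keep the seam-based approach the package already used: production code calls through package-level function variables (InitializeNVML, NvmlGetDeviceCount, NvmlGetDriverVersion, and, in the new tests, the nvml.DeviceGetHandleByIndex and nvml.DeviceGetUUID entry points), and each test swaps in a stub and restores the original in a defer. The snippet below is only a generic illustration of that pattern with made-up names, not code from this commit.

```go
package seam

import "testing"

// getDeviceCount is the package-level seam: production code calls through the
// variable and tests temporarily replace it. All names here are illustrative.
var getDeviceCount = realDeviceCount

func realDeviceCount() (int, error) {
	// The real implementation would call into NVML.
	return 0, nil
}

func countGPUs() (int, error) {
	return getDeviceCount()
}

func TestCountGPUs(t *testing.T) {
	// Save the original, swap in a stub, and restore it when the test ends,
	// mirroring what the tests above do with NvmlGetDeviceCount.
	oldGetDeviceCount := getDeviceCount
	getDeviceCount = func() (int, error) { return 2, nil }
	defer func() { getDeviceCount = oldGetDeviceCount }()

	count, err := countGPUs()
	if err != nil || count != 2 {
		t.Fatalf("expected 2 GPUs, got %d (err: %v)", count, err)
	}
}
```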