Skip to content

Commit

Permalink
scheduler: consider pod requests when jointly allocating GPU and RDMA devices (#2233)
Browse files: browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <[email protected]>
Co-authored-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
ZiMengSheng and wangjianyu.wjy authored Oct 24, 2024
1 parent bcd2bfd commit 3963c04
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 2 deletions.
4 changes: 2 additions & 2 deletions pkg/scheduler/plugins/deviceshare/device_allocator.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@ func (a *AutopilotAllocator) jointAllocate(nodeDevice *nodeDevice, requestCtx *r
pcieIDs := newPreferredPCIes(nodeDevice, primaryDeviceType, primaryAllocations)
secondaryDeviceAllocations = apiext.DeviceAllocations{}
for _, deviceType := range secondaryDeviceTypes {
desiredCount := 1
if jointAllocate != nil && jointAllocate.RequiredScope == apiext.SamePCIeDeviceJointAllocateScope {
desiredCount := a.desiredCountPerDeviceType[deviceType]
if jointAllocate != nil && jointAllocate.RequiredScope == apiext.SamePCIeDeviceJointAllocateScope && desiredCount < pcieIDs.Len() {
desiredCount = pcieIDs.Len()
}
allocations, status := allocateDevices(
Expand Down
130 changes: 130 additions & 0 deletions pkg/scheduler/plugins/deviceshare/device_allocator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ func TestAutopilotAllocator(t *testing.T) {
name string
deviceCR *schedulingv1alpha1.Device
gpuWanted int
rdmaWanted int
hostNetwork bool
secondaryDeviceWellPlanned bool
assignedDevices apiext.DeviceAllocations
Expand Down Expand Up @@ -812,6 +813,131 @@ func TestAutopilotAllocator(t *testing.T) {
},
},
},
{
name: "2 RDMA with 2 GPUs Per PCIE, 2 NUMA Nodes, assigned 4 GPUs, requests 4 GPUs",
deviceCR: func() *schedulingv1alpha1.Device {
var data = []byte(`{"metadata":{"name":"test-node-1","creationTimestamp":null,"annotations":{}},"spec":{"devices":[{"type":"rdma","labels":{"type":"fakeW"},"id":"0000:1f:00.0","minor":1,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"0"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0000:20:00.0"},{"minor":1,"busID":"0000:20:00.1"},{"minor":2,"busID":"0000:20:00.2"},{"minor":3,"busID":"0000:20:00.3"},{"minor":4,"busID":"0000:20:00.4"},{"minor":5,"busID":"0000:20:00.5"},{"minor":6,"busID":"0000:20:00.6"},{"minor":7,"busID":"0000:20:00.7"},{"minor":8,"busID":"0000:20:00.8"},{"minor":9,"busID":"0000:20:00.9"},{"minor":10,"busID":"0000:20:00.a"},{"minor":11,"busID":"0000:20:00.b"},{"minor":12,"busID":"0000:20:00.c"},{"minor":13,"busID":"0000:20:00.d"},{"minor":14,"busID":"0000:20:00.e"},{"minor":15,"busID":"0000:20:00.f"},{"minor":16,"busID":"0000:20:00.10"},{"minor":17,"busID":"0000:20:00.11"},{"minor":18,"busID":"0000:20:00.12"},{"minor":19,"busID":"0000:20:00.13"},{"minor":20,"busID":"0000:20:00.14"},{"minor":21,"busID":"0000:20:00.15"},{"minor":22,"busID":"0000:20:00.16"},{"minor":23,"busID":"0000:20:00.17"},{"minor":24,"busID":"0000:20:00.18"},{"minor":25,"busID":"0000:20:00.19"},{"minor":26,"busID":"0000:20:00.1a"},{"minor":27,"busID":"0000:20:00.1b"},{"minor":28,"busID":"0000:20:00.1c"},{"minor":29,"busID":"0000:20:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0000:90:00.0","minor":2,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"1"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0000:21:00.0"},{"minor":1,"busID":"0000:21:00.1"},{"minor":2,"busID":"0000:21:00.2"},{"minor":3,"busID":"0000:21:00.3"},{"minor":4,"busID":"0000:21:00.4"},{"minor":5,"busID":"0000:21:00.5"},{"minor":6,"busID":"0000:21:00.6"},{"minor":7,"busID":"0000:21:00.7"},{"minor":8,"busID":"0000:21:00.8"},{"minor":9,"busID":"0000:21:00.9
"},{"minor":10,"busID":"0000:21:00.a"},{"minor":11,"busID":"0000:21:00.b"},{"minor":12,"busID":"0000:21:00.c"},{"minor":13,"busID":"0000:21:00.d"},{"minor":14,"busID":"0000:21:00.e"},{"minor":15,"busID":"0000:21:00.f"},{"minor":16,"busID":"0000:21:00.10"},{"minor":17,"busID":"0000:21:00.11"},{"minor":18,"busID":"0000:21:00.12"},{"minor":19,"busID":"0000:21:00.13"},{"minor":20,"busID":"0000:21:00.14"},{"minor":21,"busID":"0000:21:00.15"},{"minor":22,"busID":"0000:21:00.16"},{"minor":23,"busID":"0000:21:00.17"},{"minor":24,"busID":"0000:21:00.18"},{"minor":25,"busID":"0000:21:00.19"},{"minor":26,"busID":"0000:21:00.1a"},{"minor":27,"busID":"0000:21:00.1b"},{"minor":28,"busID":"0000:21:00.1c"},{"minor":29,"busID":"0000:21:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0000:51:00.0","minor":3,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"2"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0000:22:00.0"},{"minor":1,"busID":"0000:22:00.1"},{"minor":2,"busID":"0000:22:00.2"},{"minor":3,"busID":"0000:22:00.3"},{"minor":4,"busID":"0000:22:00.4"},{"minor":5,"busID":"0000:22:00.5"},{"minor":6,"busID":"0000:22:00.6"},{"minor":7,"busID":"0000:22:00.7"},{"minor":8,"busID":"0000:22:00.8"},{"minor":9,"busID":"0000:22:00.9"},{"minor":10,"busID":"0000:22:00.a"},{"minor":11,"busID":"0000:22:00.b"},{"minor":12,"busID":"0000:22:00.c"},{"minor":13,"busID":"0000:22:00.d"},{"minor":14,"busID":"0000:22:00.e"},{"minor":15,"busID":"0000:22:00.f"},{"minor":16,"busID":"0000:22:00.10"},{"minor":17,"busID":"0000:22:00.11"},{"minor":18,"busID":"0000:22:00.12"},{"minor":19,"busID":"0000:22:00.13"},{"minor":20,"busID":"0000:22:00.14"},{"minor":21,"busID":"0000:22:00.15"},{"minor":22,"busID":"0000:22:00.16"},{"minor":23,"busID":"0000:22:00.17"},{"minor":24,"busID":"0000:22:00.18"},{"minor":25,"busID":"0000:22:00.19"},{"minor":26,"busID":"0000:22:00.1a"},{"minor":27,"busID":"0000:22:00.1b"},{"minor":28,"busID":"000
0:22:00.1c"},{"minor":29,"busID":"0000:22:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0000:b9:00.0","minor":4,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"3"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0000:23:00.0"},{"minor":1,"busID":"0000:23:00.1"},{"minor":2,"busID":"0000:23:00.2"},{"minor":3,"busID":"0000:23:00.3"},{"minor":4,"busID":"0000:23:00.4"},{"minor":5,"busID":"0000:23:00.5"},{"minor":6,"busID":"0000:23:00.6"},{"minor":7,"busID":"0000:23:00.7"},{"minor":8,"busID":"0000:23:00.8"},{"minor":9,"busID":"0000:23:00.9"},{"minor":10,"busID":"0000:23:00.a"},{"minor":11,"busID":"0000:23:00.b"},{"minor":12,"busID":"0000:23:00.c"},{"minor":13,"busID":"0000:23:00.d"},{"minor":14,"busID":"0000:23:00.e"},{"minor":15,"busID":"0000:23:00.f"},{"minor":16,"busID":"0000:23:00.10"},{"minor":17,"busID":"0000:23:00.11"},{"minor":18,"busID":"0000:23:00.12"},{"minor":19,"busID":"0000:23:00.13"},{"minor":20,"busID":"0000:23:00.14"},{"minor":21,"busID":"0000:23:00.15"},{"minor":22,"busID":"0000:23:00.16"},{"minor":23,"busID":"0000:23:00.17"},{"minor":24,"busID":"0000:23:00.18"},{"minor":25,"busID":"0000:23:00.19"},{"minor":26,"busID":"0000:23:00.1a"},{"minor":27,"busID":"0000:23:00.1b"},{"minor":28,"busID":"0000:23:00.1c"},{"minor":29,"busID":"0000:23:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0001:1f:00.0","minor":5,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"0"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0001:20:00.0"},{"minor":1,"busID":"0001:20:00.1"},{"minor":2,"busID":"0001:20:00.2"},{"minor":3,"busID":"0001:20:00.3"},{"minor":4,"busID":"0001:20:00.4"},{"minor":5,"busID":"0001:20:00.5"},{"minor":6,"busID":"0001:20:00.6"},{"minor":7,"busID":"0001:20:00.7"},{"minor":8,"busID":"0001:20:00.8"},{"minor":9,"busID":"0001:20:00.9"},{"minor":10,"busID":"0001:20:00.a"},{"minor":11,"busID":"0
001:20:00.b"},{"minor":12,"busID":"0001:20:00.c"},{"minor":13,"busID":"0001:20:00.d"},{"minor":14,"busID":"0001:20:00.e"},{"minor":15,"busID":"0001:20:00.f"},{"minor":16,"busID":"0001:20:00.10"},{"minor":17,"busID":"0001:20:00.11"},{"minor":18,"busID":"0001:20:00.12"},{"minor":19,"busID":"0001:20:00.13"},{"minor":20,"busID":"0001:20:00.14"},{"minor":21,"busID":"0001:20:00.15"},{"minor":22,"busID":"0001:20:00.16"},{"minor":23,"busID":"0001:20:00.17"},{"minor":24,"busID":"0001:20:00.18"},{"minor":25,"busID":"0001:20:00.19"},{"minor":26,"busID":"0001:20:00.1a"},{"minor":27,"busID":"0001:20:00.1b"},{"minor":28,"busID":"0001:20:00.1c"},{"minor":29,"busID":"0001:20:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0001:90:00.0","minor":6,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"1"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0001:21:00.0"},{"minor":1,"busID":"0001:21:00.1"},{"minor":2,"busID":"0001:21:00.2"},{"minor":3,"busID":"0001:21:00.3"},{"minor":4,"busID":"0001:21:00.4"},{"minor":5,"busID":"0001:21:00.5"},{"minor":6,"busID":"0001:21:00.6"},{"minor":7,"busID":"0001:21:00.7"},{"minor":8,"busID":"0001:21:00.8"},{"minor":9,"busID":"0001:21:00.9"},{"minor":10,"busID":"0001:21:00.a"},{"minor":11,"busID":"0001:21:00.b"},{"minor":12,"busID":"0001:21:00.c"},{"minor":13,"busID":"0001:21:00.d"},{"minor":14,"busID":"0001:21:00.e"},{"minor":15,"busID":"0001:21:00.f"},{"minor":16,"busID":"0001:21:00.10"},{"minor":17,"busID":"0001:21:00.11"},{"minor":18,"busID":"0001:21:00.12"},{"minor":19,"busID":"0001:21:00.13"},{"minor":20,"busID":"0001:21:00.14"},{"minor":21,"busID":"0001:21:00.15"},{"minor":22,"busID":"0001:21:00.16"},{"minor":23,"busID":"0001:21:00.17"},{"minor":24,"busID":"0001:21:00.18"},{"minor":25,"busID":"0001:21:00.19"},{"minor":26,"busID":"0001:21:00.1a"},{"minor":27,"busID":"0001:21:00.1b"},{"minor":28,"busID":"0001:21:00.1c"},{"minor":29,"busID":"0001:21:00.1d"}]}]},{"type"
:"rdma","labels":{"type":"fakeW"},"id":"0001:51:00.0","minor":7,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"2"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0001:22:00.0"},{"minor":1,"busID":"0001:22:00.1"},{"minor":2,"busID":"0001:22:00.2"},{"minor":3,"busID":"0001:22:00.3"},{"minor":4,"busID":"0001:22:00.4"},{"minor":5,"busID":"0001:22:00.5"},{"minor":6,"busID":"0001:22:00.6"},{"minor":7,"busID":"0001:22:00.7"},{"minor":8,"busID":"0001:22:00.8"},{"minor":9,"busID":"0001:22:00.9"},{"minor":10,"busID":"0001:22:00.a"},{"minor":11,"busID":"0001:22:00.b"},{"minor":12,"busID":"0001:22:00.c"},{"minor":13,"busID":"0001:22:00.d"},{"minor":14,"busID":"0001:22:00.e"},{"minor":15,"busID":"0001:22:00.f"},{"minor":16,"busID":"0001:22:00.10"},{"minor":17,"busID":"0001:22:00.11"},{"minor":18,"busID":"0001:22:00.12"},{"minor":19,"busID":"0001:22:00.13"},{"minor":20,"busID":"0001:22:00.14"},{"minor":21,"busID":"0001:22:00.15"},{"minor":22,"busID":"0001:22:00.16"},{"minor":23,"busID":"0001:22:00.17"},{"minor":24,"busID":"0001:22:00.18"},{"minor":25,"busID":"0001:22:00.19"},{"minor":26,"busID":"0001:22:00.1a"},{"minor":27,"busID":"0001:22:00.1b"},{"minor":28,"busID":"0001:22:00.1c"},{"minor":29,"busID":"0001:22:00.1d"}]}]},{"type":"rdma","labels":{"type":"fakeW"},"id":"0001:b9:00.0","minor":8,"health":true,"resources":{"koordinator.sh/rdma":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"3"},"vfGroups":[{"labels":{"type":"general"},"vfs":[{"minor":0,"busID":"0001:23:00.0"},{"minor":1,"busID":"0001:23:00.1"},{"minor":2,"busID":"0001:23:00.2"},{"minor":3,"busID":"0001:23:00.3"},{"minor":4,"busID":"0001:23:00.4"},{"minor":5,"busID":"0001:23:00.5"},{"minor":6,"busID":"0001:23:00.6"},{"minor":7,"busID":"0001:23:00.7"},{"minor":8,"busID":"0001:23:00.8"},{"minor":9,"busID":"0001:23:00.9"},{"minor":10,"busID":"0001:23:00.a"},{"minor":11,"busID":"0001:23:00.b"},{"minor":12,"busID":"0001:23:00.c"},{"minor":13
,"busID":"0001:23:00.d"},{"minor":14,"busID":"0001:23:00.e"},{"minor":15,"busID":"0001:23:00.f"},{"minor":16,"busID":"0001:23:00.10"},{"minor":17,"busID":"0001:23:00.11"},{"minor":18,"busID":"0001:23:00.12"},{"minor":19,"busID":"0001:23:00.13"},{"minor":20,"busID":"0001:23:00.14"},{"minor":21,"busID":"0001:23:00.15"},{"minor":22,"busID":"0001:23:00.16"},{"minor":23,"busID":"0001:23:00.17"},{"minor":24,"busID":"0001:23:00.18"},{"minor":25,"busID":"0001:23:00.19"},{"minor":26,"busID":"0001:23:00.1a"},{"minor":27,"busID":"0001:23:00.1b"},{"minor":28,"busID":"0001:23:00.1c"},{"minor":29,"busID":"0001:23:00.1d"}]}]},{"type":"gpu","id":"GPU-8c25ea37-2909-6e62-b7bf-e2fcadebea8d","minor":0,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"0"}},{"type":"gpu","id":"GPU-befd76c3-8a36-7b8a-179c-eae75aa7d9f2","minor":1,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"0"}},{"type":"gpu","id":"GPU-87a9047b-dade-e08c-c067-7fedfd2e2750","minor":2,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"1"}},{"type":"gpu","id":"GPU-44a68f77-c18d-85a6-5425-e314c0e8e182","minor":3,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":0,"nodeID":0,"pcieID":"1"}},{"type":"gpu","id":"GPU-ac53dc25-2cb7-a11d-417f-ce23331dcea0","minor":4,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"2"}},{"type":"gpu","id":"GPU-3908dbfd-6e0b-013d-549b-fca246a16fa0"
,"minor":5,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"2"}},{"type":"gpu","id":"GPU-7a87e98a-a1a7-28bc-c880-28c870bf0c7d","minor":6,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"3"}},{"type":"gpu","id":"GPU-c3b7de0e-8a41-9bdb-3f71-8175c3438890","minor":7,"health":true,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"83201216Ki","koordinator.sh/gpu-memory-ratio":"100"},"topology":{"socketID":1,"nodeID":1,"pcieID":"3"}}]},"status":{}}`)
var deviceCR schedulingv1alpha1.Device
_ = json.Unmarshal(data, &deviceCR)
return &deviceCR
}(),
gpuWanted: 4,
rdmaWanted: 4,
assignedDevices: apiext.DeviceAllocations{
schedulingv1alpha1.GPU: []*apiext.DeviceAllocation{
{
Minor: 0,
Resources: gpuResourceList,
},
{
Minor: 1,
Resources: gpuResourceList,
},
{
Minor: 2,
Resources: gpuResourceList,
},
{
Minor: 3,
Resources: gpuResourceList,
},
},
schedulingv1alpha1.RDMA: []*apiext.DeviceAllocation{
{
Minor: 1,
Resources: corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(1, resource.DecimalSI),
},
Extension: &apiext.DeviceAllocationExtension{
VirtualFunctions: []apiext.VirtualFunction{
{
BusID: "0000:20:00.0",
Minor: 0,
},
},
},
},
},
},
want: apiext.DeviceAllocations{
schedulingv1alpha1.GPU: []*apiext.DeviceAllocation{
{
Minor: 4,
Resources: gpuResourceList,
},
{
Minor: 5,
Resources: gpuResourceList,
},
{
Minor: 6,
Resources: gpuResourceList,
},
{
Minor: 7,
Resources: gpuResourceList,
},
},
schedulingv1alpha1.RDMA: []*apiext.DeviceAllocation{
{
Minor: 3,
Resources: corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(100, resource.DecimalSI),
},
Extension: &apiext.DeviceAllocationExtension{
VirtualFunctions: []apiext.VirtualFunction{
{
BusID: "0000:22:00.0",
Minor: 0,
},
},
},
},
{
Minor: 4,
Resources: corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(100, resource.DecimalSI),
},
Extension: &apiext.DeviceAllocationExtension{
VirtualFunctions: []apiext.VirtualFunction{
{
BusID: "0000:23:00.0",
Minor: 0,
},
},
},
},
{
Minor: 7,
Resources: corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(100, resource.DecimalSI),
},
Extension: &apiext.DeviceAllocationExtension{
VirtualFunctions: []apiext.VirtualFunction{
{
BusID: "0001:22:00.0",
Minor: 0,
},
},
},
},
{
Minor: 8,
Resources: corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(100, resource.DecimalSI),
},
Extension: &apiext.DeviceAllocationExtension{
VirtualFunctions: []apiext.VirtualFunction{
{
BusID: "0001:23:00.0",
Minor: 0,
},
},
},
},
},
},
},
{
name: "1 GPU with hostNetwork and apply for all RDMAs",
deviceCR: func() *schedulingv1alpha1.Device {
Expand Down Expand Up @@ -974,6 +1100,10 @@ func TestAutopilotAllocator(t *testing.T) {
podRequest := corev1.ResourceList{
apiext.ResourceRDMA: *resource.NewQuantity(1, resource.DecimalSI),
}
if tt.rdmaWanted > 0 {
podRequest[apiext.ResourceRDMA] = *resource.NewQuantity(int64(100*tt.rdmaWanted), resource.DecimalSI)
}

if tt.gpuWanted > 0 {
podRequest[apiext.ResourceNvidiaGPU] = *resource.NewQuantity(int64(tt.gpuWanted), resource.DecimalSI)
}
Expand Down

0 comments on commit 3963c04

Please sign in to comment.