diff --git a/apis/extension/device_share.go b/apis/extension/device_share.go
index 8cd977ebb..5914a16e1 100644
--- a/apis/extension/device_share.go
+++ b/apis/extension/device_share.go
@@ -80,14 +80,32 @@ const (
 				"koordinator.sh/gpu-memory": "16Gi"
 			}
 		}
+	],
+	"rdma": [
+		{
+			"minor": 0,
+			"id": "0000:09:00.0",
+			"resources": {
+				"koordinator.sh/rdma": 100
+			}
+		},
+		{
+			"minor": 1,
+			"id": "0000:10:00.0",
+			"resources": {
+				"koordinator.sh/rdma": 100
+			}
+		}
 	]
 }
 */
 
 type DeviceAllocations map[schedulingv1alpha1.DeviceType][]*DeviceAllocation
 
 type DeviceAllocation struct {
-	Minor     int32               `json:"minor"`
-	Resources corev1.ResourceList `json:"resources"`
+	Minor     int32               `json:"minor"`
+	Resources corev1.ResourceList `json:"resources"`
+	// ID is the well-known identifier of the device; for some devices, such as RDMA, Minor is meaningless
+	ID        string              `json:"id,omitempty"`
 	Extension *DeviceAllocationExtension `json:"extension,omitempty"`
 }
diff --git a/apis/extension/device_share_test.go b/apis/extension/device_share_test.go
index a24a9061f..97d43ce64 100644
--- a/apis/extension/device_share_test.go
+++ b/apis/extension/device_share_test.go
@@ -89,6 +89,33 @@ func Test_GetDeviceAllocations(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "correct annotations with busID",
+			pod: &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					UID:       "123456789",
+					Namespace: "default",
+					Name:      "test",
+					Annotations: map[string]string{
+						AnnotationDeviceAllocated: `{"gpu":[{"minor":1,"id": "0000:08.00.0","resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"100"}}]}`,
+					},
+				},
+			},
+			want: DeviceAllocations{
+				schedulingv1alpha1.GPU: []*DeviceAllocation{
+					{
+						Minor: 1,
+						ID:    "0000:08.00.0",
+						Resources: corev1.ResourceList{
+							ResourceGPUCore:        resource.MustParse("100"),
+							ResourceGPUMemoryRatio: resource.MustParse("100"),
+							ResourceGPUMemory:      resource.MustParse("16Gi"),
+						},
+					},
+				},
+			},
+			wantErr: false,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
diff --git a/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md b/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md
index 2592d0d3f..293aa06ea 100644
--- a/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md
+++ b/docs/proposals/scheduling/20220629-fine-grained-device-scheduling.md
@@ -218,7 +218,7 @@ resources:
 
 ##### DeviceAllocation
 
-In the PreBind stage, the scheduler will update the device (including GPU) allocation results, including the device's Minor and resource allocation information, to the Pod in the form of annotations.
+In the PreBind stage, the scheduler will update the device (including GPU) allocation results, including the device's Minor, ID, and resource allocation information, to the Pod in the form of annotations.
 
 ```go
 /*
@@ -226,7 +226,7 @@ In the PreBind stage, the scheduler will update the device (including GPU) alloc
 	"gpu": [
 		{
 			"minor": 0,
-			"resouurces": {
+			"resources": {
 				"koordinator.sh/gpu-core": 100,
 				"koordinator.sh/gpu-memory-ratio": 100,
 				"koordinator.sh/gpu-memory": "16Gi"
@@ -234,18 +234,38 @@ In the PreBind stage, the scheduler will update the device (including GPU) alloc
 		},
 		{
 			"minor": 1,
-			"resouurces": {
+			"resources": {
 				"koordinator.sh/gpu-core": 100,
 				"koordinator.sh/gpu-memory-ratio": 100,
 				"koordinator.sh/gpu-memory": "16Gi"
 			}
 		}
+	],
+	"rdma": [
+		{
+			"minor": 0,
+			"id": "0000:09:00.0",
+			"resources": {
+				"koordinator.sh/rdma": 100
+			}
+		},
+		{
+			"minor": 1,
+			"id": "0000:10:00.0",
+			"resources": {
+				"koordinator.sh/rdma": 100
+			}
+		}
 	]
 }
 */
+
 type DeviceAllocation struct {
-	Minor     int32
-	Resources map[string]resource.Quantity
+	Minor     int32               `json:"minor"`
+	Resources corev1.ResourceList `json:"resources"`
+	// ID is the well-known identifier of the device; for some devices, such as RDMA, Minor is meaningless
+	ID        string              `json:"id,omitempty"`
+	Extension *DeviceAllocationExtension `json:"extension,omitempty"`
 }
 
 type DeviceAllocations map[DeviceType][]*DeviceAllocation
diff --git a/pkg/scheduler/plugins/deviceshare/plugin.go b/pkg/scheduler/plugins/deviceshare/plugin.go
index fd3d8bd8c..a45dea7c0 100644
--- a/pkg/scheduler/plugins/deviceshare/plugin.go
+++ b/pkg/scheduler/plugins/deviceshare/plugin.go
@@ -28,6 +28,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	quotav1 "k8s.io/apiserver/pkg/quota/v1"
 	k8sfeature "k8s.io/apiserver/pkg/util/feature"
+	"k8s.io/klog/v2"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 
 	apiext "github.com/koordinator-sh/koordinator/apis/extension"
@@ -540,12 +541,45 @@ func (p *Plugin) preBindObject(ctx context.Context, cycleState *framework.CycleS
 		return nil
 	}
 
+	err := p.fillID(state.allocationResult, nodeName)
+	if err != nil {
+		return framework.AsStatus(err)
+	}
+
 	if err := apiext.SetDeviceAllocations(object, state.allocationResult); err != nil {
 		return framework.NewStatus(framework.Error, err.Error())
 	}
 	return nil
 }
 
+func (p *Plugin) fillID(allocationResult apiext.DeviceAllocations, nodeName string) error {
+	extendedHandle, ok := p.handle.(frameworkext.ExtendedHandle)
+	if !ok {
+		return fmt.Errorf("expect handle to be type frameworkext.ExtendedHandle, got %T", p.handle)
+	}
+	deviceLister := extendedHandle.KoordinatorSharedInformerFactory().Scheduling().V1alpha1().Devices().Lister()
+	device, err := deviceLister.Get(nodeName)
+	if err != nil {
+		klog.ErrorS(err, "Failed to get Device", "node", nodeName)
+		return err
+	}
+	for deviceType, allocations := range allocationResult {
+		if deviceType == schedulingv1alpha1.GPU {
+			// GPU minors are well known, so ID isn't needed
+			continue
+		}
+		for i, allocation := range allocations {
+			for _, info := range device.Spec.Devices {
+				if info.Minor != nil && *info.Minor == allocation.Minor && info.Type == deviceType {
+					allocationResult[deviceType][i].ID = info.UUID
+					break
+				}
+			}
+		}
+	}
+	return nil
+}
+
 func updateReservationAllocatable(cycleState *framework.CycleState, reservation *schedulingv1alpha1.Reservation) *framework.Status {
 	state, status := getPreFilterState(cycleState)
 	if !status.IsSuccess() {
diff --git a/pkg/scheduler/plugins/deviceshare/plugin_test.go b/pkg/scheduler/plugins/deviceshare/plugin_test.go
index a908365a1..da4c4d905 100644
--- a/pkg/scheduler/plugins/deviceshare/plugin_test.go
+++ b/pkg/scheduler/plugins/deviceshare/plugin_test.go
@@ -3731,7 +3731,8 @@ func Test_Plugin_PreBind(t *testing.T) {
 					Name: "test-container-a",
 					Resources: corev1.ResourceRequirements{
 						Requests: corev1.ResourceList{
-							apiext.ResourceGPU: resource.MustParse("2"),
+							apiext.ResourceGPU:  resource.MustParse("2"),
+							apiext.ResourceRDMA: resource.MustParse("100"),
 						},
 					},
 				},
@@ -3746,6 +3747,7 @@ func Test_Plugin_PreBind(t *testing.T) {
 		name       string
 		args       args
 		wantPod    *corev1.Pod
+		deviceCR   *schedulingv1alpha1.Device
 		wantStatus *framework.Status
 	}{
 		{
@@ -3767,7 +3769,7 @@ func Test_Plugin_PreBind(t *testing.T) {
 			wantPod: &corev1.Pod{},
 		},
 		{
-			name: "pre-bind successfully",
+			name: "pre-bind successfully, deviceCR with topology",
 			args: args{
 				pod: testPod.DeepCopy(),
 				state: &preFilterState{
@@ -3791,6 +3793,14 @@ func Test_Plugin_PreBind(t *testing.T) {
 							},
 						},
 					},
+						schedulingv1alpha1.RDMA: {
+							{
+								Minor: 1,
+								Resources: corev1.ResourceList{
+									apiext.ResourceRDMA: resource.MustParse("100"),
+								},
+							},
+						},
 					},
 					podRequests: map[schedulingv1alpha1.DeviceType]corev1.ResourceList{
 						schedulingv1alpha1.GPU: {
@@ -3798,16 +3808,20 @@ func Test_Plugin_PreBind(t *testing.T) {
 							apiext.ResourceGPUMemoryRatio: resource.MustParse("200"),
 							apiext.ResourceGPUMemory:      resource.MustParse("32Gi"),
 						},
+						schedulingv1alpha1.RDMA: {
+							apiext.ResourceRDMA: resource.MustParse("100"),
+						},
 					},
 				},
 			},
+			deviceCR: fakeDeviceCR,
 			wantPod: &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
 					UID:       "123456789",
 					Namespace: "default",
 					Name:      "test",
 					Annotations: map[string]string{
-						apiext.AnnotationDeviceAllocated: `{"gpu":[{"minor":0,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"100"}},{"minor":1,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"100"}}]}`,
+						apiext.AnnotationDeviceAllocated: `{"gpu":[{"minor":0,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"100"}},{"minor":1,"resources":{"koordinator.sh/gpu-core":"100","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"100"}}],"rdma":[{"minor":1,"resources":{"koordinator.sh/rdma":"100"},"id":"0000:1f:00.0"}]}`,
 					},
 				},
 				Spec: corev1.PodSpec{
@@ -3817,7 +3831,8 @@ func Test_Plugin_PreBind(t *testing.T) {
 							Name: "test-container-a",
 							Resources: corev1.ResourceRequirements{
 								Requests: corev1.ResourceList{
-									apiext.ResourceGPU: resource.MustParse("2"),
+									apiext.ResourceGPU:  resource.MustParse("2"),
+									apiext.ResourceRDMA: resource.MustParse("100"),
 								},
 							},
 						},
@@ -3831,6 +3846,11 @@ func Test_Plugin_PreBind(t *testing.T) {
 			suit := newPluginTestSuit(t, nil)
 			_, err := suit.ClientSet().CoreV1().Pods(testPod.Namespace).Create(context.TODO(), testPod, metav1.CreateOptions{})
 			assert.NoError(t, err)
+			if tt.deviceCR != nil {
+				_, err = suit.koordClientSet.SchedulingV1alpha1().Devices().Create(context.TODO(), tt.deviceCR, metav1.CreateOptions{})
+				assert.NoError(t, err)
+			}
+
 			pl, err := suit.proxyNew(getDefaultArgs(), suit.Framework)
 			assert.NoError(t, err)
 
@@ -3843,7 +3863,7 @@ func Test_Plugin_PreBind(t *testing.T) {
 			if tt.args.state != nil {
 				cycleState.Write(stateKey, tt.args.state)
 			}
-			status := pl.(*Plugin).PreBind(context.TODO(), cycleState, tt.args.pod, "test-node")
+			status := pl.(*Plugin).PreBind(context.TODO(), cycleState, tt.args.pod, fakeDeviceCR.Name)
 			assert.Equal(t, tt.wantStatus, status)
 			assert.Equal(t, tt.wantPod, tt.args.pod)
 		})
@@ -3906,6 +3926,9 @@ func Test_Plugin_PreBindReservation(t *testing.T) {
 	_, err := suit.koordClientSet.SchedulingV1alpha1().Reservations().Create(context.TODO(), reservation, metav1.CreateOptions{})
 	assert.NoError(t, err)
 
+	_, err = suit.koordClientSet.SchedulingV1alpha1().Devices().Create(context.TODO(), fakeDeviceCRWithoutTopology, metav1.CreateOptions{})
+	assert.NoError(t, err)
+
 	pl, err := suit.proxyNew(getDefaultArgs(), suit.Framework)
 	assert.NoError(t, err)
 
@@ -3916,7 +3939,7 @@ func Test_Plugin_PreBindReservation(t *testing.T) {
 	cycleState := framework.NewCycleState()
 	cycleState.Write(stateKey, state)
 
-	status := pl.(*Plugin).PreBindReservation(context.TODO(), cycleState, reservation, "test-node")
+	status := pl.(*Plugin).PreBindReservation(context.TODO(), cycleState, reservation, fakeDeviceCRWithoutTopology.Name)
 	assert.True(t, status.IsSuccess())
 
 	allocations, err := apiext.GetDeviceAllocations(reservation.Annotations)
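
For context, the sketch below shows how the new `id` field round-trips on the consumer side: the scheduler writes the annotation in PreBind (via `fillID` above), and another component reads it back with `apiext.GetDeviceAllocations`. This is a minimal illustration, not part of the change: the pod literal and its annotation value are made up, and the `scheduling/v1alpha1` import path is assumed from the repo layout; `GetDeviceAllocations`, `AnnotationDeviceAllocated`, `schedulingv1alpha1.RDMA`, and the `DeviceAllocation` fields are taken from the code in this diff.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	apiext "github.com/koordinator-sh/koordinator/apis/extension"
	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
)

func main() {
	// A pod whose device allocation annotation was written by the scheduler in PreBind.
	// The annotation value is illustrative and follows the format documented above.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Annotations: map[string]string{
				apiext.AnnotationDeviceAllocated: `{"rdma":[{"minor":1,"resources":{"koordinator.sh/rdma":"100"},"id":"0000:1f:00.0"}]}`,
			},
		},
	}

	// Parse the annotation back into DeviceAllocations.
	allocations, err := apiext.GetDeviceAllocations(pod.Annotations)
	if err != nil {
		panic(err)
	}

	// For RDMA devices, ID (filled from Device.Spec.Devices[].UUID, e.g. a PCI bus
	// address such as 0000:1f:00.0) identifies the device; Minor alone is not meaningful.
	for _, alloc := range allocations[schedulingv1alpha1.RDMA] {
		fmt.Printf("rdma device: minor=%d id=%s resources=%v\n", alloc.Minor, alloc.ID, alloc.Resources)
	}
}
```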