diff --git a/pkg/koordlet/runtimehooks/config.go b/pkg/koordlet/runtimehooks/config.go
index ad517ad5d..c3a122e5f 100644
--- a/pkg/koordlet/runtimehooks/config.go
+++ b/pkg/koordlet/runtimehooks/config.go
@@ -32,6 +32,7 @@ import (
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/cpuset"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/gpu"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/groupidentity"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/rdma"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/resctrl"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/tc"
 	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/terwayqos"
@@ -60,6 +61,12 @@ const (
 	// beta: v1.1
 	GPUEnvInject featuregate.Feature = "GPUEnvInject"
 
+	// RDMADeviceInject injects rdma device info according to the allocation result from koord-scheduler.
+	//
+	// owner: @ZiMengSheng
+	// alpha: v1.6
+	RDMADeviceInject featuregate.Feature = "RDMADeviceInject"
+
 	// BatchResource sets request and limits of cpu and memory on cgroup file according batch resources.
 	//
 	// owner: @saintube @zwzhang0107
@@ -101,6 +108,7 @@ var (
 		GroupIdentity:    {Default: true, PreRelease: featuregate.Beta},
 		CPUSetAllocator:  {Default: true, PreRelease: featuregate.Beta},
 		GPUEnvInject:     {Default: false, PreRelease: featuregate.Alpha},
+		RDMADeviceInject: {Default: false, PreRelease: featuregate.Alpha},
 		BatchResource:    {Default: true, PreRelease: featuregate.Beta},
 		CPUNormalization: {Default: false, PreRelease: featuregate.Alpha},
 		CoreSched:        {Default: false, PreRelease: featuregate.Alpha},
@@ -113,6 +121,7 @@ var (
 		GroupIdentity:    groupidentity.Object(),
 		CPUSetAllocator:  cpuset.Object(),
 		GPUEnvInject:     gpu.Object(),
+		RDMADeviceInject: rdma.Object(),
 		BatchResource:    batchresource.Object(),
 		CPUNormalization: cpunormalization.Object(),
 		CoreSched:        coresched.Object(),
diff --git a/pkg/koordlet/runtimehooks/hooks/rdma/rdma.go b/pkg/koordlet/runtimehooks/hooks/rdma/rdma.go
new file mode 100644
index 000000000..fb510a9f0
--- /dev/null
+++ b/pkg/koordlet/runtimehooks/hooks/rdma/rdma.go
@@ -0,0 +1,187 @@
+/*
+Copyright 2022 The Koordinator Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package rdma
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"syscall"
+
+	"k8s.io/klog/v2"
+
+	ext "github.com/koordinator-sh/koordinator/apis/extension"
+	schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol"
+	rmconfig "github.com/koordinator-sh/koordinator/pkg/runtimeproxy/config"
+)
+
+const (
+	// IBDevDir is the directory that holds the InfiniBand device nodes.
+	IBDevDir = "/dev/infiniband"
+	// RdmaCmDir is the path of the rdma_cm device node.
+	RdmaCmDir = "/dev/infiniband/rdma_cm"
+	// SysBusPci is the sysfs directory that lists PCI devices by address.
+	SysBusPci = "/sys/bus/pci/devices"
+)
+
+// rdmaPlugin injects the RDMA device nodes allocated to a pod into its containers.
+type rdmaPlugin struct{}
+
+// Register attaches InjectDevice to the PreCreateContainer stage.
+func (p *rdmaPlugin) Register(op hooks.Options) {
+	klog.V(5).Infof("register hook %v", "rdma device inject")
+	hooks.Register(rmconfig.PreCreateContainer, "rdma device inject", "inject rdma devices into container", p.InjectDevice)
+}
+
+var singleton *rdmaPlugin
+
+// Object returns the package-wide plugin instance, creating it on first use.
+// The lazy initialization is not synchronized.
+func Object() *rdmaPlugin {
+	if singleton == nil {
+		singleton = &rdmaPlugin{}
+	}
+	return singleton
+}
+
+// InjectDevice adds the rdma_cm device node and the uverbs device node of every
+// RDMA device allocated to the pod to the container response. It is a no-op
+// for pods whose annotations carry no RDMA allocation.
+func (p *rdmaPlugin) InjectDevice(proto protocol.HooksProtocol) error {
+	containerCtx, ok := proto.(*protocol.ContainerContext)
+	if !ok || containerCtx == nil {
+		return fmt.Errorf("container protocol is nil for plugin rdma")
+	}
+	containerReq := containerCtx.Request
+	alloc, err := ext.GetDeviceAllocations(containerReq.PodAnnotations)
+	if err != nil {
+		klog.Errorf("InjectDevice: GetDeviceAllocations error:%v", err)
+		return err
+	}
+	devices := alloc[schedulingv1alpha1.RDMA]
+	if len(devices) == 0 {
+		klog.V(5).Infof("no rdma alloc info in pod anno, %s", containerReq.PodMeta.Name)
+		return nil
+	}
+
+	// Collect into a local slice first so that a failure half-way through does
+	// not leave a partial device list in the response.
+	injected := make([]*protocol.LinuxDevice, 0, len(devices)+1)
+	cmDevice, err := newCharDevice(RdmaCmDir)
+	if err != nil {
+		klog.Errorf("InjectDevice: get rdma_cm device from %s error:%v", RdmaCmDir, err)
+		return err
+	}
+	injected = append(injected, cmDevice)
+
+	for _, device := range devices {
+		// Both VF and PF of the same device are not allowed: when VFs are
+		// allocated only they are injected, otherwise the PF itself is.
+		if device.Extension != nil && len(device.Extension.VirtualFunctions) > 0 {
+			for _, vf := range device.Extension.VirtualFunctions {
+				vfDevice, err := newUVerbsDevice(vf.BusID)
+				if err != nil {
+					klog.Errorf("InjectDevice: get uverbs device of VF %v error:%v", vf.BusID, err)
+					return err
+				}
+				injected = append(injected, vfDevice)
+			}
+			continue
+		}
+
+		pfDevice, err := newUVerbsDevice(device.ID)
+		if err != nil {
+			klog.Errorf("InjectDevice: get uverbs device of PF %v error:%v", device.ID, err)
+			return err
+		}
+		injected = append(injected, pfDevice)
+	}
+	// Append rather than assign so that devices added by other hooks survive.
+	containerCtx.Response.AddContainerDevices = append(containerCtx.Response.AddContainerDevices, injected...)
+	klog.V(4).Infof("InjectDevice: add %d devices to container of pod %s", len(injected), containerReq.PodMeta.Name)
+	return nil
+}
+
+// newUVerbsDevice builds the device entry for the uverbs node that belongs to
+// the PCI function at pciAddress.
+func newUVerbsDevice(pciAddress string) (*protocol.LinuxDevice, error) {
+	uverbs, err := getUVerbsViaPciAdd(pciAddress)
+	if err != nil {
+		return nil, err
+	}
+	return newCharDevice(uverbs)
+}
+
+// newCharDevice stats devicePath and describes it as a character device with
+// file mode 0666.
+func newCharDevice(devicePath string) (*protocol.LinuxDevice, error) {
+	numbers, err := getDeviceNumbers(devicePath)
+	if err != nil {
+		return nil, err
+	}
+	return &protocol.LinuxDevice{
+		Path:          devicePath,
+		Type:          "c",
+		Major:         numbers[0],
+		Minor:         numbers[1],
+		FileModeValue: 0666,
+	}, nil
+}
+
+// getUVerbsViaPciAdd returns the device node path under IBDevDir named after
+// the first entry of the infiniband_verbs sysfs directory of pciAddress.
+func getUVerbsViaPciAdd(pciAddress string) (string, error) {
+	pciDir := filepath.Join(SysBusPci, pciAddress, "infiniband_verbs")
+	files, err := os.ReadDir(pciDir)
+	if err != nil {
+		return "", fmt.Errorf("failed to get uverbs: %w", err)
+	}
+	// An empty directory comes with a nil error, so it must not be formatted
+	// through err.
+	if len(files) == 0 {
+		return "", fmt.Errorf("failed to get uverbs: no entry in %s", pciDir)
+	}
+	return filepath.Join(IBDevDir, files[0].Name()), nil
+}
+
+// major extracts the major number from a Linux dev_t, whose bits 8-19 hold the
+// low 12 bits of the major number and bits 44-63 the high bits.
+func major(dev uint64) int64 {
+	return int64((dev>>8)&0xfff) | int64((dev>>32)&0xfffff000)
+}
+
+// minor extracts the minor number from a Linux dev_t, whose bits 0-7 hold the
+// low 8 bits of the minor number and bits 20-43 the high bits.
+func minor(dev uint64) int64 {
+	return int64(dev&0xff) | int64((dev>>12)&0xffffff00)
+}
+
+// getDeviceNumbers stats devicePath and returns its device numbers as
+// []int64{major, minor}.
+func getDeviceNumbers(devicePath string) ([]int64, error) {
+	fileInfo, err := os.Stat(devicePath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to stat device file: %w", err)
+	}
+	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
+	if !ok {
+		return nil, fmt.Errorf("unexpected stat type %T for device file %s", fileInfo.Sys(), devicePath)
+	}
+	return []int64{major(stat.Rdev), minor(stat.Rdev)}, nil
+}
diff --git a/pkg/koordlet/runtimehooks/protocol/container_context.go b/pkg/koordlet/runtimehooks/protocol/container_context.go
index a39ba04a9..5c76bdec8 100644
--- a/pkg/koordlet/runtimehooks/protocol/container_context.go
+++ b/pkg/koordlet/runtimehooks/protocol/container_context.go
@@ -181,8 +181,18 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
 }
 
 type ContainerResponse struct {
-	Resources        Resources
-	AddContainerEnvs map[string]string
+	Resources           Resources
+	AddContainerEnvs    map[string]string
+	AddContainerDevices []*LinuxDevice
+}
+
+// LinuxDevice describes a device node that a hook asks the runtime to add to a container.
+type LinuxDevice struct {
+	Path          string // path of the device node
+	Type          string // device type; the rdma hook sets "c"
+	Major         int64  // major device number
+	Minor         int64  // minor device number
+	FileModeValue uint32 // file mode passed to the runtime for the device node
 }
 
 func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
@@ -279,6 +289,21 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
 		}
 	}
 
+	for _, dev := range c.Response.AddContainerDevices {
+		if dev == nil {
+			continue
+		}
+		adjust.AddDevice(&api.LinuxDevice{
+			Path:  dev.Path,
+			Type:  dev.Type,
+			Major: dev.Major,
+			Minor: dev.Minor,
+			FileMode: &api.OptionalFileMode{
+				Value: dev.FileModeValue,
+			},
+		})
+	}
+
 	c.Update()
 
 	return adjust, update, nil