Skip to content

Commit

Permalink
koordlet: rdma device inject (koordinator-sh#2285)
Browse files Browse the repository at this point in the history
Signed-off-by: [email protected] <[email protected]>
Signed-off-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
ferris-cx authored and j4ckstraw committed Dec 4, 2024
1 parent 38130aa commit 6e4f99e
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 2 deletions.
9 changes: 9 additions & 0 deletions pkg/koordlet/runtimehooks/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/cpuset"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/gpu"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/groupidentity"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/rdma"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/resctrl"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/tc"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/terwayqos"
Expand Down Expand Up @@ -60,6 +61,12 @@ const (
// beta: v1.1
GPUEnvInject featuregate.Feature = "GPUEnvInject"

// RDMADeviceInject injects RDMA device info according to the allocation result from koord-scheduler.
//
// owner: @ZiMengSheng
// alpha: v1.6
RDMADeviceInject featuregate.Feature = "RDMADeviceInject"

// BatchResource sets request and limits of cpu and memory on cgroup file according to batch resources.
//
// owner: @saintube @zwzhang0107
Expand Down Expand Up @@ -101,6 +108,7 @@ var (
GroupIdentity: {Default: true, PreRelease: featuregate.Beta},
CPUSetAllocator: {Default: true, PreRelease: featuregate.Beta},
GPUEnvInject: {Default: false, PreRelease: featuregate.Alpha},
RDMADeviceInject: {Default: false, PreRelease: featuregate.Alpha},
BatchResource: {Default: true, PreRelease: featuregate.Beta},
CPUNormalization: {Default: false, PreRelease: featuregate.Alpha},
CoreSched: {Default: false, PreRelease: featuregate.Alpha},
Expand All @@ -113,6 +121,7 @@ var (
GroupIdentity: groupidentity.Object(),
CPUSetAllocator: cpuset.Object(),
GPUEnvInject: gpu.Object(),
RDMADeviceInject: rdma.Object(),
BatchResource: batchresource.Object(),
CPUNormalization: cpunormalization.Object(),
CoreSched: coresched.Object(),
Expand Down
165 changes: 165 additions & 0 deletions pkg/koordlet/runtimehooks/hooks/rdma/rdma.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rdma

import (
"fmt"
"os"
"path/filepath"
"syscall"

"k8s.io/klog/v2"

ext "github.com/koordinator-sh/koordinator/apis/extension"
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol"
rmconfig "github.com/koordinator-sh/koordinator/pkg/runtimeproxy/config"
)

// Host paths the rdma hook reads when resolving devices.
const (
	// IBDevDir is the directory under which the uverbs device paths are built.
	IBDevDir = "/dev/infiniband"
	// RdmaCmDir is the path of the rdma_cm node, located inside IBDevDir.
	RdmaCmDir = IBDevDir + "/rdma_cm"
	// SysBusPci is the sysfs directory that is joined with a PCI address to
	// find that device's infiniband_verbs entries.
	SysBusPci = "/sys/bus/pci/devices"
)

// rdmaPlugin is the runtime hook plugin that injects RDMA devices into
// containers. It carries no state of its own.
type rdmaPlugin struct{}

// Register hooks InjectDevice into the PreCreateContainer stage under the
// name "rdma device inject". The op argument is not used by this plugin.
func (p *rdmaPlugin) Register(op hooks.Options) {
	klog.V(5).Infof("register hook %v", "rdma device inject")
	// The description previously read as the GPU env hook's; it now states
	// what this hook actually does.
	hooks.Register(rmconfig.PreCreateContainer, "rdma device inject", "inject rdma devices into container", p.InjectDevice)
}

// singleton holds the lazily created plugin instance returned by Object.
var singleton *rdmaPlugin

// Object returns the package-wide rdmaPlugin, creating it on the first call.
// The lazy initialisation takes no lock.
func Object() *rdmaPlugin {
	if singleton != nil {
		return singleton
	}
	singleton = &rdmaPlugin{}
	return singleton
}

// InjectDevice adds the RDMA character devices allocated to the pod to the
// container response.
//
// It reads the device allocations from the pod annotations. When the pod has
// no RDMA allocation it returns nil without touching the response. Otherwise
// it sets Response.AddContainerDevices to the rdma_cm node followed by one
// uverbs node per allocated device: for an allocation that lists virtual
// functions, one node per VF bus ID; otherwise one node for the device ID.
//
// It returns an error when proto is not a non-nil *protocol.ContainerContext,
// when the allocation annotation cannot be parsed, or when a device node
// cannot be resolved or stat'ed.
func (p *rdmaPlugin) InjectDevice(proto protocol.HooksProtocol) error {
	// Checked assertion: a nil or foreign protocol must surface as an error,
	// not as a panic inside the hook.
	containerCtx, ok := proto.(*protocol.ContainerContext)
	if !ok {
		return fmt.Errorf("unexpected protocol type %T for plugin rdma", proto)
	}
	if containerCtx == nil {
		return fmt.Errorf("container protocol is nil for plugin rdma")
	}
	containerReq := containerCtx.Request
	alloc, err := ext.GetDeviceAllocations(containerReq.PodAnnotations)
	if err != nil {
		klog.Errorf("InjectDevice: GetDeviceAllocations error:%v", err)
		return err
	}
	// Indexing a missing key yields an empty slice, so one length check
	// covers both "no entry" and "empty entry".
	devices := alloc[schedulingv1alpha1.RDMA]
	if len(devices) == 0 {
		klog.V(5).Infof("no rdma alloc info in pod anno, %s", containerReq.PodMeta.Name)
		return nil
	}

	cmDevice, err := newRDMACharDevice(RdmaCmDir)
	if err != nil {
		klog.Errorf("InjectDevice: GetDeviceNumbers deviceinfoCM from %s error:%v", RdmaCmDir, err)
		return err
	}
	containerCtx.Response.AddContainerDevices = []*protocol.LinuxDevice{cmDevice}

	for _, device := range devices {
		// Both VF and PF of the same device are not allowed: when virtual
		// functions are listed, only they are injected.
		if device.Extension != nil && len(device.Extension.VirtualFunctions) > 0 {
			for _, vf := range device.Extension.VirtualFunctions {
				uverbsOfVF, err := getUVerbsViaPciAdd(vf.BusID)
				if err != nil {
					klog.Errorf("InjectDevice: getUVerbsViaPciAdd uverbsOfVF error:%v", err)
					return err
				}
				vfDevice, err := newRDMACharDevice(uverbsOfVF)
				if err != nil {
					klog.Errorf("InjectDevice: GetDeviceNumbers deviceinfoVf from %s error:%v", uverbsOfVF, err)
					return err
				}
				containerCtx.Response.AddContainerDevices = append(containerCtx.Response.AddContainerDevices, vfDevice)
			}
			continue
		}

		uverbs, err := getUVerbsViaPciAdd(device.ID)
		if err != nil {
			klog.Errorf("InjectDevice: getUVerbsViaPciAdd error:%v", err)
			return err
		}
		pfDevice, err := newRDMACharDevice(uverbs)
		if err != nil {
			klog.Errorf("InjectDevice: GetDeviceNumbers deviceinfoPf from %s error:%v", uverbs, err)
			return err
		}
		containerCtx.Response.AddContainerDevices = append(containerCtx.Response.AddContainerDevices, pfDevice)
	}
	klog.V(4).Infof("InjectDevice: AddContainerDevices: %v", containerCtx.Response.AddContainerDevices)
	return nil
}

// newRDMACharDevice stats devicePath and returns a character-device entry
// for it with mode 0666 and the major/minor numbers read from the node.
func newRDMACharDevice(devicePath string) (*protocol.LinuxDevice, error) {
	numbers, err := getDeviceNumbers(devicePath)
	if err != nil {
		return nil, err
	}
	return &protocol.LinuxDevice{
		Path:          devicePath,
		Type:          "c",
		Major:         numbers[0],
		Minor:         numbers[1],
		FileModeValue: 0666,
	}, nil
}

// getUVerbsViaPciAdd maps a PCI address to a uverbs device path.
//
// It lists <SysBusPci>/<pciAddress>/infiniband_verbs and returns the first
// entry's name joined onto IBDevDir. An error is returned when the directory
// cannot be read or contains no entries.
func getUVerbsViaPciAdd(pciAddress string) (string, error) {
	pciDir := filepath.Join(SysBusPci, pciAddress, "infiniband_verbs")
	files, err := os.ReadDir(pciDir)
	if err != nil {
		return "", fmt.Errorf("failed to get uverbs: %w", err)
	}
	// Handled separately from the read error: err is nil here, so it must not
	// be dereferenced to build the message.
	if len(files) == 0 {
		return "", fmt.Errorf("failed to get uverbs: no entries in %s", pciDir)
	}
	return filepath.Join(IBDevDir, files[0].Name()), nil
}

// major extracts the major number from a Linux device number.
//
// In the Linux dev_t encoding the major number occupies bits 8-19 (low 12
// bits) and bits 44-63 (high 20 bits); bits 20-43 belong to the minor number
// and must not leak into the result.
func major(dev uint64) int64 {
	return int64(((dev >> 8) & 0xfff) | ((dev >> 32) & 0xfffff000))
}

// minor extracts the minor number from a device number: the low 8 bits are
// taken as-is and the bits selected by 0xffffff00 after a 12-bit right shift
// are placed above them.
func minor(dev uint64) int64 {
	lowBits := dev & 0xff
	highBits := (dev >> 12) & 0xffffff00
	return int64(lowBits | highBits)
}

// getDeviceNumbers stats devicePath and returns its device numbers as a
// two-element slice: index 0 is the major number, index 1 the minor number.
//
// An error is returned when the path cannot be stat'ed or when the platform
// does not expose a *syscall.Stat_t for it.
func getDeviceNumbers(devicePath string) ([]int64, error) {
	fileInfo, err := os.Stat(devicePath)
	if err != nil {
		return nil, fmt.Errorf("failed to stat device file: %w", err)
	}
	// Checked assertion so an unexpected Sys() value is reported instead of
	// panicking.
	stat, ok := fileInfo.Sys().(*syscall.Stat_t)
	if !ok {
		return nil, fmt.Errorf("failed to stat device file: unsupported stat type %T for %s", fileInfo.Sys(), devicePath)
	}
	return []int64{major(stat.Rdev), minor(stat.Rdev)}, nil
}
28 changes: 26 additions & 2 deletions pkg/koordlet/runtimehooks/protocol/container_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,17 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
}

type ContainerResponse struct {
Resources Resources
AddContainerEnvs map[string]string
Resources Resources
AddContainerEnvs map[string]string
AddContainerDevices []*LinuxDevice
}

// LinuxDevice describes one device node that a hook asks to have added to a
// container.
type LinuxDevice struct {
	// Path is the device node path; Type is the device type string
	// (the rdma hook uses "c").
	Path, Type string
	// Major and Minor are the device numbers of the node.
	Major, Minor int64
	// FileModeValue is the file mode value requested for the node.
	FileModeValue uint32
}

func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
Expand Down Expand Up @@ -279,6 +288,21 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
}
}

if len(c.Response.AddContainerDevices) != 0 {
for i := range c.Response.AddContainerDevices {
adjust.AddDevice(&api.LinuxDevice{
Path: c.Response.AddContainerDevices[i].Path,
Type: c.Response.AddContainerDevices[i].Type,
Major: c.Response.AddContainerDevices[i].Major,
Minor: c.Response.AddContainerDevices[i].Minor,
FileMode: &api.OptionalFileMode{
Value: c.Response.AddContainerDevices[i].FileModeValue,
},
})
}

}

c.Update()

return adjust, update, nil
Expand Down

0 comments on commit 6e4f99e

Please sign in to comment.