Skip to content

Commit

Permalink
Merge pull request microsoft#2040 from katiewasnothere/kabaldau/retry…
Browse files Browse the repository at this point in the history
…_find_dev_nodes

Add the ability to retry when looking for hot added device nodes
  • Loading branch information
katiewasnothere authored Mar 13, 2024
2 parents 4f70fdf + 56ad47f commit 1ad9c65
Showing 1 changed file with 57 additions and 34 deletions.
91 changes: 57 additions & 34 deletions guest/runtime/hcsv2/spec_devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
Expand All @@ -30,10 +31,15 @@ const (
// on the spec and updates the spec so that the correct device nodes can be mounted
// into the resulting container by the runtime.
func addAssignedDevice(ctx context.Context, spec *oci.Spec) error {
// Add an explicit timeout before we try to find the dev nodes so we
// aren't waiting forever.
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()

for _, d := range spec.Windows.Devices {
switch d.IDType {
case vpciDeviceIDTypeLegacy, vpciDeviceIDType:
// validate that the device is ready
// validate that the device is available
fullPCIPath, err := pci.FindDeviceFullPath(ctx, d.ID)
if err != nil {
return errors.Wrapf(err, "failed to find device pci path for device %v", d)
Expand Down Expand Up @@ -61,47 +67,64 @@ func devicePathsFromPCIPath(ctx context.Context, pciPath string) ([]*devices.Dev
return nil, err
}

// get all host dev devices
hostDevices, err := devices.HostDevices()
if err != nil {
return nil, err
}

// some drivers create multiple dev nodes associated with the PCI device
out := []*devices.Device{}

// find corresponding entries in sysfs
for _, d := range hostDevices {
major := d.Rule.Major
minor := d.Rule.Minor

deviceTypeString := ""
switch d.Rule.Type {
case devices.BlockDevice:
deviceTypeString = blockType
case devices.CharDevice:
deviceTypeString = charType
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("find the device nodes for sysfs pci path cancelled: %w", ctx.Err())
default:
return nil, errors.New("unsupported device type")
}

syfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor)
sysfsFullPath, err := filepath.EvalSymlinks(syfsDevPath)
// some drivers create multiple dev nodes associated with the PCI device
out := []*devices.Device{}

// get all host dev devices
hostDevices, err := devices.HostDevices()
if err != nil {
// Some drivers will make dev nodes that do not have a matching block or
// char device -- skip those.
log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
continue
return nil, err
}
if strings.HasPrefix(sysfsFullPath, pciFullPath) {
out = append(out, d)

// find corresponding entries in sysfs
for _, d := range hostDevices {
major := d.Rule.Major
minor := d.Rule.Minor

log.G(ctx).WithField("device", d).Infof("looking at device: %+v", d)

deviceTypeString := ""
switch d.Rule.Type {
case devices.BlockDevice:
deviceTypeString = blockType
case devices.CharDevice:
deviceTypeString = charType
default:
return nil, errors.New("unsupported device type")
}

sysfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor)
sysfsFullPath, err := filepath.EvalSymlinks(sysfsDevPath)
if err != nil {
// Some drivers will make dev nodes that do not have a matching block or
// char device -- skip those.
log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
continue
}
if strings.HasPrefix(sysfsFullPath, pciFullPath) {
out = append(out, d)
}
}

if len(out) != 0 {
return out, nil
}
}

if len(out) == 0 {
return nil, errors.New("failed to find the device nodes for sysfs pci path")
// There is not a generic way to determine when a device is ready for use after
// being hot-added. As a result, there may be a race between when the device
// was hot-added and when the related /dev nodes are read from the filesystem.
// Best we can do is retry until we get results. However, it's still possible we
// will miss some /dev nodes since we don't know ahead of time how many to expect.
// TODO(katiewasnothere): find a better way to make sure we find all the nodes.
time.Sleep(time.Millisecond * 500)
}
return out, nil
}

func addLinuxDeviceToSpec(ctx context.Context, hostDevice *devices.Device, spec *oci.Spec, addCgroupDevice bool) {
Expand Down

0 comments on commit 1ad9c65

Please sign in to comment.