From 3d466f91ecd9693cc733a4ec3fb0e105ec4e3063 Mon Sep 17 00:00:00 2001 From: Kathryn Baldauf Date: Mon, 26 Feb 2024 14:22:27 -0800 Subject: [PATCH] Add the ability to retry when looking for hot added device nodes Signed-off-by: Kathryn Baldauf --- internal/guest/runtime/hcsv2/spec_devices.go | 91 ++++++++++++-------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/internal/guest/runtime/hcsv2/spec_devices.go b/internal/guest/runtime/hcsv2/spec_devices.go index c56d209018..f4c403c503 100644 --- a/internal/guest/runtime/hcsv2/spec_devices.go +++ b/internal/guest/runtime/hcsv2/spec_devices.go @@ -8,6 +8,7 @@ import ( "fmt" "path/filepath" "strings" + "time" "github.com/Microsoft/hcsshim/internal/guest/storage/pci" "github.com/Microsoft/hcsshim/internal/log" @@ -30,10 +31,15 @@ const ( // on the spec and updates the spec so that the correct device nodes can be mounted // into the resulting container by the runtime. func addAssignedDevice(ctx context.Context, spec *oci.Spec) error { + // Add an explicit timeout before we try to find the dev nodes so we + // aren't waiting forever. + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + for _, d := range spec.Windows.Devices { switch d.IDType { case vpciDeviceIDTypeLegacy, vpciDeviceIDType: - // validate that the device is ready + // validate that the device is available fullPCIPath, err := pci.FindDeviceFullPath(ctx, d.ID) if err != nil { return errors.Wrapf(err, "failed to find device pci path for device %v", d) @@ -61,47 +67,64 @@ func devicePathsFromPCIPath(ctx context.Context, pciPath string) ([]*devices.Dev return nil, err } - // get all host dev devices - hostDevices, err := devices.HostDevices() - if err != nil { - return nil, err - } - - // some drivers create multiple dev nodes associated with the PCI device - out := []*devices.Device{} - - // find corresponding entries in sysfs - for _, d := range hostDevices { - major := d.Rule.Major - minor := d.Rule.Minor - - deviceTypeString := "" - switch d.Rule.Type { - case devices.BlockDevice: - deviceTypeString = blockType - case devices.CharDevice: - deviceTypeString = charType + for { + select { + case <-ctx.Done(): + return nil, fmt.Errorf("find the device nodes for sysfs pci path cancelled: %w", ctx.Err()) default: - return nil, errors.New("unsupported device type") } - syfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor) - sysfsFullPath, err := filepath.EvalSymlinks(syfsDevPath) + // some drivers create multiple dev nodes associated with the PCI device + out := []*devices.Device{} + + // get all host dev devices + hostDevices, err := devices.HostDevices() if err != nil { - // Some drivers will make dev nodes that do not have a matching block or - // char device -- skip those. - log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path) - continue + return nil, err } - if strings.HasPrefix(sysfsFullPath, pciFullPath) { - out = append(out, d) + + // find corresponding entries in sysfs + for _, d := range hostDevices { + major := d.Rule.Major + minor := d.Rule.Minor + + log.G(ctx).WithField("device", d).Infof("looking at device: %+v", d) + + deviceTypeString := "" + switch d.Rule.Type { + case devices.BlockDevice: + deviceTypeString = blockType + case devices.CharDevice: + deviceTypeString = charType + default: + return nil, errors.New("unsupported device type") + } + + sysfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor) + sysfsFullPath, err := filepath.EvalSymlinks(sysfsDevPath) + if err != nil { + // Some drivers will make dev nodes that do not have a matching block or + // char device -- skip those. + log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path) + continue + } + if strings.HasPrefix(sysfsFullPath, pciFullPath) { + out = append(out, d) + } + } + + if len(out) != 0 { + return out, nil } - } - if len(out) == 0 { - return nil, errors.New("failed to find the device nodes for sysfs pci path") + // There is not a generic way to determine when a device is ready for use after + // being hot-added. As a result, there may be a race between when the device + // was hot-added and when the related /dev nodes are read from the filesystem. + // Best we can do is retry until we get results. However, it's still possible we + // will miss some /dev nodes since we don't know ahead of time how many to expect. + // TODO(katiewasnothere): find a better way to make sure we find all the nodes. + time.Sleep(time.Millisecond * 500) } - return out, nil } func addLinuxDeviceToSpec(ctx context.Context, hostDevice *devices.Device, spec *oci.Spec, addCgroupDevice bool) {