Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the ability to retry when looking for hot added device nodes #2040

Merged
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 57 additions & 34 deletions internal/guest/runtime/hcsv2/spec_devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"path/filepath"
"strings"
"time"

"github.com/Microsoft/hcsshim/internal/guest/storage/pci"
"github.com/Microsoft/hcsshim/internal/log"
Expand All @@ -30,10 +31,15 @@ const (
// on the spec and updates the spec so that the correct device nodes can be mounted
// into the resulting container by the runtime.
func addAssignedDevice(ctx context.Context, spec *oci.Spec) error {
// Add an explicit timeout before we try to find the dev nodes so we
// aren't waiting forever.
ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()

for _, d := range spec.Windows.Devices {
switch d.IDType {
case vpciDeviceIDTypeLegacy, vpciDeviceIDType:
// validate that the device is ready
// validate that the device is available
fullPCIPath, err := pci.FindDeviceFullPath(ctx, d.ID)
if err != nil {
return errors.Wrapf(err, "failed to find device pci path for device %v", d)
Expand Down Expand Up @@ -61,47 +67,64 @@ func devicePathsFromPCIPath(ctx context.Context, pciPath string) ([]*devices.Dev
return nil, err
}

// get all host dev devices
hostDevices, err := devices.HostDevices()
if err != nil {
return nil, err
}

// some drivers create multiple dev nodes associated with the PCI device
out := []*devices.Device{}

// find corresponding entries in sysfs
for _, d := range hostDevices {
major := d.Rule.Major
minor := d.Rule.Minor

deviceTypeString := ""
switch d.Rule.Type {
case devices.BlockDevice:
deviceTypeString = blockType
case devices.CharDevice:
deviceTypeString = charType
for {
select {
case <-ctx.Done():
return nil, fmt.Errorf("find the device nodes for sysfs pci path cancelled: %w", ctx.Err())
default:
return nil, errors.New("unsupported device type")
}

syfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor)
sysfsFullPath, err := filepath.EvalSymlinks(syfsDevPath)
// some drivers create multiple dev nodes associated with the PCI device
out := []*devices.Device{}

// get all host dev devices
hostDevices, err := devices.HostDevices()
if err != nil {
// Some drivers will make dev nodes that do not have a matching block or
// char device -- skip those.
log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
continue
return nil, err
}
if strings.HasPrefix(sysfsFullPath, pciFullPath) {
out = append(out, d)

// find corresponding entries in sysfs
for _, d := range hostDevices {
major := d.Rule.Major
minor := d.Rule.Minor

log.G(ctx).WithField("device", d).Infof("looking at device: %+v", d)

deviceTypeString := ""
switch d.Rule.Type {
case devices.BlockDevice:
deviceTypeString = blockType
case devices.CharDevice:
deviceTypeString = charType
default:
return nil, errors.New("unsupported device type")
}

sysfsDevPath := fmt.Sprintf(sysfsDevPathFormat, deviceTypeString, major, minor)
sysfsFullPath, err := filepath.EvalSymlinks(sysfsDevPath)
if err != nil {
// Some drivers will make dev nodes that do not have a matching block or
// char device -- skip those.
log.G(ctx).WithError(err).Debugf("failed to find sysfs path for device %s", d.Path)
continue
}
if strings.HasPrefix(sysfsFullPath, pciFullPath) {
out = append(out, d)
}
}

if len(out) != 0 {
return out, nil
}
}

if len(out) == 0 {
return nil, errors.New("failed to find the device nodes for sysfs pci path")
// There is not a generic way to determine when a device is ready for use after
// being hot-added. As a result, there may be a race between when the device
// was hot-added and when the related /dev nodes are read from the filesystem.
// Best we can do is retry until we get results. However, it's still possible we
// will miss some /dev nodes since we don't know ahead of time how many to expect.
// TODO(katiewasnothere): find a better way to make sure we find all the nodes.
time.Sleep(time.Millisecond * 500)
}
return out, nil
}

func addLinuxDeviceToSpec(ctx context.Context, hostDevice *devices.Device, spec *oci.Spec, addCgroupDevice bool) {
Expand Down
Loading