From 0f1a72e648caf273e685cc22fbbda6de87927b35 Mon Sep 17 00:00:00 2001 From: Tobias Giese Date: Thu, 11 Jul 2024 13:39:25 +0200 Subject: [PATCH 1/2] feat: implement MlxResetFW to reset the FW on VF changes Signed-off-by: Tobias Giese --- Dockerfile.sriov-network-config-daemon | 4 +++- pkg/helper/mock/mock_helper.go | 14 ++++++++++++++ pkg/plugins/mellanox/mellanox_plugin.go | 11 ++++++++++- pkg/vendors/mellanox/mellanox.go | 18 ++++++++++++++++++ pkg/vendors/mellanox/mock/mock_mellanox.go | 14 ++++++++++++++ 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/Dockerfile.sriov-network-config-daemon b/Dockerfile.sriov-network-config-daemon index 219d77c6d..35533448f 100644 --- a/Dockerfile.sriov-network-config-daemon +++ b/Dockerfile.sriov-network-config-daemon @@ -5,7 +5,9 @@ RUN make _build-sriov-network-config-daemon BIN_PATH=build/_output/cmd FROM quay.io/centos/centos:stream9 ARG MSTFLINT=mstflint -RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata $ARCH_DEP_PKGS && yum clean all +# We have to ensure that pciutils is installed. This package is needed for mstfwreset to succeed. +# xref pkg/vendors/mellanox/mellanox.go#L150 +RUN ARCH_DEP_PKGS=$(if [ "$(uname -m)" != "s390x" ]; then echo -n ${MSTFLINT} ; fi) && yum -y install hwdata pciutils $ARCH_DEP_PKGS && yum clean all LABEL io.k8s.display-name="sriov-network-config-daemon" \ io.k8s.description="This is a daemon that manage and config sriov network devices in Kubernetes cluster" COPY --from=builder /go/src/github.com/k8snetworkplumbingwg/sriov-network-operator/build/_output/cmd/sriov-network-config-daemon /usr/bin/ diff --git a/pkg/helper/mock/mock_helper.go b/pkg/helper/mock/mock_helper.go index 23b32a1d4..03e0da573 100644 --- a/pkg/helper/mock/mock_helper.go +++ b/pkg/helper/mock/mock_helper.go @@ -811,6 +811,20 @@ func (mr *MockHostHelpersInterfaceMockRecorder) MlxConfigFW(attributesToChange i return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxConfigFW", reflect.TypeOf((*MockHostHelpersInterface)(nil).MlxConfigFW), attributesToChange) } +// MlxResetFW mocks base method. +func (m *MockHostHelpersInterface) MlxResetFW(pciAddresses []string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MlxResetFW", pciAddresses) + ret0, _ := ret[0].(error) + return ret0 +} + +// MlxResetFW indicates an expected call of MlxResetFW. +func (mr *MockHostHelpersInterfaceMockRecorder) MlxResetFW(pciAddresses interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxResetFW", reflect.TypeOf((*MockHostHelpersInterface)(nil).MlxResetFW), pciAddresses) +} + // MstConfigReadData mocks base method. func (m *MockHostHelpersInterface) MstConfigReadData(arg0 string) (string, string, error) { m.ctrl.T.Helper() diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index e5e9a9913..827a226ac 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -20,6 +20,7 @@ type MellanoxPlugin struct { helpers helper.HostHelpersInterface } +var pciAddressesToReset []string var attributesToChange map[string]mlx.MlxNic var mellanoxNicsStatus map[string]map[string]sriovnetworkv1.InterfaceExt var mellanoxNicsSpec map[string]sriovnetworkv1.Interface @@ -52,6 +53,7 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS needDrain = false needReboot = false err = nil + pciAddressesToReset = []string{} attributesToChange = map[string]mlx.MlxNic{} mellanoxNicsStatus = map[string]map[string]sriovnetworkv1.InterfaceExt{} mellanoxNicsSpec = map[string]sriovnetworkv1.Interface{} @@ -132,6 +134,10 @@ func (p *MellanoxPlugin) OnNodeStateChange(new *sriovnetworkv1.SriovNetworkNodeS if needReboot || changeWithoutReboot { attributesToChange[ifaceSpec.PciAddress] = *attrs } + + if needReboot { + pciAddressesToReset = append(pciAddressesToReset, ifaceSpec.PciAddress) + } } // Set total VFs to 0 for mellanox interfaces with no spec @@ -202,7 +208,10 @@ func (p *MellanoxPlugin) Apply() error { return nil } log.Log.Info("mellanox plugin Apply()") - return p.helpers.MlxConfigFW(attributesToChange) + if err := p.helpers.MlxConfigFW(attributesToChange); err != nil { + return err + } + return p.helpers.MlxResetFW(pciAddressesToReset) } // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed diff --git a/pkg/vendors/mellanox/mellanox.go b/pkg/vendors/mellanox/mellanox.go index c6631ed28..82410c7f8 100644 --- a/pkg/vendors/mellanox/mellanox.go +++ b/pkg/vendors/mellanox/mellanox.go @@ -6,6 +6,7 @@ import ( "strconv" "strings" + kerrors "k8s.io/apimachinery/pkg/util/errors" "sigs.k8s.io/controller-runtime/pkg/log" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" @@ -60,6 +61,7 @@ type MellanoxInterface interface { GetMlxNicFwData(pciAddress string) (current, next *MlxNic, err error) MlxConfigFW(attributesToChange map[string]MlxNic) error + MlxResetFW(pciAddresses []string) error } type mellanoxHelper struct { @@ -141,6 +143,22 @@ func (m *mellanoxHelper) GetMellanoxBlueFieldMode(PciAddress string) (BlueFieldM return -1, fmt.Errorf("MellanoxBlueFieldMode(): unknown device status for %s", PciAddress) } +func (m *mellanoxHelper) MlxResetFW(pciAddresses []string) error { + log.Log.Info("mellanox-plugin resetFW()") + var errs []error + for _, pciAddress := range pciAddresses { + cmdArgs := []string{"-d", pciAddress, "--skip_driver", "-l", "3", "-y", "reset"} + log.Log.Info("mellanox-plugin: resetFW()", "cmd-args", cmdArgs) + // We have to ensure that pciutils is installed into the container image Dockerfile.sriov-network-config-daemon + _, stderr, err := m.utils.RunCommand("mstfwreset", cmdArgs...) + if err != nil { + log.Log.Error(err, "mellanox-plugin resetFW(): failed", "stderr", stderr) + errs = append(errs, err) + } + } + return kerrors.NewAggregate(errs) +} + func (m *mellanoxHelper) MlxConfigFW(attributesToChange map[string]MlxNic) error { log.Log.Info("mellanox-plugin configFW()") for pciAddr, fwArgs := range attributesToChange { diff --git a/pkg/vendors/mellanox/mock/mock_mellanox.go b/pkg/vendors/mellanox/mock/mock_mellanox.go index 78c1d4903..1a0ca3120 100644 --- a/pkg/vendors/mellanox/mock/mock_mellanox.go +++ b/pkg/vendors/mellanox/mock/mock_mellanox.go @@ -79,6 +79,20 @@ func (mr *MockMellanoxInterfaceMockRecorder) MlxConfigFW(attributesToChange inte return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxConfigFW", reflect.TypeOf((*MockMellanoxInterface)(nil).MlxConfigFW), attributesToChange) } +// MlxResetFW mocks base method. +func (m *MockMellanoxInterface) MlxResetFW(pciAddresses []string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "MlxResetFW", pciAddresses) + ret0, _ := ret[0].(error) + return ret0 +} + +// MlxResetFW indicates an expected call of MlxResetFW. +func (mr *MockMellanoxInterfaceMockRecorder) MlxResetFW(pciAddresses interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "MlxResetFW", reflect.TypeOf((*MockMellanoxInterface)(nil).MlxResetFW), pciAddresses) +} + // MstConfigReadData mocks base method. func (m *MockMellanoxInterface) MstConfigReadData(arg0 string) (string, string, error) { m.ctrl.T.Helper() From 08b524e388fd42bd4fe12dc8580e3f2fd22f628d Mon Sep 17 00:00:00 2001 From: Tobias Giese Date: Tue, 23 Jul 2024 21:47:59 +0200 Subject: [PATCH 2/2] chore: add feature flag for mellanox fw reset To not enable the new feature by default we want to add a feature flag first. Signed-off-by: Tobias Giese --- cmd/sriov-network-config-daemon/start.go | 15 +++++++++++++++ pkg/consts/constants.go | 3 +++ pkg/daemon/daemon.go | 14 ++++++++++++++ pkg/daemon/daemon_test.go | 9 ++++++--- pkg/plugins/mellanox/mellanox_plugin.go | 6 +++++- pkg/vars/vars.go | 3 +++ 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/cmd/sriov-network-config-daemon/start.go b/cmd/sriov-network-config-daemon/start.go index efe35bcbf..f003e5435 100644 --- a/cmd/sriov-network-config-daemon/start.go +++ b/cmd/sriov-network-config-daemon/start.go @@ -26,6 +26,7 @@ import ( "github.com/spf13/cobra" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -41,6 +42,7 @@ import ( snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/daemon" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms" @@ -276,6 +278,18 @@ func runStartCmd(cmd *cobra.Command, args []string) error { } go nodeWriter.Run(stopCh, refreshCh, syncCh) + // Init feature gates once to prevent race conditions. + defaultConfig := &sriovnetworkv1.SriovOperatorConfig{} + err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig) + if err != nil { + log.Log.Error(err, "Failed to get default SriovOperatorConfig object") + return err + } + featureGates := featuregate.New() + featureGates.Init(defaultConfig.Spec.FeatureGates) + vars.MlxPluginFwReset = featureGates.IsEnabled(consts.MellanoxFirmwareResetFeatureGate) + log.Log.Info("Enabled featureGates", "featureGates", featureGates.String()) + setupLog.V(0).Info("Starting SriovNetworkConfigDaemon") err = daemon.New( kClient, @@ -288,6 +302,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error { syncCh, refreshCh, eventRecorder, + featureGates, startOpts.disabledPlugins, ).Run(stopCh, exitCh) if err != nil { diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go index cbfe9ad98..f3c076111 100644 --- a/pkg/consts/constants.go +++ b/pkg/consts/constants.go @@ -139,6 +139,9 @@ const ( // ManageSoftwareBridgesFeatureGate: enables management of software bridges by the operator ManageSoftwareBridgesFeatureGate = "manageSoftwareBridges" + // MellanoxFirmwareResetFeatureGate: enables the firmware reset via mstfwreset before a reboot + MellanoxFirmwareResetFeatureGate = "mellanoxFirmwareReset" + // The path to the file on the host filesystem that contains the IB GUID distribution for IB VFs InfinibandGUIDConfigFilePath = SriovConfBasePath + "/infiniband/guids" ) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 0fd1ec2cb..5e327dc87 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -5,6 +5,7 @@ import ( "fmt" "math/rand" "os/exec" + "reflect" "sync" "time" @@ -23,6 +24,7 @@ import ( snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" sninformer "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/informers/externalversions" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms" @@ -82,6 +84,8 @@ type Daemon struct { workqueue workqueue.RateLimitingInterface eventRecorder *EventRecorder + + featureGate featuregate.FeatureGate } func New( @@ -95,6 +99,7 @@ func New( syncCh <-chan struct{}, refreshCh chan<- Message, er *EventRecorder, + featureGates featuregate.FeatureGate, disabledPlugins []string, ) *Daemon { return &Daemon{ @@ -113,6 +118,7 @@ func New( &workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(updateDelay), 1)}, workqueue.NewItemExponentialFailureRateLimiter(1*time.Second, maxUpdateBackoff)), "SriovNetworkNodeState"), eventRecorder: er, + featureGate: featureGates, disabledPlugins: disabledPlugins, } } @@ -286,6 +292,7 @@ func (dn *Daemon) operatorConfigAddHandler(obj interface{}) { } func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) { + oldCfg := old.(*sriovnetworkv1.SriovOperatorConfig) newCfg := new.(*sriovnetworkv1.SriovOperatorConfig) if newCfg.Namespace != vars.Namespace || newCfg.Name != consts.DefaultConfigName { log.Log.V(2).Info("unsupported SriovOperatorConfig", "namespace", newCfg.Namespace, "name", newCfg.Name) @@ -299,6 +306,13 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) { dn.disableDrain = newDisableDrain log.Log.Info("Set Disable Drain", "value", dn.disableDrain) } + + if !reflect.DeepEqual(oldCfg.Spec.FeatureGates, newCfg.Spec.FeatureGates) { + dn.featureGate.Init(newCfg.Spec.FeatureGates) + log.Log.Info("Updated featureGates", "featureGates", dn.featureGate.String()) + } + + vars.MlxPluginFwReset = dn.featureGate.IsEnabled(consts.MellanoxFirmwareResetFeatureGate) } func (dn *Daemon) nodeStateSyncHandler() error { diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index a1e29dcb2..d89e91991 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -18,18 +18,18 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" sriovnetworkv1 "github.com/k8snetworkplumbingwg/sriov-network-operator/api/v1" - mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock" - "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" - snclient "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned" snclientset "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/client/clientset/versioned/fake" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock" + mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/openshift" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/fake" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins/generic" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" + "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" ) func TestConfigDaemon(t *testing.T) { @@ -151,6 +151,8 @@ var _ = Describe("Config Daemon", func() { vendorHelper.EXPECT().PrepareNMUdevRule([]string{"0x1014", "0x154c"}).Return(nil).AnyTimes() vendorHelper.EXPECT().PrepareVFRepUdevRule().Return(nil).AnyTimes() + featureGates := featuregate.New() + sut = New( kClient, snclient, @@ -162,6 +164,7 @@ var _ = Describe("Config Daemon", func() { syncCh, refreshCh, er, + featureGates, nil, ) diff --git a/pkg/plugins/mellanox/mellanox_plugin.go b/pkg/plugins/mellanox/mellanox_plugin.go index 827a226ac..10b0152bb 100644 --- a/pkg/plugins/mellanox/mellanox_plugin.go +++ b/pkg/plugins/mellanox/mellanox_plugin.go @@ -9,6 +9,7 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/consts" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" plugin "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/plugins" + "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" mlx "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vendors/mellanox" ) @@ -211,7 +212,10 @@ func (p *MellanoxPlugin) Apply() error { if err := p.helpers.MlxConfigFW(attributesToChange); err != nil { return err } - return p.helpers.MlxResetFW(pciAddressesToReset) + if vars.MlxPluginFwReset { + return p.helpers.MlxResetFW(pciAddressesToReset) + } + return nil } // nicHasExternallyManagedPFs returns true if one of the ports(interface) of the NIC is marked as externally managed diff --git a/pkg/vars/vars.go b/pkg/vars/vars.go index 9321261dc..fc7108ed8 100644 --- a/pkg/vars/vars.go +++ b/pkg/vars/vars.go @@ -54,6 +54,9 @@ var ( // ManageSoftwareBridges global variable which reflects state of manageSoftwareBridges feature ManageSoftwareBridges = false + // MlxPluginFwReset global variable enables mstfwreset before rebooting a node on VF changes + MlxPluginFwReset = false + // FilesystemRoot used by test to mock interactions with filesystem FilesystemRoot = ""