From 700e3bf416e8244e62d03737f5d5db4f80d48e07 Mon Sep 17 00:00:00 2001
From: "Jose Rivera (Hose)"
Date: Mon, 4 Dec 2023 11:22:38 -0800
Subject: [PATCH] =?UTF-8?q?PWX-31340:=20Add=20capability=20to=20handle=20f?=
 =?UTF-8?q?atal=20pre-flight=20error=20which=20will=E2=80=A6=20(#1364)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* PWX-31340: Add capability to handle fatal pre-flight error which will… (#1356)

* PWX-31340: Add capability to handle fatal pre-flight error which will prevent PX startup.

Signed-off-by: Jose Rivera

* PWX-31340: Make sure that the 'passed' flag is set correctly.

Signed-off-by: Jose Rivera

* PWX-31340: PR review suggestion: Update CRD description & add comments explaining fatal event msg collection/output.

Signed-off-by: Jose Rivera

* PWX-31340: Review changes - Use const and clean up output msg.

Signed-off-by: Jose Rivera

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera

---------

Signed-off-by: Jose Rivera

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera

---------

Signed-off-by: Jose Rivera
---
 deploy/crds/core_v1_storagenode_crd.yaml  |   3 +
 drivers/storage/portworx/portworx_test.go | 115 +++++++++++++++++++++-
 drivers/storage/portworx/preflight.go     |  30 +++++-
 pkg/apis/core/v1/storagenode.go           |   2 +
 4 files changed, 146 insertions(+), 4 deletions(-)

diff --git a/deploy/crds/core_v1_storagenode_crd.yaml b/deploy/crds/core_v1_storagenode_crd.yaml
index f4e1c408f..829969abf 100644
--- a/deploy/crds/core_v1_storagenode_crd.yaml
+++ b/deploy/crds/core_v1_storagenode_crd.yaml
@@ -148,6 +148,9 @@ spec:
               success:
                 type: boolean
                 description: If true, the check was successful
+              result:
+                type: string
+                description: 'Result of the check: fatal, warning, or success'
               geography:
                 type: object
                 description: Contains topology information for the storage node.
diff --git a/drivers/storage/portworx/portworx_test.go b/drivers/storage/portworx/portworx_test.go
index 989a9ee39..11911669c 100644
--- a/drivers/storage/portworx/portworx_test.go
+++ b/drivers/storage/portworx/portworx_test.go
@@ -480,11 +480,124 @@ func TestValidateCheckFailure(t *testing.T) {
 	require.Len(t, recorder.Events, 3)
 	<-recorder.Events // Pop first event which is Default telemetry enabled event
 	require.Contains(t, <-recorder.Events,
-		fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check failed"))
+		fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check"))
 	require.Contains(t, <-recorder.Events,
 		fmt.Sprintf("%v %v %s", v1.EventTypeNormal, util.PassPreFlight, "Not enabling PX-StoreV2"))
 }
 
+func TestValidateCheckFatal(t *testing.T) {
+	driver := portworx{}
+	cluster := &corev1.StorageCluster{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "px-cluster",
+			Namespace: "kube-test",
+			Annotations: map[string]string{
+				pxutil.AnnotationPreflightCheck: "true",
+			},
+		},
+	}
+
+	labels := map[string]string{
+		"name": pxPreFlightDaemonSetName,
+	}
+
+	clusterRef := metav1.NewControllerRef(cluster, pxutil.StorageClusterKind())
+	preflightDS := &appsv1.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            pxPreFlightDaemonSetName,
+			Namespace:       cluster.Namespace,
+			Labels:          labels,
+			UID:             types.UID("preflight-ds-uid"),
+			OwnerReferences: []metav1.OwnerReference{*clusterRef},
+		},
+		Spec: appsv1.DaemonSetSpec{
+			Selector: &metav1.LabelSelector{
+				MatchLabels: labels,
+			},
+		},
+	}
+
+	checks := []corev1.CheckResult{
+		{
+			Type:    "usage",
+			Reason:  "fatal: PX-StoreV2 unsupported storage disk spec: unsupported cloud spec drive type found type=io2,size=100,iops=4000, only support [\"gp3\" \"io1\"]",
+			Success: false,
+			Result:  "fatal",
+		},
+		{
+			Type:    "usage",
+			Reason:  "px-runc: PX-StoreV2 unsupported storage disk spec: 'consume unused' (-A/-a) options not valid with PX-StoreV2",
+			Success: false,
+		},
+		{
+			Type:    "status",
+			Reason:  "oci-mon: pre-flight completed",
+			Success: true,
+		},
+	}
+
+	status := corev1.NodeStatus{
+		Checks: checks,
+	}
+
+	storageNode := &corev1.StorageNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "node-1",
+			Namespace:       cluster.Namespace,
+			OwnerReferences: []metav1.OwnerReference{*clusterRef},
+		},
+		Status: status,
+	}
+
+	preFlightPod1 := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:            "preflight-1",
+			Namespace:       cluster.Namespace,
+			OwnerReferences: []metav1.OwnerReference{{UID: preflightDS.UID}},
+		},
+		Status: v1.PodStatus{
+			ContainerStatuses: []v1.ContainerStatus{
+				{
+					Name:  "portworx",
+					Ready: true,
+				},
+			},
+		},
+	}
+
+	coreops.SetInstance(coreops.New(fakek8sclient.NewSimpleClientset()))
+	k8sClient := testutil.FakeK8sClient(preflightDS)
+
+	err := k8sClient.Create(context.TODO(), preFlightPod1)
+	require.NoError(t, err)
+
+	preflightDS.Status.DesiredNumberScheduled = int32(1)
+	err = k8sClient.Status().Update(context.TODO(), preflightDS)
+	require.NoError(t, err)
+
+	recorder := record.NewFakeRecorder(100)
+	err = driver.Init(k8sClient, runtime.NewScheme(), recorder)
+	require.NoError(t, err)
+
+	err = driver.SetDefaultsOnStorageCluster(cluster)
+	require.NoError(t, err)
+
+	err = k8sClient.Create(context.TODO(), storageNode)
+	require.NoError(t, err)
+
+	err = driver.Validate(cluster)
+	require.Error(t, err)
+	require.NotEmpty(t, recorder.Events)
+	require.Len(t, recorder.Events, 4)
+	<-recorder.Events // Pop first event which is Default telemetry enabled event
+	require.Contains(t,
+		<-recorder.Events,
+		fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check"))
+	require.Contains(t, <-recorder.Events,
+		fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "fatal: PX-StoreV2 unsupported storage disk spec"))
+	require.Contains(t, <-recorder.Events,
+		fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "PX-StoreV2 pre-check failed"))
+}
+
 func TestValidateMissingRequiredCheck(t *testing.T) {
 	driver := portworx{}
 	cluster := &corev1.StorageCluster{

diff --git a/drivers/storage/portworx/preflight.go b/drivers/storage/portworx/preflight.go
index 6c0644d6b..14e957a4a 100644
--- a/drivers/storage/portworx/preflight.go
+++ b/drivers/storage/portworx/preflight.go
@@ -38,6 +38,8 @@ const (
 	// DefCmetaVsphere default metadata cloud device for DMthin Vsphere
 	DefCmetaVsphere = "type=eagerzeroedthick,size=64"
 	preFlightOutputLog = "/var/cores/px-pre-flight-output.log"
+	// Fatal is the result string reported by a fatal pre-flight check
+	Fatal = "fatal"
 )
 
 // PreFlightPortworx provides a set of APIs to uninstall portworx
@@ -327,6 +329,8 @@ func (u *preFlightPortworx) processNodesChecks(recorder record.EventRecorder, st
 	}
 
 	passed := true
+	isFatal := false
+	fmsgs := []string{}
 	for _, node := range storageNodes {
 		// Process storageNode checks list for failures. Also make sure the "status" check entry
 		// exists, this indicates all the checks were submitted from the pre-flight pod.
@@ -344,15 +348,27 @@ func (u *preFlightPortworx) processNodesChecks(recorder record.EventRecorder, st
 				continue
 			}
 
-			msg := fmt.Sprintf("%s pre-flight check ", check.Type)
+			msg := fmt.Sprintf("%s pre-flight check: ", check.Type)
 			if check.Success {
 				msg = msg + "passed: " + check.Reason
 				k8sutil.InfoEvent(recorder, u.cluster, util.PassPreFlight, msg)
 				continue
 			}
 
-			msg = msg + "failed: " + check.Reason
-			k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, msg)
+			passed = false // pre-flight check failed; keep going so all failures are logged
+
+			if check.Result == Fatal {
+				// This loop processes check results looking for any failures. However, there
+				// is a difference between a plain failure and a "fatal" failure, so collect
+				// all fatal msgs and output them all at once instead of intermixing them
+				// with other failed events.
+				fmsgs = append(fmsgs, check.Reason)
+				isFatal = true
+				continue
+			}
+
+			msg = msg + check.Reason
+			k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, msg)
 		}
 
 		if !statusExists {
@@ -361,6 +377,14 @@
 		}
 	}
 
+	if isFatal {
+		// Output all the "fatal" events collected above so they are not intermixed with other failed events.
+		for _, fmsg := range fmsgs {
+			k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, fmsg)
+		}
+		u.hardFail = true // Set hardFail to prevent PX startup
+	}
+
 	return passed
 }

diff --git a/pkg/apis/core/v1/storagenode.go b/pkg/apis/core/v1/storagenode.go
index c243d35ba..7feece004 100644
--- a/pkg/apis/core/v1/storagenode.go
+++ b/pkg/apis/core/v1/storagenode.go
@@ -100,6 +100,8 @@ type CheckResult struct {
 	Reason string `json:"reason,omitempty"`
 	// Success indicates if the check was successful or failed
 	Success bool `json:"success,omitempty"`
+	// Result of the check: fatal, warning, or success
+	Result string `json:"result,omitempty"`
 }
 
 // NetworkStatus network status of the storage node
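
For reference, below is a minimal standalone sketch (not part of the patch) of the control flow this change introduces: plain failures are reported inline, while fatal reasons are collected and emitted together at the end, and a fatal result sets a hard-fail flag that prevents PX startup. CheckResult and the printed events are simplified stand-ins for the operator's corev1.CheckResult and k8sutil event helpers.

package main

import "fmt"

// CheckResult is a simplified stand-in for corev1.CheckResult,
// including the new Result field added by this patch.
type CheckResult struct {
	Type    string
	Reason  string
	Success bool
	Result  string // "fatal", "warning", or "success"
}

// Fatal mirrors the new const in preflight.go.
const Fatal = "fatal"

// processChecks mirrors the patched loop. It reports whether all checks
// passed and whether a fatal result was seen (hardFail).
func processChecks(checks []CheckResult) (passed, hardFail bool) {
	passed = true
	var fmsgs []string
	for _, check := range checks {
		msg := fmt.Sprintf("%s pre-flight check: ", check.Type)
		if check.Success {
			fmt.Println(msg + "passed: " + check.Reason)
			continue
		}
		passed = false // keep going so all failures are logged
		if check.Result == Fatal {
			// Collect fatal reasons; they are output together below
			// instead of being intermixed with other failed events.
			fmsgs = append(fmsgs, check.Reason)
			continue
		}
		fmt.Println(msg + check.Reason) // stand-in for a warning event
	}
	for _, fmsg := range fmsgs {
		fmt.Println(fmsg) // stand-in for k8sutil.WarningEvent
	}
	return passed, len(fmsgs) > 0
}

func main() {
	passed, hardFail := processChecks([]CheckResult{
		{Type: "usage", Reason: "fatal: PX-StoreV2 unsupported storage disk spec", Success: false, Result: Fatal},
		{Type: "status", Reason: "oci-mon: pre-flight completed", Success: true},
	})
	fmt.Printf("passed=%v hardFail=%v\n", passed, hardFail)
}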