PWX-31340: Add capability to handle fatal pre-flight error which will… (#1364)

* PWX-31340: Add capability to handle fatal pre-flight error which will… (#1356)

* PWX-31340: Add capability to handle fatal pre-flight error which will prevent PX startup.

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: Make sure that 'passed' flag is set correctly.

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: PR review suggestion: Update CRD description & add comments explaining fatal event msg collection/output.

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: Review changes - Use const and cleanup output msg.

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera <jose@portworx.com>

---------

Signed-off-by: Jose Rivera <jose@portworx.com>

* PWX-31340: Fix unit test failure because of the changed log msg.

Signed-off-by: Jose Rivera <jose@portworx.com>

---------

Signed-off-by: Jose Rivera <jose@portworx.com>
jrivera-px authored Dec 4, 2023
1 parent 95f7220 commit 700e3bf
Showing 4 changed files with 146 additions and 4 deletions.
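
At a high level, the change adds a Result field to CheckResult and treats a check whose result is "fatal" as a hard failure that blocks PX startup, while ordinary failed checks only mark the pre-flight run as not passed. Below is a minimal, self-contained sketch of that classification idea under simplified assumptions; the CheckResult struct and classifyChecks helper here are illustrative stand-ins, not the operator's actual API.

package main

import "fmt"

// CheckResult is a simplified stand-in for the operator's pre-flight check result.
type CheckResult struct {
	Type    string
	Reason  string
	Success bool
	Result  string // e.g. "fatal"; empty for an ordinary failure
}

const fatalResult = "fatal"

// classifyChecks mirrors the idea in processNodesChecks below: any failed check
// clears the passed flag, and fatal failures are collected separately so they
// can be reported together and used to hard-fail (block) PX startup.
func classifyChecks(checks []CheckResult) (passed, hardFail bool, fatalReasons []string) {
	passed = true
	for _, c := range checks {
		if c.Success {
			continue
		}
		passed = false
		if c.Result == fatalResult {
			hardFail = true
			fatalReasons = append(fatalReasons, c.Reason)
		}
	}
	return passed, hardFail, fatalReasons
}

func main() {
	checks := []CheckResult{
		{Type: "usage", Reason: "unsupported drive type", Success: false, Result: fatalResult},
		{Type: "usage", Reason: "option not valid with PX-StoreV2", Success: false},
		{Type: "status", Reason: "pre-flight completed", Success: true},
	}
	passed, hardFail, fatal := classifyChecks(checks)
	fmt.Println(passed, hardFail, fatal) // false true [unsupported drive type]
}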
3 changes: 3 additions & 0 deletions deploy/crds/core_v1_storagenode_crd.yaml
@@ -148,6 +148,9 @@ spec:
success:
type: boolean
description: If true, the check was successful
result:
type: string
description: Result of the check fatal, warning, success
geography:
type: object
description: Contains topology information for the storage node.
115 changes: 114 additions & 1 deletion drivers/storage/portworx/portworx_test.go
@@ -480,11 +480,124 @@ func TestValidateCheckFailure(t *testing.T) {
require.Len(t, recorder.Events, 3)
<-recorder.Events // Pop first event which is Default telemetry enabled event
require.Contains(t, <-recorder.Events,
fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check failed"))
fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check"))
require.Contains(t, <-recorder.Events,
fmt.Sprintf("%v %v %s", v1.EventTypeNormal, util.PassPreFlight, "Not enabling PX-StoreV2"))
}

func TestValidateCheckFatal(t *testing.T) {
driver := portworx{}
cluster := &corev1.StorageCluster{
ObjectMeta: metav1.ObjectMeta{
Name: "px-cluster",
Namespace: "kube-test",
Annotations: map[string]string{
pxutil.AnnotationPreflightCheck: "true",
},
},
}

labels := map[string]string{
"name": pxPreFlightDaemonSetName,
}

clusterRef := metav1.NewControllerRef(cluster, pxutil.StorageClusterKind())
preflightDS := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Name: pxPreFlightDaemonSetName,
Namespace: cluster.Namespace,
Labels: labels,
UID: types.UID("preflight-ds-uid"),
OwnerReferences: []metav1.OwnerReference{*clusterRef},
},
Spec: appsv1.DaemonSetSpec{
Selector: &metav1.LabelSelector{
MatchLabels: labels,
},
},
}

checks := []corev1.CheckResult{
{
Type: "usage",
Reason: "fatal: PX-StoreV2 unsupported storage disk spec: unsupported cloud spec drive type found type=io2,size=100,iops=4000, only support [\"gp3\" \"io1\"]",
Success: false,
Result: "fatal",
},
{
Type: "usage",
Reason: "px-runc: PX-StoreV2 unsupported storage disk spec: 'consume unused' (-A/-a) options not valid with PX-StoreV2",
Success: false,
},
{
Type: "status",
Reason: "oci-mon: pre-flight completed",
Success: true,
},
}

status := corev1.NodeStatus{
Checks: checks,
}

storageNode := &corev1.StorageNode{
ObjectMeta: metav1.ObjectMeta{
Name: "node-1",
Namespace: cluster.Namespace,
OwnerReferences: []metav1.OwnerReference{*clusterRef},
},
Status: status,
}

preFlightPod1 := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "preflight-1",
Namespace: cluster.Namespace,
OwnerReferences: []metav1.OwnerReference{{UID: preflightDS.UID}},
},
Status: v1.PodStatus{
ContainerStatuses: []v1.ContainerStatus{
{
Name: "portworx",
Ready: true,
},
},
},
}

coreops.SetInstance(coreops.New(fakek8sclient.NewSimpleClientset()))
k8sClient := testutil.FakeK8sClient(preflightDS)

err := k8sClient.Create(context.TODO(), preFlightPod1)
require.NoError(t, err)

preflightDS.Status.DesiredNumberScheduled = int32(1)
err = k8sClient.Status().Update(context.TODO(), preflightDS)
require.NoError(t, err)

recorder := record.NewFakeRecorder(100)
err = driver.Init(k8sClient, runtime.NewScheme(), recorder)
require.NoError(t, err)

err = driver.SetDefaultsOnStorageCluster(cluster)
require.NoError(t, err)

err = k8sClient.Create(context.TODO(), storageNode)
require.NoError(t, err)

err = driver.Validate(cluster)
require.Error(t, err)
require.NotEmpty(t, recorder.Events)
require.Len(t, recorder.Events, 4)
<-recorder.Events // Pop first event which is Default telemetry enabled event
require.Contains(t, <-recorder.Events,
fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "usage pre-flight check"))
require.Contains(t, <-recorder.Events,
fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "fatal: PX-StoreV2 unsupported storage disk spec"))
require.Contains(t, <-recorder.Events,
fmt.Sprintf("%v %v %s", v1.EventTypeWarning, util.FailedPreFlight, "PX-StoreV2 pre-check failed"))
}

func TestValidateMissingRequiredCheck(t *testing.T) {
driver := portworx{}
cluster := &corev1.StorageCluster{
30 changes: 27 additions & 3 deletions drivers/storage/portworx/preflight.go
@@ -38,6 +38,8 @@ const (
// DefCmetaVsphere default metadata cloud device for DMthin Vsphere
DefCmetaVsphere = "type=eagerzeroedthick,size=64"
preFlightOutputLog = "/var/cores/px-pre-flight-output.log"
// Fatal result string from pre-flight check
Fatal = "fatal"
)

// PreFlightPortworx provides a set of APIs to uninstall portworx
@@ -327,6 +329,8 @@ func (u *preFlightPortworx) processNodesChecks(recorder record.EventRecorder, st
}

passed := true
isFatal := false
fmsgs := []string{}
for _, node := range storageNodes {
// Process storageNode checks list for failures. Also make sure the "status" check entry
// exists, this indicates all the checks were submitted from the pre-flight pod.
@@ -344,15 +348,27 @@
continue
}

msg := fmt.Sprintf("%s pre-flight check ", check.Type)
msg := fmt.Sprintf("%s pre-flight check: ", check.Type)
if check.Success {
msg = msg + "passed: " + check.Reason
k8sutil.InfoEvent(recorder, u.cluster, util.PassPreFlight, msg)
continue
}
msg = msg + "failed: " + check.Reason
k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, msg)

passed = false // pre-flight status check failed, keep going for logging

if check.Result == Fatal {
// This loop processes check results looking for any failures. However there
// is a difference between a failure and "fatal" failure. So collect all
// fatal msgs so they can be outputted all at once instead of being intermixed
// with other failed events.
fmsgs = append(fmsgs, check.Reason)
isFatal = true
continue
}

msg = msg + check.Reason
k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, msg)
}

if !statusExists {
@@ -361,6 +377,14 @@
}
}

if isFatal {
// Output all the "fatal" events collected above. So they are not intermixed with other failed events.
for _, fmsg := range fmsgs {
k8sutil.WarningEvent(recorder, u.cluster, util.FailedPreFlight, fmsg)
}
u.hardFail = true // Set hardFail to prevent PX startup
}

return passed
}

2 changes: 2 additions & 0 deletions pkg/apis/core/v1/storagenode.go
@@ -100,6 +100,8 @@ type CheckResult struct {
Reason string `json:"reason,omitempty"`
// Success indicates if the check was successful or failed
Success bool `json:"success,omitempty"`
// Result of the success or failure
Result string `json:"result,omitempty"`
}

// NetworkStatus network status of the storage node
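
For illustration, a pre-flight check producer could populate the new Result field as shown below. This is a hedged sketch: the import path is assumed from the repository layout (pkg/apis/core/v1), and the field values simply mirror the ones used in TestValidateCheckFatal above.

package main

import (
	"fmt"

	corev1 "github.com/libopenstorage/operator/pkg/apis/core/v1" // assumed import path
)

func main() {
	// A failed check that should hard-fail pre-flight; Result distinguishes it
	// from an ordinary failure (Success=false with no Result set).
	fatalCheck := corev1.CheckResult{
		Type:    "usage",
		Reason:  "fatal: PX-StoreV2 unsupported storage disk spec",
		Success: false,
		Result:  "fatal",
	}
	fmt.Printf("%+v\n", fatalCheck)
}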
