diff --git a/pkg/util/test/util.go b/pkg/util/test/util.go index fb3b7e52e..0458ce7ab 100644 --- a/pkg/util/test/util.go +++ b/pkg/util/test/util.go @@ -5339,7 +5339,7 @@ func ValidateTelemetryV1Enabled(pxImageList map[string]string, cluster *corev1.S // ValidatePodDisruptionBudget validates the value of minavailable and number of disruptions for px-storage poddisruptionbudget func ValidatePodDisruptionBudget(cluster *corev1.StorageCluster, timeout, interval time.Duration) error { - logrus.Info("Validate px-storage poddisruptionbudget minAvailable and allowed disruptions") + logrus.Info("Validate portworx storage poddisruptionbudget") kbVer, err := GetK8SVersion() if err != nil { @@ -5354,7 +5354,44 @@ func ValidatePodDisruptionBudget(cluster *corev1.StorageCluster, timeout, interv // PodDisruptionBudget is supported for k8s version greater than or equal to 1.21 and operator version greater than or equal to 1.5.0 // Changing opVersion to 23.10.0 for PTX-23350 | TODO: add better logic with PTX-23407 - if k8sVersion.GreaterThanOrEqual(minSupportedK8sVersionForPdb) && opVersion.GreaterThanOrEqual(opVer23_10) && opVersion.LessThan(opVer24_2_0) && pxVersion.LessThan(pxVer3_1_2) { + // Smart and parallel upgrades is supported from px version 3.1.2 and operator version 24.2.0 + if k8sVersion.GreaterThanOrEqual(minSupportedK8sVersionForPdb) && opVersion.GreaterThanOrEqual(opVer24_2_0) && pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + t := func() (interface{}, bool, error) { + nodes, err := operatorops.Instance().ListStorageNodes(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to get storage nodes, Err: %v", err) + } + availableNodes := 0 + for _, node := range nodes.Items { + if *node.Status.NodeAttributes.Storage { + if node.Status.Phase == "Online" { + availableNodes++ + } else { + logrus.Infof("Node %s is in state [%s], PDB might be incorrect", node.Name, node.Status.Phase) + } + } + } + pdbs, err := policyops.Instance().ListPodDisruptionBudget(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to list all poddisruptionbudgets, Err: %v", err) + } + actualNodePDBCount := 0 + for _, pdb := range pdbs.Items { + if strings.HasPrefix(pdb.Name, "px-") && pdb.Name != "px-kvdb" { + actualNodePDBCount++ + } + } + if actualNodePDBCount == availableNodes { + return nil, false, nil + } + return nil, true, fmt.Errorf("incorrect node PDB count. Expected node PDB count [%d], Actual node PDB count [%d]", availableNodes, actualNodePDBCount) + + } + if _, err := task.DoRetryWithTimeout(t, timeout, interval); err != nil { + return err + } + return nil + } else if k8sVersion.GreaterThanOrEqual(minSupportedK8sVersionForPdb) && opVersion.GreaterThanOrEqual(opVer23_10) { // This is only for non async DR setup t := func() (interface{}, bool, error) { @@ -5364,16 +5401,16 @@ func ValidatePodDisruptionBudget(cluster *corev1.StorageCluster, timeout, interv } nodeslen := 0 - availablenodes := 0 + availableNodes := 0 for _, node := range nodes.Items { if *node.Status.NodeAttributes.Storage { nodeslen++ if node.Status.Phase == "Online" { - availablenodes++ + availableNodes++ } } } - nodesUnavailable := nodeslen - availablenodes + nodesUnavailable := nodeslen - availableNodes // Skip PDB validation for px-storage if number of storage nodes is lesser than or equal to 2 if nodeslen <= 2 { logrus.Infof("Storage PDB does not exist for storage nodes lesser than or equal to 2, skipping PDB validattion") @@ -5951,3 +5988,93 @@ func RestoreEtcHosts(t *testing.T) { assert.Equal(t, bb.Len(), n, "short write") fd.Close() } + +func ValidateNodePDB(cluster *corev1.StorageCluster, timeout, interval time.Duration) error { + t := func() (interface{}, bool, error) { + nodes, err := coreops.Instance().GetNodes() + if err != nil { + return nil, true, fmt.Errorf("failed to get k8s nodes, Err: %v", err) + } + nodesPDBMap := make(map[string]bool) + + pdbs, err := policyops.Instance().ListPodDisruptionBudget(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to get px-storage poddisruptionbudget, Err: %v", err) + } + + for _, pdb := range pdbs.Items { + if strings.HasPrefix(pdb.Name, "px-") && pdb.Name != "px-kvdb" { + nodesPDBMap[pdb.Name] = true + if pdb.Spec.MinAvailable.IntValue() != 1 { + return nil, true, fmt.Errorf("incorrect PDB minAvailable value for node %s. Expected PDB [%d], Actual PDB [%d]", strings.TrimPrefix(pdb.Name, "px-"), 1, pdb.Spec.MinAvailable.IntValue()) + } + } + } + // create map of storage nodes as well + storagenodes, err := operatorops.Instance().ListStorageNodes(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to get storage nodes, Err: %v", err) + } + storageNodesMap := make(map[string]bool) + for _, node := range storagenodes.Items { + if *node.Status.NodeAttributes.Storage { + storageNodesMap[node.Name] = true + } + } + + for _, node := range nodes.Items { + if coreops.Instance().IsNodeMaster(node) { + continue + } + if _, ok := nodesPDBMap["px-"+node.Name]; !ok { + // return error only if the k8s node has a storage node in it + if _, ok := storageNodesMap[node.Name]; ok { + return nil, true, fmt.Errorf("PDB for node %s is missing", node.Name) + } + } + } + return nil, false, nil + } + if _, err := task.DoRetryWithTimeout(t, timeout, interval); err != nil { + return err + } + return nil +} + +func ValidateNodesSelectedForUpgrade(cluster *corev1.StorageCluster, minAvailable int, timeout, interval time.Duration) error { + t := func() (interface{}, bool, error) { + nodes, err := operatorops.Instance().ListStorageNodes(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to get storage nodes, Err: %v", err) + } + totalStorageNodes := 0 + for _, node := range nodes.Items { + if *node.Status.NodeAttributes.Storage { + totalStorageNodes++ + } + } + if minAvailable == -1 { + // Setting minAvailable to quorum value + minAvailable = (totalStorageNodes / 2) + 1 + } + + pdbs, err := policyops.Instance().ListPodDisruptionBudget(cluster.Namespace) + if err != nil { + return nil, true, fmt.Errorf("failed to get px-storage poddisruptionbudget, Err: %v", err) + } + nodesReadyForUpgrade := 0 + for _, pdb := range pdbs.Items { + if strings.HasPrefix(pdb.Name, "px-") && pdb.Spec.MinAvailable.IntValue() == 0 { + nodesReadyForUpgrade++ + } + } + if nodesReadyForUpgrade <= (totalStorageNodes - minAvailable) { + return nil, false, nil + } + return nil, true, fmt.Errorf("nodes available for upgrade [%d] are more than expected [%d]", nodesReadyForUpgrade, totalStorageNodes-minAvailable) + } + if _, err := task.DoRetryWithTimeout(t, timeout, interval); err != nil { + return err + } + return nil +} diff --git a/test/integration_test/bluegreen_test.go b/test/integration_test/bluegreen_test.go index a9da4a845..5512852c1 100644 --- a/test/integration_test/bluegreen_test.go +++ b/test/integration_test/bluegreen_test.go @@ -6,7 +6,7 @@ package integrationtest import ( "bytes" "fmt" - "io" + "os" "sort" "strconv" @@ -131,13 +131,13 @@ var bgTestCases = []types.TestCase{ logrus.Infof("Attempt license expand on Trial via node %s", pl.Items[0].Spec.NodeName) var stdout, stderr bytes.Buffer - err = runInPortworxPod(&pl.Items[0], + err = ci_utils.RunInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, "/bin/sh", "-c", "/opt/pwx/bin/pxctl license trial; exec /opt/pwx/bin/pxctl license expand --start") require.Contains(t, stdout.String(), " not supported for Trial licenses") logrus.Infof("Installing license via node %s", pl.Items[0].Spec.NodeName) - err = runInPortworxPod(&pl.Items[0], + err = ci_utils.RunInPortworxPod(&pl.Items[0], bytes.NewReader([]byte(crippledTestLicense)), &stdout, &stderr, "/bin/sh", "-c", "base64 -d | /opt/pwx/bin/pxctl license add /dev/stdin") require.Equal(t, "", stderr.String()) @@ -146,7 +146,7 @@ var bgTestCases = []types.TestCase{ logrus.Infof("Renstalling license via node %s", pl.Items[2].Spec.NodeName) stdout.Reset() - err = runInPortworxPod(&pl.Items[2], + err = ci_utils.RunInPortworxPod(&pl.Items[2], bytes.NewReader([]byte(crippledTestLicense)), &stdout, &stderr, "/bin/sh", "-c", "base64 -d | /opt/pwx/bin/pxctl license add /dev/stdin") require.Equal(t, "", stderr.String()) @@ -157,7 +157,7 @@ var bgTestCases = []types.TestCase{ for _, p := range pl.Items { stdout.Reset() stderr.Reset() - err = runInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "list") + err = ci_utils.RunInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "list") require.Equal(t, "", stderr.String(), "unexpected STDERR on node %s", p.Spec.NodeName) require.Contains(t, stdout.String(), "PX-Enterprise Torpedo_TEST_license", @@ -262,7 +262,7 @@ var bgTestCases = []types.TestCase{ logrus.Infof("Extending license via node %s", pl.Items[0].Spec.NodeName) var stdout, stderr bytes.Buffer - err = runInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "expand", "--start") require.NoError(t, err) require.Contains(t, stdout.String(), "Successfully initiated license extension") @@ -271,7 +271,7 @@ var bgTestCases = []types.TestCase{ for _, p := range pl.Items { stdout.Reset() stderr.Reset() - err = runInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") + err = ci_utils.RunInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") assert.Empty(t, stderr.String()) assert.Contains(t, stdout.String(), "NOTICE: License extension expires in ", "unexpected STDOUT @%s", p.Spec.NodeName) @@ -304,19 +304,19 @@ var bgTestCases = []types.TestCase{ tmpVolName := "testVol" + tmpSuffix logrus.Infof("Attempt volume creation on %s", lastPOD.Spec.NodeName) - err = runInPortworxPod(&lastPOD, nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&lastPOD, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "volume", "create", "--repl", "1", "--size", "3", tmpVolName) require.Contains(t, stdout.String(), "Volume successfully created") require.NoError(t, err) logrus.Infof("Attempt volume snapshot on %s", lastPOD.Spec.NodeName) - err = runInPortworxPod(&lastPOD, nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&lastPOD, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "volume", "snapshot", "create", "--name", "snap"+tmpSuffix, tmpVolName) require.Contains(t, stdout.String(), "Volume snap successful") require.NoError(t, err) logrus.Infof("Cleaning up volume / snapshot on %s", lastPOD.Spec.NodeName) - err = runInPortworxPod(&lastPOD, nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&lastPOD, nil, &stdout, &stderr, "/bin/sh", "-c", "/opt/pwx/bin/pxctl v delete --force snap"+tmpSuffix+ "; /opt/pwx/bin/pxctl v delete --force "+tmpVolName) require.Contains(t, stdout.String(), "Volume snap"+tmpSuffix+" successfully deleted") @@ -401,7 +401,7 @@ var bgTestCases = []types.TestCase{ for _, p := range pl.Items { stdout.Reset() stderr.Reset() - err = runInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") + err = ci_utils.RunInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") assert.Empty(t, stderr.String()) assert.Contains(t, stdout.String(), "NOTICE: License extension expires in ", "unexpected STDOUT @%s", p.Spec.NodeName) @@ -424,7 +424,7 @@ var bgTestCases = []types.TestCase{ logrus.Infof("Attempt license reinstall") var stdout, stderr bytes.Buffer - err = runInPortworxPod(&pl.Items[0], bytes.NewReader([]byte(crippledTestLicense)), &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&pl.Items[0], bytes.NewReader([]byte(crippledTestLicense)), &stdout, &stderr, "/bin/sh", "-c", "base64 -d | /opt/pwx/bin/pxctl license add /dev/stdin") require.Equal(t, "", stderr.String()) require.Contains(t, strings.ToLower(stdout.String()), @@ -446,7 +446,7 @@ var bgTestCases = []types.TestCase{ logrus.Infof("End license extension while cluster over-allocated") var stdout, stderr bytes.Buffer - err = runInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "expand", "--end") assert.Equal(t, "", stderr.String()) assert.Contains(t, stdout.String(), "Successfully turned off license extension") @@ -456,7 +456,7 @@ var bgTestCases = []types.TestCase{ for _, p := range pl.Items { stdout.Reset() stderr.Reset() - err = runInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "ls") + err = ci_utils.RunInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "license", "ls") assert.Equal(t, "", stderr.String(), "did no expect errors @%s", p.Spec.NodeName) assert.Contains(t, stdout.String(), "ERROR: too many nodes in the cluster", @@ -585,7 +585,7 @@ var bgTestCases = []types.TestCase{ // get NodeID for the wiped node var stdout, stderr bytes.Buffer - err = runInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, + err = ci_utils.RunInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, "/bin/sh", "-c", "/opt/pwx/bin/pxctl status | grep "+lastNode+" | head -1 | awk '{print $2}'") lastNodeID := strings.Trim(stdout.String(), "\r\n\t ") require.NoError(t, err) @@ -595,7 +595,7 @@ var bgTestCases = []types.TestCase{ _, err = task.DoRetryWithTimeout( func() (interface{}, bool, error) { var stdout, stderr bytes.Buffer - runInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, + ci_utils.RunInPortworxPod(&pl.Items[0], nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "cluster", "delete", lastNodeID) if strings.Contains(stdout.String(), " successfully deleted.") { logrus.Debugf("Node %s successfully decomissioned", lastNode) @@ -613,7 +613,7 @@ var bgTestCases = []types.TestCase{ logrus.Infof("Checking PX status on all nodes") for _, p := range pl.Items[:len(pl.Items)-1] { var stdout, stderr bytes.Buffer - err = runInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") + err = ci_utils.RunInPortworxPod(&p, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") require.NoError(t, err, "unexpected error @%s", p.Spec.NodeName) require.Contains(t, stdout.String(), "License: PX-Enterprise Torpedo_TEST_license (expires in ", "unexpected content @%s", p.Spec.NodeName) @@ -624,27 +624,13 @@ var bgTestCases = []types.TestCase{ }, } -func runInPortworxPod(pod *v1.Pod, in io.Reader, out, err io.Writer, command ...string) error { - if pod == nil || len(command) <= 0 { - return os.ErrInvalid - } - - if logrus.IsLevelEnabled(logrus.DebugLevel) { - logrus.Debugf("run on %s via %s: `%s`", pod.Spec.NodeName, pod.Name, strings.Join(command, " ")) - } - - return coreops.Instance().RunCommandInPodEx(&coreops.RunCommandInPodExRequest{ - command, pod.Name, "portworx", pod.Namespace, false, in, out, err, - }) -} - func wipeNodeRunningPod(pod *v1.Pod) error { if pod == nil { return os.ErrInvalid } logrus.Debugf("Wiping PX on node %s using POD %s", pod.Spec.NodeName, pod.Name) var stdout, stderr bytes.Buffer - err := runInPortworxPod(pod, nil, &stdout, &stderr, + err := ci_utils.RunInPortworxPod(pod, nil, &stdout, &stderr, "nsenter", "--mount=/host_proc/1/ns/mnt", "--", "/bin/sh", "-c", "pxctl sv nw --all") if err != nil { return fmt.Errorf("node-wipe failed: %s (%s)", err, @@ -677,7 +663,7 @@ func taskWaitPxctlStatus(t *testing.T, nodeName, podName, expectedOutput string) // run `pxctl status` -- compare output var stdout, stderr bytes.Buffer - runInPortworxPod(monitoredPod, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") + ci_utils.RunInPortworxPod(monitoredPod, nil, &stdout, &stderr, "/opt/pwx/bin/pxctl", "status") s := strings.Trim(stdout.String(), "\r\n ") if strings.Contains(s, expectedOutput) { logrus.Infof("'pxctl status' @%s got expected %q", nodeName, expectedOutput) diff --git a/test/integration_test/node_pdb_test.go b/test/integration_test/node_pdb_test.go new file mode 100644 index 000000000..a64bba2e9 --- /dev/null +++ b/test/integration_test/node_pdb_test.go @@ -0,0 +1,344 @@ +package integrationtest + +import ( + "bytes" + "fmt" + "strings" + "testing" + "time" + + "github.com/hashicorp/go-version" + corev1 "github.com/libopenstorage/operator/pkg/apis/core/v1" + testutil "github.com/libopenstorage/operator/pkg/util/test" + "github.com/libopenstorage/operator/test/integration_test/types" + ci_utils "github.com/libopenstorage/operator/test/integration_test/utils" + + coreops "github.com/portworx/sched-ops/k8s/core" + "github.com/portworx/sched-ops/k8s/operator" + policyops "github.com/portworx/sched-ops/k8s/policy" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +var ( + pxVer3_1_2, _ = version.NewVersion("3.1.2") +) + +var testNodePDBCases = []types.TestCase{ + { + TestName: "CreateNodePDBBasic", + TestrailCaseIDs: []string{"C299571", "C299572"}, + TestSpec: ci_utils.CreateStorageClusterTestSpecFunc(&corev1.StorageCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-stc"}, + }), + ShouldSkip: func(tc *types.TestCase) bool { + kbVer, err := testutil.GetK8SVersion() + if err != nil { + logrus.Info("Skipping PDB test due to Err: ", err) + return true + } + k8sVersion, _ := version.NewVersion(kbVer) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_2_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + }, + TestFunc: CreateNodePDBBasic, + }, + { + TestName: "CreateNodePDBWithStoragelessNode", + TestrailCaseIDs: []string{"C299573"}, + TestSpec: ci_utils.CreateStorageClusterTestSpecFunc(&corev1.StorageCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-stc"}, + }), + ShouldSkip: func(tc *types.TestCase) bool { + if len(ci_utils.PxDeviceSpecs) == 0 { + logrus.Info("--portworx-device-specs is empty, cannot run PDBWithStoragelessNode test") + return true + } + kbVer, err := testutil.GetK8SVersion() + if err != nil { + logrus.Info("Skipping PDB test due to Err: ", err) + return true + } + k8sVersion, _ := version.NewVersion(kbVer) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_2_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + }, + TestFunc: CreateNodePDBWithStoragelessNode, + }, + { + TestName: "MaxNodesAvailableForUpgrade", + TestrailCaseIDs: []string{"C299574", "C299575"}, + TestSpec: ci_utils.CreateStorageClusterTestSpecFunc(&corev1.StorageCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-stc"}, + }), + ShouldSkip: func(tc *types.TestCase) bool { + kbVer, err := testutil.GetK8SVersion() + if err != nil { + logrus.Info("Skipping PDB test due to Err: ", err) + return true + } + k8sVersion, _ := version.NewVersion(kbVer) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_2_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + }, + TestFunc: MaxNodesAvailableForUpgrade, + }, + { + TestName: "NodePDBDisablingParallelUpgrade", + TestrailCaseIDs: []string{"C299576", "C299577"}, + TestSpec: ci_utils.CreateStorageClusterTestSpecFunc(&corev1.StorageCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-stc"}, + }), + ShouldSkip: func(tc *types.TestCase) bool { + kbVer, err := testutil.GetK8SVersion() + if err != nil { + logrus.Info("Skipping PDB test due to Err: ", err) + return true + } + k8sVersion, _ := version.NewVersion(kbVer) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_2_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + }, + TestFunc: NodePDBDisablingParallelUpgrade, + }, + { + TestName: "NodesSelectedForUpgradeWithReplicas", + TestrailCaseIDs: []string{"C299578", "C299579"}, + TestSpec: ci_utils.CreateStorageClusterTestSpecFunc(&corev1.StorageCluster{ + ObjectMeta: metav1.ObjectMeta{Name: "test-stc"}, + }), + ShouldSkip: func(tc *types.TestCase) bool { + kbVer, err := testutil.GetK8SVersion() + if err != nil { + logrus.Info("Skipping PDB test due to Err: ", err) + return true + } + k8sVersion, _ := version.NewVersion(kbVer) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_2_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + }, + TestFunc: NodesSelectedForUpgradeWithReplicas, + }, +} + +func CreateNodePDBBasic(tc *types.TestCase) func(*testing.T) { + return func(t *testing.T) { + testSpec := tc.TestSpec(t) + cluster, ok := testSpec.(*corev1.StorageCluster) + require.True(t, ok) + cluster = ci_utils.DeployAndValidateStorageCluster(cluster, ci_utils.PxSpecImages, t) + pxVersion := testutil.GetPortworxVersion(cluster) + + if pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + logrus.Infof("Validating Node PDB names and default minAvailable") + err := testutil.ValidateNodePDB(cluster, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + } + ci_utils.UninstallAndValidateStorageCluster(cluster, t) + } + +} +func CreateNodePDBWithStoragelessNode(tc *types.TestCase) func(*testing.T) { + return func(t *testing.T) { + testSpec := tc.TestSpec(t) + cluster, ok := testSpec.(*corev1.StorageCluster) + require.True(t, ok) + + *cluster.Spec.CloudStorage.MaxStorageNodesPerZone = uint32(3) + logrus.Info("Validating PDB with storageless nodes using maxstoragenodesperzone value: ", *cluster.Spec.CloudStorage.MaxStorageNodesPerZone) + cluster = ci_utils.DeployAndValidateStorageCluster(cluster, ci_utils.PxSpecImages, t) + pxVersion := testutil.GetPortworxVersion(cluster) + if pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + logrus.Infof("Validating Node PDB names and default minAvailable") + err := testutil.ValidateNodePDB(cluster, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + } + ci_utils.UninstallAndValidateStorageCluster(cluster, t) + + } +} + +func MaxNodesAvailableForUpgrade(tc *types.TestCase) func(*testing.T) { + return func(t *testing.T) { + testSpec := tc.TestSpec(t) + cluster, ok := testSpec.(*corev1.StorageCluster) + require.True(t, ok) + cluster = ci_utils.DeployAndValidateStorageCluster(cluster, ci_utils.PxSpecImages, t) + pxVersion := testutil.GetPortworxVersion(cluster) + + if pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + err := ci_utils.CordonNodes() + require.NoError(t, err) + + logrus.Infof("Validating number of nodes ready for upgrade without minAvailable annotation") + err = testutil.ValidateNodesSelectedForUpgrade(cluster, -1, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + + k8snodecount, err := ci_utils.GetNonMasterK8sNodeCount() + require.NoError(t, err) + cluster, err = operator.Instance().GetStorageCluster(cluster.Name, cluster.Namespace) + require.NoError(t, err) + cluster.Annotations["portworx.io/storage-pdb-min-available"] = fmt.Sprintf("%d", k8snodecount-1) + cluster, err = ci_utils.UpdateStorageCluster(cluster) + require.NoError(t, err) + + logrus.Infof("Validating number of nodes ready for upgrade with minAvailable annotation %d", k8snodecount-1) + err = testutil.ValidateNodesSelectedForUpgrade(cluster, k8snodecount-1, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + + err = ci_utils.UncordonNodes() + require.NoError(t, err) + } + ci_utils.UninstallAndValidateStorageCluster(cluster, t) + } +} + +func NodePDBDisablingParallelUpgrade(tc *types.TestCase) func(*testing.T) { + return func(t *testing.T) { + testSpec := tc.TestSpec(t) + cluster, ok := testSpec.(*corev1.StorageCluster) + require.True(t, ok) + if cluster.Annotations == nil { + cluster.Annotations = make(map[string]string) + } + cluster.Annotations["portworx.io/disable-non-disruptive-upgrade"] = "true" + cluster = ci_utils.DeployAndValidateStorageCluster(cluster, ci_utils.PxSpecImages, t) + pxVersion := testutil.GetPortworxVersion(cluster) + if pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + err := ci_utils.CordonNodes() + require.NoError(t, err) + k8snodecount, err := ci_utils.GetNonMasterK8sNodeCount() + require.NoError(t, err) + logrus.Infof("Validating number of nodes ready for upgrade without minAvailable annotation after disabling non-disruptive upgrade") + err = testutil.ValidateNodesSelectedForUpgrade(cluster, k8snodecount-1, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + + cluster, err = operator.Instance().GetStorageCluster(cluster.Name, cluster.Namespace) + require.NoError(t, err) + cluster.Annotations["portworx.io/storage-pdb-min-available"] = fmt.Sprintf("%d", k8snodecount-2) + cluster, err = ci_utils.UpdateStorageCluster(cluster) + require.NoError(t, err) + logrus.Infof("Validating number of nodes ready for upgrade with minAvailable annotation %d after disabling non-disruptive upgrade", k8snodecount-2) + err = testutil.ValidateNodesSelectedForUpgrade(cluster, k8snodecount-2, ci_utils.DefaultValidateUpgradeTimeout, ci_utils.DefaultValidateUpgradeRetryInterval) + require.NoError(t, err) + + err = ci_utils.UncordonNodes() + require.NoError(t, err) + } + ci_utils.UninstallAndValidateStorageCluster(cluster, t) + + } +} + +func NodesSelectedForUpgradeWithReplicas(tc *types.TestCase) func(*testing.T) { + return func(t *testing.T) { + testSpec := tc.TestSpec(t) + cluster, _ := testSpec.(*corev1.StorageCluster) + cluster = ci_utils.DeployAndValidateStorageCluster(cluster, ci_utils.PxSpecImages, t) + pxVersion := testutil.GetPortworxVersion(cluster) + + if pxVersion.GreaterThanOrEqual(pxVer3_1_2) { + + // Get px pods + pods, err := coreops.Instance().ListPods(map[string]string{"name": "portworx"}) + require.NoError(t, err) + require.NotEmpty(t, pods.Items) + + //Create a volume of replica 2 + replicaNodes := "" + var stdout, stderr bytes.Buffer + + storageNode, err := operator.Instance().GetStorageNode(pods.Items[0].Spec.NodeName, pods.Items[0].Namespace) + require.NoError(t, err) + replicaNodes = storageNode.Status.NodeUID + storageNode, err = operator.Instance().GetStorageNode(pods.Items[1].Spec.NodeName, pods.Items[1].Namespace) + require.NoError(t, err) + replicaNodes = replicaNodes + "," + storageNode.Status.NodeUID + + tmpVolName := "testVol" + logrus.Infof("Attempt volume creation on nodes %s", replicaNodes) + err = ci_utils.RunInPortworxPod(&pods.Items[2], nil, &stdout, &stderr, + "/opt/pwx/bin/pxctl", "volume", "create", "--repl", "2", "--nodes", replicaNodes, tmpVolName) + require.Contains(t, stdout.String(), "Volume successfully created") + require.NoError(t, err) + + // Cordon the nodes with volume replica + for i := 0; i < 2; i++ { + currNode, err := coreops.Instance().GetNodeByName(pods.Items[i].Spec.NodeName) + require.NoError(t, err) + currNode.Spec.Unschedulable = true + _, err = coreops.Instance().UpdateNode(currNode) + require.NoError(t, err) + } + + // sleep for 30seconds to allow the PDB to get updated + time.Sleep(30 * time.Second) + + // Validate the nodes selected for upgrade + logrus.Infof("Validating only 1 node with volume replica is ready for upgrade") + pdbs, err := policyops.Instance().ListPodDisruptionBudget(cluster.Namespace) + require.NoError(t, err) + isVolumeReplicaSelected := false + for _, pdb := range pdbs.Items { + if (strings.HasSuffix(pdb.Name, pods.Items[0].Spec.NodeName) || strings.HasSuffix(pdb.Name, pods.Items[1].Spec.NodeName)) && pdb.Spec.MinAvailable.IntValue() == 0 { + require.False(t, isVolumeReplicaSelected) + isVolumeReplicaSelected = true + } + } + + // Uncordon nodes + logrus.Infof("Uncordoning nodes %s", replicaNodes) + err = ci_utils.UncordonNodes() + require.NoError(t, err) + + // Case2: When 1 px on node with volume replica is down + logrus.Infof("Stopping PX on node %s", pods.Items[0].Spec.NodeName) + err = coreops.Instance().AddLabelOnNode(pods.Items[0].Spec.NodeName, "px/service", "stop") + require.NoError(t, err, "could not label node %s", pods.Items[0].Spec.NodeName) + sleep4 := 30 * time.Second + logrus.Infof("Sleeping for %s to allow portworx.service @%s to stop", sleep4, pods.Items[0].Spec.NodeName) + time.Sleep(sleep4) + // Cordon the nodes with volume replica + for i := 0; i < 2; i++ { + currNode, err := coreops.Instance().GetNodeByName(pods.Items[i].Spec.NodeName) + require.NoError(t, err) + currNode.Spec.Unschedulable = true + _, err = coreops.Instance().UpdateNode(currNode) + require.NoError(t, err) + } + // sleep for 30seconds to allow the PDB to get updated + time.Sleep(30 * time.Second) + // Neither of the nodes should be selected for upgrade + logrus.Infof("Validating no cordoned nodes are ready for upgrade with 1 volume replica down") + pdbs, err = policyops.Instance().ListPodDisruptionBudget(cluster.Namespace) + require.NoError(t, err) + for _, pdb := range pdbs.Items { + if strings.HasPrefix(pdb.Name, "px-") && pdb.Name != "px-kvdb" { + require.Equal(t, 1, pdb.Spec.MinAvailable.IntValue()) + } + } + + // Bring the node back up + logrus.Infof("Bringing portworx up on node %s", pods.Items[0].Spec.NodeName) + err = coreops.Instance().RemoveLabelOnNode(pods.Items[0].Spec.NodeName, "px/service") + require.NoError(t, err) + err = coreops.Instance().AddLabelOnNode(pods.Items[0].Spec.NodeName, "px/service", "start") + require.NoError(t, err, "could not label node %s", pods.Items[0].Spec.NodeName) + time.Sleep(sleep4) + + // Uncordon nodes + err = ci_utils.UncordonNodes() + require.NoError(t, err) + + // delete the volumes created + logrus.Infof("Cleaning up volumes on %s", replicaNodes) + err = ci_utils.RunInPortworxPod(&pods.Items[2], nil, &stdout, &stderr, "/bin/sh", "-c", "/opt/pwx/bin/pxctl v delete --force "+tmpVolName) + require.NoError(t, err) + require.Contains(t, stdout.String(), "Volume "+tmpVolName+" successfully deleted") + + } + ci_utils.UninstallAndValidateStorageCluster(cluster, t) + } +} + +func TestNodePDB(t *testing.T) { + for _, tc := range testNodePDBCases { + tc.RunTest(t) + } +} diff --git a/test/integration_test/pdb_test.go b/test/integration_test/pdb_test.go index 9a4d5c8c6..cc5694c62 100644 --- a/test/integration_test/pdb_test.go +++ b/test/integration_test/pdb_test.go @@ -37,7 +37,7 @@ var testStorageClusterPDBCases = []types.TestCase{ return true } k8sVersion, _ := version.NewVersion(kbVer) - return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer1_5_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer1_5_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) || ci_utils.PxOperatorVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer24_2_0) }, TestFunc: StoragelessNodePDB, }, @@ -54,7 +54,7 @@ var testStorageClusterPDBCases = []types.TestCase{ return true } k8sVersion, _ := version.NewVersion(kbVer) - return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer23_10_2) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer23_10_2) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) || ci_utils.PxOperatorVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer24_2_0) }, TestFunc: OverridePDBUsingValidAnnotation, }, @@ -72,7 +72,7 @@ var testStorageClusterPDBCases = []types.TestCase{ return true } k8sVersion, _ := version.NewVersion(kbVer) - return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_1_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) + return ci_utils.PxOperatorVersion.LessThan(ci_utils.PxOperatorVer24_1_0) || k8sVersion.LessThan(minSupportedK8sVersionForPdb) || ci_utils.PxOperatorVersion.GreaterThanOrEqual(ci_utils.PxOperatorVer24_2_0) }, TestFunc: OverridePDBUsingInvalidAnnotation, }, diff --git a/test/integration_test/utils/k8s.go b/test/integration_test/utils/k8s.go index 5a7eaf6af..48c34baf8 100644 --- a/test/integration_test/utils/k8s.go +++ b/test/integration_test/utils/k8s.go @@ -392,3 +392,62 @@ func validateSpec(in interface{}) (runtime.Object, error) { } return nil, fmt.Errorf("unsupported object: %v", reflect.TypeOf(in)) } + +func GetNonMasterK8sNodeCount() (int, error) { + nodes, err := coreops.Instance().GetNodes() + if err != nil { + return -1, err + } + nodesCount := 0 + for _, node := range nodes.Items { + if coreops.Instance().IsNodeMaster(node) { + continue + } + nodesCount++ + } + return nodesCount, nil +} + +func CordonNodes() error { + nodes, err := coreops.Instance().GetNodes() + if err != nil { + logrus.Errorf("failed to get storage nodes, Err: %v", err) + } + for _, node := range nodes.Items { + if coreops.Instance().IsNodeMaster(node) { + continue + } + currNode, err := coreops.Instance().GetNodeByName(node.Name) + if err != nil { + return fmt.Errorf("failed to get node %s, Err: %v", node.Name, err) + } + currNode.Spec.Unschedulable = true + _, err = coreops.Instance().UpdateNode(currNode) + if err != nil { + return fmt.Errorf("failed to cordon node %s, Err: %v", node.Name, err) + } + } + return nil +} + +func UncordonNodes() error { + nodes, err := coreops.Instance().GetNodes() + if err != nil { + logrus.Errorf("failed to get storage nodes, Err: %v", err) + } + for _, node := range nodes.Items { + if coreops.Instance().IsNodeMaster(node) { + continue + } + currNode, err := coreops.Instance().GetNodeByName(node.Name) + if err != nil { + return fmt.Errorf("failed to get node %s, Err: %v", node.Name, err) + } + currNode.Spec.Unschedulable = false + _, err = coreops.Instance().UpdateNode(currNode) + if err != nil { + return fmt.Errorf("failed to uncordon node %s, Err: %v", node.Name, err) + } + } + return nil +} diff --git a/test/integration_test/utils/px_operator.go b/test/integration_test/utils/px_operator.go index bc354a2a0..b793d4525 100644 --- a/test/integration_test/utils/px_operator.go +++ b/test/integration_test/utils/px_operator.go @@ -42,6 +42,8 @@ var ( PxOperatorVer23_10_3, _ = version.NewVersion("23.10.3-") // PxOperatorVer24_1_0 portworx-operator 24.1.0 version to correct invalid PDB minAvailable PxOperatorVer24_1_0, _ = version.NewVersion("24.1.0-") + // PxOperatorVer24_2_0 portworx-operator 24.2.0 version is minimum version smart and parallel upgrades is supported + PxOperatorVer24_2_0, _ = version.NewVersion("24.2.0-") ) // TODO: Install portworx-operator in test automation diff --git a/test/integration_test/utils/storagecluster.go b/test/integration_test/utils/storagecluster.go index 04bc8e2dd..355d0eb53 100644 --- a/test/integration_test/utils/storagecluster.go +++ b/test/integration_test/utils/storagecluster.go @@ -551,18 +551,3 @@ func ValidateStorageClusterComponents(cluster *corev1.StorageCluster) error { // TODO: Validate the components are running with expected configuration return nil } - -func GetNonMasterK8sNodeCount() (int, error) { - nodes, err := schedopsCore.Instance().GetNodes() - if err != nil { - return -1, err - } - nodesCount := 0 - for _, node := range nodes.Items { - if schedopsCore.Instance().IsNodeMaster(node) { - continue - } - nodesCount++ - } - return nodesCount, nil -} diff --git a/test/integration_test/utils/utils.go b/test/integration_test/utils/utils.go index db0a694af..7f08c20c0 100644 --- a/test/integration_test/utils/utils.go +++ b/test/integration_test/utils/utils.go @@ -3,6 +3,7 @@ package utils import ( "bytes" "fmt" + "io" "os" "regexp" "strings" @@ -194,3 +195,24 @@ func getPxStoreV2NodeCount(t *testing.T, px_status string) int { return strings.Count(strings.ToLower(out[0]), "px-storev2") } + +func RunInPortworxPod(pod *v1.Pod, in io.Reader, out, err io.Writer, command ...string) error { + if pod == nil || len(command) <= 0 { + return os.ErrInvalid + } + + if logrus.IsLevelEnabled(logrus.DebugLevel) { + logrus.Debugf("run on %s via %s: `%s`", pod.Spec.NodeName, pod.Name, strings.Join(command, " ")) + } + + return coreops.Instance().RunCommandInPodEx(&coreops.RunCommandInPodExRequest{ + Command: command, + PODName: pod.Name, + ContainerName: "portworx", + Namespace: pod.Namespace, + UseTTY: false, + Stdin: in, + Stdout: out, + Stderr: err, + }) +}