DFBUGS-1696: mgr: Wait for builtin mgr pool to exist before enabling stretch #839

Merged
6 changes: 6 additions & 0 deletions build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml
@@ -3455,6 +3455,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
@@ -3485,6 +3487,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
@@ -3526,6 +3530,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
20 changes: 20 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon.go
@@ -386,6 +386,18 @@ func (c *Cluster) ConfigureArbiter() error {
return errors.Wrapf(err, "failed to find two failure domains %q in the CRUSH map", failureDomain)
}

// Before entering stretch mode, at least one pool based on the default stretch rule must exist.
// Wait for the builtin .mgr pool to be created, which we expect to be defined as a CephBlockPool.
// This wait may become unnecessary once the following fix is included in a Ceph release:
// https://github.com/ceph/ceph/pull/61371
logger.Info("enabling stretch mode... waiting for the builtin .mgr pool to be created")
err = wait.PollUntilContextTimeout(c.ClusterInfo.Context, pollInterval, totalWaitTime, true, func(ctx context.Context) (bool, error) {
return c.builtinMgrPoolExists(), nil
})
if err != nil {
return errors.Wrapf(err, "failed to wait for .mgr pool to be created before setting the stretch tiebreaker")
}

// Set the mon tiebreaker
if err := cephclient.SetMonStretchTiebreaker(c.context, c.ClusterInfo, c.arbiterMon, failureDomain); err != nil {
return errors.Wrap(err, "failed to set mon tiebreaker")
@@ -394,6 +406,14 @@ func (c *Cluster) ConfigureArbiter() error {
return nil
}

func (c *Cluster) builtinMgrPoolExists() bool {
_, err := cephclient.GetPoolDetails(c.context, c.ClusterInfo, ".mgr")

// If the call fails, either the pool does not exist or there is some other Ceph error.
// Either way, wait until the pool's existence is confirmed.
return err == nil
}

func (c *Cluster) readyToConfigureArbiter(checkOSDPods bool) (bool, error) {
failureDomain := c.getFailureDomainName()

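For readers unfamiliar with the polling helper used above, here is a minimal standalone sketch of the same pattern. The interval, timeout, and `poolExists` helper are hypothetical stand-ins for Rook's `pollInterval`, `totalWaitTime`, and `builtinMgrPoolExists`; only `wait.PollUntilContextTimeout` from `k8s.io/apimachinery` is the real API.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// poolExists stands in for builtinMgrPoolExists above: assume it asks Ceph for
// the pool's details and reports true only when that query succeeds.
func poolExists(ctx context.Context, name string) bool {
	_ = ctx // a real implementation would pass ctx through to the Ceph call
	return name == ".mgr"
}

// waitForBuiltinMgrPool polls until the .mgr pool exists or the timeout expires.
func waitForBuiltinMgrPool(ctx context.Context, interval, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, interval, timeout, true,
		func(ctx context.Context) (bool, error) {
			// (false, nil) keeps polling; a non-nil error aborts immediately.
			return poolExists(ctx, ".mgr"), nil
		})
}

func main() {
	// Hypothetical values; Rook's pollInterval and totalWaitTime may differ.
	if err := waitForBuiltinMgrPool(context.TODO(), 5*time.Second, 2*time.Minute); err != nil {
		fmt.Println("giving up waiting for the .mgr pool:", err)
		return
	}
	fmt.Println(".mgr pool exists; safe to set the stretch tiebreaker")
}
```

Passing `true` for the `immediate` argument runs the condition once before the first interval elapses, so the common case where the pool already exists returns without waiting.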
29 changes: 23 additions & 6 deletions pkg/operator/ceph/controller/controller_utils.go
@@ -169,13 +169,30 @@ func canIgnoreHealthErrStatusInReconcile(cephCluster cephv1.CephCluster, control
}
}

// If there is only one cause for HEALTH_ERR and it's on the allowed list of errors, ignore it.
var allowedErrStatus = []string{"MDS_ALL_DOWN"}
var ignoreHealthErr = len(healthErrKeys) == 1 && contains(allowedErrStatus, healthErrKeys[0])
if ignoreHealthErr {
logger.Debugf("%q: ignoring ceph status %q because only cause is %q (full status is %+v)", controllerName, cephCluster.Status.CephStatus.Health, healthErrKeys[0], cephCluster.Status.CephStatus)
// If there are no error keys, return false so the caller does not treat a blank or empty
// health status as ignorable. In production, when there are no errors, the health status
// is expected to be OK or WARN anyway, so returning false here only covers the blank case.
if len(healthErrKeys) == 0 {
return false
}
return ignoreHealthErr

allowedErrStatus := map[string]struct{}{
"MDS_ALL_DOWN": {},
"MGR_MODULE_ERROR": {},
}
allCanBeIgnored := true
for _, healthErrKey := range healthErrKeys {
if _, ok := allowedErrStatus[healthErrKey]; !ok {
allCanBeIgnored = false
break
}
}
if allCanBeIgnored {
logger.Debugf("%q: ignoring ceph error status (full status is %+v)", controllerName, cephCluster.Status.CephStatus)
return true
}
return false
}

// IsReadyToReconcile determines if a controller is ready to reconcile or not
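A condensed sketch of the allow-list check introduced above, using the same two error codes; `canIgnore` is a hypothetical standalone stand-in for `canIgnoreHealthErrStatusInReconcile`:

```go
package main

import "fmt"

// allowedErrStatus mirrors the allow-list above; a map keyed by error code keeps lookups O(1).
var allowedErrStatus = map[string]struct{}{
	"MDS_ALL_DOWN":     {},
	"MGR_MODULE_ERROR": {},
}

// canIgnore returns true only when every HEALTH_ERR cause is on the allow-list.
// An empty input returns false so a blank health status is never treated as ignorable.
func canIgnore(healthErrKeys []string) bool {
	if len(healthErrKeys) == 0 {
		return false
	}
	for _, key := range healthErrKeys {
		if _, ok := allowedErrStatus[key]; !ok {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(canIgnore([]string{"MDS_ALL_DOWN", "MGR_MODULE_ERROR"})) // true
	fmt.Println(canIgnore([]string{"MGR_MODULE_ERROR", "OSD_DOWN"}))     // false
	fmt.Println(canIgnore(nil))                                          // false
}
```

A mix of allowed and non-allowed causes, like an empty status, yields false, matching the unit tests in the next file.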
26 changes: 25 additions & 1 deletion pkg/operator/ceph/controller/controller_utils_test.go
@@ -45,7 +45,11 @@ func TestCanIgnoreHealthErrStatusInReconcile(t *testing.T) {
var cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MDS_ALL_DOWN": {
Severity: "HEALTH_ERR",
Message: "MDS_ALL_DOWN",
Message: "mds all down error",
},
"MGR_MODULE_ERROR": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
"TEST_OTHER": {
Severity: "HEALTH_WARN",
@@ -58,6 +62,22 @@
})
assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MGR_MODULE_ERROR": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
})
assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MGR_MODULE_ERROR_FALSE": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MDS_ALL_DOWN": {
Severity: "HEALTH_ERR",
@@ -77,6 +97,10 @@
},
})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

// The empty status should return false
cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
}

func TestSetCephCommandsTimeout(t *testing.T) {
1 change: 1 addition & 0 deletions pkg/operator/ceph/file/controller_test.go
@@ -254,6 +254,7 @@ func TestCephFilesystemController(t *testing.T) {
recorder: record.NewFakeRecorder(5),
scheme: s,
context: c,
fsContexts: make(map[string]*fsHealth),
opManagerContext: context.TODO(),
}
res, err := r.Reconcile(ctx, req)
2 changes: 1 addition & 1 deletion pkg/operator/ceph/object/objectstore.go
@@ -1091,7 +1091,7 @@ func createSimilarPools(ctx *Context, pools []string, cluster *cephv1.ClusterSpe
if configurePoolsConcurrently() {
waitGroup, _ := errgroup.WithContext(ctx.clusterInfo.Context)
for _, pool := range pools {
// Avoid the loop re-using the same value with a closure
// Avoid the loop reusing the same value with a closure
pool := pool

waitGroup.Go(func() error { return createRGWPool(ctx, cluster, poolSpec, pgCount, pool) })
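The `pool := pool` line above works around pre-Go 1.22 loop-variable capture, where every goroutine launched through the errgroup would otherwise observe the final value of `pool`. A minimal sketch of the idiom, with made-up pool names and a zero-value `errgroup.Group` instead of `errgroup.WithContext`:

```go
package main

import (
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Hypothetical pool names for illustration only.
	pools := []string{"meta", "log", "buckets.data"}

	var g errgroup.Group
	for _, pool := range pools {
		pool := pool // shadow the loop variable so each goroutine captures its own copy
		g.Go(func() error {
			fmt.Println("configuring pool", pool)
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		fmt.Println("error:", err)
	}
}
```

From Go 1.22 onward each `for ... range` iteration gets a fresh variable, so the shadowing copy becomes unnecessary, though it remains harmless.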
2 changes: 1 addition & 1 deletion tests/scripts/multus/host-cfg-ds.yaml
@@ -51,7 +51,7 @@ spec:
last="${ip##*.}" # e.g., 3

# add a shim to connect IFACE to the macvlan public network, with a static IP
# avoid IP conflicts by re-using the last part of the existing IFACE IP
# avoid IP conflicts by reusing the last part of the existing IFACE IP
ip link add public-shim link ${IFACE} type macvlan mode bridge
ip addr add ${NODE_PUBLIC_NET_IP_FIRST3}.${last}/24 dev public-shim
ip link set public-shim up