diff --git a/build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml b/build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml
index 17db9ccb3afe..2996fca35f01 100644
--- a/build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml
+++ b/build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml
@@ -3455,6 +3455,8 @@ spec:
           - csiaddonsnodes
         verbs:
           - get
+          - watch
+          - list
           - create
           - update
           - delete
@@ -3485,6 +3487,8 @@ spec:
           - csiaddonsnodes
         verbs:
           - get
+          - watch
+          - list
           - create
           - update
           - delete
@@ -3526,6 +3530,8 @@ spec:
           - csiaddonsnodes
         verbs:
           - get
+          - watch
+          - list
           - create
           - update
           - delete
diff --git a/pkg/operator/ceph/cluster/mon/mon.go b/pkg/operator/ceph/cluster/mon/mon.go
index 8375163e8d5b..4f6aed53bad3 100644
--- a/pkg/operator/ceph/cluster/mon/mon.go
+++ b/pkg/operator/ceph/cluster/mon/mon.go
@@ -386,6 +386,18 @@ func (c *Cluster) ConfigureArbiter() error {
 		return errors.Wrapf(err, "failed to find two failure domains %q in the CRUSH map", failureDomain)
 	}
 
+	// Before entering stretch mode, we must create at least one pool based on the default stretch rule
+	// Wait for the .mgr pool to be created, which we expect is defined as a CephBlockPool
+	// We may be able to remove this code waiting for the pool once this is in the ceph release:
+	// https://github.com/ceph/ceph/pull/61371
+	logger.Info("enabling stretch mode... waiting for the builtin .mgr pool to be created")
+	err = wait.PollUntilContextTimeout(c.ClusterInfo.Context, pollInterval, totalWaitTime, true, func(ctx context.Context) (bool, error) {
+		return c.builtinMgrPoolExists(), nil
+	})
+	if err != nil {
+		return errors.Wrapf(err, "failed to wait for .mgr pool to be created before setting the stretch tiebreaker")
+	}
+
 	// Set the mon tiebreaker
 	if err := cephclient.SetMonStretchTiebreaker(c.context, c.ClusterInfo, c.arbiterMon, failureDomain); err != nil {
 		return errors.Wrap(err, "failed to set mon tiebreaker")
@@ -394,6 +406,14 @@ func (c *Cluster) ConfigureArbiter() error {
 	return nil
 }
 
+func (c *Cluster) builtinMgrPoolExists() bool {
+	_, err := cephclient.GetPoolDetails(c.context, c.ClusterInfo, ".mgr")
+
+	// If the call fails, either the pool does not exist or else there is some Ceph error.
+	// Either way, we want to wait for confirmation of the pool existing.
+	return err == nil
+}
+
 func (c *Cluster) readyToConfigureArbiter(checkOSDPods bool) (bool, error) {
 	failureDomain := c.getFailureDomainName()
 
diff --git a/pkg/operator/ceph/controller/controller_utils.go b/pkg/operator/ceph/controller/controller_utils.go
index acdf7920b819..f2c232666e34 100644
--- a/pkg/operator/ceph/controller/controller_utils.go
+++ b/pkg/operator/ceph/controller/controller_utils.go
@@ -169,13 +169,30 @@ func canIgnoreHealthErrStatusInReconcile(cephCluster cephv1.CephCluster, control
 		}
 	}
 
-	// If there is only one cause for HEALTH_ERR and it's on the allowed list of errors, ignore it.
-	var allowedErrStatus = []string{"MDS_ALL_DOWN"}
-	var ignoreHealthErr = len(healthErrKeys) == 1 && contains(allowedErrStatus, healthErrKeys[0])
-	if ignoreHealthErr {
-		logger.Debugf("%q: ignoring ceph status %q because only cause is %q (full status is %+v)", controllerName, cephCluster.Status.CephStatus.Health, healthErrKeys[0], cephCluster.Status.CephStatus)
+	// If there are no errors, the caller actually expects false to be returned so the absence
+	// of an error doesn't cause the health status to be ignored. In production, if there are no
+	// errors, we would anyway expect the health status to be ok or warning. False in this case
+	// will cover if the health status is blank.
+	if len(healthErrKeys) == 0 {
+		return false
 	}
-	return ignoreHealthErr
+
+	allowedErrStatus := map[string]struct{}{
+		"MDS_ALL_DOWN":     {},
+		"MGR_MODULE_ERROR": {},
+	}
+	allCanBeIgnored := true
+	for _, healthErrKey := range healthErrKeys {
+		if _, ok := allowedErrStatus[healthErrKey]; !ok {
+			allCanBeIgnored = false
+			break
+		}
+	}
+	if allCanBeIgnored {
+		logger.Debugf("%q: ignoring ceph error status (full status is %+v)", controllerName, cephCluster.Status.CephStatus)
+		return true
+	}
+	return false
 }
 
 // IsReadyToReconcile determines if a controller is ready to reconcile or not
diff --git a/pkg/operator/ceph/controller/controller_utils_test.go b/pkg/operator/ceph/controller/controller_utils_test.go
index 82774c8c7e06..43fcb963f15d 100644
--- a/pkg/operator/ceph/controller/controller_utils_test.go
+++ b/pkg/operator/ceph/controller/controller_utils_test.go
@@ -45,7 +45,11 @@ func TestCanIgnoreHealthErrStatusInReconcile(t *testing.T) {
 	var cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
 		"MDS_ALL_DOWN": {
 			Severity: "HEALTH_ERR",
-			Message:  "MDS_ALL_DOWN",
+			Message:  "mds all down error",
+		},
+		"MGR_MODULE_ERROR": {
+			Severity: "HEALTH_ERR",
+			Message:  "mgr module error",
 		},
 		"TEST_OTHER": {
 			Severity: "HEALTH_WARN",
@@ -58,6 +62,22 @@ func TestCanIgnoreHealthErrStatusInReconcile(t *testing.T) {
 	})
 	assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
 
+	cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
+		"MGR_MODULE_ERROR": {
+			Severity: "HEALTH_ERR",
+			Message:  "mgr module error",
+		},
+	})
+	assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
+
+	cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
+		"MGR_MODULE_ERROR_FALSE": {
+			Severity: "HEALTH_ERR",
+			Message:  "mgr module error",
+		},
+	})
+	assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
+
 	cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
 		"MDS_ALL_DOWN": {
 			Severity: "HEALTH_ERR",
@@ -77,6 +97,10 @@ func TestCanIgnoreHealthErrStatusInReconcile(t *testing.T) {
 		},
 	})
 	assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
+
+	// The empty status should return false
+	cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{})
+	assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
 }
 
 func TestSetCephCommandsTimeout(t *testing.T) {
diff --git a/pkg/operator/ceph/file/controller_test.go b/pkg/operator/ceph/file/controller_test.go
index 293d370abac6..8f1700b3500c 100644
--- a/pkg/operator/ceph/file/controller_test.go
+++ b/pkg/operator/ceph/file/controller_test.go
@@ -254,6 +254,7 @@ func TestCephFilesystemController(t *testing.T) {
 		recorder:         record.NewFakeRecorder(5),
 		scheme:           s,
 		context:          c,
+		fsContexts:       make(map[string]*fsHealth),
 		opManagerContext: context.TODO(),
 	}
 	res, err := r.Reconcile(ctx, req)
diff --git a/pkg/operator/ceph/object/objectstore.go b/pkg/operator/ceph/object/objectstore.go
index e2e79304efa5..44cf3d26868c 100644
--- a/pkg/operator/ceph/object/objectstore.go
+++ b/pkg/operator/ceph/object/objectstore.go
@@ -1091,7 +1091,7 @@ func createSimilarPools(ctx *Context, pools []string, cluster *cephv1.ClusterSpe
 	if configurePoolsConcurrently() {
 		waitGroup, _ := errgroup.WithContext(ctx.clusterInfo.Context)
 		for _, pool := range pools {
-			// Avoid the loop re-using the same value with a closure
+			// Avoid the loop reusing the same value with a closure
 			pool := pool
 
 			waitGroup.Go(func() error { return createRGWPool(ctx, cluster, poolSpec, pgCount, pool) })
diff --git a/tests/scripts/multus/host-cfg-ds.yaml b/tests/scripts/multus/host-cfg-ds.yaml
index 9fbedcfa381b..e3e868617d7f 100644
--- a/tests/scripts/multus/host-cfg-ds.yaml
+++ b/tests/scripts/multus/host-cfg-ds.yaml
@@ -51,7 +51,7 @@ spec:
           last="${ip##*.}" # e.g., 3
 
           # add a shim to connect IFACE to the macvlan public network, with a static IP
-          # avoid IP conflicts by re-using the last part of the existing IFACE IP
+          # avoid IP conflicts by reusing the last part of the existing IFACE IP
          ip link add public-shim link ${IFACE} type macvlan mode bridge
          ip addr add ${NODE_PUBLIC_NET_IP_FIRST3}.${last}/24 dev public-shim
          ip link set public-shim up
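
Reviewer note: a minimal, dependency-free sketch of the allowlist semantics introduced in controller_utils.go above, matching the cases exercised in controller_utils_test.go: an empty set of HEALTH_ERR codes is never treated as ignorable (so a blank health status stays not-ignorable), and the status is ignorable only when every code is on the allowlist. The helper name canIgnoreHealthErrs and its plain string-slice input are simplifications for illustration, not the actual Rook signature.

package main

import "fmt"

// canIgnoreHealthErrs is a hypothetical stand-in mirroring the new logic:
// it returns true only when there is at least one HEALTH_ERR code and every
// code is allowlisted.
func canIgnoreHealthErrs(healthErrKeys []string) bool {
	if len(healthErrKeys) == 0 {
		// The caller expects false here so an empty/blank status is not ignored.
		return false
	}
	allowedErrStatus := map[string]struct{}{
		"MDS_ALL_DOWN":     {},
		"MGR_MODULE_ERROR": {},
	}
	for _, key := range healthErrKeys {
		if _, ok := allowedErrStatus[key]; !ok {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(canIgnoreHealthErrs(nil))                                  // false: blank status
	fmt.Println(canIgnoreHealthErrs([]string{"MGR_MODULE_ERROR"}))         // true: allowlisted
	fmt.Println(canIgnoreHealthErrs([]string{"MDS_ALL_DOWN", "OSD_DOWN"})) // false: OSD_DOWN is not allowlisted
}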