DFBUGS-1696: mgr: Wait for builtin mgr pool to exist before enabling stretch #839

Merged
6 changes: 6 additions & 0 deletions build/csv/ceph/rook-ceph-operator.clusterserviceversion.yaml
@@ -3455,6 +3455,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
@@ -3485,6 +3487,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
@@ -3526,6 +3530,8 @@ spec:
- csiaddonsnodes
verbs:
- get
- watch
- list
- create
- update
- delete
20 changes: 20 additions & 0 deletions pkg/operator/ceph/cluster/mon/mon.go
@@ -386,6 +386,18 @@ func (c *Cluster) ConfigureArbiter() error {
return errors.Wrapf(err, "failed to find two failure domains %q in the CRUSH map", failureDomain)
}

// Before entering stretch mode, at least one pool based on the default stretch rule must exist.
// Wait for the builtin .mgr pool to be created, which we expect to be defined as a CephBlockPool.
// This wait may become unnecessary once the following fix is included in a Ceph release:
// https://github.com/ceph/ceph/pull/61371
logger.Info("enabling stretch mode... waiting for the builtin .mgr pool to be created")
err = wait.PollUntilContextTimeout(c.ClusterInfo.Context, pollInterval, totalWaitTime, true, func(ctx context.Context) (bool, error) {
return c.builtinMgrPoolExists(), nil
})
if err != nil {
return errors.Wrapf(err, "failed to wait for .mgr pool to be created before setting the stretch tiebreaker")
}

// Set the mon tiebreaker
if err := cephclient.SetMonStretchTiebreaker(c.context, c.ClusterInfo, c.arbiterMon, failureDomain); err != nil {
return errors.Wrap(err, "failed to set mon tiebreaker")
@@ -394,6 +406,14 @@ func (c *Cluster) ConfigureArbiter() error {
return nil
}

func (c *Cluster) builtinMgrPoolExists() bool {
_, err := cephclient.GetPoolDetails(c.context, c.ClusterInfo, ".mgr")

// If the call fails, either the pool does not exist or there is some other Ceph error.
// Either way, wait until the pool's existence is confirmed.
return err == nil
}

func (c *Cluster) readyToConfigureArbiter(checkOSDPods bool) (bool, error) {
failureDomain := c.getFailureDomainName()

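For readers unfamiliar with the polling helper used above, here is a minimal standalone sketch of the same pattern. The interval, timeout, and `poolExists` helper are hypothetical stand-ins for Rook's `pollInterval`, `totalWaitTime`, and `builtinMgrPoolExists`; only `wait.PollUntilContextTimeout` from `k8s.io/apimachinery` is the real API.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// poolExists stands in for builtinMgrPoolExists above: assume it asks Ceph for
// the pool's details and reports true only when that query succeeds.
func poolExists(ctx context.Context, name string) bool {
	_ = ctx // a real implementation would pass ctx through to the Ceph call
	return name == ".mgr"
}

// waitForBuiltinMgrPool polls until the .mgr pool exists or the timeout expires.
func waitForBuiltinMgrPool(ctx context.Context, interval, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, interval, timeout, true,
		func(ctx context.Context) (bool, error) {
			// (false, nil) keeps polling; a non-nil error aborts immediately.
			return poolExists(ctx, ".mgr"), nil
		})
}

func main() {
	// Hypothetical values; Rook's pollInterval and totalWaitTime may differ.
	if err := waitForBuiltinMgrPool(context.TODO(), 5*time.Second, 2*time.Minute); err != nil {
		fmt.Println("giving up waiting for the .mgr pool:", err)
		return
	}
	fmt.Println(".mgr pool exists; safe to set the stretch tiebreaker")
}
```

Passing `true` for the `immediate` argument runs the condition once before the first interval elapses, so the common case where the pool already exists returns without waiting.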
29 changes: 23 additions & 6 deletions pkg/operator/ceph/controller/controller_utils.go
@@ -169,13 +169,30 @@ func canIgnoreHealthErrStatusInReconcile(cephCluster cephv1.CephCluster, control
}
}

// If there is only one cause for HEALTH_ERR and it's on the allowed list of errors, ignore it.
var allowedErrStatus = []string{"MDS_ALL_DOWN"}
var ignoreHealthErr = len(healthErrKeys) == 1 && contains(allowedErrStatus, healthErrKeys[0])
if ignoreHealthErr {
logger.Debugf("%q: ignoring ceph status %q because only cause is %q (full status is %+v)", controllerName, cephCluster.Status.CephStatus.Health, healthErrKeys[0], cephCluster.Status.CephStatus)
// If there are no error keys, return false so the caller does not treat a blank or empty
// health status as ignorable. In production, when there are no errors, the health status
// is expected to be OK or WARN anyway, so returning false here only covers the blank case.
if len(healthErrKeys) == 0 {
return false
}
return ignoreHealthErr

allowedErrStatus := map[string]struct{}{
"MDS_ALL_DOWN": {},
"MGR_MODULE_ERROR": {},
}
allCanBeIgnored := true
for _, healthErrKey := range healthErrKeys {
if _, ok := allowedErrStatus[healthErrKey]; !ok {
allCanBeIgnored = false
break
}
}
if allCanBeIgnored {
logger.Debugf("%q: ignoring ceph error status (full status is %+v)", controllerName, cephCluster.Status.CephStatus)
return true
}
return false
}

// IsReadyToReconcile determines if a controller is ready to reconcile or not
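A condensed sketch of the allow-list check introduced above, using the same two error codes; `canIgnore` is a hypothetical standalone stand-in for `canIgnoreHealthErrStatusInReconcile`:

```go
package main

import "fmt"

// allowedErrStatus mirrors the allow-list above; a map keyed by error code keeps lookups O(1).
var allowedErrStatus = map[string]struct{}{
	"MDS_ALL_DOWN":     {},
	"MGR_MODULE_ERROR": {},
}

// canIgnore returns true only when every HEALTH_ERR cause is on the allow-list.
// An empty input returns false so a blank health status is never treated as ignorable.
func canIgnore(healthErrKeys []string) bool {
	if len(healthErrKeys) == 0 {
		return false
	}
	for _, key := range healthErrKeys {
		if _, ok := allowedErrStatus[key]; !ok {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(canIgnore([]string{"MDS_ALL_DOWN", "MGR_MODULE_ERROR"})) // true
	fmt.Println(canIgnore([]string{"MGR_MODULE_ERROR", "OSD_DOWN"}))     // false
	fmt.Println(canIgnore(nil))                                          // false
}
```

A mix of allowed and non-allowed causes, like an empty status, yields false, matching the unit tests in the next file.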
26 changes: 25 additions & 1 deletion pkg/operator/ceph/controller/controller_utils_test.go
@@ -45,7 +45,11 @@ func TestCanIgnoreHealthErrStatusInReconcile(t *testing.T) {
var cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MDS_ALL_DOWN": {
Severity: "HEALTH_ERR",
Message: "MDS_ALL_DOWN",
Message: "mds all down error",
},
"MGR_MODULE_ERROR": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
"TEST_OTHER": {
Severity: "HEALTH_WARN",
@@ -58,6 +62,22 @@
})
assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MGR_MODULE_ERROR": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
})
assert.True(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MGR_MODULE_ERROR_FALSE": {
Severity: "HEALTH_ERR",
Message: "mgr module error",
},
})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{
"MDS_ALL_DOWN": {
Severity: "HEALTH_ERR",
@@ -77,6 +97,10 @@
},
})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))

// The empty status should return false
cluster = CreateTestClusterFromStatusDetails(map[string]cephv1.CephHealthMessage{})
assert.False(t, canIgnoreHealthErrStatusInReconcile(cluster, "controller"))
}

func TestSetCephCommandsTimeout(t *testing.T) {
1 change: 1 addition & 0 deletions pkg/operator/ceph/file/controller_test.go
@@ -254,6 +254,7 @@ func TestCephFilesystemController(t *testing.T) {
recorder: record.NewFakeRecorder(5),
scheme: s,
context: c,
fsContexts: make(map[string]*fsHealth),
opManagerContext: context.TODO(),
}
res, err := r.Reconcile(ctx, req)
2 changes: 1 addition & 1 deletion pkg/operator/ceph/object/objectstore.go
@@ -1091,7 +1091,7 @@ func createSimilarPools(ctx *Context, pools []string, cluster *cephv1.ClusterSpe
if configurePoolsConcurrently() {
waitGroup, _ := errgroup.WithContext(ctx.clusterInfo.Context)
for _, pool := range pools {
// Avoid the loop re-using the same value with a closure
// Avoid the loop reusing the same value with a closure
pool := pool

waitGroup.Go(func() error { return createRGWPool(ctx, cluster, poolSpec, pgCount, pool) })
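The `pool := pool` line above works around pre-Go 1.22 loop-variable capture, where every goroutine launched through the errgroup would otherwise observe the final value of `pool`. A minimal sketch of the idiom, with made-up pool names and a zero-value `errgroup.Group` instead of `errgroup.WithContext`:

```go
package main

import (
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	// Hypothetical pool names for illustration only.
	pools := []string{"meta", "log", "buckets.data"}

	var g errgroup.Group
	for _, pool := range pools {
		pool := pool // shadow the loop variable so each goroutine captures its own copy
		g.Go(func() error {
			fmt.Println("configuring pool", pool)
			return nil
		})
	}
	if err := g.Wait(); err != nil {
		fmt.Println("error:", err)
	}
}
```

From Go 1.22 onward each `for ... range` iteration gets a fresh variable, so the shadowing copy becomes unnecessary, though it remains harmless.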
2 changes: 1 addition & 1 deletion tests/scripts/multus/host-cfg-ds.yaml
@@ -51,7 +51,7 @@ spec:
last="${ip##*.}" # e.g., 3

# add a shim to connect IFACE to the macvlan public network, with a static IP
# avoid IP conflicts by re-using the last part of the existing IFACE IP
# avoid IP conflicts by reusing the last part of the existing IFACE IP
ip link add public-shim link ${IFACE} type macvlan mode bridge
ip addr add ${NODE_PUBLIC_NET_IP_FIRST3}.${last}/24 dev public-shim
ip link set public-shim up