Skip to content

Commit a3490ce

Browse files
committed
Fix double spares for failed vdev
It's possible for two spares to get attached to a single failed vdev. This happens when a failed disk has been spared and is then replaced with a new disk, but the new disk fails during the resilver and ZED kicks in a second spare for the failed new disk. This commit checks for that condition and disallows it. Closes: #16547 Signed-off-by: Tony Hutter <hutter2@llnl.gov>
1 parent 8d14897 commit a3490ce

File tree

1 file changed

+82
-0
lines changed

1 file changed

+82
-0
lines changed

module/zfs/spa.c

+82
Original file line numberDiff line numberDiff line change
@@ -7430,6 +7430,82 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
74307430
return (0);
74317431
}
74327432

7433+
/*
7434+
* Given a vdev to be replaced and its parent, check for a possible
7435+
* "double spare" condition if a vdev is to be replaced by a spare. When this
7436+
* happens, you can get two spares assigned to one failed vdev.
7437+
*
7438+
* To trigger a double spare condition:
7439+
*
7440+
* 1. disk1 fails
7441+
* 2. 1st spare is kicked in for disk1 and it resilvers
7442+
* 3. Someone replaces disk1 with a new blank disk
7443+
* 4. New blank disk starts resilvering
7444+
* 5. While resilvering, new blank disk has IO errors and faults
7445+
* 6. 2nd spare is kicked in for new blank disk
7446+
* 7. At this point two spares are kicked in for the original disk1.
7447+
*
7448+
* It looks like this:
7449+
*
7450+
* NAME STATE READ WRITE CKSUM
7451+
* tank2 DEGRADED 0 0 0
7452+
* draid2:6d:10c:2s-0 DEGRADED 0 0 0
7453+
* scsi-0QEMU_QEMU_HARDDISK_d1 ONLINE 0 0 0
7454+
* scsi-0QEMU_QEMU_HARDDISK_d2 ONLINE 0 0 0
7455+
* scsi-0QEMU_QEMU_HARDDISK_d3 ONLINE 0 0 0
7456+
* scsi-0QEMU_QEMU_HARDDISK_d4 ONLINE 0 0 0
7457+
* scsi-0QEMU_QEMU_HARDDISK_d5 ONLINE 0 0 0
7458+
* scsi-0QEMU_QEMU_HARDDISK_d6 ONLINE 0 0 0
7459+
* scsi-0QEMU_QEMU_HARDDISK_d7 ONLINE 0 0 0
7460+
* scsi-0QEMU_QEMU_HARDDISK_d8 ONLINE 0 0 0
7461+
* scsi-0QEMU_QEMU_HARDDISK_d9 ONLINE 0 0 0
7462+
* spare-9 DEGRADED 0 0 0
7463+
* replacing-0 DEGRADED 0 93 0
7464+
* scsi-0QEMU_QEMU_HARDDISK_d10-part1/old UNAVAIL 0 0 0
7465+
* spare-1 DEGRADED 0 0 0
7466+
* scsi-0QEMU_QEMU_HARDDISK_d10 REMOVED 0 0 0
7467+
* draid2-0-0 ONLINE 0 0 0
7468+
* draid2-0-1 ONLINE 0 0 0
7469+
* spares
7470+
* draid2-0-0 INUSE currently in use
7471+
* draid2-0-1 INUSE currently in use
7472+
*
7473+
* ARGS:
7474+
*
7475+
* newvd: New spare disk
7476+
* pvd: Parent vdev_t the spare should attach to
7477+
*
7478+
* This function returns B_TRUE if adding the new vdev would create a double
7479+
* spare condition, B_FALSE otherwise.
7480+
*/
7481+
static boolean_t
7482+
spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
7483+
{
7484+
vdev_t *ppvd;
7485+
7486+
ppvd = pvd->vdev_parent;
7487+
if (ppvd == NULL || ppvd->vdev_ops == NULL)
7488+
return (B_FALSE);
7489+
7490+
/*
7491+
* To determine if this configuration would cause a double spare, we
7492+
* look at the vdev_op_type string of the parent vdev, and of the
7493+
* parent's parent vdev. We also look at vdev_isspare on the new disk.
7494+
* A double spare condition looks like this:
7495+
*
7496+
* 1. parent of parent's op is a spare or draid spare
7497+
* 2. parent's op is replacing
7498+
* 3. new disk is a spare
7499+
*/
7500+
if ((ppvd->vdev_ops == &vdev_spare_ops) ||
7501+
(ppvd->vdev_ops == &vdev_draid_spare_ops))
7502+
if (pvd->vdev_ops == &vdev_replacing_ops)
7503+
if (newvd->vdev_isspare)
7504+
return (B_TRUE);
7505+
7506+
return (B_FALSE);
7507+
}
7508+
74337509
/*
74347510
* Attach a device to a vdev specified by its guid. The vdev type can be
74357511
* a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
@@ -7604,6 +7680,12 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
76047680
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
76057681
}
76067682

7683+
if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) {
7684+
vdev_dbgmsg(newvd,
7685+
"disk would create double spares, ignore.");
7686+
return (spa_vdev_exit(spa, newrootvd, txg, EEXIST));
7687+
}
7688+
76077689
if (newvd->vdev_isspare)
76087690
pvops = &vdev_spare_ops;
76097691
else

0 commit comments

Comments
 (0)