Skip to content

range_tree: Add zfs_recover_rt parameter and extra debug info #17094

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions include/sys/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,32 @@ typedef enum zfs_range_seg_type {
ZFS_RANGE_SEG_NUM_TYPES,
} zfs_range_seg_type_t;

/*
* Range tree behavior flags.
*
* The UC (use case) flags are intended to support the zfs_recover_rt mode.
* The range tree's logic needs to know the context in order to correctly
* recover from an unexpected situation by exchanging potential data loss for
* a potential space leak:
*
* - If the tree is known to represent allocated space, then it is better to
* perform an unexpected addition to the tree.
*
* - Similarly, if the tree represents free (aka allocatable) space, then it
* should perform unexpected removals instead of silently ignoring the issue.
*
* The generic case means to simply ignore unexpected additions/removals as
* a recovery mechanism, without special treatment.
*
* Unexpected actions are logged with extra details such as a range tree
* name string, which can be marked as dynamic so that it is freed when the
* tree instance is destroyed.
*/
#define ZFS_RANGE_TREE_F_UC_GENERIC (1 << 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think "GENERIC" is really meaningful. Easier and cleaner I think would be just to pass 0 if we can't say anything better (we really should).

/* Tree represents allocated space: recovery performs unexpected additions. */
#define ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE (1 << 1)
/* Tree represents free space: recovery performs unexpected removals. */
#define ZFS_RANGE_TREE_F_UC_FREE_SPACE (1 << 2)
/* The name string is dynamic and is freed when the tree is destroyed. */
#define ZFS_RANGE_TREE_F_DYN_NAME (1 << 3)

/*
* Note: the range_tree may not be accessed concurrently; consumers
* must provide external locking if required.
Expand All @@ -67,6 +93,9 @@ typedef struct zfs_range_tree {
void *rt_arg;
uint64_t rt_gap; /* allowable inter-segment gap */

uint64_t rt_flags;
const char *rt_name; /* details for debugging */

/*
* The rt_histogram maintains a histogram of ranges. Each bucket,
* rt_histogram[i], contains the number of ranges whose size is:
Expand Down Expand Up @@ -280,6 +309,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops,
uint64_t gap);
zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops,
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
/*
 * Takes the same leading arguments as zfs_range_tree_create(), plus a mask
 * of ZFS_RANGE_TREE_F_* behavior flags and a name string carrying details
 * for debugging.
 */
zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops,
zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
uint64_t flags, const char *name);
void zfs_range_tree_destroy(zfs_range_tree_t *rt);
boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start,
uint64_t size);
Expand Down
6 changes: 6 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -1987,6 +1987,12 @@ Set to attempt to recover from fatal errors.
This should only be used as a last resort,
as it typically results in leaked space, or worse.
.
.It Sy zfs_recover_rt Ns = Ns Sy 0 Ns | Ns 1 Pq int
Set to attempt to recover from fatal errors while adding unexpected segments to,
or removing them from, a range tree.
This should only be used as a last resort,
as it typically results in leaked space.
.
.It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
Ignore hard I/O errors during device removal.
When set, if a device encounters a hard I/O error during the removal process
Expand Down
6 changes: 4 additions & 2 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -2435,8 +2435,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
int txgoff = tx->tx_txg & TXG_MASK;
if (dn->dn_free_ranges[txgoff] == NULL) {
dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL,
ZFS_RANGE_SEG64, NULL, 0, 0);
dn->dn_free_ranges[txgoff] =
zfs_range_tree_create_flags(
NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
ZFS_RANGE_TREE_F_UC_FREE_SPACE, "dn_free_ranges");
}
zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
Expand Down
73 changes: 55 additions & 18 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,17 @@ static metaslab_stats_t metaslab_stats = {
#define METASLABSTAT_BUMP(stat) \
atomic_inc_64(&metaslab_stats.stat.value.ui64);

/*
 * Format a descriptive name for one of a metaslab's range trees.
 *
 * The string is produced by kmem_asprintf() and combines the pool name,
 * the vdev guid, the metaslab id and the caller-supplied tree name.
 */
static inline char *
zfs_rt_name(metaslab_group_t *mg, metaslab_t *ms,
    const char *name)
{
	const char *pool = mg->mg_vd->vdev_spa->spa_name;
	u_longlong_t guid = (u_longlong_t)mg->mg_vd->vdev_guid;
	u_longlong_t id = (u_longlong_t)ms->ms_id;

	return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}",
	    pool, guid, id, name));
}


static kstat_t *metaslab_ksp;

Expand Down Expand Up @@ -2753,30 +2764,53 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
zfs_range_seg_type_t type =
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);

ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start,
shift);
ms->ms_allocatable = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_allocatable"));
for (int t = 0; t < TXG_SIZE; t++) {
ms->ms_allocating[t] = zfs_range_tree_create(NULL, type,
NULL, start, shift);
}
ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift);
ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift);
ms->ms_allocating[t] = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE |
ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_allocating"));
}
ms->ms_freeing = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_freeing"));
ms->ms_freed = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_freed"));
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL,
start, shift);
}
ms->ms_checkpointing =
zfs_range_tree_create(NULL, type, NULL, start, shift);
ms->ms_unflushed_allocs =
zfs_range_tree_create(NULL, type, NULL, start, shift);
ms->ms_defer[t] = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE |
ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_defer"));
}
ms->ms_checkpointing = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_checkpointing"));
ms->ms_unflushed_allocs = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_ALLOCATED_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_unflushed_allocs"));

metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
mrap->mra_floor_shift = metaslab_by_size_min_shift;
ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops,
type, mrap, start, shift);
ms->ms_unflushed_frees = zfs_range_tree_create_flags(
&metaslab_rt_ops, type, mrap, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_unflushed_frees"));

ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift);
ms->ms_trim = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(mg, ms, "ms_trim"));

metaslab_group_add(mg, ms);
metaslab_set_fragmentation(ms, B_FALSE);
Expand Down Expand Up @@ -3750,7 +3784,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
&start, &shift);

condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift);
condense_tree = zfs_range_tree_create_flags(
NULL, type, NULL, start, shift,
ZFS_RANGE_TREE_F_UC_FREE_SPACE | ZFS_RANGE_TREE_F_DYN_NAME,
zfs_rt_name(msp->ms_group, msp, "condense_tree"));

for (int t = 0; t < TXG_DEFER_SIZE; t++) {
zfs_range_tree_walk(msp->ms_defer[t],
Expand Down
Loading
Loading