From c43df8bbbfdde03cc24b61203d7c2ab1bfefdf6c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 21 Feb 2025 05:42:42 +1100 Subject: [PATCH] vdev_file: unify FreeBSD and Linux implementations (#17046) Kernel & userspace specifics are in zfs_file_os.c, so there's no particular reason these have to be separate. The one platform-specific part is in the Linux kernel part, to offload flushes to a taskq if we're already inside a filesystem transaction. This would normally be an unsatisfying wart, but I'm intending to remove this shortly, so I'm content to leave it gated for the moment. Reviewed-by: Allan Jude Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- lib/libzpool/Makefile.am | 2 +- module/Kbuild.in | 2 +- module/Makefile.bsd | 2 +- module/os/freebsd/zfs/vdev_file.c | 342 -------------------------- module/{os/linux => }/zfs/vdev_file.c | 60 +++-- 5 files changed, 31 insertions(+), 377 deletions(-) delete mode 100644 module/os/freebsd/zfs/vdev_file.c rename module/{os/linux => }/zfs/vdev_file.c (95%) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 404b737c204d..8875393dcb22 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -46,7 +46,6 @@ nodist_libzpool_la_SOURCES = \ module/lua/lvm.c \ module/lua/lzio.c \ \ - module/os/linux/zfs/vdev_file.c \ module/os/linux/zfs/zio_crypt.c \ \ module/zcommon/cityhash.c \ @@ -143,6 +142,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/vdev.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ + module/zfs/vdev_file.c \ module/zfs/vdev_indirect.c \ module/zfs/vdev_indirect_births.c \ module/zfs/vdev_indirect_mapping.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 5190afc506f9..569c3a869015 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -385,6 +385,7 @@ ZFS_OBJS := \ vdev.o \ vdev_draid.o \ vdev_draid_rand.o \ + vdev_file.o \ vdev_indirect.o \ 
vdev_indirect_births.o \ vdev_indirect_mapping.o \ @@ -446,7 +447,6 @@ ZFS_OBJS_OS := \ spa_misc_os.o \ trace.o \ vdev_disk.o \ - vdev_file.o \ vdev_raidz.o \ vdev_label_os.o \ zfs_acl.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index c605069d07d3..dcd9800c7f02 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -199,7 +199,6 @@ SRCS+= abd_os.c \ kmod_core.c \ spa_os.c \ sysctl_os.c \ - vdev_file.c \ vdev_geom.c \ vdev_label_os.c \ zfs_acl.c \ @@ -313,6 +312,7 @@ SRCS+= abd.c \ vdev.c \ vdev_draid.c \ vdev_draid_rand.c \ + vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ vdev_indirect_mapping.c \ diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c deleted file mode 100644 index 6719c87f82e5..000000000000 --- a/module/os/freebsd/zfs/vdev_file.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Virtual device vector for files. 
- */ - -static taskq_t *vdev_file_taskq; - -static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; -static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; - -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), - minclsyspri, max_ncpus, INT_MAX, 0); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - -static void -vdev_file_hold(vdev_t *vd) -{ - ASSERT3P(vd->vdev_path, !=, NULL); -} - -static void -vdev_file_rele(vdev_t *vd) -{ - ASSERT3P(vd->vdev_path, !=, NULL); -} - -static mode_t -vdev_file_open_mode(spa_mode_t spa_mode) -{ - mode_t mode = 0; - - if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { - mode = O_RDWR; - } else if (spa_mode & SPA_MODE_READ) { - mode = O_RDONLY; - } else if (spa_mode & SPA_MODE_WRITE) { - mode = O_WRONLY; - } - - return (mode | O_LARGEFILE); -} - -static int -vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *logical_ashift, uint64_t *physical_ashift) -{ - vdev_file_t *vf; - zfs_file_t *fp; - zfs_file_attr_t zfa; - int error; - - /* - * Rotational optimizations only make sense on block devices. - */ - vd->vdev_nonrot = B_TRUE; - - /* - * Allow TRIM on file based vdevs. This may not always be supported, - * since it depends on your kernel version and underlying filesystem - * type but it is always safe to attempt. - */ - vd->vdev_has_trim = B_TRUE; - - /* - * Disable secure TRIM on file based vdevs. There is no way to - * request this behavior from the underlying filesystem. - */ - vd->vdev_has_securetrim = B_FALSE; - - /* - * We must have a pathname, and it must be absolute. - */ - if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; - return (SET_ERROR(EINVAL)); - } - - /* - * Reopen the device if it's not currently open. Otherwise, - * just update the physical size of the device. 
- */ - if (vd->vdev_tsd != NULL) { - ASSERT(vd->vdev_reopening); - vf = vd->vdev_tsd; - goto skip_open; - } - - vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); - - /* - * We always open the files from the root of the global zone, even if - * we're in a local zone. If the user has gotten to this point, the - * administrator has already decided that the pool should be available - * to local zone users, so the underlying devices should be as well. - */ - ASSERT3P(vd->vdev_path, !=, NULL); - ASSERT(vd->vdev_path[0] == '/'); - - error = zfs_file_open(vd->vdev_path, - vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - vf->vf_file = fp; - -#ifdef _KERNEL - /* - * Make sure it's a regular file. - */ - if (zfs_file_getattr(fp, &zfa)) { - return (SET_ERROR(ENODEV)); - } - if (!S_ISREG(zfa.zfa_mode)) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (SET_ERROR(ENODEV)); - } -#endif - -skip_open: - - error = zfs_file_getattr(vf->vf_file, &zfa); - if (error) { - vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; - return (error); - } - - *max_psize = *psize = zfa.zfa_size; - *logical_ashift = vdev_file_logical_ashift; - *physical_ashift = vdev_file_physical_ashift; - - return (0); -} - -static void -vdev_file_close(vdev_t *vd) -{ - vdev_file_t *vf = vd->vdev_tsd; - - if (vd->vdev_reopening || vf == NULL) - return; - - if (vf->vf_file != NULL) { - zfs_file_close(vf->vf_file); - } - - vd->vdev_delayed_close = B_FALSE; - kmem_free(vf, sizeof (vdev_file_t)); - vd->vdev_tsd = NULL; -} - -/* - * Implements the interrupt side for file vdev types. This routine will be - * called when the I/O completes allowing us to transfer the I/O to the - * interrupt taskqs. For consistency, the code structure mimics disk vdev - * types. 
- */ -static void -vdev_file_io_intr(zio_t *zio) -{ - zio_delay_interrupt(zio); -} - -static void -vdev_file_io_strategy(void *arg) -{ - zio_t *zio = arg; - vdev_t *vd = zio->io_vd; - vdev_file_t *vf; - void *buf; - ssize_t resid; - loff_t off; - ssize_t size; - int err; - - off = zio->io_offset; - size = zio->io_size; - resid = 0; - - vf = vd->vdev_tsd; - - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - if (zio->io_type == ZIO_TYPE_READ) { - buf = abd_borrow_buf(zio->io_abd, zio->io_size); - err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); - abd_return_buf_copy(zio->io_abd, buf, size); - } else { - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); - err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); - abd_return_buf(zio->io_abd, buf, size); - } - zio->io_error = err; - if (resid != 0 && zio->io_error == 0) - zio->io_error = ENOSPC; - - vdev_file_io_intr(zio); -} - -static void -vdev_file_io_start(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - vdev_file_t *vf = vd->vdev_tsd; - - if (zio->io_type == ZIO_TYPE_FLUSH) { - /* XXPOLICY */ - if (!vdev_readable(vd)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return; - } - - zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC); - - zio_execute(zio); - return; - } else if (zio->io_type == ZIO_TYPE_TRIM) { - ASSERT3U(zio->io_size, !=, 0); - zio->io_error = zfs_file_deallocate(vf->vf_file, - zio->io_offset, zio->io_size); - zio_execute(zio); - return; - } - ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); - zio->io_target_timestamp = zio_handle_io_delay(zio); - - VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, - TQ_SLEEP), !=, 0); -} - -static void -vdev_file_io_done(zio_t *zio) -{ - (void) zio; -} - -vdev_ops_t vdev_file_ops = { - .vdev_op_init = NULL, - .vdev_op_fini = NULL, - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_min_asize = 
vdev_default_min_asize, - .vdev_op_min_alloc = NULL, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_rebuild_asize = NULL, - .vdev_op_metaslab_init = NULL, - .vdev_op_config_generate = NULL, - .vdev_op_nparity = NULL, - .vdev_op_ndisks = NULL, - .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -/* - * From userland we access disks just like files. - */ -#ifndef _KERNEL - -vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, - .vdev_op_fini = NULL, - .vdev_op_open = vdev_file_open, - .vdev_op_close = vdev_file_close, - .vdev_op_asize = vdev_default_asize, - .vdev_op_min_asize = vdev_default_min_asize, - .vdev_op_min_alloc = NULL, - .vdev_op_io_start = vdev_file_io_start, - .vdev_op_io_done = vdev_file_io_done, - .vdev_op_state_change = NULL, - .vdev_op_need_resilver = NULL, - .vdev_op_hold = vdev_file_hold, - .vdev_op_rele = vdev_file_rele, - .vdev_op_remap = NULL, - .vdev_op_xlate = vdev_default_xlate, - .vdev_op_rebuild_asize = NULL, - .vdev_op_metaslab_init = NULL, - .vdev_op_config_generate = NULL, - .vdev_op_nparity = NULL, - .vdev_op_ndisks = NULL, - .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ -}; - -#endif - -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, UINT, ZMOD_RW, - "Logical ashift for file-based devices"); -ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, UINT, ZMOD_RW, - "Physical ashift for file-based devices"); diff --git a/module/os/linux/zfs/vdev_file.c b/module/zfs/vdev_file.c similarity index 95% rename from module/os/linux/zfs/vdev_file.c rename to module/zfs/vdev_file.c index 2cab6532487a..224340405d70 100644 --- a/module/os/linux/zfs/vdev_file.c +++ 
b/module/zfs/vdev_file.c @@ -21,26 +21,19 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ #include #include -#include #include #include -#include #include #include #include #include -#include -#include -#ifdef _KERNEL -#include -#include -#else -#include -#endif +#include + /* * Virtual device vector for files. */ @@ -58,16 +51,31 @@ static taskq_t *vdev_file_taskq; static uint_t vdev_file_logical_ashift = SPA_MINBLOCKSHIFT; static uint_t vdev_file_physical_ashift = SPA_MINBLOCKSHIFT; +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + static void vdev_file_hold(vdev_t *vd) { - ASSERT(vd->vdev_path != NULL); + ASSERT3P(vd->vdev_path, !=, NULL); } static void vdev_file_rele(vdev_t *vd) { - ASSERT(vd->vdev_path != NULL); + ASSERT3P(vd->vdev_path, !=, NULL); } static mode_t @@ -139,7 +147,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * administrator has already decided that the pool should be available * to local zone users, so the underlying devices should be as well. 
*/ - ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + ASSERT3P(vd->vdev_path, !=, NULL); + ASSERT3S(vd->vdev_path[0], ==, '/'); error = zfs_file_open(vd->vdev_path, vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); @@ -201,8 +210,8 @@ vdev_file_io_strategy(void *arg) zio_t *zio = (zio_t *)arg; vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; - ssize_t resid; void *buf; + ssize_t resid; loff_t off; ssize_t size; int err; @@ -211,6 +220,7 @@ vdev_file_io_strategy(void *arg) size = zio->io_size; resid = 0; + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); if (zio->io_type == ZIO_TYPE_READ) { buf = abd_borrow_buf(zio->io_abd, zio->io_size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); @@ -257,6 +267,7 @@ vdev_file_io_start(zio_t *zio) return; } +#ifdef __linux__ /* * We cannot safely call vfs_fsync() when PF_FSTRANS * is set in the current context. Filesystems like @@ -270,10 +281,9 @@ vdev_file_io_start(zio_t *zio) TASKQID_INVALID); return; } +#endif - zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); - - zio_execute(zio); + vdev_file_io_fsync(zio); return; } else if (zio->io_type == ZIO_TYPE_TRIM) { ASSERT3U(zio->io_size, !=, 0); @@ -283,6 +293,7 @@ vdev_file_io_start(zio_t *zio) return; } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); zio->io_target_timestamp = zio_handle_io_delay(zio); VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, @@ -320,21 +331,6 @@ vdev_ops_t vdev_file_ops = { .vdev_op_leaf = B_TRUE /* leaf vdev */ }; -void -vdev_file_init(void) -{ - vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), - minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); - - VERIFY(vdev_file_taskq); -} - -void -vdev_file_fini(void) -{ - taskq_destroy(vdev_file_taskq); -} - /* * From userland we access disks just like files. */