diff --git a/README.md b/README.md
index 20cab02..ff5dc95 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ The following figure demonstrates the throughput of read stress test on a large
 
 ![Large block read throughput under stress test on a 180-node cluster](docs/images/peak_throughput.jpg)
 
+To benchmark 3FS, please use our [fio engine for USRBIO](benchmarks/fio_usrbio/README.md).
+
 ### 2. GraySort
 
 We evaluated [smallpond](https://github.com/deepseek-ai/smallpond) using the GraySort benchmark, which measures sort performance on large-scale datasets. Our implementation adopts a two-phase approach: (1) partitioning data via shuffle using the prefix bits of keys, and (2) in-partition sorting. Both phases read/write data from/to 3FS.
diff --git a/benchmarks/fio_usrbio/Makefile b/benchmarks/fio_usrbio/Makefile
new file mode 100644
index 0000000..be028de
--- /dev/null
+++ b/benchmarks/fio_usrbio/Makefile
@@ -0,0 +1,23 @@
+HF3FS_INCLUDE_DIR ?= /usr/include
+HF3FS_LIB_DIR ?= /usr/lib
+FIO_SRC_DIR ?= /usr/include
+
+PLUGIN_NAME = hf3fs_usrbio
+SO_NAME = ${PLUGIN_NAME}.so
+SRC = ${PLUGIN_NAME}.cpp
+OBJ = ${PLUGIN_NAME}.o
+
+CXX = g++
+CXXFLAGS = -fPIC -fpermissive -O3 -D_GNU_SOURCE -shared -rdynamic -I${HF3FS_INCLUDE_DIR} -I${FIO_SRC_DIR} -include config-host.h
+LDFLAGS = -L${HF3FS_LIB_DIR} -lhf3fs_api_shared -Wl,-rpath=${HF3FS_LIB_DIR}
+
+.PHONY: all clean
+
+all: ${SO_NAME}
+
+${SO_NAME}: ${SRC}
+	${CXX} ${CXXFLAGS} $^ -o $@ ${LDFLAGS}
+
+clean:
+	rm -rf ${OBJ} ${SO_NAME}
+
diff --git a/benchmarks/fio_usrbio/README.md b/benchmarks/fio_usrbio/README.md
new file mode 100644
index 0000000..dd43f4c
--- /dev/null
+++ b/benchmarks/fio_usrbio/README.md
@@ -0,0 +1,35 @@
+# FIO engine for 3FS USRBIO
+
+This directory contains the external [fio] engine used to benchmark [3FS] USRBIO.
+
+## Build
+
+First, build 3FS and fio.
+
+Configure the following variables:
+- `HF3FS_LIB_DIR`: the directory containing `libhf3fs_api_shared.so`; the default path in the 3FS repo is `3FS/build/src/lib/api`.
+- `HF3FS_INCLUDE_DIR`: the directory containing `hf3fs_usrbio.h`; the default path in the 3FS repo is `3FS/src/lib/api`.
+- `FIO_SRC_DIR`: the directory containing `config-host.h`. After building fio, this header is in the root of the fio source tree.
+
+Then run:
+```
+make HF3FS_LIB_DIR=${HF3FS_LIB_DIR} HF3FS_INCLUDE_DIR=${HF3FS_INCLUDE_DIR} FIO_SRC_DIR=${FIO_SRC_DIR}
+```
+
+You will get the external engine as `hf3fs_usrbio.so`.
+
+## Usage
+
+To use this engine, set the `ioengine` option in fio to `external:hf3fs_usrbio.so`. Please refer to the [fio documentation] for further details.
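+
+For example, a minimal large-block sequential read job might look like the sketch below (the mount point, file path, block size, and file size are placeholders to adapt to your own setup; the target file must live on the 3FS mount, and you may need to give the full path to `hf3fs_usrbio.so`):
+```
+[global]
+ioengine=external:hf3fs_usrbio.so
+mountpoint=/hf3fs/mount/point
+rw=read
+bs=4m
+size=1g
+
+[seqread]
+filename=/hf3fs/mount/point/testfile
+```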
+
+To benchmark batched small I/Os, set the following four parameters to the same value (the desired batch size), e.g. 1024:
+```
+iodepth=1024
+iodepth_batch_submit=1024
+iodepth_batch_complete_min=1024
+iodepth_batch_complete_max=1024
+```
+
+[fio]: https://github.com/axboe/fio
+[3FS]: https://github.com/deepseek-ai/3FS
+[fio documentation]: https://fio.readthedocs.io/en/latest/fio_doc.html
diff --git a/benchmarks/fio_usrbio/hf3fs_usrbio.cpp b/benchmarks/fio_usrbio/hf3fs_usrbio.cpp
new file mode 100644
index 0000000..7568fbd
--- /dev/null
+++ b/benchmarks/fio_usrbio/hf3fs_usrbio.cpp
@@ -0,0 +1,286 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include "hf3fs_usrbio.h"
+#include <vector>
+#include <iostream>
+
+extern "C" {
+#include "fio.h"
+#include "optgroup.h"
+}
+
+struct hf3fs_usrbio_options {
+    int dummy;
+    char *mountpoint;
+    int ior_depth;
+    int ior_timeout;
+};
+
+static struct fio_option options[] = {
+    {
+        .name = "mountpoint",
+        .lname = "hf3fs mount point",
+        .type = FIO_OPT_STR_STORE,
+        .off1 = offsetof(struct hf3fs_usrbio_options, mountpoint),
+        .help = "Mount point (e.g. /hf3fs/mount/point)",
+        .def = "",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_NETIO,
+    },
+    {
+        .name = "ior_depth",
+        .lname = "hf3fs ior depth",
+        .type = FIO_OPT_INT,
+        .off1 = offsetof(struct hf3fs_usrbio_options, ior_depth),
+        .help = "Ior depth",
+        .def = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_NETIO,
+    },
+    {
+        .name = "ior_timeout",
+        .lname = "hf3fs ior timeout (in ms)",
+        .type = FIO_OPT_INT,
+        .off1 = offsetof(struct hf3fs_usrbio_options, ior_timeout),
+        .help = "Ior timeout",
+        .def = "1",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_NETIO,
+    },
+    {
+        .name = NULL,
+    },
+};
+
+#define LAST_POS(f) ((f)->engine_pos)
+
+struct hf3fs_usrbio_data {
+    struct hf3fs_iov iov;
+    struct hf3fs_ior ior_r;
+    struct hf3fs_ior ior_w;
+    std::vector<struct io_u *> io_us;
+    int queued;
+    int events;
+    enum fio_ddir last_ddir;
+};
+
+static int hf3fs_usrbio_init(struct thread_data *td) {
+    td->io_ops_data = static_cast<void *>(new hf3fs_usrbio_data);
+    struct hf3fs_usrbio_options *options = td->eo;
+
+    // Create one I/O ring for reads and one for writes.
+    auto &ior_r = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->ior_r;
+    auto res = hf3fs_iorcreate3(&ior_r, options->mountpoint, td->o.iodepth, true, options->ior_depth, 0, options->ior_timeout, -1);
+    if (res < 0) {
+        return res;
+    }
+
+    auto &ior_w = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->ior_w;
+    res = hf3fs_iorcreate(&ior_w, options->mountpoint, td->o.iodepth, false, options->ior_depth, -1);
+    if (res < 0) {
+        return res;
+    }
+
+    auto *data = static_cast<hf3fs_usrbio_data *>(td->io_ops_data);
+    data->io_us.resize(td->o.iodepth);
+    data->queued = 0;
+    data->events = 0;
+
+    return 0;
+}
+
+static int fio_io_end(struct thread_data *td, struct io_u *io_u, int ret) {
+    if (io_u->file && ret >= 0 && ddir_rw(io_u->ddir)) {
+        LAST_POS(io_u->file) = io_u->offset + ret;
+    }
+
+    if (ret != (int) io_u->xfer_buflen) {
+        if (ret >= 0) {
+            io_u->resid = io_u->xfer_buflen - ret;
+            io_u->error = 0;
+            return FIO_Q_COMPLETED;
+        } else {
+            io_u->error = errno;
+        }
+    }
+
+    if (io_u->error) {
+        io_u_log_error(td, io_u);
+        td_verror(td, io_u->error, "xfer");
+    }
+
+    return FIO_Q_COMPLETED;
+}
+
+static enum fio_q_status hf3fs_usrbio_queue(struct thread_data *td, struct io_u *io_u) {
+    auto &vec = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->io_us;
+    auto *sd = static_cast<hf3fs_usrbio_data *>(td->io_ops_data);
+
+    // Accumulate I/Os of one direction; a direction switch (or a full queue)
+    // returns FIO_Q_BUSY so that fio commits the pending batch first.
+    if (io_u->ddir != sd->last_ddir) {
+        if (sd->queued != 0) {
+            return FIO_Q_BUSY;
+        } else {
+            vec[sd->queued++] = io_u;
+            sd->last_ddir = io_u->ddir;
+            return FIO_Q_QUEUED;
+        }
+    } else {
+        if (sd->queued == td->o.iodepth) {
+            return FIO_Q_BUSY;
+        }
+        vec[sd->queued++] = io_u;
+        return FIO_Q_QUEUED;
+    }
+}
+
+static int hf3fs_usrbio_commit(struct thread_data *td) {
+    auto &vec = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->io_us;
+    auto *sd = static_cast<hf3fs_usrbio_data *>(td->io_ops_data);
+    auto &ior_r = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->ior_r;
+    auto &ior_w = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->ior_w;
+    auto &iov = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->iov;
+
+    if (sd->queued == 0) {
+        return 0;
+    }
+
+    io_u_mark_submit(td, sd->queued);
+
+    // Prepare, submit and wait for the whole batch through USRBIO.
+    int res = 0;
+    bool read = (sd->last_ddir == DDIR_READ);
+    auto &ior = read ? ior_r : ior_w;
+    for (int i = 0; i < sd->queued; i++) {
+        res = hf3fs_prep_io(&ior, &iov, read, vec[i]->xfer_buf, vec[i]->file->fd, vec[i]->offset, vec[i]->xfer_buflen, nullptr);
+        if (res < 0) {
+            std::cout << "prep " << res << " " << vec[i]->file->fd << std::endl;
+            return res;
+        }
+    }
+    res = hf3fs_submit_ios(&ior);
+    if (res < 0) {
+        std::cout << "submit " << res << std::endl;
+        return res;
+    }
+
+    std::vector<struct hf3fs_cqe> cqe(sd->queued);
+    res = hf3fs_wait_for_ios(&ior, cqe.data(), sd->queued, sd->queued, nullptr);
+    if (res < 0) {
+        std::cout << "wait " << res << std::endl;
+        return res;
+    }
+
+    for (int i = 0; i < sd->queued; i++) {
+        if (cqe[i].result < 0) {
+            std::cout << "cqe error " << cqe[i].result << std::endl;
+            return cqe[i].result;
+        }
+    }
+
+    sd->events = sd->queued;
+    sd->queued = 0;
+
+    return 0;
+}
+
+static int hf3fs_usrbio_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec fio_unused *t) {
+    auto *sd = static_cast<hf3fs_usrbio_data *>(td->io_ops_data);
+    int ret = 0;
+    if (min) {
+        ret = sd->events;
+        sd->events = 0;
+    }
+    return ret;
+}
+
+static struct io_u *hf3fs_usrbio_event(struct thread_data *td, int event) {
+    auto &vec = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->io_us;
+    return vec[event];
+}
+
+static void hf3fs_usrbio_cleanup(struct thread_data *td) {
+    delete static_cast<hf3fs_usrbio_data *>(td->io_ops_data);
+}
+
+static int hf3fs_usrbio_open(struct thread_data *td, struct fio_file *f) {
+    int flags = 0;
+    if (td_write(td)) {
+        if (!read_only) {
+            flags = O_RDWR;
+        }
+    } else if (td_read(td)) {
+        if (!read_only) {
+            flags = O_RDWR;
+        } else {
+            flags = O_RDONLY;
+        }
+    }
+
+    // Register the fd with USRBIO so it can be used by hf3fs_prep_io().
+    f->fd = open(f->file_name, flags);
+    hf3fs_reg_fd(f->fd, 0);
+    td->o.open_files++;
+    return 0;
+}
+
+static int hf3fs_usrbio_close(struct thread_data *td, struct fio_file *f) {
+    hf3fs_dereg_fd(f->fd);
+    close(f->fd);
+    f->fd = -1;
+    return 0;
+}
+
+static int hf3fs_usrbio_alloc(struct thread_data *td, size_t total_mem) {
+    struct hf3fs_usrbio_options *options = td->eo;
+
+    // Allocate fio's I/O buffer as a registered USRBIO iov on the 3FS mount.
+    auto &iov = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->iov;
+    auto res = hf3fs_iovcreate(&iov, options->mountpoint, total_mem, 0, -1);
+    if (res < 0) {
+        return res;
+    }
+
+    td->orig_buffer = iov.base;
+    return 0;
+}
+
+static void hf3fs_usrbio_free(struct thread_data *td) {
+    auto &iov = static_cast<hf3fs_usrbio_data *>(td->io_ops_data)->iov;
+    hf3fs_iovdestroy(&iov);
+}
+
+static int hf3fs_invalidate(struct thread_data *td, struct fio_file *f) {
+    return 0;
+}
+
+extern "C" {
+
+static struct ioengine_ops ioengine;
+void get_ioengine(struct ioengine_ops **ioengine_ptr) {
+    *ioengine_ptr = &ioengine;
+
+    ioengine.name = "hf3fs_usrbio";
+    ioengine.version = FIO_IOOPS_VERSION;
+    ioengine.flags = FIO_SYNCIO | FIO_NODISKUTIL;
+    ioengine.init = hf3fs_usrbio_init;
+    ioengine.queue = hf3fs_usrbio_queue;
+    ioengine.commit = hf3fs_usrbio_commit;
+    ioengine.getevents = hf3fs_usrbio_getevents;
+    ioengine.event = hf3fs_usrbio_event;
+    ioengine.cleanup = hf3fs_usrbio_cleanup;
+    ioengine.open_file = hf3fs_usrbio_open;
+    ioengine.close_file = hf3fs_usrbio_close;
+    ioengine.invalidate = hf3fs_invalidate;
+    ioengine.get_file_size = generic_get_file_size;
+    ioengine.iomem_alloc = hf3fs_usrbio_alloc;
+    ioengine.iomem_free = hf3fs_usrbio_free;
+    ioengine.option_struct_size = sizeof(struct hf3fs_usrbio_options);
+    ioengine.options = options;
+}
+
+}