From 94e7a061a392e5b309884cde9d4eb9dbf79649d6 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 24 Oct 2023 10:09:15 -0700 Subject: [PATCH 1/5] Make aio_fbsd work on SCALE --- debian/control | 3 + lib/tevent/testsuite.c | 18 +-- lib/tevent/tevent_epoll.c | 237 ++++++++++++++++++++++++++++++++- lib/tevent/tevent_kqueue.h | 33 ++++- lib/tevent/tevent_libaio.h | 54 ++++++++ lib/tevent/wscript | 1 + source3/modules/vfs_aio_fbsd.c | 36 +++-- 7 files changed, 354 insertions(+), 28 deletions(-) create mode 100644 lib/tevent/tevent_libaio.h diff --git a/debian/control b/debian/control index 9ed5f160fb5..dc4f58e9ca9 100644 --- a/debian/control +++ b/debian/control @@ -12,6 +12,8 @@ Build-Depends: bison, docbook-xsl, flex, libacl1-dev, + libaio1, + libaio-dev, libarchive-dev, libblkid-dev, libbsd-dev, @@ -61,6 +63,7 @@ Package: truenas-samba Architecture: any Pre-Depends: dpkg (>= 1.15.6~), ${misc:Pre-Depends} Depends: adduser, + libaio1, libpam-modules, libpam-runtime (>= 1.0.1-11), lsb-base (>= 4.1+Debian), diff --git a/lib/tevent/testsuite.c b/lib/tevent/testsuite.c index 1f5842ba135..5e1ef318062 100644 --- a/lib/tevent/testsuite.c +++ b/lib/tevent/testsuite.c @@ -38,7 +38,9 @@ #endif #ifdef HAVE_KQUEUE #include "tevent_kqueue.h" -#endif +#else +#include "tevent_libaio.h" +#endif /* HAVE_KQUEUE */ static struct tevent_context * @@ -1960,7 +1962,6 @@ static bool test_cached_pid(struct torture_context *test, return true; } -#ifdef HAVE_KQUEUE static bool aio_recv(struct torture_context *test, struct tevent_req *req) { struct tevent_aiocb *taiocbp = NULL; @@ -1987,10 +1988,7 @@ static bool aio_pread_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fd; - iocbp->aio_offset = offset; - iocbp->aio_buf = data; - iocbp->aio_nbytes = n; + tio_prep_pread(iocbp, fd, data, n, offset); ret = tevent_add_aio_read(taiocbp); torture_assert(test, ret != -1, "aio_pread_send()"); @@ -2016,10 +2014,7 @@ static bool aio_pwrite_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fd; - iocbp->aio_offset = offset; - iocbp->aio_buf = data; - iocbp->aio_nbytes = n; + tio_prep_write(iocbp, fd, data, n, offset); ret = tevent_add_aio_write(taiocbp); torture_assert(test, ret != -1, "aio_write_send()"); @@ -2043,7 +2038,7 @@ static bool aio_fsync_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fd; + tio_prep_fsync(iocbp, fd); ret = tevent_add_aio_fsync(taiocbp); torture_assert(test, ret != -1, "aio_write_send()"); @@ -2203,7 +2198,6 @@ static bool test_event_kqueue_aio_pwrite_cancel(struct torture_context *test, TALLOC_FREE(ev_ctx); return true; } -#endif struct torture_suite *torture_local_event(TALLOC_CTX *mem_ctx) { diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c index 2771a83fdd2..30ec8301700 100644 --- a/lib/tevent/tevent_epoll.c +++ b/lib/tevent/tevent_epoll.c @@ -25,14 +25,29 @@ License along with this library; if not, see . */ +#include #include "replace.h" #include "system/filesys.h" #include "system/select.h" #include "tevent.h" #include "tevent_internal.h" #include "tevent_util.h" +#include "libaio.h" +#include "tevent_libaio.h" -struct epoll_event_context { +#define LIBAIO_MAX_EV 256 +#define LIBAIO_MAX_RECV 16 + +struct libaio_data { + io_context_t ctx; + TALLOC_CTX *iocb_pool; + int io_event_fd; + struct io_event events[LIBAIO_MAX_RECV]; +}; + +struct timespec libaio_ts; + +typedef struct epoll_event_context { /* a pointer back to the generic event_context */ struct tevent_context *ev; @@ -44,12 +59,14 @@ struct epoll_event_context { bool panic_force_replay; bool *panic_state; bool (*panic_fallback)(struct tevent_context *ev, bool replay); -}; + struct libaio_data aio; +} epoll_ev_ctx_t; #define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT (1<<0) #define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR (1<<1) #define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR (1<<2) #define EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX (1<<3) +#define EVTOEPOLL(x) (talloc_get_type_abort(x->additional_data, epoll_ev_ctx_t)) #ifdef TEST_PANIC_FALLBACK @@ -179,6 +196,100 @@ static uint32_t epoll_map_flags(uint16_t flags) return ret; } +static void free_libaio_data(struct libaio_data *data) +{ + io_queue_release(data->ctx); + close(data->io_event_fd); + data->io_event_fd = -1; + TALLOC_FREE(data->iocb_pool); +} + +static void process_io_event(epoll_ev_ctx_t *epoll_ev, + struct io_event *event) +{ + struct tevent_aiocb *tiocbp = NULL; + tiocbp = talloc_get_type_abort(event->data, struct tevent_aiocb); + + if (tiocbp->iocbp == NULL) { + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, + "iocb request is already completed.\n"); + abort(); + } + if (event->res < 0) { + int error = -event->res; + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING, + "%s: processing AIO [%p] - failed: %s\n", + tiocbp->location, tiocbp->iocbp, + strerror(error)); + tiocbp->saved_errno = error; + tiocbp->rv = -1; + TALLOC_FREE(tiocbp->iocbp); + tevent_req_error(tiocbp->req, error); + return; + } + + tiocbp->rv = event->res; + return; +} + +static void libaio_eventfd_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, + void *private_data) +{ + int nevents, i; + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(ev); + + /* io_getevents either returns event count or -errno */ + nevents = io_getevents(epoll_ev->aio.ctx, + 0, LIBAIO_MAX_RECV, + epoll_ev->aio.events, &libaio_ts); + + switch (nevents) { + case -EINTR: + if (epoll_ev->ev->signal_events) { + tevent_common_check_signal(epoll_ev->ev); + } + return; + case 0: + return; + }; + + if (nevents < 0) { + errno = -nevents; + epoll_panic(epoll_ev, "io_getevents() failed", false); + } + + for (i = 0; i < nevents; i++) { + process_io_event(epoll_ev, &epoll_ev->aio.events[i]); + } +} + +static void init_libaio_data(epoll_ev_ctx_t *epoll_ev) +{ + long error; + + if (epoll_ev->aio.io_event_fd != -1) { + return; + } + + epoll_ev->aio.io_event_fd = eventfd(0, 0); + if (epoll_ev->aio.io_event_fd == -1) { + abort(); + } + + error = io_queue_init(LIBAIO_MAX_EV, &epoll_ev->aio.ctx); + if (error) { + errno = -error; + abort(); + } + + epoll_ev->aio.iocb_pool = talloc_pool(epoll_ev, LIBAIO_MAX_EV * sizeof(struct iocb)); + if (epoll_ev == NULL) { + abort(); + } +} + /* free the epoll fd */ @@ -186,6 +297,7 @@ static int epoll_ctx_destructor(struct epoll_event_context *epoll_ev) { close(epoll_ev->epoll_fd); epoll_ev->epoll_fd = -1; + free_libaio_data(&epoll_ev->aio); return 0; } @@ -205,6 +317,9 @@ static int epoll_init_ctx(struct epoll_event_context *epoll_ev) epoll_ev->pid = tevent_cached_getpid(); talloc_set_destructor(epoll_ev, epoll_ctx_destructor); + // lazy-initialize our AIO event fd + epoll_ev->aio.io_event_fd = -1; + return 0; } @@ -946,6 +1061,124 @@ static int epoll_event_loop_once(struct tevent_context *ev, const char *location return epoll_event_loop(epoll_ev, &tval); } +static void tevent_aio_cancel(struct tevent_aiocb *taiocb) +{ + int error; + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); + struct iocb *iocbp = taiocb->iocbp; + struct io_event event; + + tevent_debug( + taiocb->ev, TEVENT_DEBUG_WARNING, + "tevent_aio_cancel(): " + "taio: %p, iocbp: %p\n", + taiocb, iocbp + ); + + if (iocbp == NULL) { + abort(); + } + + error = io_cancel(epoll_ev->aio.ctx, iocbp, &event); + switch (error) { + case EFAULT: + // EFAULT If any of the data structures pointed to are invalid. + tevent_debug( + taiocb->ev, TEVENT_DEBUG_WARNING, + "tevent_aio_cancel(): " + "io_cancel() failed with EFAULT\n" + ); + abort(); + case EINVAL: + // EINVAL If aio_context specified by ctx is invalid. + tevent_debug( + taiocb->ev, TEVENT_DEBUG_WARNING, + "tevent_aio_cancel(): " + "io_cancel() failed with EINVAL\n" + ); + abort(); + case EAGAIN: + tevent_debug( + taiocb->ev, TEVENT_DEBUG_WARNING, + "tevent_aio_cancel(): " + "io_cancel() failed with EAGAIN\n" + ); + abort(); + case EINPROGRESS: + // Cancellation is in progress. Nothing for us to do. + break; + default: + abort(); + }; + + // release memory back to our iocb pool + TALLOC_FREE(taiocb->iocbp); +} + +static bool aio_req_cancel(struct tevent_req *req) +{ + struct tevent_aiocb *taiocb = tevent_req_data(req, struct tevent_aiocb); + tevent_aio_cancel(taiocb); + return true; +} + +static int aio_destructor(struct tevent_aiocb *taio) +{ + if (taio->iocbp != NULL) { + tevent_aio_cancel(taio); + } + return 0; +} + +struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) +{ + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); + struct iocb *iocbp = NULL; + + tevent_req_set_cancel_fn(taiocb->req, aio_req_cancel); + iocbp = talloc_zero(epoll_ev->aio.iocb_pool, struct iocb); + if (iocbp == NULL) { + abort(); + } + + talloc_set_destructor(taiocb, aio_destructor); + return iocbp; +} + +static int _tevent_add_aio_op(struct tevent_aiocb *taiocb, + enum io_iocb_cmd opcode, + const char *location) +{ + int err; + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); + + taiocb->location = location; + if (opcode != taiocb->iocbp->aio_lio_opcode) { + abort(); + } + + taiocb->iocbp-data = taiocb; + io_set_eventfd(taiocb->iocbp, epoll_ev->aio.io_event_fd); + + err = io_submit(epoll_ev->aio.ctx, 1, &taiocb->iocbp); + if (err) { + TALLOC_FREE(taiocb->iocbp); + errno = -err; + err = -1; + } + + return err; +} + +#define _tevent_add_aio_read(taiocb, __location__)\ + (int)_tevent_add_aio_op(taiocb, IO_CMD_PREAD, __location__) + +#define _tevent_add_aio_write(taiocb, __location__)\ + (int)_tevent_add_aio_op(taiocb, IO_CMD_PWRITE, __location__) + +#define _tevent_add_aio_fsync(taiocb, __location__)\ + (int)_tevent_add_aio_op(taiocb, IO_CMD_FSYNC, __location__) + static const struct tevent_ops epoll_event_ops = { .context_init = epoll_event_context_init, .add_fd = epoll_event_add_fd, diff --git a/lib/tevent/tevent_kqueue.h b/lib/tevent/tevent_kqueue.h index 54cc9245251..a5f8e0d34f1 100644 --- a/lib/tevent/tevent_kqueue.h +++ b/lib/tevent/tevent_kqueue.h @@ -24,11 +24,13 @@ */ #include +typedef struct aiocb iocb_t; + struct tevent_aiocb { const char *location; struct tevent_req *req; struct tevent_context *ev; - struct aiocb *iocbp; + iocb_t *iocbp; int saved_errno; int rv; }; @@ -46,3 +48,32 @@ int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location); (int)_tevent_add_aio_fsync(taiocb, __location__) struct aiocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb); + +static inline void tio_prep_pread(iocb_t *iocbp, + int fd, + void *buf, + size_t count, + long long offset) +{ + iocbp->aio_fildes = fd; + iocbp->aio_offset = offset; + iocbp->aio_buf = buf; + iocbp->aio_nbytes = count; +} + +static inline void tio_prep_pwrite(iocb_t *iocbp, + int fd, + void *buf, + size_t count, + long long offset) +{ + iocbp->aio_fildes = fd; + iocbp->aio_offset = offset; + iocbp->aio_buf = buf; + iocbp->aio_nbytes = count; +} + +static inline void tio_prep_fsync(iocb_t *iocbp, int fd) +{ + iocbp->aio_fildes = fd; +} diff --git a/lib/tevent/tevent_libaio.h b/lib/tevent/tevent_libaio.h new file mode 100644 index 00000000000..7bad3f761c3 --- /dev/null +++ b/lib/tevent/tevent_libaio.h @@ -0,0 +1,54 @@ +/* + Unix SMB/CIFS implementation. + + main select loop and event handling - kqueue implementation + + Copyright (C) iXsystems 2023 + + ** NOTE! The following LGPL license applies to the tevent + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see . +*/ +#include "libaio.h" + +typedef struct iocb iocb_t; + +struct tevent_aiocb { + const char *location; + struct tevent_req *req; + struct tevent_context *ev; + iocb_t *iocbp; + int saved_errno; + int rv; +}; + +int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location); +#define tevent_add_aio_read(taiocb)\ + (int)_tevent_add_aio_read(taiocb, __location__) + +int _tevent_add_aio_write(struct tevent_aiocb *taiocb, const char *location); +#define tevent_add_aio_write(taiocb)\ + (int)_tevent_add_aio_write(taiocb, __location__) + +int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location); +#define tevent_add_aio_fsync(taiocb)\ + (int)_tevent_add_aio_fsync(taiocb, __location__) + +struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb); + +#define tio_prep_pread io_prep_pread +#define tio_prep_pwrite io_prep_pwrite +#define tio_prep_fsync io_prep_fsync diff --git a/lib/tevent/wscript b/lib/tevent/wscript index b05f3fb7912..adb52e40dfc 100644 --- a/lib/tevent/wscript +++ b/lib/tevent/wscript @@ -125,6 +125,7 @@ def build(bld): vnum=VERSION, public_headers=('' if private_library else 'tevent.h'), public_headers_install=not private_library, + ldflags='-laio', pc_files='tevent.pc', private_library=private_library) diff --git a/source3/modules/vfs_aio_fbsd.c b/source3/modules/vfs_aio_fbsd.c index 2eec9dda47b..0c895d70f3c 100644 --- a/source3/modules/vfs_aio_fbsd.c +++ b/source3/modules/vfs_aio_fbsd.c @@ -18,9 +18,16 @@ #include "includes.h" #include "lib/util/tevent_unix.h" + +#ifdef HAVE_EPOLL +#include "lib/tevent/tevent_libaio.h" +#include "libaio.h" +#else #include "lib/tevent/tevent_kqueue.h" -#include "smbd/smbd.h" #include +#endif /*HAVE_EPOLL*/ + +#include "smbd/smbd.h" static struct tevent_req *vfs_aio_fbsd_pread_send(struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx, @@ -32,7 +39,7 @@ static struct tevent_req *vfs_aio_fbsd_pread_send(struct vfs_handle_struct *hand int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(mem_ctx, &taiocbp, struct tevent_aiocb); if (req == NULL) { @@ -42,10 +49,11 @@ static struct tevent_req *vfs_aio_fbsd_pread_send(struct vfs_handle_struct *hand taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fsp_get_io_fd(fsp); - iocbp->aio_offset = offset; - iocbp->aio_buf = data; - iocbp->aio_nbytes = n; + tio_prep_pread(iocbp, + fsp_get_io_fd(fsp), + data, + n, + offset); ret = tevent_add_aio_read(taiocbp); if (ret != 0) { @@ -82,7 +90,7 @@ static struct tevent_req *vfs_aio_fbsd_pwrite_send(struct vfs_handle_struct *han int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(mem_ctx, &taiocbp, struct tevent_aiocb); if (req == NULL) { @@ -92,10 +100,11 @@ static struct tevent_req *vfs_aio_fbsd_pwrite_send(struct vfs_handle_struct *han taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fsp_get_io_fd(fsp); - iocbp->aio_offset = offset; - iocbp->aio_buf = discard_const(data); - iocbp->aio_nbytes = n; + tio_prep_pwrite(iocbp, + fsp_get_io_fd(fsp), + discard_const(data), + n, + offset); ret = tevent_add_aio_write(taiocbp); if (ret != 0) { @@ -122,7 +131,7 @@ static struct tevent_req *vfs_aio_fbsd_fsync_send(struct vfs_handle_struct *hand int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(mem_ctx, &taiocbp, struct tevent_aiocb); if (req == NULL) { @@ -132,7 +141,8 @@ static struct tevent_req *vfs_aio_fbsd_fsync_send(struct vfs_handle_struct *hand taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - iocbp->aio_fildes = fsp_get_io_fd(fsp); + + tio_prep_fsync(iocbp, fsp_get_io_fd(fsp)); ret = tevent_add_aio_fsync(taiocbp); if (ret != 0) { From 43c0bc002a0e771305753b284a733b30d982a43b Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Tue, 24 Oct 2023 13:25:47 -0700 Subject: [PATCH 2/5] Fixes --- lib/tevent/testsuite.c | 10 +++----- lib/tevent/tevent_epoll.c | 51 +++++++++++++++++++------------------- lib/tevent/tevent_libaio.h | 25 ++++++++++++++++--- 3 files changed, 51 insertions(+), 35 deletions(-) diff --git a/lib/tevent/testsuite.c b/lib/tevent/testsuite.c index 5e1ef318062..667b98ac68e 100644 --- a/lib/tevent/testsuite.c +++ b/lib/tevent/testsuite.c @@ -1980,7 +1980,7 @@ static bool aio_pread_send(struct torture_context *test, int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(test, &taiocbp, struct tevent_aiocb); torture_assert(test, req != NULL, "tevent_req_create()"); @@ -2006,7 +2006,7 @@ static bool aio_pwrite_send(struct torture_context *test, int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(test, &taiocbp, struct tevent_aiocb); torture_assert(test, req != NULL, "tevent_req_create()"); @@ -2014,7 +2014,7 @@ static bool aio_pwrite_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); - tio_prep_write(iocbp, fd, data, n, offset); + tio_prep_pwrite(iocbp, fd, data, n, offset); ret = tevent_add_aio_write(taiocbp); torture_assert(test, ret != -1, "aio_write_send()"); @@ -2030,7 +2030,7 @@ static bool aio_fsync_send(struct torture_context *test, int ret; struct tevent_req *req = NULL; struct tevent_aiocb *taiocbp = NULL; - struct aiocb *iocbp = NULL; + iocb_t *iocbp = NULL; req = tevent_req_create(test, &taiocbp, struct tevent_aiocb); torture_assert(test, req != NULL, "tevent_req_create()"); @@ -2272,7 +2272,6 @@ struct torture_suite *torture_local_event_aio(TALLOC_CTX *mem_ctx) { struct torture_suite *suite = torture_suite_create(mem_ctx, "event_aio"); -#ifdef HAVE_KQUEUE torture_suite_add_simple_tcase_const(suite, "basic_kqueue_aio", test_event_kqueue_aio, @@ -2292,6 +2291,5 @@ struct torture_suite *torture_local_event_aio(TALLOC_CTX *mem_ctx) "aio_write_cancel_destructor", test_event_kqueue_aio_pwrite_cancel, NULL); -#endif /* HAVE_KQUEUE */ return suite; } diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c index 30ec8301700..fd364293566 100644 --- a/lib/tevent/tevent_epoll.c +++ b/lib/tevent/tevent_epoll.c @@ -137,10 +137,7 @@ _PRIVATE_ void tevent_epoll_set_panic_fallback(struct tevent_context *ev, bool (*panic_fallback)(struct tevent_context *ev, bool replay)) { - struct epoll_event_context *epoll_ev = - talloc_get_type_abort(ev->additional_data, - struct epoll_event_context); - + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(ev); epoll_ev->panic_fallback = panic_fallback; } @@ -855,7 +852,7 @@ static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval static int epoll_event_context_init(struct tevent_context *ev) { int ret; - struct epoll_event_context *epoll_ev; + epoll_ev_ctx_t *epoll_ev; /* * We might be called during tevent_re_initialise() @@ -863,7 +860,7 @@ static int epoll_event_context_init(struct tevent_context *ev) */ TALLOC_FREE(ev->additional_data); - epoll_ev = talloc_zero(ev, struct epoll_event_context); + epoll_ev = talloc_zero(ev, epoll_ev_ctx_t); if (!epoll_ev) return -1; epoll_ev->ev = ev; epoll_ev->epoll_fd = -1; @@ -893,8 +890,7 @@ static int epoll_event_fd_destructor(struct tevent_fd *fde) return tevent_common_fd_destructor(fde); } - epoll_ev = talloc_get_type_abort(ev->additional_data, - struct epoll_event_context); + epoll_ev = EVTOEPOLL(ev); /* * we must remove the event from the list @@ -953,9 +949,7 @@ static struct tevent_fd *epoll_event_add_fd(struct tevent_context *ev, TALLOC_CT const char *handler_name, const char *location) { - struct epoll_event_context *epoll_ev = - talloc_get_type_abort(ev->additional_data, - struct epoll_event_context); + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(ev); struct tevent_fd *fde; bool panic_triggered = false; pid_t old_pid = epoll_ev->pid; @@ -996,8 +990,7 @@ static void epoll_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags) if (fde->flags == flags) return; ev = fde->event_ctx; - epoll_ev = talloc_get_type_abort(ev->additional_data, - struct epoll_event_context); + epoll_ev = EVTOEPOLL(ev); old_pid = epoll_ev->pid; fde->flags = flags; @@ -1021,9 +1014,7 @@ static void epoll_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags) */ static int epoll_event_loop_once(struct tevent_context *ev, const char *location) { - struct epoll_event_context *epoll_ev = - talloc_get_type_abort(ev->additional_data, - struct epoll_event_context); + epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(ev); struct timeval tval; bool panic_triggered = false; @@ -1145,19 +1136,21 @@ struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) return iocbp; } -static int _tevent_add_aio_op(struct tevent_aiocb *taiocb, - enum io_iocb_cmd opcode, - const char *location) +static int tevent_add_aio_op(struct tevent_aiocb *taiocb, + enum io_iocb_cmd opcode, + const char *location) { int err; epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, + "entering!\n"); taiocb->location = location; if (opcode != taiocb->iocbp->aio_lio_opcode) { abort(); } - taiocb->iocbp-data = taiocb; + taiocb->iocbp->data = taiocb; io_set_eventfd(taiocb->iocbp, epoll_ev->aio.io_event_fd); err = io_submit(epoll_ev->aio.ctx, 1, &taiocb->iocbp); @@ -1170,14 +1163,20 @@ static int _tevent_add_aio_op(struct tevent_aiocb *taiocb, return err; } -#define _tevent_add_aio_read(taiocb, __location__)\ - (int)_tevent_add_aio_op(taiocb, IO_CMD_PREAD, __location__) +int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location) +{ + return tevent_add_aio_op(taiocb, IO_CMD_PREAD, location); +} -#define _tevent_add_aio_write(taiocb, __location__)\ - (int)_tevent_add_aio_op(taiocb, IO_CMD_PWRITE, __location__) +int _tevent_add_aio_write(struct tevent_aiocb *taiocb, const char *location) +{ + return tevent_add_aio_op(taiocb, IO_CMD_PWRITE, location); +} -#define _tevent_add_aio_fsync(taiocb, __location__)\ - (int)_tevent_add_aio_op(taiocb, IO_CMD_FSYNC, __location__) +int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location) +{ + return tevent_add_aio_op(taiocb, IO_CMD_FSYNC, location); +} static const struct tevent_ops epoll_event_ops = { .context_init = epoll_event_context_init, diff --git a/lib/tevent/tevent_libaio.h b/lib/tevent/tevent_libaio.h index 7bad3f761c3..6eb0b54a6e0 100644 --- a/lib/tevent/tevent_libaio.h +++ b/lib/tevent/tevent_libaio.h @@ -49,6 +49,25 @@ int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location); struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb); -#define tio_prep_pread io_prep_pread -#define tio_prep_pwrite io_prep_pwrite -#define tio_prep_fsync io_prep_fsync +static inline void tio_prep_pread(iocb_t *iocbp, + int fd, + void *buf, + size_t count, + long long offset) +{ + return io_prep_pread(iocbp, fd, buf, count, offset); +} + +static inline void tio_prep_pwrite(iocb_t *iocbp, + int fd, + void *buf, + size_t count, + long long offset) +{ + return io_prep_pwrite(iocbp, fd, buf, count, offset); +} + +static inline void tio_prep_fsync(iocb_t *iocbp, int fd) +{ + return io_prep_fsync(iocbp, fd); +} From 531aebffd026f559f87a31663ecf88c0b933a216 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Wed, 25 Oct 2023 12:17:06 -0700 Subject: [PATCH 3/5] Fixes --- lib/tevent/testsuite.c | 6 ++ lib/tevent/tevent_epoll.c | 157 ++++++++++++++++++++++++++++++------- lib/tevent/tevent_libaio.h | 3 + 3 files changed, 139 insertions(+), 27 deletions(-) diff --git a/lib/tevent/testsuite.c b/lib/tevent/testsuite.c index 667b98ac68e..303e54fd250 100644 --- a/lib/tevent/testsuite.c +++ b/lib/tevent/testsuite.c @@ -1988,6 +1988,8 @@ static bool aio_pread_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); + torture_assert(test, iocbp != NULL, "tevent_ctx_get_iocb()"); + tio_prep_pread(iocbp, fd, data, n, offset); ret = tevent_add_aio_read(taiocbp); @@ -2014,6 +2016,8 @@ static bool aio_pwrite_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); + torture_assert(test, iocbp != NULL, "tevent_ctx_get_iocb()"); + tio_prep_pwrite(iocbp, fd, data, n, offset); ret = tevent_add_aio_write(taiocbp); @@ -2038,6 +2042,8 @@ static bool aio_fsync_send(struct torture_context *test, taiocbp->req = req; iocbp = tevent_ctx_get_iocb(taiocbp); + torture_assert(test, iocbp != NULL, "tevent_ctx_get_iocb()"); + tio_prep_fsync(iocbp, fd); ret = tevent_add_aio_fsync(taiocbp); diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c index fd364293566..8da89b66c41 100644 --- a/lib/tevent/tevent_epoll.c +++ b/lib/tevent/tevent_epoll.c @@ -43,6 +43,7 @@ struct libaio_data { TALLOC_CTX *iocb_pool; int io_event_fd; struct io_event events[LIBAIO_MAX_RECV]; + struct tevent_fd *fde; }; struct timespec libaio_ts; @@ -198,6 +199,9 @@ static void free_libaio_data(struct libaio_data *data) io_queue_release(data->ctx); close(data->io_event_fd); data->io_event_fd = -1; + if (data->fde) { + talloc_set_destructor(data->fde, NULL); + } TALLOC_FREE(data->iocb_pool); } @@ -205,8 +209,31 @@ static void process_io_event(epoll_ev_ctx_t *epoll_ev, struct io_event *event) { struct tevent_aiocb *tiocbp = NULL; + + if (event->obj->data == NULL) { + TALLOC_FREE(event->obj); + return; + } + tiocbp = talloc_get_type_abort(event->data, struct tevent_aiocb); + switch (tiocbp->state) { + case TAIO_RUNNING: + // Generally, the AIO task should be running. + break; + case TAIO_CANCELLED: + // Event was cancelled. Remove destructor + // and free everything. + talloc_set_destructor(tiocbp, NULL); + TALLOC_FREE(tiocbp->iocbp); + return; + case TAIO_INIT: + case TAIO_COMPLETE: + abort(); + default: + abort(); + }; + if (tiocbp->iocbp == NULL) { tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, "iocb request is already completed.\n"); @@ -221,11 +248,15 @@ static void process_io_event(epoll_ev_ctx_t *epoll_ev, tiocbp->saved_errno = error; tiocbp->rv = -1; TALLOC_FREE(tiocbp->iocbp); + tiocbp->state = TAIO_COMPLETE; tevent_req_error(tiocbp->req, error); return; } tiocbp->rv = event->res; + tiocbp->state = TAIO_COMPLETE; + TALLOC_FREE(tiocbp->iocbp); + tevent_req_done(tiocbp->req); return; } @@ -247,8 +278,10 @@ static void libaio_eventfd_handler(struct tevent_context *ev, if (epoll_ev->ev->signal_events) { tevent_common_check_signal(epoll_ev->ev); } + abort(); return; case 0: + abort(); return; }; @@ -262,15 +295,56 @@ static void libaio_eventfd_handler(struct tevent_context *ev, } } -static void init_libaio_data(epoll_ev_ctx_t *epoll_ev) +static int io_event_abort(struct tevent_fd *fde) +{ + tevent_debug(fde->event_ctx, TEVENT_DEBUG_FATAL, + "tevent_fd for io_event_fd unexpectedly freed.\n"); + abort(); +} + +static bool add_io_event_fd_to_epoll(epoll_ev_ctx_t *epoll_ev) +{ + int ret; + struct epoll_event event; + struct tevent_fd *fde = NULL; + + fde = tevent_common_add_fd(epoll_ev->ev, epoll_ev, + epoll_ev->aio.io_event_fd, + TEVENT_FD_READ | TEVENT_FD_WRITE, + libaio_eventfd_handler, + NULL, + "io_events_available", + __location__); + if (!fde) abort(); + + event = (struct epoll_event) { + .events = EPOLLIN|EPOLLET, + .data.ptr = fde, + }; + + ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event); + if (ret != 0) { + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR, + "EPOLL_CTL_ADD failure for io_event_fd: %s\n", + strerror(errno)); + abort(); + } + + talloc_set_destructor(fde, io_event_abort); + epoll_ev->aio.fde = fde; + + return true; +} + +static bool init_libaio_data(epoll_ev_ctx_t *epoll_ev) { long error; if (epoll_ev->aio.io_event_fd != -1) { - return; + return true; } - epoll_ev->aio.io_event_fd = eventfd(0, 0); + epoll_ev->aio.io_event_fd = eventfd(0, EFD_NONBLOCK); if (epoll_ev->aio.io_event_fd == -1) { abort(); } @@ -285,6 +359,13 @@ static void init_libaio_data(epoll_ev_ctx_t *epoll_ev) if (epoll_ev == NULL) { abort(); } + + if (!add_io_event_fd_to_epoll(epoll_ev)) { + free_libaio_data(&epoll_ev->aio); + return false; + } + + return true; } /* @@ -784,6 +865,11 @@ static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval epoll_panic(epoll_ev, "epoll_wait() gave bad data", true); return -1; } + + if (fde == epoll_ev->aio.fde) { + return tevent_common_invoke_fd_handler(fde, TEVENT_FD_READ, NULL); + } + if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) { /* * Save off the multiplexed event in case we need @@ -1072,7 +1158,7 @@ static void tevent_aio_cancel(struct tevent_aiocb *taiocb) error = io_cancel(epoll_ev->aio.ctx, iocbp, &event); switch (error) { - case EFAULT: + case -EFAULT: // EFAULT If any of the data structures pointed to are invalid. tevent_debug( taiocb->ev, TEVENT_DEBUG_WARNING, @@ -1080,30 +1166,24 @@ static void tevent_aio_cancel(struct tevent_aiocb *taiocb) "io_cancel() failed with EFAULT\n" ); abort(); - case EINVAL: - // EINVAL If aio_context specified by ctx is invalid. - tevent_debug( - taiocb->ev, TEVENT_DEBUG_WARNING, - "tevent_aio_cancel(): " - "io_cancel() failed with EINVAL\n" - ); - abort(); - case EAGAIN: + case -EINVAL: + // Potentially already complete + // Due to design limitation of AIO implementation in kernel + // we can't differentiate between invalid io_ctx and this + case -EINPROGRESS: + // Cancellation is in progress. Nothing for us to do. + break; + case -EAGAIN: tevent_debug( taiocb->ev, TEVENT_DEBUG_WARNING, "tevent_aio_cancel(): " "io_cancel() failed with EAGAIN\n" ); abort(); - case EINPROGRESS: - // Cancellation is in progress. Nothing for us to do. - break; default: abort(); }; - - // release memory back to our iocb pool - TALLOC_FREE(taiocb->iocbp); + taiocb->state = TAIO_CANCELLED; } static bool aio_req_cancel(struct tevent_req *req) @@ -1115,10 +1195,22 @@ static bool aio_req_cancel(struct tevent_req *req) static int aio_destructor(struct tevent_aiocb *taio) { - if (taio->iocbp != NULL) { + switch (taio->state) { + case TAIO_INIT: + case TAIO_CANCELLED: + case TAIO_COMPLETE: + TALLOC_FREE(taio->iocbp); + return 0; + case TAIO_RUNNING: + // switch state to CANCELLED and + // hopefully pick up once we getevents tevent_aio_cancel(taio); - } - return 0; + taio->iocbp->data = NULL; + return 0; + default: + // unknown value + abort(); + }; } struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) @@ -1133,6 +1225,8 @@ struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) } talloc_set_destructor(taiocb, aio_destructor); + taiocb->iocbp = iocbp; + return iocbp; } @@ -1143,8 +1237,12 @@ static int tevent_add_aio_op(struct tevent_aiocb *taiocb, int err; epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); - tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, - "entering!\n"); + if (!init_libaio_data(epoll_ev)) { + abort(); + errno = EAGAIN; + return -1; + } + taiocb->location = location; if (opcode != taiocb->iocbp->aio_lio_opcode) { abort(); @@ -1154,22 +1252,27 @@ static int tevent_add_aio_op(struct tevent_aiocb *taiocb, io_set_eventfd(taiocb->iocbp, epoll_ev->aio.io_event_fd); err = io_submit(epoll_ev->aio.ctx, 1, &taiocb->iocbp); - if (err) { + if (err < 0) { TALLOC_FREE(taiocb->iocbp); errno = -err; - err = -1; + abort(); + return -1; } - return err; + taiocb->state = TAIO_RUNNING; + + return 0; } int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location) { + taiocb->iocbp->aio_rw_flags |= RWF_NOWAIT; return tevent_add_aio_op(taiocb, IO_CMD_PREAD, location); } int _tevent_add_aio_write(struct tevent_aiocb *taiocb, const char *location) { + taiocb->iocbp->aio_rw_flags |= RWF_NOWAIT; return tevent_add_aio_op(taiocb, IO_CMD_PWRITE, location); } diff --git a/lib/tevent/tevent_libaio.h b/lib/tevent/tevent_libaio.h index 6eb0b54a6e0..6219aedd8aa 100644 --- a/lib/tevent/tevent_libaio.h +++ b/lib/tevent/tevent_libaio.h @@ -26,6 +26,8 @@ typedef struct iocb iocb_t; +enum taiocb_state { TAIO_INIT, TAIO_RUNNING, TAIO_COMPLETE, TAIO_CANCELLED }; + struct tevent_aiocb { const char *location; struct tevent_req *req; @@ -33,6 +35,7 @@ struct tevent_aiocb { iocb_t *iocbp; int saved_errno; int rv; + enum taiocb_state state; }; int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location); From aa4fec647aab0abd6cd93cc61adfa5d97b4ff69a Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Thu, 26 Oct 2023 03:45:33 -0700 Subject: [PATCH 4/5] Initial io_uring implementation --- lib/tevent/tevent_epoll.c | 205 +++++++++++++++---------------------- lib/tevent/tevent_libaio.h | 12 +-- lib/tevent/wscript | 2 +- 3 files changed, 89 insertions(+), 130 deletions(-) diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c index 8da89b66c41..e78aeec1129 100644 --- a/lib/tevent/tevent_epoll.c +++ b/lib/tevent/tevent_epoll.c @@ -26,23 +26,25 @@ */ #include +#include #include "replace.h" #include "system/filesys.h" #include "system/select.h" #include "tevent.h" #include "tevent_internal.h" #include "tevent_util.h" +#if 0 #include "libaio.h" +#endif #include "tevent_libaio.h" #define LIBAIO_MAX_EV 256 #define LIBAIO_MAX_RECV 16 struct libaio_data { - io_context_t ctx; - TALLOC_CTX *iocb_pool; + //io_context_t ctx; + struct io_uring ring; int io_event_fd; - struct io_event events[LIBAIO_MAX_RECV]; struct tevent_fd *fde; }; @@ -196,26 +198,24 @@ static uint32_t epoll_map_flags(uint16_t flags) static void free_libaio_data(struct libaio_data *data) { - io_queue_release(data->ctx); - close(data->io_event_fd); - data->io_event_fd = -1; + if (data->io_event_fd != -1) { + io_uring_queue_exit(&data->ring); + close(data->io_event_fd); + data->io_event_fd = -1; + } + if (data->fde) { talloc_set_destructor(data->fde, NULL); } - TALLOC_FREE(data->iocb_pool); } -static void process_io_event(epoll_ev_ctx_t *epoll_ev, - struct io_event *event) +static void process_io_cqe(epoll_ev_ctx_t *epoll_ev, + const struct io_uring_cqe *cqe) { struct tevent_aiocb *tiocbp = NULL; - if (event->obj->data == NULL) { - TALLOC_FREE(event->obj); - return; - } - - tiocbp = talloc_get_type_abort(event->data, struct tevent_aiocb); + tiocbp = talloc_get_type_abort(io_uring_cqe_get_data(cqe), + struct tevent_aiocb); switch (tiocbp->state) { case TAIO_RUNNING: @@ -234,66 +234,49 @@ static void process_io_event(epoll_ev_ctx_t *epoll_ev, abort(); }; - if (tiocbp->iocbp == NULL) { - tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, - "iocb request is already completed.\n"); + if (cqe->flags) { abort(); } - if (event->res < 0) { - int error = -event->res; + + if (cqe->res < 0) { + int error = -cqe->res; tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING, "%s: processing AIO [%p] - failed: %s\n", tiocbp->location, tiocbp->iocbp, strerror(error)); tiocbp->saved_errno = error; tiocbp->rv = -1; - TALLOC_FREE(tiocbp->iocbp); tiocbp->state = TAIO_COMPLETE; tevent_req_error(tiocbp->req, error); return; } - tiocbp->rv = event->res; + tiocbp->rv = cqe->res; tiocbp->state = TAIO_COMPLETE; - TALLOC_FREE(tiocbp->iocbp); tevent_req_done(tiocbp->req); return; } -static void libaio_eventfd_handler(struct tevent_context *ev, - struct tevent_fd *fde, - uint16_t flags, - void *private_data) +static int process_uring_event(epoll_ev_ctx_t *epoll_ev) { - int nevents, i; - epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(ev); - - /* io_getevents either returns event count or -errno */ - nevents = io_getevents(epoll_ev->aio.ctx, - 0, LIBAIO_MAX_RECV, - epoll_ev->aio.events, &libaio_ts); + int error, cnt = 0; + unsigned head; + eventfd_t efd; + struct io_uring_cqe *cqe = NULL; - switch (nevents) { - case -EINTR: - if (epoll_ev->ev->signal_events) { - tevent_common_check_signal(epoll_ev->ev); - } - abort(); - return; - case 0: - abort(); - return; - }; - - if (nevents < 0) { - errno = -nevents; - epoll_panic(epoll_ev, "io_getevents() failed", false); + error = eventfd_read(epoll_ev->aio.io_event_fd, &efd); + if (error == -1) { + return -1; } - for (i = 0; i < nevents; i++) { - process_io_event(epoll_ev, &epoll_ev->aio.events[i]); + io_uring_for_each_cqe(&epoll_ev->aio.ring, head, cqe) { + process_io_cqe(epoll_ev, cqe); + cnt++; } -} + + io_uring_cq_advance(&epoll_ev->aio.ring, cnt); + return 0; +} static int io_event_abort(struct tevent_fd *fde) { @@ -311,7 +294,7 @@ static bool add_io_event_fd_to_epoll(epoll_ev_ctx_t *epoll_ev) fde = tevent_common_add_fd(epoll_ev->ev, epoll_ev, epoll_ev->aio.io_event_fd, TEVENT_FD_READ | TEVENT_FD_WRITE, - libaio_eventfd_handler, + NULL, NULL, "io_events_available", __location__); @@ -338,7 +321,7 @@ static bool add_io_event_fd_to_epoll(epoll_ev_ctx_t *epoll_ev) static bool init_libaio_data(epoll_ev_ctx_t *epoll_ev) { - long error; + int error; if (epoll_ev->aio.io_event_fd != -1) { return true; @@ -349,15 +332,19 @@ static bool init_libaio_data(epoll_ev_ctx_t *epoll_ev) abort(); } - error = io_queue_init(LIBAIO_MAX_EV, &epoll_ev->aio.ctx); + error = io_uring_queue_init(LIBAIO_MAX_EV, + &epoll_ev->aio.ring, + IORING_SETUP_SINGLE_ISSUER); if (error) { errno = -error; abort(); } - epoll_ev->aio.iocb_pool = talloc_pool(epoll_ev, LIBAIO_MAX_EV * sizeof(struct iocb)); - if (epoll_ev == NULL) { - abort(); + error = io_uring_register_eventfd(&epoll_ev->aio.ring, + epoll_ev->aio.io_event_fd); + if (error) { + free_libaio_data(&epoll_ev->aio); + return false; } if (!add_io_event_fd_to_epoll(epoll_ev)) { @@ -867,7 +854,7 @@ static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval } if (fde == epoll_ev->aio.fde) { - return tevent_common_invoke_fd_handler(fde, TEVENT_FD_READ, NULL); + return process_uring_event(epoll_ev); } if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) { @@ -1140,50 +1127,33 @@ static int epoll_event_loop_once(struct tevent_context *ev, const char *location static void tevent_aio_cancel(struct tevent_aiocb *taiocb) { - int error; + int ret; + struct io_uring_sqe *sqe = NULL; + struct io_uring_cqe *cqe = NULL; epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); - struct iocb *iocbp = taiocb->iocbp; - struct io_event event; - - tevent_debug( - taiocb->ev, TEVENT_DEBUG_WARNING, - "tevent_aio_cancel(): " - "taio: %p, iocbp: %p\n", - taiocb, iocbp - ); - if (iocbp == NULL) { + sqe = io_uring_get_sqe(&epoll_ev->aio.ring); + if (sqe == NULL) { abort(); } - error = io_cancel(epoll_ev->aio.ctx, iocbp, &event); - switch (error) { - case -EFAULT: - // EFAULT If any of the data structures pointed to are invalid. - tevent_debug( - taiocb->ev, TEVENT_DEBUG_WARNING, - "tevent_aio_cancel(): " - "io_cancel() failed with EFAULT\n" - ); + io_uring_prep_cancel(sqe, taiocb, 0); + + ret = io_uring_submit(&epoll_ev->aio.ring); + if (ret < 0) { abort(); - case -EINVAL: - // Potentially already complete - // Due to design limitation of AIO implementation in kernel - // we can't differentiate between invalid io_ctx and this - case -EINPROGRESS: - // Cancellation is in progress. Nothing for us to do. - break; - case -EAGAIN: - tevent_debug( - taiocb->ev, TEVENT_DEBUG_WARNING, - "tevent_aio_cancel(): " - "io_cancel() failed with EAGAIN\n" - ); + } + + ret = io_uring_wait_cqe(&epoll_ev->aio.ring, &cqe); + if (ret < 0) { abort(); - default: + } + + if (io_uring_cqe_get_data(cqe)) { abort(); - }; - taiocb->state = TAIO_CANCELLED; + } + + io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); } static bool aio_req_cancel(struct tevent_req *req) @@ -1199,13 +1169,11 @@ static int aio_destructor(struct tevent_aiocb *taio) case TAIO_INIT: case TAIO_CANCELLED: case TAIO_COMPLETE: - TALLOC_FREE(taio->iocbp); return 0; case TAIO_RUNNING: // switch state to CANCELLED and // hopefully pick up once we getevents tevent_aio_cancel(taio); - taio->iocbp->data = NULL; return 0; default: // unknown value @@ -1213,25 +1181,27 @@ static int aio_destructor(struct tevent_aiocb *taio) }; } -struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) +iocb_t *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) { epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); - struct iocb *iocbp = NULL; + + if (!init_libaio_data(epoll_ev)) { + abort(); + errno = EAGAIN; + return -1; + } tevent_req_set_cancel_fn(taiocb->req, aio_req_cancel); - iocbp = talloc_zero(epoll_ev->aio.iocb_pool, struct iocb); - if (iocbp == NULL) { + taiocb->iocbp = io_uring_get_sqe(&epoll_ev->aio.ring); + if (taiocb->iocbp == NULL) { abort(); } talloc_set_destructor(taiocb, aio_destructor); - taiocb->iocbp = iocbp; - - return iocbp; + return taiocb->iocbp; } static int tevent_add_aio_op(struct tevent_aiocb *taiocb, - enum io_iocb_cmd opcode, const char *location) { int err; @@ -1244,41 +1214,30 @@ static int tevent_add_aio_op(struct tevent_aiocb *taiocb, } taiocb->location = location; - if (opcode != taiocb->iocbp->aio_lio_opcode) { - abort(); - } - - taiocb->iocbp->data = taiocb; - io_set_eventfd(taiocb->iocbp, epoll_ev->aio.io_event_fd); + io_uring_sqe_set_data(taiocb->iocbp, (void *)taiocb); - err = io_submit(epoll_ev->aio.ctx, 1, &taiocb->iocbp); - if (err < 0) { - TALLOC_FREE(taiocb->iocbp); - errno = -err; + err = io_uring_submit(&epoll_ev->aio.ring); + if (err < 0) { abort(); - return -1; - } + } taiocb->state = TAIO_RUNNING; - return 0; } int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location) { - taiocb->iocbp->aio_rw_flags |= RWF_NOWAIT; - return tevent_add_aio_op(taiocb, IO_CMD_PREAD, location); + return tevent_add_aio_op(taiocb, location); } int _tevent_add_aio_write(struct tevent_aiocb *taiocb, const char *location) { - taiocb->iocbp->aio_rw_flags |= RWF_NOWAIT; - return tevent_add_aio_op(taiocb, IO_CMD_PWRITE, location); + return tevent_add_aio_op(taiocb, location); } int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location) { - return tevent_add_aio_op(taiocb, IO_CMD_FSYNC, location); + return tevent_add_aio_op(taiocb, location); } static const struct tevent_ops epoll_event_ops = { diff --git a/lib/tevent/tevent_libaio.h b/lib/tevent/tevent_libaio.h index 6219aedd8aa..3ae9469f3bc 100644 --- a/lib/tevent/tevent_libaio.h +++ b/lib/tevent/tevent_libaio.h @@ -22,9 +22,9 @@ You should have received a copy of the GNU Lesser General Public License along with this library; if not, see . */ -#include "libaio.h" +#include -typedef struct iocb iocb_t; +typedef struct io_uring_sqe iocb_t; enum taiocb_state { TAIO_INIT, TAIO_RUNNING, TAIO_COMPLETE, TAIO_CANCELLED }; @@ -50,7 +50,7 @@ int _tevent_add_aio_fsync(struct tevent_aiocb *taiocb, const char *location); #define tevent_add_aio_fsync(taiocb)\ (int)_tevent_add_aio_fsync(taiocb, __location__) -struct iocb *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb); +iocb_t *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb); static inline void tio_prep_pread(iocb_t *iocbp, int fd, @@ -58,7 +58,7 @@ static inline void tio_prep_pread(iocb_t *iocbp, size_t count, long long offset) { - return io_prep_pread(iocbp, fd, buf, count, offset); + return io_uring_prep_read(iocbp, fd, buf, count, offset); } static inline void tio_prep_pwrite(iocb_t *iocbp, @@ -67,10 +67,10 @@ static inline void tio_prep_pwrite(iocb_t *iocbp, size_t count, long long offset) { - return io_prep_pwrite(iocbp, fd, buf, count, offset); + return io_uring_prep_write(iocbp, fd, buf, count, offset); } static inline void tio_prep_fsync(iocb_t *iocbp, int fd) { - return io_prep_fsync(iocbp, fd); + return io_uring_prep_fsync(iocbp, fd, 0); } diff --git a/lib/tevent/wscript b/lib/tevent/wscript index adb52e40dfc..7eb9fcf7e36 100644 --- a/lib/tevent/wscript +++ b/lib/tevent/wscript @@ -111,7 +111,7 @@ def build(bld): private_library = True if not bld.CONFIG_SET('USING_SYSTEM_TEVENT'): - tevent_deps = 'replace talloc' + tevent_deps = 'replace talloc uring' if bld.CONFIG_SET('HAVE_PTHREAD'): tevent_deps += ' pthread' From 4f2ef2b509ff235c1ab55e5a6d381b7f860ffd04 Mon Sep 17 00:00:00 2001 From: Andrew Walker Date: Fri, 27 Oct 2023 09:00:48 -0700 Subject: [PATCH 5/5] Various fixes --- lib/tevent/testsuite.c | 8 +- lib/tevent/tevent_epoll.c | 200 +++++++++++++++++++++++++++++++++----- 2 files changed, 182 insertions(+), 26 deletions(-) diff --git a/lib/tevent/testsuite.c b/lib/tevent/testsuite.c index 303e54fd250..aced68d795e 100644 --- a/lib/tevent/testsuite.c +++ b/lib/tevent/testsuite.c @@ -2120,18 +2120,21 @@ static bool test_event_kqueue_aio_fsync_cancel(struct torture_context *test, torture_assert(test, fd != -1, "open() failed"); for (i = 0; i < INFLIGHT_REQ; i++) { + torture_comment(test, "fsync_cancel %d\n", i); struct tevent_req *req = NULL; ok = aio_fsync_send(test, ev_ctx, fd, &req); torture_assert(test, ok, "aio_fsync_send() failed"); TALLOC_FREE(req); } + torture_comment(test, "fsync_cancel complete\n"); if (tevent_loop_once(ev_ctx) == -1) { - TALLOC_FREE(ev_ctx); torture_fail(test, talloc_asprintf(test, "Failed event loop %s\n", strerror(errno))); + TALLOC_FREE(ev_ctx); return false; } + torture_comment(test, "preparing to free event context\n"); TALLOC_FREE(ev_ctx); return true; } @@ -2155,18 +2158,21 @@ static bool test_event_kqueue_aio_pread_cancel(struct torture_context *test, torture_assert(test, fd != -1, "open() failed"); for (i = 0; i < INFLIGHT_REQ; i++) { + torture_comment(test, "pread_cancel %d\n", i); struct tevent_req *req = NULL; ok = aio_fsync_send(test, ev_ctx, fd, &req); torture_assert(test, ok, "aio_fsync_send() failed"); TALLOC_FREE(req); } + torture_comment(test, "pread_cancel complete\n"); if (tevent_loop_once(ev_ctx) == -1) { TALLOC_FREE(ev_ctx); torture_fail(test, talloc_asprintf(test, "Failed event loop %s\n", strerror(errno))); return false; } + torture_comment(test, "preparing to free event context\n"); TALLOC_FREE(ev_ctx); return true; } diff --git a/lib/tevent/tevent_epoll.c b/lib/tevent/tevent_epoll.c index e78aeec1129..6ec7ac2186c 100644 --- a/lib/tevent/tevent_epoll.c +++ b/lib/tevent/tevent_epoll.c @@ -213,19 +213,22 @@ static void process_io_cqe(epoll_ev_ctx_t *epoll_ev, const struct io_uring_cqe *cqe) { struct tevent_aiocb *tiocbp = NULL; + void *data = io_uring_cqe_get_data(cqe); + int rv = cqe->res; - tiocbp = talloc_get_type_abort(io_uring_cqe_get_data(cqe), - struct tevent_aiocb); + if (rv == -ECANCELED) { + // We have cancelled the SQE and freed + // memory for struct tevent_aiocb. Nothing to do. + return; + } + + tiocbp = talloc_get_type_abort(data, struct tevent_aiocb); switch (tiocbp->state) { case TAIO_RUNNING: // Generally, the AIO task should be running. break; case TAIO_CANCELLED: - // Event was cancelled. Remove destructor - // and free everything. - talloc_set_destructor(tiocbp, NULL); - TALLOC_FREE(tiocbp->iocbp); return; case TAIO_INIT: case TAIO_COMPLETE: @@ -276,7 +279,7 @@ static int process_uring_event(epoll_ev_ctx_t *epoll_ev) io_uring_cq_advance(&epoll_ev->aio.ring, cnt); return 0; -} +} static int io_event_abort(struct tevent_fd *fde) { @@ -855,7 +858,7 @@ static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval if (fde == epoll_ev->aio.fde) { return process_uring_event(epoll_ev); - } + } if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) { /* @@ -1125,58 +1128,204 @@ static int epoll_event_loop_once(struct tevent_context *ev, const char *location return epoll_event_loop(epoll_ev, &tval); } -static void tevent_aio_cancel(struct tevent_aiocb *taiocb) +static bool drain_cqe_for_event(epoll_ev_ctx_t *epoll_ev, + struct tevent_aiocb *taiocb, + bool handle_request) +{ + int cnt = 0; + bool found_event = false; + unsigned head; + struct io_uring_cqe *cqe = NULL; + void *data = NULL; + + io_uring_for_each_cqe(&epoll_ev->aio.ring, head, cqe) { + data = io_uring_cqe_get_data(cqe); + + if (handle_request || (data != taiocb)) { + process_io_cqe(epoll_ev, cqe); + } + cnt++; + if (data == taiocb) { + found_event = true; + break; + } + } + + io_uring_cq_advance(&epoll_ev->aio.ring, cnt); + return found_event; +} + +static bool tevent_aio_cancel(struct tevent_aiocb *taiocb, + bool handle_request) { int ret; + bool saw_result = false; struct io_uring_sqe *sqe = NULL; struct io_uring_cqe *cqe = NULL; epoll_ev_ctx_t *epoll_ev = EVTOEPOLL(taiocb->ev); + void *data = NULL; sqe = io_uring_get_sqe(&epoll_ev->aio.ring); if (sqe == NULL) { abort(); } - io_uring_prep_cancel(sqe, taiocb, 0); + /* + * Cancel requests are always processed synchronously + * We deliberately don't set user_data so that cancellation + * CQEs are readily apparent + */ + io_uring_prep_cancel(sqe, taiocb, IORING_ASYNC_CANCEL_ALL); ret = io_uring_submit(&epoll_ev->aio.ring); if (ret < 0) { abort(); } - ret = io_uring_wait_cqe(&epoll_ev->aio.ring, &cqe); +again: + ret = io_uring_wait_cqe(&epoll_ev->aio.ring, &cqe); if (ret < 0) { abort(); } - if (io_uring_cqe_get_data(cqe)) { - abort(); + data = io_uring_cqe_get_data(cqe); + if (data == taiocb) { + /* + * We have hit upon the request that we're trying to async + * cancel. This often means that it was already sitting on our + * completion queue prior to issuing the cancel request. + * + * Depending on state of request at time it was cancelled, the + * result of the operation may also be -ECANCELED. In this case + * we don't need to do anything special, continue looping cqes + * till we get our cancellation request. + */ + saw_result = true; + + ret = cqe->res; + if (ret == -ECANCELED) { + io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); + goto again; + } + + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING, + "%s: AIO request finished prior to or during " + "processing of request for its cancellation.\n", + taiocb->location); + + if (handle_request) { + process_io_cqe(epoll_ev, cqe); + } + + io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); + goto again; + } + + if (data != NULL) { + /* + * This is unfortunatey, but there's another request on the + * completion queue between us and our cancellation event. + * We're honor-bound to process it, and thus we do. + */ + process_io_cqe(epoll_ev, cqe); + io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); + goto again; + } + + /* + * By the time we get here the cqe is for the async cancellation + * request. If it has failed, then our choices are limited. This + * cancellation function is often called by the talloc destructor + * for the initial tevent request, which means our choices are either: + * + * 1) prevent the destructor from freeing memory of aio request + * 2) drain the completion queue until we hit the request being + * cancelled. + * + * The second option is being used here to avoid leaking memory and + * to avoid potential for talloc library assertion due to using memory + * via talloc_get_type_abort() in process_io_cqe() that should have + * been freed. + */ + ret = cqe->res; + if ((ret < 0) && !saw_result) { + errno = -ret; + + switch (ret) { + case -EALREADY: + /* + * The execution state of request progressed far enough that + * cancellation is no longer possible. + * see man 3 io_uring_prep_cancel + */ + case -ENOENT: + /* + * The request identifed by taiocb could not be located. + * This could mean that it was completed before the + * cancellation request was issued (sitting on queue waiting + * to be processed in this case). + * see man 3 io_uring_prep_cancel + */ + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING, + "%s: failed to cancel AIO request: %s", + taiocb->location, strerror(errno)); + + /* + * Regardless of how we got here, the original req + * should be waiting on the completion queue. + * The following function advances the queue as its + * read and so io_uring_cqe_seen() isn't needed + * at end of the function we're currently in. + */ + saw_result = drain_cqe_for_event(epoll_ev, taiocb, + handle_request); + default: + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, + "%s: failed to cancel AIO request: %s", + taiocb->location, strerror(errno)); + abort(); + }; + + if (!saw_result) { + /* + * We failed to safely handle this cancellation + * request. The completion queue event is unhandled + * and puts us at risk of a use-after-free. This is + * unusual enough of a situation that getting core + * is probably required. + */ + tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL, + "%s: failed to find event in completion " + "queue after cancellation attempt.\n", + taiocb->location, strerror(errno)); + abort(); + } + } else { + io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); } - io_uring_cqe_seen(&epoll_ev->aio.ring, cqe); + taiocb->state = TAIO_CANCELLED; + return ret > 0; } static bool aio_req_cancel(struct tevent_req *req) { struct tevent_aiocb *taiocb = tevent_req_data(req, struct tevent_aiocb); - tevent_aio_cancel(taiocb); - return true; + return tevent_aio_cancel(taiocb, true); } static int aio_destructor(struct tevent_aiocb *taio) { switch (taio->state) { + case TAIO_RUNNING: + tevent_aio_cancel(taio, false); case TAIO_INIT: case TAIO_CANCELLED: case TAIO_COMPLETE: return 0; case TAIO_RUNNING: - // switch state to CANCELLED and - // hopefully pick up once we getevents - tevent_aio_cancel(taio); - return 0; default: - // unknown value + // unknown value. should never happen abort(); }; } @@ -1188,7 +1337,7 @@ iocb_t *tevent_ctx_get_iocb(struct tevent_aiocb *taiocb) if (!init_libaio_data(epoll_ev)) { abort(); errno = EAGAIN; - return -1; + return NULL; } tevent_req_set_cancel_fn(taiocb->req, aio_req_cancel); @@ -1213,7 +1362,7 @@ static int tevent_add_aio_op(struct tevent_aiocb *taiocb, return -1; } - taiocb->location = location; + taiocb->location = location; io_uring_sqe_set_data(taiocb->iocbp, (void *)taiocb); err = io_uring_submit(&epoll_ev->aio.ring); @@ -1221,8 +1370,9 @@ static int tevent_add_aio_op(struct tevent_aiocb *taiocb, abort(); } - taiocb->state = TAIO_RUNNING; - return 0; + taiocb->iocbp = NULL; + taiocb->state = TAIO_RUNNING; + return 0; } int _tevent_add_aio_read(struct tevent_aiocb *taiocb, const char *location)