/*
 * Use the io_uring of Linux (>= 5.1)
 *
 * Copyright (C) Volker Lendecke 2008
 * Copyright (C) Jeremy Allison 2010
 * Copyright (C) Stefan Metzmacher 2019
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "includes.h"
#include "system/filesys.h"
#include "smbd/smbd.h"
#include "smbd/globals.h"
#include "lib/util/tevent_unix.h"
#include "lib/util/sys_rw.h"
#include "smbprofile.h"

#include <liburing.h>

struct vfs_io_uring_request;

struct vfs_io_uring_config {
	struct io_uring uring;
	struct tevent_fd *fde;
	/* recursion guard. See comment above vfs_io_uring_queue_run() */
	bool busy;
	/* recursion guard. See comment above vfs_io_uring_queue_run() */
	bool need_retry;
	struct vfs_io_uring_request *queue;
	struct vfs_io_uring_request *pending;
};

struct vfs_io_uring_request {
	struct vfs_io_uring_request *prev, *next;
	struct vfs_io_uring_request **list_head;
	struct vfs_io_uring_config *config;
	struct tevent_req *req;
	struct io_uring_sqe sqe;
	struct io_uring_cqe cqe;
	void (*completion_fn)(struct vfs_io_uring_request *cur,
			      const char *location);
	struct timespec start_time;
	struct timespec end_time;
	SMBPROFILE_BYTES_ASYNC_STATE(profile_bytes);
};

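/*
 * Lifecycle of a request (summarizing the code below): a request starts
 * on config->queue, moves to config->pending once its SQE has been
 * handed to the kernel in _vfs_io_uring_queue_run(), and is removed
 * from whichever list it is on by vfs_io_uring_finish_req() when its
 * CQE arrives (or when the whole config is torn down). list_head
 * always points at the list the request currently sits on, or is NULL
 * once it has been finished.
 */
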
static void vfs_io_uring_finish_req(struct vfs_io_uring_request *cur,
				    const struct io_uring_cqe *cqe,
				    struct timespec end_time,
				    const char *location)
{
	struct tevent_req *req =
		talloc_get_type_abort(cur->req,
		struct tevent_req);
	void *state = _tevent_req_data(req);

	talloc_set_destructor(state, NULL);
	if (cur->list_head != NULL) {
		DLIST_REMOVE((*cur->list_head), cur);
		cur->list_head = NULL;
	}
	cur->cqe = *cqe;

	SMBPROFILE_BYTES_ASYNC_SET_IDLE(cur->profile_bytes);
	cur->end_time = end_time;

	/*
	 * We rely on being inside the _send() function
	 * or tevent_req_defer_callback() being called
	 * already.
	 */
	cur->completion_fn(cur, location);
}

static void vfs_io_uring_config_destroy(struct vfs_io_uring_config *config,
					int ret,
					const char *location)
{
	struct vfs_io_uring_request *cur = NULL, *next = NULL;
	struct timespec start_time;
	struct timespec end_time;
	struct io_uring_cqe err_cqe = {
		.res = ret,
	};

	PROFILE_TIMESTAMP(&start_time);

	if (config->uring.ring_fd != -1) {
		/* TODO: cancel queued and pending requests */
		TALLOC_FREE(config->fde);
		io_uring_queue_exit(&config->uring);
		config->uring.ring_fd = -1;
	}

	PROFILE_TIMESTAMP(&end_time);

	for (cur = config->pending; cur != NULL; cur = next) {
		next = cur->next;
		err_cqe.user_data = (uintptr_t)(void *)cur;
		vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
	}

	for (cur = config->queue; cur != NULL; cur = next) {
		next = cur->next;
		err_cqe.user_data = (uintptr_t)(void *)cur;
		cur->start_time = start_time;
		vfs_io_uring_finish_req(cur, &err_cqe, end_time, location);
	}
}

static int vfs_io_uring_config_destructor(struct vfs_io_uring_config *config)
{
	vfs_io_uring_config_destroy(config, -EUCLEAN, __location__);
	return 0;
}

static int vfs_io_uring_request_state_deny_destructor(void *_state)
{
	struct __vfs_io_uring_generic_state {
		struct vfs_io_uring_request ur;
	} *state = (struct __vfs_io_uring_generic_state *)_state;
	struct vfs_io_uring_request *cur = &state->ur;

	/* our parent is gone */
	cur->req = NULL;

	/* remove ourselves from any list */
	DLIST_REMOVE((*cur->list_head), cur);
	cur->list_head = NULL;

	/*
	 * Our state is about to go away,
	 * all we can do is shut down the whole uring.
	 * But that's ok as we're most likely called from exit_server()
	 */
	vfs_io_uring_config_destroy(cur->config, -ESHUTDOWN, __location__);
	return 0;
}

static void vfs_io_uring_fd_handler(struct tevent_context *ev,
				    struct tevent_fd *fde,
				    uint16_t flags,
				    void *private_data);

static int vfs_io_uring_connect(vfs_handle_struct *handle, const char *service,
				const char *user)
{
	int ret;
	struct vfs_io_uring_config *config;
	unsigned num_entries;
	bool sqpoll;
	unsigned flags = 0;

	config = talloc_zero(handle->conn, struct vfs_io_uring_config);
	if (config == NULL) {
		DEBUG(0, ("talloc_zero() failed\n"));
		return -1;
	}

	SMB_VFS_HANDLE_SET_DATA(handle, config,
				NULL, struct vfs_io_uring_config,
				return -1);

	ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
	if (ret < 0) {
		return ret;
	}

	num_entries = lp_parm_ulong(SNUM(handle->conn),
				    "io_uring",
				    "num_entries",
				    128);
	num_entries = MAX(num_entries, 1);

	sqpoll = lp_parm_bool(SNUM(handle->conn),
			      "io_uring",
			      "sqpoll",
			      false);
	if (sqpoll) {
		flags |= IORING_SETUP_SQPOLL;
	}

	ret = io_uring_queue_init(num_entries, &config->uring, flags);
	if (ret < 0) {
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = -ret;
		return -1;
	}

	talloc_set_destructor(config, vfs_io_uring_config_destructor);

#ifdef HAVE_IO_URING_RING_DONTFORK
	ret = io_uring_ring_dontfork(&config->uring);
	if (ret < 0) {
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = -ret;
		return -1;
	}
#endif /* HAVE_IO_URING_RING_DONTFORK */

	config->fde = tevent_add_fd(handle->conn->sconn->ev_ctx,
				    config,
				    config->uring.ring_fd,
				    TEVENT_FD_READ,
				    vfs_io_uring_fd_handler,
				    handle);
	if (config->fde == NULL) {
		ret = ENOMEM;
		SMB_VFS_NEXT_DISCONNECT(handle);
		errno = ret;
		return -1;
	}

	return 0;
}

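/*
 * A minimal smb.conf sketch showing how the parameters read above would
 * be set on a share; the share name and path are placeholders, and the
 * values shown are just the defaults made explicit:
 *
 *	[data]
 *		path = /srv/data
 *		vfs objects = io_uring
 *		io_uring:num_entries = 128
 *		io_uring:sqpoll = no
 */
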
static void _vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
	struct vfs_io_uring_request *cur = NULL, *next = NULL;
	struct io_uring_cqe *cqe = NULL;
	unsigned cqhead;
	unsigned nr = 0;
	struct timespec start_time;
	struct timespec end_time;
	int ret;

	PROFILE_TIMESTAMP(&start_time);

	if (config->uring.ring_fd == -1) {
		vfs_io_uring_config_destroy(config, -ESTALE, __location__);
		return;
	}

	for (cur = config->queue; cur != NULL; cur = next) {
		struct io_uring_sqe *sqe = NULL;
		void *state = _tevent_req_data(cur->req);

		next = cur->next;

		sqe = io_uring_get_sqe(&config->uring);
		if (sqe == NULL) {
			break;
		}

		talloc_set_destructor(state,
			vfs_io_uring_request_state_deny_destructor);
		DLIST_REMOVE(config->queue, cur);
		cur->list_head = NULL;
		DLIST_ADD_END(config->pending, cur);
		cur->list_head = &config->pending;
		SMBPROFILE_BYTES_ASYNC_SET_BUSY(cur->profile_bytes);

		cur->start_time = start_time;
		*sqe = cur->sqe;
	}

	ret = io_uring_submit(&config->uring);
	if (ret == -EAGAIN || ret == -EBUSY) {
		/* We just retry later */
	} else if (ret < 0) {
		vfs_io_uring_config_destroy(config, ret, __location__);
		return;
	}

	PROFILE_TIMESTAMP(&end_time);

	io_uring_for_each_cqe(&config->uring, cqhead, cqe) {
		cur = (struct vfs_io_uring_request *)io_uring_cqe_get_data(cqe);
		vfs_io_uring_finish_req(cur, cqe, end_time, __location__);
		nr++;
	}

	io_uring_cq_advance(&config->uring, nr);
}

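/*
 * For readers new to liburing: stripped of the tevent and list
 * plumbing, the submit/complete cycle implemented above boils down to
 * the following sketch (error handling omitted; fd, iov, offset and
 * my_request are illustrative names, not part of this module):
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_readv(sqe, fd, &iov, 1, offset);
 *	io_uring_sqe_set_data(sqe, my_request);
 *	io_uring_submit(&ring);
 *
 *	unsigned head, nr = 0;
 *	struct io_uring_cqe *cqe = NULL;
 *	io_uring_for_each_cqe(&ring, head, cqe) {
 *		struct my_req *r = io_uring_cqe_get_data(cqe);
 *		handle_completion(r, cqe->res);
 *		nr++;
 *	}
 *	io_uring_cq_advance(&ring, nr);
 */
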
/*
 * Wrapper function to prevent recursion which could happen
 * if we called _vfs_io_uring_queue_run() directly without
 * recursion checks.
 *
 * Looking at the pread call, we can have:
 *
 * vfs_io_uring_pread_send()
 *        ->vfs_io_uring_pread_submit()  <-----------------------------------
 *                ->vfs_io_uring_request_submit()                            |
 *                        ->vfs_io_uring_queue_run()                         |
 *                                ->_vfs_io_uring_queue_run()                |
 *                                                                           |
 * But inside _vfs_io_uring_queue_run() it looks like:                       |
 *                                                                           |
 * _vfs_io_uring_queue_run() {                                               |
 *        if (THIS_IO_COMPLETED) {                                           |
 *                ->vfs_io_uring_finish_req()                                |
 *                        ->cur->completion_fn()                             |
 *        }                                                                  |
 * }                                                                         |
 *                                                                           |
 * cur->completion_fn() for pread is set to vfs_io_uring_pread_completion()  |
 *                                                                           |
 * vfs_io_uring_pread_completion() {                                         |
 *        if (READ_TERMINATED) {                                             |
 *                -> tevent_req_done() - We're done, go back up the stack.   |
 *                return;                                                    |
 *        }                                                                  |
 *                                                                           |
 *        We have a short read - adjust the io vectors                       |
 *                                                                           |
 *        ->vfs_io_uring_pread_submit() ---------------------------------------
 * }
 *
 * So before calling _vfs_io_uring_queue_run() we bracket it with setting
 * a flag config->busy, and unset it once _vfs_io_uring_queue_run() finally
 * exits the retry loop.
 *
 * If we end up back inside vfs_io_uring_queue_run() we notice we've done so
 * as config->busy is set and don't recurse into _vfs_io_uring_queue_run().
 *
 * Instead we set the second flag config->need_retry, which tells the
 * vfs_io_uring_queue_run() call above us in the stack to loop, and return.
 *
 * When the outer call to _vfs_io_uring_queue_run() returns we are in
 * a loop checking if config->need_retry was set. That happens if
 * the short read case occurs and _vfs_io_uring_queue_run() ended up
 * recursing into vfs_io_uring_queue_run().
 *
 * Once vfs_io_uring_pread_completion() finishes without a short
 * read (the READ_TERMINATED case, tevent_req_done() is called),
 * config->need_retry is left as false, we exit the loop, set
 * config->busy to false so the next top level call into
 * vfs_io_uring_queue_run() won't think it's a recursed call,
 * and return.
 */

static void vfs_io_uring_queue_run(struct vfs_io_uring_config *config)
{
	if (config->busy) {
		/*
		 * We've recursed due to short read/write.
		 * Set need_retry to ensure we retry the
		 * io_uring_submit().
		 */
		config->need_retry = true;
		return;
	}

	/*
	 * Bracket the loop calling _vfs_io_uring_queue_run()
	 * with busy = true / busy = false,
	 * so we can detect recursion above.
	 */
	config->busy = true;

	do {
		config->need_retry = false;
		_vfs_io_uring_queue_run(config);
	} while (config->need_retry);

	config->busy = false;
}

static void vfs_io_uring_fd_handler(struct tevent_context *ev,
				    struct tevent_fd *fde,
				    uint16_t flags,
				    void *private_data)
{
	vfs_handle_struct *handle = (vfs_handle_struct *)private_data;
	struct vfs_io_uring_config *config = NULL;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	vfs_io_uring_queue_run(config);
}

struct vfs_io_uring_pread_state {
	struct vfs_io_uring_request ur;
	struct iovec iov;
	size_t nread;
};

static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
					  const char *location);

static struct tevent_req *vfs_io_uring_pread_send(struct vfs_handle_struct *handle,
						  TALLOC_CTX *mem_ctx,
						  struct tevent_context *ev,
						  struct files_struct *fsp,
						  void *data,
						  size_t n, off_t offset)
{
	struct tevent_req *req = NULL;
	struct vfs_io_uring_pread_state *state = NULL;
	struct vfs_io_uring_config *config = NULL;
	bool ok;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_pread_state);
	if (req == NULL) {
		return NULL;
	}
	state->ur.config = config;
	state->ur.req = req;
	state->ur.completion_fn = vfs_io_uring_pread_completion;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pread, profile_p,
				     state->ur.profile_bytes, n);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

	ok = sys_valid_io_range(offset, n);
	if (!ok) {
		tevent_req_error(req, EINVAL);
		return tevent_req_post(req, ev);
	}

	state->iov.iov_base = (void *)data;
	state->iov.iov_len = n;
	io_uring_prep_readv(&state->ur.sqe,
			    fsp->fh->fd,
			    &state->iov, 1,
			    offset);
	io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
	DLIST_ADD_END(config->queue, &state->ur);
	state->ur.list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (!tevent_req_is_in_progress(req)) {
		return tevent_req_post(req, ev);
	}

	tevent_req_defer_callback(req, ev);
	return req;
}

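/*
 * Caller-side sketch of the async pattern served by the *_send()/*_recv()
 * pairs in this module (my_pread_done and the surrounding request
 * variables are illustrative, not part of this file):
 *
 *	subreq = SMB_VFS_PREAD_SEND(state, ev, fsp, buf, n, offset);
 *	if (tevent_req_nomem(subreq, req)) {
 *		return tevent_req_post(req, ev);
 *	}
 *	tevent_req_set_callback(subreq, my_pread_done, req);
 *
 * and later, inside my_pread_done():
 *
 *	nread = SMB_VFS_PREAD_RECV(subreq, &aio_state);
 *	TALLOC_FREE(subreq);
 */
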
static void vfs_io_uring_pread_completion(struct vfs_io_uring_request *cur,
					  const char *location)
{
	struct vfs_io_uring_pread_state *state = tevent_req_data(
		cur->req, struct vfs_io_uring_pread_state);

	/*
	 * We rely on being inside the _send() function
	 * or tevent_req_defer_callback() being called
	 * already.
	 */

	if (cur->cqe.res < 0) {
		int err = -cur->cqe.res;
		_tevent_req_error(cur->req, err, location);
		return;
	}

	state->nread = state->ur.cqe.res;
	tevent_req_done(cur->req);
}

static ssize_t vfs_io_uring_pread_recv(struct tevent_req *req,
				       struct vfs_aio_state *vfs_aio_state)
{
	struct vfs_io_uring_pread_state *state = tevent_req_data(
		req, struct vfs_io_uring_pread_state);
	ssize_t ret;

	SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
	vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
						 &state->ur.start_time);

	if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
		tevent_req_received(req);
		return -1;
	}

	vfs_aio_state->error = 0;
	ret = state->nread;

	tevent_req_received(req);

	return ret;
}

struct vfs_io_uring_pwrite_state {
	struct vfs_io_uring_request ur;
	struct iovec iov;
	size_t nwritten;
};

static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
					   const char *location);

static struct tevent_req *vfs_io_uring_pwrite_send(struct vfs_handle_struct *handle,
						   TALLOC_CTX *mem_ctx,
						   struct tevent_context *ev,
						   struct files_struct *fsp,
						   const void *data,
						   size_t n, off_t offset)
{
	struct tevent_req *req = NULL;
	struct vfs_io_uring_pwrite_state *state = NULL;
	struct vfs_io_uring_config *config = NULL;
	bool ok;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_pwrite_state);
	if (req == NULL) {
		return NULL;
	}
	state->ur.config = config;
	state->ur.req = req;
	state->ur.completion_fn = vfs_io_uring_pwrite_completion;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_pwrite, profile_p,
				     state->ur.profile_bytes, n);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

	ok = sys_valid_io_range(offset, n);
	if (!ok) {
		tevent_req_error(req, EINVAL);
		return tevent_req_post(req, ev);
	}

	state->iov.iov_base = discard_const(data);
	state->iov.iov_len = n;
	io_uring_prep_writev(&state->ur.sqe,
			     fsp->fh->fd,
			     &state->iov, 1,
			     offset);
	io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
	DLIST_ADD_END(config->queue, &state->ur);
	state->ur.list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (!tevent_req_is_in_progress(req)) {
		return tevent_req_post(req, ev);
	}

	tevent_req_defer_callback(req, ev);
	return req;
}

static void vfs_io_uring_pwrite_completion(struct vfs_io_uring_request *cur,
					   const char *location)
{
	struct vfs_io_uring_pwrite_state *state = tevent_req_data(
		cur->req, struct vfs_io_uring_pwrite_state);

	/*
	 * We rely on being inside the _send() function
	 * or tevent_req_defer_callback() being called
	 * already.
	 */

	if (cur->cqe.res < 0) {
		int err = -cur->cqe.res;
		_tevent_req_error(cur->req, err, location);
		return;
	}

	state->nwritten = state->ur.cqe.res;
	tevent_req_done(cur->req);
}

static ssize_t vfs_io_uring_pwrite_recv(struct tevent_req *req,
					struct vfs_aio_state *vfs_aio_state)
{
	struct vfs_io_uring_pwrite_state *state = tevent_req_data(
		req, struct vfs_io_uring_pwrite_state);
	ssize_t ret;

	SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
	vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
						 &state->ur.start_time);

	if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
		tevent_req_received(req);
		return -1;
	}

	vfs_aio_state->error = 0;
	ret = state->nwritten;

	tevent_req_received(req);

	return ret;
}

struct vfs_io_uring_fsync_state {
	struct vfs_io_uring_request ur;
};

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
					  const char *location);

static struct tevent_req *vfs_io_uring_fsync_send(struct vfs_handle_struct *handle,
						  TALLOC_CTX *mem_ctx,
						  struct tevent_context *ev,
						  struct files_struct *fsp)
{
	struct tevent_req *req = NULL;
	struct vfs_io_uring_fsync_state *state = NULL;
	struct vfs_io_uring_config *config = NULL;

	SMB_VFS_HANDLE_GET_DATA(handle, config,
				struct vfs_io_uring_config,
				smb_panic(__location__));

	req = tevent_req_create(mem_ctx, &state,
				struct vfs_io_uring_fsync_state);
	if (req == NULL) {
		return NULL;
	}
	state->ur.config = config;
	state->ur.req = req;
	state->ur.completion_fn = vfs_io_uring_fsync_completion;

	SMBPROFILE_BYTES_ASYNC_START(syscall_asys_fsync, profile_p,
				     state->ur.profile_bytes, 0);
	SMBPROFILE_BYTES_ASYNC_SET_IDLE(state->ur.profile_bytes);

	io_uring_prep_fsync(&state->ur.sqe,
			    fsp->fh->fd,
			    0); /* fsync_flags */
	io_uring_sqe_set_data(&state->ur.sqe, &state->ur);
	DLIST_ADD_END(config->queue, &state->ur);
	state->ur.list_head = &config->queue;

	vfs_io_uring_queue_run(config);

	if (!tevent_req_is_in_progress(req)) {
		return tevent_req_post(req, ev);
	}

	tevent_req_defer_callback(req, ev);
	return req;
}

static void vfs_io_uring_fsync_completion(struct vfs_io_uring_request *cur,
					  const char *location)
{
	/*
	 * We rely on being inside the _send() function
	 * or tevent_req_defer_callback() being called
	 * already.
	 */

	if (cur->cqe.res < 0) {
		int err = -cur->cqe.res;
		_tevent_req_error(cur->req, err, location);
		return;
	}

	tevent_req_done(cur->req);
}

static int vfs_io_uring_fsync_recv(struct tevent_req *req,
				   struct vfs_aio_state *vfs_aio_state)
{
	struct vfs_io_uring_fsync_state *state = tevent_req_data(
		req, struct vfs_io_uring_fsync_state);

	SMBPROFILE_BYTES_ASYNC_END(state->ur.profile_bytes);
	vfs_aio_state->duration = nsec_time_diff(&state->ur.end_time,
						 &state->ur.start_time);

	if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
		tevent_req_received(req);
		return -1;
	}

	vfs_aio_state->error = 0;

	tevent_req_received(req);

	return 0;
}

static struct vfs_fn_pointers vfs_io_uring_fns = {
	.connect_fn = vfs_io_uring_connect,
	.pread_send_fn = vfs_io_uring_pread_send,
	.pread_recv_fn = vfs_io_uring_pread_recv,
	.pwrite_send_fn = vfs_io_uring_pwrite_send,
	.pwrite_recv_fn = vfs_io_uring_pwrite_recv,
	.fsync_send_fn = vfs_io_uring_fsync_send,
	.fsync_recv_fn = vfs_io_uring_fsync_recv,
};

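/*
 * Only the async pread/pwrite/fsync paths are overridden here; any VFS
 * operation without an entry in the table above falls through to the
 * next module in the VFS stack.
 */
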
static_decl_vfs;
NTSTATUS vfs_io_uring_init(TALLOC_CTX *ctx)
{
	return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				"io_uring", &vfs_io_uring_fns);
}