Add a new module, aio_linux which implements Linux kernel aio support. Docs to follow.
[amitay/samba.git] / source3 / modules / vfs_aio_linux.c
diff --git a/source3/modules/vfs_aio_linux.c b/source3/modules/vfs_aio_linux.c
new file mode 100644 (file)
index 0000000..f6fa80a
--- /dev/null
@@ -0,0 +1,730 @@
+/*
+ * Simulate Posix AIO using Linux kernel AIO.
+ *
+ * Copyright (C) Jeremy Allison 2012
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "smbd/smbd.h"
+#include "smbd/globals.h"
+#include <sys/eventfd.h>
+#include <libaio.h>
+
+struct aio_extra;
+static int event_fd = -1;
+static io_context_t io_ctx;
+static int aio_linux_requestid;
+static struct io_event *io_recv_events;
+static struct fd_event *aio_read_event;
+
+struct aio_private_data {
+       struct aio_private_data *prev, *next;
+       int requestid;
+       SMB_STRUCT_AIOCB *aiocb;
+       struct iocb *event_iocb;
+       ssize_t ret_size;
+       int ret_errno;
+       bool cancelled;
+};
+
+/* List of outstanding requests we have. */
+static struct aio_private_data *pd_list;
+
+static void aio_linux_handle_completion(struct event_context *event_ctx,
+                       struct fd_event *event,
+                       uint16 flags,
+                       void *p);
+
+/************************************************************************
+ Housekeeping. Cleanup if no activity for 30 seconds.
+***********************************************************************/
+
+static void aio_linux_housekeeping(struct tevent_context *event_ctx,
+                                        struct tevent_timer *te,
+                                        struct timeval now,
+                                        void *private_data)
+{
+       /* Remove this timed event handler. */
+       TALLOC_FREE(te);
+
+       if (pd_list != NULL) {
+               /* Still busy. Look again in 30 seconds. */
+               (void)tevent_add_timer(event_ctx,
+                                       NULL,
+                                       timeval_current_ofs(30, 0),
+                                       aio_linux_housekeeping,
+                                       NULL);
+               return;
+       }
+
+       /* No activity for 30 seconds. Close out kernel resources. */
+       io_queue_release(io_ctx);
+       memset(&io_ctx, '\0', sizeof(io_ctx));
+
+       if (event_fd != -1) {
+               close(event_fd);
+               event_fd = -1;
+       }
+
+       TALLOC_FREE(aio_read_event);
+       TALLOC_FREE(io_recv_events);
+}
+
+/************************************************************************
+ Ensure event fd and aio context are initialized.
+***********************************************************************/
+
+static bool init_aio_linux(struct vfs_handle_struct *handle)
+{
+       struct tevent_timer *te = NULL;
+
+       if (event_fd != -1) {
+               /* Already initialized. */
+               return true;
+       }
+
+       /* Shedule a shutdown event for 30 seconds from now. */
+       te = tevent_add_timer(server_event_context(),
+                               NULL,
+                               timeval_current_ofs(30, 0),
+                               aio_linux_housekeeping,
+                               NULL);
+
+       if (te == NULL) {
+               goto fail;
+       }
+
+       /* Ensure we have enough space for aio_pending_size events. */
+       io_recv_events = talloc_zero_array(NULL,
+                               struct io_event,
+                               aio_pending_size);
+       if (io_recv_events == NULL) {
+               goto fail;
+       }
+
+       event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+       if (event_fd == -1) {
+               goto fail;
+       }
+
+       aio_read_event = tevent_add_fd(server_event_context(),
+                               NULL,
+                               event_fd,
+                               TEVENT_FD_READ,
+                               aio_linux_handle_completion,
+                               NULL);
+       if (aio_read_event == NULL) {
+               goto fail;
+       }
+
+       if (io_queue_init(aio_pending_size, &io_ctx)) {
+               goto fail;
+       }
+
+       DEBUG(10,("init_aio_linux: initialized with up to %d events\n",
+                 aio_pending_size));
+
+       return true;
+
+  fail:
+
+       DEBUG(10,("init_aio_linux: initialization failed\n"));
+
+       TALLOC_FREE(te);
+       TALLOC_FREE(io_recv_events);
+       TALLOC_FREE(aio_read_event);
+       if (event_fd != -1) {
+               close(event_fd);
+               event_fd = -1;
+       }
+       memset(&io_ctx, '\0', sizeof(io_ctx));
+       return false;
+}
+
+/************************************************************************
+ Private data destructor.
+***********************************************************************/
+
+static int pd_destructor(struct aio_private_data *pd)
+{
+       DLIST_REMOVE(pd_list, pd);
+       return 0;
+}
+
+/************************************************************************
+ Create and initialize a private data struct.
+***********************************************************************/
+
+static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
+                                       SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);
+       if (!pd) {
+               return NULL;
+       }
+       pd->event_iocb = talloc_zero(pd, struct iocb);
+       pd->requestid = aio_linux_requestid++;
+       pd->aiocb = aiocb;
+       pd->ret_size = -1;
+       pd->ret_errno = EINPROGRESS;
+       talloc_set_destructor(pd, pd_destructor);
+       DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
+       return pd;
+}
+
+/************************************************************************
+ Initiate an asynchronous pread call.
+***********************************************************************/
+
+static int aio_linux_read(struct vfs_handle_struct *handle,
+                               struct files_struct *fsp,
+                               SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
+       struct aio_private_data *pd = NULL;
+       int ret;
+
+       if (!init_aio_linux(handle)) {
+               return -1;
+       }
+
+       pd = create_private_data(aio_ex, aiocb);
+       if (pd == NULL) {
+               DEBUG(10, ("aio_linux_read: Could not create private data.\n"));
+               return -1;
+       }
+
+       io_prep_pread(pd->event_iocb,
+                       pd->aiocb->aio_fildes,
+                       discard_const(pd->aiocb->aio_buf),
+                       pd->aiocb->aio_nbytes,
+                       pd->aiocb->aio_offset);
+       io_set_eventfd(pd->event_iocb, event_fd);
+       /* Use the callback pointer as a private data ptr. */
+       io_set_callback(pd->event_iocb, (io_callback_t)pd);
+
+       ret = io_submit(io_ctx, 1, &pd->event_iocb);
+       if (ret < 0) {
+               errno = ret;
+               return -1;
+       }
+
+       DEBUG(10, ("aio_linux_read: requestid=%d read requested "
+               "of %llu bytes at offset %llu\n",
+               pd->requestid,
+               (unsigned long long)pd->aiocb->aio_nbytes,
+               (unsigned long long)pd->aiocb->aio_offset));
+
+       return 0;
+}
+
+/************************************************************************
+ Initiate an asynchronous pwrite call.
+***********************************************************************/
+
+static int aio_linux_write(struct vfs_handle_struct *handle,
+                               struct files_struct *fsp,
+                               SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
+       struct aio_private_data *pd = NULL;
+       int ret;
+
+       if (!init_aio_linux(handle)) {
+               return -1;
+       }
+
+       pd = create_private_data(aio_ex, aiocb);
+       if (pd == NULL) {
+               DEBUG(10, ("aio_linux_write: Could not create private data.\n"));
+               return -1;
+       }
+
+       io_prep_pwrite(pd->event_iocb,
+                       pd->aiocb->aio_fildes,
+                       discard_const(pd->aiocb->aio_buf),
+                       pd->aiocb->aio_nbytes,
+                       pd->aiocb->aio_offset);
+       io_set_eventfd(pd->event_iocb, event_fd);
+       /* Use the callback pointer as a private data ptr. */
+       io_set_callback(pd->event_iocb, (io_callback_t)pd);
+
+       ret = io_submit(io_ctx, 1, &pd->event_iocb);
+       if (ret < 0) {
+               errno = ret;
+               return -1;
+       }
+
+       DEBUG(10, ("aio_linux_write: requestid=%d pwrite requested "
+               "of %llu bytes at offset %llu\n",
+               pd->requestid,
+               (unsigned long long)pd->aiocb->aio_nbytes,
+               (unsigned long long)pd->aiocb->aio_offset));
+
+       return 0;
+}
+
+/************************************************************************
+ Handle a single finished io.
+***********************************************************************/
+
+static void aio_linux_handle_io_finished(struct io_event *ioev)
+{
+       struct aio_extra *aio_ex = NULL;
+       struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
+
+       /* ioev->res2 contains the -errno if error. */
+       /* ioev->res contains the number of bytes sent/received. */
+       if (ioev->res2) {
+               pd->ret_size = -1;
+               pd->ret_errno = -ioev->res2;
+       } else {
+               pd->ret_size = ioev->res;
+               pd->ret_errno = 0;
+       }
+
+       aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
+       smbd_aio_complete_aio_ex(aio_ex);
+
+       DEBUG(10,("aio_linux_handle_io_finished: requestid %d completed\n",
+               pd->requestid ));
+       TALLOC_FREE(aio_ex);
+}
+
+/************************************************************************
+ Callback when multiple IOs complete.
+***********************************************************************/
+
+static void aio_linux_handle_completion(struct event_context *event_ctx,
+                               struct fd_event *event,
+                               uint16 flags,
+                               void *p)
+{
+       uint64_t num_events = 0;
+
+       DEBUG(10, ("aio_linux_handle_completion called with flags=%d\n",
+                       (int)flags));
+
+       if ((flags & EVENT_FD_READ) == 0) {
+               return;
+       }
+
+       /* Read the number of events available. */
+       if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
+                       sizeof(num_events)) {
+               smb_panic("aio_linux_handle_completion: invalid read");
+       }
+
+       while (num_events > 0) {
+               uint64_t events_to_read = MIN(num_events, aio_pending_size);
+               struct timespec ts;
+               int i;
+               int ret;
+
+               ts.tv_sec = 0;
+               ts.tv_nsec = 0;
+
+               ret = io_getevents(io_ctx,
+                       1,
+                       (long)events_to_read,
+                       io_recv_events,
+                       &ts);
+
+               if (ret < 0) {
+                       errno = -ret;
+                       DEBUG(1, ("aio_linux_handle_completion: "
+                               "io_getevents error %s\n",
+                               strerror(errno) ));
+                       return;
+               }
+
+               if (ret == 0) {
+                       DEBUG(10, ("aio_linux_handle_completion: "
+                               "io_getevents returned 0\n"));
+                       continue;
+               }
+
+               /* ret is positive. */
+               for (i = 0; i < ret; i++) {
+                       aio_linux_handle_io_finished(&io_recv_events[i]);
+               }
+
+               num_events -= ret;
+       }
+}
+
+/************************************************************************
+ Find the private data by aiocb.
+***********************************************************************/
+
+static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_private_data *pd;
+
+       for (pd = pd_list; pd != NULL; pd = pd->next) {
+               if (pd->aiocb == aiocb) {
+                       return pd;
+               }
+       }
+
+       return NULL;
+}
+
+/************************************************************************
+ Called to return the result of a completed AIO.
+ Should only be called if aio_error returns something other than EINPROGRESS.
+ Returns:
+       Any other value - return from IO operation.
+***********************************************************************/
+
+static ssize_t aio_linux_return_fn(struct vfs_handle_struct *handle,
+                               struct files_struct *fsp,
+                               SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
+
+       if (pd == NULL) {
+               errno = EINVAL;
+               DEBUG(0, ("aio_linux_return_fn: returning EINVAL\n"));
+               return -1;
+       }
+
+       pd->aiocb = NULL;
+
+       if (pd->ret_size == -1) {
+               errno = pd->ret_errno;
+       }
+
+       return pd->ret_size;
+}
+
+/************************************************************************
+ Called to check the result of an AIO.
+ Returns:
+       EINPROGRESS - still in progress.
+       EINVAL - invalid aiocb.
+       ECANCELED - request was cancelled.
+       0 - request completed successfully.
+       Any other value - errno from IO operation.
+***********************************************************************/
+
+static int aio_linux_error_fn(struct vfs_handle_struct *handle,
+                            struct files_struct *fsp,
+                            SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
+
+       if (pd == NULL) {
+               return EINVAL;
+       }
+       if (pd->cancelled) {
+               return ECANCELED;
+       }
+       return pd->ret_errno;
+}
+
+/************************************************************************
+ Called to request the cancel of an AIO, or all of them on a specific
+ fsp if aiocb == NULL.
+***********************************************************************/
+
+static int aio_linux_cancel(struct vfs_handle_struct *handle,
+                       struct files_struct *fsp,
+                       SMB_STRUCT_AIOCB *aiocb)
+{
+       struct aio_private_data *pd = NULL;
+
+       for (pd = pd_list; pd != NULL; pd = pd->next) {
+               if (pd->aiocb == NULL) {
+                       continue;
+               }
+               if (pd->aiocb->aio_fildes != fsp->fh->fd) {
+                       continue;
+               }
+               if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
+                       continue;
+               }
+
+               /*
+                * We let the kernel do its job, but we discard the result when
+                * it's finished. NB. Should I call io_cancel here ?
+                */
+
+               pd->cancelled = true;
+       }
+
+       return AIO_CANCELED;
+}
+
+/************************************************************************
+ Callback for a previously detected job completion deferred to the main
+ loop.
+***********************************************************************/
+
+static void aio_linux_handle_immediate(struct tevent_context *ctx,
+                               struct tevent_immediate *im,
+                               void *private_data)
+{
+       struct io_event *ioev = (struct io_event *)private_data;
+
+       aio_linux_handle_io_finished(ioev);
+       TALLOC_FREE(ioev);
+}
+
+/************************************************************************
+ Private data struct used in suspend completion code.
+***********************************************************************/
+
+struct suspend_private {
+       int num_entries;
+       int num_finished;
+       const SMB_STRUCT_AIOCB * const *aiocb_array;
+};
+
+/************************************************************************
+ Handle a single finished io from suspend.
+***********************************************************************/
+
+static void aio_linux_handle_suspend_io_finished(struct suspend_private *sp,
+                                               struct io_event *ioev)
+{
+       struct aio_private_data *pd = (struct aio_private_data *)ioev->data;
+       struct io_event *new_ioev = NULL;
+       struct tevent_immediate *im = NULL;
+       int i;
+
+       /* Is this a requestid with an aiocb we're interested in ? */
+       for (i = 0; i < sp->num_entries; i++) {
+               if (sp->aiocb_array[i] == pd->aiocb) {
+                       sp->num_finished++;
+                       aio_linux_handle_io_finished(ioev);
+                       return;
+               }
+       }
+
+       /* Jobid completed we weren't waiting for.
+          We must reshedule this as an immediate event
+          on the main event context. */
+       im = tevent_create_immediate(NULL);
+       if (!im) {
+               exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
+       }
+
+       new_ioev = (struct io_event *)talloc_memdup(NULL,
+                                               ioev,
+                                               sizeof(struct io_event));
+       if (!new_ioev) {
+               exit_server_cleanly("aio_linux_handle_suspend_completion: no memory");
+       }
+
+       DEBUG(10,("aio_linux_handle_suspend_completion: "
+                       "re-scheduling requestid %d\n",
+                       pd->requestid));
+
+       tevent_schedule_immediate(im,
+                       server_event_context(),
+                       aio_linux_handle_immediate,
+                       (void *)new_ioev);
+}
+
+/************************************************************************
+ Callback when an IO completes from a suspend call.
+***********************************************************************/
+
+static void aio_linux_handle_suspend_completion(struct event_context *event_ctx,
+                               struct fd_event *event,
+                               uint16 flags,
+                               void *p)
+{
+       struct suspend_private *sp = (struct suspend_private *)p;
+       uint64_t remaining_events = sp->num_entries - sp->num_finished;
+       uint64_t num_events = 0;
+
+       DEBUG(10, ("aio_linux_handle_suspend_completion called with flags=%d\n",
+                       (int)flags));
+
+       if ((flags & EVENT_FD_READ) == 0) {
+               return;
+       }
+
+       /* Read the number of events available. */
+       if (sys_read(event_fd, &num_events, sizeof(num_events)) !=
+                       sizeof(num_events)) {
+               smb_panic("aio_linux_handle_completion: invalid read");
+       }
+
+       while (num_events > 0) {
+               uint64_t events_to_read = MIN(num_events, remaining_events);
+               struct timespec ts;
+               int i;
+               int ret;
+
+               ts.tv_sec = 0;
+               ts.tv_nsec = 0;
+
+               ret = io_getevents(io_ctx,
+                       1,
+                       (long)events_to_read,
+                       io_recv_events,
+                       &ts);
+
+               if (ret < 0) {
+                       errno = -ret;
+                       DEBUG(1, ("aio_linux_handle_suspend_completion: "
+                               "io_getevents error %s\n",
+                               strerror(errno) ));
+                       return;
+               }
+
+               if (ret == 0) {
+                       DEBUG(10, ("aio_linux_handle_suspend_completion: "
+                               "io_getevents returned 0\n"));
+                       continue;
+               }
+
+               /* ret is positive. */
+               for (i = 0; i < ret; i++) {
+                       aio_linux_handle_suspend_io_finished(sp,
+                                       &io_recv_events[i]);
+               }
+
+               num_events -= ret;
+       }
+}
+
+static void aio_linux_suspend_timed_out(struct tevent_context *event_ctx,
+                                       struct tevent_timer *te,
+                                       struct timeval now,
+                                       void *private_data)
+{
+       bool *timed_out = (bool *)private_data;
+       /* Remove this timed event handler. */
+       TALLOC_FREE(te);
+       *timed_out = true;
+}
+
+/************************************************************************
+ Called to request everything to stop until all IO is completed.
+***********************************************************************/
+
+static int aio_linux_suspend(struct vfs_handle_struct *handle,
+                       struct files_struct *fsp,
+                       const SMB_STRUCT_AIOCB * const aiocb_array[],
+                       int n,
+                       const struct timespec *timeout)
+{
+       struct event_context *ev = NULL;
+       struct fd_event *sock_event = NULL;
+       int ret = -1;
+       struct suspend_private sp;
+       bool timed_out = false;
+       TALLOC_CTX *frame = talloc_stackframe();
+
+       /* This is a blocking call, and has to use a sub-event loop. */
+       ev = event_context_init(frame);
+       if (ev == NULL) {
+               errno = ENOMEM;
+               goto out;
+       }
+
+       if (timeout) {
+               struct timeval tv = convert_timespec_to_timeval(*timeout);
+               struct tevent_timer *te = tevent_add_timer(ev,
+                                               frame,
+                                               timeval_current_ofs(tv.tv_sec,
+                                                                   tv.tv_usec),
+                                               aio_linux_suspend_timed_out,
+                                               &timed_out);
+               if (!te) {
+                       errno = ENOMEM;
+                       goto out;
+               }
+       }
+
+       ZERO_STRUCT(sp);
+       sp.num_entries = n;
+       sp.aiocb_array = aiocb_array;
+       sp.num_finished = 0;
+
+       sock_event = tevent_add_fd(ev,
+                               frame,
+                               event_fd,
+                               TEVENT_FD_READ,
+                               aio_linux_handle_suspend_completion,
+                               (void *)&sp);
+       if (sock_event == NULL) {
+               goto out;
+       }
+       /*
+        * We're going to cheat here. We know that smbd/aio.c
+        * only calls this when it's waiting for every single
+        * outstanding call to finish on a close, so just wait
+        * individually for each IO to complete. We don't care
+        * what order they finish - only that they all do. JRA.
+        */
+       while (sp.num_entries != sp.num_finished) {
+               if (tevent_loop_once(ev) == -1) {
+                       goto out;
+               }
+
+               if (timed_out) {
+                       errno = EAGAIN;
+                       goto out;
+               }
+       }
+
+       ret = 0;
+
+  out:
+
+       TALLOC_FREE(frame);
+       return ret;
+}
+
+static int aio_linux_connect(vfs_handle_struct *handle, const char *service,
+                              const char *user)
+{
+       /*********************************************************************
+        * How many io_events to initialize ?
+        * 128 per process seems insane as a default until you realize that
+        * (a) Throttling is done in SMB2 via the crediting algorithm.
+        * (b) SMB1 clients are limited to max_mux (50) outstanding
+        *     requests and Windows clients don't use this anyway.
+        * Essentially we want this to be unlimited unless smb.conf
+        * says different.
+        *********************************************************************/
+       aio_pending_size = lp_parm_int(
+               SNUM(handle->conn), "aio_linux", "aio num events", 128);
+       return SMB_VFS_NEXT_CONNECT(handle, service, user);
+}
+
+static struct vfs_fn_pointers vfs_aio_linux_fns = {
+       .connect_fn = aio_linux_connect,
+       .aio_read_fn = aio_linux_read,
+       .aio_write_fn = aio_linux_write,
+       .aio_return_fn = aio_linux_return_fn,
+       .aio_cancel_fn = aio_linux_cancel,
+       .aio_error_fn = aio_linux_error_fn,
+       .aio_suspend_fn = aio_linux_suspend,
+};
+
+NTSTATUS vfs_aio_linux_init(void)
+{
+       return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
+                               "aio_linux", &vfs_aio_linux_fns);
+}