vfs_aio_fork: Fix a crash in aio_fork
[nivanova/samba-autobuild/.git] / source3 / modules / vfs_aio_fork.c
index 21f63d0b87b3478cb4c8eeecfb9262270f601ca5..3eaa26774f5b0f364cd3c33d8a450607ac152b9a 100644 (file)
@@ -2,6 +2,7 @@
  * Simulate the Posix AIO using mmap/fork
  *
  * Copyright (C) Volker Lendecke 2008
+ * Copyright (C) Jeremy Allison 2010
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  */
 
 #include "includes.h"
+#include "system/filesys.h"
+#include "system/shmem.h"
+#include "smbd/smbd.h"
+#include "smbd/globals.h"
+#include "lib/async_req/async_sock.h"
+#include "lib/util/tevent_unix.h"
+#include "lib/util/sys_rw.h"
+#include "lib/util/sys_rw_data.h"
+#include "lib/util/msghdr.h"
+#include "smbprofile.h"
+
+#if !defined(HAVE_STRUCT_MSGHDR_MSG_CONTROL) && !defined(HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS)
+# error Can not pass file descriptors
+#endif
+
+#undef recvmsg
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+
+struct aio_child_list;
+
+struct aio_fork_config {
+       bool erratic_testing_mode;
+       struct aio_child_list *children;
+};
 
 struct mmap_area {
        size_t size;
-       volatile void *ptr;
+       void *ptr;
 };
 
 static int mmap_area_destructor(struct mmap_area *area)
 {
-       munmap((void *)area->ptr, area->size);
+       munmap(discard_const(area->ptr), area->size);
        return 0;
 }
 
@@ -51,6 +79,7 @@ static struct mmap_area *mmap_area_init(TALLOC_CTX *mem_ctx, size_t size)
 
        result->ptr = mmap(NULL, size, PROT_READ|PROT_WRITE,
                           MAP_SHARED|MAP_FILE, fd, 0);
+       close(fd);
        if (result->ptr == MAP_FAILED) {
                DEBUG(1, ("mmap failed: %s\n", strerror(errno)));
                goto fail;
@@ -66,15 +95,44 @@ fail:
        return NULL;
 }
 
+enum cmd_type {
+       READ_CMD,
+       WRITE_CMD,
+       FSYNC_CMD
+};
+
+static const char *cmd_type_str(enum cmd_type cmd)
+{
+       const char *result;
+
+       switch (cmd) {
+       case READ_CMD:
+               result = "READ";
+               break;
+       case WRITE_CMD:
+               result = "WRITE";
+               break;
+       case FSYNC_CMD:
+               result = "FSYNC";
+               break;
+       default:
+               result = "<UNKNOWN>";
+               break;
+       }
+       return result;
+}
+
 struct rw_cmd {
        size_t n;
-       SMB_OFF_T offset;
-       bool read_cmd;
+       off_t offset;
+       enum cmd_type cmd;
+       bool erratic_testing_mode;
 };
 
 struct rw_ret {
        ssize_t size;
        int ret_errno;
+       uint64_t duration;
 };
 
 struct aio_child_list;
@@ -82,139 +140,87 @@ struct aio_child_list;
 struct aio_child {
        struct aio_child *prev, *next;
        struct aio_child_list *list;
-       SMB_STRUCT_AIOCB *aiocb;
        pid_t pid;
        int sockfd;
-       struct fd_event *sock_event;
-       struct rw_ret retval;
-       struct mmap_area *map;  /* ==NULL means write request */
+       struct mmap_area *map;
        bool dont_delete;       /* Marked as in use since last cleanup */
-       bool cancelled;
-       bool read_cmd;
+       bool busy;
 };
 
 struct aio_child_list {
        struct aio_child *children;
-       struct timed_event *cleanup_event;
+       struct tevent_timer *cleanup_event;
 };
 
-static void free_aio_children(void **p)
-{
-       TALLOC_FREE(*p);
-}
-
 static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
 {
-       struct msghdr msg;
        struct iovec iov[1];
+       struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1 };
        ssize_t n;
-#ifndef HAVE_MSGHDR_MSG_CONTROL
-       int newfd;
-#endif
+       size_t bufsize = msghdr_prep_recv_fds(NULL, NULL, 0, 1);
+       uint8_t buf[bufsize];
 
-#ifdef HAVE_MSGHDR_MSG_CONTROL
-       union {
-         struct cmsghdr        cm;
-         char                          control[CMSG_SPACE(sizeof(int))];
-       } control_un;
-       struct cmsghdr  *cmptr;
-
-       msg.msg_control = control_un.control;
-       msg.msg_controllen = sizeof(control_un.control);
-#else
-#if HAVE_MSGHDR_MSG_ACCTRIGHTS
-       msg.msg_accrights = (caddr_t) &newfd;
-       msg.msg_accrightslen = sizeof(int);
-#else
-#error Can not pass file descriptors
-#endif
-#endif
-
-       msg.msg_name = NULL;
-       msg.msg_namelen = 0;
+       msghdr_prep_recv_fds(&msg, buf, bufsize, 1);
 
-       iov[0].iov_base = ptr;
+       iov[0].iov_base = (void *)ptr;
        iov[0].iov_len = nbytes;
-       msg.msg_iov = iov;
-       msg.msg_iovlen = 1;
 
-       if ( (n = recvmsg(fd, &msg, 0)) <= 0) {
-               return(n);
+       do {
+               n = recvmsg(fd, &msg, 0);
+       } while ((n == -1) && (errno == EINTR));
+
+       if (n <= 0) {
+               return n;
        }
 
-#ifdef HAVE_MSGHDR_MSG_CONTROL
-       if ((cmptr = CMSG_FIRSTHDR(&msg)) != NULL
-           && cmptr->cmsg_len == CMSG_LEN(sizeof(int))) {
-               if (cmptr->cmsg_level != SOL_SOCKET) {
-                       DEBUG(10, ("control level != SOL_SOCKET"));
-                       errno = EINVAL;
-                       return -1;
-               }
-               if (cmptr->cmsg_type != SCM_RIGHTS) {
-                       DEBUG(10, ("control type != SCM_RIGHTS"));
-                       errno = EINVAL;
-                       return -1;
+       {
+               size_t num_fds = msghdr_extract_fds(&msg, NULL, 0);
+               int fds[num_fds];
+
+               msghdr_extract_fds(&msg, fds, num_fds);
+
+               if (num_fds != 1) {
+                       size_t i;
+
+                       for (i=0; i<num_fds; i++) {
+                               close(fds[i]);
+                       }
+
+                       *recvfd = -1;
+                       return n;
                }
-               *recvfd = *((int *) CMSG_DATA(cmptr));
-       } else {
-               *recvfd = -1;           /* descriptor was not passed */
-       }
-#else
-       if (msg.msg_accrightslen == sizeof(int)) {
-               *recvfd = newfd;
-       }
-       else {
-               *recvfd = -1;           /* descriptor was not passed */
+
+               *recvfd = fds[0];
        }
-#endif
 
        return(n);
 }
 
 static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
 {
-       struct msghdr   msg;
-       struct iovec    iov[1];
-
-#ifdef HAVE_MSGHDR_MSG_CONTROL
-       union {
-               struct cmsghdr  cm;
-               char control[CMSG_SPACE(sizeof(int))];
-       } control_un;
-       struct cmsghdr  *cmptr;
-
-       ZERO_STRUCT(msg);
-       ZERO_STRUCT(control_un);
-
-       msg.msg_control = control_un.control;
-       msg.msg_controllen = sizeof(control_un.control);
-
-       cmptr = CMSG_FIRSTHDR(&msg);
-       cmptr->cmsg_len = CMSG_LEN(sizeof(int));
-       cmptr->cmsg_level = SOL_SOCKET;
-       cmptr->cmsg_type = SCM_RIGHTS;
-       *((int *) CMSG_DATA(cmptr)) = sendfd;
-#else
-       ZERO_STRUCT(msg);
-       msg.msg_accrights = (caddr_t) &sendfd;
-       msg.msg_accrightslen = sizeof(int);
-#endif
+       struct msghdr msg = {0};
+       size_t bufsize = msghdr_prep_fds(NULL, NULL, 0, &sendfd, 1);
+       uint8_t buf[bufsize];
+       struct iovec iov;
+       ssize_t sent;
 
-       msg.msg_name = NULL;
-       msg.msg_namelen = 0;
+       msghdr_prep_fds(&msg, buf, bufsize, &sendfd, 1);
 
-       ZERO_STRUCT(iov);
-       iov[0].iov_base = ptr;
-       iov[0].iov_len = nbytes;
-       msg.msg_iov = iov;
+       iov.iov_base = (void *)ptr;
+       iov.iov_len = nbytes;
+       msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
 
-       return (sendmsg(fd, &msg, 0));
+       do {
+               sent = sendmsg(fd, &msg, 0);
+       } while ((sent == -1) && (errno == EINTR));
+
+       return sent;
 }
 
-static void aio_child_cleanup(struct event_context *event_ctx,
-                             struct timed_event *te,
-                             const struct timeval *now,
+static void aio_child_cleanup(struct tevent_context *event_ctx,
+                             struct tevent_timer *te,
+                             struct timeval now,
                              void *private_data)
 {
        struct aio_child_list *list = talloc_get_type_abort(
@@ -226,7 +232,7 @@ static void aio_child_cleanup(struct event_context *event_ctx,
        for (child = list->children; child != NULL; child = next) {
                next = child->next;
 
-               if (child->aiocb != NULL) {
+               if (child->busy) {
                        DEBUG(10, ("child %d currently active\n",
                                   (int)child->pid));
                        continue;
@@ -243,15 +249,15 @@ static void aio_child_cleanup(struct event_context *event_ctx,
                           "deleting\n", (int)child->pid));
 
                TALLOC_FREE(child);
+               child = next;
        }
 
        if (list->children != NULL) {
                /*
                 * Re-schedule the next cleanup round
                 */
-               list->cleanup_event = event_add_timed(smbd_event_context(), list,
-                                                     timeval_add(now, 30, 0),
-                                                     "aio_child_cleanup",
+               list->cleanup_event = tevent_add_timer(server_event_context(), list,
+                                                     timeval_add(&now, 30, 0),
                                                      aio_child_cleanup, list);
 
        }
@@ -259,19 +265,19 @@ static void aio_child_cleanup(struct event_context *event_ctx,
 
 static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle)
 {
-       struct aio_child_list *data = NULL;
+       struct aio_fork_config *config;
+       struct aio_child_list *children;
 
-       if (SMB_VFS_HANDLE_TEST_DATA(handle)) {
-               SMB_VFS_HANDLE_GET_DATA(handle, data, struct aio_child_list,
-                                       return NULL);
-       }
+       SMB_VFS_HANDLE_GET_DATA(handle, config, struct aio_fork_config,
+                               return NULL);
 
-       if (data == NULL) {
-               data = TALLOC_ZERO_P(NULL, struct aio_child_list);
-               if (data == NULL) {
+       if (config->children == NULL) {
+               config->children = talloc_zero(config, struct aio_child_list);
+               if (config->children == NULL) {
                        return NULL;
                }
        }
+       children = config->children;
 
        /*
         * Regardless of whether the child_list had been around or not, make
@@ -279,23 +285,18 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle
         * delete itself when it finds that no children are around anymore.
         */
 
-       if (data->cleanup_event == NULL) {
-               data->cleanup_event = event_add_timed(smbd_event_context(), data,
-                                                     timeval_current_ofs(30, 0),
-                                                     "aio_child_cleanup",
-                                                     aio_child_cleanup, data);
-               if (data->cleanup_event == NULL) {
-                       TALLOC_FREE(data);
+       if (children->cleanup_event == NULL) {
+               children->cleanup_event =
+                       tevent_add_timer(server_event_context(), children,
+                                        timeval_current_ofs(30, 0),
+                                        aio_child_cleanup, children);
+               if (children->cleanup_event == NULL) {
+                       TALLOC_FREE(config->children);
                        return NULL;
                }
        }
 
-       if (!SMB_VFS_HANDLE_TEST_DATA(handle)) {
-               SMB_VFS_HANDLE_SET_DATA(handle, data, free_aio_children,
-                                       struct aio_child_list, return False);
-       }
-
-       return data;
+       return children;
 }
 
 static void aio_child_loop(int sockfd, struct mmap_area *map)
@@ -305,6 +306,7 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                ssize_t ret;
                struct rw_cmd cmd_struct;
                struct rw_ret ret_struct;
+               struct timespec start, end;
 
                ret = read_fd(sockfd, &cmd_struct, sizeof(cmd_struct), &fd);
                if (ret != sizeof(cmd_struct)) {
@@ -314,13 +316,12 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                }
 
                DEBUG(10, ("aio_child_loop: %s %d bytes at %d from fd %d\n",
-                          cmd_struct.read_cmd ? "read" : "write",
+                          cmd_type_str(cmd_struct.cmd),
                           (int)cmd_struct.n, (int)cmd_struct.offset, fd));
 
-#ifdef ENABLE_BUILD_FARM_HACKS
-               {
+               if (cmd_struct.erratic_testing_mode) {
                        /*
-                        * In the build farm, we want erratic behaviour for
+                        * For developer testing, we want erratic behaviour for
                         * async I/O times
                         */
                        uint8_t randval;
@@ -334,22 +335,38 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                        DEBUG(10, ("delaying for %u msecs\n", msecs));
                        smb_msleep(msecs);
                }
-#endif
-
 
                ZERO_STRUCT(ret_struct);
 
-               if (cmd_struct.read_cmd) {
+               PROFILE_TIMESTAMP(&start);
+
+               switch (cmd_struct.cmd) {
+               case READ_CMD:
                        ret_struct.size = sys_pread(
-                               fd, (void *)map->ptr, cmd_struct.n,
+                               fd, discard_const(map->ptr), cmd_struct.n,
                                cmd_struct.offset);
-               }
-               else {
+#if 0
+/* This breaks "make test" when run with aio_fork module. */
+#ifdef DEVELOPER
+                       ret_struct.size = MAX(1, ret_struct.size * 0.9);
+#endif
+#endif
+                       break;
+               case WRITE_CMD:
                        ret_struct.size = sys_pwrite(
-                               fd, (void *)map->ptr, cmd_struct.n,
+                               fd, discard_const(map->ptr), cmd_struct.n,
                                cmd_struct.offset);
+                       break;
+               case FSYNC_CMD:
+                       ret_struct.size = fsync(fd);
+                       break;
+               default:
+                       ret_struct.size = -1;
+                       errno = EINVAL;
                }
 
+               PROFILE_TIMESTAMP(&end);
+               ret_struct.duration = nsec_time_diff(&end, &start);
                DEBUG(10, ("aio_child_loop: syscall returned %d\n",
                           (int)ret_struct.size));
 
@@ -357,6 +374,14 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                        ret_struct.ret_errno = errno;
                }
 
+               /*
+                * Close the fd before telling our parent we're done. The
+                * parent might close and re-open the file very quickly, and
+                * with system-level share modes (GPFS) we would get an
+                * unjustified SHARING_VIOLATION.
+                */
+               close(fd);
+
                ret = write_data(sockfd, (char *)&ret_struct,
                                 sizeof(ret_struct));
                if (ret != sizeof(ret_struct)) {
@@ -364,77 +389,62 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                                   strerror(errno)));
                        exit(2);
                }
-
-               close(fd);
        }
 }
 
-static void handle_aio_completion(struct event_context *event_ctx,
-                                 struct fd_event *event, uint16 flags,
-                                 void *p)
+static int aio_child_destructor(struct aio_child *child)
 {
-       struct aio_child *child = (struct aio_child *)p;
-       uint16 mid;
-
-       DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags));
-
-       if ((flags & EVENT_FD_READ) == 0) {
-               return;
-       }
-
-       if (!NT_STATUS_IS_OK(read_data(child->sockfd,
-                                      (char *)&child->retval,
-                                      sizeof(child->retval)))) {
-               DEBUG(0, ("aio child %d died\n", (int)child->pid));
-               child->retval.size = -1;
-               child->retval.ret_errno = EIO;
-       }
+       char c=0;
 
-       if (child->cancelled) {
-               child->aiocb = NULL;
-               child->cancelled = false;
-               return;
-       }
+       SMB_ASSERT(!child->busy);
 
-       if (child->read_cmd && (child->retval.size > 0)) {
-               SMB_ASSERT(child->retval.size <= child->aiocb->aio_nbytes);
-               memcpy((void *)child->aiocb->aio_buf, (void *)child->map->ptr,
-                      child->retval.size);
-       }
+       DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n",
+                  (int)child->pid, child->sockfd));
 
-       mid = child->aiocb->aio_sigevent.sigev_value.sival_int;
-
-       DEBUG(10, ("mid %d finished\n", (int)mid));
-
-       aio_request_done(mid);
-       process_aio_queue();
-}
-
-static int aio_child_destructor(struct aio_child *child)
-{
-       SMB_ASSERT((child->aiocb == NULL) || child->cancelled);
+       /*
+        * closing the sockfd makes the child not return from recvmsg() on RHEL
+        * 5.5 so instead force the child to exit by writing bad data to it
+        */
+       sys_write_v(child->sockfd, &c, sizeof(c));
        close(child->sockfd);
        DLIST_REMOVE(child->list->children, child);
        return 0;
 }
 
-static NTSTATUS create_aio_child(struct aio_child_list *children,
-                                size_t map_size,
-                                struct aio_child **presult)
+/*
+ * We have to close all fd's in open files, we might incorrectly hold a system
+ * level share mode on a file.
+ */
+
+static struct files_struct *close_fsp_fd(struct files_struct *fsp,
+                                        void *private_data)
+{
+       if ((fsp->fh != NULL) && (fsp->fh->fd != -1)) {
+               close(fsp->fh->fd);
+               fsp->fh->fd = -1;
+       }
+       return NULL;
+}
+
+static int create_aio_child(struct smbd_server_connection *sconn,
+                           struct aio_child_list *children,
+                           size_t map_size,
+                           struct aio_child **presult)
 {
        struct aio_child *result;
        int fdpair[2];
-       NTSTATUS status;
+       int ret;
 
        fdpair[0] = fdpair[1] = -1;
 
-       result = TALLOC_ZERO_P(children, struct aio_child);
-       NT_STATUS_HAVE_NO_MEMORY(result);
+       result = talloc_zero(children, struct aio_child);
+       if (result == NULL) {
+               return ENOMEM;
+       }
 
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(10, ("socketpair() failed: %s\n", strerror(errno)));
-               TALLOC_FREE(result);
                goto fail;
        }
 
@@ -442,14 +452,14 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
 
        result->map = mmap_area_init(result, map_size);
        if (result->map == NULL) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(0, ("Could not create mmap area\n"));
                goto fail;
        }
 
-       result->pid = sys_fork();
+       result->pid = fork();
        if (result->pid == -1) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(0, ("fork failed: %s\n", strerror(errno)));
                goto fail;
        }
@@ -457,24 +467,16 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
        if (result->pid == 0) {
                close(fdpair[0]);
                result->sockfd = fdpair[1];
+               files_forall(sconn, close_fsp_fd, NULL);
                aio_child_loop(result->sockfd, result->map);
        }
 
-       DEBUG(10, ("Child %d created\n", result->pid));
+       DEBUG(10, ("Child %d created with sockfd %d\n",
+                  (int)result->pid, fdpair[0]));
 
        result->sockfd = fdpair[0];
        close(fdpair[1]);
 
-       result->sock_event = event_add_fd(smbd_event_context(), result,
-                                         result->sockfd, EVENT_FD_READ,
-                                         handle_aio_completion,
-                                         result);
-       if (result->sock_event == NULL) {
-               status = NT_STATUS_NO_MEMORY;
-               DEBUG(0, ("event_add_fd failed\n"));
-               goto fail;
-       }
-
        result->list = children;
        DLIST_ADD(children->children, result);
 
@@ -482,247 +484,438 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
 
        *presult = result;
 
-       return NT_STATUS_OK;
+       return 0;
 
  fail:
        if (fdpair[0] != -1) close(fdpair[0]);
        if (fdpair[1] != -1) close(fdpair[1]);
        TALLOC_FREE(result);
 
-       return status;
+       return ret;
 }
 
-static NTSTATUS get_idle_child(struct vfs_handle_struct *handle,
-                              struct aio_child **pchild)
+static int get_idle_child(struct vfs_handle_struct *handle,
+                         struct aio_child **pchild)
 {
        struct aio_child_list *children;
        struct aio_child *child;
-       NTSTATUS status;
 
        children = init_aio_children(handle);
        if (children == NULL) {
-               return NT_STATUS_NO_MEMORY;
+               return ENOMEM;
        }
 
        for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == NULL) {
-                       /* idle */
+               if (!child->busy) {
                        break;
                }
        }
 
        if (child == NULL) {
+               int ret;
+
                DEBUG(10, ("no idle child found, creating new one\n"));
 
-               status = create_aio_child(children, 128*1024, &child);
-               if (!NT_STATUS_IS_OK(status)) {
+               ret = create_aio_child(handle->conn->sconn, children,
+                                         128*1024, &child);
+               if (ret != 0) {
                        DEBUG(10, ("create_aio_child failed: %s\n",
-                                  nt_errstr(status)));
-                       return status;
+                                  strerror(errno)));
+                       return ret;
                }
        }
 
        child->dont_delete = true;
+       child->busy = true;
 
        *pchild = child;
-       return NT_STATUS_OK;
+       return 0;
 }
 
-static int aio_fork_read(struct vfs_handle_struct *handle,
-                        struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
-{
+struct aio_fork_pread_state {
        struct aio_child *child;
-       struct rw_cmd cmd;
        ssize_t ret;
-       NTSTATUS status;
+       struct vfs_aio_state vfs_aio_state;
+};
 
-       if (aiocb->aio_nbytes > 128*1024) {
-               /* TODO: support variable buffers */
-               errno = EINVAL;
-               return -1;
+static void aio_fork_pread_done(struct tevent_req *subreq);
+
+static struct tevent_req *aio_fork_pread_send(struct vfs_handle_struct *handle,
+                                             TALLOC_CTX *mem_ctx,
+                                             struct tevent_context *ev,
+                                             struct files_struct *fsp,
+                                             void *data,
+                                             size_t n, off_t offset)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_pread_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+       struct aio_fork_config *config;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, config,
+                               struct aio_fork_config,
+                               return NULL);
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_pread_state);
+       if (req == NULL) {
+               return NULL;
        }
 
-       status = get_idle_child(handle, &child);
-       if (!NT_STATUS_IS_OK(status)) {
-               DEBUG(10, ("Could not get an idle child\n"));
-               return -1;
+       if (n > 128*1024) {
+               /* TODO: support variable buffers */
+               tevent_req_error(req, EINVAL);
+               return tevent_req_post(req, ev);
        }
 
-       child->read_cmd = true;
-       child->aiocb = aiocb;
-       child->retval.ret_errno = EINPROGRESS;
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
        ZERO_STRUCT(cmd);
-       cmd.n = aiocb->aio_nbytes;
-       cmd.offset = aiocb->aio_offset;
-       cmd.read_cmd = child->read_cmd;
+       cmd.n = n;
+       cmd.offset = offset;
+       cmd.cmd = READ_CMD;
+       cmd.erratic_testing_mode = config->erratic_testing_mode;
 
        DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
-                  (int)child->pid));
+                  (int)state->child->pid));
 
-       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
-       if (ret == -1) {
-               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
-               return -1;
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
        }
 
-       return 0;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_pread_done, req);
+       return req;
 }
 
-static int aio_fork_write(struct vfs_handle_struct *handle,
-                         struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
+static void aio_fork_pread_done(struct tevent_req *subreq)
 {
-       struct aio_child *child;
-       struct rw_cmd cmd;
-       ssize_t ret;
-       NTSTATUS status;
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_pread_state *state = tevent_req_data(
+               req, struct aio_fork_pread_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
+       }
 
-       if (aiocb->aio_nbytes > 128*1024) {
-               /* TODO: support variable buffers */
-               errno = EINVAL;
+       state->child->busy = false;
+
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->vfs_aio_state.error = retbuf->ret_errno;
+       state->vfs_aio_state.duration = retbuf->duration;
+       tevent_req_done(req);
+}
+
+static ssize_t aio_fork_pread_recv(struct tevent_req *req,
+                                  struct vfs_aio_state *vfs_aio_state)
+{
+       struct aio_fork_pread_state *state = tevent_req_data(
+               req, struct aio_fork_pread_state);
+
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                return -1;
        }
+       *vfs_aio_state = state->vfs_aio_state;
+       return state->ret;
+}
 
-       status = get_idle_child(handle, &child);
-       if (!NT_STATUS_IS_OK(status)) {
-               DEBUG(10, ("Could not get an idle child\n"));
-               return -1;
+struct aio_fork_pwrite_state {
+       struct aio_child *child;
+       ssize_t ret;
+       struct vfs_aio_state vfs_aio_state;
+};
+
+static void aio_fork_pwrite_done(struct tevent_req *subreq);
+
+static struct tevent_req *aio_fork_pwrite_send(
+       struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
+       struct tevent_context *ev, struct files_struct *fsp,
+       const void *data, size_t n, off_t offset)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_pwrite_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+       struct aio_fork_config *config;
+       SMB_VFS_HANDLE_GET_DATA(handle, config,
+                               struct aio_fork_config,
+                               return NULL);
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_pwrite_state);
+       if (req == NULL) {
+               return NULL;
        }
 
-       child->read_cmd = false;
-       child->aiocb = aiocb;
-       child->retval.ret_errno = EINPROGRESS;
+       if (n > 128*1024) {
+               /* TODO: support variable buffers */
+               tevent_req_error(req, EINVAL);
+               return tevent_req_post(req, ev);
+       }
 
-       memcpy((void *)child->map->ptr, (void *)aiocb->aio_buf,
-              aiocb->aio_nbytes);
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
        ZERO_STRUCT(cmd);
-       cmd.n = aiocb->aio_nbytes;
-       cmd.offset = aiocb->aio_offset;
-       cmd.read_cmd = child->read_cmd;
+       cmd.n = n;
+       cmd.offset = offset;
+       cmd.cmd = WRITE_CMD;
+       cmd.erratic_testing_mode = config->erratic_testing_mode;
 
        DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
-                  (int)child->pid));
+                  (int)state->child->pid));
 
-       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
-       if (ret == -1) {
-               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
-               return -1;
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
        }
 
-       return 0;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_pwrite_done, req);
+       return req;
 }
 
-static struct aio_child *aio_fork_find_child(struct vfs_handle_struct *handle,
-                                            SMB_STRUCT_AIOCB *aiocb)
+static void aio_fork_pwrite_done(struct tevent_req *subreq)
 {
-       struct aio_child_list *children;
-       struct aio_child *child;
-
-       children = init_aio_children(handle);
-       if (children == NULL) {
-               return NULL;
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_pwrite_state *state = tevent_req_data(
+               req, struct aio_fork_pwrite_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
        }
 
-       for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == aiocb) {
-                       return child;
-               }
-       }
+       state->child->busy = false;
 
-       return NULL;
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->vfs_aio_state.error = retbuf->ret_errno;
+       state->vfs_aio_state.duration = retbuf->duration;
+       tevent_req_done(req);
 }
 
-static ssize_t aio_fork_return_fn(struct vfs_handle_struct *handle,
-                                 struct files_struct *fsp,
-                                 SMB_STRUCT_AIOCB *aiocb)
+static ssize_t aio_fork_pwrite_recv(struct tevent_req *req,
+                                   struct vfs_aio_state *vfs_aio_state)
 {
-       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+       struct aio_fork_pwrite_state *state = tevent_req_data(
+               req, struct aio_fork_pwrite_state);
 
-       if (child == NULL) {
-               errno = EINVAL;
-               DEBUG(0, ("returning EINVAL\n"));
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
                return -1;
        }
+       *vfs_aio_state = state->vfs_aio_state;
+       return state->ret;
+}
+
+struct aio_fork_fsync_state {
+       struct aio_child *child;
+       ssize_t ret;
+       struct vfs_aio_state vfs_aio_state;
+};
+
+static void aio_fork_fsync_done(struct tevent_req *subreq);
+
+static struct tevent_req *aio_fork_fsync_send(
+       struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
+       struct tevent_context *ev, struct files_struct *fsp)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_fsync_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+       struct aio_fork_config *config;
+
+       SMB_VFS_HANDLE_GET_DATA(handle, config,
+                               struct aio_fork_config,
+                               return NULL);
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_fsync_state);
+       if (req == NULL) {
+               return NULL;
+       }
+
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
-       child->aiocb = NULL;
+       ZERO_STRUCT(cmd);
+       cmd.cmd = FSYNC_CMD;
+       cmd.erratic_testing_mode = config->erratic_testing_mode;
+
+       DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
+                  (int)state->child->pid));
 
-       if (child->retval.size == -1) {
-               errno = child->retval.ret_errno;
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
        }
 
-       return child->retval.size;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_fsync_done, req);
+       return req;
 }
 
-static int aio_fork_cancel(struct vfs_handle_struct *handle,
-                          struct files_struct *fsp,
-                          SMB_STRUCT_AIOCB *aiocb)
+static void aio_fork_fsync_done(struct tevent_req *subreq)
 {
-       struct aio_child_list *children;
-       struct aio_child *child;
-
-       children = init_aio_children(handle);
-       if (children == NULL) {
-               errno = EINVAL;
-               return -1;
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_fsync_state *state = tevent_req_data(
+               req, struct aio_fork_fsync_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
        }
 
-       for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == NULL) {
-                       continue;
-               }
-               if (child->aiocb->aio_fildes != fsp->fh->fd) {
-                       continue;
-               }
-               if ((aiocb != NULL) && (child->aiocb != aiocb)) {
-                       continue;
-               }
+       state->child->busy = false;
 
-               /*
-                * We let the child do its job, but we discard the result when
-                * it's finished.
-                */
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->vfs_aio_state.error = retbuf->ret_errno;
+       state->vfs_aio_state.duration = retbuf->duration;
+       tevent_req_done(req);
+}
 
-               child->cancelled = true;
-       }
+static int aio_fork_fsync_recv(struct tevent_req *req,
+                              struct vfs_aio_state *vfs_aio_state)
+{
+       struct aio_fork_fsync_state *state = tevent_req_data(
+               req, struct aio_fork_fsync_state);
 
-       return AIO_CANCELED;
+       if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
+               return -1;
+       }
+       *vfs_aio_state = state->vfs_aio_state;
+       return state->ret;
 }
 
-static int aio_fork_error_fn(struct vfs_handle_struct *handle,
-                            struct files_struct *fsp,
-                            SMB_STRUCT_AIOCB *aiocb)
+static int aio_fork_connect(vfs_handle_struct *handle, const char *service,
+                           const char *user)
 {
-       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+       int ret;
+       struct aio_fork_config *config;
+       ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
 
-       if (child == NULL) {
-               errno = EINVAL;
+       if (ret < 0) {
+               return ret;
+       }
+
+       config = talloc_zero(handle->conn, struct aio_fork_config);
+       if (!config) {
+               SMB_VFS_NEXT_DISCONNECT(handle);
+               DEBUG(0, ("talloc_zero() failed\n"));
                return -1;
        }
 
-       return child->retval.ret_errno;
+       config->erratic_testing_mode = lp_parm_bool(SNUM(handle->conn), "vfs_aio_fork",
+                                                   "erratic_testing_mode", false);
+       
+       SMB_VFS_HANDLE_SET_DATA(handle, config,
+                               NULL, struct aio_fork_config,
+                               return -1);
+
+       return 0;
 }
 
-/* VFS operations structure */
-
-static vfs_op_tuple aio_fork_ops[] = {
-       {SMB_VFS_OP(aio_fork_read),     SMB_VFS_OP_AIO_READ,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_write),    SMB_VFS_OP_AIO_WRITE,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_return_fn), SMB_VFS_OP_AIO_RETURN,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_cancel),   SMB_VFS_OP_AIO_CANCEL,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_error_fn), SMB_VFS_OP_AIO_ERROR,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(NULL),              SMB_VFS_OP_NOOP,
-        SMB_VFS_LAYER_NOOP}
+static struct vfs_fn_pointers vfs_aio_fork_fns = {
+       .connect_fn = aio_fork_connect,
+       .pread_send_fn = aio_fork_pread_send,
+       .pread_recv_fn = aio_fork_pread_recv,
+       .pwrite_send_fn = aio_fork_pwrite_send,
+       .pwrite_recv_fn = aio_fork_pwrite_recv,
+       .fsync_send_fn = aio_fork_fsync_send,
+       .fsync_recv_fn = aio_fork_fsync_recv,
 };
 
-NTSTATUS vfs_aio_fork_init(void);
-NTSTATUS vfs_aio_fork_init(void)
+NTSTATUS vfs_aio_fork_init(TALLOC_CTX *);
+NTSTATUS vfs_aio_fork_init(TALLOC_CTX *ctx)
 {
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
-                               "aio_fork", aio_fork_ops);
+                               "aio_fork", &vfs_aio_fork_fns);
 }