build: Remove special case for the build farm
[samba.git] / source3 / modules / vfs_aio_fork.c
index 8568ec39165d6e250e06d3ab5d279afb42f68240..811d44e6bbe11059e15c3174fb1e8512997a5013 100644 (file)
@@ -2,6 +2,7 @@
  * Simulate the Posix AIO using mmap/fork
  *
  * Copyright (C) Volker Lendecke 2008
+ * Copyright (C) Jeremy Allison 2010
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  */
 
 #include "includes.h"
+#include "system/filesys.h"
+#include "system/shmem.h"
+#include "smbd/smbd.h"
+#include "smbd/globals.h"
+#include "lib/async_req/async_sock.h"
+#include "lib/util/tevent_unix.h"
+
+#undef recvmsg
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
 
 struct mmap_area {
        size_t size;
@@ -68,10 +81,37 @@ fail:
        return NULL;
 }
 
+/* Operation the parent asks an aio child to run on the passed fd. */
+enum cmd_type {
+       READ_CMD,
+       WRITE_CMD,
+       FSYNC_CMD
+};
+
+/*
+ * Map a command code to a constant name for debug output.
+ * Any value outside the enum yields "<UNKNOWN>".
+ */
+static const char *cmd_type_str(enum cmd_type cmd)
+{
+       switch (cmd) {
+       case READ_CMD:
+               return "READ";
+       case WRITE_CMD:
+               return "WRITE";
+       case FSYNC_CMD:
+               return "FSYNC";
+       default:
+               break;
+       }
+       return "<UNKNOWN>";
+}
+
+/* Request the parent sends to an aio child together with the file descriptor. */
 struct rw_cmd {
        size_t n;
-       SMB_OFF_T offset;
-       bool read_cmd;
+       off_t offset;           /* file offset handed to sys_pread/sys_pwrite */
+       enum cmd_type cmd;      /* which operation the child should run */
 };
 
 struct rw_ret {
@@ -84,15 +124,11 @@ struct aio_child_list;
+/* Parent-side bookkeeping for one forked aio helper process. */
 struct aio_child {
        struct aio_child *prev, *next;
        struct aio_child_list *list;
-       SMB_STRUCT_AIOCB *aiocb;
        pid_t pid;
        int sockfd;
-       struct fd_event *sock_event;
-       struct rw_ret retval;
-       struct mmap_area *map;  /* ==NULL means write request */
+       struct mmap_area *map;  /* area the child reads into / writes from (map->ptr) */
        bool dont_delete;       /* Marked as in use since last cleanup */
-       bool cancelled;
-       bool read_cmd;
+       bool busy;              /* set by get_idle_child(), cleared once the reply was read */
 };
 
 struct aio_child_list {
@@ -134,6 +170,7 @@ static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
 
        msg.msg_name = NULL;
        msg.msg_namelen = 0;
+       msg.msg_flags = 0;
 
        iov[0].iov_base = (void *)ptr;
        iov[0].iov_len = nbytes;
@@ -157,7 +194,7 @@ static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
                        errno = EINVAL;
                        return -1;
                }
-               *recvfd = *((int *) CMSG_DATA(cmptr));
+               memcpy(recvfd, CMSG_DATA(cmptr), sizeof(*recvfd));
        } else {
                *recvfd = -1;           /* descriptor was not passed */
        }
@@ -195,7 +232,7 @@ static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
        cmptr->cmsg_len = CMSG_LEN(sizeof(int));
        cmptr->cmsg_level = SOL_SOCKET;
        cmptr->cmsg_type = SCM_RIGHTS;
-       *((int *) CMSG_DATA(cmptr)) = sendfd;
+       memcpy(CMSG_DATA(cmptr), &sendfd, sizeof(sendfd));
 #else
        ZERO_STRUCT(msg);
        msg.msg_accrights = (caddr_t) &sendfd;
@@ -228,7 +265,7 @@ static void aio_child_cleanup(struct event_context *event_ctx,
        for (child = list->children; child != NULL; child = next) {
                next = child->next;
 
-               if (child->aiocb != NULL) {
+               if (child->busy) {
                        DEBUG(10, ("child %d currently active\n",
                                   (int)child->pid));
                        continue;
@@ -245,13 +282,14 @@ static void aio_child_cleanup(struct event_context *event_ctx,
                           "deleting\n", (int)child->pid));
 
                TALLOC_FREE(child);
+               child = next;
        }
 
        if (list->children != NULL) {
                /*
                 * Re-schedule the next cleanup round
                 */
-               list->cleanup_event = event_add_timed(smbd_event_context(), list,
+               list->cleanup_event = event_add_timed(server_event_context(), list,
                                                      timeval_add(&now, 30, 0),
                                                      aio_child_cleanup, list);
 
@@ -268,7 +306,7 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle
        }
 
        if (data == NULL) {
-               data = TALLOC_ZERO_P(NULL, struct aio_child_list);
+               data = talloc_zero(NULL, struct aio_child_list);
                if (data == NULL) {
                        return NULL;
                }
@@ -281,7 +319,7 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle
         */
 
        if (data->cleanup_event == NULL) {
-               data->cleanup_event = event_add_timed(smbd_event_context(), data,
+               data->cleanup_event = event_add_timed(server_event_context(), data,
                                                      timeval_current_ofs(30, 0),
                                                      aio_child_cleanup, data);
                if (data->cleanup_event == NULL) {
@@ -314,13 +352,13 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                }
 
                DEBUG(10, ("aio_child_loop: %s %d bytes at %d from fd %d\n",
-                          cmd_struct.read_cmd ? "read" : "write",
+                          cmd_type_str(cmd_struct.cmd),
                           (int)cmd_struct.n, (int)cmd_struct.offset, fd));
 
-#ifdef ENABLE_BUILD_FARM_HACKS
+#ifdef DEVELOPER
                {
                        /*
-                        * In the build farm, we want erratic behaviour for
+                        * For developer testing, we want erratic behaviour for
                         * async I/O times
                         */
                        uint8_t randval;
@@ -339,15 +377,29 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
 
                ZERO_STRUCT(ret_struct);
 
-               if (cmd_struct.read_cmd) {
+               switch (cmd_struct.cmd) {
+               case READ_CMD:
                        ret_struct.size = sys_pread(
                                fd, (void *)map->ptr, cmd_struct.n,
                                cmd_struct.offset);
-               }
-               else {
+#if 0
+/* This breaks "make test" when run with aio_fork module. */
+#ifdef DEVELOPER
+                       ret_struct.size = MAX(1, ret_struct.size * 0.9);
+#endif
+#endif
+                       break;
+               case WRITE_CMD:
                        ret_struct.size = sys_pwrite(
                                fd, (void *)map->ptr, cmd_struct.n,
                                cmd_struct.offset);
+                       break;
+               case FSYNC_CMD:
+                       ret_struct.size = fsync(fd);
+                       break;
+               default:
+                       ret_struct.size = -1;
+                       errno = EINVAL;
                }
 
                DEBUG(10, ("aio_child_loop: syscall returned %d\n",
@@ -357,6 +409,14 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                        ret_struct.ret_errno = errno;
                }
 
+               /*
+                * Close the fd before telling our parent we're done. The
+                * parent might close and re-open the file very quickly, and
+                * with system-level share modes (GPFS) we would get an
+                * unjustified SHARING_VIOLATION.
+                */
+               close(fd);
+
                ret = write_data(sockfd, (char *)&ret_struct,
                                 sizeof(ret_struct));
                if (ret != sizeof(ret_struct)) {
@@ -364,74 +424,61 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                                   strerror(errno)));
                        exit(2);
                }
-
-               close(fd);
        }
 }
 
-static void handle_aio_completion(struct event_context *event_ctx,
-                                 struct fd_event *event, uint16 flags,
-                                 void *p)
+/*
+ * Destructor for an aio child (presumably registered with
+ * talloc_set_destructor() in a part of the file not shown here).
+ *
+ * Asserts that the child is idle, pokes the child process so that it
+ * exits, closes our end of the socket and unlinks the child from its
+ * list. Always returns 0.
+ */
+static int aio_child_destructor(struct aio_child *child)
 {
-       struct aio_child *child = (struct aio_child *)p;
-       uint16 mid;
+       char c=0;
 
-       DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags));
+       SMB_ASSERT(!child->busy);
 
-       if ((flags & EVENT_FD_READ) == 0) {
-               return;
-       }
+       /* pid_t is not guaranteed to be int, cast it for %d */
+       DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n",
+                       (int)child->pid, child->sockfd));
 
-       if (!NT_STATUS_IS_OK(read_data(child->sockfd,
-                                      (char *)&child->retval,
-                                      sizeof(child->retval)))) {
-               DEBUG(0, ("aio child %d died\n", (int)child->pid));
-               child->retval.size = -1;
-               child->retval.ret_errno = EIO;
-       }
-
-       if (child->cancelled) {
-               child->aiocb = NULL;
-               child->cancelled = false;
-               return;
-       }
-
-       if (child->read_cmd && (child->retval.size > 0)) {
-               SMB_ASSERT(child->retval.size <= child->aiocb->aio_nbytes);
-               memcpy((void *)child->aiocb->aio_buf, (void *)child->map->ptr,
-                      child->retval.size);
-       }
-
-       mid = child->aiocb->aio_sigevent.sigev_value.sival_int;
-
-       DEBUG(10, ("mid %d finished\n", (int)mid));
-
-       smbd_aio_complete_mid(mid);
-}
-
-static int aio_child_destructor(struct aio_child *child)
-{
-       SMB_ASSERT((child->aiocb == NULL) || child->cancelled);
+       /*
+        * closing the sockfd makes the child not return from recvmsg() on RHEL
+        * 5.5 so instead force the child to exit by writing bad data to it
+        */
+       if (write(child->sockfd, &c, sizeof(c)) != (ssize_t)sizeof(c)) {
+               /* Best effort only: we still close the socket below. */
+               DEBUG(10, ("aio_child_destructor: write to child %d "
+                          "failed: %s\n", (int)child->pid,
+                          strerror(errno)));
+       }
        close(child->sockfd);
        DLIST_REMOVE(child->list->children, child);
        return 0;
 }
 
-static NTSTATUS create_aio_child(struct aio_child_list *children,
-                                size_t map_size,
-                                struct aio_child **presult)
+/*
+ * Callback handed to files_forall(): close the file descriptor of one
+ * open file, if it has one, and mark it as closed. We have to do this for
+ * all open files, we might incorrectly hold a system level share mode on
+ * a file.
+ *
+ * private_data is not used. Always returns NULL.
+ */
+
+static struct files_struct *close_fsp_fd(struct files_struct *fsp,
+                                        void *private_data)
+{
+       if (fsp->fh == NULL) {
+               return NULL;
+       }
+       if (fsp->fh->fd == -1) {
+               return NULL;
+       }
+
+       close(fsp->fh->fd);
+       fsp->fh->fd = -1;
+       return NULL;
+}
+
+static int create_aio_child(struct smbd_server_connection *sconn,
+                           struct aio_child_list *children,
+                           size_t map_size,
+                           struct aio_child **presult)
 {
        struct aio_child *result;
        int fdpair[2];
-       NTSTATUS status;
+       int ret;
 
        fdpair[0] = fdpair[1] = -1;
 
-       result = TALLOC_ZERO_P(children, struct aio_child);
-       NT_STATUS_HAVE_NO_MEMORY(result);
+       result = talloc_zero(children, struct aio_child);
+       if (result == NULL) {
+               return ENOMEM;
+       }
 
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(10, ("socketpair() failed: %s\n", strerror(errno)));
                goto fail;
        }
@@ -440,14 +487,14 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
 
        result->map = mmap_area_init(result, map_size);
        if (result->map == NULL) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(0, ("Could not create mmap area\n"));
                goto fail;
        }
 
-       result->pid = sys_fork();
+       result->pid = fork();
        if (result->pid == -1) {
-               status = map_nt_error_from_unix(errno);
+               ret = errno;
                DEBUG(0, ("fork failed: %s\n", strerror(errno)));
                goto fail;
        }
@@ -455,24 +502,16 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
        if (result->pid == 0) {
                close(fdpair[0]);
                result->sockfd = fdpair[1];
+               files_forall(sconn, close_fsp_fd, NULL);
                aio_child_loop(result->sockfd, result->map);
        }
 
-       DEBUG(10, ("Child %d created\n", result->pid));
+       DEBUG(10, ("Child %d created with sockfd %d\n",
+                       result->pid, fdpair[0]));
 
        result->sockfd = fdpair[0];
        close(fdpair[1]);
 
-       result->sock_event = event_add_fd(smbd_event_context(), result,
-                                         result->sockfd, EVENT_FD_READ,
-                                         handle_aio_completion,
-                                         result);
-       if (result->sock_event == NULL) {
-               status = NT_STATUS_NO_MEMORY;
-               DEBUG(0, ("event_add_fd failed\n"));
-               goto fail;
-       }
-
        result->list = children;
        DLIST_ADD(children->children, result);
 
@@ -480,247 +519,410 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
 
        *presult = result;
 
-       return NT_STATUS_OK;
+       return 0;
 
  fail:
        if (fdpair[0] != -1) close(fdpair[0]);
        if (fdpair[1] != -1) close(fdpair[1]);
        TALLOC_FREE(result);
 
-       return status;
+       return ret;
 }
 
-static NTSTATUS get_idle_child(struct vfs_handle_struct *handle,
-                              struct aio_child **pchild)
+/*
+ * Hand out an aio child that is not busy, creating a new one (with a
+ * 128k map) if every existing child is busy.
+ *
+ * On success the child is marked busy and dont_delete, stored in *pchild,
+ * and 0 is returned. On failure an errno value is returned: ENOMEM if the
+ * child list could not be set up, otherwise the create_aio_child() error.
+ */
+static int get_idle_child(struct vfs_handle_struct *handle,
+                         struct aio_child **pchild)
 {
        struct aio_child_list *children;
        struct aio_child *child;
-       NTSTATUS status;
 
        children = init_aio_children(handle);
        if (children == NULL) {
-               return NT_STATUS_NO_MEMORY;
+               return ENOMEM;
        }
 
        for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == NULL) {
-                       /* idle */
+               if (!child->busy) {
                        break;
                }
        }
 
        if (child == NULL) {
+               int ret;
+
                DEBUG(10, ("no idle child found, creating new one\n"));
 
-               status = create_aio_child(children, 128*1024, &child);
-               if (!NT_STATUS_IS_OK(status)) {
+               ret = create_aio_child(handle->conn->sconn, children,
+                                         128*1024, &child);
+               if (ret != 0) {
+                       /*
+                        * Report the returned error code: errno may have
+                        * been overwritten by create_aio_child()'s cleanup.
+                        */
                        DEBUG(10, ("create_aio_child failed: %s\n",
-                                  nt_errstr(status)));
-                       return status;
+                                  strerror(ret)));
+                       return ret;
                }
        }
 
        child->dont_delete = true;
+       child->busy = true;
 
        *pchild = child;
-       return NT_STATUS_OK;
+       return 0;
 }
 
-static int aio_fork_read(struct vfs_handle_struct *handle,
-                        struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
-{
+/*
+ * Per-request state of an async pread that was handed to an aio child.
+ */
+struct aio_fork_pread_state {
        struct aio_child *child;
-       struct rw_cmd cmd;
+       size_t n;               /* number of bytes the caller asked for */
+       void *data;             /* caller's buffer, filled on completion */
        ssize_t ret;
-       NTSTATUS status;
+       int err;
+};
 
-       if (aiocb->aio_nbytes > 128*1024) {
-               /* TODO: support variable buffers */
-               errno = EINVAL;
-               return -1;
+static void aio_fork_pread_done(struct tevent_req *subreq);
+
+/*
+ * Start an async pread of n bytes at offset from fsp's fd into data.
+ *
+ * The fd and a READ_CMD are sent to an idle aio child; the child's reply
+ * is picked up in aio_fork_pread_done(), which copies the bytes from the
+ * child's map into data.
+ *
+ * Returns NULL if the request could not be created. Otherwise returns a
+ * request that may already have failed: EINVAL for n > 128k, the
+ * get_idle_child() error, the write_fd() errno, or out of memory.
+ */
+static struct tevent_req *aio_fork_pread_send(struct vfs_handle_struct *handle,
+                                             TALLOC_CTX *mem_ctx,
+                                             struct tevent_context *ev,
+                                             struct files_struct *fsp,
+                                             void *data,
+                                             size_t n, off_t offset)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_pread_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_pread_state);
+       if (req == NULL) {
+               return NULL;
        }
+       /* Remember where the result has to go once the child replies. */
+       state->n = n;
+       state->data = data;
 
-       status = get_idle_child(handle, &child);
-       if (!NT_STATUS_IS_OK(status)) {
-               DEBUG(10, ("Could not get an idle child\n"));
-               return -1;
+       if (n > 128*1024) {
+               /* TODO: support variable buffers */
+               tevent_req_error(req, EINVAL);
+               return tevent_req_post(req, ev);
        }
 
-       child->read_cmd = true;
-       child->aiocb = aiocb;
-       child->retval.ret_errno = EINPROGRESS;
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
        ZERO_STRUCT(cmd);
-       cmd.n = aiocb->aio_nbytes;
-       cmd.offset = aiocb->aio_offset;
-       cmd.read_cmd = child->read_cmd;
+       cmd.n = n;
+       cmd.offset = offset;
+       cmd.cmd = READ_CMD;
 
        DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
-                  (int)child->pid));
+                  (int)state->child->pid));
 
-       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
-       if (ret == -1) {
-               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
-               return -1;
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
        }
 
-       return 0;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_pread_done, req);
+       return req;
 }
 
-static int aio_fork_write(struct vfs_handle_struct *handle,
-                         struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb)
+/*
+ * Completion of the child's reply for a pread: record the result, copy
+ * the bytes the child read out of its map into the caller's buffer and
+ * release the child for reuse.
+ */
+static void aio_fork_pread_done(struct tevent_req *subreq)
 {
-       struct aio_child *child;
-       struct rw_cmd cmd;
-       ssize_t ret;
-       NTSTATUS status;
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_pread_state *state = tevent_req_data(
+               req, struct aio_fork_pread_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
+       }
 
-       if (aiocb->aio_nbytes > 128*1024) {
-               /* TODO: support variable buffers */
-               errno = EINVAL;
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->err = retbuf->ret_errno;
+
+       if (state->ret > 0) {
+               if ((size_t)state->ret > state->n) {
+                       /*
+                        * The child claims more bytes than were asked
+                        * for: never overrun the caller's buffer, and do
+                        * not reuse this child.
+                        */
+                       state->child->busy = false;
+                       TALLOC_FREE(state->child);
+                       tevent_req_error(req, EIO);
+                       return;
+               }
+               /* The child read into its map, hand the bytes to the caller. */
+               memcpy(state->data, (void *)state->child->map->ptr,
+                      state->ret);
+       }
+
+       state->child->busy = false;
+       tevent_req_done(req);
+}
+
+/*
+ * Collect the result of an async pread.
+ *
+ * Returns -1 with *err set if the request itself failed or if the child
+ * reported -1 (then *err is the errno the child sent back); otherwise
+ * returns the size the child reported.
+ */
+static ssize_t aio_fork_pread_recv(struct tevent_req *req, int *err)
+{
+       struct aio_fork_pread_state *state =
+               tevent_req_data(req, struct aio_fork_pread_state);
+
+       if (tevent_req_is_unix_error(req, err)) {
                return -1;
        }
+       if (state->ret != -1) {
+               return state->ret;
+       }
+       *err = state->err;
+       return -1;
+}
 
-       status = get_idle_child(handle, &child);
-       if (!NT_STATUS_IS_OK(status)) {
-               DEBUG(10, ("Could not get an idle child\n"));
-               return -1;
+/*
+ * Per-request state of an async pwrite that was handed to an aio child.
+ */
+struct aio_fork_pwrite_state {
+       struct aio_child *child;
+       ssize_t ret;
+       int err;
+};
+
+static void aio_fork_pwrite_done(struct tevent_req *subreq);
+
+/*
+ * Start an async pwrite of n bytes from data at offset to fsp's fd.
+ *
+ * The payload is copied into the map of an idle aio child, then the fd
+ * and a WRITE_CMD are sent to that child; the child's reply is picked up
+ * in aio_fork_pwrite_done().
+ *
+ * Returns NULL if the request could not be created. Otherwise returns a
+ * request that may already have failed: EINVAL for n > 128k, the
+ * get_idle_child() error, the write_fd() errno, or out of memory.
+ */
+static struct tevent_req *aio_fork_pwrite_send(
+       struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
+       struct tevent_context *ev, struct files_struct *fsp,
+       const void *data, size_t n, off_t offset)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_pwrite_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_pwrite_state);
+       if (req == NULL) {
+               return NULL;
        }
 
-       child->read_cmd = false;
-       child->aiocb = aiocb;
-       child->retval.ret_errno = EINPROGRESS;
+       if (n > 128*1024) {
+               /* TODO: support variable buffers */
+               tevent_req_error(req, EINVAL);
+               return tevent_req_post(req, ev);
+       }
 
-       memcpy((void *)child->map->ptr, (void *)aiocb->aio_buf,
-              aiocb->aio_nbytes);
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
+
+       /*
+        * The child writes out of its map (map->ptr), so the payload has
+        * to be in place before the command is sent.
+        */
+       memcpy((void *)state->child->map->ptr, data, n);
 
        ZERO_STRUCT(cmd);
-       cmd.n = aiocb->aio_nbytes;
-       cmd.offset = aiocb->aio_offset;
-       cmd.read_cmd = child->read_cmd;
+       cmd.n = n;
+       cmd.offset = offset;
+       cmd.cmd = WRITE_CMD;
 
        DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
-                  (int)child->pid));
+                  (int)state->child->pid));
 
-       ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd);
-       if (ret == -1) {
-               DEBUG(10, ("write_fd failed: %s\n", strerror(errno)));
-               return -1;
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
        }
 
-       return 0;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_pwrite_done, req);
+       return req;
 }
 
-static struct aio_child *aio_fork_find_child(struct vfs_handle_struct *handle,
-                                            SMB_STRUCT_AIOCB *aiocb)
+/*
+ * Completion of the child's reply for a pwrite: record the size and errno
+ * the child reported and release the child for reuse. If the reply could
+ * not be read, the child is dropped and the request fails with that error.
+ */
+static void aio_fork_pwrite_done(struct tevent_req *subreq)
 {
-       struct aio_child_list *children;
-       struct aio_child *child;
-
-       children = init_aio_children(handle);
-       if (children == NULL) {
-               return NULL;
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_pwrite_state *state = tevent_req_data(
+               req, struct aio_fork_pwrite_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
        }
 
-       for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == aiocb) {
-                       return child;
-               }
-       }
+       state->child->busy = false;
 
-       return NULL;
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->err = retbuf->ret_errno;
+       tevent_req_done(req);
 }
 
-static ssize_t aio_fork_return_fn(struct vfs_handle_struct *handle,
-                                 struct files_struct *fsp,
-                                 SMB_STRUCT_AIOCB *aiocb)
+/*
+ * Collect the result of an async pwrite.
+ *
+ * Returns -1 with *err set if the request itself failed or if the child
+ * reported -1 (then *err is the errno the child sent back); otherwise
+ * returns the size the child reported.
+ */
+static ssize_t aio_fork_pwrite_recv(struct tevent_req *req, int *err)
 {
-       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+       struct aio_fork_pwrite_state *state =
+               tevent_req_data(req, struct aio_fork_pwrite_state);
 
-       if (child == NULL) {
-               errno = EINVAL;
-               DEBUG(0, ("returning EINVAL\n"));
+       if (tevent_req_is_unix_error(req, err)) {
                return -1;
        }
-
-       child->aiocb = NULL;
-
-       if (child->retval.size == -1) {
-               errno = child->retval.ret_errno;
+       if (state->ret != -1) {
+               return state->ret;
        }
-
-       return child->retval.size;
+       *err = state->err;
+       return -1;
 }
 
-static int aio_fork_cancel(struct vfs_handle_struct *handle,
-                          struct files_struct *fsp,
-                          SMB_STRUCT_AIOCB *aiocb)
-{
-       struct aio_child_list *children;
+/*
+ * Per-request state of an async fsync that was handed to an aio child.
+ */
+struct aio_fork_fsync_state {
        struct aio_child *child;
+       ssize_t ret;
+       int err;
+};
 
-       children = init_aio_children(handle);
-       if (children == NULL) {
-               errno = EINVAL;
-               return -1;
+static void aio_fork_fsync_done(struct tevent_req *subreq);
+
+/*
+ * Start an async fsync of fsp's fd.
+ *
+ * The fd and an FSYNC_CMD are sent to an idle aio child; the child's
+ * reply is picked up in aio_fork_fsync_done().
+ *
+ * Returns NULL if the request could not be created. Otherwise returns a
+ * request that may already have failed: the get_idle_child() error, the
+ * write_fd() errno, or out of memory.
+ */
+static struct tevent_req *aio_fork_fsync_send(
+       struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
+       struct tevent_context *ev, struct files_struct *fsp)
+{
+       struct tevent_req *req, *subreq;
+       struct aio_fork_fsync_state *state;
+       struct rw_cmd cmd;
+       ssize_t written;
+       int err;
+
+       req = tevent_req_create(mem_ctx, &state, struct aio_fork_fsync_state);
+       if (req == NULL) {
+               return NULL;
        }
 
-       for (child = children->children; child != NULL; child = child->next) {
-               if (child->aiocb == NULL) {
-                       continue;
-               }
-               if (child->aiocb->aio_fildes != fsp->fh->fd) {
-                       continue;
-               }
-               if ((aiocb != NULL) && (child->aiocb != aiocb)) {
-                       continue;
-               }
+       err = get_idle_child(handle, &state->child);
+       if (err != 0) {
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
-               /*
-                * We let the child do its job, but we discard the result when
-                * it's finished.
-                */
+       ZERO_STRUCT(cmd);
+       cmd.cmd = FSYNC_CMD;
+
+       DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
+                  (int)state->child->pid));
+
+       /*
+        * Not making this async. We're writing into an empty unix
+        * domain socket. This should never block.
+        */
+       written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
+                          fsp->fh->fd);
+       if (written == -1) {
+               err = errno;
+
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+
+               DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
+               tevent_req_error(req, err);
+               return tevent_req_post(req, ev);
+       }
 
-               child->cancelled = true;
+       subreq = read_packet_send(state, ev, state->child->sockfd,
+                                 sizeof(struct rw_ret), NULL, NULL);
+       if (tevent_req_nomem(subreq, req)) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child); /* we sent sth down */
+               return tevent_req_post(req, ev);
+       }
+       tevent_req_set_callback(subreq, aio_fork_fsync_done, req);
+       return req;
+}
+
+/*
+ * Completion of the child's reply for an fsync: record the result and
+ * errno the child reported and release the child for reuse. If the reply
+ * could not be read, the child is dropped and the request fails with that
+ * error.
+ */
+static void aio_fork_fsync_done(struct tevent_req *subreq)
+{
+       struct tevent_req *req = tevent_req_callback_data(
+               subreq, struct tevent_req);
+       struct aio_fork_fsync_state *state = tevent_req_data(
+               req, struct aio_fork_fsync_state);
+       ssize_t nread;
+       uint8_t *buf;
+       int err;
+       struct rw_ret *retbuf;
+
+       nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
+       TALLOC_FREE(subreq);
+       if (nread == -1) {
+               /* aio_child_destructor() asserts that the child is idle */
+               state->child->busy = false;
+               TALLOC_FREE(state->child);
+               tevent_req_error(req, err);
+               return;
        }
 
-       return AIO_CANCELED;
+       state->child->busy = false;
+
+       retbuf = (struct rw_ret *)buf;
+       state->ret = retbuf->size;
+       state->err = retbuf->ret_errno;
+       tevent_req_done(req);
 }
 
-static int aio_fork_error_fn(struct vfs_handle_struct *handle,
-                            struct files_struct *fsp,
-                            SMB_STRUCT_AIOCB *aiocb)
+/*
+ * Collect the result of an async fsync.
+ *
+ * Returns -1 with *err set if the request itself failed or if the child
+ * reported -1 (then *err is the errno the child sent back); otherwise
+ * returns the value the child reported.
+ */
+static int aio_fork_fsync_recv(struct tevent_req *req, int *err)
 {
-       struct aio_child *child = aio_fork_find_child(handle, aiocb);
+       struct aio_fork_fsync_state *state =
+               tevent_req_data(req, struct aio_fork_fsync_state);
 
-       if (child == NULL) {
-               errno = EINVAL;
+       if (tevent_req_is_unix_error(req, err)) {
                return -1;
        }
+       if (state->ret != -1) {
+               return state->ret;
+       }
+       *err = state->err;
+       return -1;
+}
 
-       return child->retval.ret_errno;
+/*
+ * Connect hook of this module: sets aio_pending_size (declared elsewhere)
+ * to 100 and then hands the connect on via SMB_VFS_NEXT_CONNECT().
+ */
+static int aio_fork_connect(vfs_handle_struct *handle, const char *service,
+                           const char *user)
+{
+       /*********************************************************************
+        * How many threads to initialize ?
+        * 100 per process seems insane as a default until you realize that
+        * (a) Threads terminate after 1 second when idle.
+        * (b) Throttling is done in SMB2 via the crediting algorithm.
+        * (c) SMB1 clients are limited to max_mux (50) outstanding
+        *     requests and Windows clients don't use this anyway.
+        * Essentially we want this to be unlimited unless smb.conf
+        * says different.
+        *
+        * NOTE(review): this text talks about threads, but this module
+        * runs its requests in fork()ed child processes. The wording, and
+        * point (a) in particular, should be confirmed for this module.
+        *********************************************************************/
+       aio_pending_size = 100;
+       return SMB_VFS_NEXT_CONNECT(handle, service, user);
 }
 
-/* VFS operations structure */
-
-static vfs_op_tuple aio_fork_ops[] = {
-       {SMB_VFS_OP(aio_fork_read),     SMB_VFS_OP_AIO_READ,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_write),    SMB_VFS_OP_AIO_WRITE,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_return_fn), SMB_VFS_OP_AIO_RETURN,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_cancel),   SMB_VFS_OP_AIO_CANCEL,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_error_fn), SMB_VFS_OP_AIO_ERROR,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(NULL),              SMB_VFS_OP_NOOP,
-        SMB_VFS_LAYER_NOOP}
+/*
+ * VFS operations provided by this module: the connect hook plus the
+ * send/recv pairs for async pread, pwrite and fsync.
+ */
+static struct vfs_fn_pointers vfs_aio_fork_fns = {
+       .connect_fn = aio_fork_connect,
+       .pread_send_fn = aio_fork_pread_send,
+       .pread_recv_fn = aio_fork_pread_recv,
+       .pwrite_send_fn = aio_fork_pwrite_send,
+       .pwrite_recv_fn = aio_fork_pwrite_recv,
+       .fsync_send_fn = aio_fork_fsync_send,
+       .fsync_recv_fn = aio_fork_fsync_recv,
 };
 
+/*
+ * Module entry point: registers the operations in vfs_aio_fork_fns under
+ * the name "aio_fork" and returns the result of smb_register_vfs().
+ */
 NTSTATUS vfs_aio_fork_init(void);
 NTSTATUS vfs_aio_fork_init(void)
 {
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
-                               "aio_fork", aio_fork_ops);
+                               "aio_fork", &vfs_aio_fork_fns);
 }