s3-talloc Change TALLOC_ZERO_P() to talloc_zero()
[kai/samba.git] / source3 / modules / vfs_aio_fork.c
index 7914e8f4014519e8392c7eca6ed755746005d34f..961efd85de2996d3b321dfdb7d3f446c8c2fbe11 100644 (file)
@@ -2,6 +2,7 @@
  * Simulate the Posix AIO using mmap/fork
  *
  * Copyright (C) Volker Lendecke 2008
+ * Copyright (C) Jeremy Allison 2010
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  */
 
 #include "includes.h"
+#include "system/filesys.h"
+#include "system/shmem.h"
+#include "smbd/smbd.h"
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
 
 struct mmap_area {
        size_t size;
@@ -135,7 +143,7 @@ static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
        msg.msg_name = NULL;
        msg.msg_namelen = 0;
 
-       iov[0].iov_base = ptr;
+       iov[0].iov_base = (void *)ptr;
        iov[0].iov_len = nbytes;
        msg.msg_iov = iov;
        msg.msg_iovlen = 1;
@@ -206,7 +214,7 @@ static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
        msg.msg_namelen = 0;
 
        ZERO_STRUCT(iov);
-       iov[0].iov_base = ptr;
+       iov[0].iov_base = (void *)ptr;
        iov[0].iov_len = nbytes;
        msg.msg_iov = iov;
        msg.msg_iovlen = 1;
@@ -216,7 +224,7 @@ static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
 
 static void aio_child_cleanup(struct event_context *event_ctx,
                              struct timed_event *te,
-                             const struct timeval *now,
+                             struct timeval now,
                              void *private_data)
 {
        struct aio_child_list *list = talloc_get_type_abort(
@@ -245,15 +253,15 @@ static void aio_child_cleanup(struct event_context *event_ctx,
                           "deleting\n", (int)child->pid));
 
                TALLOC_FREE(child);
+               child = next;
        }
 
        if (list->children != NULL) {
                /*
                 * Re-schedule the next cleanup round
                 */
-               list->cleanup_event = event_add_timed(smbd_event_context(), list,
-                                                     timeval_add(now, 30, 0),
-                                                     "aio_child_cleanup",
+               list->cleanup_event = event_add_timed(server_event_context(), list,
+                                                     timeval_add(&now, 30, 0),
                                                      aio_child_cleanup, list);
 
        }
@@ -269,7 +277,7 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle
        }
 
        if (data == NULL) {
-               data = TALLOC_ZERO_P(NULL, struct aio_child_list);
+               data = talloc_zero(NULL, struct aio_child_list);
                if (data == NULL) {
                        return NULL;
                }
@@ -282,9 +290,8 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle
         */
 
        if (data->cleanup_event == NULL) {
-               data->cleanup_event = event_add_timed(smbd_event_context(), data,
+               data->cleanup_event = event_add_timed(server_event_context(), data,
                                                      timeval_current_ofs(30, 0),
-                                                     "aio_child_cleanup",
                                                      aio_child_cleanup, data);
                if (data->cleanup_event == NULL) {
                        TALLOC_FREE(data);
@@ -345,6 +352,12 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                        ret_struct.size = sys_pread(
                                fd, (void *)map->ptr, cmd_struct.n,
                                cmd_struct.offset);
+#if 0
+/* This breaks "make test" when run with aio_fork module. */
+#ifdef ENABLE_BUILD_FARM_HACKS
+                       ret_struct.size = MAX(1, ret_struct.size * 0.9);
+#endif
+#endif
                }
                else {
                        ret_struct.size = sys_pwrite(
@@ -359,6 +372,14 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                        ret_struct.ret_errno = errno;
                }
 
+               /*
+                * Close the fd before telling our parent we're done. The
+                * parent might close and re-open the file very quickly, and
+                * with system-level share modes (GPFS) we would get an
+                * unjustified SHARING_VIOLATION.
+                */
+               close(fd);
+
                ret = write_data(sockfd, (char *)&ret_struct,
                                 sizeof(ret_struct));
                if (ret != sizeof(ret_struct)) {
@@ -366,8 +387,6 @@ static void aio_child_loop(int sockfd, struct mmap_area *map)
                                   strerror(errno)));
                        exit(2);
                }
-
-               close(fd);
        }
 }
 
@@ -375,8 +394,9 @@ static void handle_aio_completion(struct event_context *event_ctx,
                                  struct fd_event *event, uint16 flags,
                                  void *p)
 {
+       struct aio_extra *aio_ex = NULL;
        struct aio_child *child = (struct aio_child *)p;
-       uint16 mid;
+       NTSTATUS status;
 
        DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags));
 
@@ -384,14 +404,22 @@ static void handle_aio_completion(struct event_context *event_ctx,
                return;
        }
 
-       if (!NT_STATUS_IS_OK(read_data(child->sockfd,
-                                      (char *)&child->retval,
-                                      sizeof(child->retval)))) {
-               DEBUG(0, ("aio child %d died\n", (int)child->pid));
+       status = read_data(child->sockfd, (char *)&child->retval,
+                          sizeof(child->retval));
+
+       if (!NT_STATUS_IS_OK(status)) {
+               DEBUG(1, ("aio child %d died: %s\n", (int)child->pid,
+                         nt_errstr(status)));
                child->retval.size = -1;
                child->retval.ret_errno = EIO;
        }
 
+       if (child->aiocb == NULL) {
+               DEBUG(1, ("Inactive child died\n"));
+               TALLOC_FREE(child);
+               return;
+       }
+
        if (child->cancelled) {
                child->aiocb = NULL;
                child->cancelled = false;
@@ -404,23 +432,46 @@ static void handle_aio_completion(struct event_context *event_ctx,
                       child->retval.size);
        }
 
-       mid = child->aiocb->aio_sigevent.sigev_value.sival_int;
-
-       DEBUG(10, ("mid %d finished\n", (int)mid));
-
-       aio_request_done(mid);
-       process_aio_queue();
+       aio_ex = (struct aio_extra *)child->aiocb->aio_sigevent.sigev_value.sival_ptr;
+       smbd_aio_complete_aio_ex(aio_ex);
 }
 
+/*
+ * Destructor for a struct aio_child (presumably installed with
+ * talloc_set_destructor() outside this hunk - confirm).
+ *
+ * Asserts that the child has no request in flight unless that request was
+ * cancelled, tells the child process to exit, closes our end of the socket
+ * and unlinks the child from its owning list.
+ *
+ * Always returns 0.
+ */
 static int aio_child_destructor(struct aio_child *child)
 {
+       char c = 0;
+
        SMB_ASSERT((child->aiocb == NULL) || child->cancelled);
+
+       /* pid_t need not be int: cast it, as the other DEBUG calls do. */
+       DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n",
+                       (int)child->pid, child->sockfd));
+
+       /*
+        * closing the sockfd makes the child not return from recvmsg() on RHEL
+        * 5.5 so instead force the child to exit by writing bad data to it
+        */
+       if (write(child->sockfd, &c, sizeof(c)) == -1) {
+               /*
+                * Best effort only: the child may already be gone. Note the
+                * failure and carry on with the teardown.
+                */
+               DEBUG(10, ("aio_child_destructor: write to child %d "
+                          "failed: %s\n", (int)child->pid,
+                          strerror(errno)));
+       }
        close(child->sockfd);
        DLIST_REMOVE(child->list->children, child);
        return 0;
 }
 
-static NTSTATUS create_aio_child(struct aio_child_list *children,
+/*
+ * Per-file callback, handed to files_forall() in the freshly created child:
+ * close the descriptor behind an open file so that the child does not keep a
+ * system level share mode on it by accident.
+ *
+ * Files without a handle, or whose fd is already -1, are left alone.
+ * private_data is unused. Always returns NULL.
+ */
+
+static struct files_struct *close_fsp_fd(struct files_struct *fsp,
+                                        void *private_data)
+{
+       if (fsp->fh == NULL) {
+               return NULL;
+       }
+
+       if (fsp->fh->fd == -1) {
+               return NULL;
+       }
+
+       close(fsp->fh->fd);
+       fsp->fh->fd = -1;
+       return NULL;
+}
+
+static NTSTATUS create_aio_child(struct smbd_server_connection *sconn,
+                                struct aio_child_list *children,
                                 size_t map_size,
                                 struct aio_child **presult)
 {
@@ -430,7 +481,7 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
 
        fdpair[0] = fdpair[1] = -1;
 
-       result = TALLOC_ZERO_P(children, struct aio_child);
+       result = talloc_zero(children, struct aio_child);
        NT_STATUS_HAVE_NO_MEMORY(result);
 
        if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
@@ -458,15 +509,17 @@ static NTSTATUS create_aio_child(struct aio_child_list *children,
        if (result->pid == 0) {
                close(fdpair[0]);
                result->sockfd = fdpair[1];
+               files_forall(sconn, close_fsp_fd, NULL);
                aio_child_loop(result->sockfd, result->map);
        }
 
-       DEBUG(10, ("Child %d created\n", result->pid));
+       DEBUG(10, ("Child %d created with sockfd %d\n",
+                       result->pid, fdpair[0]));
 
        result->sockfd = fdpair[0];
        close(fdpair[1]);
 
-       result->sock_event = event_add_fd(smbd_event_context(), result,
+       result->sock_event = event_add_fd(server_event_context(), result,
                                          result->sockfd, EVENT_FD_READ,
                                          handle_aio_completion,
                                          result);
@@ -515,7 +568,8 @@ static NTSTATUS get_idle_child(struct vfs_handle_struct *handle,
        if (child == NULL) {
                DEBUG(10, ("no idle child found, creating new one\n"));
 
-               status = create_aio_child(children, 128*1024, &child);
+               status = create_aio_child(handle->conn->sconn, children,
+                                         128*1024, &child);
                if (!NT_STATUS_IS_OK(status)) {
                        DEBUG(10, ("create_aio_child failed: %s\n",
                                   nt_errstr(status)));
@@ -704,26 +758,138 @@ static int aio_fork_error_fn(struct vfs_handle_struct *handle,
        return child->retval.ret_errno;
 }
 
-/* VFS operations structure */
-
-static vfs_op_tuple aio_fork_ops[] = {
-       {SMB_VFS_OP(aio_fork_read),     SMB_VFS_OP_AIO_READ,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_write),    SMB_VFS_OP_AIO_WRITE,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_return_fn), SMB_VFS_OP_AIO_RETURN,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_cancel),   SMB_VFS_OP_AIO_CANCEL,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(aio_fork_error_fn), SMB_VFS_OP_AIO_ERROR,
-        SMB_VFS_LAYER_TRANSPARENT},
-       {SMB_VFS_OP(NULL),              SMB_VFS_OP_NOOP,
-        SMB_VFS_LAYER_NOOP}
+/*
+ * Timer callback installed by aio_fork_suspend(). private_data points at the
+ * caller's bool flag, which is set to true when the timer fires; the timed
+ * event itself is released here.
+ */
+static void aio_fork_suspend_timed_out(struct tevent_context *event_ctx,
+                                       struct tevent_timer *te,
+                                       struct timeval now,
+                                       void *private_data)
+{
+       bool *flag = (bool *)private_data;
+
+       *flag = true;
+
+       /* One-shot: drop the timed event handler now that it has run. */
+       TALLOC_FREE(te);
+}
+
+/*
+ * Block until the requests listed in aiocb_array[0..n-1] that are being
+ * served by one of our children for fsp have completed, or until the optional
+ * timeout has expired.
+ *
+ * A private event context is created for the duration of the call. The socket
+ * event of each matching child is moved onto that context and the loop is run
+ * until the child's aiocb pointer has been cleared.
+ *
+ * Returns 0 on success and -1 on failure. On failure errno is EINVAL when the
+ * child list is unavailable, ENOMEM when an event (context) could not be
+ * allocated and EAGAIN on timeout; it is not set here when
+ * tevent_loop_once() itself fails.
+ */
+static int aio_fork_suspend(struct vfs_handle_struct *handle,
+                       struct files_struct *fsp,
+                       const SMB_STRUCT_AIOCB * const aiocb_array[],
+                       int n,
+                       const struct timespec *timeout)
+{
+       struct aio_child_list *children = NULL;
+       TALLOC_CTX *frame = talloc_stackframe();
+       struct event_context *ev = NULL;
+       int i;
+       int ret = -1;
+       bool timed_out = false;
+
+       children = init_aio_children(handle);
+       if (children == NULL) {
+               errno = EINVAL;
+               goto out;
+       }
+
+       /* This is a blocking call, and has to use a sub-event loop. */
+       ev = event_context_init(frame);
+       if (ev == NULL) {
+               errno = ENOMEM;
+               goto out;
+       }
+
+       if (timeout) {
+               struct timeval tv = convert_timespec_to_timeval(*timeout);
+               struct tevent_timer *te = tevent_add_timer(ev,
+                                               frame,
+                                               timeval_current_ofs(tv.tv_sec,
+                                                                   tv.tv_usec),
+                                               aio_fork_suspend_timed_out,
+                                               &timed_out);
+               if (!te) {
+                       errno = ENOMEM;
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < n; i++) {
+               struct aio_child *child = NULL;
+               const SMB_STRUCT_AIOCB *aiocb = aiocb_array[i];
+
+               if (!aiocb) {
+                       continue;
+               }
+
+               /*
+                * We're going to cheat here. We know that smbd/aio.c
+                * only calls this when it's waiting for every single
+                * outstanding call to finish on a close, so just wait
+                * individually for each IO to complete. We don't care
+                * what order they finish - only that they all do. JRA.
+                */
+
+               for (child = children->children; child != NULL; child = child->next) {
+                       if (child->aiocb == NULL) {
+                               continue;
+                       }
+                       if (child->aiocb->aio_fildes != fsp->fh->fd) {
+                               continue;
+                       }
+                       if (child->aiocb != aiocb) {
+                               continue;
+                       }
+
+                       if (child->aiocb->aio_sigevent.sigev_value.sival_ptr == NULL) {
+                               continue;
+                       }
+
+                       /* We're never using this event on the
+                        * main event context again... */
+                       TALLOC_FREE(child->sock_event);
+
+                       child->sock_event = event_add_fd(ev,
+                                               child,
+                                               child->sockfd,
+                                               EVENT_FD_READ,
+                                               handle_aio_completion,
+                                               child);
+                       if (child->sock_event == NULL) {
+                               /*
+                                * Without a read event on the socket the
+                                * loop below could never see this child
+                                * complete, so fail instead of waiting.
+                                */
+                               errno = ENOMEM;
+                               goto out;
+                       }
+
+                       while (1) {
+                               if (tevent_loop_once(ev) == -1) {
+                                       goto out;
+                               }
+
+                               if (timed_out) {
+                                       errno = EAGAIN;
+                                       goto out;
+                               }
+
+                               /* We set child->aiocb to NULL in our hooked
+                                * AIO_RETURN(). */
+                               if (child->aiocb == NULL) {
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       ret = 0;
+
+  out:
+
+       TALLOC_FREE(frame);
+       return ret;
+}
+
+/*
+ * AIO operations provided by this module. The table is handed to
+ * smb_register_vfs() by vfs_aio_fork_init().
+ */
+static struct vfs_fn_pointers vfs_aio_fork_fns = {
+       .aio_read = aio_fork_read,
+       .aio_write = aio_fork_write,
+       .aio_return_fn = aio_fork_return_fn,
+       .aio_cancel = aio_fork_cancel,
+       .aio_error_fn = aio_fork_error_fn,
+       .aio_suspend = aio_fork_suspend,
 };
 
+/*
+ * Module entry point: registers this module's operations under the name
+ * "aio_fork" and returns the status reported by smb_register_vfs().
+ */
 NTSTATUS vfs_aio_fork_init(void);
 NTSTATUS vfs_aio_fork_init(void)
 {
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
-                               "aio_fork", aio_fork_ops);
+                               "aio_fork", &vfs_aio_fork_fns);
 }