VFS: Modify opendir to take a const struct smb_filename * instead of const char *
[metze/samba/wip.git] / source3 / modules / vfs_aio_pthread.c
index 9217b69e80f03ffbe385e4ae83bb06ff5a668b0d..7037b633e5e19e653c80f93bc0972cbe37d5cf76 100644 (file)
 #include "system/filesys.h"
 #include "system/shmem.h"
 #include "smbd/smbd.h"
-#include "pthreadpool.h"
-
-struct aio_extra;
-static struct pthreadpool *pool;
-static int aio_pthread_jobid;
-
-struct aio_private_data {
-       struct aio_private_data *prev, *next;
-       int jobid;
-       SMB_STRUCT_AIOCB *aiocb;
-       ssize_t ret_size;
-       int ret_errno;
-       bool cancelled;
-       bool write_command;
-};
-
-/* List of outstanding requests we have. */
-struct aio_private_data *pd_list;
-
-static void aio_pthread_handle_completion(struct event_context *event_ctx,
-                               struct fd_event *event,
-                               uint16 flags,
-                               void *p);
-
-/************************************************************************
- How many threads to initialize ?
-***********************************************************************/
-
-static int aio_get_num_threads(void)
-{
-       int num_cores = sys_get_number_of_cores();
-       DEBUG(10,("aio_get_num_threads: sys_get_number_of_cores "
-               "returned %d\n",
-               num_cores));
-       num_cores *= 2;
-       if (num_cores < 1) {
-               num_cores = 1;
-       }
-       /* Even on a single processor box give a little
-          concurrency. */
-       return MIN(4,num_cores);
-}
-
-#if 0
-/************************************************************************
- Called every 30 seconds to destroy pool if it's idle.
-***********************************************************************/
-
-static void idle_pool_destroy_timer(struct tevent_context *ev,
-                       struct tevent_timer *te,
-                       struct timeval current_time,
-                       void *private_data)
-{
-       struct timeval ne;
-
-       TALLOC_FREE(te);
-
-       if (pool && pd_list == NULL) {
-               if (pthreadpool_destroy(pool) == 0) {
-                       pool = NULL;
-               }
-               DEBUG(10,("idle_pool_destroy_timer: destroyed AIO pool.\n"));
-               return;
-       }
-
-       /* Here, the IO is still active. */
-
-       /* Set an event up for 30 seconds time - if we have
-          no outstanding IO at this time shut the threadpool
-          down. */
-       ne = tevent_timeval_current_ofs(30, 0);
-       tevent_add_timer(server_event_context(),
-                       NULL,
-                       ne,
-                       idle_pool_destroy_timer,
-                       NULL);
-}
+#include "smbd/globals.h"
+#include "lib/pthreadpool/pthreadpool.h"
+#ifdef HAVE_LINUX_FALLOC_H
+#include <linux/falloc.h>
 #endif
 
+#if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
+
 /************************************************************************
  Ensure thread pool is initialized.
 ***********************************************************************/
 
-static bool init_aio_threadpool(void)
+static bool init_aio_threadpool(struct tevent_context *ev_ctx,
+                               struct pthreadpool **pp_pool,
+                               void (*completion_fn)(struct tevent_context *,
+                                               struct tevent_fd *,
+                                               uint16_t,
+                                               void *))
 {
-       struct fd_event *sock_event = NULL;
+       struct tevent_fd *sock_event = NULL;
        int ret = 0;
-       int num_threads;
-#if 0
-       struct timeval ne;
-#endif
 
-       if (pool) {
+       if (*pp_pool) {
                return true;
        }
 
-       num_threads = aio_get_num_threads();
-       ret = pthreadpool_init(num_threads, &pool);
+       ret = pthreadpool_init(lp_aio_max_threads(), pp_pool);
        if (ret) {
                errno = ret;
                return false;
        }
-       sock_event = tevent_add_fd(server_event_context(),
+       sock_event = tevent_add_fd(ev_ctx,
                                NULL,
-                               pthreadpool_signal_fd(pool),
+                               pthreadpool_signal_fd(*pp_pool),
                                TEVENT_FD_READ,
-                               aio_pthread_handle_completion,
+                               completion_fn,
                                NULL);
        if (sock_event == NULL) {
-               pthreadpool_destroy(pool);
-               pool = NULL;
+               pthreadpool_destroy(*pp_pool);
+               *pp_pool = NULL;
                return false;
        }
 
-#if 0
-       /* Set an event up for 30 seconds time - if we have
-          no outstanding IO at this time shut the threadpool
-          down. */
-       ne = tevent_timeval_current_ofs(30, 0);
-       tevent_add_timer(server_event_context(),
-                       NULL,
-                       ne,
-                       idle_pool_destroy_timer,
-                       NULL);
-#endif
-
-       DEBUG(10,("init_aio_threadpool: initialized with %d threads\n",
-                       num_threads));
+       DEBUG(10,("init_aio_threadpool: initialized with up to %d threads\n",
+                 (int)lp_aio_max_threads()));
 
        return true;
 }
 
+/*
+ * We must have openat() to do any thread-based
+ * asynchronous opens. We also must be using
+ * thread-specific credentials (Linux-only
+ * for now).
+ */
 
-/************************************************************************
- Worker function - core of the pthread aio engine.
- This is the function that actually does the IO.
-***********************************************************************/
-
-static void aio_worker(void *private_data)
-{
-       struct aio_private_data *pd =
-                       (struct aio_private_data *)private_data;
-
-       if (pd->write_command) {
-               pd->ret_size = pwrite(pd->aiocb->aio_fildes,
-                               (const void *)pd->aiocb->aio_buf,
-                               pd->aiocb->aio_nbytes,
-                               pd->aiocb->aio_offset);
-       } else {
-               pd->ret_size = pread(pd->aiocb->aio_fildes,
-                               (void *)pd->aiocb->aio_buf,
-                               pd->aiocb->aio_nbytes,
-                               pd->aiocb->aio_offset);
-       }
-       if (pd->ret_size == -1) {
-               pd->ret_errno = errno;
-       } else {
-               pd->ret_errno = 0;
-       }
-}
-
-/************************************************************************
- Private data destructor.
-***********************************************************************/
-
-static int pd_destructor(struct aio_private_data *pd)
-{
-       DLIST_REMOVE(pd_list, pd);
-       return 0;
-}
-
-/************************************************************************
- Create and initialize a private data struct.
-***********************************************************************/
-
-static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
-                                       SMB_STRUCT_AIOCB *aiocb)
-{
-       struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);
-       if (!pd) {
-               return NULL;
-       }
-       pd->jobid = aio_pthread_jobid++;
-       pd->aiocb = aiocb;
-       pd->ret_size = -1;
-       pd->ret_errno = EINPROGRESS;
-       talloc_set_destructor(pd, pd_destructor);
-       DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
-       return pd;
-}
-
-/************************************************************************
- Spin off a threadpool (if needed) and initiate a pread call.
-***********************************************************************/
-
-static int aio_pthread_read(struct vfs_handle_struct *handle,
-                               struct files_struct *fsp,
-                               SMB_STRUCT_AIOCB *aiocb)
-{
-       struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
-       struct aio_private_data *pd = NULL;
-       int ret;
-
-       if (!init_aio_threadpool()) {
-               return -1;
-       }
-
-       pd = create_private_data(aio_ex, aiocb);
-       if (pd == NULL) {
-               DEBUG(10, ("aio_pthread_read: Could not create private data.\n"));
-               return -1;
-       }
+/*
+ * NB. This threadpool is shared over all
+ * instances of this VFS module in this
+ * process, as is the current jobid.
+ */
 
-       ret = pthreadpool_add_job(pool, pd->jobid, aio_worker, (void *)pd);
-       if (ret) {
-               errno = ret;
-               return -1;
-       }
+static struct pthreadpool *open_pool;
+static int aio_pthread_open_jobid;
 
-       DEBUG(10, ("aio_pthread_read: jobid=%d pread requested "
-               "of %llu bytes at offset %llu\n",
-               pd->jobid,
-               (unsigned long long)pd->aiocb->aio_nbytes,
-               (unsigned long long)pd->aiocb->aio_offset));
+struct aio_open_private_data {
+       struct aio_open_private_data *prev, *next;
+       /* Inputs. */
+       int jobid;
+       int dir_fd;
+       int flags;
+       mode_t mode;
+       uint64_t mid;
+       bool in_progress;
+       const char *fname;
+       char *dname;
+       struct smbd_server_connection *sconn;
+       const struct security_unix_token *ux_tok;
+       uint64_t initial_allocation_size;
+       /* Returns. */
+       int ret_fd;
+       int ret_errno;
+};
 
-       return 0;
-}
+/* List of outstanding requests we have. */
+static struct aio_open_private_data *open_pd_list;
 
 /************************************************************************
- Spin off a threadpool (if needed) and initiate a pwrite call.
+ Find the open private data by jobid.
 ***********************************************************************/
 
-static int aio_pthread_write(struct vfs_handle_struct *handle,
-                               struct files_struct *fsp,
-                               SMB_STRUCT_AIOCB *aiocb)
+static struct aio_open_private_data *find_open_private_data_by_jobid(int jobid)
 {
-       struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
-       struct aio_private_data *pd = NULL;
-       int ret;
-
-       if (!init_aio_threadpool()) {
-               return -1;
-       }
-
-       pd = create_private_data(aio_ex, aiocb);
-       if (pd == NULL) {
-               DEBUG(10, ("aio_pthread_write: Could not create private data.\n"));
-               return -1;
-       }
-
-       pd->write_command = true;
+       struct aio_open_private_data *opd;
 
-       ret = pthreadpool_add_job(pool, pd->jobid, aio_worker, (void *)pd);
-       if (ret) {
-               errno = ret;
-               return -1;
+       for (opd = open_pd_list; opd != NULL; opd = opd->next) {
+               if (opd->jobid == jobid) {
+                       return opd;
+               }
        }
 
-       DEBUG(10, ("aio_pthread_write: jobid=%d pwrite requested "
-               "of %llu bytes at offset %llu\n",
-               pd->jobid,
-               (unsigned long long)pd->aiocb->aio_nbytes,
-               (unsigned long long)pd->aiocb->aio_offset));
-
-       return 0;
+       return NULL;
 }
 
 /************************************************************************
- Find the private data by jobid.
+ Find the open private data by mid.
 ***********************************************************************/
 
-static struct aio_private_data *find_private_data_by_jobid(int jobid)
+static struct aio_open_private_data *find_open_private_data_by_mid(uint64_t mid)
 {
-       struct aio_private_data *pd;
+       struct aio_open_private_data *opd;
 
-       for (pd = pd_list; pd != NULL; pd = pd->next) {
-               if (pd->jobid == jobid) {
-                       return pd;
+       for (opd = open_pd_list; opd != NULL; opd = opd->next) {
+               if (opd->mid == mid) {
+                       return opd;
                }
        }
 
@@ -310,350 +147,353 @@ static struct aio_private_data *find_private_data_by_jobid(int jobid)
 }
 
 /************************************************************************
- Callback when an IO completes.
+ Callback when an open completes.
 ***********************************************************************/
 
-static void aio_pthread_handle_completion(struct event_context *event_ctx,
-                               struct fd_event *event,
-                               uint16 flags,
+static void aio_open_handle_completion(struct tevent_context *event_ctx,
+                               struct tevent_fd *event,
+                               uint16_t flags,
                                void *p)
 {
-       struct aio_extra *aio_ex = NULL;
-       struct aio_private_data *pd = NULL;
+       struct aio_open_private_data *opd = NULL;
        int jobid = 0;
        int ret;
+       struct smbXsrv_connection *xconn;
 
-       DEBUG(10, ("aio_pthread_handle_completion called with flags=%d\n",
-                       (int)flags));
+       DEBUG(10, ("aio_open_handle_completion called with flags=%d\n",
+               (int)flags));
 
-       if ((flags & EVENT_FD_READ) == 0) {
+       if ((flags & TEVENT_FD_READ) == 0) {
                return;
        }
 
-       ret = pthreadpool_finished_job(pool, &jobid);
-       if (ret) {
-               smb_panic("aio_pthread_handle_completion");
+       ret = pthreadpool_finished_jobs(open_pool, &jobid, 1);
+       if (ret != 1) {
+               smb_panic("aio_open_handle_completion");
+               /* notreached. */
                return;
        }
 
-       pd = find_private_data_by_jobid(jobid);
-       if (pd == NULL) {
-               DEBUG(1, ("aio_pthread_handle_completion cannot find jobid %d\n",
-                         jobid));
+       opd = find_open_private_data_by_jobid(jobid);
+       if (opd == NULL) {
+               DEBUG(0, ("aio_open_handle_completion cannot find jobid %d\n",
+                       jobid));
+               smb_panic("aio_open_handle_completion - no jobid");
+               /* notreached. */
                return;
        }
 
-       aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
-       smbd_aio_complete_aio_ex(aio_ex);
-
-       DEBUG(10,("aio_pthread_handle_completion: jobid %d completed\n",
-               jobid ));
-
-}
+       DEBUG(10,("aio_open_handle_completion: jobid %d mid %llu "
+               "for file %s/%s completed\n",
+               jobid,
+               (unsigned long long)opd->mid,
+               opd->dname,
+               opd->fname));
 
-/************************************************************************
- Find the private data by aiocb.
-***********************************************************************/
+       opd->in_progress = false;
 
-static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
-{
-       struct aio_private_data *pd;
+       /*
+        * TODO: In future we need a proper algorithm
+        * to find the correct connection for a fsp.
+        * For now we only have one connection, so this is correct...
+        */
+       xconn = opd->sconn->client->connections;
 
-       for (pd = pd_list; pd != NULL; pd = pd->next) {
-               if (pd->aiocb == aiocb) {
-                       return pd;
+       /* Find outstanding event and reschedule. */
+       if (!schedule_deferred_open_message_smb(xconn, opd->mid)) {
+               /*
+                * Outstanding event didn't exist or was
+                * cancelled. Free up the fd and throw
+                * away the result.
+                */
+               if (opd->ret_fd != -1) {
+                       close(opd->ret_fd);
+                       opd->ret_fd = -1;
                }
+               TALLOC_FREE(opd);
        }
-
-       return NULL;
 }
 
-/************************************************************************
- Called to return the result of a completed AIO.
- Should only be called if aio_error returns something other than EINPROGRESS.
- Returns:
-       Any other value - return from IO operation.
-***********************************************************************/
+/*****************************************************************
+ The core of the async open code - the worker function. Note we
+ use the new openat() system call to avoid any problems with
+ current working directory changes plus we change credentials
+ on the thread to prevent any security race conditions.
+*****************************************************************/
 
-static ssize_t aio_pthread_return_fn(struct vfs_handle_struct *handle,
-                               struct files_struct *fsp,
-                               SMB_STRUCT_AIOCB *aiocb)
+static void aio_open_worker(void *private_data)
 {
-       struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
-
-       if (pd == NULL) {
-               errno = EINVAL;
-               DEBUG(0, ("aio_pthread_return_fn: returning EINVAL\n"));
-               return -1;
+       struct aio_open_private_data *opd =
+               (struct aio_open_private_data *)private_data;
+
+       /* Become the correct credential on this thread. */
+       if (set_thread_credentials(opd->ux_tok->uid,
+                               opd->ux_tok->gid,
+                               (size_t)opd->ux_tok->ngroups,
+                               opd->ux_tok->groups) != 0) {
+               opd->ret_fd = -1;
+               opd->ret_errno = errno;
+               return;
        }
 
-       pd->aiocb = NULL;
+       opd->ret_fd = openat(opd->dir_fd,
+                       opd->fname,
+                       opd->flags,
+                       opd->mode);
 
-       if (pd->ret_size == -1) {
-               errno = pd->ret_errno;
-       }
+       if (opd->ret_fd == -1) {
+               opd->ret_errno = errno;
+       } else {
+               /* Create was successful. */
+               opd->ret_errno = 0;
 
-       return pd->ret_size;
+#if defined(HAVE_LINUX_FALLOCATE)
+               /*
+                * See if we can set the initial
+                * allocation size. We don't record
+                * the return for this as it's an
+                * optimization - the upper layer
+                * will also do this for us once
+                * the open returns.
+                */
+               if (opd->initial_allocation_size) {
+                       (void)fallocate(opd->ret_fd,
+                                       FALLOC_FL_KEEP_SIZE,
+                                       0,
+                                       (off_t)opd->initial_allocation_size);
+               }
+#endif
+       }
 }
 
 /************************************************************************
- Called to check the result of an AIO.
- Returns:
-       EINPROGRESS - still in progress.
-       EINVAL - invalid aiocb.
-       ECANCELED - request was cancelled.
-       0 - request completed successfully.
-       Any other value - errno from IO operation.
+ Open private data destructor.
 ***********************************************************************/
 
-static int aio_pthread_error_fn(struct vfs_handle_struct *handle,
-                            struct files_struct *fsp,
-                            SMB_STRUCT_AIOCB *aiocb)
+static int opd_destructor(struct aio_open_private_data *opd)
 {
-       struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
-
-       if (pd == NULL) {
-               return EINVAL;
-       }
-       if (pd->cancelled) {
-               return ECANCELED;
+       if (opd->dir_fd != -1) {
+               close(opd->dir_fd);
        }
-       return pd->ret_errno;
+       DLIST_REMOVE(open_pd_list, opd);
+       return 0;
 }
 
 /************************************************************************
- Called to request the cancel of an AIO, or all of them on a specific
- fsp if aiocb == NULL.
+ Create and initialize a private data struct for async open.
 ***********************************************************************/
 
-static int aio_pthread_cancel(struct vfs_handle_struct *handle,
-                       struct files_struct *fsp,
-                       SMB_STRUCT_AIOCB *aiocb)
+static struct aio_open_private_data *create_private_open_data(const files_struct *fsp,
+                                       int flags,
+                                       mode_t mode)
 {
-       struct aio_private_data *pd = NULL;
-
-       for (pd = pd_list; pd != NULL; pd = pd->next) {
-               if (pd->aiocb == NULL) {
-                       continue;
-               }
-               if (pd->aiocb->aio_fildes != fsp->fh->fd) {
-                       continue;
-               }
-               if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
-                       continue;
-               }
-
-               /*
-                * We let the child do its job, but we discard the result when
-                * it's finished.
-                */
+       struct aio_open_private_data *opd = talloc_zero(NULL,
+                                       struct aio_open_private_data);
+       const char *fname = NULL;
 
-               pd->cancelled = true;
+       if (!opd) {
+               return NULL;
        }
 
-       return AIO_CANCELED;
-}
+       opd->jobid = aio_pthread_open_jobid++;
+       opd->dir_fd = -1;
+       opd->ret_fd = -1;
+       opd->ret_errno = EINPROGRESS;
+       opd->flags = flags;
+       opd->mode = mode;
+       opd->mid = fsp->mid;
+       opd->in_progress = true;
+       opd->sconn = fsp->conn->sconn;
+       opd->initial_allocation_size = fsp->initial_allocation_size;
+
+       /* Copy our current credentials. */
+       opd->ux_tok = copy_unix_token(opd, get_current_utok(fsp->conn));
+       if (opd->ux_tok == NULL) {
+               TALLOC_FREE(opd);
+               return NULL;
+       }
 
-/************************************************************************
- Callback for a previously detected job completion.
-***********************************************************************/
+       /*
+        * Copy the parent directory name and the
+        * relative path within it.
+        */
+       if (parent_dirname(opd,
+                       fsp->fsp_name->base_name,
+                       &opd->dname,
+                       &fname) == false) {
+               TALLOC_FREE(opd);
+               return NULL;
+       }
+       opd->fname = talloc_strdup(opd, fname);
+       if (opd->fname == NULL) {
+               TALLOC_FREE(opd);
+               return NULL;
+       }
 
-static void aio_pthread_handle_immediate(struct tevent_context *ctx,
-                               struct tevent_immediate *im,
-                               void *private_data)
-{
-       struct aio_extra *aio_ex = NULL;
-       int *pjobid = (int *)private_data;
-       struct aio_private_data *pd = find_private_data_by_jobid(*pjobid);
-
-       if (pd == NULL) {
-               DEBUG(1, ("aio_pthread_handle_immediate cannot find jobid %d\n",
-                         *pjobid));
-               TALLOC_FREE(pjobid);
-               return;
+#if defined(O_DIRECTORY)
+       opd->dir_fd = open(opd->dname, O_RDONLY|O_DIRECTORY);
+#else
+       opd->dir_fd = open(opd->dname, O_RDONLY);
+#endif
+       if (opd->dir_fd == -1) {
+               TALLOC_FREE(opd);
+               return NULL;
        }
 
-       TALLOC_FREE(pjobid);
-       aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
-       smbd_aio_complete_aio_ex(aio_ex);
+       talloc_set_destructor(opd, opd_destructor);
+       DLIST_ADD_END(open_pd_list, opd);
+       return opd;
 }
 
-/************************************************************************
- Private data struct used in suspend completion code.
-***********************************************************************/
-
-struct suspend_private {
-       int num_entries;
-       int num_finished;
-       const SMB_STRUCT_AIOCB * const *aiocb_array;
-};
-
-/************************************************************************
- Callback when an IO completes from a suspend call.
-***********************************************************************/
+/*****************************************************************
+ Setup an async open.
+*****************************************************************/
 
-static void aio_pthread_handle_suspend_completion(struct event_context *event_ctx,
-                               struct fd_event *event,
-                               uint16 flags,
-                               void *p)
+static int open_async(const files_struct *fsp,
+                       int flags,
+                       mode_t mode)
 {
-       struct suspend_private *sp = (struct suspend_private *)p;
-       struct aio_private_data *pd = NULL;
-       struct tevent_immediate *im = NULL;
-       int *pjobid = NULL;
-       int i;
-
-       DEBUG(10, ("aio_pthread_handle_suspend_completion called with flags=%d\n",
-                       (int)flags));
+       struct aio_open_private_data *opd = NULL;
+       int ret;
 
-       if ((flags & EVENT_FD_READ) == 0) {
-               return;
+       if (!init_aio_threadpool(fsp->conn->sconn->ev_ctx,
+                       &open_pool,
+                       aio_open_handle_completion)) {
+               return -1;
        }
 
-       pjobid = talloc_array(NULL, int, 1);
-       if (pjobid) {
-               smb_panic("aio_pthread_handle_suspend_completion: no memory.");
+       opd = create_private_open_data(fsp, flags, mode);
+       if (opd == NULL) {
+               DEBUG(10, ("open_async: Could not create private data.\n"));
+               return -1;
        }
 
-       if (pthreadpool_finished_job(pool, pjobid)) {
-               smb_panic("aio_pthread_handle_suspend_completion: can't find job.");
-               return;
+       ret = pthreadpool_add_job(open_pool,
+                               opd->jobid,
+                               aio_open_worker,
+                               (void *)opd);
+       if (ret) {
+               errno = ret;
+               return -1;
        }
 
-       pd = find_private_data_by_jobid(*pjobid);
-       if (pd == NULL) {
-               DEBUG(1, ("aio_pthread_handle_completion cannot find jobid %d\n",
-                         *pjobid));
-               TALLOC_FREE(pjobid);
-               return;
-       }
+       DEBUG(5,("open_async: mid %llu jobid %d created for file %s/%s\n",
+               (unsigned long long)opd->mid,
+               opd->jobid,
+               opd->dname,
+               opd->fname));
 
-       /* Is this a jobid with an aiocb we're interested in ? */
-       for (i = 0; i < sp->num_entries; i++) {
-               if (sp->aiocb_array[i] == pd->aiocb) {
-                       sp->num_finished++;
-                       TALLOC_FREE(pjobid);
-                       return;
-               }
-       }
+       /* Cause the calling code to reschedule us. */
+       errno = EINTR; /* Maps to NT_STATUS_RETRY. */
+       return -1;
+}
+
+/*****************************************************************
+ Look for a matching SMB2 mid. If we find it we're rescheduled,
+ just return the completed open.
+*****************************************************************/
+
+static bool find_completed_open(files_struct *fsp,
+                               int *p_fd,
+                               int *p_errno)
+{
+       struct aio_open_private_data *opd;
 
-       /* Jobid completed we weren't waiting for.
-          We must reshedule this as an immediate event
-          on the main event context. */
-       im = tevent_create_immediate(NULL);
-       if (!im) {
-               exit_server_cleanly("aio_pthread_handle_suspend_completion: no memory");
+       opd = find_open_private_data_by_mid(fsp->mid);
+       if (!opd) {
+               return false;
        }
 
-       DEBUG(10,("aio_pthread_handle_suspend_completion: "
-                       "re-scheduling job id %d\n",
-                       *pjobid));
+       if (opd->in_progress) {
+               DEBUG(0,("find_completed_open: mid %llu "
+                       "jobid %d still in progress for "
+                       "file %s/%s. PANIC !\n",
+                       (unsigned long long)opd->mid,
+                       opd->jobid,
+                       opd->dname,
+                       opd->fname));
+               /* Disaster ! This is an open timeout. Just panic. */
+               smb_panic("find_completed_open - in_progress\n");
+               /* notreached. */
+               return false;
+       }
 
-       tevent_schedule_immediate(im,
-                       server_event_context(),
-                       aio_pthread_handle_immediate,
-                       (void *)pjobid);
-}
+       *p_fd = opd->ret_fd;
+       *p_errno = opd->ret_errno;
 
+       DEBUG(5,("find_completed_open: mid %llu returning "
+               "fd = %d, errno = %d (%s) "
+               "jobid (%d) for file %s\n",
+               (unsigned long long)opd->mid,
+               opd->ret_fd,
+               opd->ret_errno,
+               strerror(opd->ret_errno),
+               opd->jobid,
+               smb_fname_str_dbg(fsp->fsp_name)));
 
-static void aio_pthread_suspend_timed_out(struct tevent_context *event_ctx,
-                                       struct tevent_timer *te,
-                                       struct timeval now,
-                                       void *private_data)
-{
-       bool *timed_out = (bool *)private_data;
-       /* Remove this timed event handler. */
-       TALLOC_FREE(te);
-       *timed_out = true;
+       /* Now we can free the opd. */
+       TALLOC_FREE(opd);
+       return true;
 }
 
-/************************************************************************
- Called to request everything to stop until all IO is completed.
-***********************************************************************/
+/*****************************************************************
+ The core open function. Only go async on O_CREAT|O_EXCL
+ opens to prevent any race conditions.
+*****************************************************************/
 
-static int aio_pthread_suspend(struct vfs_handle_struct *handle,
-                       struct files_struct *fsp,
-                       const SMB_STRUCT_AIOCB * const aiocb_array[],
-                       int n,
-                       const struct timespec *timeout)
+static int aio_pthread_open_fn(vfs_handle_struct *handle,
+                       struct smb_filename *smb_fname,
+                       files_struct *fsp,
+                       int flags,
+                       mode_t mode)
 {
-       struct event_context *ev = NULL;
-       struct fd_event *sock_event = NULL;
-       int ret = -1;
-       struct suspend_private sp;
-       bool timed_out = false;
-       TALLOC_CTX *frame = talloc_stackframe();
-
-       /* This is a blocking call, and has to use a sub-event loop. */
-       ev = event_context_init(frame);
-       if (ev == NULL) {
-               errno = ENOMEM;
-               goto out;
+       int my_errno = 0;
+       int fd = -1;
+       bool aio_allow_open = lp_parm_bool(
+               SNUM(handle->conn), "aio_pthread", "aio open", false);
+
+       if (smb_fname->stream_name) {
+               /* Don't handle stream opens. */
+               errno = ENOENT;
+               return -1;
        }
 
-       if (timeout) {
-               struct timeval tv = convert_timespec_to_timeval(*timeout);
-               struct tevent_timer *te = tevent_add_timer(ev,
-                                               frame,
-                                               timeval_current_ofs(tv.tv_sec,
-                                                                   tv.tv_usec),
-                                               aio_pthread_suspend_timed_out,
-                                               &timed_out);
-               if (!te) {
-                       errno = ENOMEM;
-                       goto out;
-               }
+       if (!aio_allow_open) {
+               /* aio opens turned off. */
+               return open(smb_fname->base_name, flags, mode);
        }
 
-       ZERO_STRUCT(sp);
-       sp.num_entries = n;
-       sp.aiocb_array = aiocb_array;
-       sp.num_finished = 0;
+       if (!(flags & O_CREAT)) {
+               /* Only creates matter. */
+               return open(smb_fname->base_name, flags, mode);
+       }
 
-       sock_event = tevent_add_fd(ev,
-                               frame,
-                               pthreadpool_signal_fd(pool),
-                               TEVENT_FD_READ,
-                               aio_pthread_handle_suspend_completion,
-                               (void *)&sp);
-       if (sock_event == NULL) {
-               pthreadpool_destroy(pool);
-               pool = NULL;
-               goto out;
+       if (!(flags & O_EXCL)) {
+               /* Only creates with O_EXCL matter. */
+               return open(smb_fname->base_name, flags, mode);
        }
+
        /*
-        * We're going to cheat here. We know that smbd/aio.c
-        * only calls this when it's waiting for every single
-        * outstanding call to finish on a close, so just wait
-        * individually for each IO to complete. We don't care
-        * what order they finish - only that they all do. JRA.
+        * See if this is a reentrant call - i.e. is this a
+        * restart of an existing open that just completed.
         */
-       while (sp.num_entries != sp.num_finished) {
-               if (tevent_loop_once(ev) == -1) {
-                       goto out;
-               }
 
-               if (timed_out) {
-                       errno = EAGAIN;
-                       goto out;
-               }
+       if (find_completed_open(fsp,
+                               &fd,
+                               &my_errno)) {
+               errno = my_errno;
+               return fd;
        }
 
-       ret = 0;
-
-  out:
-
-       TALLOC_FREE(frame);
-       return ret;
+       /* Ok, it's a create exclusive call - pass it to a thread helper. */
+       return open_async(fsp, flags, mode);
 }
+#endif
 
 static struct vfs_fn_pointers vfs_aio_pthread_fns = {
-       .aio_read_fn = aio_pthread_read,
-       .aio_write_fn = aio_pthread_write,
-       .aio_return_fn = aio_pthread_return_fn,
-       .aio_cancel_fn = aio_pthread_cancel,
-       .aio_error_fn = aio_pthread_error_fn,
-       .aio_suspend_fn = aio_pthread_suspend,
+#if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
+       .open_fn = aio_pthread_open_fn,
+#endif
 };
 
 NTSTATUS vfs_aio_pthread_init(void);