X-Git-Url: http://git.samba.org/samba.git/?a=blobdiff_plain;f=source3%2Fmodules%2Fvfs_aio_fork.c;h=8e47531ebd1470a8d05f14ea9c5107ef40e954a9;hb=fb137d6070c8cd872a66a4628b0e392cf40c180e;hp=961efd85de2996d3b321dfdb7d3f446c8c2fbe11;hpb=6da26870e0ae5acd6ff49a30ec2f6886b44d095e;p=vlendec%2Fsamba-autobuild%2F.git diff --git a/source3/modules/vfs_aio_fork.c b/source3/modules/vfs_aio_fork.c index 961efd85de2..8e47531ebd1 100644 --- a/source3/modules/vfs_aio_fork.c +++ b/source3/modules/vfs_aio_fork.c @@ -23,11 +23,28 @@ #include "system/filesys.h" #include "system/shmem.h" #include "smbd/smbd.h" +#include "smbd/globals.h" +#include "lib/async_req/async_sock.h" +#include "lib/util/tevent_unix.h" +#include "lib/util/sys_rw.h" +#include "lib/util/sys_rw_data.h" +#include "lib/util/msghdr.h" +#include "smbprofile.h" + +#if !defined(HAVE_STRUCT_MSGHDR_MSG_CONTROL) && !defined(HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS) +# error Can not pass file descriptors +#endif + +#undef recvmsg #ifndef MAP_FILE #define MAP_FILE 0 #endif +struct aio_fork_config { + bool erratic_testing_mode; +}; + struct mmap_area { size_t size; volatile void *ptr; @@ -35,7 +52,7 @@ struct mmap_area { static int mmap_area_destructor(struct mmap_area *area) { - munmap((void *)area->ptr, area->size); + munmap(discard_const(area->ptr), area->size); return 0; } @@ -59,13 +76,12 @@ static struct mmap_area *mmap_area_init(TALLOC_CTX *mem_ctx, size_t size) result->ptr = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE, fd, 0); + close(fd); if (result->ptr == MAP_FAILED) { DEBUG(1, ("mmap failed: %s\n", strerror(errno))); goto fail; } - close(fd); - result->size = size; talloc_set_destructor(result, mmap_area_destructor); @@ -76,15 +92,44 @@ fail: return NULL; } +enum cmd_type { + READ_CMD, + WRITE_CMD, + FSYNC_CMD +}; + +static const char *cmd_type_str(enum cmd_type cmd) +{ + const char *result; + + switch (cmd) { + case READ_CMD: + result = "READ"; + break; + case WRITE_CMD: + result = "WRITE"; + break; + case FSYNC_CMD: + result = "FSYNC"; + break; + default: + result = ""; + break; + } + return result; +} + struct rw_cmd { size_t n; - SMB_OFF_T offset; - bool read_cmd; + off_t offset; + enum cmd_type cmd; + bool erratic_testing_mode; }; struct rw_ret { ssize_t size; int ret_errno; + uint64_t duration; }; struct aio_child_list; @@ -92,20 +137,16 @@ struct aio_child_list; struct aio_child { struct aio_child *prev, *next; struct aio_child_list *list; - SMB_STRUCT_AIOCB *aiocb; pid_t pid; int sockfd; - struct fd_event *sock_event; - struct rw_ret retval; - struct mmap_area *map; /* ==NULL means write request */ + struct mmap_area *map; bool dont_delete; /* Marked as in use since last cleanup */ - bool cancelled; - bool read_cmd; + bool busy; }; struct aio_child_list { struct aio_child *children; - struct timed_event *cleanup_event; + struct tevent_timer *cleanup_event; }; static void free_aio_children(void **p) @@ -115,115 +156,72 @@ static void free_aio_children(void **p) static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd) { - struct msghdr msg; struct iovec iov[1]; + struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1 }; ssize_t n; -#ifndef HAVE_MSGHDR_MSG_CONTROL - int newfd; -#endif + size_t bufsize = msghdr_prep_recv_fds(NULL, NULL, 0, 1); + uint8_t buf[bufsize]; -#ifdef HAVE_MSGHDR_MSG_CONTROL - union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; - } control_un; - struct cmsghdr *cmptr; - - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); -#else -#if HAVE_MSGHDR_MSG_ACCTRIGHTS - msg.msg_accrights = (caddr_t) &newfd; - msg.msg_accrightslen = sizeof(int); -#else -#error Can not pass file descriptors -#endif -#endif - - msg.msg_name = NULL; - msg.msg_namelen = 0; + msghdr_prep_recv_fds(&msg, buf, bufsize, 1); iov[0].iov_base = (void *)ptr; iov[0].iov_len = nbytes; - msg.msg_iov = iov; - msg.msg_iovlen = 1; - if ( (n = recvmsg(fd, &msg, 0)) <= 0) { - return(n); + do { + n = recvmsg(fd, &msg, 0); + } while ((n == -1) && (errno == EINTR)); + + if (n <= 0) { + return n; } -#ifdef HAVE_MSGHDR_MSG_CONTROL - if ((cmptr = CMSG_FIRSTHDR(&msg)) != NULL - && cmptr->cmsg_len == CMSG_LEN(sizeof(int))) { - if (cmptr->cmsg_level != SOL_SOCKET) { - DEBUG(10, ("control level != SOL_SOCKET")); - errno = EINVAL; - return -1; - } - if (cmptr->cmsg_type != SCM_RIGHTS) { - DEBUG(10, ("control type != SCM_RIGHTS")); - errno = EINVAL; - return -1; + { + size_t num_fds = msghdr_extract_fds(&msg, NULL, 0); + int fds[num_fds]; + + msghdr_extract_fds(&msg, fds, num_fds); + + if (num_fds != 1) { + size_t i; + + for (i=0; icmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - *((int *) CMSG_DATA(cmptr)) = sendfd; -#else - ZERO_STRUCT(msg); - msg.msg_accrights = (caddr_t) &sendfd; - msg.msg_accrightslen = sizeof(int); -#endif + struct msghdr msg = {0}; + size_t bufsize = msghdr_prep_fds(NULL, NULL, 0, &sendfd, 1); + uint8_t buf[bufsize]; + struct iovec iov; + ssize_t sent; - msg.msg_name = NULL; - msg.msg_namelen = 0; + msghdr_prep_fds(&msg, buf, bufsize, &sendfd, 1); - ZERO_STRUCT(iov); - iov[0].iov_base = (void *)ptr; - iov[0].iov_len = nbytes; - msg.msg_iov = iov; + iov.iov_base = (void *)ptr; + iov.iov_len = nbytes; + msg.msg_iov = &iov; msg.msg_iovlen = 1; - return (sendmsg(fd, &msg, 0)); + do { + sent = sendmsg(fd, &msg, 0); + } while ((sent == -1) && (errno == EINTR)); + + return sent; } -static void aio_child_cleanup(struct event_context *event_ctx, - struct timed_event *te, +static void aio_child_cleanup(struct tevent_context *event_ctx, + struct tevent_timer *te, struct timeval now, void *private_data) { @@ -236,7 +234,7 @@ static void aio_child_cleanup(struct event_context *event_ctx, for (child = list->children; child != NULL; child = next) { next = child->next; - if (child->aiocb != NULL) { + if (child->busy) { DEBUG(10, ("child %d currently active\n", (int)child->pid)); continue; @@ -260,7 +258,7 @@ static void aio_child_cleanup(struct event_context *event_ctx, /* * Re-schedule the next cleanup round */ - list->cleanup_event = event_add_timed(server_event_context(), list, + list->cleanup_event = tevent_add_timer(server_event_context(), list, timeval_add(&now, 30, 0), aio_child_cleanup, list); @@ -290,7 +288,7 @@ static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle */ if (data->cleanup_event == NULL) { - data->cleanup_event = event_add_timed(server_event_context(), data, + data->cleanup_event = tevent_add_timer(server_event_context(), data, timeval_current_ofs(30, 0), aio_child_cleanup, data); if (data->cleanup_event == NULL) { @@ -314,6 +312,7 @@ static void aio_child_loop(int sockfd, struct mmap_area *map) ssize_t ret; struct rw_cmd cmd_struct; struct rw_ret ret_struct; + struct timespec start, end; ret = read_fd(sockfd, &cmd_struct, sizeof(cmd_struct), &fd); if (ret != sizeof(cmd_struct)) { @@ -323,13 +322,12 @@ static void aio_child_loop(int sockfd, struct mmap_area *map) } DEBUG(10, ("aio_child_loop: %s %d bytes at %d from fd %d\n", - cmd_struct.read_cmd ? "read" : "write", + cmd_type_str(cmd_struct.cmd), (int)cmd_struct.n, (int)cmd_struct.offset, fd)); -#ifdef ENABLE_BUILD_FARM_HACKS - { + if (cmd_struct.erratic_testing_mode) { /* - * In the build farm, we want erratic behaviour for + * For developer testing, we want erratic behaviour for * async I/O times */ uint8_t randval; @@ -343,28 +341,38 @@ static void aio_child_loop(int sockfd, struct mmap_area *map) DEBUG(10, ("delaying for %u msecs\n", msecs)); smb_msleep(msecs); } -#endif - ZERO_STRUCT(ret_struct); - if (cmd_struct.read_cmd) { + PROFILE_TIMESTAMP(&start); + + switch (cmd_struct.cmd) { + case READ_CMD: ret_struct.size = sys_pread( - fd, (void *)map->ptr, cmd_struct.n, + fd, discard_const(map->ptr), cmd_struct.n, cmd_struct.offset); #if 0 /* This breaks "make test" when run with aio_fork module. */ -#ifdef ENABLE_BUILD_FARM_HACKS +#ifdef DEVELOPER ret_struct.size = MAX(1, ret_struct.size * 0.9); #endif #endif - } - else { + break; + case WRITE_CMD: ret_struct.size = sys_pwrite( - fd, (void *)map->ptr, cmd_struct.n, + fd, discard_const(map->ptr), cmd_struct.n, cmd_struct.offset); + break; + case FSYNC_CMD: + ret_struct.size = fsync(fd); + break; + default: + ret_struct.size = -1; + errno = EINVAL; } + PROFILE_TIMESTAMP(&end); + ret_struct.duration = nsec_time_diff(&end, &start); DEBUG(10, ("aio_child_loop: syscall returned %d\n", (int)ret_struct.size)); @@ -390,66 +398,20 @@ static void aio_child_loop(int sockfd, struct mmap_area *map) } } -static void handle_aio_completion(struct event_context *event_ctx, - struct fd_event *event, uint16 flags, - void *p) -{ - struct aio_extra *aio_ex = NULL; - struct aio_child *child = (struct aio_child *)p; - NTSTATUS status; - - DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags)); - - if ((flags & EVENT_FD_READ) == 0) { - return; - } - - status = read_data(child->sockfd, (char *)&child->retval, - sizeof(child->retval)); - - if (!NT_STATUS_IS_OK(status)) { - DEBUG(1, ("aio child %d died: %s\n", (int)child->pid, - nt_errstr(status))); - child->retval.size = -1; - child->retval.ret_errno = EIO; - } - - if (child->aiocb == NULL) { - DEBUG(1, ("Inactive child died\n")); - TALLOC_FREE(child); - return; - } - - if (child->cancelled) { - child->aiocb = NULL; - child->cancelled = false; - return; - } - - if (child->read_cmd && (child->retval.size > 0)) { - SMB_ASSERT(child->retval.size <= child->aiocb->aio_nbytes); - memcpy((void *)child->aiocb->aio_buf, (void *)child->map->ptr, - child->retval.size); - } - - aio_ex = (struct aio_extra *)child->aiocb->aio_sigevent.sigev_value.sival_ptr; - smbd_aio_complete_aio_ex(aio_ex); -} - static int aio_child_destructor(struct aio_child *child) { char c=0; - SMB_ASSERT((child->aiocb == NULL) || child->cancelled); + SMB_ASSERT(!child->busy); DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n", - child->pid, child->sockfd)); + (int)child->pid, child->sockfd)); /* * closing the sockfd makes the child not return from recvmsg() on RHEL * 5.5 so instead force the child to exit by writing bad data to it */ - write(child->sockfd, &c, sizeof(c)); + sys_write_v(child->sockfd, &c, sizeof(c)); close(child->sockfd); DLIST_REMOVE(child->list->children, child); return 0; @@ -470,22 +432,24 @@ static struct files_struct *close_fsp_fd(struct files_struct *fsp, return NULL; } -static NTSTATUS create_aio_child(struct smbd_server_connection *sconn, - struct aio_child_list *children, - size_t map_size, - struct aio_child **presult) +static int create_aio_child(struct smbd_server_connection *sconn, + struct aio_child_list *children, + size_t map_size, + struct aio_child **presult) { struct aio_child *result; int fdpair[2]; - NTSTATUS status; + int ret; fdpair[0] = fdpair[1] = -1; result = talloc_zero(children, struct aio_child); - NT_STATUS_HAVE_NO_MEMORY(result); + if (result == NULL) { + return ENOMEM; + } if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) { - status = map_nt_error_from_unix(errno); + ret = errno; DEBUG(10, ("socketpair() failed: %s\n", strerror(errno))); goto fail; } @@ -494,14 +458,14 @@ static NTSTATUS create_aio_child(struct smbd_server_connection *sconn, result->map = mmap_area_init(result, map_size); if (result->map == NULL) { - status = map_nt_error_from_unix(errno); + ret = errno; DEBUG(0, ("Could not create mmap area\n")); goto fail; } - result->pid = sys_fork(); + result->pid = fork(); if (result->pid == -1) { - status = map_nt_error_from_unix(errno); + ret = errno; DEBUG(0, ("fork failed: %s\n", strerror(errno))); goto fail; } @@ -514,21 +478,11 @@ static NTSTATUS create_aio_child(struct smbd_server_connection *sconn, } DEBUG(10, ("Child %d created with sockfd %d\n", - result->pid, fdpair[0])); + (int)result->pid, fdpair[0])); result->sockfd = fdpair[0]; close(fdpair[1]); - result->sock_event = event_add_fd(server_event_context(), result, - result->sockfd, EVENT_FD_READ, - handle_aio_completion, - result); - if (result->sock_event == NULL) { - status = NT_STATUS_NO_MEMORY; - DEBUG(0, ("event_add_fd failed\n")); - goto fail; - } - result->list = children; DLIST_ADD(children->children, result); @@ -536,359 +490,437 @@ static NTSTATUS create_aio_child(struct smbd_server_connection *sconn, *presult = result; - return NT_STATUS_OK; + return 0; fail: if (fdpair[0] != -1) close(fdpair[0]); if (fdpair[1] != -1) close(fdpair[1]); TALLOC_FREE(result); - return status; + return ret; } -static NTSTATUS get_idle_child(struct vfs_handle_struct *handle, - struct aio_child **pchild) +static int get_idle_child(struct vfs_handle_struct *handle, + struct aio_child **pchild) { struct aio_child_list *children; struct aio_child *child; - NTSTATUS status; children = init_aio_children(handle); if (children == NULL) { - return NT_STATUS_NO_MEMORY; + return ENOMEM; } for (child = children->children; child != NULL; child = child->next) { - if (child->aiocb == NULL) { - /* idle */ + if (!child->busy) { break; } } if (child == NULL) { + int ret; + DEBUG(10, ("no idle child found, creating new one\n")); - status = create_aio_child(handle->conn->sconn, children, + ret = create_aio_child(handle->conn->sconn, children, 128*1024, &child); - if (!NT_STATUS_IS_OK(status)) { + if (ret != 0) { DEBUG(10, ("create_aio_child failed: %s\n", - nt_errstr(status))); - return status; + strerror(errno))); + return ret; } } child->dont_delete = true; + child->busy = true; *pchild = child; - return NT_STATUS_OK; + return 0; } -static int aio_fork_read(struct vfs_handle_struct *handle, - struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb) -{ +struct aio_fork_pread_state { struct aio_child *child; - struct rw_cmd cmd; ssize_t ret; - NTSTATUS status; + struct vfs_aio_state vfs_aio_state; +}; - if (aiocb->aio_nbytes > 128*1024) { - /* TODO: support variable buffers */ - errno = EINVAL; - return -1; +static void aio_fork_pread_done(struct tevent_req *subreq); + +static struct tevent_req *aio_fork_pread_send(struct vfs_handle_struct *handle, + TALLOC_CTX *mem_ctx, + struct tevent_context *ev, + struct files_struct *fsp, + void *data, + size_t n, off_t offset) +{ + struct tevent_req *req, *subreq; + struct aio_fork_pread_state *state; + struct rw_cmd cmd; + ssize_t written; + int err; + struct aio_fork_config *config; + + SMB_VFS_HANDLE_GET_DATA(handle, config, + struct aio_fork_config, + return NULL); + + req = tevent_req_create(mem_ctx, &state, struct aio_fork_pread_state); + if (req == NULL) { + return NULL; } - status = get_idle_child(handle, &child); - if (!NT_STATUS_IS_OK(status)) { - DEBUG(10, ("Could not get an idle child\n")); - return -1; + if (n > 128*1024) { + /* TODO: support variable buffers */ + tevent_req_error(req, EINVAL); + return tevent_req_post(req, ev); } - child->read_cmd = true; - child->aiocb = aiocb; - child->retval.ret_errno = EINPROGRESS; + err = get_idle_child(handle, &state->child); + if (err != 0) { + tevent_req_error(req, err); + return tevent_req_post(req, ev); + } ZERO_STRUCT(cmd); - cmd.n = aiocb->aio_nbytes; - cmd.offset = aiocb->aio_offset; - cmd.read_cmd = child->read_cmd; + cmd.n = n; + cmd.offset = offset; + cmd.cmd = READ_CMD; + cmd.erratic_testing_mode = config->erratic_testing_mode; DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd, - (int)child->pid)); + (int)state->child->pid)); - ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd); - if (ret == -1) { - DEBUG(10, ("write_fd failed: %s\n", strerror(errno))); - return -1; + /* + * Not making this async. We're writing into an empty unix + * domain socket. This should never block. + */ + written = write_fd(state->child->sockfd, &cmd, sizeof(cmd), + fsp->fh->fd); + if (written == -1) { + err = errno; + + TALLOC_FREE(state->child); + + DEBUG(10, ("write_fd failed: %s\n", strerror(err))); + tevent_req_error(req, err); + return tevent_req_post(req, ev); } - return 0; + subreq = read_packet_send(state, ev, state->child->sockfd, + sizeof(struct rw_ret), NULL, NULL); + if (tevent_req_nomem(subreq, req)) { + TALLOC_FREE(state->child); /* we sent sth down */ + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, aio_fork_pread_done, req); + return req; } -static int aio_fork_write(struct vfs_handle_struct *handle, - struct files_struct *fsp, SMB_STRUCT_AIOCB *aiocb) +static void aio_fork_pread_done(struct tevent_req *subreq) { - struct aio_child *child; - struct rw_cmd cmd; - ssize_t ret; - NTSTATUS status; + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct aio_fork_pread_state *state = tevent_req_data( + req, struct aio_fork_pread_state); + ssize_t nread; + uint8_t *buf; + int err; + struct rw_ret *retbuf; + + nread = read_packet_recv(subreq, talloc_tos(), &buf, &err); + TALLOC_FREE(subreq); + if (nread == -1) { + TALLOC_FREE(state->child); + tevent_req_error(req, err); + return; + } - if (aiocb->aio_nbytes > 128*1024) { - /* TODO: support variable buffers */ - errno = EINVAL; + state->child->busy = false; + + retbuf = (struct rw_ret *)buf; + state->ret = retbuf->size; + state->vfs_aio_state.error = retbuf->ret_errno; + state->vfs_aio_state.duration = retbuf->duration; + tevent_req_done(req); +} + +static ssize_t aio_fork_pread_recv(struct tevent_req *req, + struct vfs_aio_state *vfs_aio_state) +{ + struct aio_fork_pread_state *state = tevent_req_data( + req, struct aio_fork_pread_state); + + if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) { return -1; } + *vfs_aio_state = state->vfs_aio_state; + return state->ret; +} - status = get_idle_child(handle, &child); - if (!NT_STATUS_IS_OK(status)) { - DEBUG(10, ("Could not get an idle child\n")); - return -1; +struct aio_fork_pwrite_state { + struct aio_child *child; + ssize_t ret; + struct vfs_aio_state vfs_aio_state; +}; + +static void aio_fork_pwrite_done(struct tevent_req *subreq); + +static struct tevent_req *aio_fork_pwrite_send( + struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx, + struct tevent_context *ev, struct files_struct *fsp, + const void *data, size_t n, off_t offset) +{ + struct tevent_req *req, *subreq; + struct aio_fork_pwrite_state *state; + struct rw_cmd cmd; + ssize_t written; + int err; + struct aio_fork_config *config; + SMB_VFS_HANDLE_GET_DATA(handle, config, + struct aio_fork_config, + return NULL); + + req = tevent_req_create(mem_ctx, &state, struct aio_fork_pwrite_state); + if (req == NULL) { + return NULL; } - child->read_cmd = false; - child->aiocb = aiocb; - child->retval.ret_errno = EINPROGRESS; + if (n > 128*1024) { + /* TODO: support variable buffers */ + tevent_req_error(req, EINVAL); + return tevent_req_post(req, ev); + } - memcpy((void *)child->map->ptr, (void *)aiocb->aio_buf, - aiocb->aio_nbytes); + err = get_idle_child(handle, &state->child); + if (err != 0) { + tevent_req_error(req, err); + return tevent_req_post(req, ev); + } ZERO_STRUCT(cmd); - cmd.n = aiocb->aio_nbytes; - cmd.offset = aiocb->aio_offset; - cmd.read_cmd = child->read_cmd; + cmd.n = n; + cmd.offset = offset; + cmd.cmd = WRITE_CMD; + cmd.erratic_testing_mode = config->erratic_testing_mode; DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd, - (int)child->pid)); + (int)state->child->pid)); - ret = write_fd(child->sockfd, &cmd, sizeof(cmd), fsp->fh->fd); - if (ret == -1) { - DEBUG(10, ("write_fd failed: %s\n", strerror(errno))); - return -1; + /* + * Not making this async. We're writing into an empty unix + * domain socket. This should never block. + */ + written = write_fd(state->child->sockfd, &cmd, sizeof(cmd), + fsp->fh->fd); + if (written == -1) { + err = errno; + + TALLOC_FREE(state->child); + + DEBUG(10, ("write_fd failed: %s\n", strerror(err))); + tevent_req_error(req, err); + return tevent_req_post(req, ev); } - return 0; + subreq = read_packet_send(state, ev, state->child->sockfd, + sizeof(struct rw_ret), NULL, NULL); + if (tevent_req_nomem(subreq, req)) { + TALLOC_FREE(state->child); /* we sent sth down */ + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, aio_fork_pwrite_done, req); + return req; } -static struct aio_child *aio_fork_find_child(struct vfs_handle_struct *handle, - SMB_STRUCT_AIOCB *aiocb) +static void aio_fork_pwrite_done(struct tevent_req *subreq) { - struct aio_child_list *children; - struct aio_child *child; - - children = init_aio_children(handle); - if (children == NULL) { - return NULL; + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct aio_fork_pwrite_state *state = tevent_req_data( + req, struct aio_fork_pwrite_state); + ssize_t nread; + uint8_t *buf; + int err; + struct rw_ret *retbuf; + + nread = read_packet_recv(subreq, talloc_tos(), &buf, &err); + TALLOC_FREE(subreq); + if (nread == -1) { + TALLOC_FREE(state->child); + tevent_req_error(req, err); + return; } - for (child = children->children; child != NULL; child = child->next) { - if (child->aiocb == aiocb) { - return child; - } - } + state->child->busy = false; - return NULL; + retbuf = (struct rw_ret *)buf; + state->ret = retbuf->size; + state->vfs_aio_state.error = retbuf->ret_errno; + state->vfs_aio_state.duration = retbuf->duration; + tevent_req_done(req); } -static ssize_t aio_fork_return_fn(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) +static ssize_t aio_fork_pwrite_recv(struct tevent_req *req, + struct vfs_aio_state *vfs_aio_state) { - struct aio_child *child = aio_fork_find_child(handle, aiocb); + struct aio_fork_pwrite_state *state = tevent_req_data( + req, struct aio_fork_pwrite_state); - if (child == NULL) { - errno = EINVAL; - DEBUG(0, ("returning EINVAL\n")); + if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) { return -1; } + *vfs_aio_state = state->vfs_aio_state; + return state->ret; +} - child->aiocb = NULL; - - if (child->retval.size == -1) { - errno = child->retval.ret_errno; - } +struct aio_fork_fsync_state { + struct aio_child *child; + ssize_t ret; + struct vfs_aio_state vfs_aio_state; +}; - return child->retval.size; -} +static void aio_fork_fsync_done(struct tevent_req *subreq); -static int aio_fork_cancel(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) +static struct tevent_req *aio_fork_fsync_send( + struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx, + struct tevent_context *ev, struct files_struct *fsp) { - struct aio_child_list *children; - struct aio_child *child; + struct tevent_req *req, *subreq; + struct aio_fork_fsync_state *state; + struct rw_cmd cmd; + ssize_t written; + int err; + struct aio_fork_config *config; - children = init_aio_children(handle); - if (children == NULL) { - errno = EINVAL; - return -1; + SMB_VFS_HANDLE_GET_DATA(handle, config, + struct aio_fork_config, + return NULL); + + req = tevent_req_create(mem_ctx, &state, struct aio_fork_fsync_state); + if (req == NULL) { + return NULL; } - for (child = children->children; child != NULL; child = child->next) { - if (child->aiocb == NULL) { - continue; - } - if (child->aiocb->aio_fildes != fsp->fh->fd) { - continue; - } - if ((aiocb != NULL) && (child->aiocb != aiocb)) { - continue; - } + err = get_idle_child(handle, &state->child); + if (err != 0) { + tevent_req_error(req, err); + return tevent_req_post(req, ev); + } - /* - * We let the child do its job, but we discard the result when - * it's finished. - */ + ZERO_STRUCT(cmd); + cmd.cmd = FSYNC_CMD; + cmd.erratic_testing_mode = config->erratic_testing_mode; - child->cancelled = true; - } + DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd, + (int)state->child->pid)); - return AIO_CANCELED; -} + /* + * Not making this async. We're writing into an empty unix + * domain socket. This should never block. + */ + written = write_fd(state->child->sockfd, &cmd, sizeof(cmd), + fsp->fh->fd); + if (written == -1) { + err = errno; -static int aio_fork_error_fn(struct vfs_handle_struct *handle, - struct files_struct *fsp, - SMB_STRUCT_AIOCB *aiocb) -{ - struct aio_child *child = aio_fork_find_child(handle, aiocb); + TALLOC_FREE(state->child); - if (child == NULL) { - errno = EINVAL; - return -1; + DEBUG(10, ("write_fd failed: %s\n", strerror(err))); + tevent_req_error(req, err); + return tevent_req_post(req, ev); } - return child->retval.ret_errno; -} - -static void aio_fork_suspend_timed_out(struct tevent_context *event_ctx, - struct tevent_timer *te, - struct timeval now, - void *private_data) -{ - bool *timed_out = (bool *)private_data; - /* Remove this timed event handler. */ - TALLOC_FREE(te); - *timed_out = true; + subreq = read_packet_send(state, ev, state->child->sockfd, + sizeof(struct rw_ret), NULL, NULL); + if (tevent_req_nomem(subreq, req)) { + TALLOC_FREE(state->child); /* we sent sth down */ + return tevent_req_post(req, ev); + } + tevent_req_set_callback(subreq, aio_fork_fsync_done, req); + return req; } -static int aio_fork_suspend(struct vfs_handle_struct *handle, - struct files_struct *fsp, - const SMB_STRUCT_AIOCB * const aiocb_array[], - int n, - const struct timespec *timeout) +static void aio_fork_fsync_done(struct tevent_req *subreq) { - struct aio_child_list *children = NULL; - TALLOC_CTX *frame = talloc_stackframe(); - struct event_context *ev = NULL; - int i; - int ret = -1; - bool timed_out = false; - - children = init_aio_children(handle); - if (children == NULL) { - errno = EINVAL; - goto out; - } - - /* This is a blocking call, and has to use a sub-event loop. */ - ev = event_context_init(frame); - if (ev == NULL) { - errno = ENOMEM; - goto out; - } - - if (timeout) { - struct timeval tv = convert_timespec_to_timeval(*timeout); - struct tevent_timer *te = tevent_add_timer(ev, - frame, - timeval_current_ofs(tv.tv_sec, - tv.tv_usec), - aio_fork_suspend_timed_out, - &timed_out); - if (!te) { - errno = ENOMEM; - goto out; - } + struct tevent_req *req = tevent_req_callback_data( + subreq, struct tevent_req); + struct aio_fork_fsync_state *state = tevent_req_data( + req, struct aio_fork_fsync_state); + ssize_t nread; + uint8_t *buf; + int err; + struct rw_ret *retbuf; + + nread = read_packet_recv(subreq, talloc_tos(), &buf, &err); + TALLOC_FREE(subreq); + if (nread == -1) { + TALLOC_FREE(state->child); + tevent_req_error(req, err); + return; } - for (i = 0; i < n; i++) { - struct aio_child *child = NULL; - const SMB_STRUCT_AIOCB *aiocb = aiocb_array[i]; + state->child->busy = false; - if (!aiocb) { - continue; - } + retbuf = (struct rw_ret *)buf; + state->ret = retbuf->size; + state->vfs_aio_state.error = retbuf->ret_errno; + state->vfs_aio_state.duration = retbuf->duration; + tevent_req_done(req); +} - /* - * We're going to cheat here. We know that smbd/aio.c - * only calls this when it's waiting for every single - * outstanding call to finish on a close, so just wait - * individually for each IO to complete. We don't care - * what order they finish - only that they all do. JRA. - */ +static int aio_fork_fsync_recv(struct tevent_req *req, + struct vfs_aio_state *vfs_aio_state) +{ + struct aio_fork_fsync_state *state = tevent_req_data( + req, struct aio_fork_fsync_state); - for (child = children->children; child != NULL; child = child->next) { - if (child->aiocb == NULL) { - continue; - } - if (child->aiocb->aio_fildes != fsp->fh->fd) { - continue; - } - if (child->aiocb != aiocb) { - continue; - } + if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) { + return -1; + } + *vfs_aio_state = state->vfs_aio_state; + return state->ret; +} - if (child->aiocb->aio_sigevent.sigev_value.sival_ptr == NULL) { - continue; - } +static int aio_fork_connect(vfs_handle_struct *handle, const char *service, + const char *user) +{ + int ret; + struct aio_fork_config *config; + ret = SMB_VFS_NEXT_CONNECT(handle, service, user); - /* We're never using this event on the - * main event context again... */ - TALLOC_FREE(child->sock_event); - - child->sock_event = event_add_fd(ev, - child, - child->sockfd, - EVENT_FD_READ, - handle_aio_completion, - child); - - while (1) { - if (tevent_loop_once(ev) == -1) { - goto out; - } - - if (timed_out) { - errno = EAGAIN; - goto out; - } - - /* We set child->aiocb to NULL in our hooked - * AIO_RETURN(). */ - if (child->aiocb == NULL) { - break; - } - } - } + if (ret < 0) { + return ret; } - ret = 0; + config = talloc_zero(handle->conn, struct aio_fork_config); + if (!config) { + SMB_VFS_NEXT_DISCONNECT(handle); + DEBUG(0, ("talloc_zero() failed\n")); + return -1; + } - out: + config->erratic_testing_mode = lp_parm_bool(SNUM(handle->conn), "vfs_aio_fork", + "erratic_testing_mode", false); + + SMB_VFS_HANDLE_SET_DATA(handle, config, + NULL, struct aio_fork_config, + return -1); - TALLOC_FREE(frame); - return ret; + return 0; } static struct vfs_fn_pointers vfs_aio_fork_fns = { - .aio_read = aio_fork_read, - .aio_write = aio_fork_write, - .aio_return_fn = aio_fork_return_fn, - .aio_cancel = aio_fork_cancel, - .aio_error_fn = aio_fork_error_fn, - .aio_suspend = aio_fork_suspend, + .connect_fn = aio_fork_connect, + .pread_send_fn = aio_fork_pread_send, + .pread_recv_fn = aio_fork_pread_recv, + .pwrite_send_fn = aio_fork_pwrite_send, + .pwrite_recv_fn = aio_fork_pwrite_recv, + .fsync_send_fn = aio_fork_fsync_send, + .fsync_recv_fn = aio_fork_fsync_recv, }; -NTSTATUS vfs_aio_fork_init(void); -NTSTATUS vfs_aio_fork_init(void) +NTSTATUS vfs_aio_fork_init(TALLOC_CTX *); +NTSTATUS vfs_aio_fork_init(TALLOC_CTX *ctx) { return smb_register_vfs(SMB_VFS_INTERFACE_VERSION, "aio_fork", &vfs_aio_fork_fns);