* Simulate the Posix AIO using mmap/fork
*
* Copyright (C) Volker Lendecke 2008
+ * Copyright (C) Jeremy Allison 2010
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
*/
#include "includes.h"
+#include "system/filesys.h"
+#include "system/shmem.h"
+#include "smbd/smbd.h"
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
struct mmap_area {
size_t size;
goto fail;
}
+ close(fd);
+
result->size = size;
talloc_set_destructor(result, mmap_area_destructor);
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov[0].iov_base = ptr;
+ iov[0].iov_base = (void *)ptr;
iov[0].iov_len = nbytes;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
msg.msg_namelen = 0;
ZERO_STRUCT(iov);
- iov[0].iov_base = ptr;
+ iov[0].iov_base = (void *)ptr;
iov[0].iov_len = nbytes;
msg.msg_iov = iov;
msg.msg_iovlen = 1;
static void aio_child_cleanup(struct event_context *event_ctx,
struct timed_event *te,
- const struct timeval *now,
+ struct timeval now,
void *private_data)
{
struct aio_child_list *list = talloc_get_type_abort(
"deleting\n", (int)child->pid));
TALLOC_FREE(child);
+ child = next;
}
if (list->children != NULL) {
/*
* Re-schedule the next cleanup round
*/
- list->cleanup_event = event_add_timed(smbd_event_context(), list,
- timeval_add(now, 30, 0),
- "aio_child_cleanup",
+ list->cleanup_event = event_add_timed(server_event_context(), list,
+ timeval_add(&now, 30, 0),
aio_child_cleanup, list);
}
}
if (data == NULL) {
- data = TALLOC_ZERO_P(NULL, struct aio_child_list);
+ data = talloc_zero(NULL, struct aio_child_list);
if (data == NULL) {
return NULL;
}
*/
if (data->cleanup_event == NULL) {
- data->cleanup_event = event_add_timed(smbd_event_context(), data,
+ data->cleanup_event = event_add_timed(server_event_context(), data,
timeval_current_ofs(30, 0),
- "aio_child_cleanup",
aio_child_cleanup, data);
if (data->cleanup_event == NULL) {
TALLOC_FREE(data);
ret_struct.size = sys_pread(
fd, (void *)map->ptr, cmd_struct.n,
cmd_struct.offset);
+#if 0
+/* This breaks "make test" when run with aio_fork module. */
+#ifdef ENABLE_BUILD_FARM_HACKS
+ ret_struct.size = MAX(1, ret_struct.size * 0.9);
+#endif
+#endif
}
else {
ret_struct.size = sys_pwrite(
ret_struct.ret_errno = errno;
}
+ /*
+ * Close the fd before telling our parent we're done. The
+ * parent might close and re-open the file very quickly, and
+ * with system-level share modes (GPFS) we would get an
+ * unjustified SHARING_VIOLATION.
+ */
+ close(fd);
+
ret = write_data(sockfd, (char *)&ret_struct,
sizeof(ret_struct));
if (ret != sizeof(ret_struct)) {
strerror(errno)));
exit(2);
}
-
- close(fd);
}
}
struct fd_event *event, uint16 flags,
void *p)
{
+ struct aio_extra *aio_ex = NULL;
struct aio_child *child = (struct aio_child *)p;
- uint16 mid;
+ NTSTATUS status;
DEBUG(10, ("handle_aio_completion called with flags=%d\n", flags));
return;
}
- if (!NT_STATUS_IS_OK(read_data(child->sockfd,
- (char *)&child->retval,
- sizeof(child->retval)))) {
- DEBUG(0, ("aio child %d died\n", (int)child->pid));
+ status = read_data(child->sockfd, (char *)&child->retval,
+ sizeof(child->retval));
+
+ if (!NT_STATUS_IS_OK(status)) {
+ DEBUG(1, ("aio child %d died: %s\n", (int)child->pid,
+ nt_errstr(status)));
child->retval.size = -1;
child->retval.ret_errno = EIO;
}
+ if (child->aiocb == NULL) {
+ DEBUG(1, ("Inactive child died\n"));
+ TALLOC_FREE(child);
+ return;
+ }
+
if (child->cancelled) {
child->aiocb = NULL;
child->cancelled = false;
child->retval.size);
}
- mid = child->aiocb->aio_sigevent.sigev_value.sival_int;
-
- DEBUG(10, ("mid %d finished\n", (int)mid));
-
- aio_request_done(mid);
- process_aio_queue();
+ aio_ex = (struct aio_extra *)child->aiocb->aio_sigevent.sigev_value.sival_ptr;
+ smbd_aio_complete_aio_ex(aio_ex);
}
+/*
+ * Destructor for a struct aio_child (presumably installed with
+ * talloc_set_destructor() where the child is created -- not visible here).
+ *
+ * Pokes the helper process with a single byte so that it exits, closes our
+ * end of the socket and unlinks the child from its owner's list.
+ * Asserts that no request is in flight unless it has been cancelled.
+ *
+ * Always returns 0.
+ */
static int aio_child_destructor(struct aio_child *child)
{
+	char c = 0;
+
	SMB_ASSERT((child->aiocb == NULL) || child->cancelled);
+
+	/* pid_t is not guaranteed to be an int: cast it for the %d format */
+	DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n",
+		   (int)child->pid, child->sockfd));
+
+	/*
+	 * closing the sockfd makes the child not return from recvmsg() on RHEL
+	 * 5.5 so instead force the child to exit by writing bad data to it
+	 */
+	if (write(child->sockfd, &c, sizeof(c)) != (ssize_t)sizeof(c)) {
+		/* Best effort only: the child may already be gone. */
+		DEBUG(10, ("aio_child_destructor: could not notify child %d: "
+			   "%s\n", (int)child->pid, strerror(errno)));
+	}
	close(child->sockfd);
	DLIST_REMOVE(child->list->children, child);
	return 0;
}
-static NTSTATUS create_aio_child(struct aio_child_list *children,
+/*
+ * files_forall() callback used by a freshly forked aio child.
+ *
+ * The child must not keep any of the parent's open-file descriptors:
+ * holding one could wrongly pin a system level share mode on that file.
+ * Close the descriptor (if there is one) and mark it as closed.
+ *
+ * private_data is unused.  Always returns NULL.
+ */
+
+static struct files_struct *close_fsp_fd(struct files_struct *fsp,
+					 void *private_data)
+{
+	/* Nothing to do without a file handle ... */
+	if (fsp->fh == NULL) {
+		return NULL;
+	}
+	/* ... or when the handle has no open descriptor. */
+	if (fsp->fh->fd == -1) {
+		return NULL;
+	}
+
+	close(fsp->fh->fd);
+	fsp->fh->fd = -1;
+	return NULL;
+}
+
+static NTSTATUS create_aio_child(struct smbd_server_connection *sconn,
+ struct aio_child_list *children,
size_t map_size,
struct aio_child **presult)
{
fdpair[0] = fdpair[1] = -1;
- result = TALLOC_ZERO_P(children, struct aio_child);
+ result = talloc_zero(children, struct aio_child);
NT_STATUS_HAVE_NO_MEMORY(result);
if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
status = map_nt_error_from_unix(errno);
DEBUG(10, ("socketpair() failed: %s\n", strerror(errno)));
- TALLOC_FREE(result);
goto fail;
}
if (result->pid == 0) {
close(fdpair[0]);
result->sockfd = fdpair[1];
+ files_forall(sconn, close_fsp_fd, NULL);
aio_child_loop(result->sockfd, result->map);
}
- DEBUG(10, ("Child %d created\n", result->pid));
+ DEBUG(10, ("Child %d created with sockfd %d\n",
+ result->pid, fdpair[0]));
result->sockfd = fdpair[0];
close(fdpair[1]);
- result->sock_event = event_add_fd(smbd_event_context(), result,
+ result->sock_event = event_add_fd(server_event_context(), result,
result->sockfd, EVENT_FD_READ,
handle_aio_completion,
result);
if (child == NULL) {
DEBUG(10, ("no idle child found, creating new one\n"));
- status = create_aio_child(children, 128*1024, &child);
+ status = create_aio_child(handle->conn->sconn, children,
+ 128*1024, &child);
if (!NT_STATUS_IS_OK(status)) {
DEBUG(10, ("create_aio_child failed: %s\n",
nt_errstr(status)));
return child->retval.ret_errno;
}
-/* VFS operations structure */
-
-static vfs_op_tuple aio_fork_ops[] = {
- {SMB_VFS_OP(aio_fork_read), SMB_VFS_OP_AIO_READ,
- SMB_VFS_LAYER_TRANSPARENT},
- {SMB_VFS_OP(aio_fork_write), SMB_VFS_OP_AIO_WRITE,
- SMB_VFS_LAYER_TRANSPARENT},
- {SMB_VFS_OP(aio_fork_return_fn), SMB_VFS_OP_AIO_RETURN,
- SMB_VFS_LAYER_TRANSPARENT},
- {SMB_VFS_OP(aio_fork_cancel), SMB_VFS_OP_AIO_CANCEL,
- SMB_VFS_LAYER_TRANSPARENT},
- {SMB_VFS_OP(aio_fork_error_fn), SMB_VFS_OP_AIO_ERROR,
- SMB_VFS_LAYER_TRANSPARENT},
- {SMB_VFS_OP(NULL), SMB_VFS_OP_NOOP,
- SMB_VFS_LAYER_NOOP}
+/*
+ * Timer handler armed by aio_fork_suspend(): records that the timeout
+ * has expired by setting the bool that private_data points to.
+ * event_ctx and now are unused.
+ */
+static void aio_fork_suspend_timed_out(struct tevent_context *event_ctx,
+					struct tevent_timer *te,
+					struct timeval now,
+					void *private_data)
+{
+	bool *expired = (bool *)private_data;
+
+	*expired = true;
+	/* Release the timer event; it is not needed once it has fired. */
+	TALLOC_FREE(te);
+}
+
+/*
+ * aio_suspend hook: block until the given requests on fsp have completed
+ * or the optional timeout expires.
+ *
+ * handle      - VFS handle, used to look up this module's list of children
+ * fsp         - file the requests belong to
+ * aiocb_array - requests to wait for; NULL entries are skipped
+ * n           - number of entries in aiocb_array
+ * timeout     - maximum time to wait, or NULL to wait without limit
+ *
+ * Returns 0 once every matching request has completed.  Returns -1 on
+ * failure with errno set to EINVAL (no child list), ENOMEM (event context,
+ * timer or fd event could not be created) or EAGAIN (timed out); a failing
+ * tevent_loop_once() also yields -1.
+ *
+ * NOTE(review): the fd event of every child waited for is re-created on a
+ * temporary event context that is freed together with the stack frame on
+ * return, and it is not put back on the main context.  Confirm that such a
+ * child is not expected to service further requests afterwards.
+ */
+static int aio_fork_suspend(struct vfs_handle_struct *handle,
+			struct files_struct *fsp,
+			const SMB_STRUCT_AIOCB * const aiocb_array[],
+			int n,
+			const struct timespec *timeout)
+{
+	struct aio_child_list *children = NULL;
+	TALLOC_CTX *frame = talloc_stackframe();
+	struct event_context *ev = NULL;
+	int i;
+	int ret = -1;
+	bool timed_out = false;
+
+	children = init_aio_children(handle);
+	if (children == NULL) {
+		errno = EINVAL;
+		goto out;
+	}
+
+	/* This is a blocking call, and has to use a sub-event loop. */
+	ev = event_context_init(frame);
+	if (ev == NULL) {
+		errno = ENOMEM;
+		goto out;
+	}
+
+	if (timeout) {
+		/* Arm a timer on the sub-event loop that flips timed_out. */
+		struct timeval tv = convert_timespec_to_timeval(*timeout);
+		struct tevent_timer *te = tevent_add_timer(ev,
+						frame,
+						timeval_current_ofs(tv.tv_sec,
+								    tv.tv_usec),
+						aio_fork_suspend_timed_out,
+						&timed_out);
+		if (!te) {
+			errno = ENOMEM;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < n; i++) {
+		struct aio_child *child = NULL;
+		const SMB_STRUCT_AIOCB *aiocb = aiocb_array[i];
+
+		if (!aiocb) {
+			continue;
+		}
+
+		/*
+		 * We're going to cheat here. We know that smbd/aio.c
+		 * only calls this when it's waiting for every single
+		 * outstanding call to finish on a close, so just wait
+		 * individually for each IO to complete. We don't care
+		 * what order they finish - only that they all do. JRA.
+		 */
+
+		for (child = children->children; child != NULL; child = child->next) {
+			/* Only the child working on this very request matters. */
+			if (child->aiocb == NULL) {
+				continue;
+			}
+			if (child->aiocb->aio_fildes != fsp->fh->fd) {
+				continue;
+			}
+			if (child->aiocb != aiocb) {
+				continue;
+			}
+
+			if (child->aiocb->aio_sigevent.sigev_value.sival_ptr == NULL) {
+				continue;
+			}
+
+			/* We're never using this event on the
+			 * main event context again... */
+			TALLOC_FREE(child->sock_event);
+
+			child->sock_event = event_add_fd(ev,
+							 child,
+							 child->sockfd,
+							 EVENT_FD_READ,
+							 handle_aio_completion,
+							 child);
+			if (child->sock_event == NULL) {
+				/*
+				 * Without the fd event the completion
+				 * would never be seen by the loop below.
+				 */
+				errno = ENOMEM;
+				goto out;
+			}
+
+			while (1) {
+				if (tevent_loop_once(ev) == -1) {
+					goto out;
+				}
+
+				if (timed_out) {
+					errno = EAGAIN;
+					goto out;
+				}
+
+				/* We set child->aiocb to NULL in our hooked
+				 * AIO_RETURN(). */
+				if (child->aiocb == NULL) {
+					break;
+				}
+			}
+		}
+	}
+
+	ret = 0;
+
+  out:
+
+	/* Frees the sub-event context and the timer along with the frame. */
+	TALLOC_FREE(frame);
+	return ret;
+}
+
+static struct vfs_fn_pointers vfs_aio_fork_fns = { /*
+ * VFS operations implemented by this module.  Only the AIO entry
+ * points are set here; the table is handed to smb_register_vfs() by
+ * vfs_aio_fork_init().
+ */
+ .aio_read = aio_fork_read,
+ .aio_write = aio_fork_write,
+ .aio_return_fn = aio_fork_return_fn,
+ .aio_cancel = aio_fork_cancel,
+ .aio_error_fn = aio_fork_error_fn,
+ .aio_suspend = aio_fork_suspend, /* blocking wait; runs its own sub-event loop */
};
NTSTATUS vfs_aio_fork_init(void);
NTSTATUS vfs_aio_fork_init(void)
{
return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
- "aio_fork", aio_fork_ops);
+ "aio_fork", &vfs_aio_fork_fns); /*
+ * Registers the vfs_aio_fork_fns operations table under the name
+ * "aio_fork" and passes the status of smb_register_vfs() back to
+ * the caller.
+ */
}