2 * Simulate the Posix AIO using mmap/fork
4 * Copyright (C) Volker Lendecke 2008
5 * Copyright (C) Jeremy Allison 2010
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 #include "system/filesys.h"
24 #include "system/shmem.h"
25 #include "smbd/smbd.h"
26 #include "smbd/globals.h"
27 #include "lib/async_req/async_sock.h"
28 #include "lib/util/tevent_unix.h"
29 #include "lib/util/sys_rw.h"
30 #include "lib/util/sys_rw_data.h"
31 #include "lib/util/msghdr.h"
32 #include "smbprofile.h"
34 #if !defined(HAVE_STRUCT_MSGHDR_MSG_CONTROL) && !defined(HAVE_STRUCT_MSGHDR_MSG_ACCRIGHTS)
35 # error Can not pass file descriptors
/*
 * Module configuration attached to the VFS handle by aio_fork_connect().
 * The only member visible here is erratic_testing_mode, which is read with
 * lp_parm_bool() (type "vfs_aio_fork", option "erratic_testing_mode",
 * default false) and copied into every command sent to a helper child.
 */
44 struct aio_fork_config {
45 bool erratic_testing_mode;
/*
 * talloc destructor for a struct mmap_area (installed by mmap_area_init()):
 * unmaps the region described by area->ptr and area->size.
 * NOTE(review): the return statement is not visible in this view.
 */
53 static int mmap_area_destructor(struct mmap_area *area)
55 munmap(discard_const(area->ptr), area->size);
/*
 * Allocate a struct mmap_area on mem_ctx and back it with a shared,
 * read/write mapping of /dev/zero that is "size" bytes long.
 * mmap_area_destructor() is registered on the result so the mapping is
 * released together with the talloc object. talloc, open() and mmap()
 * failures are each logged through DEBUG.
 * NOTE(review): the return statements and the error cleanup are not visible
 * in this view; create_aio_child() compares the result against NULL, so a
 * failure presumably returns NULL - confirm.
 */
59 static struct mmap_area *mmap_area_init(TALLOC_CTX *mem_ctx, size_t size)
61 struct mmap_area *result;
64 result = talloc(mem_ctx, struct mmap_area);
66 DEBUG(0, ("talloc failed\n"));
70 fd = open("/dev/zero", O_RDWR);
72 DEBUG(3, ("open(\"/dev/zero\") failed: %s\n",
77 result->ptr = mmap(NULL, size, PROT_READ|PROT_WRITE,
78 MAP_SHARED|MAP_FILE, fd, 0);
80 if (result->ptr == MAP_FAILED) {
81 DEBUG(1, ("mmap failed: %s\n", strerror(errno)));
86 talloc_set_destructor(result, mmap_area_destructor);
/*
 * Translate an enum cmd_type value into a printable string; used by the
 * debug message in aio_child_loop(). The only mapping visible in this view
 * is the "<UNKNOWN>" result.
 * NOTE(review): the cases for the known command types are not visible here.
 */
101 static const char *cmd_type_str(enum cmd_type cmd)
116 result = "<UNKNOWN>";
/*
 * NOTE(review): the enclosing struct header is not visible in this view;
 * presumably this is a member of struct rw_cmd, since aio_child_loop()
 * reads cmd_struct.erratic_testing_mode from a struct rw_cmd - confirm.
 */
126 bool erratic_testing_mode;
135 struct aio_child_list;
/*
 * NOTE(review): the struct header is not visible in this view; these look
 * like members of struct aio_child. prev/next are the list links used by
 * DLIST_ADD/DLIST_REMOVE, list points back at the owning aio_child_list,
 * and map is the mmap area created for this child in create_aio_child().
 */
138 struct aio_child *prev, *next;
139 struct aio_child_list *list;
142 struct mmap_area *map;
143 bool dont_delete; /* Marked as in use since last cleanup */
/*
 * List head for the forked helper children, plus the timer event that
 * runs aio_child_cleanup(). The timer is armed in init_aio_children() and
 * re-armed by the cleanup handler while children remain.
 */
147 struct aio_child_list {
148 struct aio_child *children;
149 struct tevent_timer *cleanup_event;
/*
 * Callback passed to SMB_VFS_HANDLE_SET_DATA() by init_aio_children() for
 * the aio_child_list stored on the VFS handle.
 * NOTE(review): the body is not visible in this view.
 */
152 static void free_aio_children(void **p)
/*
 * Receive up to nbytes into ptr from socket fd using recvmsg(), retrying
 * while the call fails with EINTR. The message header is prepared with
 * msghdr_prep_recv_fds() for one descriptor, and any descriptors that
 * arrived are pulled out with msghdr_extract_fds().
 * NOTE(review): how *recvfd is assigned, what the loop over num_fds does
 * with the extracted descriptors, and the return value are not visible in
 * this view.
 */
157 static ssize_t read_fd(int fd, void *ptr, size_t nbytes, int *recvfd)
160 struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1 };
162 size_t bufsize = msghdr_prep_recv_fds(NULL, NULL, 0, 1);
163 uint8_t buf[bufsize];
165 msghdr_prep_recv_fds(&msg, buf, bufsize, 1);
167 iov[0].iov_base = (void *)ptr;
168 iov[0].iov_len = nbytes;
171 n = recvmsg(fd, &msg, 0);
172 } while ((n == -1) && (errno == EINTR));
179 size_t num_fds = msghdr_extract_fds(&msg, NULL, 0);
182 msghdr_extract_fds(&msg, fds, num_fds);
187 for (i=0; i<num_fds; i++) {
/*
 * Send nbytes from ptr over socket fd using sendmsg(), with sendfd attached
 * to the message through msghdr_prep_fds(). The call is retried while it
 * fails with EINTR.
 * NOTE(review): the wiring of iov into msg and the return statement are not
 * visible in this view.
 */
201 static ssize_t write_fd(int fd, void *ptr, size_t nbytes, int sendfd)
203 struct msghdr msg = {0};
204 size_t bufsize = msghdr_prep_fds(NULL, NULL, 0, &sendfd, 1);
205 uint8_t buf[bufsize];
209 msghdr_prep_fds(&msg, buf, bufsize, &sendfd, 1);
211 iov.iov_base = (void *)ptr;
212 iov.iov_len = nbytes;
217 sent = sendmsg(fd, &msg, 0);
218 } while ((sent == -1) && (errno == EINTR));
/*
 * Timer handler for the periodic cleanup of helper children.
 * private_data is the struct aio_child_list. The fired timer event is
 * freed first, then every child in the list is visited: a child reported
 * as currently active is left alone, a child with dont_delete set has the
 * flag cleared, and the remaining children are logged as idle for more than
 * 30 seconds and deleted. If any children are left afterwards, the handler
 * re-arms itself 30 seconds after "now".
 * NOTE(review): the test for an active child, the statement that deletes an
 * idle child, and the remaining parameters are not visible in this view.
 */
223 static void aio_child_cleanup(struct tevent_context *event_ctx,
224 struct tevent_timer *te,
228 struct aio_child_list *list = talloc_get_type_abort(
229 private_data, struct aio_child_list);
230 struct aio_child *child, *next;
232 TALLOC_FREE(list->cleanup_event);
234 for (child = list->children; child != NULL; child = next) {
238 DEBUG(10, ("child %d currently active\n",
243 if (child->dont_delete) {
244 DEBUG(10, ("Child %d was active since last cleanup\n",
246 child->dont_delete = false;
250 DEBUG(10, ("Child %d idle for more than 30 seconds, "
251 "deleting\n", (int)child->pid));
257 if (list->children != NULL) {
259 * Re-schedule the next cleanup round
261 list->cleanup_event = tevent_add_timer(server_event_context(), list,
262 timeval_add(&now, 30, 0),
263 aio_child_cleanup, list);
/*
 * Return the aio_child_list that belongs to this VFS handle.
 * If the handle already carries data, it is fetched with
 * SMB_VFS_HANDLE_GET_DATA(); the talloc_zero() call allocates a fresh
 * zeroed list (presumably only when none was found - the branch structure
 * is not visible here). In both cases a cleanup timer firing 30 seconds
 * from now is armed when none is pending. If the handle had no data, the
 * list is stored on it with free_aio_children() as the free callback.
 * NOTE(review): the failure paths and return statements are not visible in
 * this view; get_idle_child() treats a NULL result as failure.
 */
268 static struct aio_child_list *init_aio_children(struct vfs_handle_struct *handle)
270 struct aio_child_list *data = NULL;
272 if (SMB_VFS_HANDLE_TEST_DATA(handle)) {
273 SMB_VFS_HANDLE_GET_DATA(handle, data, struct aio_child_list,
278 data = talloc_zero(NULL, struct aio_child_list);
285 * Regardless of whether the child_list had been around or not, make
286 * sure that we have a cleanup timed event. This timed event will
287 * delete itself when it finds that no children are around anymore.
290 if (data->cleanup_event == NULL) {
291 data->cleanup_event = tevent_add_timer(server_event_context(), data,
292 timeval_current_ofs(30, 0),
293 aio_child_cleanup, data);
294 if (data->cleanup_event == NULL) {
300 if (!SMB_VFS_HANDLE_TEST_DATA(handle)) {
301 SMB_VFS_HANDLE_SET_DATA(handle, data, free_aio_children,
302 struct aio_child_list, return False);
/*
 * Request handling code run in a forked helper child.
 * A struct rw_cmd and a file descriptor are read from sockfd with
 * read_fd(); a short read is logged. When the command has
 * erratic_testing_mode set, a random delay value (randval + 20 msecs) is
 * computed and logged. The command is then carried out on the received fd:
 * sys_pread() or sys_pwrite() using the shared map as the data buffer, or
 * fsync(); any other command yields a size of -1. The elapsed time between
 * the two PROFILE_TIMESTAMP() calls goes into ret_struct.duration, and errno
 * is saved when the size is -1. The struct rw_ret is sent back to the
 * parent with write_data().
 * NOTE(review): the enclosing loop, the exit paths, the sleep itself, the
 * case labels and the close of fd are not visible in this view.
 */
308 static void aio_child_loop(int sockfd, struct mmap_area *map)
313 struct rw_cmd cmd_struct;
314 struct rw_ret ret_struct;
315 struct timespec start, end;
317 ret = read_fd(sockfd, &cmd_struct, sizeof(cmd_struct), &fd);
318 if (ret != sizeof(cmd_struct)) {
319 DEBUG(10, ("read_fd returned %d: %s\n", (int)ret,
324 DEBUG(10, ("aio_child_loop: %s %d bytes at %d from fd %d\n",
325 cmd_type_str(cmd_struct.cmd),
326 (int)cmd_struct.n, (int)cmd_struct.offset, fd));
328 if (cmd_struct.erratic_testing_mode) {
330 * For developer testing, we want erratic behaviour for
336 * use generate_random_buffer, we just forked from a
337 * common parent state
339 generate_random_buffer(&randval, sizeof(randval));
340 msecs = randval + 20;
341 DEBUG(10, ("delaying for %u msecs\n", msecs));
345 ZERO_STRUCT(ret_struct);
347 PROFILE_TIMESTAMP(&start);
349 switch (cmd_struct.cmd) {
351 ret_struct.size = sys_pread(
352 fd, discard_const(map->ptr), cmd_struct.n,
355 /* This breaks "make test" when run with aio_fork module. */
357 ret_struct.size = MAX(1, ret_struct.size * 0.9);
362 ret_struct.size = sys_pwrite(
363 fd, discard_const(map->ptr), cmd_struct.n,
367 ret_struct.size = fsync(fd);
370 ret_struct.size = -1;
374 PROFILE_TIMESTAMP(&end);
375 ret_struct.duration = nsec_time_diff(&end, &start);
376 DEBUG(10, ("aio_child_loop: syscall returned %d\n",
377 (int)ret_struct.size));
379 if (ret_struct.size == -1) {
380 ret_struct.ret_errno = errno;
384 * Close the fd before telling our parent we're done. The
385 * parent might close and re-open the file very quickly, and
386 * with system-level share modes (GPFS) we would get an
387 * unjustified SHARING_VIOLATION.
391 ret = write_data(sockfd, (char *)&ret_struct,
393 if (ret != sizeof(ret_struct)) {
394 DEBUG(10, ("could not write ret_struct: %s\n",
/*
 * talloc destructor for a struct aio_child (installed by
 * create_aio_child()). Asserts that the child is not busy, writes
 * sizeof(c) bytes to the child socket so that the child exits (see the
 * RHEL 5.5 remark below), closes the socket and unlinks the child from its
 * list.
 * NOTE(review): the declaration of c and the return statement are not
 * visible in this view.
 */
401 static int aio_child_destructor(struct aio_child *child)
405 SMB_ASSERT(!child->busy);
407 DEBUG(10, ("aio_child_destructor: removing child %d on fd %d\n",
408 (int)child->pid, child->sockfd));
411 * closing the sockfd makes the child not return from recvmsg() on RHEL
412 * 5.5 so instead force the child to exit by writing bad data to it
414 sys_write_v(child->sockfd, &c, sizeof(c));
415 close(child->sockfd);
416 DLIST_REMOVE(child->list->children, child);
421 * We have to close all fds in open files, otherwise we might incorrectly hold
422 * a system-level share mode on a file.
/*
 * Callback handed to files_forall() in the freshly forked child (see
 * create_aio_child()). It acts only on entries that have a file handle
 * with a valid descriptor (fsp->fh->fd != -1).
 * NOTE(review): the action taken on such entries, the second parameter and
 * the return value are not visible in this view; judging by the name the
 * descriptor is closed - confirm.
 */
425 static struct files_struct *close_fsp_fd(struct files_struct *fsp,
428 if ((fsp->fh != NULL) && (fsp->fh->fd != -1)) {
/*
 * Fork a new helper child and add it to the "children" list.
 * Steps visible here: allocate a zeroed struct aio_child under the list,
 * create an AF_UNIX stream socketpair, create the shared mmap area of
 * map_size bytes, then fork(). In the child (pid == 0) fdpair[1] becomes
 * the socket, the descriptors of all open files are handled through
 * files_forall(sconn, close_fsp_fd, NULL) and aio_child_loop() is entered.
 * In the parent fdpair[0] becomes the socket, the child is linked into the
 * list and aio_child_destructor() is installed. The trailing close() calls
 * release whichever socketpair ends are still set (not -1).
 * NOTE(review): the map_size parameter declaration, the assignment to
 * *presult, the error codes returned and the closing of the unused
 * socketpair end in each process are not visible in this view.
 */
435 static int create_aio_child(struct smbd_server_connection *sconn,
436 struct aio_child_list *children,
438 struct aio_child **presult)
440 struct aio_child *result;
444 fdpair[0] = fdpair[1] = -1;
446 result = talloc_zero(children, struct aio_child);
447 if (result == NULL) {
451 if (socketpair(AF_UNIX, SOCK_STREAM, 0, fdpair) == -1) {
453 DEBUG(10, ("socketpair() failed: %s\n", strerror(errno)));
457 DEBUG(10, ("fdpair = %d/%d\n", fdpair[0], fdpair[1]));
459 result->map = mmap_area_init(result, map_size);
460 if (result->map == NULL) {
462 DEBUG(0, ("Could not create mmap area\n"));
466 result->pid = fork();
467 if (result->pid == -1) {
469 DEBUG(0, ("fork failed: %s\n", strerror(errno)));
473 if (result->pid == 0) {
475 result->sockfd = fdpair[1];
476 files_forall(sconn, close_fsp_fd, NULL);
477 aio_child_loop(result->sockfd, result->map);
480 DEBUG(10, ("Child %d created with sockfd %d\n",
481 (int)result->pid, fdpair[0]));
483 result->sockfd = fdpair[0];
486 result->list = children;
487 DLIST_ADD(children->children, result);
489 talloc_set_destructor(result, aio_child_destructor);
496 if (fdpair[0] != -1) close(fdpair[0]);
497 if (fdpair[1] != -1) close(fdpair[1]);
/*
 * Hand out a helper child for a new request.
 * The child list of the handle is obtained via init_aio_children() and
 * scanned; when no idle child is found a new one is created with
 * create_aio_child(). The chosen child gets dont_delete set so that the
 * next aio_child_cleanup() round keeps it.
 * NOTE(review): the idle test inside the loop, the marking of the child as
 * busy, the assignment to *pchild and the return values are not visible in
 * this view; callers treat the int result as an errno-style code.
 */
503 static int get_idle_child(struct vfs_handle_struct *handle,
504 struct aio_child **pchild)
506 struct aio_child_list *children;
507 struct aio_child *child;
509 children = init_aio_children(handle);
510 if (children == NULL) {
514 for (child = children->children; child != NULL; child = child->next) {
523 DEBUG(10, ("no idle child found, creating new one\n"));
525 ret = create_aio_child(handle->conn->sconn, children,
528 DEBUG(10, ("create_aio_child failed: %s\n",
534 child->dont_delete = true;
/*
 * Per-request state of an asynchronous pread: the helper child working on
 * the request and the vfs_aio_state (error, duration) handed back by
 * aio_fork_pread_recv().
 * NOTE(review): aio_fork_pread_done() also stores into state->ret; that
 * member is not visible in this view.
 */
541 struct aio_fork_pread_state {
542 struct aio_child *child;
544 struct vfs_aio_state vfs_aio_state;
547 static void aio_fork_pread_done(struct tevent_req *subreq);
/*
 * Start an asynchronous pread that is executed by a forked helper child.
 * The module config is fetched from the handle, a tevent request with
 * struct aio_fork_pread_state is created and an idle child is obtained
 * from get_idle_child(). The command, carrying the erratic_testing_mode
 * flag, is sent to the child with write_fd(); the reply of
 * sizeof(struct rw_ret) bytes is awaited through read_packet_send() with
 * aio_fork_pread_done() as completion callback.
 * Error handling visible here: EINVAL for a condition tied to the
 * "variable buffers" TODO, the error from get_idle_child(), and freeing of
 * the child when write_fd() fails or the subrequest cannot be allocated.
 * NOTE(review): the size check behind EINVAL, the filling of the other cmd
 * fields, the fd argument of write_fd() and the final return are not
 * visible in this view.
 */
549 static struct tevent_req *aio_fork_pread_send(struct vfs_handle_struct *handle,
551 struct tevent_context *ev,
552 struct files_struct *fsp,
554 size_t n, off_t offset)
556 struct tevent_req *req, *subreq;
557 struct aio_fork_pread_state *state;
561 struct aio_fork_config *config;
563 SMB_VFS_HANDLE_GET_DATA(handle, config,
564 struct aio_fork_config,
567 req = tevent_req_create(mem_ctx, &state, struct aio_fork_pread_state);
573 /* TODO: support variable buffers */
574 tevent_req_error(req, EINVAL);
575 return tevent_req_post(req, ev);
578 err = get_idle_child(handle, &state->child);
580 tevent_req_error(req, err);
581 return tevent_req_post(req, ev);
588 cmd.erratic_testing_mode = config->erratic_testing_mode;
590 DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
591 (int)state->child->pid));
594 * Not making this async. We're writing into an empty unix
595 * domain socket. This should never block.
597 written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
602 TALLOC_FREE(state->child);
604 DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
605 tevent_req_error(req, err);
606 return tevent_req_post(req, ev);
609 subreq = read_packet_send(state, ev, state->child->sockfd,
610 sizeof(struct rw_ret), NULL, NULL);
611 if (tevent_req_nomem(subreq, req)) {
612 TALLOC_FREE(state->child); /* we sent sth down */
613 return tevent_req_post(req, ev);
615 tevent_req_set_callback(subreq, aio_fork_pread_done, req);
/*
 * Completion callback for the reply packet of a pread request.
 * The reply is collected with read_packet_recv(). On the failure path the
 * child is freed and the request fails with err. Otherwise the child is
 * marked as no longer busy and the size, errno and duration fields of the
 * struct rw_ret reply are copied into the request state before the request
 * is completed.
 * NOTE(review): the condition selecting the failure path and the handling
 * of the data read into the shared map are not visible in this view.
 */
619 static void aio_fork_pread_done(struct tevent_req *subreq)
621 struct tevent_req *req = tevent_req_callback_data(
622 subreq, struct tevent_req);
623 struct aio_fork_pread_state *state = tevent_req_data(
624 req, struct aio_fork_pread_state);
628 struct rw_ret *retbuf;
630 nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
633 TALLOC_FREE(state->child);
634 tevent_req_error(req, err);
638 state->child->busy = false;
640 retbuf = (struct rw_ret *)buf;
641 state->ret = retbuf->size;
642 state->vfs_aio_state.error = retbuf->ret_errno;
643 state->vfs_aio_state.duration = retbuf->duration;
644 tevent_req_done(req);
/*
 * Collect the result of an asynchronous pread.
 * If the request ended with a unix error, that error is stored in
 * vfs_aio_state->error by tevent_req_is_unix_error(); otherwise the
 * vfs_aio_state saved by the done callback is copied to the caller.
 * NOTE(review): the return statements are not visible in this view;
 * presumably -1 on error and state->ret on success - confirm.
 */
647 static ssize_t aio_fork_pread_recv(struct tevent_req *req,
648 struct vfs_aio_state *vfs_aio_state)
650 struct aio_fork_pread_state *state = tevent_req_data(
651 req, struct aio_fork_pread_state);
653 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
656 *vfs_aio_state = state->vfs_aio_state;
/*
 * Per-request state of an asynchronous pwrite: the helper child working on
 * the request and the vfs_aio_state (error, duration) handed back by
 * aio_fork_pwrite_recv().
 * NOTE(review): aio_fork_pwrite_done() also stores into state->ret; that
 * member is not visible in this view.
 */
660 struct aio_fork_pwrite_state {
661 struct aio_child *child;
663 struct vfs_aio_state vfs_aio_state;
666 static void aio_fork_pwrite_done(struct tevent_req *subreq);
/*
 * Start an asynchronous pwrite that is executed by a forked helper child.
 * The module config is fetched from the handle, a tevent request with
 * struct aio_fork_pwrite_state is created and an idle child is obtained
 * from get_idle_child(). The command, carrying the erratic_testing_mode
 * flag, is sent to the child with write_fd(); the reply of
 * sizeof(struct rw_ret) bytes is awaited through read_packet_send() with
 * aio_fork_pwrite_done() as completion callback.
 * Error handling visible here: EINVAL for a condition tied to the
 * "variable buffers" TODO, the error from get_idle_child(), and freeing of
 * the child when write_fd() fails or the subrequest cannot be allocated.
 * NOTE(review): the size check behind EINVAL, the copy of "data" into the
 * shared map, the filling of the other cmd fields and the final return are
 * not visible in this view.
 */
668 static struct tevent_req *aio_fork_pwrite_send(
669 struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
670 struct tevent_context *ev, struct files_struct *fsp,
671 const void *data, size_t n, off_t offset)
673 struct tevent_req *req, *subreq;
674 struct aio_fork_pwrite_state *state;
678 struct aio_fork_config *config;
679 SMB_VFS_HANDLE_GET_DATA(handle, config,
680 struct aio_fork_config,
683 req = tevent_req_create(mem_ctx, &state, struct aio_fork_pwrite_state);
689 /* TODO: support variable buffers */
690 tevent_req_error(req, EINVAL);
691 return tevent_req_post(req, ev);
694 err = get_idle_child(handle, &state->child);
696 tevent_req_error(req, err);
697 return tevent_req_post(req, ev);
704 cmd.erratic_testing_mode = config->erratic_testing_mode;
706 DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
707 (int)state->child->pid));
710 * Not making this async. We're writing into an empty unix
711 * domain socket. This should never block.
713 written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
718 TALLOC_FREE(state->child);
720 DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
721 tevent_req_error(req, err);
722 return tevent_req_post(req, ev);
725 subreq = read_packet_send(state, ev, state->child->sockfd,
726 sizeof(struct rw_ret), NULL, NULL);
727 if (tevent_req_nomem(subreq, req)) {
728 TALLOC_FREE(state->child); /* we sent sth down */
729 return tevent_req_post(req, ev);
731 tevent_req_set_callback(subreq, aio_fork_pwrite_done, req);
/*
 * Completion callback for the reply packet of a pwrite request.
 * The reply is collected with read_packet_recv(). On the failure path the
 * child is freed and the request fails with err. Otherwise the child is
 * marked as no longer busy and the size, errno and duration fields of the
 * struct rw_ret reply are copied into the request state before the request
 * is completed.
 * NOTE(review): the condition selecting the failure path is not visible in
 * this view.
 */
735 static void aio_fork_pwrite_done(struct tevent_req *subreq)
737 struct tevent_req *req = tevent_req_callback_data(
738 subreq, struct tevent_req);
739 struct aio_fork_pwrite_state *state = tevent_req_data(
740 req, struct aio_fork_pwrite_state);
744 struct rw_ret *retbuf;
746 nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
749 TALLOC_FREE(state->child);
750 tevent_req_error(req, err);
754 state->child->busy = false;
756 retbuf = (struct rw_ret *)buf;
757 state->ret = retbuf->size;
758 state->vfs_aio_state.error = retbuf->ret_errno;
759 state->vfs_aio_state.duration = retbuf->duration;
760 tevent_req_done(req);
/*
 * Collect the result of an asynchronous pwrite.
 * If the request ended with a unix error, that error is stored in
 * vfs_aio_state->error by tevent_req_is_unix_error(); otherwise the
 * vfs_aio_state saved by the done callback is copied to the caller.
 * NOTE(review): the return statements are not visible in this view;
 * presumably -1 on error and state->ret on success - confirm.
 */
763 static ssize_t aio_fork_pwrite_recv(struct tevent_req *req,
764 struct vfs_aio_state *vfs_aio_state)
766 struct aio_fork_pwrite_state *state = tevent_req_data(
767 req, struct aio_fork_pwrite_state);
769 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
772 *vfs_aio_state = state->vfs_aio_state;
/*
 * Per-request state of an asynchronous fsync: the helper child working on
 * the request and the vfs_aio_state (error, duration) handed back by
 * aio_fork_fsync_recv().
 * NOTE(review): aio_fork_fsync_done() also stores into state->ret; that
 * member is not visible in this view.
 */
776 struct aio_fork_fsync_state {
777 struct aio_child *child;
779 struct vfs_aio_state vfs_aio_state;
782 static void aio_fork_fsync_done(struct tevent_req *subreq);
/*
 * Start an asynchronous fsync that is executed by a forked helper child.
 * The module config is fetched from the handle, a tevent request with
 * struct aio_fork_fsync_state is created and an idle child is obtained
 * from get_idle_child(). The command, carrying the erratic_testing_mode
 * flag, is sent to the child with write_fd(); the reply of
 * sizeof(struct rw_ret) bytes is awaited through read_packet_send() with
 * aio_fork_fsync_done() as completion callback.
 * Error handling visible here: the error from get_idle_child(), and
 * freeing of the child when write_fd() fails or the subrequest cannot be
 * allocated.
 * NOTE(review): the filling of the other cmd fields, the fd argument of
 * write_fd() and the final return are not visible in this view.
 */
784 static struct tevent_req *aio_fork_fsync_send(
785 struct vfs_handle_struct *handle, TALLOC_CTX *mem_ctx,
786 struct tevent_context *ev, struct files_struct *fsp)
788 struct tevent_req *req, *subreq;
789 struct aio_fork_fsync_state *state;
793 struct aio_fork_config *config;
795 SMB_VFS_HANDLE_GET_DATA(handle, config,
796 struct aio_fork_config,
799 req = tevent_req_create(mem_ctx, &state, struct aio_fork_fsync_state);
804 err = get_idle_child(handle, &state->child);
806 tevent_req_error(req, err);
807 return tevent_req_post(req, ev);
812 cmd.erratic_testing_mode = config->erratic_testing_mode;
814 DEBUG(10, ("sending fd %d to child %d\n", fsp->fh->fd,
815 (int)state->child->pid));
818 * Not making this async. We're writing into an empty unix
819 * domain socket. This should never block.
821 written = write_fd(state->child->sockfd, &cmd, sizeof(cmd),
826 TALLOC_FREE(state->child);
828 DEBUG(10, ("write_fd failed: %s\n", strerror(err)));
829 tevent_req_error(req, err);
830 return tevent_req_post(req, ev);
833 subreq = read_packet_send(state, ev, state->child->sockfd,
834 sizeof(struct rw_ret), NULL, NULL);
835 if (tevent_req_nomem(subreq, req)) {
836 TALLOC_FREE(state->child); /* we sent sth down */
837 return tevent_req_post(req, ev);
839 tevent_req_set_callback(subreq, aio_fork_fsync_done, req);
/*
 * Completion callback for the reply packet of an fsync request.
 * The reply is collected with read_packet_recv(). On the failure path the
 * child is freed and the request fails with err. Otherwise the child is
 * marked as no longer busy and the size, errno and duration fields of the
 * struct rw_ret reply are copied into the request state before the request
 * is completed.
 * NOTE(review): the condition selecting the failure path is not visible in
 * this view.
 */
843 static void aio_fork_fsync_done(struct tevent_req *subreq)
845 struct tevent_req *req = tevent_req_callback_data(
846 subreq, struct tevent_req);
847 struct aio_fork_fsync_state *state = tevent_req_data(
848 req, struct aio_fork_fsync_state);
852 struct rw_ret *retbuf;
854 nread = read_packet_recv(subreq, talloc_tos(), &buf, &err);
857 TALLOC_FREE(state->child);
858 tevent_req_error(req, err);
862 state->child->busy = false;
864 retbuf = (struct rw_ret *)buf;
865 state->ret = retbuf->size;
866 state->vfs_aio_state.error = retbuf->ret_errno;
867 state->vfs_aio_state.duration = retbuf->duration;
868 tevent_req_done(req);
/*
 * Collect the result of an asynchronous fsync.
 * If the request ended with a unix error, that error is stored in
 * vfs_aio_state->error by tevent_req_is_unix_error(); otherwise the
 * vfs_aio_state saved by the done callback is copied to the caller.
 * NOTE(review): the return statements are not visible in this view;
 * presumably -1 on error and state->ret on success - confirm.
 */
871 static int aio_fork_fsync_recv(struct tevent_req *req,
872 struct vfs_aio_state *vfs_aio_state)
874 struct aio_fork_fsync_state *state = tevent_req_data(
875 req, struct aio_fork_fsync_state);
877 if (tevent_req_is_unix_error(req, &vfs_aio_state->error)) {
880 *vfs_aio_state = state->vfs_aio_state;
/*
 * connect hook of the module.
 * Calls the next module in the chain first, then allocates a zeroed
 * struct aio_fork_config on handle->conn, reads the boolean option
 * "erratic_testing_mode" of type "vfs_aio_fork" (default false) for this
 * share and stores the config on the handle. When the allocation fails,
 * SMB_VFS_NEXT_DISCONNECT() is called and the failure is logged.
 * NOTE(review): the third parameter, the check of ret after the NEXT
 * connect call and the return values are not visible in this view.
 */
884 static int aio_fork_connect(vfs_handle_struct *handle, const char *service,
888 struct aio_fork_config *config;
889 ret = SMB_VFS_NEXT_CONNECT(handle, service, user);
895 config = talloc_zero(handle->conn, struct aio_fork_config);
897 SMB_VFS_NEXT_DISCONNECT(handle);
898 DEBUG(0, ("talloc_zero() failed\n"));
902 config->erratic_testing_mode = lp_parm_bool(SNUM(handle->conn), "vfs_aio_fork",
903 "erratic_testing_mode", false);
905 SMB_VFS_HANDLE_SET_DATA(handle, config,
906 NULL, struct aio_fork_config,
/*
 * VFS operations provided by this module: the connect hook plus the
 * send/recv pairs for asynchronous pread, pwrite and fsync. This table is
 * what vfs_aio_fork_init() registers.
 */
912 static struct vfs_fn_pointers vfs_aio_fork_fns = {
913 .connect_fn = aio_fork_connect,
914 .pread_send_fn = aio_fork_pread_send,
915 .pread_recv_fn = aio_fork_pread_recv,
916 .pwrite_send_fn = aio_fork_pwrite_send,
917 .pwrite_recv_fn = aio_fork_pwrite_recv,
918 .fsync_send_fn = aio_fork_fsync_send,
919 .fsync_recv_fn = aio_fork_fsync_recv,
/*
 * Module entry point: registers the vfs_aio_fork_fns operations table
 * under the module name "aio_fork" for SMB_VFS_INTERFACE_VERSION and
 * returns the status from smb_register_vfs(). The ctx argument is not used
 * in the lines visible here.
 */
922 NTSTATUS vfs_aio_fork_init(TALLOC_CTX *);
923 NTSTATUS vfs_aio_fork_init(TALLOC_CTX *ctx)
925 return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
926 "aio_fork", &vfs_aio_fork_fns);