s3-prefork: Listening fds must be in non-blocking mode
[idra/samba.git] / source3 / lib / events.c
index 90d86c6c79feac40d34e38ee94259d0ef7b52360..77589f8e7e2060d83d1ffca952e7c1c599ac9c65 100644 (file)
@@ -2,7 +2,7 @@
    Unix SMB/CIFS implementation.
    Timed event library.
    Copyright (C) Andrew Tridgell 1992-1998
-   Copyright (C) Volker Lendecke 2005
+   Copyright (C) Volker Lendecke 2005-2007
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
 */
 
 #include "includes.h"
-#include <tevent_internal.h>
+#include "lib/tevent/tevent_internal.h"
+#include "../lib/util/select.h"
+#include "system/select.h"
+
+struct tevent_poll_private {
+       /*
+        * Index from file descriptor into the pollfd array: entry [fd]
+        * holds the position of that fd's pollfd, or -1 while the fd has
+        * not been given one. event_add_to_poll_args() resets and refills
+        * it on every call; run_events_poll() reads it back.
+        */
+       int *pollfd_idx;
+
+       /*
+        * Cache for s3_event_loop_once to avoid reallocs: the pollfd
+        * array is kept here between calls and handed to
+        * event_add_to_poll_args() to be refilled.
+        */
+       struct pollfd *pfds;
+};
 
-void event_fd_set_writeable(struct tevent_fd *fde)
+/*
+ * Return the poll bookkeeping attached to ev, creating it on first use.
+ *
+ * The state is kept in ev->additional_data and is allocated with
+ * talloc_zero() on ev. Returns NULL (after a level 10 debug message)
+ * if that allocation fails.
+ */
+static struct tevent_poll_private *tevent_get_poll_private(
+       struct tevent_context *ev)
 {
-       TEVENT_FD_WRITEABLE(fde);
+       struct tevent_poll_private *priv = ev->additional_data;
+
+       if (priv != NULL) {
+               /* Already set up by an earlier call. */
+               return priv;
+       }
+
+       priv = talloc_zero(ev, struct tevent_poll_private);
+       ev->additional_data = priv;
+       if (priv == NULL) {
+               DEBUG(10, ("talloc failed\n"));
+       }
+       return priv;
 }
 
-void event_fd_set_not_writeable(struct tevent_fd *fde)
+/*
+ * Count the fd events of ev that wait for read and/or write, and find
+ * the highest file descriptor among them.
+ *
+ * *pnum_fds receives the count, *pmax_fd the highest fd (0 if no fd
+ * event is waiting for anything).
+ */
+static void count_fds(struct tevent_context *ev,
+                     int *pnum_fds, int *pmax_fd)
 {
-       TEVENT_FD_NOT_WRITEABLE(fde);
+       struct tevent_fd *cur;
+       int count = 0;
+       int highest = 0;
+
+       for (cur = ev->fd_events; cur != NULL; cur = cur->next) {
+               if ((cur->flags & (EVENT_FD_READ|EVENT_FD_WRITE)) == 0) {
+                       /* Not waiting for anything, will not be polled. */
+                       continue;
+               }
+               count++;
+               if (highest < cur->fd) {
+                       highest = cur->fd;
+               }
+       }
+
+       *pnum_fds = count;
+       *pmax_fd = highest;
 }
 
-void event_fd_set_readable(struct tevent_fd *fde)
+/*
+ * Append the fd events of ev to a pollfd array and tighten a poll timeout.
+ *
+ * ev        event context whose fd, timer and immediate events are used
+ * mem_ctx   talloc context used when the pollfd array has to grow
+ * pfds      in/out: the pollfd array; may be replaced by a larger one
+ * pnum_pfds in/out: number of entries already used in *pfds; new entries
+ *           are appended after them, one per distinct file descriptor
+ * ptimeout  in/out: timeout in milliseconds; set to 0 if immediate
+ *           events are pending, otherwise lowered to the time until the
+ *           next timer if that is sooner
+ *
+ * Returns false if an allocation fails, true otherwise.
+ */
+bool event_add_to_poll_args(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                           struct pollfd **pfds, int *pnum_pfds,
+                           int *ptimeout)
 {
-       TEVENT_FD_READABLE(fde);
-}
+       struct tevent_poll_private *state;
+       struct tevent_fd *fde;
+       int i, num_fds, max_fd, num_pollfds, idx_len;
+       struct pollfd *fds;
+       struct timeval now, diff;
+       int timeout;
 
-void event_fd_set_not_readable(struct tevent_fd *fde)
-{
-       TEVENT_FD_NOT_READABLE(fde);
-}
+       state = tevent_get_poll_private(ev);
+       if (state == NULL) {
+               return false;
+       }
+       count_fds(ev, &num_fds, &max_fd);
 
-/*
- * Return if there's something in the queue
- */
+       /* pollfd_idx needs one slot for every fd value from 0 to max_fd. */
+       idx_len = max_fd+1;
 
-bool event_add_to_select_args(struct tevent_context *ev,
-                             const struct timeval *now,
-                             fd_set *read_fds, fd_set *write_fds,
-                             struct timeval *timeout, int *maxfd)
-{
-       struct tevent_fd *fde;
-       struct timeval diff;
-       bool ret = false;
+       if (talloc_array_length(state->pollfd_idx) < idx_len) {
+               state->pollfd_idx = talloc_realloc(
+                       state, state->pollfd_idx, int, idx_len);
+               if (state->pollfd_idx == NULL) {
+                       DEBUG(10, ("talloc_realloc failed\n"));
+                       return false;
+               }
+       }
+
+       fds = *pfds;
+       num_pollfds = *pnum_pfds;
+
+       /*
+        * The +1 is for the sys_poll calling convention. It expects
+        * an array 1 longer for the signal pipe
+        */
+
+       if (talloc_array_length(fds) < num_pollfds + num_fds + 1) {
+               fds = talloc_realloc(mem_ctx, fds, struct pollfd,
+                                          num_pollfds + num_fds + 1);
+               if (fds == NULL) {
+                       DEBUG(10, ("talloc_realloc failed\n"));
+                       return false;
+               }
+       }
+
+       /*
+        * Clear the slots that may be filled below, so that the event
+        * bits can simply be OR-ed in.
+        */
+       memset(&fds[num_pollfds], 0, sizeof(struct pollfd) * num_fds);
+
+       /*
+        * This needs tuning. We need to cope with multiple fde's for a file
+        * descriptor. The problem is that we need to re-use pollfd_idx across
+        * calls for efficiency. One way would be a direct bitmask that might
+        * be initialized quicker, but our bitmap_init implementation is
+        * pretty heavy-weight as well.
+        */
+       for (i=0; i<idx_len; i++) {
+               state->pollfd_idx[i] = -1;
+       }
 
        for (fde = ev->fd_events; fde; fde = fde->next) {
-               if (fde->flags & EVENT_FD_READ) {
-                       FD_SET(fde->fd, read_fds);
-                       ret = true;
+               struct pollfd *pfd;
+
+               if ((fde->flags & (EVENT_FD_READ|EVENT_FD_WRITE)) == 0) {
+                       continue;
                }
-               if (fde->flags & EVENT_FD_WRITE) {
-                       FD_SET(fde->fd, write_fds);
-                       ret = true;
+
+               /*
+                * NOTE(review): fde->fd is used as an array index without
+                * a range check; this assumes it is never negative - confirm.
+                */
+               if (state->pollfd_idx[fde->fd] == -1) {
+                       /*
+                        * We haven't seen this fd yet. Allocate a new pollfd.
+                        */
+                       state->pollfd_idx[fde->fd] = num_pollfds;
+                       pfd = &fds[num_pollfds];
+                       num_pollfds += 1;
+               } else {
+                       /*
+                        * We have already seen this fd. OR in the flags.
+                        */
+                       pfd = &fds[state->pollfd_idx[fde->fd]];
                }
 
-               if ((fde->flags & (EVENT_FD_READ|EVENT_FD_WRITE))
-                   && (fde->fd > *maxfd)) {
-                       *maxfd = fde->fd;
+               pfd->fd = fde->fd;
+
+               if (fde->flags & EVENT_FD_READ) {
+                       pfd->events |= (POLLIN|POLLHUP);
+               }
+               if (fde->flags & EVENT_FD_WRITE) {
+                       pfd->events |= POLLOUT;
                }
        }
+       *pfds = fds;
+       *pnum_pfds = num_pollfds;
 
+       /* Pending immediate events mean poll must not block at all. */
+       if (ev->immediate_events != NULL) {
+               *ptimeout = 0;
+               return true;
+       }
        if (ev->timer_events == NULL) {
-               return ret;
+               /*
+                * NOTE(review): *ptimeout is an int, so if MIN is the
+                * usual minimum macro this assignment cannot change it.
+                */
+               *ptimeout = MIN(*ptimeout, INT_MAX);
+               return true;
        }
 
-       diff = timeval_until(now, &ev->timer_events->next_event);
-       *timeout = timeval_min(timeout, &diff);
+       now = timeval_current();
+       diff = timeval_until(&now, &ev->timer_events->next_event);
+       timeout = timeval_to_msec(diff);
+
+       if (timeout < *ptimeout) {
+               *ptimeout = timeout;
+       }
 
        return true;
 }
 
-bool run_events(struct tevent_context *ev,
-               int selrtn, fd_set *read_fds, fd_set *write_fds)
+/*
+ * Run one pending event handler for ev.
+ *
+ * A timer that is due is run (and freed) and true is returned. Otherwise,
+ * if pollrtn > 0, the fd events are matched against the poll results in
+ * pfds[0..num_pfds) and the first fd event that is ready for something
+ * it asked for is dispatched. pfds must have been filled in by
+ * event_add_to_poll_args() for the same ev, because the lookup goes
+ * through the pollfd_idx table that call built.
+ *
+ * Returns true as soon as a handler has been called. Returns false if
+ * pollrtn <= 0 and no timer was due, if no fd event matched, or on an
+ * internal inconsistency between pfds and the index table.
+ */
+bool run_events_poll(struct tevent_context *ev, int pollrtn,
+                    struct pollfd *pfds, int num_pfds)
 {
+       struct tevent_poll_private *state;
+       int *pollfd_idx;
        struct tevent_fd *fde;
        struct timeval now;
 
@@ -100,29 +203,86 @@ bool run_events(struct tevent_context *ev,
 
        if ((ev->timer_events != NULL)
            && (timeval_compare(&now, &ev->timer_events->next_event) >= 0)) {
+               /* this older events system did not auto-free timed
+                  events on running them, and had a race condition
+                  where the event could be called twice if the
+                  talloc_free of the te happened after the callback
+                  made a call which invoked the event loop. To avoid
+                  this while still allowing old code which frees the
+                  te, we need to create a temporary context which
+                  will be used to ensure the te is freed. We also
+                  remove the te from the timed event list before we
+                  call the handler, to ensure we can't loop */
+
+               struct tevent_timer *te = ev->timer_events;
+               TALLOC_CTX *tmp_ctx = talloc_new(ev);
 
                DEBUG(10, ("Running timed event \"%s\" %p\n",
                           ev->timer_events->handler_name, ev->timer_events));
 
-               ev->timer_events->handler(ev, ev->timer_events, now,
-                                         ev->timer_events->private_data);
+               DLIST_REMOVE(ev->timer_events, te);
+               talloc_steal(tmp_ctx, te);
+
+               te->handler(ev, te, now, te->private_data);
+
+               talloc_free(tmp_ctx);
                return true;
        }
 
-       if (selrtn == 0) {
+       if (pollrtn <= 0) {
                /*
                 * No fd ready
                 */
                return false;
        }
 
+       /*
+        * NOTE(review): assumes event_add_to_poll_args() has already
+        * created the private state for ev; it is not checked for NULL.
+        */
+       state = (struct tevent_poll_private *)ev->additional_data;
+       pollfd_idx = state->pollfd_idx;
+
        for (fde = ev->fd_events; fde; fde = fde->next) {
+               struct pollfd *pfd;
                uint16 flags = 0;
 
-               if (FD_ISSET(fde->fd, read_fds)) flags |= EVENT_FD_READ;
-               if (FD_ISSET(fde->fd, write_fds)) flags |= EVENT_FD_WRITE;
+               if ((fde->flags & (EVENT_FD_READ|EVENT_FD_WRITE)) == 0) {
+                       continue;
+               }
+
+               if (pollfd_idx[fde->fd] >= num_pfds) {
+                       DEBUG(1, ("internal error: pollfd_idx[fde->fd] (%d) "
+                                 ">= num_pfds (%d)\n", pollfd_idx[fde->fd],
+                                 num_pfds));
+                       return false;
+               }
+               pfd = &pfds[pollfd_idx[fde->fd]];
+
+               if (pfd->fd != fde->fd) {
+                       /* Print the two values that are actually compared. */
+                       DEBUG(1, ("internal error: pfd->fd (%d) "
+                                 "!= fde->fd (%d)\n", pfd->fd,
+                                  fde->fd));
+                       return false;
+               }
+
+               if (pfd->revents & (POLLHUP|POLLERR)) {
+                       /* If we only wait for EVENT_FD_WRITE, we
+                          should not tell the event handler about it,
+                          and remove the writable flag, as we only
+                          report errors when waiting for read events
+                          to match the select behavior. */
+                       if (!(fde->flags & EVENT_FD_READ)) {
+                               EVENT_FD_NOT_WRITEABLE(fde);
+                               continue;
+                       }
+                       flags |= EVENT_FD_READ;
+               }
 
+               if (pfd->revents & POLLIN) {
+                       flags |= EVENT_FD_READ;
+               }
+               if (pfd->revents & POLLOUT) {
+                       flags |= EVENT_FD_WRITE;
+               }
                if (flags & fde->flags) {
+                       DLIST_DEMOTE(ev->fd_events, fde, struct tevent_fd);
                        fde->handler(ev, fde, flags, fde->private_data);
                        return true;
                }
@@ -131,15 +291,18 @@ bool run_events(struct tevent_context *ev,
        return false;
 }
 
-
 struct timeval *get_timed_events_timeout(struct tevent_context *ev,
                                         struct timeval *to_ret)
 {
        struct timeval now;
 
-       if (ev->timer_events == NULL) {
+       if ((ev->timer_events == NULL) && (ev->immediate_events == NULL)) {
                return NULL;
        }
+       if (ev->immediate_events != NULL) {
+               *to_ret = timeval_zero();
+               return to_ret;
+       }
 
        now = timeval_current();
        *to_ret = timeval_until(&now, &ev->timer_events->next_event);
@@ -152,46 +315,41 @@ struct timeval *get_timed_events_timeout(struct tevent_context *ev,
 
+/*
+ * One iteration of the event loop: run an event that is already due,
+ * or else poll the registered fds once and run at most one event.
+ *
+ * Returns 0 on success (including an interrupted poll) and -1 on
+ * failure; errno is set to ENOMEM if the private poll state cannot be
+ * allocated. The location argument is not used in this function.
+ */
 static int s3_event_loop_once(struct tevent_context *ev, const char *location)
 {
-       struct timeval now, to;
-       fd_set r_fds, w_fds;
-       int maxfd = 0;
+       struct tevent_poll_private *state;
+       int timeout;
+       int num_pfds;
        int ret;
 
-       FD_ZERO(&r_fds);
-       FD_ZERO(&w_fds);
+       timeout = INT_MAX;
 
-       to.tv_sec = 9999;       /* Max timeout */
-       to.tv_usec = 0;
+       state = tevent_get_poll_private(ev);
+       if (state == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
 
-       if (run_events(ev, 0, NULL, NULL)) {
+       /*
+        * With pollrtn == 0 the fd scan is skipped, so passing no pollfd
+        * array is fine; this only runs events that need no polling.
+        */
+       if (run_events_poll(ev, 0, NULL, 0)) {
                return 0;
        }
 
-       GetTimeOfDay(&now);
-
-       if (!event_add_to_select_args(ev, &now, &r_fds, &w_fds, &to, &maxfd)) {
+       /* Refill the cached pollfd array from its start. */
+       num_pfds = 0;
+       if (!event_add_to_poll_args(ev, state,
+                                   &state->pfds, &num_pfds, &timeout)) {
                return -1;
        }
 
-       ret = sys_select(maxfd+1, &r_fds, &w_fds, NULL, &to);
-
+       ret = sys_poll(state->pfds, num_pfds, timeout);
+       /*
+        * EINTR is not treated as an error: ret stays -1 and
+        * run_events_poll() below then skips the fd scan.
+        */
        if (ret == -1 && errno != EINTR) {
                tevent_debug(ev, TEVENT_DEBUG_FATAL,
-                            "sys_select() failed: %d:%s\n",
+                            "poll() failed: %d:%s\n",
                             errno, strerror(errno));
                return -1;
        }
 
-       run_events(ev, ret, &r_fds, &w_fds);
+       run_events_poll(ev, ret, state->pfds, num_pfds);
        return 0;
 }
 
-void event_context_reinit(struct tevent_context *ev)
-{
-       tevent_common_context_destructor(ev);
-       return;
-}
-
 static int s3_event_context_init(struct tevent_context *ev)
 {
        return 0;
@@ -277,7 +435,7 @@ static void s3_event_debug(void *context, enum tevent_debug_level level,
                samba_level = 2;
                break;
        case TEVENT_DEBUG_TRACE:
-               samba_level = 10;
+               samba_level = 11;
                break;
 
        };
@@ -302,3 +460,83 @@ struct tevent_context *s3_tevent_context_init(TALLOC_CTX *mem_ctx)
        return ev;
 }
 
+/*
+ * A repeating callback built on a timed event. Created by
+ * event_add_idle() and driven by smbd_idle_event_handler().
+ */
+struct idle_event {
+       /* The timed event currently scheduled for this idle event. */
+       struct timed_event *te;
+       /* Added to the current time to schedule each run. */
+       struct timeval interval;
+       /* "idle_evt(<name>)", allocated on this struct; used in debug logs. */
+       char *name;
+       /* User callback; returning false stops and frees the idle event. */
+       bool (*handler)(const struct timeval *now, void *private_data);
+       /* Passed through unchanged to handler. */
+       void *private_data;
+};
+
+/*
+ * Timed event callback behind event_add_idle().
+ *
+ * Releases the timer that just fired, calls the user's handler and,
+ * if that returns true, schedules itself again one interval after now.
+ * If the handler returns false the idle event is freed.
+ *
+ * ctx           event context the next timer is added to
+ * te            the timer that fired (not used directly; event->te is)
+ * now           time passed in by the event loop
+ * private_data  the struct idle_event, type-checked by talloc
+ */
+static void smbd_idle_event_handler(struct event_context *ctx,
+                                   struct timed_event *te,
+                                   struct timeval now,
+                                   void *private_data)
+{
+       struct idle_event *event =
+               talloc_get_type_abort(private_data, struct idle_event);
+
+       /*
+        * Log before releasing the timer: TALLOC_FREE() resets event->te
+        * to NULL, so logging afterwards would always print a NULL pointer.
+        */
+       DEBUG(10,("smbd_idle_event_handler: %s %p called\n",
+                 event->name, event->te));
+
+       TALLOC_FREE(event->te);
+
+       if (!event->handler(&now, event->private_data)) {
+               /* No timer exists at this point, so event->te is NULL. */
+               DEBUG(10,("smbd_idle_event_handler: %s %p stopped\n",
+                         event->name, event->te));
+               /* Don't repeat, delete ourselves */
+               TALLOC_FREE(event);
+               return;
+       }
+
+       event->te = event_add_timed(ctx, event,
+                                   timeval_sum(&now, &event->interval),
+                                   smbd_idle_event_handler, event);
+
+       /* We can't do much but fail here. */
+       SMB_ASSERT(event->te != NULL);
+
+       /* Logged after re-arming so that %p identifies the new timer. */
+       DEBUG(10,("smbd_idle_event_handler: %s %p rescheduled\n",
+                 event->name, event->te));
+}
+
+/*
+ * Register a callback that is run every "interval" until it returns false.
+ *
+ * event_ctx     event context the timer is added to
+ * mem_ctx       talloc context the returned idle_event is allocated on
+ * interval      delay before the first run and between later runs
+ * name          label used in debug output, stored as "idle_evt(<name>)"
+ * handler       callback; return false to stop and free the idle event
+ * private_data  passed through to handler
+ *
+ * Returns the new idle event, or NULL if an allocation or the timer
+ * registration fails.
+ */
+struct idle_event *event_add_idle(struct event_context *event_ctx,
+                                 TALLOC_CTX *mem_ctx,
+                                 struct timeval interval,
+                                 const char *name,
+                                 bool (*handler)(const struct timeval *now,
+                                                 void *private_data),
+                                 void *private_data)
+{
+       struct timeval now = timeval_current();
+       struct idle_event *idle = talloc(mem_ctx, struct idle_event);
+       struct timeval first_run;
+
+       if (idle == NULL) {
+               DEBUG(0, ("talloc failed\n"));
+               return NULL;
+       }
+
+       idle->interval = interval;
+       idle->handler = handler;
+       idle->private_data = private_data;
+
+       idle->name = talloc_asprintf(idle, "idle_evt(%s)", name);
+       if (idle->name == NULL) {
+               DEBUG(0, ("talloc failed\n"));
+               goto fail;
+       }
+
+       first_run = timeval_sum(&now, &interval);
+       idle->te = event_add_timed(event_ctx, idle, first_run,
+                                  smbd_idle_event_handler, idle);
+       if (idle->te == NULL) {
+               DEBUG(0, ("event_add_timed failed\n"));
+               goto fail;
+       }
+
+       DEBUG(10,("event_add_idle: %s %p\n", idle->name, idle->te));
+       return idle;
+
+fail:
+       /* Frees the name as well, since it is allocated on idle. */
+       TALLOC_FREE(idle);
+       return NULL;
+}
+