/*
   Unix SMB/CIFS implementation.

   main select loop and event handling - epoll implementation

   Copyright (C) Andrew Tridgell	2003-2005
   Copyright (C) Stefan Metzmacher	2005

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "includes.h"
#include "system/time.h"
#include "system/filesys.h"
#include "system/network.h"
#include "lib/util/dlinklist.h"
#include "lib/events/events.h"
#include "lib/events/events_internal.h"
#include <sys/epoll.h>
32 extern pid_t ctdbd_pid;
/* private backend state hung off event_context->additional_data */
struct epoll_event_context {
	/* a pointer back to the generic event_context */
	struct event_context *ev;

	/* list of filedescriptor events */
	struct fd_event *fd_events;

	/* number of registered fd event handlers */
	int num_fd_events;

	/* when using epoll this is the handle from epoll_create */
	int epoll_fd;

	/* pid that created epoll_fd; epoll handles must not be shared
	   across fork(), see epoll_check_reopen() */
	pid_t pid;
};
51 called when a epoll call fails, and we should fallback
54 static void epoll_fallback_to_select(struct epoll_event_context *epoll_ev, const char *reason)
56 DEBUG(0,("%s (%s) - falling back to select()\n", reason, strerror(errno)));
57 close(epoll_ev->epoll_fd);
58 epoll_ev->epoll_fd = -1;
59 talloc_set_destructor(epoll_ev, NULL);
63 map from EVENT_FD_* to EPOLLIN/EPOLLOUT
65 static uint32_t epoll_map_flags(uint16_t flags)
68 if (flags & EVENT_FD_READ) ret |= (EPOLLIN | EPOLLERR | EPOLLHUP);
69 if (flags & EVENT_FD_WRITE) ret |= (EPOLLOUT | EPOLLERR | EPOLLHUP);
76 static int epoll_ctx_destructor(struct epoll_event_context *epoll_ev)
78 close(epoll_ev->epoll_fd);
79 epoll_ev->epoll_fd = -1;
86 static void epoll_init_ctx(struct epoll_event_context *epoll_ev)
90 epoll_ev->epoll_fd = epoll_create(64);
92 /* on exec, don't inherit the fd */
93 v = fcntl(epoll_ev->epoll_fd, F_GETFD, 0);
94 fcntl(epoll_ev->epoll_fd, F_SETFD, v | FD_CLOEXEC);
96 epoll_ev->pid = getpid();
97 talloc_set_destructor(epoll_ev, epoll_ctx_destructor);
100 static void epoll_add_event(struct epoll_event_context *epoll_ev, struct fd_event *fde);
103 reopen the epoll handle when our pid changes
104 see http://junkcode.samba.org/ftp/unpacked/junkcode/epoll_fork.c for an
105 demonstration of why this is needed
107 static void epoll_check_reopen(struct epoll_event_context *epoll_ev)
109 struct fd_event *fde;
112 if (epoll_ev->pid == getpid()) {
116 close(epoll_ev->epoll_fd);
117 epoll_ev->epoll_fd = epoll_create(64);
118 if (epoll_ev->epoll_fd == -1) {
119 DEBUG(0,("Failed to recreate epoll handle after fork\n"));
123 /* on exec, don't inherit the fd */
124 v = fcntl(epoll_ev->epoll_fd, F_GETFD, 0);
125 fcntl(epoll_ev->epoll_fd, F_SETFD, v | FD_CLOEXEC);
127 epoll_ev->pid = getpid();
128 for (fde=epoll_ev->fd_events;fde;fde=fde->next) {
129 epoll_add_event(epoll_ev, fde);
133 #define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT (1<<0)
134 #define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR (1<<1)
135 #define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR (1<<2)
138 add the epoll event to the given fd_event
140 static void epoll_add_event(struct epoll_event_context *epoll_ev, struct fd_event *fde)
142 struct epoll_event event;
144 if (epoll_ev->epoll_fd == -1) return;
146 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
148 /* if we don't want events yet, don't add an epoll_event */
149 if (fde->flags == 0) return;
152 event.events = epoll_map_flags(fde->flags);
153 event.data.ptr = fde;
154 if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event) != 0) {
155 epoll_fallback_to_select(epoll_ev, "EPOLL_CTL_ADD failed");
157 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
159 /* only if we want to read we want to tell the event handler about errors */
160 if (fde->flags & EVENT_FD_READ) {
161 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
166 delete the epoll event for given fd_event
168 static void epoll_del_event(struct epoll_event_context *epoll_ev, struct fd_event *fde)
170 struct epoll_event event;
172 DLIST_REMOVE(epoll_ev->fd_events, fde);
174 if (epoll_ev->epoll_fd == -1) return;
176 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
178 /* if there's no epoll_event, we don't need to delete it */
179 if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) return;
182 event.events = epoll_map_flags(fde->flags);
183 event.data.ptr = fde;
184 if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event) != 0) {
185 DEBUG(0,("epoll_del_event failed! probable early close bug (%s)\n", strerror(errno)));
187 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
191 change the epoll event to the given fd_event
193 static void epoll_mod_event(struct epoll_event_context *epoll_ev, struct fd_event *fde)
195 struct epoll_event event;
196 if (epoll_ev->epoll_fd == -1) return;
198 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
201 event.events = epoll_map_flags(fde->flags);
202 event.data.ptr = fde;
203 if (epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event) != 0) {
204 epoll_fallback_to_select(epoll_ev, "EPOLL_CTL_MOD failed");
207 /* only if we want to read we want to tell the event handler about errors */
208 if (fde->flags & EVENT_FD_READ) {
209 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
213 static void epoll_change_event(struct epoll_event_context *epoll_ev, struct fd_event *fde)
215 bool got_error = (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR);
216 bool want_read = (fde->flags & EVENT_FD_READ);
217 bool want_write= (fde->flags & EVENT_FD_WRITE);
219 if (epoll_ev->epoll_fd == -1) return;
221 fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
223 /* there's already an event */
224 if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT) {
225 if (want_read || (want_write && !got_error)) {
226 epoll_mod_event(epoll_ev, fde);
230 * if we want to match the select behavior, we need to remove the epoll_event
231 * when the caller isn't interested in events.
233 * this is because epoll reports EPOLLERR and EPOLLHUP, even without asking for them
235 epoll_del_event(epoll_ev, fde);
239 /* there's no epoll_event attached to the fde */
240 if (want_read || (want_write && !got_error)) {
241 DLIST_ADD(epoll_ev->fd_events, fde);
242 epoll_add_event(epoll_ev, fde);
248 event loop handling using epoll
250 static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval *tvalp)
254 struct epoll_event events[MAXEVENTS];
257 if (epoll_ev->epoll_fd == -1) return -1;
260 /* it's better to trigger timed events a bit later than to early */
261 timeout = ((tvalp->tv_usec+999) / 1000) + (tvalp->tv_sec*1000);
264 if (epoll_ev->ev->num_signal_handlers &&
265 common_event_check_signal(epoll_ev->ev)) {
269 ret = epoll_wait(epoll_ev->epoll_fd, events, MAXEVENTS, timeout);
271 if (ret == -1 && errno == EINTR && epoll_ev->ev->num_signal_handlers) {
272 if (common_event_check_signal(epoll_ev->ev)) {
277 if (ret == -1 && errno != EINTR) {
278 epoll_fallback_to_select(epoll_ev, "epoll_wait() failed");
282 if (ret == 0 && tvalp) {
283 /* we don't care about a possible delay here */
284 common_event_loop_timer_delay(epoll_ev->ev);
288 for (i=0;i<ret;i++) {
289 struct fd_event *fde = talloc_get_type(events[i].data.ptr,
294 epoll_fallback_to_select(epoll_ev, "epoll_wait() gave bad data");
297 if (events[i].events & (EPOLLHUP|EPOLLERR)) {
298 fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
300 * if we only wait for EVENT_FD_WRITE, we should not tell the
301 * event handler about it, and remove the epoll_event,
302 * as we only report errors when waiting for read events,
303 * to match the select() behavior
305 if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR)) {
306 epoll_del_event(epoll_ev, fde);
309 flags |= EVENT_FD_READ;
311 if (events[i].events & EPOLLIN) flags |= EVENT_FD_READ;
312 if (events[i].events & EPOLLOUT) flags |= EVENT_FD_WRITE;
314 fde->handler(epoll_ev->ev, fde, flags, fde->private_data);
323 create a epoll_event_context structure.
325 static int epoll_event_context_init(struct event_context *ev)
327 struct epoll_event_context *epoll_ev;
329 epoll_ev = talloc_zero(ev, struct epoll_event_context);
330 if (!epoll_ev) return -1;
332 epoll_ev->epoll_fd = -1;
334 epoll_init_ctx(epoll_ev);
336 ev->additional_data = epoll_ev;
343 static int epoll_event_fd_destructor(struct fd_event *fde)
345 struct event_context *ev = fde->event_ctx;
346 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
347 struct epoll_event_context);
349 epoll_check_reopen(epoll_ev);
351 epoll_ev->num_fd_events--;
353 epoll_del_event(epoll_ev, fde);
355 if (fde->flags & EVENT_FD_AUTOCLOSE) {
365 return NULL on failure (memory allocation error)
367 static struct fd_event *epoll_event_add_fd(struct event_context *ev, TALLOC_CTX *mem_ctx,
368 int fd, uint16_t flags,
369 event_fd_handler_t handler,
372 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
373 struct epoll_event_context);
374 struct fd_event *fde;
376 epoll_check_reopen(epoll_ev);
378 fde = talloc(mem_ctx?mem_ctx:ev, struct fd_event);
379 if (!fde) return NULL;
384 fde->handler = handler;
385 fde->private_data = private_data;
386 fde->additional_flags = 0;
387 fde->additional_data = NULL;
389 epoll_ev->num_fd_events++;
390 talloc_set_destructor(fde, epoll_event_fd_destructor);
392 DLIST_ADD(epoll_ev->fd_events, fde);
393 epoll_add_event(epoll_ev, fde);
400 return the fd event flags
402 static uint16_t epoll_event_get_fd_flags(struct fd_event *fde)
408 set the fd event flags
410 static void epoll_event_set_fd_flags(struct fd_event *fde, uint16_t flags)
412 struct event_context *ev;
413 struct epoll_event_context *epoll_ev;
415 if (fde->flags == flags) return;
418 epoll_ev = talloc_get_type(ev->additional_data, struct epoll_event_context);
422 epoll_check_reopen(epoll_ev);
424 epoll_change_event(epoll_ev, fde);
428 do a single event loop using the events defined in ev
430 static int epoll_event_loop_once(struct event_context *ev)
432 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
433 struct epoll_event_context);
436 tval = common_event_loop_timer_delay(ev);
437 if (timeval_is_zero(&tval)) {
441 epoll_check_reopen(epoll_ev);
443 return epoll_event_loop(epoll_ev, &tval);
447 return on failure or (with 0) if all fd events are removed
449 static int epoll_event_loop_wait(struct event_context *ev)
453 struct epoll_event_context *epoll_ev = talloc_get_type(ev->additional_data,
454 struct epoll_event_context);
455 while (epoll_ev->num_fd_events) {
456 if (epoll_event_loop_once(ev) != 0) {
459 if (getpid() == ctdbd_pid) {
463 DEBUG(0,(__location__ " ERROR Time skipped backward by %d seconds\n", (int)(t-new_t)));
465 /* We assume here that we get at least one event every 5 seconds */
467 DEBUG(0,(__location__ " ERROR Time jumped forward by %d seconds\n", (int)(new_t-t)));
477 static const struct event_ops epoll_event_ops = {
478 .context_init = epoll_event_context_init,
479 .add_fd = epoll_event_add_fd,
480 .get_fd_flags = epoll_event_get_fd_flags,
481 .set_fd_flags = epoll_event_set_fd_flags,
482 .add_timed = common_event_add_timed,
483 .add_signal = common_event_add_signal,
484 .loop_once = epoll_event_loop_once,
485 .loop_wait = epoll_event_loop_wait,
488 bool events_epoll_init(void)
490 return event_register_backend("epoll", &epoll_event_ops);
494 NTSTATUS s4_events_epoll_init(void)
496 if (!events_epoll_init()) {
497 return NT_STATUS_INTERNAL_ERROR;