Add "aio num threads" parameter to allow manual configuration of
[slow/samba.git] / source3 / modules / vfs_aio_pthread.c
1 /*
2  * Simulate Posix AIO using pthreads.
3  *
4  * Based on the aio_fork work from Volker and Volker's pthreadpool library.
5  *
6  * Copyright (C) Volker Lendecke 2008
7  * Copyright (C) Jeremy Allison 2012
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #include "includes.h"
25 #include "system/filesys.h"
26 #include "system/shmem.h"
27 #include "smbd/smbd.h"
28 #include "pthreadpool.h"
29
/* Opaque per-request state owned by smbd/aio.c; we only pass it back. */
struct aio_extra;

/* Process-wide thread pool, created lazily on first AIO request. */
static struct pthreadpool *pool;

/* Monotonically increasing id handed to pthreadpool_add_job(). */
static int aio_pthread_jobid;

/*
 * Per-request bookkeeping. One of these exists for every AIO request
 * handed to the thread pool, linked into pd_list until freed.
 */
struct aio_private_data {
        struct aio_private_data *prev, *next;   /* pd_list linkage. */
        int jobid;                              /* Matches pthreadpool job id. */
        SMB_STRUCT_AIOCB *aiocb;                /* Caller's control block; NULLed after return_fn. */
        ssize_t ret_size;                       /* pread/pwrite result, -1 on error. */
        int ret_errno;                          /* errno from worker; EINPROGRESS until done. */
        bool cancelled;                         /* Result must be discarded when set. */
        bool write_command;                     /* true => pwrite, false => pread. */
};

/* List of outstanding requests we have. */
struct aio_private_data *pd_list;

/* Forward declaration: fired when the pool's signal fd becomes readable. */
static void aio_pthread_handle_completion(struct event_context *event_ctx,
                                struct fd_event *event,
                                uint16 flags,
                                void *p);
51
52 /************************************************************************
53  How many threads to initialize ?
54  100 per process seems insane as a default until you realize that
55  (a) Threads terminate after 1 second when idle.
56  (b) Throttling is done in SMB2 via the crediting algorithm.
57  (c) SMB1 clients are limited to max_mux (50) outstanding requests and
58      Windows clients don't use this anyway.
59  Essentially we want this to be unlimited unless smb.conf says different.
60 ***********************************************************************/
61
62 static int aio_get_num_threads(struct vfs_handle_struct *handle)
63 {
64         return lp_parm_bool(SNUM(handle->conn),
65                                 "aio_pthread",
66                                 "aio num threads",
67                                 100);
68 }
69
70 #if 0
71 /************************************************************************
72  Called every 30 seconds to destroy pool if it's idle.
73 ***********************************************************************/
74
/*
 * (Currently compiled out via #if 0.)
 * Timer callback: tear down the thread pool when no AIO is outstanding,
 * otherwise re-arm itself for another 30 seconds.
 */
static void idle_pool_destroy_timer(struct tevent_context *ev,
                        struct tevent_timer *te,
                        struct timeval current_time,
                        void *private_data)
{
        struct timeval ne;

        /* One-shot timer: free it; we re-add below if still busy. */
        TALLOC_FREE(te);

        if (pool && pd_list == NULL) {
                /* No outstanding requests - safe to destroy the pool. */
                if (pthreadpool_destroy(pool) == 0) {
                        pool = NULL;
                }
                /* NOTE(review): this DEBUG fires even if destroy failed
                   and the pool is still alive — confirm intent. */
                DEBUG(10,("idle_pool_destroy_timer: destroyed AIO pool.\n"));
                return;
        }

        /* Here, the IO is still active. */

        /* Set an event up for 30 seconds time - if we have
           no outstanding IO at this time shut the threadpool
           down. */
        ne = tevent_timeval_current_ofs(30, 0);
        tevent_add_timer(server_event_context(),
                        NULL,
                        ne,
                        idle_pool_destroy_timer,
                        NULL);
}
104 #endif
105
106 /************************************************************************
107  Ensure thread pool is initialized.
108 ***********************************************************************/
109
/*
 * Lazily create the process-wide thread pool and hook its signal fd
 * into the main event loop so completions are noticed.
 * Returns true on success (or if the pool already exists); on failure
 * returns false with errno set.
 */
static bool init_aio_threadpool(struct vfs_handle_struct *handle)
{
        struct fd_event *sock_event = NULL;
        int ret = 0;
        int num_threads;
#if 0
        struct timeval ne;
#endif

        /* Already initialized - nothing to do. */
        if (pool) {
                return true;
        }

        num_threads = aio_get_num_threads(handle);
        ret = pthreadpool_init(num_threads, &pool);
        if (ret) {
                /* pthreadpool_init returns an errno value directly. */
                errno = ret;
                return false;
        }
        /* Watch the pool's signal fd: it becomes readable whenever a
           worker thread finishes a job. Registered on the long-lived
           server event context (NULL talloc parent). */
        sock_event = tevent_add_fd(server_event_context(),
                                NULL,
                                pthreadpool_signal_fd(pool),
                                TEVENT_FD_READ,
                                aio_pthread_handle_completion,
                                NULL);
        if (sock_event == NULL) {
                /* Undo pool creation so a later call can retry. */
                pthreadpool_destroy(pool);
                pool = NULL;
                return false;
        }

#if 0
        /* Set an event up for 30 seconds time - if we have
           no outstanding IO at this time shut the threadpool
           down. */
        ne = tevent_timeval_current_ofs(30, 0);
        tevent_add_timer(server_event_context(),
                        NULL,
                        ne,
                        idle_pool_destroy_timer,
                        NULL);
#endif

        DEBUG(10,("init_aio_threadpool: initialized with %d threads\n",
                        num_threads));

        return true;
}
158
159
160 /************************************************************************
161  Worker function - core of the pthread aio engine.
162  This is the function that actually does the IO.
163 ***********************************************************************/
164
165 static void aio_worker(void *private_data)
166 {
167         struct aio_private_data *pd =
168                         (struct aio_private_data *)private_data;
169
170         if (pd->write_command) {
171                 pd->ret_size = pwrite(pd->aiocb->aio_fildes,
172                                 (const void *)pd->aiocb->aio_buf,
173                                 pd->aiocb->aio_nbytes,
174                                 pd->aiocb->aio_offset);
175         } else {
176                 pd->ret_size = pread(pd->aiocb->aio_fildes,
177                                 (void *)pd->aiocb->aio_buf,
178                                 pd->aiocb->aio_nbytes,
179                                 pd->aiocb->aio_offset);
180         }
181         if (pd->ret_size == -1) {
182                 pd->ret_errno = errno;
183         } else {
184                 pd->ret_errno = 0;
185         }
186 }
187
188 /************************************************************************
189  Private data destructor.
190 ***********************************************************************/
191
/*
 * Talloc destructor for aio_private_data: unlink from the global
 * outstanding-request list when the memory is freed. Returns 0 to
 * allow the free to proceed.
 */
static int pd_destructor(struct aio_private_data *pd)
{
        DLIST_REMOVE(pd_list, pd);
        return 0;
}
197
198 /************************************************************************
199  Create and initialize a private data struct.
200 ***********************************************************************/
201
/*
 * Allocate and initialize the per-request tracking struct, assign the
 * next jobid, and link it onto pd_list. The destructor unlinks it again
 * on free. Returns NULL on allocation failure.
 */
static struct aio_private_data *create_private_data(TALLOC_CTX *ctx,
                                        SMB_STRUCT_AIOCB *aiocb)
{
        struct aio_private_data *pd = talloc_zero(ctx, struct aio_private_data);
        if (!pd) {
                return NULL;
        }
        pd->jobid = aio_pthread_jobid++;
        pd->aiocb = aiocb;
        /* Not finished yet: -1/EINPROGRESS until the worker runs. */
        pd->ret_size = -1;
        pd->ret_errno = EINPROGRESS;
        talloc_set_destructor(pd, pd_destructor);
        DLIST_ADD_END(pd_list, pd, struct aio_private_data *);
        return pd;
}
217
218 /************************************************************************
219  Spin off a threadpool (if needed) and initiate a pread call.
220 ***********************************************************************/
221
222 static int aio_pthread_read(struct vfs_handle_struct *handle,
223                                 struct files_struct *fsp,
224                                 SMB_STRUCT_AIOCB *aiocb)
225 {
226         struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
227         struct aio_private_data *pd = NULL;
228         int ret;
229
230         if (!init_aio_threadpool(handle)) {
231                 return -1;
232         }
233
234         pd = create_private_data(aio_ex, aiocb);
235         if (pd == NULL) {
236                 DEBUG(10, ("aio_pthread_read: Could not create private data.\n"));
237                 return -1;
238         }
239
240         ret = pthreadpool_add_job(pool, pd->jobid, aio_worker, (void *)pd);
241         if (ret) {
242                 errno = ret;
243                 return -1;
244         }
245
246         DEBUG(10, ("aio_pthread_read: jobid=%d pread requested "
247                 "of %llu bytes at offset %llu\n",
248                 pd->jobid,
249                 (unsigned long long)pd->aiocb->aio_nbytes,
250                 (unsigned long long)pd->aiocb->aio_offset));
251
252         return 0;
253 }
254
255 /************************************************************************
256  Spin off a threadpool (if needed) and initiate a pwrite call.
257 ***********************************************************************/
258
259 static int aio_pthread_write(struct vfs_handle_struct *handle,
260                                 struct files_struct *fsp,
261                                 SMB_STRUCT_AIOCB *aiocb)
262 {
263         struct aio_extra *aio_ex = (struct aio_extra *)aiocb->aio_sigevent.sigev_value.sival_ptr;
264         struct aio_private_data *pd = NULL;
265         int ret;
266
267         if (!init_aio_threadpool(handle)) {
268                 return -1;
269         }
270
271         pd = create_private_data(aio_ex, aiocb);
272         if (pd == NULL) {
273                 DEBUG(10, ("aio_pthread_write: Could not create private data.\n"));
274                 return -1;
275         }
276
277         pd->write_command = true;
278
279         ret = pthreadpool_add_job(pool, pd->jobid, aio_worker, (void *)pd);
280         if (ret) {
281                 errno = ret;
282                 return -1;
283         }
284
285         DEBUG(10, ("aio_pthread_write: jobid=%d pwrite requested "
286                 "of %llu bytes at offset %llu\n",
287                 pd->jobid,
288                 (unsigned long long)pd->aiocb->aio_nbytes,
289                 (unsigned long long)pd->aiocb->aio_offset));
290
291         return 0;
292 }
293
294 /************************************************************************
295  Find the private data by jobid.
296 ***********************************************************************/
297
298 static struct aio_private_data *find_private_data_by_jobid(int jobid)
299 {
300         struct aio_private_data *pd;
301
302         for (pd = pd_list; pd != NULL; pd = pd->next) {
303                 if (pd->jobid == jobid) {
304                         return pd;
305                 }
306         }
307
308         return NULL;
309 }
310
311 /************************************************************************
312  Callback when an IO completes.
313 ***********************************************************************/
314
/*
 * Event callback fired when the pool's signal fd is readable, i.e. a
 * worker thread finished a job. Retrieves the jobid, finds the matching
 * request, and notifies smbd's AIO layer.
 */
static void aio_pthread_handle_completion(struct event_context *event_ctx,
                                struct fd_event *event,
                                uint16 flags,
                                void *p)
{
        struct aio_extra *aio_ex = NULL;
        struct aio_private_data *pd = NULL;
        int jobid = 0;
        int ret;

        DEBUG(10, ("aio_pthread_handle_completion called with flags=%d\n",
                        (int)flags));

        /* Only readability of the signal fd means a job completed. */
        if ((flags & EVENT_FD_READ) == 0) {
                return;
        }

        ret = pthreadpool_finished_job(pool, &jobid);
        if (ret) {
                /* The signal fd said a job finished but the pool disagrees -
                   internal inconsistency, cannot continue safely. */
                smb_panic("aio_pthread_handle_completion");
                return;
        }

        pd = find_private_data_by_jobid(jobid);
        if (pd == NULL) {
                DEBUG(1, ("aio_pthread_handle_completion cannot find jobid %d\n",
                          jobid));
                return;
        }

        /* Hand the result back to smbd's generic AIO completion code;
           it will call our return/error hooks to fetch the result. */
        aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
        smbd_aio_complete_aio_ex(aio_ex);

        DEBUG(10,("aio_pthread_handle_completion: jobid %d completed\n",
                jobid ));

}
352
353 /************************************************************************
354  Find the private data by aiocb.
355 ***********************************************************************/
356
357 static struct aio_private_data *find_private_data_by_aiocb(SMB_STRUCT_AIOCB *aiocb)
358 {
359         struct aio_private_data *pd;
360
361         for (pd = pd_list; pd != NULL; pd = pd->next) {
362                 if (pd->aiocb == aiocb) {
363                         return pd;
364                 }
365         }
366
367         return NULL;
368 }
369
370 /************************************************************************
371  Called to return the result of a completed AIO.
372  Should only be called if aio_error returns something other than EINPROGRESS.
373  Returns:
374         Any other value - return from IO operation.
375 ***********************************************************************/
376
/************************************************************************
 Called to return the result of a completed AIO.
 Should only be called if aio_error returns something other than EINPROGRESS.
 Returns:
        Any other value - return from IO operation.
***********************************************************************/

static ssize_t aio_pthread_return_fn(struct vfs_handle_struct *handle,
                                struct files_struct *fsp,
                                SMB_STRUCT_AIOCB *aiocb)
{
        struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);

        if (pd == NULL) {
                errno = EINVAL;
                DEBUG(0, ("aio_pthread_return_fn: returning EINVAL\n"));
                return -1;
        }

        /* Mark the request as consumed; further lookups by this aiocb
           must not find it. */
        pd->aiocb = NULL;

        /* NOTE(review): a cancelled request falls through here and returns
           the worker's result rather than ECANCELED/-1 — confirm callers
           check aio_error first so this cannot be observed. */
        if (pd->ret_size == -1) {
                errno = pd->ret_errno;
        }

        return pd->ret_size;
}
397
398 /************************************************************************
399  Called to check the result of an AIO.
400  Returns:
401         EINPROGRESS - still in progress.
402         EINVAL - invalid aiocb.
403         ECANCELED - request was cancelled.
404         0 - request completed successfully.
405         Any other value - errno from IO operation.
406 ***********************************************************************/
407
408 static int aio_pthread_error_fn(struct vfs_handle_struct *handle,
409                              struct files_struct *fsp,
410                              SMB_STRUCT_AIOCB *aiocb)
411 {
412         struct aio_private_data *pd = find_private_data_by_aiocb(aiocb);
413
414         if (pd == NULL) {
415                 return EINVAL;
416         }
417         if (pd->cancelled) {
418                 return ECANCELED;
419         }
420         return pd->ret_errno;
421 }
422
423 /************************************************************************
424  Called to request the cancel of an AIO, or all of them on a specific
425  fsp if aiocb == NULL.
426 ***********************************************************************/
427
428 static int aio_pthread_cancel(struct vfs_handle_struct *handle,
429                         struct files_struct *fsp,
430                         SMB_STRUCT_AIOCB *aiocb)
431 {
432         struct aio_private_data *pd = NULL;
433
434         for (pd = pd_list; pd != NULL; pd = pd->next) {
435                 if (pd->aiocb == NULL) {
436                         continue;
437                 }
438                 if (pd->aiocb->aio_fildes != fsp->fh->fd) {
439                         continue;
440                 }
441                 if ((aiocb != NULL) && (pd->aiocb != aiocb)) {
442                         continue;
443                 }
444
445                 /*
446                  * We let the child do its job, but we discard the result when
447                  * it's finished.
448                  */
449
450                 pd->cancelled = true;
451         }
452
453         return AIO_CANCELED;
454 }
455
456 /************************************************************************
457  Callback for a previously detected job completion.
458 ***********************************************************************/
459
/*
 * Immediate-event callback: complete a job that was detected as finished
 * inside the suspend sub-event-loop but does not belong to the set being
 * waited on there. private_data is a talloc'ed int holding the jobid;
 * it is freed here on every path.
 */
static void aio_pthread_handle_immediate(struct tevent_context *ctx,
                                struct tevent_immediate *im,
                                void *private_data)
{
        struct aio_extra *aio_ex = NULL;
        int *pjobid = (int *)private_data;
        struct aio_private_data *pd = find_private_data_by_jobid(*pjobid);

        if (pd == NULL) {
                DEBUG(1, ("aio_pthread_handle_immediate cannot find jobid %d\n",
                          *pjobid));
                TALLOC_FREE(pjobid);
                return;
        }

        /* Free the jobid holder before completing - pd may be freed by
           the completion path below. */
        TALLOC_FREE(pjobid);
        aio_ex = (struct aio_extra *)pd->aiocb->aio_sigevent.sigev_value.sival_ptr;
        smbd_aio_complete_aio_ex(aio_ex);
}
479
480 /************************************************************************
481  Private data struct used in suspend completion code.
482 ***********************************************************************/
483
/* State shared with the suspend completion handler: the set of aiocbs
   being waited on and a running count of how many have finished. */
struct suspend_private {
        int num_entries;        /* Size of aiocb_array. */
        int num_finished;       /* Incremented as waited-on IOs complete. */
        const SMB_STRUCT_AIOCB * const *aiocb_array;    /* IOs we wait for. */
};
489
490 /************************************************************************
491  Callback when an IO completes from a suspend call.
492 ***********************************************************************/
493
494 static void aio_pthread_handle_suspend_completion(struct event_context *event_ctx,
495                                 struct fd_event *event,
496                                 uint16 flags,
497                                 void *p)
498 {
499         struct suspend_private *sp = (struct suspend_private *)p;
500         struct aio_private_data *pd = NULL;
501         struct tevent_immediate *im = NULL;
502         int *pjobid = NULL;
503         int i;
504
505         DEBUG(10, ("aio_pthread_handle_suspend_completion called with flags=%d\n",
506                         (int)flags));
507
508         if ((flags & EVENT_FD_READ) == 0) {
509                 return;
510         }
511
512         pjobid = talloc_array(NULL, int, 1);
513         if (pjobid) {
514                 smb_panic("aio_pthread_handle_suspend_completion: no memory.");
515         }
516
517         if (pthreadpool_finished_job(pool, pjobid)) {
518                 smb_panic("aio_pthread_handle_suspend_completion: can't find job.");
519                 return;
520         }
521
522         pd = find_private_data_by_jobid(*pjobid);
523         if (pd == NULL) {
524                 DEBUG(1, ("aio_pthread_handle_completion cannot find jobid %d\n",
525                           *pjobid));
526                 TALLOC_FREE(pjobid);
527                 return;
528         }
529
530         /* Is this a jobid with an aiocb we're interested in ? */
531         for (i = 0; i < sp->num_entries; i++) {
532                 if (sp->aiocb_array[i] == pd->aiocb) {
533                         sp->num_finished++;
534                         TALLOC_FREE(pjobid);
535                         return;
536                 }
537         }
538
539         /* Jobid completed we weren't waiting for.
540            We must reshedule this as an immediate event
541            on the main event context. */
542         im = tevent_create_immediate(NULL);
543         if (!im) {
544                 exit_server_cleanly("aio_pthread_handle_suspend_completion: no memory");
545         }
546
547         DEBUG(10,("aio_pthread_handle_suspend_completion: "
548                         "re-scheduling job id %d\n",
549                         *pjobid));
550
551         tevent_schedule_immediate(im,
552                         server_event_context(),
553                         aio_pthread_handle_immediate,
554                         (void *)pjobid);
555 }
556
557
/*
 * Timer callback for aio_pthread_suspend(): flags the caller's
 * timed_out bool so the suspend loop can bail out with EAGAIN.
 */
static void aio_pthread_suspend_timed_out(struct tevent_context *event_ctx,
                                        struct tevent_timer *te,
                                        struct timeval now,
                                        void *private_data)
{
        bool *timed_out = (bool *)private_data;
        /* Remove this timed event handler. */
        TALLOC_FREE(te);
        *timed_out = true;
}
568
569 /************************************************************************
570  Called to request everything to stop until all IO is completed.
571 ***********************************************************************/
572
/*
 * VFS aio_suspend hook: block until every aiocb in aiocb_array[0..n-1]
 * has completed, or until the optional timeout expires (errno = EAGAIN).
 * Runs a private sub-event-loop so the main loop's handlers do not fire;
 * completions for IOs we are not waiting on are re-queued as immediate
 * events on the main event context. Returns 0 on success, -1 on error.
 */
static int aio_pthread_suspend(struct vfs_handle_struct *handle,
                        struct files_struct *fsp,
                        const SMB_STRUCT_AIOCB * const aiocb_array[],
                        int n,
                        const struct timespec *timeout)
{
        struct event_context *ev = NULL;
        struct fd_event *sock_event = NULL;
        int ret = -1;
        struct suspend_private sp;
        bool timed_out = false;
        /* Everything below hangs off this frame and is freed at "out:". */
        TALLOC_CTX *frame = talloc_stackframe();

        /* This is a blocking call, and has to use a sub-event loop. */
        ev = event_context_init(frame);
        if (ev == NULL) {
                errno = ENOMEM;
                goto out;
        }

        if (timeout) {
                struct timeval tv = convert_timespec_to_timeval(*timeout);
                struct tevent_timer *te = tevent_add_timer(ev,
                                                frame,
                                                timeval_current_ofs(tv.tv_sec,
                                                                    tv.tv_usec),
                                                aio_pthread_suspend_timed_out,
                                                &timed_out);
                if (!te) {
                        errno = ENOMEM;
                        goto out;
                }
        }

        ZERO_STRUCT(sp);
        sp.num_entries = n;
        sp.aiocb_array = aiocb_array;
        sp.num_finished = 0;

        /* Watch the pool's signal fd on OUR event context so job
           completions are delivered to the suspend handler, not the
           main loop's handler, while we block here. */
        sock_event = tevent_add_fd(ev,
                                frame,
                                pthreadpool_signal_fd(pool),
                                TEVENT_FD_READ,
                                aio_pthread_handle_suspend_completion,
                                (void *)&sp);
        if (sock_event == NULL) {
                /* NOTE(review): destroying the whole pool because one fd
                   registration failed looks drastic - confirm this is the
                   intended recovery, as outstanding jobs are abandoned. */
                pthreadpool_destroy(pool);
                pool = NULL;
                goto out;
        }
        /*
         * We're going to cheat here. We know that smbd/aio.c
         * only calls this when it's waiting for every single
         * outstanding call to finish on a close, so just wait
         * individually for each IO to complete. We don't care
         * what order they finish - only that they all do. JRA.
         */
        while (sp.num_entries != sp.num_finished) {
                if (tevent_loop_once(ev) == -1) {
                        goto out;
                }

                if (timed_out) {
                        errno = EAGAIN;
                        goto out;
                }
        }

        ret = 0;

  out:

        /* Frees ev, the timer and sock_event in one go. */
        TALLOC_FREE(frame);
        return ret;
}
648
/* VFS dispatch table: only the AIO entry points are overridden; all
   other operations fall through to the next module in the chain. */
static struct vfs_fn_pointers vfs_aio_pthread_fns = {
        .aio_read_fn = aio_pthread_read,
        .aio_write_fn = aio_pthread_write,
        .aio_return_fn = aio_pthread_return_fn,
        .aio_cancel_fn = aio_pthread_cancel,
        .aio_error_fn = aio_pthread_error_fn,
        .aio_suspend_fn = aio_pthread_suspend,
};
657
/* Module entry point: register this VFS module as "aio_pthread". */
NTSTATUS vfs_aio_pthread_init(void);
NTSTATUS vfs_aio_pthread_init(void)
{
        return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
                                "aio_pthread", &vfs_aio_pthread_fns);
}