s3: smbd: Remove aio_pending_size from globals.
[nivanova/samba-autobuild/.git] / source3 / modules / vfs_aio_pthread.c
1 /*
2  * Simulate Posix AIO using pthreads.
3  *
4  * Based on the aio_fork work from Volker and Volker's pthreadpool library.
5  *
6  * Copyright (C) Volker Lendecke 2008
7  * Copyright (C) Jeremy Allison 2012
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #include "includes.h"
25 #include "system/filesys.h"
26 #include "system/shmem.h"
27 #include "smbd/smbd.h"
28 #include "smbd/globals.h"
29 #include "lib/pthreadpool/pthreadpool.h"
30 #ifdef HAVE_LINUX_FALLOC_H
31 #include <linux/falloc.h>
32 #endif
33
34 #if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
35
36 /************************************************************************
37  Ensure thread pool is initialized.
38 ***********************************************************************/
39
40 static bool init_aio_threadpool(struct tevent_context *ev_ctx,
41                                 struct pthreadpool **pp_pool,
42                                 void (*completion_fn)(struct tevent_context *,
43                                                 struct tevent_fd *,
44                                                 uint16_t,
45                                                 void *))
46 {
47         struct tevent_fd *sock_event = NULL;
48         int ret = 0;
49
50         if (*pp_pool) {
51                 return true;
52         }
53
54         ret = pthreadpool_init(get_aio_pending_size(), pp_pool);
55         if (ret) {
56                 errno = ret;
57                 return false;
58         }
59         sock_event = tevent_add_fd(ev_ctx,
60                                 NULL,
61                                 pthreadpool_signal_fd(*pp_pool),
62                                 TEVENT_FD_READ,
63                                 completion_fn,
64                                 NULL);
65         if (sock_event == NULL) {
66                 pthreadpool_destroy(*pp_pool);
67                 *pp_pool = NULL;
68                 return false;
69         }
70
71         DEBUG(10,("init_aio_threadpool: initialized with up to %d threads\n",
72                   get_aio_pending_size()));
73
74         return true;
75 }
76
77 /*
78  * We must have openat() to do any thread-based
79  * asynchronous opens. We also must be using
80  * thread-specific credentials (Linux-only
81  * for now).
82  */
83
84 /*
85  * NB. This threadpool is shared over all
86  * instances of this VFS module in this
87  * process, as is the current jobid.
88  */
89
/* Threadpool shared by every instance of this VFS module in the process. */
static struct pthreadpool *open_pool;
/* Monotonically increasing id handed to each queued async-open job. */
static int aio_pthread_open_jobid;
92
/* Per-request state for one asynchronous open, linked on open_pd_list. */
struct aio_open_private_data {
	struct aio_open_private_data *prev, *next;
	/* Inputs. */
	int jobid;		/* pthreadpool job id for this open. */
	int dir_fd;		/* fd on the parent directory, -1 if unset. */
	int flags;		/* open(2) flags requested by the caller. */
	mode_t mode;		/* create mode requested by the caller. */
	uint64_t mid;		/* SMB message id used to match/reschedule. */
	bool in_progress;	/* true until the worker thread has finished. */
	const char *fname;	/* filename relative to dname. */
	char *dname;		/* parent directory path. */
	struct smbd_server_connection *sconn;
	/* Credentials the worker thread assumes before opening. */
	const struct security_unix_token *ux_tok;
	uint64_t initial_allocation_size; /* preallocation hint in bytes. */
	/* Returns. */
	int ret_fd;		/* resulting fd, or -1 on failure. */
	int ret_errno;		/* resulting errno; EINPROGRESS until done. */
};

/* List of outstanding requests we have. */
static struct aio_open_private_data *open_pd_list;
114
115 /************************************************************************
116  Find the open private data by jobid.
117 ***********************************************************************/
118
119 static struct aio_open_private_data *find_open_private_data_by_jobid(int jobid)
120 {
121         struct aio_open_private_data *opd;
122
123         for (opd = open_pd_list; opd != NULL; opd = opd->next) {
124                 if (opd->jobid == jobid) {
125                         return opd;
126                 }
127         }
128
129         return NULL;
130 }
131
132 /************************************************************************
133  Find the open private data by mid.
134 ***********************************************************************/
135
136 static struct aio_open_private_data *find_open_private_data_by_mid(uint64_t mid)
137 {
138         struct aio_open_private_data *opd;
139
140         for (opd = open_pd_list; opd != NULL; opd = opd->next) {
141                 if (opd->mid == mid) {
142                         return opd;
143                 }
144         }
145
146         return NULL;
147 }
148
149 /************************************************************************
150  Callback when an open completes.
151 ***********************************************************************/
152
/*
 * Event-loop callback fired when the threadpool signal fd becomes
 * readable, i.e. when a worker thread has finished an async open.
 * Marks the matching request complete and reschedules the deferred
 * SMB open message that is waiting on it.
 */
static void aio_open_handle_completion(struct tevent_context *event_ctx,
				struct tevent_fd *event,
				uint16_t flags,
				void *p)
{
	struct aio_open_private_data *opd = NULL;
	int jobid = 0;
	int ret;
	struct smbXsrv_connection *xconn;

	DEBUG(10, ("aio_open_handle_completion called with flags=%d\n",
		(int)flags));

	/* Only readability on the signal fd is meaningful here. */
	if ((flags & TEVENT_FD_READ) == 0) {
		return;
	}

	/* Retrieve exactly one finished jobid from the shared pool. */
	ret = pthreadpool_finished_jobs(open_pool, &jobid, 1);
	if (ret != 1) {
		smb_panic("aio_open_handle_completion");
		/* notreached. */
		return;
	}

	opd = find_open_private_data_by_jobid(jobid);
	if (opd == NULL) {
		/* A completion for a job we never queued - fatal. */
		DEBUG(0, ("aio_open_handle_completion cannot find jobid %d\n",
			jobid));
		smb_panic("aio_open_handle_completion - no jobid");
		/* notreached. */
		return;
	}

	DEBUG(10,("aio_open_handle_completion: jobid %d mid %llu "
		"for file %s/%s completed\n",
		jobid,
		(unsigned long long)opd->mid,
		opd->dname,
		opd->fname));

	opd->in_progress = false;

	/*
	 * TODO: In future we need a proper algorithm
	 * to find the correct connection for a fsp.
	 * For now we only have one connection, so this is correct...
	 */
	xconn = opd->sconn->client->connections;

	/* Find outstanding event and reschedule. */
	if (!schedule_deferred_open_message_smb(xconn, opd->mid)) {
		/*
		 * Outstanding event didn't exist or was
		 * cancelled. Free up the fd and throw
		 * away the result.
		 */
		if (opd->ret_fd != -1) {
			close(opd->ret_fd);
			opd->ret_fd = -1;
		}
		TALLOC_FREE(opd);
	}
}
216
217 /*****************************************************************
218  The core of the async open code - the worker function. Note we
219  use the new openat() system call to avoid any problems with
220  current working directory changes plus we change credentials
221  on the thread to prevent any security race conditions.
222 *****************************************************************/
223
/*****************************************************************
 The core of the async open code - the worker function. Note we
 use the new openat() system call to avoid any problems with
 current working directory changes plus we change credentials
 on the thread to prevent any security race conditions.
*****************************************************************/

static void aio_open_worker(void *private_data)
{
	struct aio_open_private_data *opd =
		(struct aio_open_private_data *)private_data;

	/* Become the correct credential on this thread. */
	if (set_thread_credentials(opd->ux_tok->uid,
				opd->ux_tok->gid,
				(size_t)opd->ux_tok->ngroups,
				opd->ux_tok->groups) != 0) {
		/* Never open with the wrong credentials - fail instead. */
		opd->ret_fd = -1;
		opd->ret_errno = errno;
		return;
	}

	/*
	 * Open relative to the pre-opened parent directory fd, so a
	 * cwd change in the main process cannot redirect the open.
	 */
	opd->ret_fd = openat(opd->dir_fd,
			opd->fname,
			opd->flags,
			opd->mode);

	if (opd->ret_fd == -1) {
		opd->ret_errno = errno;
	} else {
		/* Create was successful. */
		opd->ret_errno = 0;

#if defined(HAVE_LINUX_FALLOCATE)
		/*
		 * See if we can set the initial
		 * allocation size. We don't record
		 * the return for this as it's an
		 * optimization - the upper layer
		 * will also do this for us once
		 * the open returns.
		 */
		if (opd->initial_allocation_size) {
			(void)fallocate(opd->ret_fd,
					FALLOC_FL_KEEP_SIZE,
					0,
					(off_t)opd->initial_allocation_size);
		}
#endif
	}
}
268
269 /************************************************************************
270  Open private data destructor.
271 ***********************************************************************/
272
273 static int opd_destructor(struct aio_open_private_data *opd)
274 {
275         if (opd->dir_fd != -1) {
276                 close(opd->dir_fd);
277         }
278         DLIST_REMOVE(open_pd_list, opd);
279         return 0;
280 }
281
282 /************************************************************************
283  Create and initialize a private data struct for async open.
284 ***********************************************************************/
285
/*
 * Allocate and fill an aio_open_private_data record for an async open
 * of fsp. Captures the caller's credentials, splits the path into
 * parent-directory + relative name, and opens an fd on the parent
 * directory for the worker's openat(). Returns NULL on any failure
 * (everything allocated so far is freed). On success the record is
 * linked onto open_pd_list with a destructor that unlinks it and
 * closes dir_fd.
 */
static struct aio_open_private_data *create_private_open_data(const files_struct *fsp,
					int flags,
					mode_t mode)
{
	struct aio_open_private_data *opd = talloc_zero(NULL,
					struct aio_open_private_data);
	const char *fname = NULL;

	if (!opd) {
		return NULL;
	}

	/* Hand out the next process-wide jobid. */
	opd->jobid = aio_pthread_open_jobid++;
	opd->dir_fd = -1;
	opd->ret_fd = -1;
	/* EINPROGRESS until the worker thread records a real result. */
	opd->ret_errno = EINPROGRESS;
	opd->flags = flags;
	opd->mode = mode;
	opd->mid = fsp->mid;
	opd->in_progress = true;
	opd->sconn = fsp->conn->sconn;
	opd->initial_allocation_size = fsp->initial_allocation_size;

	/* Copy our current credentials. */
	opd->ux_tok = copy_unix_token(opd, get_current_utok(fsp->conn));
	if (opd->ux_tok == NULL) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/*
	 * Copy the parent directory name and the
	 * relative path within it.
	 */
	if (parent_dirname(opd,
			fsp->fsp_name->base_name,
			&opd->dname,
			&fname) == false) {
		TALLOC_FREE(opd);
		return NULL;
	}
	opd->fname = talloc_strdup(opd, fname);
	if (opd->fname == NULL) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/* O_DIRECTORY (where available) guarantees dname is a directory. */
#if defined(O_DIRECTORY)
	opd->dir_fd = open(opd->dname, O_RDONLY|O_DIRECTORY);
#else
	opd->dir_fd = open(opd->dname, O_RDONLY);
#endif
	if (opd->dir_fd == -1) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/* Destructor unlinks from open_pd_list and closes dir_fd. */
	talloc_set_destructor(opd, opd_destructor);
	DLIST_ADD_END(open_pd_list, opd, struct aio_open_private_data *);
	return opd;
}
347
348 /*****************************************************************
349  Setup an async open.
350 *****************************************************************/
351
352 static int open_async(const files_struct *fsp,
353                         int flags,
354                         mode_t mode)
355 {
356         struct aio_open_private_data *opd = NULL;
357         int ret;
358
359         if (!init_aio_threadpool(fsp->conn->sconn->ev_ctx,
360                         &open_pool,
361                         aio_open_handle_completion)) {
362                 return -1;
363         }
364
365         opd = create_private_open_data(fsp, flags, mode);
366         if (opd == NULL) {
367                 DEBUG(10, ("open_async: Could not create private data.\n"));
368                 return -1;
369         }
370
371         ret = pthreadpool_add_job(open_pool,
372                                 opd->jobid,
373                                 aio_open_worker,
374                                 (void *)opd);
375         if (ret) {
376                 errno = ret;
377                 return -1;
378         }
379
380         DEBUG(5,("open_async: mid %llu jobid %d created for file %s/%s\n",
381                 (unsigned long long)opd->mid,
382                 opd->jobid,
383                 opd->dname,
384                 opd->fname));
385
386         /* Cause the calling code to reschedule us. */
387         errno = EINTR; /* Maps to NT_STATUS_RETRY. */
388         return -1;
389 }
390
391 /*****************************************************************
392  Look for a matching SMB2 mid. If we find it we're rescheduled,
393  just return the completed open.
394 *****************************************************************/
395
396 static bool find_completed_open(files_struct *fsp,
397                                 int *p_fd,
398                                 int *p_errno)
399 {
400         struct aio_open_private_data *opd;
401
402         opd = find_open_private_data_by_mid(fsp->mid);
403         if (!opd) {
404                 return false;
405         }
406
407         if (opd->in_progress) {
408                 DEBUG(0,("find_completed_open: mid %llu "
409                         "jobid %d still in progress for "
410                         "file %s/%s. PANIC !\n",
411                         (unsigned long long)opd->mid,
412                         opd->jobid,
413                         opd->dname,
414                         opd->fname));
415                 /* Disaster ! This is an open timeout. Just panic. */
416                 smb_panic("find_completed_open - in_progress\n");
417                 /* notreached. */
418                 return false;
419         }
420
421         *p_fd = opd->ret_fd;
422         *p_errno = opd->ret_errno;
423
424         DEBUG(5,("find_completed_open: mid %llu returning "
425                 "fd = %d, errno = %d (%s) "
426                 "jobid (%d) for file %s\n",
427                 (unsigned long long)opd->mid,
428                 opd->ret_fd,
429                 opd->ret_errno,
430                 strerror(opd->ret_errno),
431                 opd->jobid,
432                 smb_fname_str_dbg(fsp->fsp_name)));
433
434         /* Now we can free the opd. */
435         TALLOC_FREE(opd);
436         return true;
437 }
438
439 /*****************************************************************
440  The core open function. Only go async on O_CREAT|O_EXCL
441  opens to prevent any race conditions.
442 *****************************************************************/
443
444 static int aio_pthread_open_fn(vfs_handle_struct *handle,
445                         struct smb_filename *smb_fname,
446                         files_struct *fsp,
447                         int flags,
448                         mode_t mode)
449 {
450         int my_errno = 0;
451         int fd = -1;
452         bool aio_allow_open = lp_parm_bool(
453                 SNUM(handle->conn), "aio_pthread", "aio open", false);
454
455         if (smb_fname->stream_name) {
456                 /* Don't handle stream opens. */
457                 errno = ENOENT;
458                 return -1;
459         }
460
461         if (!aio_allow_open) {
462                 /* aio opens turned off. */
463                 return open(smb_fname->base_name, flags, mode);
464         }
465
466         if (!(flags & O_CREAT)) {
467                 /* Only creates matter. */
468                 return open(smb_fname->base_name, flags, mode);
469         }
470
471         if (!(flags & O_EXCL)) {
472                 /* Only creates with O_EXCL matter. */
473                 return open(smb_fname->base_name, flags, mode);
474         }
475
476         /*
477          * See if this is a reentrant call - i.e. is this a
478          * restart of an existing open that just completed.
479          */
480
481         if (find_completed_open(fsp,
482                                 &fd,
483                                 &my_errno)) {
484                 errno = my_errno;
485                 return fd;
486         }
487
488         /* Ok, it's a create exclusive call - pass it to a thread helper. */
489         return open_async(fsp, flags, mode);
490 }
491 #endif
492
/* VFS operations table: only the open path is intercepted, and only
 * when openat() and per-thread credentials are available. */
static struct vfs_fn_pointers vfs_aio_pthread_fns = {
#if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
	.open_fn = aio_pthread_open_fn,
#endif
};
498
/* Module entry point: register this VFS module as "aio_pthread". */
NTSTATUS vfs_aio_pthread_init(void);
NTSTATUS vfs_aio_pthread_init(void)
{
	return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				"aio_pthread", &vfs_aio_pthread_fns);
}