s3-vfs: Put vfs_aixacl_util.c helper functions into a header file
[kai/samba.git] / source3 / modules / vfs_aio_pthread.c
1 /*
2  * Simulate Posix AIO using pthreads.
3  *
4  * Based on the aio_fork work from Volker and Volker's pthreadpool library.
5  *
6  * Copyright (C) Volker Lendecke 2008
7  * Copyright (C) Jeremy Allison 2012
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #include "includes.h"
25 #include "system/filesys.h"
26 #include "system/shmem.h"
27 #include "smbd/smbd.h"
28 #include "smbd/globals.h"
29 #include "lib/pthreadpool/pthreadpool.h"
30 #ifdef HAVE_LINUX_FALLOC_H
31 #include <linux/falloc.h>
32 #endif
33
34 #if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
35
36 /************************************************************************
37  Ensure thread pool is initialized.
38 ***********************************************************************/
39
40 static bool init_aio_threadpool(struct event_context *ev_ctx,
41                                 struct pthreadpool **pp_pool,
42                                 void (*completion_fn)(struct event_context *,
43                                                 struct fd_event *,
44                                                 uint16,
45                                                 void *))
46 {
47         struct fd_event *sock_event = NULL;
48         int ret = 0;
49
50         if (*pp_pool) {
51                 return true;
52         }
53
54         ret = pthreadpool_init(aio_pending_size, pp_pool);
55         if (ret) {
56                 errno = ret;
57                 return false;
58         }
59         sock_event = tevent_add_fd(ev_ctx,
60                                 NULL,
61                                 pthreadpool_signal_fd(*pp_pool),
62                                 TEVENT_FD_READ,
63                                 completion_fn,
64                                 NULL);
65         if (sock_event == NULL) {
66                 pthreadpool_destroy(*pp_pool);
67                 *pp_pool = NULL;
68                 return false;
69         }
70
71         DEBUG(10,("init_aio_threadpool: initialized with up to %d threads\n",
72                   aio_pending_size));
73
74         return true;
75 }
76
77 /*
78  * We must have openat() to do any thread-based
79  * asynchronous opens. We also must be using
80  * thread-specific credentials (Linux-only
81  * for now).
82  */
83
84 /*
85  * NB. This threadpool is shared over all
86  * instances of this VFS module in this
87  * process, as is the current jobid.
88  */
89
/* Lazily-created worker pool shared by all module instances in this
 * process (see init_aio_threadpool). */
static struct pthreadpool *open_pool;
/* Monotonically increasing id assigned to each queued open job. */
static int aio_pthread_open_jobid;

/* Per-request state handed to the worker thread and looked up again
 * on completion (by jobid) or on SMB re-issue (by mid). */
struct aio_open_private_data {
	struct aio_open_private_data *prev, *next; /* open_pd_list links. */
	/* Inputs. */
	int jobid;		/* Id this job was queued under. */
	int dir_fd;		/* Open fd on the parent directory. */
	int flags;		/* Flags passed through to openat(). */
	mode_t mode;		/* Create mode passed through to openat(). */
	uint64_t mid;		/* Message id of the deferred SMB open. */
	bool in_progress;	/* True until the completion callback runs. */
	const char *fname;	/* Name relative to dir_fd (talloc child). */
	char *dname;		/* Parent directory path (talloc child). */
	struct smbd_server_connection *sconn;
	const struct security_unix_token *ux_tok; /* Creds for the worker. */
	uint64_t initial_allocation_size; /* Preallocation hint, 0 = none. */
	/* Returns. */
	int ret_fd;		/* Resulting fd, or -1 on failure. */
	int ret_errno;		/* errno on failure; EINPROGRESS while pending. */
};

/* List of outstanding requests we have. */
static struct aio_open_private_data *open_pd_list;
114
115 /************************************************************************
116  Find the open private data by jobid.
117 ***********************************************************************/
118
119 static struct aio_open_private_data *find_open_private_data_by_jobid(int jobid)
120 {
121         struct aio_open_private_data *opd;
122
123         for (opd = open_pd_list; opd != NULL; opd = opd->next) {
124                 if (opd->jobid == jobid) {
125                         return opd;
126                 }
127         }
128
129         return NULL;
130 }
131
132 /************************************************************************
133  Find the open private data by mid.
134 ***********************************************************************/
135
136 static struct aio_open_private_data *find_open_private_data_by_mid(uint64_t mid)
137 {
138         struct aio_open_private_data *opd;
139
140         for (opd = open_pd_list; opd != NULL; opd = opd->next) {
141                 if (opd->mid == mid) {
142                         return opd;
143                 }
144         }
145
146         return NULL;
147 }
148
149 /************************************************************************
150  Callback when an open completes.
151 ***********************************************************************/
152
/*
 * tevent callback fired when the threadpool signal fd becomes readable,
 * i.e. when a worker thread has finished an openat() job. Fetches the
 * finished jobid, marks the matching request complete, and reschedules
 * the deferred SMB open so the client request can be re-processed.
 */
static void aio_open_handle_completion(struct event_context *event_ctx,
				struct fd_event *event,
				uint16 flags,
				void *p)
{
	struct aio_open_private_data *opd = NULL;
	int jobid = 0;
	int ret;

	DEBUG(10, ("aio_open_handle_completion called with flags=%d\n",
		(int)flags));

	/* Only readability of the signal fd means a job finished. */
	if ((flags & EVENT_FD_READ) == 0) {
		return;
	}

	ret = pthreadpool_finished_job(open_pool, &jobid);
	if (ret) {
		/* The signal fd said a job was done - failure here means
		 * internal state corruption, so don't try to limp on. */
		smb_panic("aio_open_handle_completion");
		/* notreached. */
		return;
	}

	opd = find_open_private_data_by_jobid(jobid);
	if (opd == NULL) {
		DEBUG(0, ("aio_open_handle_completion cannot find jobid %d\n",
			jobid));
		smb_panic("aio_open_handle_completion - no jobid");
		/* notreached. */
		return;
	}

	DEBUG(10,("aio_open_handle_completion: jobid %d mid %llu "
		"for file %s/%s completed\n",
		jobid,
		(unsigned long long)opd->mid,
		opd->dname,
		opd->fname));

	opd->in_progress = false;

	/* Find outstanding event and reschedule. */
	if (!schedule_deferred_open_message_smb(opd->sconn, opd->mid)) {
		/*
		 * Outstanding event didn't exist or was
		 * cancelled. Free up the fd and throw
		 * away the result.
		 */
		if (opd->ret_fd != -1) {
			close(opd->ret_fd);
			opd->ret_fd = -1;
		}
		TALLOC_FREE(opd);
	}
}
208
209 /*****************************************************************
210  The core of the async open code - the worker function. Note we
211  use the new openat() system call to avoid any problems with
212  current working directory changes plus we change credentials
213  on the thread to prevent any security race conditions.
214 *****************************************************************/
215
static void aio_open_worker(void *private_data)
{
	struct aio_open_private_data *opd =
		(struct aio_open_private_data *)private_data;

	/* Become the correct credential on this thread. */
	if (set_thread_credentials(opd->ux_tok->uid,
				opd->ux_tok->gid,
				(size_t)opd->ux_tok->ngroups,
				opd->ux_tok->groups) != 0) {
		/* Couldn't assume the caller's identity - fail the open
		 * rather than create the file with the wrong owner. */
		opd->ret_fd = -1;
		opd->ret_errno = errno;
		return;
	}

	/*
	 * Open relative to the already-open parent directory fd so a
	 * concurrent cwd change in the main process can't affect us.
	 */
	opd->ret_fd = openat(opd->dir_fd,
			opd->fname,
			opd->flags,
			opd->mode);

	if (opd->ret_fd == -1) {
		opd->ret_errno = errno;
	} else {
		/* Create was successful. */
		opd->ret_errno = 0;

#if defined(HAVE_LINUX_FALLOCATE)
		/*
		 * See if we can set the initial
		 * allocation size. We don't record
		 * the return for this as it's an
		 * optimization - the upper layer
		 * will also do this for us once
		 * the open returns.
		 */
		if (opd->initial_allocation_size) {
			(void)fallocate(opd->ret_fd,
					FALLOC_FL_KEEP_SIZE,
					0,
					(off_t)opd->initial_allocation_size);
		}
#endif
	}
}
260
261 /************************************************************************
262  Open private data destructor.
263 ***********************************************************************/
264
265 static int opd_destructor(struct aio_open_private_data *opd)
266 {
267         if (opd->dir_fd != -1) {
268                 close(opd->dir_fd);
269         }
270         DLIST_REMOVE(open_pd_list, opd);
271         return 0;
272 }
273
274 /************************************************************************
275  Create and initialize a private data struct for async open.
276 ***********************************************************************/
277
/*
 * Allocate and fill an aio_open_private_data for an async open of
 * fsp, capturing the current credentials, the parent directory (as
 * an open fd) and the relative filename. On success the opd is on
 * open_pd_list with a destructor that closes dir_fd and unlinks it.
 * Returns NULL on any failure (everything is cleaned up via talloc).
 */
static struct aio_open_private_data *create_private_open_data(const files_struct *fsp,
					int flags,
					mode_t mode)
{
	struct aio_open_private_data *opd = talloc_zero(NULL,
					struct aio_open_private_data);
	const char *fname = NULL;

	if (!opd) {
		return NULL;
	}

	opd->jobid = aio_pthread_open_jobid++;
	opd->dir_fd = -1;
	opd->ret_fd = -1;
	opd->ret_errno = EINPROGRESS; /* Marks "not yet completed". */
	opd->flags = flags;
	opd->mode = mode;
	opd->mid = fsp->mid;
	opd->in_progress = true;
	opd->sconn = fsp->conn->sconn;
	opd->initial_allocation_size = fsp->initial_allocation_size;

	/* Copy our current credentials. */
	opd->ux_tok = copy_unix_token(opd, get_current_utok(fsp->conn));
	if (opd->ux_tok == NULL) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/*
	 * Copy the parent directory name and the
	 * relative path within it.
	 */
	if (parent_dirname(opd,
			fsp->fsp_name->base_name,
			&opd->dname,
			&fname) == false) {
		TALLOC_FREE(opd);
		return NULL;
	}
	opd->fname = talloc_strdup(opd, fname);
	if (opd->fname == NULL) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/* Hold the parent directory open so the worker thread can use
	 * openat() and be immune to cwd changes. */
#if defined(O_DIRECTORY)
	opd->dir_fd = open(opd->dname, O_RDONLY|O_DIRECTORY);
#else
	opd->dir_fd = open(opd->dname, O_RDONLY);
#endif
	if (opd->dir_fd == -1) {
		TALLOC_FREE(opd);
		return NULL;
	}

	/* Destructor set only after dir_fd is valid - from here on,
	 * TALLOC_FREE(opd) closes the fd and removes the list entry. */
	talloc_set_destructor(opd, opd_destructor);
	DLIST_ADD_END(open_pd_list, opd, struct aio_open_private_data *);
	return opd;
}
339
340 /*****************************************************************
341  Setup an async open.
342 *****************************************************************/
343
344 static int open_async(const files_struct *fsp,
345                         int flags,
346                         mode_t mode)
347 {
348         struct aio_open_private_data *opd = NULL;
349         int ret;
350
351         if (!init_aio_threadpool(fsp->conn->sconn->ev_ctx,
352                         &open_pool,
353                         aio_open_handle_completion)) {
354                 return -1;
355         }
356
357         opd = create_private_open_data(fsp, flags, mode);
358         if (opd == NULL) {
359                 DEBUG(10, ("open_async: Could not create private data.\n"));
360                 return -1;
361         }
362
363         ret = pthreadpool_add_job(open_pool,
364                                 opd->jobid,
365                                 aio_open_worker,
366                                 (void *)opd);
367         if (ret) {
368                 errno = ret;
369                 return -1;
370         }
371
372         DEBUG(5,("open_async: mid %llu jobid %d created for file %s/%s\n",
373                 (unsigned long long)opd->mid,
374                 opd->jobid,
375                 opd->dname,
376                 opd->fname));
377
378         /* Cause the calling code to reschedule us. */
379         errno = EINTR; /* Maps to NT_STATUS_RETRY. */
380         return -1;
381 }
382
383 /*****************************************************************
384  Look for a matching SMB2 mid. If we find it we're rescheduled,
385  just return the completed open.
386 *****************************************************************/
387
388 static bool find_completed_open(files_struct *fsp,
389                                 int *p_fd,
390                                 int *p_errno)
391 {
392         struct aio_open_private_data *opd;
393
394         opd = find_open_private_data_by_mid(fsp->mid);
395         if (!opd) {
396                 return false;
397         }
398
399         if (opd->in_progress) {
400                 DEBUG(0,("find_completed_open: mid %llu "
401                         "jobid %d still in progress for "
402                         "file %s/%s. PANIC !\n",
403                         (unsigned long long)opd->mid,
404                         opd->jobid,
405                         opd->dname,
406                         opd->fname));
407                 /* Disaster ! This is an open timeout. Just panic. */
408                 smb_panic("find_completed_open - in_progress\n");
409                 /* notreached. */
410                 return false;
411         }
412
413         *p_fd = opd->ret_fd;
414         *p_errno = opd->ret_errno;
415
416         DEBUG(5,("find_completed_open: mid %llu returning "
417                 "fd = %d, errno = %d (%s) "
418                 "jobid (%d) for file %s\n",
419                 (unsigned long long)opd->mid,
420                 opd->ret_fd,
421                 opd->ret_errno,
422                 strerror(opd->ret_errno),
423                 opd->jobid,
424                 smb_fname_str_dbg(fsp->fsp_name)));
425
426         /* Now we can free the opd. */
427         TALLOC_FREE(opd);
428         return true;
429 }
430
431 /*****************************************************************
432  The core open function. Only go async on O_CREAT|O_EXCL
433  opens to prevent any race conditions.
434 *****************************************************************/
435
436 static int aio_pthread_open_fn(vfs_handle_struct *handle,
437                         struct smb_filename *smb_fname,
438                         files_struct *fsp,
439                         int flags,
440                         mode_t mode)
441 {
442         int my_errno = 0;
443         int fd = -1;
444         bool aio_allow_open = lp_parm_bool(
445                 SNUM(handle->conn), "aio_pthread", "aio open", false);
446
447         if (smb_fname->stream_name) {
448                 /* Don't handle stream opens. */
449                 errno = ENOENT;
450                 return -1;
451         }
452
453         if (!aio_allow_open) {
454                 /* aio opens turned off. */
455                 return open(smb_fname->base_name, flags, mode);
456         }
457
458         if (!(flags & O_CREAT)) {
459                 /* Only creates matter. */
460                 return open(smb_fname->base_name, flags, mode);
461         }
462
463         if (!(flags & O_EXCL)) {
464                 /* Only creates with O_EXCL matter. */
465                 return open(smb_fname->base_name, flags, mode);
466         }
467
468         /*
469          * See if this is a reentrant call - i.e. is this a
470          * restart of an existing open that just completed.
471          */
472
473         if (find_completed_open(fsp,
474                                 &fd,
475                                 &my_errno)) {
476                 errno = my_errno;
477                 return fd;
478         }
479
480         /* Ok, it's a create exclusive call - pass it to a thread helper. */
481         return open_async(fsp, flags, mode);
482 }
483 #endif
484
/* Dispatch table: only the open hook is overridden, and only when the
 * platform has openat() and per-thread credentials; otherwise this
 * module is a transparent no-op. */
static struct vfs_fn_pointers vfs_aio_pthread_fns = {
#if defined(HAVE_OPENAT) && defined(USE_LINUX_THREAD_CREDENTIALS)
	.open_fn = aio_pthread_open_fn,
#endif
};
490
/* Module entry point - registers the "aio_pthread" VFS module. */
NTSTATUS vfs_aio_pthread_init(void);
NTSTATUS vfs_aio_pthread_init(void)
{
	return smb_register_vfs(SMB_VFS_INTERFACE_VERSION,
				"aio_pthread", &vfs_aio_pthread_fns);
}