io_uring/io-wq: don't use static creds/mm assignments
authorJens Axboe <axboe@kernel.dk>
Mon, 27 Jan 2020 23:34:48 +0000 (16:34 -0700)
committerJens Axboe <axboe@kernel.dk>
Wed, 29 Jan 2020 00:44:20 +0000 (17:44 -0700)
We currently setup the io_wq with a static set of mm and creds. Even for
a single-use io-wq per io_uring, this is suboptimal as we may have
multiple enters of the ring. For sharing the io-wq backend, it doesn't
work at all.

Switch to passing in the creds and mm when the work item is setup. This
means that async work is no longer deferred to the io_uring mm and creds,
it is done with the current mm and creds.

Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know
they can rely on the current personality (mm and creds) being the same
for direct issue and async issue.

Reviewed-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
fs/io-wq.c
fs/io-wq.h
fs/io_uring.c
include/uapi/linux/io_uring.h

index 54e270ae12ab7bd18539edac8ce8e887c1798815..7ccedc82a70348d4439a7a0c945efbe75159b0c5 100644 (file)
@@ -56,7 +56,8 @@ struct io_worker {
 
        struct rcu_head rcu;
        struct mm_struct *mm;
-       const struct cred *creds;
+       const struct cred *cur_creds;
+       const struct cred *saved_creds;
        struct files_struct *restore_files;
 };
 
@@ -109,8 +110,6 @@ struct io_wq {
 
        struct task_struct *manager;
        struct user_struct *user;
-       const struct cred *creds;
-       struct mm_struct *mm;
        refcount_t refs;
        struct completion done;
 
@@ -137,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 {
        bool dropped_lock = false;
 
-       if (worker->creds) {
-               revert_creds(worker->creds);
-               worker->creds = NULL;
+       if (worker->saved_creds) {
+               revert_creds(worker->saved_creds);
+               worker->cur_creds = worker->saved_creds = NULL;
        }
 
        if (current->files != worker->restore_files) {
@@ -398,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
        return NULL;
 }
 
+static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+{
+       if (worker->mm) {
+               unuse_mm(worker->mm);
+               mmput(worker->mm);
+               worker->mm = NULL;
+       }
+       if (!work->mm) {
+               set_fs(KERNEL_DS);
+               return;
+       }
+       if (mmget_not_zero(work->mm)) {
+               use_mm(work->mm);
+               if (!worker->mm)
+                       set_fs(USER_DS);
+               worker->mm = work->mm;
+               /* hang on to this mm */
+               work->mm = NULL;
+               return;
+       }
+
+       /* failed grabbing mm, ensure work gets cancelled */
+       work->flags |= IO_WQ_WORK_CANCEL;
+}
+
+static void io_wq_switch_creds(struct io_worker *worker,
+                              struct io_wq_work *work)
+{
+       const struct cred *old_creds = override_creds(work->creds);
+
+       worker->cur_creds = work->creds;
+       if (worker->saved_creds)
+               put_cred(old_creds); /* creds set by previous switch */
+       else
+               worker->saved_creds = old_creds;
+}
+
 static void io_worker_handle_work(struct io_worker *worker)
        __releases(wqe->lock)
 {
@@ -446,18 +482,10 @@ next:
                        current->files = work->files;
                        task_unlock(current);
                }
-               if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
-                   wq->mm) {
-                       if (mmget_not_zero(wq->mm)) {
-                               use_mm(wq->mm);
-                               set_fs(USER_DS);
-                               worker->mm = wq->mm;
-                       } else {
-                               work->flags |= IO_WQ_WORK_CANCEL;
-                       }
-               }
-               if (!worker->creds)
-                       worker->creds = override_creds(wq->creds);
+               if (work->mm != worker->mm)
+                       io_wq_switch_mm(worker, work);
+               if (worker->cur_creds != work->creds)
+                       io_wq_switch_creds(worker, work);
                /*
                 * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
                 * the worker function will do the right thing.
@@ -1037,7 +1065,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
        /* caller must already hold a reference to this */
        wq->user = data->user;
-       wq->creds = data->creds;
 
        for_each_node(node) {
                struct io_wqe *wqe;
@@ -1064,9 +1091,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
        init_completion(&wq->done);
 
-       /* caller must have already done mmgrab() on this mm */
-       wq->mm = data->mm;
-
        wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
        if (!IS_ERR(wq->manager)) {
                wake_up_process(wq->manager);
index 1cd039af88134aa3163ebfa0320a05ab43543db7..167316ad447ed5b41583840db1653e3a74e4a358 100644 (file)
@@ -7,7 +7,6 @@ enum {
        IO_WQ_WORK_CANCEL       = 1,
        IO_WQ_WORK_HAS_MM       = 2,
        IO_WQ_WORK_HASHED       = 4,
-       IO_WQ_WORK_NEEDS_USER   = 8,
        IO_WQ_WORK_NEEDS_FILES  = 16,
        IO_WQ_WORK_UNBOUND      = 32,
        IO_WQ_WORK_INTERNAL     = 64,
@@ -74,6 +73,8 @@ struct io_wq_work {
        };
        void (*func)(struct io_wq_work **);
        struct files_struct *files;
+       struct mm_struct *mm;
+       const struct cred *creds;
        unsigned flags;
 };
 
@@ -83,15 +84,15 @@ struct io_wq_work {
                (work)->func = _func;                   \
                (work)->flags = 0;                      \
                (work)->files = NULL;                   \
+               (work)->mm = NULL;                      \
+               (work)->creds = NULL;                   \
        } while (0)                                     \
 
 typedef void (get_work_fn)(struct io_wq_work *);
 typedef void (put_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
-       struct mm_struct *mm;
        struct user_struct *user;
-       const struct cred *creds;
 
        get_work_fn *get_work;
        put_work_fn *put_work;
index 1dd20305c6644b292c76b612983fae7527b46ce2..0ea36911745ddb378cddcc1866c99b2fa8325854 100644 (file)
@@ -875,6 +875,29 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
        }
 }
 
+static inline void io_req_work_grab_env(struct io_kiocb *req,
+                                       const struct io_op_def *def)
+{
+       if (!req->work.mm && def->needs_mm) {
+               mmgrab(current->mm);
+               req->work.mm = current->mm;
+       }
+       if (!req->work.creds)
+               req->work.creds = get_current_cred();
+}
+
+static inline void io_req_work_drop_env(struct io_kiocb *req)
+{
+       if (req->work.mm) {
+               mmdrop(req->work.mm);
+               req->work.mm = NULL;
+       }
+       if (req->work.creds) {
+               put_cred(req->work.creds);
+               req->work.creds = NULL;
+       }
+}
+
 static inline bool io_prep_async_work(struct io_kiocb *req,
                                      struct io_kiocb **link)
 {
@@ -888,8 +911,8 @@ static inline bool io_prep_async_work(struct io_kiocb *req,
                if (def->unbound_nonreg_file)
                        req->work.flags |= IO_WQ_WORK_UNBOUND;
        }
-       if (def->needs_mm)
-               req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+
+       io_req_work_grab_env(req, def);
 
        *link = io_prep_linked_timeout(req);
        return do_hashed;
@@ -1180,6 +1203,8 @@ static void __io_req_aux_free(struct io_kiocb *req)
                else
                        fput(req->file);
        }
+
+       io_req_work_drop_env(req);
 }
 
 static void __io_free_req(struct io_kiocb *req)
@@ -3963,6 +3988,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
 {
        ssize_t ret = 0;
 
+       io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+
        switch (req->opcode) {
        case IORING_OP_NOP:
                break;
@@ -5725,9 +5752,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
                goto err;
        }
 
-       data.mm = ctx->sqo_mm;
        data.user = ctx->user;
-       data.creds = ctx->creds;
        data.get_work = io_get_work;
        data.put_work = io_put_work;
 
@@ -6535,7 +6560,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
                goto err;
 
        p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
-                       IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS;
+                       IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
+                       IORING_FEAT_CUR_PERSONALITY;
        trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
        return ret;
 err:
index 57d05cc5e27159cb9ff93ee204907deca9b262d5..9988e82f858b80589b1bf0eb903b70ab7fe7d8ee 100644 (file)
@@ -195,6 +195,7 @@ struct io_uring_params {
 #define IORING_FEAT_NODROP             (1U << 1)
 #define IORING_FEAT_SUBMIT_STABLE      (1U << 2)
 #define IORING_FEAT_RW_CUR_POS         (1U << 3)
+#define IORING_FEAT_CUR_PERSONALITY    (1U << 4)
 
 /*
  * io_uring_register(2) opcodes and arguments