io_uring/kbuf.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 #include <linux/kernel.h>
   3 #include <linux/errno.h>
   4 #include <linux/fs.h>
   5 #include <linux/file.h>
   6 #include <linux/mm.h>
   7 #include <linux/slab.h>
   8 #include <linux/namei.h>
   9 #include <linux/poll.h>
  10 #include <linux/io_uring.h>
  11
  12 #include <uapi/linux/io_uring.h>
  13
  14 #include "io_uring.h"
  15 #include "opdef.h"
  16 #include "kbuf.h"
  17
/* Number of struct io_uring_buf entries that fit in a single page */
#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))

/*
 * Buffer group IDs below this value are looked up by direct index into the
 * ctx->io_bl array; every other group ID goes through the ctx->io_bl_xa
 * xarray.
 */
#define BGID_ARRAY      64

/* BIDs are addressed by a 16-bit field in a CQE */
#define MAX_BIDS_PER_BGID (1 << 16)

/* Slab cache that the struct io_buffer entries handled in this file come from */
struct kmem_cache *io_buf_cachep;
  26
/*
 * Per-request command data for the provide/remove buffers opcodes. It is
 * filled in by io_provide_buffers_prep() and io_remove_buffers_prep().
 */
struct io_provide_buf {
        struct file                     *file;
        /* user address of the first buffer (from sqe->addr) */
        __u64                           addr;
        /* length of each buffer (from sqe->len) */
        __u32                           len;
        /* buffer group ID (from sqe->buf_group) */
        __u32                           bgid;
        /* number of buffers to add or remove (from sqe->fd) */
        __u32                           nbufs;
        /* buffer ID given to the first buffer (from sqe->off) */
        __u16                           bid;
};
  35
/*
 * Tracks one memory region allocated for a kernel-allocated (mmap'ed)
 * provided buffer ring. Entries stay on ctx->io_buf_list until
 * io_kbuf_mmap_list_free() releases them, and an entry that is no longer in
 * use may be picked up again by io_lookup_buf_free_entry().
 */
struct io_buf_free {
        /* link in ctx->io_buf_list */
        struct hlist_node               list;
        /* region obtained from io_mem_alloc() */
        void                            *mem;
        /* size of ->mem in bytes */
        size_t                          size;
        /* non-zero while a buffer ring is backed by ->mem */
        int                             inuse;
};
  42
  43 static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
  44                                                    struct io_buffer_list *bl,
  45                                                    unsigned int bgid)
  46 {
  47         if (bl && bgid < BGID_ARRAY)
  48                 return &bl[bgid];
  49
  50         return xa_load(&ctx->io_bl_xa, bgid);
  51 }
  52
  53 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
  54                                                         unsigned int bgid)
  55 {
  56         lockdep_assert_held(&ctx->uring_lock);
  57
  58         return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
  59 }
  60
  61 static int io_buffer_add_list(struct io_ring_ctx *ctx,
  62                               struct io_buffer_list *bl, unsigned int bgid)
  63 {
  64         /*
  65          * Store buffer group ID and finally mark the list as visible.
  66          * The normal lookup doesn't care about the visibility as we're
  67          * always under the ->uring_lock, but the RCU lookup from mmap does.
  68          */
  69         bl->bgid = bgid;
  70         smp_store_release(&bl->is_ready, 1);
  71
  72         if (bgid < BGID_ARRAY)
  73                 return 0;
  74
  75         return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
  76 }
  77
  78 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
  79 {
  80         struct io_ring_ctx *ctx = req->ctx;
  81         struct io_buffer_list *bl;
  82         struct io_buffer *buf;
  83
  84         /*
  85          * For legacy provided buffer mode, don't recycle if we already did
  86          * IO to this buffer. For ring-mapped provided buffer mode, we should
  87          * increment ring->head to explicitly monopolize the buffer to avoid
  88          * multiple use.
  89          */
  90         if (req->flags & REQ_F_PARTIAL_IO)
  91                 return false;
  92
  93         io_ring_submit_lock(ctx, issue_flags);
  94
  95         buf = req->kbuf;
  96         bl = io_buffer_get_list(ctx, buf->bgid);
  97         list_add(&buf->list, &bl->buf_list);
  98         req->flags &= ~REQ_F_BUFFER_SELECTED;
  99         req->buf_index = buf->bgid;
 100
 101         io_ring_submit_unlock(ctx, issue_flags);
 102         return true;
 103 }
 104
 105 unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
 106 {
 107         unsigned int cflags;
 108
 109         /*
 110          * We can add this buffer back to two lists:
 111          *
 112          * 1) The io_buffers_cache list. This one is protected by the
 113          *    ctx->uring_lock. If we already hold this lock, add back to this
 114          *    list as we can grab it from issue as well.
 115          * 2) The io_buffers_comp list. This one is protected by the
 116          *    ctx->completion_lock.
 117          *
 118          * We migrate buffers from the comp_list to the issue cache list
 119          * when we need one.
 120          */
 121         if (req->flags & REQ_F_BUFFER_RING) {
 122                 /* no buffers to recycle for this case */
 123                 cflags = __io_put_kbuf_list(req, NULL);
 124         } else if (issue_flags & IO_URING_F_UNLOCKED) {
 125                 struct io_ring_ctx *ctx = req->ctx;
 126
 127                 spin_lock(&ctx->completion_lock);
 128                 cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
 129                 spin_unlock(&ctx->completion_lock);
 130         } else {
 131                 lockdep_assert_held(&req->ctx->uring_lock);
 132
 133                 cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
 134         }
 135         return cflags;
 136 }
 137
 138 static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 139                                               struct io_buffer_list *bl)
 140 {
 141         if (!list_empty(&bl->buf_list)) {
 142                 struct io_buffer *kbuf;
 143
 144                 kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
 145                 list_del(&kbuf->list);
 146                 if (*len == 0 || *len > kbuf->len)
 147                         *len = kbuf->len;
 148                 req->flags |= REQ_F_BUFFER_SELECTED;
 149                 req->kbuf = kbuf;
 150                 req->buf_index = kbuf->bid;
 151                 return u64_to_user_ptr(kbuf->addr);
 152         }
 153         return NULL;
 154 }
 155
 156 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 157                                           struct io_buffer_list *bl,
 158                                           unsigned int issue_flags)
 159 {
 160         struct io_uring_buf_ring *br = bl->buf_ring;
 161         struct io_uring_buf *buf;
 162         __u16 head = bl->head;
 163
 164         if (unlikely(smp_load_acquire(&br->tail) == head))
 165                 return NULL;
 166
 167         head &= bl->mask;
 168         /* mmaped buffers are always contig */
 169         if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
 170                 buf = &br->bufs[head];
 171         } else {
 172                 int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
 173                 int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
 174                 buf = page_address(bl->buf_pages[index]);
 175                 buf += off;
 176         }
 177         if (*len == 0 || *len > buf->len)
 178                 *len = buf->len;
 179         req->flags |= REQ_F_BUFFER_RING;
 180         req->buf_list = bl;
 181         req->buf_index = buf->bid;
 182
 183         if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
 184                 /*
 185                  * If we came in unlocked, we have no choice but to consume the
 186                  * buffer here, otherwise nothing ensures that the buffer won't
 187                  * get used by others. This does mean it'll be pinned until the
 188                  * IO completes, coming in unlocked means we're being called from
 189                  * io-wq context and there may be further retries in async hybrid
 190                  * mode. For the locked case, the caller must call commit when
 191                  * the transfer completes (or if we get -EAGAIN and must poll of
 192                  * retry).
 193                  */
 194                 req->buf_list = NULL;
 195                 bl->head++;
 196         }
 197         return u64_to_user_ptr(buf->addr);
 198 }
 199
 200 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 201                               unsigned int issue_flags)
 202 {
 203         struct io_ring_ctx *ctx = req->ctx;
 204         struct io_buffer_list *bl;
 205         void __user *ret = NULL;
 206
 207         io_ring_submit_lock(req->ctx, issue_flags);
 208
 209         bl = io_buffer_get_list(ctx, req->buf_index);
 210         if (likely(bl)) {
 211                 if (bl->is_mapped)
 212                         ret = io_ring_buffer_select(req, len, bl, issue_flags);
 213                 else
 214                         ret = io_provided_buffer_select(req, len, bl);
 215         }
 216         io_ring_submit_unlock(req->ctx, issue_flags);
 217         return ret;
 218 }
 219
 220 static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
 221 {
 222         struct io_buffer_list *bl;
 223         int i;
 224
 225         bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
 226         if (!bl)
 227                 return -ENOMEM;
 228
 229         for (i = 0; i < BGID_ARRAY; i++) {
 230                 INIT_LIST_HEAD(&bl[i].buf_list);
 231                 bl[i].bgid = i;
 232         }
 233
 234         smp_store_release(&ctx->io_bl, bl);
 235         return 0;
 236 }
 237
 238 /*
 239  * Mark the given mapped range as free for reuse
 240  */
 241 static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
 242 {
 243         struct io_buf_free *ibf;
 244
 245         hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
 246                 if (bl->buf_ring == ibf->mem) {
 247                         ibf->inuse = 0;
 248                         return;
 249                 }
 250         }
 251
 252         /* can't happen... */
 253         WARN_ON_ONCE(1);
 254 }
 255
 256 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 257                                struct io_buffer_list *bl, unsigned nbufs)
 258 {
 259         unsigned i = 0;
 260
 261         /* shouldn't happen */
 262         if (!nbufs)
 263                 return 0;
 264
 265         if (bl->is_mapped) {
 266                 i = bl->buf_ring->tail - bl->head;
 267                 if (bl->is_mmap) {
 268                         /*
 269                          * io_kbuf_list_free() will free the page(s) at
 270                          * ->release() time.
 271                          */
 272                         io_kbuf_mark_free(ctx, bl);
 273                         bl->buf_ring = NULL;
 274                         bl->is_mmap = 0;
 275                 } else if (bl->buf_nr_pages) {
 276                         int j;
 277
 278                         for (j = 0; j < bl->buf_nr_pages; j++)
 279                                 unpin_user_page(bl->buf_pages[j]);
 280                         kvfree(bl->buf_pages);
 281                         bl->buf_pages = NULL;
 282                         bl->buf_nr_pages = 0;
 283                 }
 284                 /* make sure it's seen as empty */
 285                 INIT_LIST_HEAD(&bl->buf_list);
 286                 bl->is_mapped = 0;
 287                 return i;
 288         }
 289
 290         /* protects io_buffers_cache */
 291         lockdep_assert_held(&ctx->uring_lock);
 292
 293         while (!list_empty(&bl->buf_list)) {
 294                 struct io_buffer *nxt;
 295
 296                 nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
 297                 list_move(&nxt->list, &ctx->io_buffers_cache);
 298                 if (++i == nbufs)
 299                         return i;
 300                 cond_resched();
 301         }
 302
 303         return i;
 304 }
 305
 306 void io_destroy_buffers(struct io_ring_ctx *ctx)
 307 {
 308         struct io_buffer_list *bl;
 309         struct list_head *item, *tmp;
 310         struct io_buffer *buf;
 311         unsigned long index;
 312         int i;
 313
 314         for (i = 0; i < BGID_ARRAY; i++) {
 315                 if (!ctx->io_bl)
 316                         break;
 317                 __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
 318         }
 319
 320         xa_for_each(&ctx->io_bl_xa, index, bl) {
 321                 xa_erase(&ctx->io_bl_xa, bl->bgid);
 322                 __io_remove_buffers(ctx, bl, -1U);
 323                 kfree_rcu(bl, rcu);
 324         }
 325
 326         /*
 327          * Move deferred locked entries to cache before pruning
 328          */
 329         spin_lock(&ctx->completion_lock);
 330         if (!list_empty(&ctx->io_buffers_comp))
 331                 list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
 332         spin_unlock(&ctx->completion_lock);
 333
 334         list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
 335                 buf = list_entry(item, struct io_buffer, list);
 336                 kmem_cache_free(io_buf_cachep, buf);
 337         }
 338 }
 339
 340 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 341 {
 342         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 343         u64 tmp;
 344
 345         if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
 346             sqe->splice_fd_in)
 347                 return -EINVAL;
 348
 349         tmp = READ_ONCE(sqe->fd);
 350         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 351                 return -EINVAL;
 352
 353         memset(p, 0, sizeof(*p));
 354         p->nbufs = tmp;
 355         p->bgid = READ_ONCE(sqe->buf_group);
 356         return 0;
 357 }
 358
 359 int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 360 {
 361         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 362         struct io_ring_ctx *ctx = req->ctx;
 363         struct io_buffer_list *bl;
 364         int ret = 0;
 365
 366         io_ring_submit_lock(ctx, issue_flags);
 367
 368         ret = -ENOENT;
 369         bl = io_buffer_get_list(ctx, p->bgid);
 370         if (bl) {
 371                 ret = -EINVAL;
 372                 /* can't use provide/remove buffers command on mapped buffers */
 373                 if (!bl->is_mapped)
 374                         ret = __io_remove_buffers(ctx, bl, p->nbufs);
 375         }
 376         io_ring_submit_unlock(ctx, issue_flags);
 377         if (ret < 0)
 378                 req_set_fail(req);
 379         io_req_set_res(req, ret, 0);
 380         return IOU_OK;
 381 }
 382
 383 int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 384 {
 385         unsigned long size, tmp_check;
 386         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 387         u64 tmp;
 388
 389         if (sqe->rw_flags || sqe->splice_fd_in)
 390                 return -EINVAL;
 391
 392         tmp = READ_ONCE(sqe->fd);
 393         if (!tmp || tmp > MAX_BIDS_PER_BGID)
 394                 return -E2BIG;
 395         p->nbufs = tmp;
 396         p->addr = READ_ONCE(sqe->addr);
 397         p->len = READ_ONCE(sqe->len);
 398
 399         if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 400                                 &size))
 401                 return -EOVERFLOW;
 402         if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
 403                 return -EOVERFLOW;
 404
 405         size = (unsigned long)p->len * p->nbufs;
 406         if (!access_ok(u64_to_user_ptr(p->addr), size))
 407                 return -EFAULT;
 408
 409         p->bgid = READ_ONCE(sqe->buf_group);
 410         tmp = READ_ONCE(sqe->off);
 411         if (tmp > USHRT_MAX)
 412                 return -E2BIG;
 413         if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
 414                 return -EINVAL;
 415         p->bid = tmp;
 416         return 0;
 417 }
 418
 419 #define IO_BUFFER_ALLOC_BATCH 64
 420
 421 static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
 422 {
 423         struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
 424         int allocated;
 425
 426         /*
 427          * Completions that don't happen inline (eg not under uring_lock) will
 428          * add to ->io_buffers_comp. If we don't have any free buffers, check
 429          * the completion list and splice those entries first.
 430          */
 431         if (!list_empty_careful(&ctx->io_buffers_comp)) {
 432                 spin_lock(&ctx->completion_lock);
 433                 if (!list_empty(&ctx->io_buffers_comp)) {
 434                         list_splice_init(&ctx->io_buffers_comp,
 435                                                 &ctx->io_buffers_cache);
 436                         spin_unlock(&ctx->completion_lock);
 437                         return 0;
 438                 }
 439                 spin_unlock(&ctx->completion_lock);
 440         }
 441
 442         /*
 443          * No free buffers and no completion entries either. Allocate a new
 444          * batch of buffer entries and add those to our freelist.
 445          */
 446
 447         allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
 448                                           ARRAY_SIZE(bufs), (void **) bufs);
 449         if (unlikely(!allocated)) {
 450                 /*
 451                  * Bulk alloc is all-or-nothing. If we fail to get a batch,
 452                  * retry single alloc to be on the safe side.
 453                  */
 454                 bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
 455                 if (!bufs[0])
 456                         return -ENOMEM;
 457                 allocated = 1;
 458         }
 459
 460         while (allocated)
 461                 list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
 462
 463         return 0;
 464 }
 465
 466 static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
 467                           struct io_buffer_list *bl)
 468 {
 469         struct io_buffer *buf;
 470         u64 addr = pbuf->addr;
 471         int i, bid = pbuf->bid;
 472
 473         for (i = 0; i < pbuf->nbufs; i++) {
 474                 if (list_empty(&ctx->io_buffers_cache) &&
 475                     io_refill_buffer_cache(ctx))
 476                         break;
 477                 buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
 478                                         list);
 479                 list_move_tail(&buf->list, &bl->buf_list);
 480                 buf->addr = addr;
 481                 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 482                 buf->bid = bid;
 483                 buf->bgid = pbuf->bgid;
 484                 addr += pbuf->len;
 485                 bid++;
 486                 cond_resched();
 487         }
 488
 489         return i ? 0 : -ENOMEM;
 490 }
 491
 492 int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 493 {
 494         struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
 495         struct io_ring_ctx *ctx = req->ctx;
 496         struct io_buffer_list *bl;
 497         int ret = 0;
 498
 499         io_ring_submit_lock(ctx, issue_flags);
 500
 501         if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
 502                 ret = io_init_bl_list(ctx);
 503                 if (ret)
 504                         goto err;
 505         }
 506
 507         bl = io_buffer_get_list(ctx, p->bgid);
 508         if (unlikely(!bl)) {
 509                 bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
 510                 if (!bl) {
 511                         ret = -ENOMEM;
 512                         goto err;
 513                 }
 514                 INIT_LIST_HEAD(&bl->buf_list);
 515                 ret = io_buffer_add_list(ctx, bl, p->bgid);
 516                 if (ret) {
 517                         /*
 518                          * Doesn't need rcu free as it was never visible, but
 519                          * let's keep it consistent throughout. Also can't
 520                          * be a lower indexed array group, as adding one
 521                          * where lookup failed cannot happen.
 522                          */
 523                         if (p->bgid >= BGID_ARRAY)
 524                                 kfree_rcu(bl, rcu);
 525                         else
 526                                 WARN_ON_ONCE(1);
 527                         goto err;
 528                 }
 529         }
 530         /* can't add buffers via this command for a mapped buffer ring */
 531         if (bl->is_mapped) {
 532                 ret = -EINVAL;
 533                 goto err;
 534         }
 535
 536         ret = io_add_buffers(ctx, p, bl);
 537 err:
 538         io_ring_submit_unlock(ctx, issue_flags);
 539
 540         if (ret < 0)
 541                 req_set_fail(req);
 542         io_req_set_res(req, ret, 0);
 543         return IOU_OK;
 544 }
 545
 546 static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 547                             struct io_buffer_list *bl)
 548 {
 549         struct io_uring_buf_ring *br;
 550         struct page **pages;
 551         int i, nr_pages;
 552
 553         pages = io_pin_pages(reg->ring_addr,
 554                              flex_array_size(br, bufs, reg->ring_entries),
 555                              &nr_pages);
 556         if (IS_ERR(pages))
 557                 return PTR_ERR(pages);
 558
 559         /*
 560          * Apparently some 32-bit boxes (ARM) will return highmem pages,
 561          * which then need to be mapped. We could support that, but it'd
 562          * complicate the code and slowdown the common cases quite a bit.
 563          * So just error out, returning -EINVAL just like we did on kernels
 564          * that didn't support mapped buffer rings.
 565          */
 566         for (i = 0; i < nr_pages; i++)
 567                 if (PageHighMem(pages[i]))
 568                         goto error_unpin;
 569
 570         br = page_address(pages[0]);
 571 #ifdef SHM_COLOUR
 572         /*
 573          * On platforms that have specific aliasing requirements, SHM_COLOUR
 574          * is set and we must guarantee that the kernel and user side align
 575          * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
 576          * the application mmap's the provided ring buffer. Fail the request
 577          * if we, by chance, don't end up with aligned addresses. The app
 578          * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
 579          * this transparently.
 580          */
 581         if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
 582                 goto error_unpin;
 583 #endif
 584         bl->buf_pages = pages;
 585         bl->buf_nr_pages = nr_pages;
 586         bl->buf_ring = br;
 587         bl->is_mapped = 1;
 588         bl->is_mmap = 0;
 589         return 0;
 590 error_unpin:
 591         for (i = 0; i < nr_pages; i++)
 592                 unpin_user_page(pages[i]);
 593         kvfree(pages);
 594         return -EINVAL;
 595 }
 596
 597 /*
 598  * See if we have a suitable region that we can reuse, rather than allocate
 599  * both a new io_buf_free and mem region again. We leave it on the list as
 600  * even a reused entry will need freeing at ring release.
 601  */
 602 static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
 603                                                     size_t ring_size)
 604 {
 605         struct io_buf_free *ibf, *best = NULL;
 606         size_t best_dist;
 607
 608         hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
 609                 size_t dist;
 610
 611                 if (ibf->inuse || ibf->size < ring_size)
 612                         continue;
 613                 dist = ibf->size - ring_size;
 614                 if (!best || dist < best_dist) {
 615                         best = ibf;
 616                         if (!dist)
 617                                 break;
 618                         best_dist = dist;
 619                 }
 620         }
 621
 622         return best;
 623 }
 624
 625 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 626                               struct io_uring_buf_reg *reg,
 627                               struct io_buffer_list *bl)
 628 {
 629         struct io_buf_free *ibf;
 630         size_t ring_size;
 631         void *ptr;
 632
 633         ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 634
 635         /* Reuse existing entry, if we can */
 636         ibf = io_lookup_buf_free_entry(ctx, ring_size);
 637         if (!ibf) {
 638                 ptr = io_mem_alloc(ring_size);
 639                 if (IS_ERR(ptr))
 640                         return PTR_ERR(ptr);
 641
 642                 /* Allocate and store deferred free entry */
 643                 ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
 644                 if (!ibf) {
 645                         io_mem_free(ptr);
 646                         return -ENOMEM;
 647                 }
 648                 ibf->mem = ptr;
 649                 ibf->size = ring_size;
 650                 hlist_add_head(&ibf->list, &ctx->io_buf_list);
 651         }
 652         ibf->inuse = 1;
 653         bl->buf_ring = ibf->mem;
 654         bl->is_mapped = 1;
 655         bl->is_mmap = 1;
 656         return 0;
 657 }
 658
 659 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 660 {
 661         struct io_uring_buf_reg reg;
 662         struct io_buffer_list *bl, *free_bl = NULL;
 663         int ret;
 664
 665         lockdep_assert_held(&ctx->uring_lock);
 666
 667         if (copy_from_user(&reg, arg, sizeof(reg)))
 668                 return -EFAULT;
 669
 670         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 671                 return -EINVAL;
 672         if (reg.flags & ~IOU_PBUF_RING_MMAP)
 673                 return -EINVAL;
 674         if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
 675                 if (!reg.ring_addr)
 676                         return -EFAULT;
 677                 if (reg.ring_addr & ~PAGE_MASK)
 678                         return -EINVAL;
 679         } else {
 680                 if (reg.ring_addr)
 681                         return -EINVAL;
 682         }
 683
 684         if (!is_power_of_2(reg.ring_entries))
 685                 return -EINVAL;
 686
 687         /* cannot disambiguate full vs empty due to head/tail size */
 688         if (reg.ring_entries >= 65536)
 689                 return -EINVAL;
 690
 691         if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
 692                 int ret = io_init_bl_list(ctx);
 693                 if (ret)
 694                         return ret;
 695         }
 696
 697         bl = io_buffer_get_list(ctx, reg.bgid);
 698         if (bl) {
 699                 /* if mapped buffer ring OR classic exists, don't allow */
 700                 if (bl->is_mapped || !list_empty(&bl->buf_list))
 701                         return -EEXIST;
 702         } else {
 703                 free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
 704                 if (!bl)
 705                         return -ENOMEM;
 706         }
 707
 708         if (!(reg.flags & IOU_PBUF_RING_MMAP))
 709                 ret = io_pin_pbuf_ring(&reg, bl);
 710         else
 711                 ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 712
 713         if (!ret) {
 714                 bl->nr_entries = reg.ring_entries;
 715                 bl->mask = reg.ring_entries - 1;
 716
 717                 io_buffer_add_list(ctx, bl, reg.bgid);
 718                 return 0;
 719         }
 720
 721         kfree_rcu(free_bl, rcu);
 722         return ret;
 723 }
 724
 725 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 726 {
 727         struct io_uring_buf_reg reg;
 728         struct io_buffer_list *bl;
 729
 730         lockdep_assert_held(&ctx->uring_lock);
 731
 732         if (copy_from_user(&reg, arg, sizeof(reg)))
 733                 return -EFAULT;
 734         if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 735                 return -EINVAL;
 736         if (reg.flags)
 737                 return -EINVAL;
 738
 739         bl = io_buffer_get_list(ctx, reg.bgid);
 740         if (!bl)
 741                 return -ENOENT;
 742         if (!bl->is_mapped)
 743                 return -EINVAL;
 744
 745         __io_remove_buffers(ctx, bl, -1U);
 746         if (bl->bgid >= BGID_ARRAY) {
 747                 xa_erase(&ctx->io_bl_xa, bl->bgid);
 748                 kfree_rcu(bl, rcu);
 749         }
 750         return 0;
 751 }
 752
 753 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 754 {
 755         struct io_uring_buf_status buf_status;
 756         struct io_buffer_list *bl;
 757         int i;
 758
 759         if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
 760                 return -EFAULT;
 761
 762         for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
 763                 if (buf_status.resv[i])
 764                         return -EINVAL;
 765
 766         bl = io_buffer_get_list(ctx, buf_status.buf_group);
 767         if (!bl)
 768                 return -ENOENT;
 769         if (!bl->is_mapped)
 770                 return -EINVAL;
 771
 772         buf_status.head = bl->head;
 773         if (copy_to_user(arg, &buf_status, sizeof(buf_status)))
 774                 return -EFAULT;
 775
 776         return 0;
 777 }
 778
 779 void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 780 {
 781         struct io_buffer_list *bl;
 782
 783         bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
 784
 785         if (!bl || !bl->is_mmap)
 786                 return NULL;
 787         /*
 788          * Ensure the list is fully setup. Only strictly needed for RCU lookup
 789          * via mmap, and in that case only for the array indexed groups. For
 790          * the xarray lookups, it's either visible and ready, or not at all.
 791          */
 792         if (!smp_load_acquire(&bl->is_ready))
 793                 return NULL;
 794
 795         return bl->buf_ring;
 796 }
 797
 798 /*
 799  * Called at or after ->release(), free the mmap'ed buffers that we used
 800  * for memory mapped provided buffer rings.
 801  */
 802 void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
 803 {
 804         struct io_buf_free *ibf;
 805         struct hlist_node *tmp;
 806
 807         hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
 808                 hlist_del(&ibf->list);
 809                 io_mem_free(ibf->mem);
 810                 kfree(ibf);
 811         }
 812 }