fs/eventfd.c

   1 /*
   2  *  fs/eventfd.c
   3  *
   4  *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
   5  *
   6  */
   7
   8 #include <linux/file.h>
   9 #include <linux/poll.h>
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/sched/signal.h>
  13 #include <linux/kernel.h>
  14 #include <linux/slab.h>
  15 #include <linux/list.h>
  16 #include <linux/spinlock.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/export.h>
  20 #include <linux/kref.h>
  21 #include <linux/eventfd.h>
  22 #include <linux/proc_fs.h>
  23 #include <linux/seq_file.h>
  24
/*
 * Per-eventfd state.  It is stored in file->private_data of the eventfd file
 * and can additionally be referenced by kernel code through
 * eventfd_ctx_fdget() / eventfd_ctx_fileget().
 */
struct eventfd_ctx {
        /* Reference count; eventfd_free() runs when the last one is dropped. */
        struct kref kref;
        /* Wait queue for readers, writers and pollers; wqh.lock guards "count". */
        wait_queue_head_t wqh;
        /*
         * Every time a write(2) is performed on an eventfd, the value of
         * the __u64 being written is added to "count" and a wakeup is
         * performed on "wqh". A read(2) returns the "count" value to
         * userspace and resets "count" to zero; with EFD_SEMAPHORE set it
         * returns 1 and decrements "count" by one instead. The kernel-side
         * eventfd_signal() also adds to "count" and issues a wakeup.
         */
        __u64 count;
        /* EFD_* flags supplied when the eventfd was created. */
        unsigned int flags;
};
  39
  40 /**
  41  * eventfd_signal - Adds @n to the eventfd counter.
  42  * @ctx: [in] Pointer to the eventfd context.
  43  * @n: [in] Value of the counter to be added to the eventfd internal counter.
  44  *          The value cannot be negative.
  45  *
  46  * This function is supposed to be called by the kernel in paths that do not
  47  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
  48  * value, and we signal this as overflow condition by returning a EPOLLERR
  49  * to poll(2).
  50  *
  51  * Returns the amount by which the counter was incremented.  This will be less
  52  * than @n if the counter has overflowed.
  53  */
  54 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
  55 {
  56         unsigned long flags;
  57
  58         spin_lock_irqsave(&ctx->wqh.lock, flags);
  59         if (ULLONG_MAX - ctx->count < n)
  60                 n = ULLONG_MAX - ctx->count;
  61         ctx->count += n;
  62         if (waitqueue_active(&ctx->wqh))
  63                 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
  64         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  65
  66         return n;
  67 }
  68 EXPORT_SYMBOL_GPL(eventfd_signal);
  69
/*
 * Free the memory backing @ctx.  Used by the kref release callback and by
 * do_eventfd() when creating the file descriptor fails.
 */
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        kfree(ctx);
}
  74
  75 static void eventfd_free(struct kref *kref)
  76 {
  77         struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
  78
  79         eventfd_free_ctx(ctx);
  80 }
  81
/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 *
 * When the reference being dropped is the last one, the context is freed
 * through eventfd_free() and @ctx must not be used afterwards.
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);
  94
  95 static int eventfd_release(struct inode *inode, struct file *file)
  96 {
  97         struct eventfd_ctx *ctx = file->private_data;
  98
  99         wake_up_poll(&ctx->wqh, EPOLLHUP);
 100         eventfd_ctx_put(ctx);
 101         return 0;
 102 }
 103
 104 static struct wait_queue_head *
 105 eventfd_get_poll_head(struct file *file, __poll_t events)
 106 {
 107         struct eventfd_ctx *ctx = file->private_data;
 108
 109         return &ctx->wqh;
 110 }
 111
/*
 * ->poll_mask handler: reports which of the events in @eventmask are
 * currently ready on the eventfd attached to @file.
 *
 *   EPOLLIN  - the counter is non-zero, a read would not block.
 *   EPOLLOUT - the counter is below ULLONG_MAX - 1, so a write of at least 1
 *              would not block.
 *   EPOLLERR - the counter reached ULLONG_MAX (only possible through
 *              eventfd_signal()); reported regardless of @eventmask.
 */
static __poll_t eventfd_poll_mask(struct file *file, __poll_t eventmask)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t events = 0;
        u64 count;

        /*
         * All writes to ctx->count occur within ctx->wqh.lock.  This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it!  add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read be ordered properly
         * against the writes.  The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * NOTE(review): the diagrams name eventfd_poll and poll_wait, neither
         * of which appears in this file; presumably the caller of ->poll_mask
         * queues itself on the head returned by eventfd_get_poll_head()
         * before calling here - confirm the ordering argument still holds.
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= (EPOLLIN & eventmask);
        if (count == ULLONG_MAX)
                events |= EPOLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= (EPOLLOUT & eventmask);

        return events;
}
 167
 168 static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
 169 {
 170         *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
 171         ctx->count -= *cnt;
 172 }
 173
 174 /**
 175  * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
 176  * @ctx: [in] Pointer to eventfd context.
 177  * @wait: [in] Wait queue to be removed.
 178  * @cnt: [out] Pointer to the 64-bit counter value.
 179  *
 180  * Returns %0 if successful, or the following error codes:
 181  *
 182  * -EAGAIN      : The operation would have blocked.
 183  *
 184  * This is used to atomically remove a wait queue entry from the eventfd wait
 185  * queue head, and read/reset the counter value.
 186  */
 187 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 188                                   __u64 *cnt)
 189 {
 190         unsigned long flags;
 191
 192         spin_lock_irqsave(&ctx->wqh.lock, flags);
 193         eventfd_ctx_do_read(ctx, cnt);
 194         __remove_wait_queue(&ctx->wqh, wait);
 195         if (*cnt != 0 && waitqueue_active(&ctx->wqh))
 196                 wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
 197         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 198
 199         return *cnt != 0 ? 0 : -EAGAIN;
 200 }
 201 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 202
/*
 * read(2) handler for eventfd files.
 *
 * Copies the value consumed from the counter (see eventfd_ctx_do_read()) to
 * @buf as a __u64.  If the counter is zero, the call either fails with
 * -EAGAIN (O_NONBLOCK) or sleeps interruptibly until the counter becomes
 * non-zero or a signal is pending.
 *
 * Returns sizeof(__u64) on success, or:
 *   -EINVAL      : @count is smaller than sizeof(__u64).
 *   -EAGAIN      : counter is zero and the file is non-blocking.
 *   -ERESTARTSYS : a signal became pending while waiting.
 *   -EFAULT      : the value could not be copied to @buf.
 */
static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        /* The user buffer must be able to hold the full 64-bit value. */
        if (count < sizeof(ucnt))
                return -EINVAL;

        spin_lock_irq(&ctx->wqh.lock);
        /* Default outcome: nothing to read and not allowed to block. */
        res = -EAGAIN;
        if (ctx->count > 0)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        /*
                         * The task state is set before the counter is
                         * re-checked, and the lock is only dropped around
                         * schedule().
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                /* Consume the counter and let waiters know there is room. */
                eventfd_ctx_do_read(ctx, &ucnt);
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        /*
         * The copy to userspace happens after the lock is dropped; if it
         * faults, the value has already been consumed from the counter.
         */
        if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
                return -EFAULT;

        return res;
}
 249
/*
 * write(2) handler for eventfd files.
 *
 * Reads a __u64 from @buf and adds it to the counter.  The addition is only
 * performed when it keeps the counter strictly below ULLONG_MAX; otherwise
 * the call either fails with -EAGAIN (O_NONBLOCK) or sleeps interruptibly
 * until there is enough room or a signal is pending.
 *
 * Returns sizeof(__u64) on success, or:
 *   -EINVAL      : @count is smaller than sizeof(__u64), or the value
 *                  written is ULLONG_MAX.
 *   -EFAULT      : the value could not be copied from @buf.
 *   -EAGAIN      : not enough room and the file is non-blocking.
 *   -ERESTARTSYS : a signal became pending while waiting.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        /* ULLONG_MAX could never satisfy the room check below. */
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        /* Default outcome: no room and not allowed to block. */
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                /* res = 0 is a placeholder; every exit from the loop sets res. */
                for (res = 0;;) {
                        /*
                         * The task state is set before the room check is
                         * repeated, and the lock is only dropped around
                         * schedule().
                         */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                /* Apply the increment and notify waiters that data is available. */
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
 296
 297 #ifdef CONFIG_PROC_FS
 298 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 299 {
 300         struct eventfd_ctx *ctx = f->private_data;
 301
 302         spin_lock_irq(&ctx->wqh.lock);
 303         seq_printf(m, "eventfd-count: %16llx\n",
 304                    (unsigned long long)ctx->count);
 305         spin_unlock_irq(&ctx->wqh.lock);
 306 }
 307 #endif
 308
 309 static const struct file_operations eventfd_fops = {
 310 #ifdef CONFIG_PROC_FS
 311         .show_fdinfo    = eventfd_show_fdinfo,
 312 #endif
 313         .release        = eventfd_release,
 314         .get_poll_head  = eventfd_get_poll_head,
 315         .poll_mask      = eventfd_poll_mask,
 316         .read           = eventfd_read,
 317         .write          = eventfd_write,
 318         .llseek         = noop_llseek,
 319 };
 320
 321 /**
 322  * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 323  * @fd: [in] Eventfd file descriptor.
 324  *
 325  * Returns a pointer to the eventfd file structure in case of success, or the
 326  * following error pointer:
 327  *
 328  * -EBADF    : Invalid @fd file descriptor.
 329  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 330  */
 331 struct file *eventfd_fget(int fd)
 332 {
 333         struct file *file;
 334
 335         file = fget(fd);
 336         if (!file)
 337                 return ERR_PTR(-EBADF);
 338         if (file->f_op != &eventfd_fops) {
 339                 fput(file);
 340                 return ERR_PTR(-EINVAL);
 341         }
 342
 343         return file;
 344 }
 345 EXPORT_SYMBOL_GPL(eventfd_fget);
 346
 347 /**
 348  * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 349  * @fd: [in] Eventfd file descriptor.
 350  *
 351  * Returns a pointer to the internal eventfd context, otherwise the error
 352  * pointers returned by the following functions:
 353  *
 354  * eventfd_fget
 355  */
 356 struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 357 {
 358         struct eventfd_ctx *ctx;
 359         struct fd f = fdget(fd);
 360         if (!f.file)
 361                 return ERR_PTR(-EBADF);
 362         ctx = eventfd_ctx_fileget(f.file);
 363         fdput(f);
 364         return ctx;
 365 }
 366 EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
 367
 368 /**
 369  * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 370  * @file: [in] Eventfd file pointer.
 371  *
 372  * Returns a pointer to the internal eventfd context, otherwise the error
 373  * pointer:
 374  *
 375  * -EINVAL   : The @fd file descriptor is not an eventfd file.
 376  */
 377 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 378 {
 379         struct eventfd_ctx *ctx;
 380
 381         if (file->f_op != &eventfd_fops)
 382                 return ERR_PTR(-EINVAL);
 383
 384         ctx = file->private_data;
 385         kref_get(&ctx->kref);
 386         return ctx;
 387 }
 388 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 389
 390 static int do_eventfd(unsigned int count, int flags)
 391 {
 392         struct eventfd_ctx *ctx;
 393         int fd;
 394
 395         /* Check the EFD_* constants for consistency.  */
 396         BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 397         BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 398
 399         if (flags & ~EFD_FLAGS_SET)
 400                 return -EINVAL;
 401
 402         ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 403         if (!ctx)
 404                 return -ENOMEM;
 405
 406         kref_init(&ctx->kref);
 407         init_waitqueue_head(&ctx->wqh);
 408         ctx->count = count;
 409         ctx->flags = flags;
 410
 411         fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
 412                               O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
 413         if (fd < 0)
 414                 eventfd_free_ctx(ctx);
 415
 416         return fd;
 417 }
 418
/*
 * eventfd2(2): create an eventfd with initial counter @count and the EFD_*
 * flags given in @flags.  See do_eventfd() for the return values.
 */
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}
 423
/*
 * eventfd(2): create an eventfd with initial counter @count and no flags.
 * See do_eventfd() for the return values.
 */
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
 428