Merge tag 'drm-next-2019-01-05' of git://anongit.freedesktop.org/drm/drm
index 8a5a1010886bad1f5c2918565807b5b3aaa988e1..a5d219d920e755aa7761253c87a1da6470f26782 100644 (file)
@@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
  */
 static inline int ep_events_available(struct eventpoll *ep)
 {
-       return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+       return !list_empty_careful(&ep->rdllist) ||
+               READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
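The lockless form of ep_events_available() above only works because every writer of ep->ovflist now uses WRITE_ONCE() and every reader uses READ_ONCE(), and because list_empty_careful() is meant to be safe against a list another CPU may be updating. A rough userspace analogy of that marked-access pattern, using relaxed C11 atomics purely for illustration (these are not the kernel primitives):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for "ep->ovflist != EP_UNACTIVE_PTR": a word written on one side
 * under that side's lock and read locklessly on the other. */
static atomic_int events_pending;

static void *producer(void *arg)
{
        (void)arg;
        /* WRITE_ONCE() analog: a single marked store; readers take no lock. */
        atomic_store_explicit(&events_pending, 1, memory_order_relaxed);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        pthread_join(&t, NULL);

        /* READ_ONCE() analog: the reader only needs a torn-free load here. */
        if (atomic_load_explicit(&events_pending, memory_order_relaxed))
                printf("events available\n");
        return 0;
}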
@@ -471,7 +472,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
  *                  no re-entered.
  *
  * @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
  * @nproc: Nested call core function pointer.
  * @priv: Opaque data to be passed to the @nproc callback.
  * @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +480,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
  * Returns: Returns the code returned by the @nproc callback, or -1 if
  *          the maximum recursion limit has been exceeded.
  */
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
                          int (*nproc)(void *, void *, int), void *priv,
                          void *cookie, void *ctx)
 {
@@ -499,7 +499,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
         */
        list_for_each_entry(tncur, lsthead, llink) {
                if (tncur->ctx == ctx &&
-                   (tncur->cookie == cookie || ++call_nests > max_nests)) {
+                   (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
                        /*
                         * Ops ... loop detected or maximum nest level reached.
                         * We abort this wake by breaking the cycle itself.
@@ -573,7 +573,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 {
        int this_cpu = get_cpu();
 
-       ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+       ep_call_nested(&poll_safewake_ncalls,
                       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
 
        put_cpu();
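With the max_nests argument gone, every call site relies on the same hard-coded EP_MAX_NESTS bound inside ep_call_nested(). A standalone sketch of that kind of fixed recursion-depth guard, with hypothetical names (not the kernel's implementation, which additionally tracks per-context cookies under a lock):

#include <stdio.h>

#define MAX_NESTS 4     /* fixed limit, mirroring EP_MAX_NESTS being hard-coded */

static int call_nested(int (*proc)(void *, int), void *priv, int nests)
{
        if (nests > MAX_NESTS)
                return -1;      /* loop or excessive nesting: refuse the call */
        return proc(priv, nests);
}

static int wakeup_proc(void *priv, int nests)
{
        (void)priv;
        printf("nest level %d ok\n", nests);
        return 0;
}

int main(void)
{
        for (int n = 0; n <= MAX_NESTS + 1; n++)
                if (call_nested(wakeup_proc, NULL, n) < 0)
                        printf("nest level %d rejected\n", n);
        return 0;
}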
@@ -699,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
         */
        spin_lock_irq(&ep->wq.lock);
        list_splice_init(&ep->rdllist, &txlist);
-       ep->ovflist = NULL;
+       WRITE_ONCE(ep->ovflist, NULL);
        spin_unlock_irq(&ep->wq.lock);
 
        /*
@@ -713,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
         * other events might have been queued by the poll callback.
         * We re-insert them inside the main ready-list here.
         */
-       for (nepi = ep->ovflist; (epi = nepi) != NULL;
+       for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
                /*
                 * We need to check if the item is already in the list.
@@ -731,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
         * releasing the lock, events will be queued in the normal way inside
         * ep->rdllist.
         */
-       ep->ovflist = EP_UNACTIVE_PTR;
+       WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
 
        /*
         * Quickly re-inject items left on "txlist".
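The two WRITE_ONCE() stores bracket the scan: ep->ovflist is NULL while the ready list is being processed off-list (so the poll callback diverts new events there), and EP_UNACTIVE_PTR the rest of the time. A single-threaded toy model of that splice/divert/re-inject sequence, with illustrative names only:

#include <stdio.h>

struct item {
        int val;
        struct item *next;
};

/* Sentinel meaning "no scan in progress"; events go to the ready list. */
#define UNACTIVE ((struct item *)-1)

static struct item *ovflist;

/* What the event callback would do if it fires while a scan owns the list. */
static void event_callback(struct item *it)
{
        if (ovflist != UNACTIVE) {      /* scan in flight: divert to overflow */
                it->next = ovflist;
                ovflist = it;
        }
}

int main(void)
{
        struct item a = { .val = 1, .next = NULL };

        ovflist = UNACTIVE;     /* normal state */

        ovflist = NULL;         /* scan starts: overflow chain becomes live */
        event_callback(&a);     /* an event arrives mid-scan */

        for (struct item *it = ovflist; it; it = it->next)
                printf("re-injecting item %d into the ready list\n", it->val);

        ovflist = UNACTIVE;     /* scan done: callbacks queue normally again */
        return 0;
}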
@@ -1154,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
-       if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+       if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
                if (epi->next == EP_UNACTIVE_PTR) {
-                       epi->next = ep->ovflist;
-                       ep->ovflist = epi;
+                       epi->next = READ_ONCE(ep->ovflist);
+                       WRITE_ONCE(ep->ovflist, epi);
                        if (epi->ws) {
                                /*
                                 * Activate ep->ws since epi->ws may get
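The epi->ws and ep->ws wakeup sources touched here back the EPOLLWAKEUP flag: activating ep->ws keeps the system from suspending while the event is parked on ovflist. A minimal userspace sketch of requesting that behaviour (hedged: without CAP_BLOCK_SUSPEND the kernel silently ignores EPOLLWAKEUP, and error handling is omitted):

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create1(0);
        int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
        struct itimerspec its = { .it_value.tv_sec = 1 };
        struct epoll_event ev = { .events = EPOLLIN | EPOLLWAKEUP, .data.fd = tfd };
        struct epoll_event out;

        timerfd_settime(tfd, 0, &its, NULL);
        epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev);

        /* While this event is queued and being handled, epoll holds a wakeup
         * source, preventing autosleep until the event is consumed. */
        if (epoll_wait(epfd, &out, 1, -1) == 1)
                printf("timer fired on fd %d\n", out.data.fd);

        close(tfd);
        close(epfd);
        return 0;
}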
@@ -1333,7 +1333,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
                                }
                        } else {
                                error = ep_call_nested(&poll_loop_ncalls,
-                                                       EP_MAX_NESTS,
                                                        reverse_path_check_proc,
                                                        child_file, child_file,
                                                        current);
@@ -1367,7 +1366,7 @@ static int reverse_path_check(void)
        /* let's call this for all tfiles */
        list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
                path_count_init();
-               error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+               error = ep_call_nested(&poll_loop_ncalls,
                                        reverse_path_check_proc, current_file,
                                        current_file, current);
                if (error)
@@ -1626,21 +1625,24 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
 {
        struct ep_send_events_data *esed = priv;
        __poll_t revents;
-       struct epitem *epi;
-       struct epoll_event __user *uevent;
+       struct epitem *epi, *tmp;
+       struct epoll_event __user *uevent = esed->events;
        struct wakeup_source *ws;
        poll_table pt;
 
        init_poll_funcptr(&pt, NULL);
+       esed->res = 0;
 
        /*
         * We can loop without lock because we are passed a task private list.
         * Items cannot vanish during the loop because ep_scan_ready_list() is
         * holding "mtx" during this call.
         */
-       for (esed->res = 0, uevent = esed->events;
-            !list_empty(head) && esed->res < esed->maxevents;) {
-               epi = list_first_entry(head, struct epitem, rdllink);
+       lockdep_assert_held(&ep->mtx);
+
+       list_for_each_entry_safe(epi, tmp, head, rdllink) {
+               if (esed->res >= esed->maxevents)
+                       break;
 
                /*
                 * Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1662,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
 
                list_del_init(&epi->rdllink);
 
-               revents = ep_item_poll(epi, &pt, 1);
-
                /*
                 * If the event mask intersect the caller-requested one,
                 * deliver the event to userspace. Again, ep_scan_ready_list()
-                * is holding "mtx", so no operations coming from userspace
+                * is holding ep->mtx, so no operations coming from userspace
                 * can change the item.
                 */
-               if (revents) {
-                       if (__put_user(revents, &uevent->events) ||
-                           __put_user(epi->event.data, &uevent->data)) {
-                               list_add(&epi->rdllink, head);
-                               ep_pm_stay_awake(epi);
-                               if (!esed->res)
-                                       esed->res = -EFAULT;
-                               return 0;
-                       }
-                       esed->res++;
-                       uevent++;
-                       if (epi->event.events & EPOLLONESHOT)
-                               epi->event.events &= EP_PRIVATE_BITS;
-                       else if (!(epi->event.events & EPOLLET)) {
-                               /*
-                                * If this file has been added with Level
-                                * Trigger mode, we need to insert back inside
-                                * the ready list, so that the next call to
-                                * epoll_wait() will check again the events
-                                * availability. At this point, no one can insert
-                                * into ep->rdllist besides us. The epoll_ctl()
-                                * callers are locked out by
-                                * ep_scan_ready_list() holding "mtx" and the
-                                * poll callback will queue them in ep->ovflist.
-                                */
-                               list_add_tail(&epi->rdllink, &ep->rdllist);
-                               ep_pm_stay_awake(epi);
-                       }
+               revents = ep_item_poll(epi, &pt, 1);
+               if (!revents)
+                       continue;
+
+               if (__put_user(revents, &uevent->events) ||
+                   __put_user(epi->event.data, &uevent->data)) {
+                       list_add(&epi->rdllink, head);
+                       ep_pm_stay_awake(epi);
+                       if (!esed->res)
+                               esed->res = -EFAULT;
+                       return 0;
+               }
+               esed->res++;
+               uevent++;
+               if (epi->event.events & EPOLLONESHOT)
+                       epi->event.events &= EP_PRIVATE_BITS;
+               else if (!(epi->event.events & EPOLLET)) {
+                       /*
+                        * If this file has been added with Level
+                        * Trigger mode, we need to insert back inside
+                        * the ready list, so that the next call to
+                        * epoll_wait() will check again the events
+                        * availability. At this point, no one can insert
+                        * into ep->rdllist besides us. The epoll_ctl()
+                        * callers are locked out by
+                        * ep_scan_ready_list() holding "mtx" and the
+                        * poll callback will queue them in ep->ovflist.
+                        */
+                       list_add_tail(&epi->rdllink, &ep->rdllist);
+                       ep_pm_stay_awake(epi);
                }
        }
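The tail of the rewritten loop preserves the delivery semantics spelled out in the comment: a level-triggered item goes back on ep->rdllist, while EPOLLONESHOT strips everything but EP_PRIVATE_BITS so the item is disarmed after one report. Those semantics are easy to confirm from userspace; a minimal sketch with a pipe (error handling omitted):

#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int p[2], epfd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN }, out;

        pipe(p);
        write(p[1], "x", 1);
        epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev);

        /* Level-triggered: the unread byte is reported on every call. */
        printf("first wait:  %d\n", epoll_wait(epfd, &out, 1, 0));
        printf("second wait: %d\n", epoll_wait(epfd, &out, 1, 0));  /* still 1 */

        /* EPOLLONESHOT: reported once, then the item is disarmed. */
        ev.events = EPOLLIN | EPOLLONESHOT;
        epoll_ctl(epfd, EPOLL_CTL_MOD, p[0], &ev);
        printf("oneshot 1:   %d\n", epoll_wait(epfd, &out, 1, 0));
        printf("oneshot 2:   %d\n", epoll_wait(epfd, &out, 1, 0));  /* now 0 */

        return 0;
}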
 
@@ -1747,6 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 {
        int res = 0, eavail, timed_out = 0;
        u64 slack = 0;
+       bool waiter = false;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;
 
@@ -1761,11 +1764,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
        } else if (timeout == 0) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
-                * caller specified a non blocking operation.
+                * caller specified a non blocking operation. We still need
+                * the lock, because we could race with an epi being added to
+                * the ready list from the irq callback and thus incorrectly
+                * return 0 back to userspace.
                 */
                timed_out = 1;
+
                spin_lock_irq(&ep->wq.lock);
-               goto check_events;
+               eavail = ep_events_available(ep);
+               spin_unlock_irq(&ep->wq.lock);
+
+               goto send_events;
        }
 
 fetch_events:
@@ -1773,64 +1783,66 @@ fetch_events:
        if (!ep_events_available(ep))
                ep_busy_loop(ep, timed_out);
 
-       spin_lock_irq(&ep->wq.lock);
+       eavail = ep_events_available(ep);
+       if (eavail)
+               goto send_events;
 
-       if (!ep_events_available(ep)) {
-               /*
-                * Busy poll timed out.  Drop NAPI ID for now, we can add
-                * it back in when we have moved a socket with a valid NAPI
-                * ID onto the ready list.
-                */
-               ep_reset_busy_poll_napi_id(ep);
+       /*
+        * Busy poll timed out.  Drop NAPI ID for now, we can add
+        * it back in when we have moved a socket with a valid NAPI
+        * ID onto the ready list.
+        */
+       ep_reset_busy_poll_napi_id(ep);
 
-               /*
-                * We don't have any available event to return to the caller.
-                * We need to sleep here, and we will be wake up by
-                * ep_poll_callback() when events will become available.
-                */
+       /*
+        * We don't have any available event to return to the caller.  We need
+        * to sleep here, and we will be woken by ep_poll_callback() when events
+        * become available.
+        */
+       if (!waiter) {
+               waiter = true;
                init_waitqueue_entry(&wait, current);
-               __add_wait_queue_exclusive(&ep->wq, &wait);
 
-               for (;;) {
-                       /*
-                        * We don't want to sleep if the ep_poll_callback() sends us
-                        * a wakeup in between. That's why we set the task state
-                        * to TASK_INTERRUPTIBLE before doing the checks.
-                        */
-                       set_current_state(TASK_INTERRUPTIBLE);
-                       /*
-                        * Always short-circuit for fatal signals to allow
-                        * threads to make a timely exit without the chance of
-                        * finding more events available and fetching
-                        * repeatedly.
-                        */
-                       if (fatal_signal_pending(current)) {
-                               res = -EINTR;
-                               break;
-                       }
-                       if (ep_events_available(ep) || timed_out)
-                               break;
-                       if (signal_pending(current)) {
-                               res = -EINTR;
-                               break;
-                       }
+               spin_lock_irq(&ep->wq.lock);
+               __add_wait_queue_exclusive(&ep->wq, &wait);
+               spin_unlock_irq(&ep->wq.lock);
+       }
 
-                       spin_unlock_irq(&ep->wq.lock);
-                       if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
-                               timed_out = 1;
+       for (;;) {
+               /*
+                * We don't want to sleep if the ep_poll_callback() sends us
+                * a wakeup in between. That's why we set the task state
+                * to TASK_INTERRUPTIBLE before doing the checks.
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               /*
+                * Always short-circuit for fatal signals to allow
+                * threads to make a timely exit without the chance of
+                * finding more events available and fetching
+                * repeatedly.
+                */
+               if (fatal_signal_pending(current)) {
+                       res = -EINTR;
+                       break;
+               }
 
-                       spin_lock_irq(&ep->wq.lock);
+               eavail = ep_events_available(ep);
+               if (eavail)
+                       break;
+               if (signal_pending(current)) {
+                       res = -EINTR;
+                       break;
                }
 
-               __remove_wait_queue(&ep->wq, &wait);
-               __set_current_state(TASK_RUNNING);
+               if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+                       timed_out = 1;
+                       break;
+               }
        }
-check_events:
-       /* Is it worth to try to dig for events ? */
-       eavail = ep_events_available(ep);
 
-       spin_unlock_irq(&ep->wq.lock);
+       __set_current_state(TASK_RUNNING);
 
+send_events:
        /*
         * Try to transfer events to user space. In case we get 0 events and
         * there's still timeout left over, we go trying again in search of
@@ -1840,6 +1852,12 @@ check_events:
            !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
                goto fetch_events;
 
+       if (waiter) {
+               spin_lock_irq(&ep->wq.lock);
+               __remove_wait_queue(&ep->wq, &wait);
+               spin_unlock_irq(&ep->wq.lock);
+       }
+
        return res;
 }
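For the timeout == 0 case, ep_poll() now just samples ep_events_available() under the lock and jumps straight to send_events, so a non-blocking epoll_wait() never touches the wait queue. Seen from userspace (minimal sketch, error handling omitted):

#include <sys/epoll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int p[2], epfd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN }, out;

        pipe(p);
        epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev);

        /* Nothing written yet: timeout 0 returns immediately with 0 events. */
        printf("empty, timeout 0: %d\n", epoll_wait(epfd, &out, 1, 0));

        write(p[1], "x", 1);

        /* One byte pending: the same non-blocking call now reports 1 event. */
        printf("ready, timeout 0: %d\n", epoll_wait(epfd, &out, 1, 0));
        return 0;
}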
 
@@ -1876,7 +1894,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
                        ep_tovisit = epi->ffd.file->private_data;
                        if (ep_tovisit->visited)
                                continue;
-                       error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+                       error = ep_call_nested(&poll_loop_ncalls,
                                        ep_loop_check_proc, epi->ffd.file,
                                        ep_tovisit, current);
                        if (error != 0)
@@ -1916,7 +1934,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
        int ret;
        struct eventpoll *ep_cur, *ep_next;
 
-       ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+       ret = ep_call_nested(&poll_loop_ncalls,
                              ep_loop_check_proc, file, ep, current);
        /* clear visited list */
        list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
@@ -2172,7 +2190,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                return -EINVAL;
 
        /* Verify that the area passed by the user is writeable */
-       if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+       if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
                return -EFAULT;
 
        /* Get the "struct file *" for the eventpoll file */
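After the VERIFY_WRITE removal, access_ok() here is purely a range check; a buffer that is in range but not writable still fails, just later, when ep_send_events_proc() hits __put_user(), and userspace sees -EFAULT either way. A small demonstration passing a read-only mapping as the events array (hedged sketch, error handling omitted):

#include <sys/epoll.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int p[2], epfd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };
        struct epoll_event *ro = mmap(NULL, 4096, PROT_READ,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        pipe(p);
        write(p[1], "x", 1);                    /* make one event ready */
        epoll_ctl(epfd, EPOLL_CTL_ADD, p[0], &ev);

        if (epoll_wait(epfd, ro, 1, 0) < 0)
                printf("epoll_wait: %s\n", strerror(errno));  /* expect EFAULT */
        return 0;
}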