*
* 1) epmutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (spinlock)
+ * 3) ep->wq.lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a spinlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->wq.lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
* of epoll file descriptors, we use the current recursion depth as
* the lockdep subkey.
* It is possible to drop the "ep->mtx" and to use the global
- * mutex "epmutex" (together with "ep->lock") to have it working,
+ * mutex "epmutex" (together with "ep->wq.lock") to have it working,
* but having "ep->mtx" will make the interface more scalable.
* Events that require holding "epmutex" are very rare, while for
* normal operations the epoll private "ep->mtx" will guarantee
* This structure is stored inside the "private_data" member of the file
* structure and represents the main data structure for the eventpoll
* interface.
+ *
+ * Access to it is protected by the lock inside wq.
*/
struct eventpoll {
- /* Protect the access to this structure */
- spinlock_t lock;
-
/*
* This mutex is used to ensure that files are not removed
* while epoll is using them. This is held during the event
/*
* This is a single linked list that chains all the "struct epitem" that
* happened while transferring ready events to userspace w/out
- * holding ->lock.
+ * holding ->wq.lock.
*/
struct epitem *ovflist;
}
/* Tells us if the item is currently linked */
-static inline int ep_is_linked(struct list_head *p)
+static inline int ep_is_linked(struct epitem *epi)
{
- return !list_empty(p);
+ return !list_empty(&epi->rdllink);
}
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
return ep_events_available(ep) || busy_loop_timeout(start_time);
}
-#endif /* CONFIG_NET_RX_BUSY_POLL */
/*
* Busy poll if globally on and supporting sockets found && no events,
*/
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
-#endif
}
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
if (ep->napi_id)
ep->napi_id = 0;
-#endif
}
/*
*/
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
-#ifdef CONFIG_NET_RX_BUSY_POLL
struct eventpoll *ep;
unsigned int napi_id;
struct socket *sock;
/* record NAPI ID for use in next busy poll */
ep->napi_id = napi_id;
-#endif
}
+#else
+
+static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+}
+
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+}
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/**
* ep_call_nested - Perform a bound (possibly) nested call, by checking
* that the recursion limit is not exceeded, and that
{
__poll_t res;
int pwake = 0;
- unsigned long flags;
struct epitem *epi, *nepi;
LIST_HEAD(txlist);
+ lockdep_assert_irqs_enabled();
+
/*
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
* because we want the "sproc" callback to be able to do it
* in a lockless way.
*/
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
list_splice_init(&ep->rdllist, &txlist);
ep->ovflist = NULL;
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
/*
* Now call the callback function.
*/
res = (*sproc)(ep, &txlist, priv);
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
* queued into ->ovflist but the "txlist" might already
* contain them, and the list_splice() below takes care of them.
*/
- if (!ep_is_linked(&epi->rdllink)) {
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
if (!ep_locked)
mutex_unlock(&ep->mtx);
*/
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
- unsigned long flags;
struct file *file = epi->ffd.file;
+ lockdep_assert_irqs_enabled();
+
/*
- * Removes poll wait queue hooks. We _have_ to do this without holding
- * the "ep->lock" otherwise a deadlock might occur. This because of the
- * sequence of the lock acquisition. Here we do "ep->lock" then the wait
- * queue head lock when unregistering the wait queue. The wakeup callback
- * will run by holding the wait queue head lock and will call our callback
- * that will try to get "ep->lock".
+ * Removes poll wait queue hooks.
*/
ep_unregister_pollwait(ep, epi);
rb_erase_cached(&epi->rbn, &ep->rbr);
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink))
+ spin_lock_irq(&ep->wq.lock);
+ if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
* Walks through the whole tree by freeing each "struct epitem". At this
* point we are sure no poll callbacks will be lingering around, and also by
* holding "epmutex" we can be sure that no file cleanup code will hit
- * us during this operation. So we can avoid the lock on "ep->lock".
+ * us during this operation. So we can avoid the lock on "ep->wq.lock".
* We do not need to lock ep->mtx, either, we only do it to prevent
* a lockdep warning.
*/
if (unlikely(!ep))
goto free_uid;
- spin_lock_init(&ep->lock);
mutex_init(&ep->mtx);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
__poll_t pollflags = key_to_poll(key);
int ewake = 0;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->wq.lock, flags);
ep_set_busy_poll_napi_id(epi);
}
/* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(&epi->rdllink)) {
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
pwake++;
out_unlock:
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->wq.lock, flags);
/* We have to call this outside the lock */
if (pwake)
{
int error, pwake = 0;
__poll_t revents;
- unsigned long flags;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
+ lockdep_assert_irqs_enabled();
+
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
/* If the file is already "ready" we drop it inside the ready list */
- if (revents && !ep_is_linked(&epi->rdllink)) {
+ if (revents && !ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
pwake++;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
atomic_long_inc(&ep->user->epoll_watches);
* list, since that is used/cleaned only inside a section bound by "mtx".
* And ep_insert() is called with "mtx" held.
*/
- spin_lock_irqsave(&ep->lock, flags);
- if (ep_is_linked(&epi->rdllink))
+ spin_lock_irq(&ep->wq.lock);
+ if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
wakeup_source_unregister(ep_wakeup_source(epi));
int pwake = 0;
poll_table pt;
+ lockdep_assert_irqs_enabled();
+
init_poll_funcptr(&pt, NULL);
/*
* 1) Flush epi changes above to other CPUs. This ensures
* we do not miss events from ep_poll_callback if an
* event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->lock while
+ * We need this because we did not take ep->wq.lock while
* changing epi above (but ep_poll_callback does take
- * ep->lock).
+ * ep->wq.lock).
*
* 2) We also need to ensure we do not miss _past_ events
* when calling f_op->poll(). This barrier also
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- spin_lock_irq(&ep->lock);
- if (!ep_is_linked(&epi->rdllink)) {
+ spin_lock_irq(&ep->wq.lock);
+ if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- spin_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->wq.lock);
}
/* We have to call this outside the lock */
int maxevents, long timeout)
{
int res = 0, eavail, timed_out = 0;
- unsigned long flags;
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+ lockdep_assert_irqs_enabled();
+
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
* caller specified a non blocking operation.
*/
timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
goto check_events;
}
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
if (!ep_events_available(ep)) {
/*
break;
}
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
- spin_lock_irqsave(&ep->lock, flags);
+ spin_lock_irq(&ep->wq.lock);
}
__remove_wait_queue(&ep->wq, &wait);
/* Is it worth to try to dig for events ? */
eavail = ep_events_available(ep);
- spin_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irq(&ep->wq.lock);
/*
* Try to transfer events to user space. In case we get 0 events and