Merge branch 'for-3.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata
[sfrench/cifs-2.6.git] / kernel / futex.c
index f6ff0191ecf72aca1d42f128771af55f9c859d13..67dacaf93e56c0edb12b74a84b40da9fe596ea49 100644 (file)
 #include <linux/sched/rt.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
+#include <linux/bootmem.h>
 
 #include <asm/futex.h>
 
 #include "locking/rtmutex_common.h"
 
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * Basic futex operation and ordering guarantees:
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *                                     *futex = newval;
+ *                                     sys_futex(WAKE, futex);
+ *                                       futex_wake(futex);
+ *                                       if (queue_empty())
+ *                                         return;
+ *   if (uval == val)
+ *      lock(hash_bucket(futex));
+ *      queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                                 CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
+ *   if (uval == val)
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *                                           wake_waiters(futex);
+ *                                           unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
+ * depending on the futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *     X = Y = 0
+ *
+ *     w[X]=1          w[Y]=1
+ *     MB              MB
+ *     r[Y]=y          r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
+ */
 
-#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
+int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
  * Futex flags used to encode options to functions and preserve them across
@@ -147,11 +236,59 @@ static const struct futex_q futex_q_init = {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
+       atomic_t waiters;
        spinlock_t lock;
        struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+static struct futex_hash_bucket *futex_queues;
+
+static inline void futex_get_mm(union futex_key *key)
+{
+       atomic_inc(&key->private.mm->mm_count);
+       /*
+        * Ensure futex_get_mm() implies a full barrier such that
+        * get_futex_key() implies a full barrier. This is relied upon
+        * as full barrier (B), see the ordering comment above.
+        */
+       smp_mb__after_atomic_inc();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       atomic_inc(&hb->waiters);
+       /*
+        * Full barrier (A), see the ordering comment above.
+        */
+       smp_mb__after_atomic_inc();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+       return atomic_read(&hb->waiters);
+#else
+       return 1;
+#endif
+}
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +298,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
        u32 hash = jhash2((u32*)&key->both.word,
                          (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
                          key->both.offset);
-       return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+       return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
@@ -187,10 +324,10 @@ static void get_futex_key_refs(union futex_key *key)
 
        switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
        case FUT_OFF_INODE:
-               ihold(key->shared.inode);
+               ihold(key->shared.inode); /* implies MB (B) */
                break;
        case FUT_OFF_MMSHARED:
-               atomic_inc(&key->private.mm->mm_count);
+               futex_get_mm(key); /* implies MB (B) */
                break;
        }
 }
@@ -264,7 +401,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
        if (!fshared) {
                key->private.mm = mm;
                key->private.address = address;
-               get_futex_key_refs(key);
+               get_futex_key_refs(key);  /* implies MB (B) */
                return 0;
        }
 
@@ -371,7 +508,7 @@ again:
                key->shared.pgoff = basepage_index(page);
        }
 
-       get_futex_key_refs(key);
+       get_futex_key_refs(key); /* implies MB (B) */
 
 out:
        unlock_page(page_head);
@@ -598,13 +735,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 {
        struct futex_pi_state *pi_state = NULL;
        struct futex_q *this, *next;
-       struct plist_head *head;
        struct task_struct *p;
        pid_t pid = uval & FUTEX_TID_MASK;
 
-       head = &hb->chain;
-
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb->chain, list) {
                if (match_futex(&this->key, key)) {
                        /*
                         * Another waiter already exists - bump up
@@ -838,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q)
 
        hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
        plist_del(&q->list, &hb->chain);
+       hb_waiters_dec(hb);
 }
 
 /*
@@ -986,7 +1121,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
-       struct plist_head *head;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;
 
@@ -998,10 +1132,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                goto out;
 
        hb = hash_futex(&key);
+
+       /* Make sure we really have tasks to wakeup */
+       if (!hb_waiters_pending(hb))
+               goto out_put_key;
+
        spin_lock(&hb->lock);
-       head = &hb->chain;
 
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb->chain, list) {
                if (match_futex (&this->key, &key)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
@@ -1019,6 +1157,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
        }
 
        spin_unlock(&hb->lock);
+out_put_key:
        put_futex_key(&key);
 out:
        return ret;
@@ -1034,7 +1173,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
-       struct plist_head *head;
        struct futex_q *this, *next;
        int ret, op_ret;
 
@@ -1082,9 +1220,7 @@ retry_private:
                goto retry;
        }
 
-       head = &hb1->chain;
-
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                if (match_futex (&this->key, &key1)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
@@ -1097,10 +1233,8 @@ retry_private:
        }
 
        if (op_ret > 0) {
-               head = &hb2->chain;
-
                op_ret = 0;
-               plist_for_each_entry_safe(this, next, head, list) {
+               plist_for_each_entry_safe(this, next, &hb2->chain, list) {
                        if (match_futex (&this->key, &key2)) {
                                if (this->pi_state || this->rt_waiter) {
                                        ret = -EINVAL;
@@ -1142,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
         */
        if (likely(&hb1->chain != &hb2->chain)) {
                plist_del(&q->list, &hb1->chain);
+               hb_waiters_dec(hb1);
                plist_add(&q->list, &hb2->chain);
+               hb_waiters_inc(hb2);
                q->lock_ptr = &hb2->lock;
        }
        get_futex_key_refs(key2);
@@ -1270,7 +1406,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
        int drop_count = 0, task_count = 0, ret;
        struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
-       struct plist_head *head1;
        struct futex_q *this, *next;
        u32 curval2;
 
@@ -1393,8 +1528,7 @@ retry_private:
                }
        }
 
-       head1 = &hb1->chain;
-       plist_for_each_entry_safe(this, next, head1, list) {
+       plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                if (task_count - nr_wake >= nr_requeue)
                        break;
 
@@ -1487,17 +1621,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
        struct futex_hash_bucket *hb;
 
        hb = hash_futex(&q->key);
+
+       /*
+        * Increment the counter before taking the lock so that
+        * a potential waker won't miss a to-be-slept task that is
+        * waiting for the spinlock. This is safe as all queue_lock()
+        * users end up calling queue_me(). Similarly, for housekeeping,
+        * decrement the counter at queue_unlock() when some error has
+        * occurred and we don't end up adding the task to the list.
+        */
+       hb_waiters_inc(hb);
+
        q->lock_ptr = &hb->lock;
 
-       spin_lock(&hb->lock);
+       spin_lock(&hb->lock); /* implies MB (A) */
        return hb;
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
        __releases(&hb->lock)
 {
        spin_unlock(&hb->lock);
+       hb_waiters_dec(hb);
 }
 
 /**
@@ -1867,7 +2013,7 @@ retry_private:
        ret = get_futex_value_locked(&uval, uaddr);
 
        if (ret) {
-               queue_unlock(q, *hb);
+               queue_unlock(*hb);
 
                ret = get_user(uval, uaddr);
                if (ret)
@@ -1881,7 +2027,7 @@ retry_private:
        }
 
        if (uval != val) {
-               queue_unlock(q, *hb);
+               queue_unlock(*hb);
                ret = -EWOULDBLOCK;
        }
 
@@ -2029,7 +2175,7 @@ retry_private:
                         * Task is exiting and we just wait for the
                         * exit to complete.
                         */
-                       queue_unlock(&q, hb);
+                       queue_unlock(hb);
                        put_futex_key(&q.key);
                        cond_resched();
                        goto retry;
@@ -2081,7 +2227,7 @@ retry_private:
        goto out_put_key;
 
 out_unlock_put_key:
-       queue_unlock(&q, hb);
+       queue_unlock(hb);
 
 out_put_key:
        put_futex_key(&q.key);
@@ -2091,7 +2237,7 @@ out:
        return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-       queue_unlock(&q, hb);
+       queue_unlock(hb);
 
        ret = fault_in_user_writeable(uaddr);
        if (ret)
@@ -2113,7 +2259,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
-       struct plist_head *head;
        union futex_key key = FUTEX_KEY_INIT;
        u32 uval, vpid = task_pid_vnr(current);
        int ret;
@@ -2153,9 +2298,7 @@ retry:
         * Ok, other tasks may need to be woken up - check waiters
         * and do the wakeup if necessary:
         */
-       head = &hb->chain;
-
-       plist_for_each_entry_safe(this, next, head, list) {
+       plist_for_each_entry_safe(this, next, &hb->chain, list) {
                if (!match_futex (&this->key, &key))
                        continue;
                ret = wake_futex_pi(uaddr, uval, this);
@@ -2232,6 +2375,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 * Unqueue the futex_q and determine which it was.
                 */
                plist_del(&q->list, &hb->chain);
+               hb_waiters_dec(hb);
 
                /* Handle spurious wakeups gracefully */
                ret = -EWOULDBLOCK;
@@ -2316,6 +2460,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * code while we sleep on uaddr.
         */
        debug_rt_mutex_init_waiter(&rt_waiter);
+       RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+       RB_CLEAR_NODE(&rt_waiter.tree_entry);
        rt_waiter.task = NULL;
 
        ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2731,10 +2877,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
 {
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
        u32 curval;
-       int i;
 
        /*
         * This will fail and we want it. Some arch implementations do
@@ -2748,8 +2894,31 @@ static int __init futex_init(void)
         */
        if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
                futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+       unsigned int futex_shift;
+       unsigned long i;
+
+#if CONFIG_BASE_SMALL
+       futex_hashsize = 16;
+#else
+       futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
+
+       futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+                                              futex_hashsize, 0,
+                                              futex_hashsize < 256 ? HASH_SMALL : 0,
+                                              &futex_shift, NULL,
+                                              futex_hashsize, futex_hashsize);
+       futex_hashsize = 1UL << futex_shift;
+
+       futex_detect_cmpxchg();
 
-       for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+       for (i = 0; i < futex_hashsize; i++) {
+               atomic_set(&futex_queues[i].waiters, 0);
                plist_head_init(&futex_queues[i].chain);
                spin_lock_init(&futex_queues[i].lock);
        }