1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 #include <linux/slab.h>
4 #include <linux/sched/rt.h>
5 #include <linux/sched/task.h>
6
7 #include "futex.h"
8 #include "../locking/rtmutex_common.h"
9
10 /*
11  * PI code:
12  */
13 int refill_pi_state_cache(void)
14 {
15         struct futex_pi_state *pi_state;
16
17         if (likely(current->pi_state_cache))
18                 return 0;
19
20         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
21
22         if (!pi_state)
23                 return -ENOMEM;
24
25         INIT_LIST_HEAD(&pi_state->list);
26         /* pi_mutex gets initialized later */
27         pi_state->owner = NULL;
28         refcount_set(&pi_state->refcount, 1);
29         pi_state->key = FUTEX_KEY_INIT;
30
31         current->pi_state_cache = pi_state;
32
33         return 0;
34 }
35
36 static struct futex_pi_state *alloc_pi_state(void)
37 {
38         struct futex_pi_state *pi_state = current->pi_state_cache;
39
40         WARN_ON(!pi_state);
41         current->pi_state_cache = NULL;
42
43         return pi_state;
44 }
45
46 static void pi_state_update_owner(struct futex_pi_state *pi_state,
47                                   struct task_struct *new_owner)
48 {
49         struct task_struct *old_owner = pi_state->owner;
50
51         lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
52
53         if (old_owner) {
54                 raw_spin_lock(&old_owner->pi_lock);
55                 WARN_ON(list_empty(&pi_state->list));
56                 list_del_init(&pi_state->list);
57                 raw_spin_unlock(&old_owner->pi_lock);
58         }
59
60         if (new_owner) {
61                 raw_spin_lock(&new_owner->pi_lock);
62                 WARN_ON(!list_empty(&pi_state->list));
63                 list_add(&pi_state->list, &new_owner->pi_state_list);
64                 pi_state->owner = new_owner;
65                 raw_spin_unlock(&new_owner->pi_lock);
66         }
67 }
68
69 void get_pi_state(struct futex_pi_state *pi_state)
70 {
71         WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
72 }
73
74 /*
75  * Drops a reference to the pi_state object and frees or caches it
76  * when the last reference is gone.
77  */
78 void put_pi_state(struct futex_pi_state *pi_state)
79 {
80         if (!pi_state)
81                 return;
82
83         if (!refcount_dec_and_test(&pi_state->refcount))
84                 return;
85
86         /*
87          * If pi_state->owner is NULL, the owner is most probably dying
88          * and has cleaned up the pi_state already
89          */
90         if (pi_state->owner) {
91                 unsigned long flags;
92
93                 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
94                 pi_state_update_owner(pi_state, NULL);
95                 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
96                 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
97         }
98
99         if (current->pi_state_cache) {
100                 kfree(pi_state);
101         } else {
102                 /*
103                  * pi_state->list is already empty.
104                  * clear pi_state->owner.
105                  * refcount is at 0 - put it back to 1.
106                  */
107                 pi_state->owner = NULL;
108                 refcount_set(&pi_state->refcount, 1);
109                 current->pi_state_cache = pi_state;
110         }
111 }
112
113 /*
114  * We need to check the following states:
115  *
116  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
117  *
118  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
119  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
120  *
121  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
122  *
123  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
124  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
125  *
126  * [6]  Found  | Found    | task      | 0         | 1      | Valid
127  *
128  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
129  *
130  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
131  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
132  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
133  *
134  * [1]  Indicates that the kernel can acquire the futex atomically. We
135  *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
136  *
137  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
138  *      thread is found then it indicates that the owner TID has died.
139  *
140  * [3]  Invalid. The waiter is queued on a non PI futex
141  *
142  * [4]  Valid state after exit_robust_list(), which sets the user space
143  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
144  *
145  * [5]  The user space value got manipulated between exit_robust_list()
146  *      and exit_pi_state_list()
147  *
148  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
149  *      the pi_state but cannot access the user space value.
150  *
151  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
152  *
153  * [8]  Owner and user space value match
154  *
155  * [9]  There is no transient state which sets the user space TID to 0
156  *      except exit_robust_list(), but this is indicated by the
157  *      FUTEX_OWNER_DIED bit. See [4]
158  *
159  * [10] There is no transient state which leaves owner and user space
160  *      TID out of sync. Except one error case where the kernel is denied
161  *      write access to the user address, see fixup_pi_state_owner().
162  *
163  *
164  * Serialization and lifetime rules:
165  *
166  * hb->lock:
167  *
168  *      hb -> futex_q, relation
169  *      futex_q -> pi_state, relation
170  *
171  *      (cannot be raw because hb can contain an arbitrary number
172  *       of futex_q's)
173  *
174  * pi_mutex->wait_lock:
175  *
176  *      {uval, pi_state}
177  *
178  *      (and pi_mutex 'obviously')
179  *
180  * p->pi_lock:
181  *
182  *      p->pi_state_list -> pi_state->list, relation
183  *      pi_mutex->owner -> pi_state->owner, relation
184  *
185  * pi_state->refcount:
186  *
187  *      pi_state lifetime
188  *
189  *
190  * Lock order:
191  *
192  *   hb->lock
193  *     pi_mutex->wait_lock
194  *       p->pi_lock
195  *
196  */
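/*
 * A minimal user-space sketch (illustrative only, not part of this file;
 * the helper name is made up): how the uTID, uODIED and waiter information
 * referenced in the table above are encoded in the futex word, using the
 * UAPI constants from <linux/futex.h>.
 */
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper: decode a PI futex word into the table's columns. */
static void decode_pi_futex_word(uint32_t uval)
{
        uint32_t tid    = uval & FUTEX_TID_MASK;       /* uTID: owner TID, 0 if unowned */
        int owner_died  = !!(uval & FUTEX_OWNER_DIED); /* uODIED: previous owner exited */
        int has_waiters = !!(uval & FUTEX_WAITERS);    /* kernel-side waiters/pi_state  */

        printf("tid=%u waiters=%d owner_died=%d\n", tid, has_waiters, owner_died);
}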
197
198 /*
199  * Validate that the existing waiter has a pi_state and sanity check
200  * the pi_state against the user space value. If correct, attach to
201  * it.
202  */
203 static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
204                               struct futex_pi_state *pi_state,
205                               struct futex_pi_state **ps)
206 {
207         pid_t pid = uval & FUTEX_TID_MASK;
208         u32 uval2;
209         int ret;
210
211         /*
212          * Userspace might have messed up non-PI and PI futexes [3]
213          */
214         if (unlikely(!pi_state))
215                 return -EINVAL;
216
217         /*
218          * We get here with hb->lock held, and having found a
219          * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
220          * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
221          * which in turn means that futex_lock_pi() still has a reference on
222          * our pi_state.
223          *
224          * The waiter holding a reference on @pi_state also protects against
225          * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
226          * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
227          * free pi_state before we can take a reference ourselves.
228          */
229         WARN_ON(!refcount_read(&pi_state->refcount));
230
231         /*
232          * Now that we have a pi_state, we can acquire wait_lock
233          * and do the state validation.
234          */
235         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
236
237         /*
238          * Since {uval, pi_state} is serialized by wait_lock, and our current
239          * uval was read without holding it, it can have changed. Verify it
240          * still is what we expect it to be, otherwise retry the entire
241          * operation.
242          */
243         if (futex_get_value_locked(&uval2, uaddr))
244                 goto out_efault;
245
246         if (uval != uval2)
247                 goto out_eagain;
248
249         /*
250          * Handle the owner died case:
251          */
252         if (uval & FUTEX_OWNER_DIED) {
253                 /*
254                  * exit_pi_state_list sets owner to NULL and wakes the
255                  * topmost waiter. The task which acquires the
256                  * pi_state->rt_mutex will fix up the owner.
257                  */
258                 if (!pi_state->owner) {
259                         /*
260                          * No pi state owner, but the user space TID
261                          * is not 0. Inconsistent state. [5]
262                          */
263                         if (pid)
264                                 goto out_einval;
265                         /*
266                          * Take a ref on the state and return success. [4]
267                          */
268                         goto out_attach;
269                 }
270
271                 /*
272                  * If TID is 0, then either the dying owner has not
273                  * yet executed exit_pi_state_list() or some waiter
274                  * acquired the rtmutex in the pi state, but did not
275                  * yet fixup the TID in user space.
276                  *
277                  * Take a ref on the state and return success. [6]
278                  */
279                 if (!pid)
280                         goto out_attach;
281         } else {
282                 /*
283                  * If the owner died bit is not set, then the pi_state
284                  * must have an owner. [7]
285                  */
286                 if (!pi_state->owner)
287                         goto out_einval;
288         }
289
290         /*
291          * Bail out if user space manipulated the futex value. If pi
292          * state exists then the owner TID must be the same as the
293          * user space TID. [9/10]
294          */
295         if (pid != task_pid_vnr(pi_state->owner))
296                 goto out_einval;
297
298 out_attach:
299         get_pi_state(pi_state);
300         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
301         *ps = pi_state;
302         return 0;
303
304 out_einval:
305         ret = -EINVAL;
306         goto out_error;
307
308 out_eagain:
309         ret = -EAGAIN;
310         goto out_error;
311
312 out_efault:
313         ret = -EFAULT;
314         goto out_error;
315
316 out_error:
317         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
318         return ret;
319 }
320
321 static int handle_exit_race(u32 __user *uaddr, u32 uval,
322                             struct task_struct *tsk)
323 {
324         u32 uval2;
325
326         /*
327          * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
328          * caller that the alleged owner is busy.
329          */
330         if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
331                 return -EBUSY;
332
333         /*
334          * Reread the user space value to handle the following situation:
335          *
336          * CPU0                         CPU1
337          *
338          * sys_exit()                   sys_futex()
339          *  do_exit()                    futex_lock_pi()
340          *                                futex_lock_pi_atomic()
341          *   exit_signals(tsk)              No waiters:
342          *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
343          *  mm_release(tsk)                 Set waiter bit
344          *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
345          *      Set owner died              attach_to_pi_owner() {
346          *    *uaddr = 0xC0000000;           tsk = get_task(PID);
347          *   }                               if (!tsk->flags & PF_EXITING) {
348          *  ...                                attach();
349          *  tsk->futex_state =               } else {
350          *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
351          *                                        FUTEX_STATE_DEAD)
352          *                                       return -EAGAIN;
353          *                                     return -ESRCH; <--- FAIL
354          *                                   }
355          *
356          * Returning ESRCH unconditionally is wrong here because the
357          * user space value has been changed by the exiting task.
358          *
359          * The same logic applies to the case where the exiting task is
360          * already gone.
361          */
362         if (futex_get_value_locked(&uval2, uaddr))
363                 return -EFAULT;
364
365         /* If the user space value has changed, try again. */
366         if (uval2 != uval)
367                 return -EAGAIN;
368
369         /*
370          * The exiting task did not have a robust list, the robust list was
371          * corrupted or the user space value in *uaddr is simply bogus.
372          * Give up and tell user space.
373          */
374         return -ESRCH;
375 }
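/*
 * A minimal user-space sketch (illustrative only, not part of this file;
 * helper names are made up): the dying-owner cases handled above are what
 * a robust, priority-inheriting POSIX mutex reports as EOWNERDEAD. Standard
 * pthread calls; error handling trimmed.
 */
#include <pthread.h>
#include <errno.h>

static void init_robust_pi_mutex(pthread_mutex_t *m)
{
        pthread_mutexattr_t a;

        pthread_mutexattr_init(&a);
        pthread_mutexattr_setprotocol(&a, PTHREAD_PRIO_INHERIT); /* PI futex   */
        pthread_mutexattr_setrobust(&a, PTHREAD_MUTEX_ROBUST);   /* OWNER_DIED */
        pthread_mutex_init(m, &a);
        pthread_mutexattr_destroy(&a);
}

static int lock_robust_pi_mutex(pthread_mutex_t *m)
{
        int err = pthread_mutex_lock(m);

        if (err == EOWNERDEAD) {
                /* The previous owner died holding the lock; repair the
                 * protected data, then mark the mutex usable again. */
                pthread_mutex_consistent(m);
                err = 0;
        }
        return err;
}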
376
377 static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
378                                  struct futex_pi_state **ps)
379 {
380         /*
381          * No existing pi state. First waiter. [2]
382          *
383          * This creates pi_state, we have hb->lock held, this means nothing can
384          * observe this state, wait_lock is irrelevant.
385          */
386         struct futex_pi_state *pi_state = alloc_pi_state();
387
388         /*
389          * Initialize the pi_mutex in locked state and make @p
390          * the owner of it:
391          */
392         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
393
394         /* Store the key for possible exit cleanups: */
395         pi_state->key = *key;
396
397         WARN_ON(!list_empty(&pi_state->list));
398         list_add(&pi_state->list, &p->pi_state_list);
399         /*
400          * Assignment without holding pi_state->pi_mutex.wait_lock is safe
401          * because there is no concurrency as the object is not published yet.
402          */
403         pi_state->owner = p;
404
405         *ps = pi_state;
406 }
407 /*
408  * Lookup the task for the TID provided from user space and attach to
409  * it after doing proper sanity checks.
410  */
411 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
412                               struct futex_pi_state **ps,
413                               struct task_struct **exiting)
414 {
415         pid_t pid = uval & FUTEX_TID_MASK;
416         struct task_struct *p;
417
418         /*
419          * We are the first waiter - try to look up the real owner and attach
420          * the new pi_state to it, but bail out when TID = 0 [1]
421          *
422          * The !pid check is paranoid. None of the call sites should end up
423          * with pid == 0, but better safe than sorry. Let the caller retry
424          */
425         if (!pid)
426                 return -EAGAIN;
427         p = find_get_task_by_vpid(pid);
428         if (!p)
429                 return handle_exit_race(uaddr, uval, NULL);
430
431         if (unlikely(p->flags & PF_KTHREAD)) {
432                 put_task_struct(p);
433                 return -EPERM;
434         }
435
436         /*
437          * We need to look at the task state to figure out whether the
438          * task is exiting. To protect against the change of the task state
439          * in futex_exit_release(), we do this protected by p->pi_lock:
440          */
441         raw_spin_lock_irq(&p->pi_lock);
442         if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
443                 /*
444                  * The task is on the way out. When the futex state is
445                  * FUTEX_STATE_DEAD, we know that the task has finished
446                  * the cleanup:
447                  */
448                 int ret = handle_exit_race(uaddr, uval, p);
449
450                 raw_spin_unlock_irq(&p->pi_lock);
451                 /*
452                  * If the owner task is between FUTEX_STATE_EXITING and
453                  * FUTEX_STATE_DEAD then store the task pointer and keep
454                  * the reference on the task struct. The calling code will
455                  * drop all locks, wait for the task to reach
456                  * FUTEX_STATE_DEAD and then drop the refcount. This is
457                  * required to prevent a live lock when the current task
458                  * preempted the exiting task between the two states.
459                  */
460                 if (ret == -EBUSY)
461                         *exiting = p;
462                 else
463                         put_task_struct(p);
464                 return ret;
465         }
466
467         __attach_to_pi_owner(p, key, ps);
468         raw_spin_unlock_irq(&p->pi_lock);
469
470         put_task_struct(p);
471
472         return 0;
473 }
474
475 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
476 {
477         int err;
478         u32 curval;
479
480         if (unlikely(should_fail_futex(true)))
481                 return -EFAULT;
482
483         err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
484         if (unlikely(err))
485                 return err;
486
487         /* If user space value changed, let the caller retry */
488         return curval != uval ? -EAGAIN : 0;
489 }
490
491 /**
492  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
493  * @uaddr:              the pi futex user address
494  * @hb:                 the pi futex hash bucket
495  * @key:                the futex key associated with uaddr and hb
496  * @ps:                 the pi_state pointer where we store the result of the
497  *                      lookup
498  * @task:               the task to perform the atomic lock work for.  This will
499  *                      be "current" except in the case of requeue pi.
500  * @exiting:            Pointer to store the task pointer of the owner task
501  *                      which is in the middle of exiting
502  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
503  *
504  * Return:
505  *  -  0 - ready to wait;
506  *  -  1 - acquired the lock;
507  *  - <0 - error
508  *
509  * The hb->lock must be held by the caller.
510  *
511  * @exiting is only set when the return value is -EBUSY. If so, this holds
512  * a refcount on the exiting task on return and the caller needs to drop it
513  * after waiting for the exit to complete.
514  */
515 int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
516                          union futex_key *key,
517                          struct futex_pi_state **ps,
518                          struct task_struct *task,
519                          struct task_struct **exiting,
520                          int set_waiters)
521 {
522         u32 uval, newval, vpid = task_pid_vnr(task);
523         struct futex_q *top_waiter;
524         int ret;
525
526         /*
527          * Read the user space value first so we can validate a few
528          * things before proceeding further.
529          */
530         if (futex_get_value_locked(&uval, uaddr))
531                 return -EFAULT;
532
533         if (unlikely(should_fail_futex(true)))
534                 return -EFAULT;
535
536         /*
537          * Detect deadlocks.
538          */
539         if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
540                 return -EDEADLK;
541
542         if ((unlikely(should_fail_futex(true))))
543                 return -EDEADLK;
544
545         /*
546          * Lookup existing state first. If it exists, try to attach to
547          * its pi_state.
548          */
549         top_waiter = futex_top_waiter(hb, key);
550         if (top_waiter)
551                 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
552
553         /*
554          * No waiter and user TID is 0. We are here because the
555          * waiters or the owner died bit is set or called from
556          * requeue_cmp_pi or for whatever reason something took the
557          * syscall.
558          */
559         if (!(uval & FUTEX_TID_MASK)) {
560                 /*
561                  * We take over the futex. No other waiters and the user space
562                  * TID is 0. We preserve the owner died bit.
563                  */
564                 newval = uval & FUTEX_OWNER_DIED;
565                 newval |= vpid;
566
567                 /* The futex requeue_pi code can enforce the waiters bit */
568                 if (set_waiters)
569                         newval |= FUTEX_WAITERS;
570
571                 ret = lock_pi_update_atomic(uaddr, uval, newval);
572                 if (ret)
573                         return ret;
574
575                 /*
576                  * If the waiter bit was requested the caller also needs PI
577                  * state attached to the new owner of the user space futex.
578                  *
579                  * @task is guaranteed to be alive and it cannot be exiting
580                  * because it is either sleeping or waiting in
581                  * futex_requeue_pi_wakeup_sync().
582                  *
583                  * No need to do the full attach_to_pi_owner() exercise
584                  * because @task is known and valid.
585                  */
586                 if (set_waiters) {
587                         raw_spin_lock_irq(&task->pi_lock);
588                         __attach_to_pi_owner(task, key, ps);
589                         raw_spin_unlock_irq(&task->pi_lock);
590                 }
591                 return 1;
592         }
593
594         /*
595          * First waiter. Set the waiters bit before attaching ourself to
596          * the owner. If owner tries to unlock, it will be forced into
597          * the kernel and blocked on hb->lock.
598          */
599         newval = uval | FUTEX_WAITERS;
600         ret = lock_pi_update_atomic(uaddr, uval, newval);
601         if (ret)
602                 return ret;
603         /*
604          * If the update of the user space value succeeded, we try to
605          * attach to the owner. If that fails, no harm done, we only
606          * set the FUTEX_WAITERS bit in the user space variable.
607          */
608         return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
609 }
610
611 /*
612  * Caller must hold a reference on @pi_state.
613  */
614 static int wake_futex_pi(u32 __user *uaddr, u32 uval,
615                          struct futex_pi_state *pi_state,
616                          struct rt_mutex_waiter *top_waiter)
617 {
618         struct task_struct *new_owner;
619         bool postunlock = false;
620         DEFINE_RT_WAKE_Q(wqh);
621         u32 curval, newval;
622         int ret = 0;
623
624         new_owner = top_waiter->task;
625
626         /*
627          * We pass it to the next owner. The WAITERS bit is always kept
628          * enabled while there is PI state around. We cleanup the owner
629          * died bit, because we are the owner.
630          */
631         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
632
633         if (unlikely(should_fail_futex(true))) {
634                 ret = -EFAULT;
635                 goto out_unlock;
636         }
637
638         ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
639         if (!ret && (curval != uval)) {
640                 /*
641                  * If an unconditional UNLOCK_PI operation (user space did not
642                  * try the TID->0 transition) raced with a waiter setting the
643                  * FUTEX_WAITERS flag between get_user() and locking the hash
644                  * bucket lock, retry the operation.
645                  */
646                 if ((FUTEX_TID_MASK & curval) == uval)
647                         ret = -EAGAIN;
648                 else
649                         ret = -EINVAL;
650         }
651
652         if (!ret) {
653                 /*
654                  * This is a point of no return; once we modified the uval
655                  * there is no going back and subsequent operations must
656                  * not fail.
657                  */
658                 pi_state_update_owner(pi_state, new_owner);
659                 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
660         }
661
662 out_unlock:
663         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
664
665         if (postunlock)
666                 rt_mutex_postunlock(&wqh);
667
668         return ret;
669 }
670
671 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
672                                   struct task_struct *argowner)
673 {
674         struct futex_pi_state *pi_state = q->pi_state;
675         struct task_struct *oldowner, *newowner;
676         u32 uval, curval, newval, newtid;
677         int err = 0;
678
679         oldowner = pi_state->owner;
680
681         /*
682          * We are here because either:
683          *
684          *  - we stole the lock and pi_state->owner needs updating to reflect
685          *    that (@argowner == current),
686          *
687          * or:
688          *
689          *  - someone stole our lock and we need to fix things to point to the
690          *    new owner (@argowner == NULL).
691          *
692          * Either way, we have to replace the TID in the user space variable.
693          * This must be atomic as we have to preserve the owner died bit here.
694          *
695          * Note: We write the user space value _before_ changing the pi_state
696          * because we can fault here. Imagine swapped out pages or a fork
697          * that marked all the anonymous memory readonly for cow.
698          *
699          * Modifying pi_state _before_ the user space value would leave the
700          * pi_state in an inconsistent state when we fault here, because we
701          * need to drop the locks to handle the fault. This might be observed
702          * in the PID checks when attaching to PI state.
703          */
704 retry:
705         if (!argowner) {
706                 if (oldowner != current) {
707                         /*
708                          * We raced against a concurrent self; things are
709                          * already fixed up. Nothing to do.
710                          */
711                         return 0;
712                 }
713
714                 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
715                         /* We got the lock. pi_state is correct. Tell caller. */
716                         return 1;
717                 }
718
719                 /*
720                  * The trylock just failed, so either there is an owner or
721                  * there is a higher priority waiter than this one.
722                  */
723                 newowner = rt_mutex_owner(&pi_state->pi_mutex);
724                 /*
725                  * If the higher priority waiter has not yet taken over the
726                  * rtmutex then newowner is NULL. We can't return here with
727                  * that state because it's inconsistent vs. the user space
728                  * state. So drop the locks and try again. It's a valid
729                  * situation and not any different from the other retry
730                  * conditions.
731                  */
732                 if (unlikely(!newowner)) {
733                         err = -EAGAIN;
734                         goto handle_err;
735                 }
736         } else {
737                 WARN_ON_ONCE(argowner != current);
738                 if (oldowner == current) {
739                         /*
740                          * We raced against a concurrent self; things are
741                          * already fixed up. Nothing to do.
742                          */
743                         return 1;
744                 }
745                 newowner = argowner;
746         }
747
748         newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
749         /* Owner died? */
750         if (!pi_state->owner)
751                 newtid |= FUTEX_OWNER_DIED;
752
753         err = futex_get_value_locked(&uval, uaddr);
754         if (err)
755                 goto handle_err;
756
757         for (;;) {
758                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
759
760                 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
761                 if (err)
762                         goto handle_err;
763
764                 if (curval == uval)
765                         break;
766                 uval = curval;
767         }
768
769         /*
770          * We fixed up user space. Now we need to fix the pi_state
771          * itself.
772          */
773         pi_state_update_owner(pi_state, newowner);
774
775         return argowner == current;
776
777         /*
778          * In order to reschedule or handle a page fault, we need to drop the
779          * locks here. In the case of a fault, this gives the other task
780          * (either the highest priority waiter itself or the task which stole
781          * the rtmutex) the chance to try the fixup of the pi_state. So once we
782          * are back from handling the fault we need to check the pi_state after
783          * reacquiring the locks and before trying to do another fixup. When
784          * the fixup has been done already we simply return.
785          *
786          * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
787          * drop hb->lock since the caller owns the hb -> futex_q relation.
788          * Dropping the pi_mutex->wait_lock requires the state revalidate.
789          */
790 handle_err:
791         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
792         spin_unlock(q->lock_ptr);
793
794         switch (err) {
795         case -EFAULT:
796                 err = fault_in_user_writeable(uaddr);
797                 break;
798
799         case -EAGAIN:
800                 cond_resched();
801                 err = 0;
802                 break;
803
804         default:
805                 WARN_ON_ONCE(1);
806                 break;
807         }
808
809         spin_lock(q->lock_ptr);
810         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
811
812         /*
813          * Check if someone else fixed it for us:
814          */
815         if (pi_state->owner != oldowner)
816                 return argowner == current;
817
818         /* Retry if err was -EAGAIN or the fault in succeeded */
819         if (!err)
820                 goto retry;
821
822         /*
823          * fault_in_user_writeable() failed so user state is immutable. At
824          * best we can make the kernel state consistent but user state will
825          * be most likely hosed and any subsequent unlock operation will be
826          * rejected due to PI futex rule [10].
827          *
828          * Ensure that the rtmutex owner is also the pi_state owner despite
829          * the user space value claiming something different. There is no
830          * point in unlocking the rtmutex if current is the owner as it
831          * would need to wait until the next waiter has taken the rtmutex
832          * to guarantee consistent state. Keep it simple. Userspace asked
833          * for this wreckaged state.
834          * for this wrecked state.
835          * The rtmutex has an owner - either current or some other
836          * task. See the EAGAIN loop above.
837          */
838         pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
839
840         return err;
841 }
842
843 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
844                                 struct task_struct *argowner)
845 {
846         struct futex_pi_state *pi_state = q->pi_state;
847         int ret;
848
849         lockdep_assert_held(q->lock_ptr);
850
851         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
852         ret = __fixup_pi_state_owner(uaddr, q, argowner);
853         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
854         return ret;
855 }
856
857 /**
858  * fixup_pi_owner() - Post lock pi_state and corner case management
859  * @uaddr:      user address of the futex
860  * @q:          futex_q (contains pi_state and access to the rt_mutex)
861  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
862  *
863  * After attempting to lock an rt_mutex, this function is called to cleanup
864  * the pi_state owner as well as handle race conditions that may allow us to
865  * acquire the lock. Must be called with the hb lock held.
866  *
867  * Return:
868  *  -  1 - success, lock taken;
869  *  -  0 - success, lock not taken;
870  *  - <0 - on error (-EFAULT)
871  */
872 int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
873 {
874         if (locked) {
875                 /*
876                  * Got the lock. We might not be the anticipated owner if we
877                  * did a lock-steal - fix up the PI-state in that case:
878                  *
879                  * Speculative pi_state->owner read (we don't hold wait_lock);
880                  * since we own the lock pi_state->owner == current is the
881                  * stable state, anything else needs more attention.
882                  */
883                 if (q->pi_state->owner != current)
884                         return fixup_pi_state_owner(uaddr, q, current);
885                 return 1;
886         }
887
888         /*
889          * If we didn't get the lock, check if anybody stole it from us. In
890          * that case, we need to fix up the uval to point to them instead of
891          * us, otherwise bad things happen. [10]
892          *
893          * Another speculative read; pi_state->owner == current is unstable
894          * but needs our attention.
895          */
896         if (q->pi_state->owner == current)
897                 return fixup_pi_state_owner(uaddr, q, NULL);
898
899         /*
900          * Paranoia check. If we did not take the lock, then we should not be
901          * the owner of the rt_mutex. Warn and establish consistent state.
902          */
903         if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
904                 return fixup_pi_state_owner(uaddr, q, current);
905
906         return 0;
907 }
908
909 /*
910  * Userspace tried a 0 -> TID atomic transition of the futex value
911  * and failed. The kernel side here does the whole locking operation:
912  * if there are waiters then it will block as a consequence of relying
913  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
914  * a 0 value of the futex too.).
915  *
916  * Also serves as the futex trylock_pi() operation, with matching semantics.
917  */
918 int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
919 {
920         struct hrtimer_sleeper timeout, *to;
921         struct task_struct *exiting = NULL;
922         struct rt_mutex_waiter rt_waiter;
923         struct futex_hash_bucket *hb;
924         struct futex_q q = futex_q_init;
925         int res, ret;
926
927         if (!IS_ENABLED(CONFIG_FUTEX_PI))
928                 return -ENOSYS;
929
930         if (refill_pi_state_cache())
931                 return -ENOMEM;
932
933         to = futex_setup_timer(time, &timeout, flags, 0);
934
935 retry:
936         ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE);
937         if (unlikely(ret != 0))
938                 goto out;
939
940 retry_private:
941         hb = futex_q_lock(&q);
942
943         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
944                                    &exiting, 0);
945         if (unlikely(ret)) {
946                 /*
947                  * Atomic work succeeded and we got the lock,
948                  * or failed. Either way, we do _not_ block.
949                  */
950                 switch (ret) {
951                 case 1:
952                         /* We got the lock. */
953                         ret = 0;
954                         goto out_unlock_put_key;
955                 case -EFAULT:
956                         goto uaddr_faulted;
957                 case -EBUSY:
958                 case -EAGAIN:
959                         /*
960                          * Two reasons for this:
961                          * - EBUSY: Task is exiting and we just wait for the
962                          *   exit to complete.
963                          * - EAGAIN: The user space value changed.
964                          */
965                         futex_q_unlock(hb);
966                         /*
967                          * Handle the case where the owner is in the middle of
968                          * exiting. Wait for the exit to complete otherwise
969                          * this task might loop forever, aka. live lock.
970                          */
971                         wait_for_owner_exiting(ret, exiting);
972                         cond_resched();
973                         goto retry;
974                 default:
975                         goto out_unlock_put_key;
976                 }
977         }
978
979         WARN_ON(!q.pi_state);
980
981         /*
982          * Only actually queue now that the atomic ops are done:
983          */
984         __futex_queue(&q, hb);
985
986         if (trylock) {
987                 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
988                 /* Fixup the trylock return value: */
989                 ret = ret ? 0 : -EWOULDBLOCK;
990                 goto no_block;
991         }
992
993         /*
994          * Must be done before we enqueue the waiter; this is unfortunately
995          * under the hb lock, but that *should* work because it does nothing.
996          */
997         rt_mutex_pre_schedule();
998
999         rt_mutex_init_waiter(&rt_waiter);
1000
1001         /*
1002          * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
1003          * hold it while doing rt_mutex_start_proxy(), because then it will
1004          * include hb->lock in the blocking chain, even though we'll not in
1005          * fact hold it while blocking. This will lead it to report -EDEADLK
1006          * and BUG when futex_unlock_pi() interleaves with this.
1007          *
1008          * Therefore acquire wait_lock while holding hb->lock, but drop the
1009          * latter before calling __rt_mutex_start_proxy_lock(). This
1010          * interleaves with futex_unlock_pi() -- which does a similar lock
1011          * handoff -- such that the latter can observe the futex_q::pi_state
1012          * before __rt_mutex_start_proxy_lock() is done.
1013          */
1014         raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1015         spin_unlock(q.lock_ptr);
1016         /*
1017          * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1018          * such that futex_unlock_pi() is guaranteed to observe the waiter when
1019          * it sees the futex_q::pi_state.
1020          */
1021         ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
1022         raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1023
1024         if (ret) {
1025                 if (ret == 1)
1026                         ret = 0;
1027                 goto cleanup;
1028         }
1029
1030         if (unlikely(to))
1031                 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
1032
1033         ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
1034
1035 cleanup:
1036         /*
1037          * If we failed to acquire the lock (deadlock/signal/timeout), we must
1038          * unwind the above, however we cannot lock hb->lock because
1039          * rt_mutex already has a waiter enqueued and hb->lock can itself try
1040          * and enqueue an rt_waiter through rtlock.
1041          *
1042          * Doing the cleanup without holding hb->lock can cause inconsistent
1043          * state between hb and pi_state, but only in the direction of not
1044          * seeing a waiter that is leaving.
1045          *
1046          * See futex_unlock_pi(), it deals with this inconsistency.
1047          *
1048          * There be dragons here, since we must deal with the inconsistency on
1049          * the way out (here), it is impossible to detect/warn about the race
1050          * the other way around (missing an incoming waiter).
1051          *
1052          * What could possibly go wrong...
1053          */
1054         if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
1055                 ret = 0;
1056
1057         /*
1058          * Now that the rt_waiter has been dequeued, it is safe to use
1059          * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up
1060          * the pi_state owner below.
1061          */
1062         spin_lock(q.lock_ptr);
1063         /*
1064          * Waiter is unqueued.
1065          */
1066         rt_mutex_post_schedule();
1067 no_block:
1068         /*
1069          * Fixup the pi_state owner and possibly acquire the lock if we
1070          * haven't already.
1071          */
1072         res = fixup_pi_owner(uaddr, &q, !ret);
1073         /*
1074          * If fixup_pi_owner() returned an error, propagate that.  If it acquired
1075          * the lock, clear our -ETIMEDOUT or -EINTR.
1076          */
1077         if (res)
1078                 ret = (res < 0) ? res : 0;
1079
1080         futex_unqueue_pi(&q);
1081         spin_unlock(q.lock_ptr);
1082         goto out;
1083
1084 out_unlock_put_key:
1085         futex_q_unlock(hb);
1086
1087 out:
1088         if (to) {
1089                 hrtimer_cancel(&to->timer);
1090                 destroy_hrtimer_on_stack(&to->timer);
1091         }
1092         return ret != -EINTR ? ret : -ERESTARTNOINTR;
1093
1094 uaddr_faulted:
1095         futex_q_unlock(hb);
1096
1097         ret = fault_in_user_writeable(uaddr);
1098         if (ret)
1099                 goto out;
1100
1101         if (!(flags & FLAGS_SHARED))
1102                 goto retry_private;
1103
1104         goto retry;
1105 }
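/*
 * A minimal user-space sketch (illustrative only, not part of this file;
 * the helper name is made up): the trylock path of futex_lock_pi() is
 * reached via FUTEX_TRYLOCK_PI when the 0 -> TID cmpxchg in user space
 * failed but the caller does not want to block. Assumes the raw futex(2)
 * syscall interface.
 */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <errno.h>

/* Returns 0 when the PI futex was acquired, EAGAIN/EWOULDBLOCK when busy. */
static int pi_futex_trylock(uint32_t *uaddr)
{
        if (syscall(SYS_futex, uaddr, FUTEX_TRYLOCK_PI, 0, NULL, NULL, 0) == 0)
                return 0;
        return errno;
}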
1106
1107 /*
1108  * Userspace attempted a TID -> 0 atomic transition, and failed.
1109  * This is the in-kernel slowpath: we look up the PI state (if any),
1110  * and do the rt-mutex unlock.
1111  */
1112 int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
1113 {
1114         u32 curval, uval, vpid = task_pid_vnr(current);
1115         union futex_key key = FUTEX_KEY_INIT;
1116         struct futex_hash_bucket *hb;
1117         struct futex_q *top_waiter;
1118         int ret;
1119
1120         if (!IS_ENABLED(CONFIG_FUTEX_PI))
1121                 return -ENOSYS;
1122
1123 retry:
1124         if (get_user(uval, uaddr))
1125                 return -EFAULT;
1126         /*
1127          * We release only a lock we actually own:
1128          */
1129         if ((uval & FUTEX_TID_MASK) != vpid)
1130                 return -EPERM;
1131
1132         ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE);
1133         if (ret)
1134                 return ret;
1135
1136         hb = futex_hash(&key);
1137         spin_lock(&hb->lock);
1138 retry_hb:
1139
1140         /*
1141          * Check waiters first. We do not trust user space values at
1142          * all and we at least want to know if user space fiddled
1143          * with the futex value instead of blindly unlocking.
1144          */
1145         top_waiter = futex_top_waiter(hb, &key);
1146         if (top_waiter) {
1147                 struct futex_pi_state *pi_state = top_waiter->pi_state;
1148                 struct rt_mutex_waiter *rt_waiter;
1149
1150                 ret = -EINVAL;
1151                 if (!pi_state)
1152                         goto out_unlock;
1153
1154                 /*
1155                  * If current does not own the pi_state then the futex is
1156                  * inconsistent and user space fiddled with the futex value.
1157                  */
1158                 if (pi_state->owner != current)
1159                         goto out_unlock;
1160
1161                 /*
1162                  * By taking wait_lock while still holding hb->lock, we ensure
1163                  * there is no point where we hold neither; and thereby
1164                  * wake_futex_pi() must observe any new waiters.
1165                  *
1166                  * Since the cleanup: case in futex_lock_pi() removes the
1167                  * rt_waiter without holding hb->lock, it is possible for
1168                  * wake_futex_pi() to not find a waiter while the above does,
1169                  * in this case the waiter is on the way out and it can be
1170                  * ignored.
1171                  *
1172                  * In particular, this forces __rt_mutex_start_proxy() to
1173                  * complete such that we're guaranteed to observe the
1174                  * rt_waiter.
1175                  */
1176                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1177
1178                 /*
1179                  * Futex vs rt_mutex waiter state -- if there are no rt_mutex
1180                  * waiters even though futex thinks there are, then the waiter
1181                  * is leaving. The entry needs to be removed from the list so a
1182                  * new futex_lock_pi() is not using this stale PI-state while
1183                  * the futex is available in user space again.
1184                  * There can be more than one task on its way out so it needs
1185                  * to retry.
1186                  */
1187                 rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
1188                 if (!rt_waiter) {
1189                         __futex_unqueue(top_waiter);
1190                         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
1191                         goto retry_hb;
1192                 }
1193
1194                 get_pi_state(pi_state);
1195                 spin_unlock(&hb->lock);
1196
1197                 /* drops pi_state->pi_mutex.wait_lock */
1198                 ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter);
1199
1200                 put_pi_state(pi_state);
1201
1202                 /*
1203                  * Success, we're done! No tricky corner cases.
1204                  */
1205                 if (!ret)
1206                         return ret;
1207                 /*
1208                  * The atomic access to the futex value generated a
1209                  * pagefault, so retry the user-access and the wakeup:
1210                  */
1211                 if (ret == -EFAULT)
1212                         goto pi_faulted;
1213                 /*
1214                  * An unconditional UNLOCK_PI op raced against a waiter
1215                  * setting the FUTEX_WAITERS bit. Try again.
1216                  */
1217                 if (ret == -EAGAIN)
1218                         goto pi_retry;
1219                 /*
1220                  * wake_futex_pi has detected invalid state. Tell user
1221                  * space.
1222                  */
1223                 return ret;
1224         }
1225
1226         /*
1227          * We have no kernel internal state, i.e. no waiters in the
1228          * kernel. Waiters which are about to queue themselves are stuck
1229          * on hb->lock. So we can safely ignore them. We preserve
1230          * neither the WAITERS bit nor the OWNER_DIED one. We are the
1231          * owner.
1232          */
1233         if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
1234                 spin_unlock(&hb->lock);
1235                 switch (ret) {
1236                 case -EFAULT:
1237                         goto pi_faulted;
1238
1239                 case -EAGAIN:
1240                         goto pi_retry;
1241
1242                 default:
1243                         WARN_ON_ONCE(1);
1244                         return ret;
1245                 }
1246         }
1247
1248         /*
1249          * If uval has changed, let user space handle it.
1250          */
1251         ret = (curval == uval) ? 0 : -EAGAIN;
1252
1253 out_unlock:
1254         spin_unlock(&hb->lock);
1255         return ret;
1256
1257 pi_retry:
1258         cond_resched();
1259         goto retry;
1260
1261 pi_faulted:
1262
1263         ret = fault_in_user_writeable(uaddr);
1264         if (!ret)
1265                 goto retry;
1266
1267         return ret;
1268 }
1269
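/*
 * A minimal user-space sketch (illustrative only, not part of this file;
 * helper names are made up): the full PI futex protocol that futex_lock_pi()
 * and futex_unlock_pi() back up. Both fast paths are plain atomics on the
 * futex word; the kernel is entered only when they fail. Assumes glibc's
 * gettid() and the raw futex(2) syscall.
 */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdatomic.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_futex_lock(_Atomic uint32_t *f)
{
        uint32_t expected = 0;

        /* Fast path: 0 -> TID. On failure the kernel queues us on the
         * rt_mutex and boosts the owner (futex_lock_pi()). */
        if (!atomic_compare_exchange_strong(f, &expected, (uint32_t)gettid()))
                sys_futex((uint32_t *)f, FUTEX_LOCK_PI, 0);
}

static void pi_futex_unlock(_Atomic uint32_t *f)
{
        uint32_t expected = gettid();

        /* Fast path: TID -> 0. With FUTEX_WAITERS (or FUTEX_OWNER_DIED) set
         * the cmpxchg fails and futex_unlock_pi() hands the lock over. */
        if (!atomic_compare_exchange_strong(f, &expected, 0))
                sys_futex((uint32_t *)f, FUTEX_UNLOCK_PI, 0);
}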