Linux 6.9-rc4
diff --git a/kernel/exit.c b/kernel/exit.c
index 1803efb2922ffaaa78ebf01e064c820cd8538b4e..41a12630cbbc9cd80b6b5a154041c514b46ad3fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,9 +48,9 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
-#include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
 #include <linux/kcov.h>
+#include <linux/kmsan.h>
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
-
+#include <linux/io_uring.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/sysfs.h>
+#include <linux/user_events.h>
 #include <linux/uaccess.h>
+
+#include <uapi/linux/wait.h>
+
 #include <asm/unistd.h>
-#include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+#include "exit.h"
+
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+       {
+               .procname       = "oops_limit",
+               .data           = &oops_limit,
+               .maxlen         = sizeof(oops_limit),
+               .mode           = 0644,
+               .proc_handler   = proc_douintvec,
+       },
+       { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+       register_sysctl_init("kernel", kern_exit_table);
+       return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+                              char *page)
+{
+       return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+       sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+       return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
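
For illustration only (not part of the diff): the sysctl table registered above surfaces as /proc/sys/kernel/oops_limit and the sysfs attribute as /sys/kernel/oops_count. Assuming the usual proc and sysfs mount points, a minimal user-space check could look like this:

    #include <stdio.h>

    /* Print one of the new knobs; each file holds a single decimal value. */
    static void show(const char *path)
    {
            char buf[32];
            FILE *f = fopen(path, "r");

            if (!f)
                    return;
            if (fgets(buf, sizeof(buf), f))
                    printf("%s = %s", path, buf);
            fclose(f);
    }

    int main(void)
    {
            show("/proc/sys/kernel/oops_limit");    /* writable, proc_douintvec */
            show("/sys/kernel/oops_count");         /* read-only, sysfs_emit */
            return 0;
    }

Writing 0 to oops_limit disables the panic entirely, since the limit is only enforced when it is non-zero (see the "&& limit" check in make_task_dead() further down).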
+
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
        nr_threads--;
@@ -82,7 +137,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
-       list_del_rcu(&p->thread_group);
        list_del_rcu(&p->thread_node);
 }
 
@@ -94,7 +148,7 @@ static void __exit_signal(struct task_struct *tsk)
        struct signal_struct *sig = tsk->signal;
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
-       struct tty_struct *uninitialized_var(tty);
+       struct tty_struct *tty;
        u64 utime, stime;
 
        sighand = rcu_dereference_check(tsk->sighand,
@@ -103,17 +157,8 @@ static void __exit_signal(struct task_struct *tsk)
 
 #ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
-       if (group_dead) {
+       if (group_dead)
                posix_cpu_timers_exit_group(tsk);
-       } else {
-               /*
-                * This can only happen if the caller is de_thread().
-                * FIXME: this is the temporary hack, we should teach
-                * posix-cpu-timers to handle this case correctly.
-                */
-               if (unlikely(has_group_leader_pid(tsk)))
-                       posix_cpu_timers_exit_group(tsk);
-       }
 #endif
 
        if (group_dead) {
@@ -125,7 +170,7 @@ static void __exit_signal(struct task_struct *tsk)
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
-                       wake_up_process(sig->group_exit_task);
+                       wake_up_process(sig->group_exec_task);
 
                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
@@ -177,27 +222,40 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
+       kprobe_flush_task(tsk);
+       rethook_flush_task(tsk);
        perf_event_delayed_put(tsk);
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
 }
 
+void put_task_struct_rcu_user(struct task_struct *task)
+{
+       if (refcount_dec_and_test(&task->rcu_users))
+               call_rcu(&task->rcu, delayed_put_task_struct);
+}
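
For orientation only (an editorial sketch, not code from the patch): with the rcu_users count above deferring the final free through call_rcu(), a reader no longer needs the task_rcu_dereference() helper that is removed further down. Inside a single RCU read-side section it can dereference the pointer and, if the task must outlive rcu_read_unlock(), take an ordinary reference, roughly:

    /*
     * Illustrative sketch only. @task_ptr stands for any pointer (such as
     * rq->curr) whose publisher holds one of the rcu_users references, so
     * delayed_put_task_struct() cannot run before a grace period elapses.
     */
    static struct task_struct *get_live_task(struct task_struct __rcu **task_ptr)
    {
            struct task_struct *p;

            rcu_read_lock();
            p = rcu_dereference(*task_ptr);
            if (p)
                    get_task_struct(p);     /* ->usage is still >= 1 here */
            rcu_read_unlock();

            return p;                       /* caller drops with put_task_struct() */
    }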
+
+void __weak release_thread(struct task_struct *dead_task)
+{
+}
 
 void release_task(struct task_struct *p)
 {
        struct task_struct *leader;
+       struct pid *thread_pid;
        int zap_leader;
 repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
-       atomic_dec(&__task_cred(p)->user->processes);
+       dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();
 
-       proc_flush_task(p);
+       cgroup_release(p);
 
        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
+       thread_pid = get_pid(p->thread_pid);
        __exit_signal(p);
 
        /*
@@ -220,80 +278,20 @@ repeat:
        }
 
        write_unlock_irq(&tasklist_lock);
-       cgroup_release(p);
+       seccomp_filter_release(p);
+       proc_flush_pid(thread_pid);
+       put_pid(thread_pid);
        release_thread(p);
-       call_rcu(&p->rcu, delayed_put_task_struct);
+       put_task_struct_rcu_user(p);
 
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
 }
 
-/*
- * Note that if this function returns a valid task_struct pointer (!NULL)
- * task->usage must remain >0 for the duration of the RCU critical section.
- */
-struct task_struct *task_rcu_dereference(struct task_struct **ptask)
-{
-       struct sighand_struct *sighand;
-       struct task_struct *task;
-
-       /*
-        * We need to verify that release_task() was not called and thus
-        * delayed_put_task_struct() can't run and drop the last reference
-        * before rcu_read_unlock(). We check task->sighand != NULL,
-        * but we can read the already freed and reused memory.
-        */
-retry:
-       task = rcu_dereference(*ptask);
-       if (!task)
-               return NULL;
-
-       probe_kernel_address(&task->sighand, sighand);
-
-       /*
-        * Pairs with atomic_dec_and_test() in put_task_struct(). If this task
-        * was already freed we can not miss the preceding update of this
-        * pointer.
-        */
-       smp_rmb();
-       if (unlikely(task != READ_ONCE(*ptask)))
-               goto retry;
-
-       /*
-        * We've re-checked that "task == *ptask", now we have two different
-        * cases:
-        *
-        * 1. This is actually the same task/task_struct. In this case
-        *    sighand != NULL tells us it is still alive.
-        *
-        * 2. This is another task which got the same memory for task_struct.
-        *    We can't know this of course, and we can not trust
-        *    sighand != NULL.
-        *
-        *    In this case we actually return a random value, but this is
-        *    correct.
-        *
-        *    If we return NULL - we can pretend that we actually noticed that
-        *    *ptask was updated when the previous task has exited. Or pretend
-        *    that probe_slab_address(&sighand) reads NULL.
-        *
-        *    If we return the new task (because sighand is not NULL for any
-        *    reason) - this is fine too. This (new) task can't go away before
-        *    another gp pass.
-        *
-        *    And note: We could even eliminate the false positive if re-read
-        *    task->sighand once again to avoid the falsely NULL. But this case
-        *    is very unlikely so we don't care.
-        */
-       if (!sighand)
-               return NULL;
-
-       return task;
-}
-
-void rcuwait_wake_up(struct rcuwait *w)
+int rcuwait_wake_up(struct rcuwait *w)
 {
+       int ret = 0;
        struct task_struct *task;
 
        rcu_read_lock();
@@ -301,7 +299,7 @@ void rcuwait_wake_up(struct rcuwait *w)
        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
-        * rcuwait_trywake() in the first place. Pairs with set_current_state()
+        * rcuwait_wake() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                WAKE
@@ -311,15 +309,14 @@ void rcuwait_wake_up(struct rcuwait *w)
         */
        smp_mb(); /* (B) */
 
-       /*
-        * Avoid using task_rcu_dereference() magic as long as we are careful,
-        * see comment in rcuwait_wait_event() regarding ->exit_state.
-        */
        task = rcu_dereference(w->task);
        if (task)
-               wake_up_process(task);
+               ret = wake_up_process(task);
        rcu_read_unlock();
+
+       return ret;
 }
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
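
For context (not part of the diff): the sleeping side this wake-up pairs with is rcuwait_wait_event(); a minimal sketch of the pairing, with hypothetical names (my_wait, my_done), might look like this:

    /* Illustrative only; my_wait and my_done are made-up examples. */
    static struct rcuwait my_wait;
    static bool my_done;

    static void consumer(void)      /* assume this runs before producer() */
    {
            rcuwait_init(&my_wait);
            /* Barrier (A) from the comment above lives inside this macro. */
            rcuwait_wait_event(&my_wait, READ_ONCE(my_done),
                               TASK_UNINTERRUPTIBLE);
    }

    static void producer(void)
    {
            WRITE_ONCE(my_done, true);
            /* smp_mb() (B) orders the condition store before the task load;
             * the new return value reports whether a sleeper was woken. */
            rcuwait_wake_up(&my_wait);
    }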
 
 /*
  * Determine if a process group is "orphaned", according to the POSIX
@@ -402,6 +399,49 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
        }
 }
 
+static void coredump_task_exit(struct task_struct *tsk)
+{
+       struct core_state *core_state;
+
+       /*
+        * Serialize with any possible pending coredump.
+        * We must hold siglock around checking core_state
+        * and setting PF_POSTCOREDUMP.  The core-inducing thread
+        * will increment ->nr_threads for each thread in the
+        * group without PF_POSTCOREDUMP set.
+        */
+       spin_lock_irq(&tsk->sighand->siglock);
+       tsk->flags |= PF_POSTCOREDUMP;
+       core_state = tsk->signal->core_state;
+       spin_unlock_irq(&tsk->sighand->siglock);
+
+       /* The vhost_worker does not participate in coredumps */
+       if (core_state &&
+           ((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) {
+               struct core_thread self;
+
+               self.task = current;
+               if (self.task->flags & PF_SIGNALED)
+                       self.next = xchg(&core_state->dumper.next, &self);
+               else
+                       self.task = NULL;
+               /*
+                * Implies mb(), the result of xchg() must be visible
+                * to core_state->dumper.
+                */
+               if (atomic_dec_and_test(&core_state->nr_threads))
+                       complete(&core_state->startup);
+
+               for (;;) {
+                       set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
+                       if (!self.task) /* see coredump_finish() */
+                               break;
+                       schedule();
+               }
+               __set_current_state(TASK_RUNNING);
+       }
+}
+
 #ifdef CONFIG_MEMCG
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
@@ -485,6 +525,7 @@ assign_new_owner:
                goto retry;
        }
        WRITE_ONCE(mm->owner, c);
+       lru_gen_migrate_mm(mm);
        task_unlock(c);
        put_task_struct(c);
 }
@@ -497,52 +538,33 @@ assign_new_owner:
 static void exit_mm(void)
 {
        struct mm_struct *mm = current->mm;
-       struct core_state *core_state;
 
-       mm_release(current, mm);
+       exit_mm_release(current, mm);
        if (!mm)
                return;
-       sync_mm_rss(mm);
-       /*
-        * Serialize with any possible pending coredump.
-        * We must hold mmap_sem around checking core_state
-        * and clearing tsk->mm.  The core-inducing thread
-        * will increment ->nr_threads for each thread in the
-        * group with ->mm != NULL.
-        */
-       down_read(&mm->mmap_sem);
-       core_state = mm->core_state;
-       if (core_state) {
-               struct core_thread self;
-
-               up_read(&mm->mmap_sem);
-
-               self.task = current;
-               self.next = xchg(&core_state->dumper.next, &self);
-               /*
-                * Implies mb(), the result of xchg() must be visible
-                * to core_state->dumper.
-                */
-               if (atomic_dec_and_test(&core_state->nr_threads))
-                       complete(&core_state->startup);
-
-               for (;;) {
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       if (!self.task) /* see coredump_finish() */
-                               break;
-                       freezable_schedule();
-               }
-               __set_current_state(TASK_RUNNING);
-               down_read(&mm->mmap_sem);
-       }
-       mmgrab(mm);
+       mmap_read_lock(mm);
+       mmgrab_lazy_tlb(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
+       /*
+        * When a thread stops operating on an address space, the loop
+        * in membarrier_private_expedited() may not observe that
+        * tsk->mm, and the loop in membarrier_global_expedited() may
+        * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+        * rq->membarrier_state, so those would not issue an IPI.
+        * Membarrier requires a memory barrier after accessing
+        * user-space memory, before clearing tsk->mm or the
+        * rq->membarrier_state.
+        */
+       smp_mb__after_spinlock();
+       local_irq_disable();
        current->mm = NULL;
-       up_read(&mm->mmap_sem);
+       membarrier_update_current_mm(NULL);
        enter_lazy_tlb(mm, current);
+       local_irq_enable();
        task_unlock(current);
+       mmap_read_unlock(mm);
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
@@ -579,10 +601,6 @@ static struct task_struct *find_child_reaper(struct task_struct *father,
        }
 
        write_unlock_irq(&tasklist_lock);
-       if (unlikely(pid_ns == &init_pid_ns)) {
-               panic("Attempted to kill init! exitcode=0x%08x\n",
-                       father->signal->group_exit_code ?: father->exit_code);
-       }
 
        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
@@ -685,8 +703,8 @@ static void forget_original_parent(struct task_struct *father,
        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                for_each_thread(p, t) {
-                       t->real_parent = reaper;
-                       BUG_ON((!t->ptrace) != (t->parent == father));
+                       RCU_INIT_POINTER(t->real_parent, reaper);
+                       BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        if (t->pdeath_signal)
@@ -720,6 +738,14 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);
 
+       tsk->exit_state = EXIT_ZOMBIE;
+       /*
+        * sub-thread or delay_group_leader(), wake up the
+        * PIDFD_THREAD waiters.
+        */
+       if (!thread_group_empty(tsk))
+               do_notify_pidfd(tsk);
+
        if (unlikely(tsk->ptrace)) {
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
@@ -733,13 +759,14 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
                autoreap = true;
        }
 
-       tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
-       if (tsk->exit_state == EXIT_DEAD)
+       if (autoreap) {
+               tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
+       }
 
        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
-               wake_up_process(tsk->signal->group_exit_task);
+               wake_up_process(tsk->signal->group_exec_task);
        write_unlock_irq(&tasklist_lock);
 
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -772,83 +799,57 @@ static void check_stack_usage(void)
 static inline void check_stack_usage(void) {}
 #endif
 
+static void synchronize_group_exit(struct task_struct *tsk, long code)
+{
+       struct sighand_struct *sighand = tsk->sighand;
+       struct signal_struct *signal = tsk->signal;
+
+       spin_lock_irq(&sighand->siglock);
+       signal->quick_threads--;
+       if ((signal->quick_threads == 0) &&
+           !(signal->flags & SIGNAL_GROUP_EXIT)) {
+               signal->flags = SIGNAL_GROUP_EXIT;
+               signal->group_exit_code = code;
+               signal->group_stop_count = 0;
+       }
+       spin_unlock_irq(&sighand->siglock);
+}
+
 void __noreturn do_exit(long code)
 {
        struct task_struct *tsk = current;
        int group_dead;
 
-       profile_task_exit(tsk);
-       kcov_task_exit(tsk);
+       WARN_ON(irqs_disabled());
 
-       WARN_ON(blk_needs_flush_plug(tsk));
+       synchronize_group_exit(tsk, code);
 
-       if (unlikely(in_interrupt()))
-               panic("Aiee, killing interrupt handler!");
-       if (unlikely(!tsk->pid))
-               panic("Attempted to kill the idle task!");
+       WARN_ON(tsk->plug);
 
-       /*
-        * If do_exit is called because this processes oopsed, it's possible
-        * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
-        * continuing. Amongst other possible reasons, this is to prevent
-        * mm_release()->clear_child_tid() from writing to a user-controlled
-        * kernel address.
-        */
-       set_fs(USER_DS);
+       kcov_task_exit(tsk);
+       kmsan_task_exit(tsk);
 
+       coredump_task_exit(tsk);
        ptrace_event(PTRACE_EVENT_EXIT, code);
+       user_events_exit(tsk);
 
-       validate_creds_for_do_exit(tsk);
-
-       /*
-        * We're taking recursive faults here in do_exit. Safest is to just
-        * leave this task alone and wait for reboot.
-        */
-       if (unlikely(tsk->flags & PF_EXITING)) {
-               pr_alert("Fixing recursive fault but reboot is needed!\n");
-               /*
-                * We can do this unlocked here. The futex code uses
-                * this flag just to verify whether the pi state
-                * cleanup has been done or not. In the worst case it
-                * loops once more. We pretend that the cleanup was
-                * done as there is no way to return. Either the
-                * OWNER_DIED bit is set by now or we push the blocked
-                * task into the wait for ever nirwana as well.
-                */
-               tsk->flags |= PF_EXITPIDONE;
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               schedule();
-       }
-
+       io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */
-       /*
-        * Ensure that all new tsk->pi_lock acquisitions must observe
-        * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
-        */
-       smp_mb();
-       /*
-        * Ensure that we must observe the pi_state in exit_mm() ->
-        * mm_release() -> exit_pi_state_list().
-        */
-       raw_spin_lock_irq(&tsk->pi_lock);
-       raw_spin_unlock_irq(&tsk->pi_lock);
 
-       if (unlikely(in_atomic())) {
-               pr_info("note: %s[%d] exited with preempt_count %d\n",
-                       current->comm, task_pid_nr(current),
-                       preempt_count());
-               preempt_count_set(PREEMPT_ENABLED);
-       }
-
-       /* sync mm's RSS info before statistics gathering */
-       if (tsk->mm)
-               sync_mm_rss(tsk->mm);
        acct_update_integrals(tsk);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
+               /*
+                * If the last thread of global init has exited, panic
+                * immediately to get a useable coredump.
+                */
+               if (unlikely(is_global_init(tsk)))
+                       panic("Attempted to kill init! exitcode=0x%08x\n",
+                               tsk->signal->group_exit_code ?: (int)code);
+
 #ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
-               exit_itimers(tsk->signal);
+               exit_itimers(tsk);
 #endif
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
@@ -876,7 +877,6 @@ void __noreturn do_exit(long code)
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);
-       exit_umh(tsk);
 
        /*
         * Flush inherited counters to the parent - before the parent
@@ -906,12 +906,6 @@ void __noreturn do_exit(long code)
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();
-       /*
-        * We can do this unlocked here. The futex code uses this flag
-        * just to verify whether the pi state cleanup has been done
-        * or not. In the worst case it loops once more.
-        */
-       tsk->flags |= PF_EXITPIDONE;
 
        if (tsk->io_context)
                exit_io_context(tsk);
@@ -922,7 +916,7 @@ void __noreturn do_exit(long code)
        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);
 
-       validate_creds_for_do_exit(tsk);
+       exit_task_stack_account(tsk);
 
        check_stack_usage();
        preempt_disable();
@@ -934,16 +928,66 @@ void __noreturn do_exit(long code)
        lockdep_free_task(tsk);
        do_task_dead();
 }
-EXPORT_SYMBOL_GPL(do_exit);
 
-void complete_and_exit(struct completion *comp, long code)
+void __noreturn make_task_dead(int signr)
 {
-       if (comp)
-               complete(comp);
+       /*
+        * Take the task off the cpu after something catastrophic has
+        * happened.
+        *
+        * We can get here from a kernel oops, sometimes with preemption off.
+        * Start by checking for critical errors.
+        * Then fix up important state like USER_DS and preemption.
+        * Then do everything else.
+        */
+       struct task_struct *tsk = current;
+       unsigned int limit;
+
+       if (unlikely(in_interrupt()))
+               panic("Aiee, killing interrupt handler!");
+       if (unlikely(!tsk->pid))
+               panic("Attempted to kill the idle task!");
+
+       if (unlikely(irqs_disabled())) {
+               pr_info("note: %s[%d] exited with irqs disabled\n",
+                       current->comm, task_pid_nr(current));
+               local_irq_enable();
+       }
+       if (unlikely(in_atomic())) {
+               pr_info("note: %s[%d] exited with preempt_count %d\n",
+                       current->comm, task_pid_nr(current),
+                       preempt_count());
+               preempt_count_set(PREEMPT_ENABLED);
+       }
+
+       /*
+        * Every time the system oopses, if the oops happens while a reference
+        * to an object was held, the reference leaks.
+        * If the oops doesn't also leak memory, repeated oopsing can cause
+        * reference counters to wrap around (if they're not using refcount_t).
+        * This means that repeated oopsing can make unexploitable-looking bugs
+        * exploitable through repeated oopsing.
+        * To make sure this can't happen, place an upper bound on how often the
+        * kernel may oops without panic().
+        */
+       limit = READ_ONCE(oops_limit);
+       if (atomic_inc_return(&oops_count) >= limit && limit)
+               panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+       /*
+        * We're taking recursive faults here in make_task_dead. Safest is to just
+        * leave this task alone and wait for reboot.
+        */
+       if (unlikely(tsk->flags & PF_EXITING)) {
+               pr_alert("Fixing recursive fault but reboot is needed!\n");
+               futex_exit_recursive(tsk);
+               tsk->exit_state = EXIT_DEAD;
+               refcount_inc(&tsk->rcu_users);
+               do_task_dead();
+       }
 
-       do_exit(code);
+       do_exit(signr);
 }
-EXPORT_SYMBOL(complete_and_exit);
 
 SYSCALL_DEFINE1(exit, int, error_code)
 {
@@ -954,22 +998,24 @@ SYSCALL_DEFINE1(exit, int, error_code)
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
  */
-void
+void __noreturn
 do_group_exit(int exit_code)
 {
        struct signal_struct *sig = current->signal;
 
-       BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
-       if (signal_group_exit(sig))
+       if (sig->flags & SIGNAL_GROUP_EXIT)
                exit_code = sig->group_exit_code;
-       else if (!thread_group_empty(current)) {
+       else if (sig->group_exec_task)
+               exit_code = 0;
+       else {
                struct sighand_struct *const sighand = current->sighand;
 
                spin_lock_irq(&sighand->siglock);
-               if (signal_group_exit(sig))
+               if (sig->flags & SIGNAL_GROUP_EXIT)
                        /* Another thread got here before we took the lock.  */
                        exit_code = sig->group_exit_code;
+               else if (sig->group_exec_task)
+                       exit_code = 0;
                else {
                        sig->group_exit_code = exit_code;
                        sig->flags = SIGNAL_GROUP_EXIT;
@@ -994,26 +1040,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
        return 0;
 }
 
-struct waitid_info {
-       pid_t pid;
-       uid_t uid;
-       int status;
-       int cause;
-};
-
-struct wait_opts {
-       enum pid_type           wo_type;
-       int                     wo_flags;
-       struct pid              *wo_pid;
-
-       struct waitid_info      *wo_info;
-       int                     wo_stat;
-       struct rusage           *wo_rusage;
-
-       wait_queue_entry_t              child_wait;
-       int                     notask_error;
-};
-
 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
        return  wo->wo_type == PIDTYPE_MAX ||
@@ -1064,7 +1090,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                return 0;
 
        if (unlikely(wo->wo_flags & WNOWAIT)) {
-               status = p->exit_code;
+               status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+                       ? p->signal->group_exit_code : p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
@@ -1106,18 +1133,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
-                * psig->stats_lock also protects us from our sub-theads
-                * which can reap other children at the same time. Until
-                * we change k_getrusage()-like users to rely on this lock
-                * we have to take ->siglock as well.
+                * psig->stats_lock also protects us from our sub-threads
+                * which can reap other children at the same time.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
-               spin_lock_irq(&current->sighand->siglock);
-               write_seqlock(&psig->stats_lock);
+               write_seqlock_irq(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1140,8 +1164,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
-               write_sequnlock(&psig->stats_lock);
-               spin_unlock_irq(&current->sighand->siglock);
+               write_sequnlock_irq(&psig->stats_lock);
        }
 
        if (wo->wo_rusage)
@@ -1476,6 +1499,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
        return 0;
 }
 
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+       if (!eligible_pid(wo, p))
+               return false;
+
+       if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+               return false;
+
+       return true;
+}
+
 static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
 {
@@ -1483,32 +1517,62 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                                child_wait);
        struct task_struct *p = key;
 
-       if (!eligible_pid(wo, p))
-               return 0;
-
-       if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
-               return 0;
+       if (pid_child_should_wake(wo, p))
+               return default_wake_function(wait, mode, sync, key);
 
-       return default_wake_function(wait, mode, sync, key);
+       return 0;
 }
 
 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
 {
        __wake_up_sync_key(&parent->signal->wait_chldexit,
-                               TASK_INTERRUPTIBLE, 1, p);
+                          TASK_INTERRUPTIBLE, p);
 }
 
-static long do_wait(struct wait_opts *wo)
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+                                struct task_struct *target)
 {
-       struct task_struct *tsk;
+       struct task_struct *parent =
+               !ptrace ? target->real_parent : target->parent;
+
+       return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+                                    same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+       bool ptrace;
+       struct task_struct *target;
        int retval;
 
-       trace_sched_process_wait(wo->wo_pid);
+       ptrace = false;
+       target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+       if (target && is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       ptrace = true;
+       target = pid_task(wo->wo_pid, PIDTYPE_PID);
+       if (target && target->ptrace &&
+           is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       return 0;
+}
+
+long __do_wait(struct wait_opts *wo)
+{
+       long retval;
 
-       init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
-       wo->child_wait.private = current;
-       add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
@@ -1517,48 +1581,73 @@ repeat:
         */
        wo->notask_error = -ECHILD;
        if ((wo->wo_type < PIDTYPE_MAX) &&
-          (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+          (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                goto notask;
 
-       set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
-       tsk = current;
-       do {
-               retval = do_wait_thread(wo, tsk);
-               if (retval)
-                       goto end;
 
-               retval = ptrace_do_wait(wo, tsk);
+       if (wo->wo_type == PIDTYPE_PID) {
+               retval = do_wait_pid(wo);
                if (retval)
-                       goto end;
+                       return retval;
+       } else {
+               struct task_struct *tsk = current;
 
-               if (wo->wo_flags & __WNOTHREAD)
-                       break;
-       } while_each_thread(current, tsk);
+               do {
+                       retval = do_wait_thread(wo, tsk);
+                       if (retval)
+                               return retval;
+
+                       retval = ptrace_do_wait(wo, tsk);
+                       if (retval)
+                               return retval;
+
+                       if (wo->wo_flags & __WNOTHREAD)
+                               break;
+               } while_each_thread(current, tsk);
+       }
        read_unlock(&tasklist_lock);
 
 notask:
        retval = wo->notask_error;
-       if (!retval && !(wo->wo_flags & WNOHANG)) {
-               retval = -ERESTARTSYS;
-               if (!signal_pending(current)) {
-                       schedule();
-                       goto repeat;
-               }
-       }
-end:
+       if (!retval && !(wo->wo_flags & WNOHANG))
+               return -ERESTARTSYS;
+
+       return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+       int retval;
+
+       trace_sched_process_wait(wo->wo_pid);
+
+       init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+       wo->child_wait.private = current;
+       add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+       do {
+               set_current_state(TASK_INTERRUPTIBLE);
+               retval = __do_wait(wo);
+               if (retval != -ERESTARTSYS)
+                       break;
+               if (signal_pending(current))
+                       break;
+               schedule();
+       } while (1);
+
        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
 }
 
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
-                         int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+                         struct waitid_info *infop, int options,
+                         struct rusage *ru)
 {
-       struct wait_opts wo;
+       unsigned int f_flags = 0;
        struct pid *pid = NULL;
        enum pid_type type;
-       long ret;
 
        if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
                        __WNOTHREAD|__WCLONE|__WALL))
@@ -1574,27 +1663,59 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
                type = PIDTYPE_PID;
                if (upid <= 0)
                        return -EINVAL;
+
+               pid = find_get_pid(upid);
                break;
        case P_PGID:
                type = PIDTYPE_PGID;
-               if (upid <= 0)
+               if (upid < 0)
+                       return -EINVAL;
+
+               if (upid)
+                       pid = find_get_pid(upid);
+               else
+                       pid = get_task_pid(current, PIDTYPE_PGID);
+               break;
+       case P_PIDFD:
+               type = PIDTYPE_PID;
+               if (upid < 0)
                        return -EINVAL;
+
+               pid = pidfd_get_pid(upid, &f_flags);
+               if (IS_ERR(pid))
+                       return PTR_ERR(pid);
+
                break;
        default:
                return -EINVAL;
        }
 
-       if (type < PIDTYPE_MAX)
-               pid = find_get_pid(upid);
+       wo->wo_type     = type;
+       wo->wo_pid      = pid;
+       wo->wo_flags    = options;
+       wo->wo_info     = infop;
+       wo->wo_rusage   = ru;
+       if (f_flags & O_NONBLOCK)
+               wo->wo_flags |= WNOHANG;
+
+       return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+                         int options, struct rusage *ru)
+{
+       struct wait_opts wo;
+       long ret;
+
+       ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+       if (ret)
+               return ret;
 
-       wo.wo_type      = type;
-       wo.wo_pid       = pid;
-       wo.wo_flags     = options;
-       wo.wo_info      = infop;
-       wo.wo_rusage    = ru;
        ret = do_wait(&wo);
+       if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
+               ret = -EAGAIN;
 
-       put_pid(pid);
+       put_pid(wo.wo_pid);
        return ret;
 }
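
For reference (not part of the diff): the new P_PIDFD branch above is what user space reaches via waitid(2) on a pid file descriptor; a minimal, illustrative caller (assuming a libc that defines P_PIDFD, otherwise it comes from <linux/wait.h>) could be:

    #define _GNU_SOURCE
    #include <sys/wait.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    int main(void)
    {
            pid_t pid = fork();

            if (pid < 0)
                    return 1;
            if (pid == 0)
                    _exit(42);

            /* No glibc wrapper assumed for pidfd_open(2). */
            int pidfd = (int)syscall(SYS_pidfd_open, pid, 0);
            siginfo_t info = { 0 };

            /* Routed through pidfd_get_pid() above; a pidfd opened with
             * O_NONBLOCK would additionally imply WNOHANG. */
            if (waitid(P_PIDFD, pidfd, &info, WEXITED) == 0)
                    printf("child %d exited with status %d\n",
                           info.si_pid, info.si_status);

            close(pidfd);
            return 0;
    }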
 
@@ -1615,7 +1736,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
        if (!infop)
                return err;
 
-       if (!user_access_begin(infop, sizeof(*infop)))
+       if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
 
        unsafe_put_user(signo, &infop->si_signo, Efault);
@@ -1624,10 +1745,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
-       user_access_end();
+       user_write_access_end();
        return err;
 Efault:
-       user_access_end();
+       user_write_access_end();
        return -EFAULT;
 }
 
@@ -1674,6 +1795,22 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
        return ret;
 }
 
+int kernel_wait(pid_t pid, int *stat)
+{
+       struct wait_opts wo = {
+               .wo_type        = PIDTYPE_PID,
+               .wo_pid         = find_get_pid(pid),
+               .wo_flags       = WEXITED,
+       };
+       int ret;
+
+       ret = do_wait(&wo);
+       if (ret > 0 && wo.wo_stat)
+               *stat = wo.wo_stat;
+       put_pid(wo.wo_pid);
+       return ret;
+}
+
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
                int, options, struct rusage __user *, ru)
 {
@@ -1742,7 +1879,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
        if (!infop)
                return err;
 
-       if (!user_access_begin(infop, sizeof(*infop)))
+       if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;
 
        unsafe_put_user(signo, &infop->si_signo, Efault);
@@ -1751,15 +1888,22 @@ COMPAT_SYSCALL_DEFINE5(waitid,
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
-       user_access_end();
+       user_write_access_end();
        return err;
 Efault:
-       user_access_end();
+       user_write_access_end();
        return -EFAULT;
 }
 #endif
 
-__weak void abort(void)
+/*
+ * This needs to be __function_aligned as GCC implicitly makes any
+ * implementation of abort() cold and drops alignment specified by
+ * -falign-functions=N.
+ *
+ * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
+ */
+__weak __function_aligned void abort(void)
 {
        BUG();