perf/core: Fix race between close() and fork()
authorPeter Zijlstra <peterz@infradead.org>
Sat, 13 Jul 2019 09:21:25 +0000 (11:21 +0200)
committerIngo Molnar <mingo@kernel.org>
Sat, 13 Jul 2019 09:21:25 +0000 (11:21 +0200)
Syzcaller reported the following Use-after-Free bug:

close() clone()

  copy_process()
    perf_event_init_task()
      perf_event_init_context()
        mutex_lock(parent_ctx->mutex)
inherit_task_group()
  inherit_group()
    inherit_event()
      mutex_lock(event->child_mutex)
      // expose event on child list
      list_add_tail()
      mutex_unlock(event->child_mutex)
        mutex_unlock(parent_ctx->mutex)

    ...
    goto bad_fork_*

  bad_fork_cleanup_perf:
    perf_event_free_task()

  perf_release()
    perf_event_release_kernel()
      list_for_each_entry()
mutex_lock(ctx->mutex)
mutex_lock(event->child_mutex)
// event is from the failing inherit
// on the other CPU
perf_remove_from_context()
list_move()
mutex_unlock(event->child_mutex)
mutex_unlock(ctx->mutex)

      mutex_lock(ctx->mutex)
      list_for_each_entry_safe()
        // event already stolen
      mutex_unlock(ctx->mutex)

    delayed_free_task()
      free_task()

     list_for_each_entry_safe()
       list_del()
       free_event()
         _free_event()
   // and so event->hw.target
   // is the already freed failed clone()
   if (event->hw.target)
     put_task_struct(event->hw.target)
       // WHOOPSIE, already quite dead

Which puts the lie to the the comment on perf_event_free_task():
'unexposed, unused context' not so much.

Which is a 'fun' confluence of fail; copy_process() doing an
unconditional free_task() and not respecting refcounts, and perf having
creative locking. In particular:

  82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp")

seems to have overlooked this 'fun' parade.

Solve it by using the fact that detached events still have a reference
count on their (previous) context. With this perf_event_free_task()
can detect when events have escaped and wait for their destruction.

Debugged-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Reported-by: syzbot+a24c397a29ad22d86c98@syzkaller.appspotmail.com
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/events/core.c

index 785d708f8553a001227a3259264d7b8fdccbcee8..5dd19bedbf64747d27332ba9b9b029d0e0cec510 100644 (file)
@@ -4465,12 +4465,20 @@ static void _free_event(struct perf_event *event)
        if (event->destroy)
                event->destroy(event);
 
-       if (event->ctx)
-               put_ctx(event->ctx);
-
+       /*
+        * Must be after ->destroy(), due to uprobe_perf_close() using
+        * hw.target.
+        */
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
+       /*
+        * perf_event_free_task() relies on put_ctx() being 'last', in particular
+        * all task references must be cleaned up.
+        */
+       if (event->ctx)
+               put_ctx(event->ctx);
+
        exclusive_event_destroy(event);
        module_put(event->pmu->module);
 
@@ -4650,8 +4658,17 @@ again:
        mutex_unlock(&event->child_mutex);
 
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               void *var = &child->ctx->refcount;
+
                list_del(&child->child_list);
                free_event(child);
+
+               /*
+                * Wake any perf_event_free_task() waiting for this event to be
+                * freed.
+                */
+               smp_mb(); /* pairs with wait_var_event() */
+               wake_up_var(var);
        }
 
 no_ctx:
@@ -11527,11 +11544,11 @@ static void perf_free_event(struct perf_event *event,
 }
 
 /*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
  *
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
  */
 void perf_event_free_task(struct task_struct *task)
 {
@@ -11561,7 +11578,23 @@ void perf_event_free_task(struct task_struct *task)
                        perf_free_event(event, ctx);
 
                mutex_unlock(&ctx->mutex);
-               put_ctx(ctx);
+
+               /*
+                * perf_event_release_kernel() could've stolen some of our
+                * child events and still have them on its free_list. In that
+                * case we must wait for these events to have been freed (in
+                * particular all their references to this task must've been
+                * dropped).
+                *
+                * Without this copy_process() will unconditionally free this
+                * task (irrespective of its reference count) and
+                * _free_event()'s put_task_struct(event->hw.target) will be a
+                * use-after-free.
+                *
+                * Wait for all events to drop their context reference.
+                */
+               wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+               put_ctx(ctx); /* must be last */
        }
 }