diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index a0be5d05c7f08187667c91c7d0886843df52225c..5caa1f566553843911ffdf2edafd32ee70277ea8 100644
@@ -14,6 +14,7 @@
 
 #define pr_fmt(fmt) "stackdepot: " fmt
 
+#include <linux/debugfs.h>
 #include <linux/gfp.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
@@ -21,8 +22,9 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
-#include <linux/percpu.h>
 #include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
 #include <linux/refcount.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -67,12 +69,28 @@ union handle_parts {
 };
 
 struct stack_record {
-       struct list_head list;          /* Links in hash table or freelist */
+       struct list_head hash_list;     /* Links in the hash table */
        u32 hash;                       /* Hash in hash table */
        u32 size;                       /* Number of stored frames */
-       union handle_parts handle;
+       union handle_parts handle;      /* Constant after initialization */
        refcount_t count;
-       unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];    /* Frames */
+       union {
+               unsigned long entries[CONFIG_STACKDEPOT_MAX_FRAMES];    /* Frames */
+               struct {
+                       /*
+                        * An important invariant of the implementation is that
+                        * a stack record is placed onto the freelist only if its
+                        * refcount is zero. Because stack records with a zero
+                        * refcount are never considered valid, it is safe to
+                        * union @entries and the freelist management state below.
+                        * Conversely, as soon as an entry is off the freelist
+                        * and its refcount becomes non-zero, the fields below must
+                        * not be accessed until it is placed back on the freelist.
+                        */
+                       struct list_head free_list;     /* Links in the freelist */
+                       unsigned long rcu_state;        /* RCU cookie */
+               };
+       };
 };
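
The union above is the core of the eviction scheme: a record's frame storage doubles as freelist bookkeeping, but only while the refcount is zero. A minimal standalone sketch of the same aliasing invariant, using hypothetical names:

	struct object {
		refcount_t count;			/* Zero means: on the freelist */
		union {
			unsigned long payload[8];	/* Valid only while count > 0 */
			struct {
				struct list_head free_list;	/* Valid only while count == 0 */
				unsigned long cookie;		/* RCU grace-period cookie */
			};
		};
	};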
 
 #define DEPOT_STACK_RECORD_SIZE \
@@ -112,8 +130,25 @@ static LIST_HEAD(free_stacks);
  * yet allocated or if the limit on the number of pools is reached.
  */
 static bool new_pool_required = true;
-/* Lock that protects the variables above. */
-static DEFINE_RWLOCK(pool_rwlock);
+/* The lock must be held when performing pool or freelist modifications. */
+static DEFINE_RAW_SPINLOCK(pool_lock);
+
+/* Statistics counters for debugfs. */
+enum depot_counter_id {
+       DEPOT_COUNTER_ALLOCS,
+       DEPOT_COUNTER_FREES,
+       DEPOT_COUNTER_INUSE,
+       DEPOT_COUNTER_FREELIST_SIZE,
+       DEPOT_COUNTER_COUNT,
+};
+static long counters[DEPOT_COUNTER_COUNT];
+static const char *const counter_names[] = {
+       [DEPOT_COUNTER_ALLOCS]          = "allocations",
+       [DEPOT_COUNTER_FREES]           = "frees",
+       [DEPOT_COUNTER_INUSE]           = "in_use",
+       [DEPOT_COUNTER_FREELIST_SIZE]   = "freelist_size",
+};
+static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
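
The counter table pairs an enum index with a designated-initializer name array, and the static_assert turns a forgotten name into a build failure rather than a NULL string at runtime. The same idiom in miniature, with hypothetical names:

	enum example_id { EXAMPLE_FOO, EXAMPLE_BAR, EXAMPLE_COUNT };
	static const char *const example_names[] = {
		[EXAMPLE_FOO]	= "foo",
		[EXAMPLE_BAR]	= "bar",
	};
	/* Fails to build if a new enum entry lacks a matching name. */
	static_assert(ARRAY_SIZE(example_names) == EXAMPLE_COUNT);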
 
 static int __init disable_stack_depot(char *str)
 {
@@ -258,14 +293,15 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(stack_depot_init);
 
-/* Initializes a stack depol pool. */
+/*
+ * Initializes a new stack depot @pool, releases all its entries to the
+ * freelist, and updates the list of pools.
+ */
 static void depot_init_pool(void *pool)
 {
        int offset;
 
-       lockdep_assert_held_write(&pool_rwlock);
-
-       WARN_ON(!list_empty(&free_stacks));
+       lockdep_assert_held(&pool_lock);
 
        /* Initialize handles and link stack records into the freelist. */
        for (offset = 0; offset <= DEPOT_POOL_SIZE - DEPOT_STACK_RECORD_SIZE;
@@ -276,18 +312,36 @@ static void depot_init_pool(void *pool)
                stack->handle.offset = offset >> DEPOT_STACK_ALIGN;
                stack->handle.extra = 0;
 
-               list_add(&stack->list, &free_stacks);
+               /*
+                * Stack traces of size 0 are never saved, so we can simply
+                * use the size field as an indicator of whether this is a
+                * new, unused stack record in the freelist.
+                */
+               stack->size = 0;
+
+               INIT_LIST_HEAD(&stack->hash_list);
+               /*
+                * Add to the freelist front to prioritize never-used entries:
+                * required in case there are entries in the freelist, but their
+                * RCU cookie still belongs to the current RCU grace period
+                * (there can still be concurrent readers).
+                */
+               list_add(&stack->free_list, &free_stacks);
+               counters[DEPOT_COUNTER_FREELIST_SIZE]++;
        }
 
        /* Save reference to the pool to be used by depot_fetch_stack(). */
        stack_pools[pools_num] = pool;
-       pools_num++;
+
+       /* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
+       WRITE_ONCE(pools_num, pools_num + 1);
+       ASSERT_EXCLUSIVE_WRITER(pools_num);
 }
 
 /* Keeps the preallocated memory to be used for a new stack depot pool. */
 static void depot_keep_new_pool(void **prealloc)
 {
-       lockdep_assert_held_write(&pool_rwlock);
+       lockdep_assert_held(&pool_lock);
 
        /*
         * If a new pool is already saved or the maximum number of
@@ -310,17 +364,16 @@ static void depot_keep_new_pool(void **prealloc)
         * number of pools is reached. In either case, take note that
         * keeping another pool is not required.
         */
-       new_pool_required = false;
+       WRITE_ONCE(new_pool_required, false);
 }
 
-/* Updates references to the current and the next stack depot pools. */
-static bool depot_update_pools(void **prealloc)
+/*
+ * Try to initialize a new stack depot pool from either a previous or the
+ * current pre-allocation, and release all its entries to the freelist.
+ */
+static bool depot_try_init_pool(void **prealloc)
 {
-       lockdep_assert_held_write(&pool_rwlock);
-
-       /* Check if we still have objects in the freelist. */
-       if (!list_empty(&free_stacks))
-               goto out_keep_prealloc;
+       lockdep_assert_held(&pool_lock);
 
        /* Check if we have a new pool saved and use it. */
        if (new_pool) {
@@ -329,10 +382,9 @@ static bool depot_update_pools(void **prealloc)
 
                /* Take note that we might need a new new_pool. */
                if (pools_num < DEPOT_MAX_POOLS)
-                       new_pool_required = true;
+                       WRITE_ONCE(new_pool_required, true);
 
-               /* Try keeping the preallocated memory for new_pool. */
-               goto out_keep_prealloc;
+               return true;
        }
 
        /* Bail out if we reached the pool limit. */
@@ -349,12 +401,32 @@ static bool depot_update_pools(void **prealloc)
        }
 
        return false;
+}
+
+/* Try to find the next free usable entry from the freelist. */
+static struct stack_record *depot_pop_free(void)
+{
+       struct stack_record *stack;
+
+       lockdep_assert_held(&pool_lock);
+
+       if (list_empty(&free_stacks))
+               return NULL;
+
+       /*
+        * We maintain the invariant that the elements in front are least
+        * recently used, and are therefore more likely to be associated with an
+        * RCU grace period in the past. Consequently, it is sufficient to
+        * check only the first entry.
+        */
+       stack = list_first_entry(&free_stacks, struct stack_record, free_list);
+       if (stack->size && !poll_state_synchronize_rcu(stack->rcu_state))
+               return NULL;
+
+       list_del(&stack->free_list);
+       counters[DEPOT_COUNTER_FREELIST_SIZE]--;
 
-out_keep_prealloc:
-       /* Keep the preallocated memory for a new pool if required. */
-       if (*prealloc)
-               depot_keep_new_pool(prealloc);
-       return true;
+       return stack;
 }
 
 /* Allocates a new stack in a stack depot pool. */
@@ -363,19 +435,22 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
 {
        struct stack_record *stack;
 
-       lockdep_assert_held_write(&pool_rwlock);
+       lockdep_assert_held(&pool_lock);
 
-       /* Update current and new pools if required and possible. */
-       if (!depot_update_pools(prealloc))
+       /* This should already be checked by public API entry points. */
+       if (WARN_ON_ONCE(!size))
                return NULL;
 
        /* Check if we have a stack record to save the stack trace. */
-       if (list_empty(&free_stacks))
-               return NULL;
-
-       /* Get and unlink the first entry from the freelist. */
-       stack = list_first_entry(&free_stacks, struct stack_record, list);
-       list_del(&stack->list);
+       stack = depot_pop_free();
+       if (!stack) {
+               /* No usable entries on the freelist - try to refill the freelist. */
+               if (!depot_try_init_pool(prealloc))
+                       return NULL;
+               stack = depot_pop_free();
+               if (WARN_ON(!stack))
+                       return NULL;
+       }
 
        /* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
        if (size > CONFIG_STACKDEPOT_MAX_FRAMES)
@@ -394,38 +469,80 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc)
         */
        kmsan_unpoison_memory(stack, DEPOT_STACK_RECORD_SIZE);
 
+       counters[DEPOT_COUNTER_ALLOCS]++;
+       counters[DEPOT_COUNTER_INUSE]++;
        return stack;
 }
 
 static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
 {
+       const int pools_num_cached = READ_ONCE(pools_num);
        union handle_parts parts = { .handle = handle };
        void *pool;
        size_t offset = parts.offset << DEPOT_STACK_ALIGN;
        struct stack_record *stack;
 
-       lockdep_assert_held(&pool_rwlock);
+       lockdep_assert_not_held(&pool_lock);
 
-       if (parts.pool_index > pools_num) {
+       if (parts.pool_index >= pools_num_cached) {
                WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
-                    parts.pool_index, pools_num, handle);
+                    parts.pool_index, pools_num_cached, handle);
                return NULL;
        }
 
        pool = stack_pools[parts.pool_index];
-       if (!pool)
+       if (WARN_ON(!pool))
                return NULL;
 
        stack = pool + offset;
+       if (WARN_ON(!refcount_read(&stack->count)))
+               return NULL;
+
        return stack;
 }
 
 /* Links stack into the freelist. */
 static void depot_free_stack(struct stack_record *stack)
 {
-       lockdep_assert_held_write(&pool_rwlock);
+       unsigned long flags;
+
+       lockdep_assert_not_held(&pool_lock);
 
-       list_add(&stack->list, &free_stacks);
+       raw_spin_lock_irqsave(&pool_lock, flags);
+       printk_deferred_enter();
+
+       /*
+        * Remove the entry from the hash list. Concurrent list traversal may
+        * still observe the entry, but since the refcount is zero, this entry
+        * will no longer be considered valid.
+        */
+       list_del_rcu(&stack->hash_list);
+
+       /*
+        * Due to being used from constrained contexts such as the allocators,
+        * NMI, or even RCU itself, stack depot cannot rely on primitives that
+        * would sleep (such as synchronize_rcu()) or recursively call into
+        * stack depot again (such as call_rcu()).
+        *
+        * Instead, get an RCU cookie, so that we can ensure this entry isn't
+        * moved onto another list until the next grace period, and concurrent
+        * RCU list traversal remains safe.
+        */
+       stack->rcu_state = get_state_synchronize_rcu();
+
+       /*
+        * Add the entry to the freelist tail, so that older entries are
+        * considered first - their RCU cookie is more likely to no longer be
+        * associated with the current grace period.
+        */
+       list_add_tail(&stack->free_list, &free_stacks);
+
+       counters[DEPOT_COUNTER_FREELIST_SIZE]++;
+       counters[DEPOT_COUNTER_FREES]++;
+       counters[DEPOT_COUNTER_INUSE]--;
+
+       printk_deferred_exit();
+       raw_spin_unlock_irqrestore(&pool_lock, flags);
 }
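
Together with the poll in depot_pop_free(), this cookie forms a non-blocking substitute for synchronize_rcu()/call_rcu(). Both halves of the pattern, reduced to a sketch that reuses the hypothetical struct object from earlier (a global freelist is assumed, with the appropriate locking around both helpers):

	static LIST_HEAD(object_freelist);

	static void retire_object(struct object *obj)
	{
		/* Snapshot the current grace-period state; never blocks. */
		obj->cookie = get_state_synchronize_rcu();
		list_add_tail(&obj->free_list, &object_freelist);
	}

	static struct object *reuse_object(void)
	{
		struct object *obj = list_first_entry_or_null(&object_freelist,
							      struct object, free_list);

		/* Reuse only after a full grace period has passed since retirement. */
		if (!obj || !poll_state_synchronize_rcu(obj->cookie))
			return NULL;
		list_del(&obj->free_list);
		return obj;
	}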
 
 /* Calculates the hash for a stack. */
@@ -453,22 +570,52 @@ int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
 
 /* Finds a stack in a bucket of the hash table. */
 static inline struct stack_record *find_stack(struct list_head *bucket,
-                                            unsigned long *entries, int size,
-                                            u32 hash)
+                                             unsigned long *entries, int size,
+                                             u32 hash, depot_flags_t flags)
 {
-       struct list_head *pos;
-       struct stack_record *found;
+       struct stack_record *stack, *ret = NULL;
+
+       /*
+        * Stack depot may be used from instrumentation that instruments RCU or
+        * tracing itself; use a variant that does not call into RCU and cannot be
+        * traced.
+        *
+        * Note: Such use cases must take care when using refcounting to evict
+        * unused entries, because the stack record free-then-reuse code paths
+        * do call into RCU.
+        */
+       rcu_read_lock_sched_notrace();
 
-       lockdep_assert_held(&pool_rwlock);
+       list_for_each_entry_rcu(stack, bucket, hash_list) {
+               if (stack->hash != hash || stack->size != size)
+                       continue;
+
+               /*
+                * This may race with depot_free_stack() accessing the freelist
+                * management state unioned with @entries. The refcount is zero
+                * in that case and the below refcount_inc_not_zero() will fail.
+                */
+               if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
+                       continue;
+
+               /*
+                * Try to increment refcount. If this succeeds, the stack record
+                * is valid and has not yet been freed.
+                *
+                * If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
+                * to then call stack_depot_put() later, and we can assume that
+                * a stack record is never placed back on the freelist.
+                */
+               if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
+                       continue;
 
-       list_for_each(pos, bucket) {
-               found = list_entry(pos, struct stack_record, list);
-               if (found->hash == hash &&
-                   found->size == size &&
-                   !stackdepot_memcmp(entries, found->entries, size))
-                       return found;
+               ret = stack;
+               break;
        }
-       return NULL;
+
+       rcu_read_unlock_sched_notrace();
+
+       return ret;
 }
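
find_stack() is thus a textbook RCU lookup-with-refcount: traverse the bucket without the lock and let refcount_inc_not_zero() alone decide validity. In generic form, again with the hypothetical struct object (assumed here to carry key and node fields):

	static struct object *lookup_object(struct list_head *bucket, u32 key)
	{
		struct object *obj, *ret = NULL;

		rcu_read_lock_sched_notrace();
		list_for_each_entry_rcu(obj, bucket, node) {
			/* A zero refcount means the entry is being recycled. */
			if (obj->key == key && refcount_inc_not_zero(&obj->count)) {
				ret = obj;	/* Pinned; caller drops the reference. */
				break;
			}
		}
		rcu_read_unlock_sched_notrace();

		return ret;
	}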
 
 depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
@@ -482,7 +629,6 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
        struct page *page = NULL;
        void *prealloc = NULL;
        bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
-       bool need_alloc = false;
        unsigned long flags;
        u32 hash;
 
@@ -505,31 +651,16 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
        hash = hash_stack(entries, nr_entries);
        bucket = &stack_table[hash & stack_hash_mask];
 
-       read_lock_irqsave(&pool_rwlock, flags);
-       printk_deferred_enter();
-
-       /* Fast path: look the stack trace up without full locking. */
-       found = find_stack(bucket, entries, nr_entries, hash);
-       if (found) {
-               if (depot_flags & STACK_DEPOT_FLAG_GET)
-                       refcount_inc(&found->count);
-               printk_deferred_exit();
-               read_unlock_irqrestore(&pool_rwlock, flags);
+       /* Fast path: look the stack trace up without locking. */
+       found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
+       if (found)
                goto exit;
-       }
-
-       /* Take note if another stack pool needs to be allocated. */
-       if (new_pool_required)
-               need_alloc = true;
-
-       printk_deferred_exit();
-       read_unlock_irqrestore(&pool_rwlock, flags);
 
        /*
         * Allocate memory for a new pool if required now:
         * we won't be able to do that under the lock.
         */
-       if (unlikely(can_alloc && need_alloc)) {
+       if (unlikely(can_alloc && READ_ONCE(new_pool_required))) {
                /*
                 * Zero out zone modifiers, as we don't have specific zone
                 * requirements. Keep the flags related to allocation in atomic
@@ -543,31 +674,36 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
                        prealloc = page_address(page);
        }
 
-       write_lock_irqsave(&pool_rwlock, flags);
+       raw_spin_lock_irqsave(&pool_lock, flags);
        printk_deferred_enter();
 
-       found = find_stack(bucket, entries, nr_entries, hash);
+       /* Search again under the lock, to avoid concurrently inserted duplicates. */
+       found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
        if (!found) {
                struct stack_record *new =
                        depot_alloc_stack(entries, nr_entries, hash, &prealloc);
 
                if (new) {
-                       list_add(&new->list, bucket);
+                       /*
+                        * This releases the stack record into the bucket and
+                        * makes it visible to readers in find_stack().
+                        */
+                       list_add_rcu(&new->hash_list, bucket);
                        found = new;
                }
-       } else {
-               if (depot_flags & STACK_DEPOT_FLAG_GET)
-                       refcount_inc(&found->count);
+       }
+
+       if (prealloc) {
                /*
-                * Stack depot already contains this stack trace, but let's
-                * keep the preallocated memory for future.
+                * Either stack depot already contains this stack trace, or
+                * depot_alloc_stack() did not consume the preallocated memory.
+                * Try to keep the preallocated memory for future use.
                 */
-               if (prealloc)
-                       depot_keep_new_pool(&prealloc);
+               depot_keep_new_pool(&prealloc);
        }
 
        printk_deferred_exit();
-       write_unlock_irqrestore(&pool_rwlock, flags);
+       raw_spin_unlock_irqrestore(&pool_lock, flags);
 exit:
        if (prealloc) {
                /* Stack depot didn't use this memory, free it. */
@@ -592,7 +728,6 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
                               unsigned long **entries)
 {
        struct stack_record *stack;
-       unsigned long flags;
 
        *entries = NULL;
        /*
@@ -604,13 +739,13 @@ unsigned int stack_depot_fetch(depot_stack_handle_t handle,
        if (!handle || stack_depot_disabled)
                return 0;
 
-       read_lock_irqsave(&pool_rwlock, flags);
-       printk_deferred_enter();
-
        stack = depot_fetch_stack(handle);
-
-       printk_deferred_exit();
-       read_unlock_irqrestore(&pool_rwlock, flags);
+       /*
+        * Should never be NULL, otherwise this is a use-after-put (or just a
+        * corrupt handle).
+        */
+       if (WARN(!stack, "corrupt handle or use after stack_depot_put()"))
+               return 0;
 
        *entries = stack->entries;
        return stack->size;
@@ -620,29 +755,20 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch);
 void stack_depot_put(depot_stack_handle_t handle)
 {
        struct stack_record *stack;
-       unsigned long flags;
 
        if (!handle || stack_depot_disabled)
                return;
 
-       write_lock_irqsave(&pool_rwlock, flags);
-       printk_deferred_enter();
-
        stack = depot_fetch_stack(handle);
-       if (WARN_ON(!stack))
-               goto out;
-
-       if (refcount_dec_and_test(&stack->count)) {
-               /* Unlink stack from the hash table. */
-               list_del(&stack->list);
+       /*
+        * Should always be able to find the stack record, otherwise this is an
+        * unbalanced put attempt (or corrupt handle).
+        */
+       if (WARN(!stack, "corrupt handle or unbalanced stack_depot_put()"))
+               return;
 
-               /* Free stack. */
+       if (refcount_dec_and_test(&stack->count))
                depot_free_stack(stack);
-       }
-
-out:
-       printk_deferred_exit();
-       write_unlock_irqrestore(&pool_rwlock, flags);
 }
 EXPORT_SYMBOL_GPL(stack_depot_put);
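
For reference, a sketch of how a caller balances the reference taken by STACK_DEPOT_FLAG_GET (the surrounding function is illustrative, not part of this file):

	static void example_save_and_put(void)
	{
		unsigned long entries[16];
		unsigned int nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
		depot_stack_handle_t handle;

		handle = stack_depot_save_flags(entries, nr, GFP_KERNEL,
						STACK_DEPOT_FLAG_CAN_ALLOC |
						STACK_DEPOT_FLAG_GET);

		/* Each save that used STACK_DEPOT_FLAG_GET must be paired: */
		if (handle)
			stack_depot_put(handle);
	}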
 
@@ -690,3 +816,30 @@ unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
        return parts.extra;
 }
 EXPORT_SYMBOL(stack_depot_get_extra_bits);
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+       /*
+        * data race ok: These are just statistics counters, and approximate
+        * statistics are ok for debugging.
+        */
+       seq_printf(seq, "pools: %d\n", data_race(pools_num));
+       for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
+               seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
+
+       return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+static int depot_debugfs_init(void)
+{
+       struct dentry *dir;
+
+       if (stack_depot_disabled)
+               return 0;
+
+       dir = debugfs_create_dir("stackdepot", NULL);
+       debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
+       return 0;
+}
+late_initcall(depot_debugfs_init);
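
With debugfs mounted in the usual location, the counters can be read from /sys/kernel/debug/stackdepot/stats. The difference between allocations and frees is in_use, and in_use plus freelist_size equals the total number of records carved out of all pools.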