Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu
[sfrench/cifs-2.6.git] / mm / percpu.c
index f99e9306b9394a5f74def05b814e675cca6f9efd..b4cebeca4c0c1616ef44d0f95526bc09417fb06e 100644 (file)
 
 #include "percpu-internal.h"
 
-/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+/*
+ * The slots are sorted by the size of the biggest continuous free area.
+ * 1-31 bytes share the same slot.
+ */
 #define PCPU_SLOT_BASE_SHIFT           5
 /* chunks in slots below this are subject to being sidelined on failed alloc */
 #define PCPU_SLOT_FAIL_THRESHOLD       3
@@ -132,6 +135,9 @@ static int pcpu_unit_size __ro_after_init;
 static int pcpu_nr_units __ro_after_init;
 static int pcpu_atom_size __ro_after_init;
 int pcpu_nr_slots __ro_after_init;
+static int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
 static size_t pcpu_chunk_struct_size __ro_after_init;
 
 /* cpus with the lowest and highest unit addresses */
@@ -173,10 +179,10 @@ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
 static LIST_HEAD(pcpu_map_extend_chunks);
 
 /*
- * The number of empty populated pages by chunk type, protected by pcpu_lock.
+ * The number of empty populated pages, protected by pcpu_lock.
  * The reserved chunk doesn't contribute to the count.
  */
-int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES];
+int pcpu_nr_empty_pop_pages;
 
 /*
  * The number of populated pages in use by the allocator, protected by
@@ -234,7 +240,7 @@ static int __pcpu_size_to_slot(int size)
 static int pcpu_size_to_slot(int size)
 {
        if (size == pcpu_unit_size)
-               return pcpu_nr_slots - 1;
+               return pcpu_free_slot;
        return __pcpu_size_to_slot(size);
 }
 
@@ -303,6 +309,25 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
        return index * PCPU_BITMAP_BLOCK_BITS + off;
 }
 
+/**
+ * pcpu_check_block_hint - check against the contig hint
+ * @block: block of interest
+ * @bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Check to see if the allocation can fit in the block's contig hint.
+ * Note, a chunk uses the same hints as a block so this can also check against
+ * the chunk's contig hint.
+ */
+static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+                                 size_t align)
+{
+       int bit_off = ALIGN(block->contig_hint_start, align) -
+               block->contig_hint_start;
+
+       return bit_off + bits <= block->contig_hint;
+}
+
 /*
  * pcpu_next_hint - determine which hint to use
  * @block: block of interest
@@ -507,13 +532,10 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
 {
        if (chunk != pcpu_reserved_chunk) {
-               struct list_head *pcpu_slot;
-
-               pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
                if (move_front)
-                       list_move(&chunk->list, &pcpu_slot[slot]);
+                       list_move(&chunk->list, &pcpu_chunk_lists[slot]);
                else
-                       list_move_tail(&chunk->list, &pcpu_slot[slot]);
+                       list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
        }
 }
 
@@ -539,10 +561,36 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
        int nslot = pcpu_chunk_slot(chunk);
 
+       /* leave isolated chunks in-place */
+       if (chunk->isolated)
+               return;
+
        if (oslot != nslot)
                __pcpu_chunk_move(chunk, nslot, oslot < nslot);
 }
 
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+       lockdep_assert_held(&pcpu_lock);
+
+       if (!chunk->isolated) {
+               chunk->isolated = true;
+               pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
+       }
+       list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+       lockdep_assert_held(&pcpu_lock);
+
+       if (chunk->isolated) {
+               chunk->isolated = false;
+               pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+               pcpu_chunk_relocate(chunk, -1);
+       }
+}
+
 /*
  * pcpu_update_empty_pages - update empty page counters
  * @chunk: chunk of interest
@@ -555,8 +603,8 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
 {
        chunk->nr_empty_pop_pages += nr;
-       if (chunk != pcpu_reserved_chunk)
-               pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
+       if (chunk != pcpu_reserved_chunk && !chunk->isolated)
+               pcpu_nr_empty_pop_pages += nr;
 }
 
 /*
@@ -1063,14 +1111,11 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
        int bit_off, bits, next_off;
 
        /*
-        * Check to see if the allocation can fit in the chunk's contig hint.
-        * This is an optimization to prevent scanning by assuming if it
-        * cannot fit in the global hint, there is memory pressure and creating
-        * a new chunk would happen soon.
+        * This is an optimization to prevent scanning by assuming if the
+        * allocation cannot fit in the global hint, there is memory pressure
+        * and creating a new chunk would happen soon.
         */
-       bit_off = ALIGN(chunk_md->contig_hint_start, align) -
-                 chunk_md->contig_hint_start;
-       if (bit_off + alloc_bits > chunk_md->contig_hint)
+       if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                return -1;
 
        bit_off = pcpu_next_hint(chunk_md, alloc_bits);
@@ -1352,7 +1397,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                      alloc_size);
 
 #ifdef CONFIG_MEMCG_KMEM
-       /* first chunk isn't memcg-aware */
+       /* first chunk is free to use */
        chunk->obj_cgroups = NULL;
 #endif
        pcpu_init_md_blocks(chunk);
@@ -1394,7 +1439,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
        return chunk;
 }
 
-static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
        struct pcpu_chunk *chunk;
        int region_bits;
@@ -1423,7 +1468,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
                goto md_blocks_fail;
 
 #ifdef CONFIG_MEMCG_KMEM
-       if (pcpu_is_memcg_chunk(type)) {
+       if (!mem_cgroup_kmem_disabled()) {
                chunk->obj_cgroups =
                        pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
                                        sizeof(struct obj_cgroup *), gfp);
@@ -1536,8 +1581,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
-                                           gfp_t gfp);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1580,25 +1624,25 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
-                                                    struct obj_cgroup **objcgp)
+static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+                                     struct obj_cgroup **objcgp)
 {
        struct obj_cgroup *objcg;
 
        if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
-               return PCPU_CHUNK_ROOT;
+               return true;
 
        objcg = get_obj_cgroup_from_current();
        if (!objcg)
-               return PCPU_CHUNK_ROOT;
+               return true;
 
        if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
                obj_cgroup_put(objcg);
-               return PCPU_FAIL_ALLOC;
+               return false;
        }
 
        *objcgp = objcg;
-       return PCPU_CHUNK_MEMCG;
+       return true;
 }
 
 static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1608,7 +1652,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
        if (!objcg)
                return;
 
-       if (chunk) {
+       if (likely(chunk && chunk->obj_cgroups)) {
                chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
 
                rcu_read_lock();
@@ -1625,10 +1669,12 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 {
        struct obj_cgroup *objcg;
 
-       if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+       if (unlikely(!chunk->obj_cgroups))
                return;
 
        objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+       if (!objcg)
+               return;
        chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
 
        obj_cgroup_uncharge(objcg, size * num_possible_cpus());
@@ -1642,10 +1688,10 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
 }
 
 #else /* CONFIG_MEMCG_KMEM */
-static enum pcpu_chunk_type
+static bool
 pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
 {
-       return PCPU_CHUNK_ROOT;
+       return true;
 }
 
 static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
@@ -1680,8 +1726,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
-       enum pcpu_chunk_type type;
-       struct list_head *pcpu_slot;
        struct obj_cgroup *objcg = NULL;
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
@@ -1717,10 +1761,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                return NULL;
        }
 
-       type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
-       if (unlikely(type == PCPU_FAIL_ALLOC))
+       if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                return NULL;
-       pcpu_slot = pcpu_chunk_list(type);
 
        if (!is_atomic) {
                /*
@@ -1758,8 +1800,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 restart:
        /* search through normal chunks */
-       for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
-               list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
+       for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
+               list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
+                                        list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
@@ -1769,9 +1812,10 @@ restart:
                        }
 
                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
-                       if (off >= 0)
+                       if (off >= 0) {
+                               pcpu_reintegrate_chunk(chunk);
                                goto area_found;
-
+                       }
                }
        }
 
@@ -1787,8 +1831,8 @@ restart:
                goto fail;
        }
 
-       if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-               chunk = pcpu_create_chunk(type, pcpu_gfp);
+       if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
+               chunk = pcpu_create_chunk(pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
@@ -1832,7 +1876,7 @@ area_found:
                mutex_unlock(&pcpu_alloc_mutex);
        }
 
-       if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW)
+       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();
 
        /* clear the areas and return address relative to base address */
@@ -1930,33 +1974,28 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 }
 
 /**
- * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @type: chunk type
+ * pcpu_balance_free - manage the amount of free chunks
+ * @empty_only: free chunks only if there are no populated pages
  *
- * Reclaim all fully free chunks except for the first one.  This is also
- * responsible for maintaining the pool of empty populated pages.  However,
- * it is possible that this is called when physical memory is scarce causing
- * OOM killer to be triggered.  We should avoid doing so until an actual
- * allocation causes the failure as it is possible that requests can be
- * serviced from already backed regions.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages.  Otherwise, only reclaim chunks that have no
+ * populated pages.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
  */
-static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+static void pcpu_balance_free(bool empty_only)
 {
-       /* gfp flags passed to underlying allocators */
-       const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        LIST_HEAD(to_free);
-       struct list_head *pcpu_slot = pcpu_chunk_list(type);
-       struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+       struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
        struct pcpu_chunk *chunk, *next;
-       int slot, nr_to_pop, ret;
+
+       lockdep_assert_held(&pcpu_lock);
 
        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Destroy all free chunks except for one.
         */
-       mutex_lock(&pcpu_alloc_mutex);
-       spin_lock_irq(&pcpu_lock);
-
        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);
 
@@ -1964,11 +2003,14 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
                if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
                        continue;
 
-               list_move(&chunk->list, &to_free);
+               if (!empty_only || chunk->nr_empty_pop_pages == 0)
+                       list_move(&chunk->list, &to_free);
        }
 
-       spin_unlock_irq(&pcpu_lock);
+       if (list_empty(&to_free))
+               return;
 
+       spin_unlock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;
 
@@ -1982,6 +2024,29 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }
+       spin_lock_irq(&pcpu_lock);
+}
+
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ *
+ * Maintain a certain amount of populated pages to satisfy atomic allocations.
+ * It is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered.  We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ */
+static void pcpu_balance_populated(void)
+{
+       /* gfp flags passed to underlying allocators */
+       const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+       struct pcpu_chunk *chunk;
+       int slot, nr_to_pop, ret;
+
+       lockdep_assert_held(&pcpu_lock);
 
        /*
         * Ensure there are certain number of free populated pages for
@@ -2000,23 +2065,21 @@ retry_pop:
                pcpu_atomic_alloc_failed = false;
        } else {
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
-                                 pcpu_nr_empty_pop_pages[type],
+                                 pcpu_nr_empty_pop_pages,
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }
 
-       for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+       for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                unsigned int nr_unpop = 0, rs, re;
 
                if (!nr_to_pop)
                        break;
 
-               spin_lock_irq(&pcpu_lock);
-               list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+               list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }
-               spin_unlock_irq(&pcpu_lock);
 
                if (!nr_unpop)
                        continue;
@@ -2026,12 +2089,13 @@ retry_pop:
                                             chunk->nr_pages) {
                        int nr = min_t(int, re - rs, nr_to_pop);
 
+                       spin_unlock_irq(&pcpu_lock);
                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+                       cond_resched();
+                       spin_lock_irq(&pcpu_lock);
                        if (!ret) {
                                nr_to_pop -= nr;
-                               spin_lock_irq(&pcpu_lock);
                                pcpu_chunk_populated(chunk, rs, rs + nr);
-                               spin_unlock_irq(&pcpu_lock);
                        } else {
                                nr_to_pop = 0;
                        }
@@ -2043,30 +2107,133 @@ retry_pop:
 
        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
-               chunk = pcpu_create_chunk(type, gfp);
+               spin_unlock_irq(&pcpu_lock);
+               chunk = pcpu_create_chunk(gfp);
+               cond_resched();
+               spin_lock_irq(&pcpu_lock);
                if (chunk) {
-                       spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_relocate(chunk, -1);
-                       spin_unlock_irq(&pcpu_lock);
                        goto retry_pop;
                }
        }
+}
 
-       mutex_unlock(&pcpu_alloc_mutex);
+/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system.  Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required.  Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around).  If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
+ *
+ * CONTEXT:
+ * pcpu_lock (can be dropped temporarily)
+ *
+ */
+static void pcpu_reclaim_populated(void)
+{
+       struct pcpu_chunk *chunk;
+       struct pcpu_block_md *block;
+       int i, end;
+
+       lockdep_assert_held(&pcpu_lock);
+
+restart:
+       /*
+        * Once a chunk is isolated to the to_depopulate list, the chunk is no
+        * longer discoverable to allocations whom may populate pages.  The only
+        * other accessor is the free path which only returns area back to the
+        * allocator not touching the populated bitmap.
+        */
+       while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) {
+               chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot],
+                                        struct pcpu_chunk, list);
+               WARN_ON(chunk->immutable);
+
+               /*
+                * Scan chunk's pages in the reverse order to keep populated
+                * pages close to the beginning of the chunk.
+                */
+               for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+                       /* no more work to do */
+                       if (chunk->nr_empty_pop_pages == 0)
+                               break;
+
+                       /* reintegrate chunk to prevent atomic alloc failures */
+                       if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+                               pcpu_reintegrate_chunk(chunk);
+                               goto restart;
+                       }
+
+                       /*
+                        * If the page is empty and populated, start or
+                        * extend the (i, end) range.  If i == 0, decrease
+                        * i and perform the depopulation to cover the last
+                        * (first) page in the chunk.
+                        */
+                       block = chunk->md_blocks + i;
+                       if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+                           test_bit(i, chunk->populated)) {
+                               if (end == -1)
+                                       end = i;
+                               if (i > 0)
+                                       continue;
+                               i--;
+                       }
+
+                       /* depopulate if there is an active range */
+                       if (end == -1)
+                               continue;
+
+                       spin_unlock_irq(&pcpu_lock);
+                       pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+                       cond_resched();
+                       spin_lock_irq(&pcpu_lock);
+
+                       pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+
+                       /* reset the range and continue */
+                       end = -1;
+               }
+
+               if (chunk->free_bytes == pcpu_unit_size)
+                       pcpu_reintegrate_chunk(chunk);
+               else
+                       list_move(&chunk->list,
+                                 &pcpu_chunk_lists[pcpu_sidelined_slot]);
+       }
 }
 
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Call __pcpu_balance_workfn() for each chunk type.
+ * For each chunk type, manage the number of fully free chunks and the number of
+ * populated pages.  An important thing to consider is when pages are freed and
+ * how they contribute to the global counts.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
-       enum pcpu_chunk_type type;
+       /*
+        * pcpu_balance_free() is called twice because the first time we may
+        * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+        * to grow other chunks.  This then gives pcpu_reclaim_populated() time
+        * to move fully free chunks to the active list to be freed if
+        * appropriate.
+        */
+       mutex_lock(&pcpu_alloc_mutex);
+       spin_lock_irq(&pcpu_lock);
+
+       pcpu_balance_free(false);
+       pcpu_reclaim_populated();
+       pcpu_balance_populated();
+       pcpu_balance_free(true);
 
-       for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
-               __pcpu_balance_workfn(type);
+       spin_unlock_irq(&pcpu_lock);
+       mutex_unlock(&pcpu_alloc_mutex);
 }
 
 /**
@@ -2085,7 +2252,6 @@ void free_percpu(void __percpu *ptr)
        unsigned long flags;
        int size, off;
        bool need_balance = false;
-       struct list_head *pcpu_slot;
 
        if (!ptr)
                return;
@@ -2101,19 +2267,24 @@ void free_percpu(void __percpu *ptr)
 
        size = pcpu_free_area(chunk, off);
 
-       pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
-
        pcpu_memcg_free_hook(chunk, off, size);
 
-       /* if there are more than one fully free chunks, wake up grim reaper */
-       if (chunk->free_bytes == pcpu_unit_size) {
+       /*
+        * If there are more than one fully free chunks, wake up grim reaper.
+        * If the chunk is isolated, it may be in the process of being
+        * reclaimed.  Let reclaim manage cleaning up of that chunk.
+        */
+       if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
                struct pcpu_chunk *pos;
 
-               list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+               list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
                        if (pos != chunk) {
                                need_balance = true;
                                break;
                        }
+       } else if (pcpu_should_reclaim_chunk(chunk)) {
+               pcpu_isolate_chunk(chunk);
+               need_balance = true;
        }
 
        trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@ -2414,7 +2585,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        int map_size;
        unsigned long tmp_addr;
        size_t alloc_size;
-       enum pcpu_chunk_type type;
 
 #define PCPU_SETUP_BUG_ON(cond)        do {                                    \
        if (unlikely(cond)) {                                           \
@@ -2528,22 +2698,24 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        pcpu_stats_save_ai(ai);
 
        /*
-        * Allocate chunk slots.  The additional last slot is for
-        * empty chunks.
+        * Allocate chunk slots.  The slots after the active slots are:
+        *   sidelined_slot - isolated, depopulated chunks
+        *   free_slot - fully free chunks
+        *   to_depopulate_slot - isolated, chunks to depopulate
         */
-       pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+       pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+       pcpu_free_slot = pcpu_sidelined_slot + 1;
+       pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+       pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
        pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
-                                         sizeof(pcpu_chunk_lists[0]) *
-                                         PCPU_NR_CHUNK_TYPES,
+                                         sizeof(pcpu_chunk_lists[0]),
                                          SMP_CACHE_BYTES);
        if (!pcpu_chunk_lists)
                panic("%s: Failed to allocate %zu bytes\n", __func__,
-                     pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
-                     PCPU_NR_CHUNK_TYPES);
+                     pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]));
 
-       for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
-               for (i = 0; i < pcpu_nr_slots; i++)
-                       INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
+       for (i = 0; i < pcpu_nr_slots; i++)
+               INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
 
        /*
         * The end of the static region needs to be aligned with the
@@ -2580,7 +2752,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
        /* link the first chunk in */
        pcpu_first_chunk = chunk;
-       pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages;
+       pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
        /* include all regions of the first chunk */
@@ -2733,6 +2905,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
+       best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;
 
@@ -2759,6 +2932,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
                last_allocs = allocs;
                best_upa = upa;
        }
+       BUG_ON(!best_upa);
        upa = best_upa;
 
        /* allocate and fill alloc_info */