mm, pcp: reduce lock contention for draining high-order pages
author	Huang Ying <ying.huang@intel.com>
Mon, 16 Oct 2023 05:29:56 +0000 (13:29 +0800)
committer	Andrew Morton <akpm@linux-foundation.org>
Wed, 25 Oct 2023 23:47:10 +0000 (16:47 -0700)
Since commit f26b3fa04611 ("mm/page_alloc: limit number of high-order
pages on PCP during bulk free"), the PCP (Per-CPU Pageset) is drained
when the PCP is mostly being used to free high-order pages, to improve
the reuse of cache-hot pages between the page-allocating and
page-freeing CPUs.

On a system with a small per-CPU data cache slice, pages should not be
cached before draining, to guarantee that they stay cache-hot.  But on
a system with a large per-CPU data cache slice, some pages can be
cached before draining to reduce zone lock contention.

So, in this patch, instead of draining without any caching,
"pcp->batch" pages are cached in the PCP before draining if the size of
the per-CPU data cache slice is larger than "3 * batch" pages.

In theory, if the size of the per-CPU data cache slice is larger than
"2 * batch" pages, we can reuse cache-hot pages between CPUs.  But
considering other uses of the cache (code, other data accesses, etc.),
"3 * batch" is used.

Note: "3 * batch" is chosen to make sure the optimization works on recent
x86_64 server CPUs.  If you want to increase it, please check whether it
breaks the optimization.
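
As a rough, stand-alone illustration of the check (not part of the
patch; the cache slice size and "pcp->batch" value below are assumed,
typical x86_64 numbers with 4 KiB pages):

    /* Hypothetical user-space sketch of the "3 * batch" comparison. */
    #include <stdio.h>

    #define PAGE_SHIFT 12  /* 4 KiB pages, as on x86_64 */

    int main(void)
    {
            unsigned long slice_bytes = 1UL << 20; /* assumed 1 MiB per-CPU data cache slice */
            unsigned int batch = 63;               /* assumed typical pcp->batch value */
            unsigned long slice_pages = slice_bytes >> PAGE_SHIFT; /* 256 pages */

            /* Same comparison as zone_pcp_update_cacheinfo(): 256 > 3 * 63 = 189 */
            if (slice_pages > 3 * batch)
                    printf("PCPF_FREE_HIGH_BATCH would be set\n");
            else
                    printf("PCPF_FREE_HIGH_BATCH would be clear\n");
            return 0;
    }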

On a 2-socket Intel server with 128 logical CPUs, with the patch
applied, the network bandwidth of the UNIX (AF_UNIX) test case of the
lmbench test suite with 16-pair processes increases by 70.5%.  The
cycles% of spinlock contention (mostly on the zone lock) decreases from
46.1% to 21.3%.  The number of PCP drains for high-order page freeing
(free_high) decreases by 89.9%.  The cache miss rate stays at 0.2%.

Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
drivers/base/cacheinfo.c
include/linux/gfp.h
include/linux/mmzone.h
mm/page_alloc.c

index 585c66fce9d95563dd8d564db148ac91f1106871..f1e79263fe61eb410dd27b5ac6b13b6c196e290a 100644 (file)
@@ -950,6 +950,7 @@ static int cacheinfo_cpu_online(unsigned int cpu)
        if (rc)
                goto err;
        update_per_cpu_data_slice_size(true, cpu);
+       setup_pcp_cacheinfo();
        return 0;
 err:
        free_cache_attributes(cpu);
@@ -963,6 +964,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu)
 
        free_cache_attributes(cpu);
        update_per_cpu_data_slice_size(false, cpu);
+       setup_pcp_cacheinfo();
        return 0;
 }
 
index 665f06675c834e45f9624b1a990d658f91eb99f3..665edc11fb9faaf5961a876287ff05847d16718e 100644 (file)
@@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
+void setup_pcp_cacheinfo(void);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
index de313f1c15f9987846940fd37c7d7660ee2d1ab1..efe72b3f7872399204113b8c0d7cc8f3e0193a1d 100644 (file)
@@ -680,8 +680,14 @@ enum zone_watermarks {
  * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the
  * previous page freeing.  To avoid to drain PCP for an accident
  * high-order page freeing.
+ *
+ * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before
+ * draining PCP for consecutive high-order pages freeing without
+ * allocation if data cache slice of CPU is large enough.  To reduce
+ * zone lock contention and keep cache-hot pages reusing.
  */
 #define        PCPF_PREV_FREE_HIGH_ORDER       BIT(0)
+#define        PCPF_FREE_HIGH_BATCH            BIT(1)
 
 struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
index de547ef9a9adcfb28cf2986200165f5e5d638608..b76b1de48a3076f3edd30632db37f53833df9d89 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/psi.h>
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
+#include <linux/cacheinfo.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -2385,7 +2386,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
         */
        if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
                free_high = (pcp->free_factor &&
-                            (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER));
+                            (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
+                            (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
+                             pcp->count >= READ_ONCE(pcp->batch)));
                pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
        } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
                pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
@@ -5418,6 +5421,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online)
        mutex_unlock(&pcp_batch_high_lock);
 }
 
+static void zone_pcp_update_cacheinfo(struct zone *zone)
+{
+       int cpu;
+       struct per_cpu_pages *pcp;
+       struct cpu_cacheinfo *cci;
+
+       for_each_online_cpu(cpu) {
+               pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+               cci = get_cpu_cacheinfo(cpu);
+               /*
+                * If data cache slice of CPU is large enough, "pcp->batch"
+                * pages can be preserved in PCP before draining PCP for
+                * consecutive high-order pages freeing without allocation.
+                * This can reduce zone lock contention without hurting
+                * cache-hot pages sharing.
+                */
+               spin_lock(&pcp->lock);
+               if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+                       pcp->flags |= PCPF_FREE_HIGH_BATCH;
+               else
+                       pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+               spin_unlock(&pcp->lock);
+       }
+}
+
+void setup_pcp_cacheinfo(void)
+{
+       struct zone *zone;
+
+       for_each_populated_zone(zone)
+               zone_pcp_update_cacheinfo(zone);
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.