sched/fair: use folio_xchg_access_time() in numa_hint_fault_latency()
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f53af8965f5ab5ace0c1f2a758c63..3ee9d3564c20cb90211bf8350a86ddaff28f79e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
  * The smaller the hint page fault latency, the higher the possibility
  * for the page to be hot.
  */
-static int numa_hint_fault_latency(struct page *page)
+static int numa_hint_fault_latency(struct folio *folio)
 {
        int last_time, time;
 
        time = jiffies_to_msecs(jiffies);
-       last_time = xchg_page_access_time(page, time);
+       last_time = folio_xchg_access_time(folio, time);
 
        return (time - last_time) & PAGE_ACCESS_TIME_MASK;
 }
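
The masking on the return value matters because the access time stored in the
folio is truncated, so the delta has to stay correct when the stored value
wraps. A minimal userspace sketch of the same masked-subtraction idea, using an
illustrative 21-bit mask rather than the kernel's real PAGE_ACCESS_TIME_MASK:

#include <stdio.h>

/* Illustrative width only, not the kernel's PAGE_ACCESS_TIME_MASK. */
#define DEMO_ACCESS_TIME_MASK	((1u << 21) - 1)

static unsigned int demo_latency(unsigned int now, unsigned int last)
{
	/* Masked subtraction keeps the delta correct across wraparound. */
	return (now - last) & DEMO_ACCESS_TIME_MASK;
}

int main(void)
{
	/* 'last' was recorded just before the truncated counter wrapped. */
	printf("%u\n", demo_latency(5, DEMO_ACCESS_TIME_MASK - 2));	/* 8 */
	return 0;
}
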
@@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
        }
 }
 
-bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
                                int src_nid, int dst_cpu)
 {
        struct numa_group *ng = deref_curr_numa_group(p);
@@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
 
                th = pgdat->nbp_threshold ? : def_th;
-               latency = numa_hint_fault_latency(page);
+               latency = numa_hint_fault_latency(folio);
                if (latency >= th)
                        return false;
 
                return !numa_promotion_rate_limit(pgdat, rate_limit,
-                                                 thp_nr_pages(page));
+                                                 folio_nr_pages(folio));
        }
 
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
-       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+       last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid);
 
        if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
            !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
@@ -6619,6 +6619,7 @@ dequeue_throttle:
 /* Working cpumask for: load_balance, load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
+static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
 
 #ifdef CONFIG_NO_HZ_COMMON
 
@@ -9579,7 +9580,7 @@ static inline long sibling_imbalance(struct lb_env *env,
        imbalance /= ncores_local + ncores_busiest;
 
        /* Take advantage of resource in an empty sched group */
-       if (imbalance == 0 && local->sum_nr_running == 0 &&
+       if (imbalance <= 1 && local->sum_nr_running == 0 &&
            busiest->sum_nr_running > 1)
                imbalance = 2;
 
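
The division above truncates, so the normalized imbalance can come out as
exactly 1 rather than 0 even when the local group is empty: as a hypothetical
example, a pre-division imbalance of 3 spread over ncores_local +
ncores_busiest = 3 cores yields 1. The old imbalance == 0 test would then leave
the empty local group unused, whereas imbalance <= 1 still bumps the value to 2
so that at least one task can be pulled into the idle group.
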
@@ -9767,6 +9768,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                break;
 
        case group_smt_balance:
+               /*
+                * Check whether either SMT group has spare CPUs, to choose
+                * between the has-spare and the fully-busy handling below.
+                */
+               if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
+                       goto has_spare;
+
+               fallthrough;
+
        case group_fully_busy:
                /*
                 * Select the fully busy group with highest avg_load. In
@@ -9806,6 +9816,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                        else
                                return true;
                }
+has_spare:
 
                /*
                 * Select not overloaded group with lowest number of idle cpus
@@ -10917,6 +10928,7 @@ static int active_load_balance_cpu_stop(void *data);
 
 static int should_we_balance(struct lb_env *env)
 {
+       struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
        struct sched_group *sg = env->sd->groups;
        int cpu, idle_smt = -1;
 
@@ -10940,8 +10952,9 @@ static int should_we_balance(struct lb_env *env)
                return 1;
        }
 
+       cpumask_copy(swb_cpus, group_balance_mask(sg));
        /* Try to find first idle CPU */
-       for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
+       for_each_cpu_and(cpu, swb_cpus, env->cpus) {
                if (!idle_cpu(cpu))
                        continue;
 
@@ -10953,6 +10966,14 @@ static int should_we_balance(struct lb_env *env)
                if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
                        if (idle_smt == -1)
                                idle_smt = cpu;
+                       /*
+                        * If the core is not idle and the first idle SMT sibling
+                        * has been found, there is no need to check the other
+                        * SMT siblings for idleness:
+                        */
+#ifdef CONFIG_SCHED_SMT
+                       cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
+#endif
                        continue;
                }
 
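
The cpumask_andnot() added above removes the whole SMT mask of a non-idle core
from the per-CPU scratch mask, so later iterations of the loop never revisit
that core's siblings. A small userspace sketch of the same pruning idea on a
plain bitmask; the 2-way SMT layout and the core_mask() helper are illustrative
assumptions, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define SMT_WIDTH	2	/* assumed 2-way SMT */

/* Hypothetical helper: bitmask of all hardware threads in cpu's core. */
static uint64_t core_mask(int cpu)
{
	int first = cpu - (cpu % SMT_WIDTH);

	return ((1ull << SMT_WIDTH) - 1) << first;
}

int main(void)
{
	uint64_t candidates = 0xff;	/* CPUs 0-7 still to be scanned */
	int cpu = 3;			/* idle sibling of a non-idle core */

	candidates &= ~core_mask(cpu);	/* drop CPUs 2 and 3 from the scan */
	printf("%#llx\n", (unsigned long long)candidates);	/* 0xf3 */
	return 0;
}
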
@@ -12918,6 +12939,8 @@ __init void init_sched_fair_class(void)
        for_each_possible_cpu(i) {
                zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
                zalloc_cpumask_var_node(&per_cpu(select_rq_mask,    i), GFP_KERNEL, cpu_to_node(i));
+               zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
+                                       GFP_KERNEL, cpu_to_node(i));
 
 #ifdef CONFIG_CFS_BANDWIDTH
                INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));