sched/fair: Introduce SIS_UTIL to search idle CPU based on sum of util_avg
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7400600b4db64c295dbc1a54b081b66c0dd44a9a..f80ae86bb404e5f4ac58341f9a270eb66b774207 100644
@@ -6332,6 +6332,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
+       struct sched_domain_shared *sd_share;
        struct rq *this_rq = this_rq();
        int this = smp_processor_id();
        struct sched_domain *this_sd;
@@ -6371,6 +6372,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                time = cpu_clock(this);
        }
 
+       if (sched_feat(SIS_UTIL)) {
+               sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+               if (sd_share) {
+                       /* because !--nr is the condition to stop scan */
+                       nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+                       /* overloaded LLC is unlikely to have idle cpu/core */
+                       if (nr == 1)
+                               return -1;
+               }
+       }
+
        for_each_cpu_wrap(cpu, cpus, target + 1) {
                if (has_idle_core) {
                        i = select_idle_core(p, cpu, cpus, &idle_cpu);
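
In the scan loop above (only its first lines appear in this hunk), the !--nr stop condition named in the comment is evaluated before each candidate CPU is probed in the per-CPU path, which is why the hint is read with a "+ 1" and why nr == 1 means the budget would already be exhausted. A minimal userspace sketch of that counting, with a hypothetical nr_idle_scan of 4 (the value is illustrative, not part of the patch):

#include <stdio.h>

/*
 * Sketch: with nr = nr_idle_scan + 1 and !--nr as the stop condition,
 * exactly nr_idle_scan CPUs are examined before the scan gives up.
 */
int main(void)
{
	int nr_idle_scan = 4;		/* hypothetical hint from sd_share */
	int nr = nr_idle_scan + 1;
	int examined = 0;

	for (int cpu = 0; cpu < 16; cpu++) {
		if (!--nr)		/* same stop condition as the scan loop */
			break;
		examined++;		/* stands in for probing one CPU */
	}

	printf("examined %d CPUs\n", examined);	/* prints: examined 4 CPUs */
	return 0;
}

With nr_idle_scan == 0, the very first !--nr would fire, so the early "return -1" added above avoids entering the loop at all.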
@@ -9224,6 +9236,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
        return idlest;
 }
 
+static void update_idle_cpu_scan(struct lb_env *env,
+                                unsigned long sum_util)
+{
+       struct sched_domain_shared *sd_share;
+       int llc_weight, pct;
+       u64 x, y, tmp;
+       /*
+        * Update the number of CPUs to scan in LLC domain, which could
+        * be used as a hint in select_idle_cpu(). The update of sd_share
+        * could be expensive because it is within a shared cache line.
+        * So the write of this hint only occurs during periodic load
+        * balancing, rather than CPU_NEWLY_IDLE, because the latter
+        * can fire way more frequently than the former.
+        */
+       if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+               return;
+
+       llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+       if (env->sd->span_weight != llc_weight)
+               return;
+
+       sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+       if (!sd_share)
+               return;
+
+       /*
+        * The number of CPUs to search drops as sum_util increases; once
+        * sum_util hits 85% or above, the scan stops.
+        * 85% is chosen as the threshold because it corresponds to the
+        * imbalance_pct(117) at which a LLC sched group is regarded as
+        * overloaded (100/117 ~= 85%).
+        *
+        * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
+        * and y'= y / SCHED_CAPACITY_SCALE
+        *
+        * x is the ratio of sum_util compared to the CPU capacity:
+        * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+        * y' is the ratio of CPUs to be scanned in the LLC domain,
+        * and the number of CPUs to scan is calculated by:
+        *
+        * nr_scan = llc_weight * y'                                    [2]
+        *
+        * When x hits the overload threshold, i.e. when
+        * x = 100 / pct, y drops to 0. According to [1],
+        * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+        *
+        * Scale x by SCHED_CAPACITY_SCALE:
+        * x' = sum_util / llc_weight;                                  [3]
+        *
+        * and finally [1] becomes:
+        * y = SCHED_CAPACITY_SCALE -
+        *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
+        *
+        */
+       /* equation [3] */
+       x = sum_util;
+       do_div(x, llc_weight);
+
+       /* equation [4] */
+       pct = env->sd->imbalance_pct;
+       tmp = x * x * pct * pct;
+       do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+       tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+       y = SCHED_CAPACITY_SCALE - tmp;
+
+       /* equation [2] */
+       y *= llc_weight;
+       do_div(y, SCHED_CAPACITY_SCALE);
+       if ((int)y != sd_share->nr_idle_scan)
+               WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
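
Plugging representative numbers into equations [2]-[4] shows the shape of the heuristic. The sketch below is a standalone userspace calculation, assuming a hypothetical 16-CPU LLC, the imbalance_pct of 117 cited in the comment above, and an LLC-wide sum_util of 8192, i.e. 50% utilization; none of these concrete values appear in the patch.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

int main(void)
{
	/* Hypothetical inputs: 16-CPU LLC, 50% utilized, imbalance_pct 117. */
	unsigned long llc_weight = 16, pct = 117, sum_util = 8192;
	unsigned long x, tmp, y;

	x = sum_util / llc_weight;				/* [3]: x' = 512 */

	tmp = x * x * pct * pct / (10000 * SCHED_CAPACITY_SCALE);	/* [4]: 350 */
	if (tmp > SCHED_CAPACITY_SCALE)
		tmp = SCHED_CAPACITY_SCALE;
	y = SCHED_CAPACITY_SCALE - tmp;				/* y = 1024 - 350 = 674 */

	/* [2]: nr_scan = 674 * 16 / 1024 = 10, so 10 of the 16 CPUs get scanned. */
	printf("nr_scan = %lu\n", y * llc_weight / SCHED_CAPACITY_SCALE);
	return 0;
}

Once sum_util climbs to roughly 85% of llc_weight * SCHED_CAPACITY_SCALE (x' around 875 with these numbers), tmp saturates at SCHED_CAPACITY_SCALE, y and hence nr_idle_scan fall to 0, and the nr == 1 check added to select_idle_cpu() skips the scan entirely.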
@@ -9236,6 +9319,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
+       unsigned long sum_util = 0;
        int sg_status = 0;
 
        do {
@@ -9268,6 +9352,7 @@ next_group:
                sds->total_load += sgs->group_load;
                sds->total_capacity += sgs->group_capacity;
 
+               sum_util += sgs->group_util;
                sg = sg->next;
        } while (sg != env->sd->groups);
 
@@ -9293,6 +9378,8 @@ next_group:
                WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
                trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
        }
+
+       update_idle_cpu_scan(env, sum_util);
 }
 
 /**