Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1a765cdf6e41933a54d51a368840232e53f033d..2048138ce54b574a3ba56b9f6bf7b1cefac1fd32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
 
 #include <asm/switch_to.h>
 
-#include <linux/sched/cond_resched.h>
-
 #include "sched.h"
 #include "stats.h"
 #include "autogroup.h"
@@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_base_slice                   = 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
 
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
 int sched_thermal_decay_shift;
@@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
-       {
-               .procname       = "sched_child_runs_first",
-               .data           = &sysctl_sched_child_runs_first,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
 #ifdef CONFIG_CFS_BANDWIDTH
        {
                .procname       = "sched_cfs_bandwidth_slice_us",
@@ -664,6 +649,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
        cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
 }
 
+/*
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
 u64 avg_vruntime(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +666,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
                load += weight;
        }
 
-       if (load)
+       if (load) {
+               /* sign flips effective floor / ceil */
+               if (avg < 0)
+                       avg -= (load - 1);
                avg = div_s64(avg, load);
+       }
 
        return cfs_rq->min_vruntime + avg;
 }
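
The "avg < 0" adjustment above exists because C's signed division truncates toward zero, while this weighted average needs to round toward negative infinity so that an entity sitting exactly at the average stays eligible (the "left bias" the comment asks for). A minimal standalone sketch of that arithmetic, in plain userspace C with illustrative values rather than kernel code:

/* Standalone sketch (not kernel code): floor division for signed numerators,
 * mirroring the "avg < 0" adjustment in the hunk above. */
#include <stdio.h>
#include <stdint.h>

static int64_t div_floor(int64_t avg, int64_t load)
{
	if (avg < 0)
		avg -= (load - 1);	/* shift so truncation acts as a floor */
	return avg / load;		/* C truncates toward zero */
}

int main(void)
{
	/* -7 / 2 truncates to -3, but the floor is -4 */
	printf("trunc: %lld floor: %lld\n",
	       (long long)(-7 / 2), (long long)div_floor(-7, 2));
	return 0;
}

With the adjustment the quotient is always the floor, so the returned average vruntime never rounds past an entity that is exactly at the average.
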
@@ -864,14 +857,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows an EDF like search on (sub)trees.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 {
        struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
        struct sched_entity *curr = cfs_rq->curr;
        struct sched_entity *best = NULL;
+       struct sched_entity *best_left = NULL;
 
        if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
                curr = NULL;
+       best = curr;
 
        /*
         * Once selected, run a task until it either becomes non-eligible or
@@ -892,33 +887,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
                }
 
                /*
-                * If this entity has an earlier deadline than the previous
-                * best, take this one. If it also has the earliest deadline
-                * of its subtree, we're done.
+                * Now we heap search eligible trees for the best (min_)deadline
                 */
-               if (!best || deadline_gt(deadline, best, se)) {
+               if (!best || deadline_gt(deadline, best, se))
                        best = se;
-                       if (best->deadline == best->min_deadline)
-                               break;
-               }
 
                /*
-                * If the earlest deadline in this subtree is in the fully
-                * eligible left half of our space, go there.
+                * Every se in a left branch is eligible, keep track of the
+                * branch with the best min_deadline
                 */
+               if (node->rb_left) {
+                       struct sched_entity *left = __node_2_se(node->rb_left);
+
+                       if (!best_left || deadline_gt(min_deadline, best_left, left))
+                               best_left = left;
+
+                       /*
+                        * min_deadline is in the left branch. rb_left and all
+                        * descendants are eligible, so immediately switch to the second
+                        * loop.
+                        */
+                       if (left->min_deadline == se->min_deadline)
+                               break;
+               }
+
+               /* min_deadline is at this node, no need to look right */
+               if (se->deadline == se->min_deadline)
+                       break;
+
+               /* else min_deadline is in the right branch. */
+               node = node->rb_right;
+       }
+
+       /*
+        * We ran into an eligible node which is itself the best.
+        * (Or nr_running == 0 and both are NULL)
+        */
+       if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+               return best;
+
+       /*
+        * Now best_left and all of its children are eligible, and we are just
+        * looking for deadline == min_deadline
+        */
+       node = &best_left->run_node;
+       while (node) {
+               struct sched_entity *se = __node_2_se(node);
+
+               /* min_deadline is the current node */
+               if (se->deadline == se->min_deadline)
+                       return se;
+
+               /* min_deadline is in the left branch */
                if (node->rb_left &&
                    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
                        node = node->rb_left;
                        continue;
                }
 
+               /* else min_deadline is in the right branch */
                node = node->rb_right;
        }
+       return NULL;
+}
 
-       if (!best || (curr && deadline_gt(deadline, best, curr)))
-               best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *se = __pick_eevdf(cfs_rq);
 
-       if (unlikely(!best)) {
+       if (!se) {
                struct sched_entity *left = __pick_first_entity(cfs_rq);
                if (left) {
                        pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -926,7 +963,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
                }
        }
 
-       return best;
+       return se;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
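
The two-phase pick above relies on each rbtree node caching the smallest deadline in its subtree; once an eligible subtree is chosen, the owner of that deadline is reached by following whichever child shares the cached value. A minimal standalone sketch of that augmentation and descent, using a toy binary tree in userspace C rather than the kernel's augmented rbtree:

/* Standalone sketch (not kernel code): the min_deadline augmentation used by
 * the EEVDF pick. Each node caches the minimum deadline of its subtree, so a
 * lookup can descend straight to the entity holding that deadline. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct node {
	uint64_t deadline;
	uint64_t min_deadline;		/* min over this node and its children */
	struct node *left, *right;
};

static uint64_t min3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t m = a < b ? a : b;
	return m < c ? m : c;
}

/* Recompute the cached minimum, roughly what the kernel's augmented-rbtree
 * callback recomputes after inserts and rotations. */
static void update_min(struct node *n)
{
	uint64_t l = n->left  ? n->left->min_deadline  : UINT64_MAX;
	uint64_t r = n->right ? n->right->min_deadline : UINT64_MAX;

	n->min_deadline = min3(n->deadline, l, r);
}

/* Descend to the node that owns the subtree's min_deadline, as the second
 * loop in __pick_eevdf() does. */
static struct node *pick_min_deadline(struct node *n)
{
	while (n) {
		if (n->deadline == n->min_deadline)
			return n;
		if (n->left && n->left->min_deadline == n->min_deadline)
			n = n->left;
		else
			n = n->right;
	}
	return NULL;
}

int main(void)
{
	struct node a = { .deadline = 30 }, b = { .deadline = 10 }, c = { .deadline = 20 };

	a.left = &b; a.right = &c;
	update_min(&b); update_min(&c); update_min(&a);

	printf("picked deadline: %llu\n",
	       (unsigned long long)pick_min_deadline(&a)->deadline);	/* prints 10 */
	return 0;
}
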
@@ -2847,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p)
        }
 
        /* Cannot migrate task to CPU-less node */
-       if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
-               int near_nid = max_nid;
-               int distance, near_distance = INT_MAX;
-
-               for_each_node_state(nid, N_CPU) {
-                       distance = node_distance(max_nid, nid);
-                       if (distance < near_distance) {
-                               near_nid = nid;
-                               near_distance = distance;
-                       }
-               }
-               max_nid = near_nid;
-       }
+       max_nid = numa_nearest_node(max_nid, N_CPU);
 
        if (ng) {
                numa_group_count_active_nodes(ng);
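
numa_nearest_node() replaces the removed open-coded loop; conceptually it is still an arg-min over node_distance() restricted to nodes that have CPUs. A standalone sketch with a toy distance table (illustrative data only, not the kernel implementation):

/* Standalone sketch (not kernel code): nearest-node-with-CPUs selection. */
#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Toy node_distance() matrix; node 1 is CPU-less in this example. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 40, 30, 20, 10 },
};
static const int node_has_cpu[NR_NODES] = { 1, 0, 1, 1 };

static int nearest_node_with_cpu(int from)
{
	int near_nid = from, near_distance = INT_MAX;

	for (int nid = 0; nid < NR_NODES; nid++) {
		if (!node_has_cpu[nid])
			continue;
		if (dist[from][nid] < near_distance) {
			near_nid = nid;
			near_distance = dist[from][nid];
		}
	}
	return near_nid;
}

int main(void)
{
	/* Preferred node 1 has no CPUs; fall back to the closest node that does. */
	printf("nearest CPU node to 1: %d\n", nearest_node_with_cpu(1));	/* node 0, distance 20 */
	return 0;
}
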
@@ -3130,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
        p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
        unsigned long pids;
        /*
@@ -3142,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
        if (READ_ONCE(current->mm->numa_scan_seq) < 2)
                return true;
 
-       pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
-       return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+       pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+       if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+               return true;
+
+       /*
+        * Complete a scan that has already started regardless of PID access, or
+        * some VMAs may never be scanned in multi-threaded applications:
+        */
+       if (mm->numa_scan_offset > vma->vm_start) {
+               trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+               return true;
+       }
+
+       return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
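
vma_is_accessed() answers "has this PID faulted in the VMA recently?" by hashing the PID down to ilog2(BITS_PER_LONG) bits and testing that bit in the union of the two pids_active[] windows. A standalone sketch of that mapping (userspace C; the multiplicative constant is assumed to be the kernel's 32-bit golden-ratio value, BITS_PER_LONG is assumed to be 64, and a fixed-width type stands in for unsigned long):

/* Standalone sketch (not kernel code): how a PID maps to one bit of the
 * pids_active[] window. */
#include <stdio.h>
#include <stdint.h>

#define ILOG2_BPL	6		/* ilog2(64) */

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	/* Multiplicative hash, keeping the top 'bits' bits of the product. */
	return (val * 0x61C88647u) >> (32 - bits);
}

int main(void)
{
	uint64_t pids_active[2] = { 0, 0 };
	uint32_t pid = 1234;
	unsigned int bit = hash_32(pid, ILOG2_BPL);	/* always < 64 */

	pids_active[1] |= 1ULL << bit;			/* record a fault by this PID */

	uint64_t window = pids_active[0] | pids_active[1];
	printf("pid %u -> bit %u, accessed: %d\n",
	       pid, bit, !!(window & (1ULL << bit)));
	return 0;
}
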
@@ -3163,6 +3200,8 @@ static void task_numa_work(struct callback_head *work)
        unsigned long nr_pte_updates = 0;
        long pages, virtpages;
        struct vma_iterator vmi;
+       bool vma_pids_skipped;
+       bool vma_pids_forced = false;
 
        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3205,7 +3244,6 @@ static void task_numa_work(struct callback_head *work)
         */
        p->node_stamp += 2 * TICK_NSEC;
 
-       start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
        virtpages = pages * 8;     /* Scan up to this much virtual space */
@@ -3215,6 +3253,16 @@ static void task_numa_work(struct callback_head *work)
 
        if (!mmap_read_trylock(mm))
                return;
+
+       /*
+        * VMAs are skipped if the current PID has not trapped a fault within
+        * the VMA recently. Allow scanning to be forced if there is no
+        * suitable VMA remaining.
+        */
+       vma_pids_skipped = false;
+
+retry_pids:
+       start = mm->numa_scan_offset;
        vma_iter_init(&vmi, mm, start);
        vma = vma_next(&vmi);
        if (!vma) {
@@ -3227,6 +3275,7 @@ static void task_numa_work(struct callback_head *work)
        do {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
                        continue;
                }
 
@@ -3237,15 +3286,19 @@ static void task_numa_work(struct callback_head *work)
                 * as migrating the pages will be of marginal benefit.
                 */
                if (!vma->vm_mm ||
-                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
                        continue;
+               }
 
                /*
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
-               if (!vma_is_accessible(vma))
+               if (!vma_is_accessible(vma)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
                        continue;
+               }
 
                /* Initialise new per-VMA NUMAB state. */
                if (!vma->numab_state) {
@@ -3258,8 +3311,15 @@ static void task_numa_work(struct callback_head *work)
                                msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
                        /* Reset happens after 4 times scan delay of scan start */
-                       vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+                       vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
                                msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+                       /*
+                        * Ensure prev_scan_seq does not match numa_scan_seq,
+                        * to prevent VMAs being skipped prematurely on the
+                        * first scan:
+                        */
+                       vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
                }
 
                /*
@@ -3267,23 +3327,35 @@ static void task_numa_work(struct callback_head *work)
                 * delay the scan for new VMAs.
                 */
                if (mm->numa_scan_seq && time_before(jiffies,
-                                               vma->numab_state->next_scan))
+                                               vma->numab_state->next_scan)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
                        continue;
+               }
+
+               /* RESET access PIDs regularly for old VMAs. */
+               if (mm->numa_scan_seq &&
+                               time_after(jiffies, vma->numab_state->pids_active_reset)) {
+                       vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+                       vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+                       vma->numab_state->pids_active[1] = 0;
+               }
 
-               /* Do not scan the VMA if task has not accessed */
-               if (!vma_is_accessed(vma))
+               /* Do not rescan VMAs twice within the same sequence. */
+               if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+                       mm->numa_scan_offset = vma->vm_end;
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
                        continue;
+               }
 
                /*
-                * RESET access PIDs regularly for old VMAs. Resetting after checking
-                * vma for recent access to avoid clearing PID info before access..
+                * Do not scan the VMA if task has not accessed it, unless no other
+                * VMA candidate exists.
                 */
-               if (mm->numa_scan_seq &&
-                               time_after(jiffies, vma->numab_state->next_pid_reset)) {
-                       vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
-                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-                       vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
-                       vma->numab_state->access_pids[1] = 0;
+               if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+                       vma_pids_skipped = true;
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+                       continue;
                }
 
                do {
@@ -3310,8 +3382,28 @@ static void task_numa_work(struct callback_head *work)
 
                        cond_resched();
                } while (end != vma->vm_end);
+
+               /* VMA scan is complete, do not scan until next sequence. */
+               vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+               /*
+                * Only force scan within one VMA at a time, to limit the
+                * cost of scanning a potentially uninteresting VMA.
+                */
+               if (vma_pids_forced)
+                       break;
        } for_each_vma(vmi, vma);
 
+       /*
+        * If no VMAs are remaining and VMAs were skipped due to the PID
+        * not accessing the VMA previously, then force a scan to ensure
+        * forward progress:
+        */
+       if (!vma && !vma_pids_forced && vma_pids_skipped) {
+               vma_pids_forced = true;
+               goto retry_pids;
+       }
+
 out:
        /*
         * It is possible to reach the end of the VMA list but the last few
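
The retry_pids/vma_pids_forced logic above gives the scanner a two-pass shape: the first pass honours the per-VMA PID filter, and only if every remaining candidate was skipped for that reason does a second pass force a single VMA to be scanned so forward progress is guaranteed. A simplified standalone sketch of that control flow (userspace C; the "scanned nothing" test stands in for the kernel's check that the VMA iterator was exhausted):

/* Standalone sketch (not kernel code): two-pass NUMA VMA scan with a forced
 * second pass limited to one VMA. */
#include <stdio.h>
#include <stdbool.h>

#define NR_VMAS 4

static bool pid_recently_faulted[NR_VMAS] = { false, false, false, false };

int main(void)
{
	bool vma_pids_forced = false;
	bool vma_pids_skipped;
	int i, scanned = 0;

retry_pids:
	vma_pids_skipped = false;
	for (i = 0; i < NR_VMAS; i++) {
		if (!vma_pids_forced && !pid_recently_faulted[i]) {
			vma_pids_skipped = true;	/* NUMAB_SKIP_PID_INACTIVE */
			continue;
		}
		scanned++;
		if (vma_pids_forced)
			break;			/* force-scan at most one VMA per pass */
	}

	if (!scanned && !vma_pids_forced && vma_pids_skipped) {
		vma_pids_forced = true;
		goto retry_pids;
	}

	printf("scanned %d VMA(s), forced=%d\n", scanned, vma_pids_forced);
	return 0;
}
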
@@ -3605,6 +3697,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                 */
                deadline = div_s64(deadline * old_weight, weight);
                se->deadline = se->vruntime + deadline;
+               if (se != cfs_rq->curr)
+                       min_deadline_cb_propagate(&se->run_node, NULL);
        }
 
 #ifdef CONFIG_SMP
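
The deadline rescaling above keeps the remaining virtual slice consistent when an entity's weight changes: the span deadline - vruntime shrinks in proportion to old_weight / weight. A worked standalone example with illustrative numbers (not kernel code):

/* Standalone sketch (not kernel code): rescaling the remaining virtual
 * deadline on reweight. Doubling the weight halves the remaining vslice. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t vruntime   = 1000000;
	int64_t deadline   = 1000000 + 6000000;	/* vruntime + 6ms worth of vslice */
	int64_t old_weight = 1024;		/* nice 0 */
	int64_t new_weight = 2048;		/* twice the weight */

	int64_t vslice = deadline - vruntime;

	vslice   = (vslice * old_weight) / new_weight;	/* div_s64() in the kernel */
	deadline = vruntime + vslice;

	printf("new vslice: %lld ns, new deadline: %lld\n",
	       (long long)vslice, (long long)deadline);	/* 3000000, 4000000 */
	return 0;
}
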
@@ -3888,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-       long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+       long delta;
+       u64 now;
 
        /*
         * No need to update load_avg for root_task_group as it is not used.
@@ -3896,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
        if (cfs_rq->tg == &root_task_group)
                return;
 
+       /*
+        * For migration-heavy workloads, access to tg->load_avg can be
+        * unbounded. Limit the update rate to at most once per ms.
+        */
+       now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+       if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+               return;
+
+       delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
        if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+               cfs_rq->last_update_tg_load_avg = now;
        }
 }
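
The new last_update_tg_load_avg check is a plain rate limiter: skip the cross-CPU tg->load_avg update unless at least a millisecond has passed since the last one. A standalone sketch of the same pattern, with a monotonic clock standing in for sched_clock_cpu() (userspace C, illustrative only):

/* Standalone sketch (not kernel code): "at most once per ms" update filter. */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_MSEC	1000000ULL

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t last_update = 0;
	int updates = 0;

	for (int i = 0; i < 1000000; i++) {
		uint64_t now = now_ns();

		if (now - last_update < NSEC_PER_MSEC)
			continue;	/* too soon, skip the shared-counter write */

		updates++;		/* the expensive shared update would live here */
		last_update = now;
	}

	printf("%d rate-limited updates\n", updates);
	return 0;
}
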
 
@@ -4572,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
        return max(task_util(p), _task_util_est(p));
 }
 
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-                                            unsigned long uclamp_min,
-                                            unsigned long uclamp_max)
-{
-       return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-                                            unsigned long uclamp_min,
-                                            unsigned long uclamp_max)
-{
-       return task_util_est(p);
-}
-#endif
-
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                                    struct task_struct *p)
 {
@@ -4691,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
         * To avoid overestimation of actual task utilization, skip updates if
         * we cannot grant there is idle time in this CPU.
         */
-       if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+       if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
                return;
 
        /*
@@ -4739,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util,
                return fits;
 
        /*
-        * We must use capacity_orig_of() for comparing against uclamp_min and
+        * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
         * uclamp_max. We only care about capacity pressure (by using
         * capacity_of()) for comparing against the real util.
         *
         * If a task is boosted to 1024 for example, we don't want a tiny
         * pressure to skew the check whether it fits a CPU or not.
         *
-        * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+        * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
         * should fit a little cpu even if there's some pressure.
         *
         * Only exception is for thermal pressure since it has a direct impact
@@ -4758,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util,
         * For uclamp_max, we can tolerate a drop in performance level as the
         * goal is to cap the task. So it's okay if it's getting less.
         */
-       capacity_orig = capacity_orig_of(cpu);
+       capacity_orig = arch_scale_cpu_capacity(cpu);
        capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
        /*
@@ -4878,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
-       return true;
+       return !cfs_rq->nr_running;
 }
 
 #define UPDATE_TG      0x0
@@ -4919,10 +5008,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-       u64 vslice = calc_delta_fair(se->slice, se);
-       u64 vruntime = avg_vruntime(cfs_rq);
+       u64 vslice, vruntime = avg_vruntime(cfs_rq);
        s64 lag = 0;
 
+       se->slice = sysctl_sched_base_slice;
+       vslice = calc_delta_fair(se->slice, se);
+
        /*
         * Due to how V is constructed as the weighted average of entities,
         * adding tasks with positive lag, or removing tasks with negative lag
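
place_entity() now (re)sets se->slice to the base slice and converts it to virtual time with calc_delta_fair(), which scales wall-clock time by NICE_0_LOAD / weight, so heavier entities get a shorter vslice. A standalone sketch of that scaling with illustrative weights (a simplification of the kernel's fixed-point arithmetic, not the real implementation):

/* Standalone sketch (not kernel code): wall-clock slice to virtual slice. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD	1024ULL

static uint64_t calc_delta_fair_sketch(uint64_t delta, uint64_t weight)
{
	return delta * NICE_0_LOAD / weight;
}

int main(void)
{
	uint64_t base_slice = 750000;	/* sysctl_sched_base_slice, in ns */

	printf("weight 1024 vslice: %llu ns\n",
	       (unsigned long long)calc_delta_fair_sketch(base_slice, 1024));	/* 750000 */
	printf("weight 2048 vslice: %llu ns\n",
	       (unsigned long long)calc_delta_fair_sketch(base_slice, 2048));	/* 375000 */
	return 0;
}
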
@@ -5211,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
 {
        /*
         * Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -5755,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 
 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
-       struct cfs_rq *local_unthrottle = NULL;
        int this_cpu = smp_processor_id();
        u64 runtime, remaining = 1;
        bool throttled = false;
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *tmp;
        struct rq_flags rf;
        struct rq *rq;
+       LIST_HEAD(local_unthrottle);
 
        rcu_read_lock();
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5777,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
-#ifdef CONFIG_SMP
                /* Already queued for async unthrottle */
                if (!list_empty(&cfs_rq->throttled_csd_list))
                        goto next;
-#endif
 
                /* By the above checks, this should never be true */
                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5798,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
                /* we check whether we're throttled above */
                if (cfs_rq->runtime_remaining > 0) {
-                       if (cpu_of(rq) != this_cpu ||
-                           SCHED_WARN_ON(local_unthrottle))
+                       if (cpu_of(rq) != this_cpu) {
                                unthrottle_cfs_rq_async(cfs_rq);
-                       else
-                               local_unthrottle = cfs_rq;
+                       } else {
+                               /*
+                                * We currently only expect to be unthrottling
+                                * a single cfs_rq locally.
+                                */
+                               SCHED_WARN_ON(!list_empty(&local_unthrottle));
+                               list_add_tail(&cfs_rq->throttled_csd_list,
+                                             &local_unthrottle);
+                       }
                } else {
                        throttled = true;
                }
@@ -5810,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 next:
                rq_unlock_irqrestore(rq, &rf);
        }
-       rcu_read_unlock();
 
-       if (local_unthrottle) {
-               rq = cpu_rq(this_cpu);
+       list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+                                throttled_csd_list) {
+               struct rq *rq = rq_of(cfs_rq);
+
                rq_lock_irqsave(rq, &rf);
-               if (cfs_rq_throttled(local_unthrottle))
-                       unthrottle_cfs_rq(local_unthrottle);
+
+               list_del_init(&cfs_rq->throttled_csd_list);
+
+               if (cfs_rq_throttled(cfs_rq))
+                       unthrottle_cfs_rq(cfs_rq);
+
                rq_unlock_irqrestore(rq, &rf);
        }
+       SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+       rcu_read_unlock();
 
        return throttled;
 }
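
distribute_cfs_runtime() now parks cfs_rqs that must be unthrottled locally on an on-stack list while walking the shared throttled list, and processes them afterwards. A standalone sketch of that defer-to-a-local-list pattern (userspace C with a hand-rolled singly linked list standing in for the kernel's list_head):

/* Standalone sketch (not kernel code): defer local work to an on-stack list. */
#include <stdio.h>
#include <stddef.h>

struct item {
	int id;
	struct item *next;	/* stand-in for throttled_csd_list linkage */
};

int main(void)
{
	struct item a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct item *shared[] = { &a, &b, &c };
	struct item *local_unthrottle = NULL;	/* on-stack list head */

	/* First pass: walk the shared set, but defer items that are local. */
	for (size_t i = 0; i < 3; i++) {
		if (shared[i]->id == 2) {	/* "belongs to this CPU" */
			shared[i]->next = local_unthrottle;
			local_unthrottle = shared[i];
		}
	}

	/* Second pass: process and unlink the deferred items. */
	while (local_unthrottle) {
		struct item *it = local_unthrottle;

		local_unthrottle = it->next;
		it->next = NULL;
		printf("unthrottle item %d locally\n", it->id);
	}
	return 0;
}
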
@@ -6148,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
        INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7108,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
        struct sched_domain_shared *sd_share;
-       struct rq *this_rq = this_rq();
-       int this = smp_processor_id();
-       struct sched_domain *this_sd = NULL;
-       u64 time = 0;
 
        cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-       if (sched_feat(SIS_PROP) && !has_idle_core) {
-               u64 avg_cost, avg_idle, span_avg;
-               unsigned long now = jiffies;
-
-               this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-               if (!this_sd)
-                       return -1;
-
-               /*
-                * If we're busy, the assumption that the last idle period
-                * predicts the future is flawed; age away the remaining
-                * predicted idle time.
-                */
-               if (unlikely(this_rq->wake_stamp < now)) {
-                       while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-                               this_rq->wake_stamp++;
-                               this_rq->wake_avg_idle >>= 1;
-                       }
-               }
-
-               avg_idle = this_rq->wake_avg_idle;
-               avg_cost = this_sd->avg_scan_cost + 1;
-
-               span_avg = sd->span_weight * avg_idle;
-               if (span_avg > 4*avg_cost)
-                       nr = div_u64(span_avg, avg_cost);
-               else
-                       nr = 4;
-
-               time = cpu_clock(this);
-       }
-
        if (sched_feat(SIS_UTIL)) {
                sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
                if (sd_share) {
@@ -7158,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                }
        }
 
+       if (static_branch_unlikely(&sched_cluster_active)) {
+               struct sched_group *sg = sd->groups;
+
+               if (sg->flags & SD_CLUSTER) {
+                       for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+                               if (!cpumask_test_cpu(cpu, cpus))
+                                       continue;
+
+                               if (has_idle_core) {
+                                       i = select_idle_core(p, cpu, cpus, &idle_cpu);
+                                       if ((unsigned int)i < nr_cpumask_bits)
+                                               return i;
+                               } else {
+                                       if (--nr <= 0)
+                                               return -1;
+                                       idle_cpu = __select_idle_cpu(cpu, p);
+                                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
+                                               return idle_cpu;
+                               }
+                       }
+                       cpumask_andnot(cpus, cpus, sched_group_span(sg));
+               }
+       }
+
        for_each_cpu_wrap(cpu, cpus, target + 1) {
                if (has_idle_core) {
                        i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7165,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                                return i;
 
                } else {
-                       if (!--nr)
+                       if (--nr <= 0)
                                return -1;
                        idle_cpu = __select_idle_cpu(cpu, p);
                        if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7176,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
        if (has_idle_core)
                set_idle_cores(target, false);
 
-       if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-               time = cpu_clock(this) - time;
-
-               /*
-                * Account for the scan cost of wakeups against the average
-                * idle time.
-                */
-               this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-               update_avg(&this_sd->avg_scan_cost, time);
-       }
-
        return idle_cpu;
 }
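
The cluster-aware branch above probes CPUs in the wakee's own cluster before the rest of the LLC, then masks the cluster out so no CPU is probed twice. A standalone sketch of that scan order (userspace C with toy masks, not the kernel's cpumask API):

/* Standalone sketch (not kernel code): cluster-first idle CPU scan. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

static bool cpu_idle[NR_CPUS]   = { false, false, false, false, false, true, false, false };
static bool in_cluster[NR_CPUS] = { true,  true,  true,  true,  false, false, false, false };

static int scan(const bool *allowed, bool skip_cluster)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!allowed[cpu] || (skip_cluster && in_cluster[cpu]))
			continue;
		if (cpu_idle[cpu])
			return cpu;
	}
	return -1;
}

int main(void)
{
	bool llc[NR_CPUS] = { true, true, true, true, true, true, true, true };

	int cpu = scan(in_cluster, false);	/* the wakee's cluster first */
	if (cpu < 0)
		cpu = scan(llc, true);		/* then the rest of the LLC */

	printf("picked CPU %d\n", cpu);		/* 5: first idle CPU outside the busy cluster */
	return 0;
}
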
 
@@ -7227,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
                 * Look for the CPU with best capacity.
                 */
                else if (fits < 0)
-                       cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+                       cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
 
                /*
                 * First, select CPU which fits better (-1 being better than 0).
@@ -7267,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        bool has_idle_core = false;
        struct sched_domain *sd;
        unsigned long task_util, util_min, util_max;
-       int i, recent_used_cpu;
+       int i, recent_used_cpu, prev_aff = -1;
 
        /*
         * On asymmetric system, update task utilization because we will check
@@ -7294,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         */
        if (prev != target && cpus_share_cache(prev, target) &&
            (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-           asym_fits_cpu(task_util, util_min, util_max, prev))
-               return prev;
+           asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+               if (!static_branch_unlikely(&sched_cluster_active) ||
+                   cpus_share_resources(prev, target))
+                       return prev;
+
+               prev_aff = prev;
+       }
 
        /*
         * Allow a per-cpu kthread to stack with the wakee if the
@@ -7322,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
            asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-               return recent_used_cpu;
+
+               if (!static_branch_unlikely(&sched_cluster_active) ||
+                   cpus_share_resources(recent_used_cpu, target))
+                       return recent_used_cpu;
+
+       } else {
+               recent_used_cpu = -1;
        }
 
        /*
@@ -7363,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
+       /*
+        * For cluster machines which share a lower-level resource (such as an
+        * L2 cache or the LLC tag), we prefer to find an idle CPU in the
+        * target's cluster first. But prev_cpu or recent_used_cpu may also be
+        * good candidates; use them if possible when no idle CPU was found in
+        * select_idle_cpu().
+        */
+       if ((unsigned int)prev_aff < nr_cpumask_bits)
+               return prev_aff;
+       if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+               return recent_used_cpu;
+
        return target;
 }
 
@@ -7469,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
                util = max(util, util_est);
        }
 
-       return min(util, capacity_orig_of(cpu));
+       return min(util, arch_scale_cpu_capacity(cpu));
 }
 
 unsigned long cpu_util_cfs(int cpu)
@@ -7621,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
 {
        unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
        unsigned long busy_time = eenv->pd_busy_time;
+       unsigned long energy;
 
        if (dst_cpu >= 0)
                busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
 
-       return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+       energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+       trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+       return energy;
 }
 
 /*
@@ -7700,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        target = prev_cpu;
 
        sync_entity_load_avg(&p->se);
-       if (!uclamp_task_util(p, p_util_min, p_util_max))
+       if (!task_util_est(p) && p_util_min == 0)
                goto unlock;
 
        eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7708,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        for (; pd; pd = pd->next) {
                unsigned long util_min = p_util_min, util_max = p_util_max;
                unsigned long cpu_cap, cpu_thermal_cap, util;
-               unsigned long cur_delta, max_spare_cap = 0;
+               long prev_spare_cap = -1, max_spare_cap = -1;
                unsigned long rq_util_min, rq_util_max;
-               unsigned long prev_spare_cap = 0;
+               unsigned long cur_delta, base_energy;
                int max_spare_cap_cpu = -1;
-               unsigned long base_energy;
                int fits, max_fits = -1;
 
                cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7775,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                                prev_spare_cap = cpu_cap;
                                prev_fits = fits;
                        } else if ((fits > max_fits) ||
-                                  ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+                                  ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
                                /*
                                 * Find the CPU with the maximum spare capacity
                                 * among the remaining CPUs in the performance
@@ -7787,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        }
                }
 
-               if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+               if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
                        continue;
 
                eenv_pd_busy_time(&eenv, cpus, p);
@@ -7795,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
                /* Evaluate the energy impact of using prev_cpu. */
-               if (prev_spare_cap > 0) {
+               if (prev_spare_cap > -1) {
                        prev_delta = compute_energy(&eenv, pd, cpus, p,
                                                    prev_cpu);
                        /* CPU utilization has changed */
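
The spare-capacity bookkeeping above switches from unsigned values with "0 means no candidate" to signed values with "-1 means no candidate", so a CPU whose spare capacity is genuinely zero can still be evaluated. A standalone illustration of the sentinel change (userspace C):

/* Standalone sketch (not kernel code): why -1 is a better "none" sentinel. */
#include <stdio.h>

int main(void)
{
	long max_spare_cap = -1;		/* no candidate yet */
	unsigned long spare_of_cpu = 0;		/* a fully busy but still usable CPU */

	if ((long)spare_of_cpu > max_spare_cap)
		max_spare_cap = spare_of_cpu;	/* now a real candidate, spare == 0 */

	printf("candidate found: %s (spare=%ld)\n",
	       max_spare_cap < 0 ? "no" : "yes", max_spare_cap);
	return 0;
}
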
@@ -7996,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -8009,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
        /*
         * This is possible from callers such as attach_tasks(), in which we
-        * unconditionally check_preempt_curr() after an enqueue (which may have
+        * unconditionally wakeup_preempt() after an enqueue (which may have
         * lead to a throttle).  This both saves work and prevents false
         * next-buddy nomination below.
         */
@@ -8101,7 +8205,7 @@ again:
                                goto again;
                }
 
-               se = pick_next_entity(cfs_rq, curr);
+               se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
 
@@ -8164,7 +8268,7 @@ again:
                        }
                }
 
-               se = pick_next_entity(cfs_rq, curr);
+               se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
 
@@ -8203,7 +8307,7 @@ simple:
                put_prev_task(rq, prev);
 
        do {
-               se = pick_next_entity(cfs_rq, NULL);
+               se = pick_next_entity(cfs_rq);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
@@ -8916,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 
        WARN_ON_ONCE(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
-       check_preempt_curr(rq, p, 0);
+       wakeup_preempt(rq, p, 0);
 }
 
 /*
@@ -9256,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
        unsigned long capacity = scale_rt_capacity(cpu);
        struct sched_group *sdg = sd->groups;
 
-       cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
        if (!capacity)
                capacity = 1;
 
@@ -9333,7 +9435,7 @@ static inline int
 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
        return ((rq->cpu_capacity * sd->imbalance_pct) <
-                               (rq->cpu_capacity_orig * 100));
+                               (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
 }
 
 /*
@@ -9344,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 {
        return rq->misfit_task_load &&
-               (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+               (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
                 check_cpu_capacity(rq, sd));
 }
 
@@ -9496,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
  * can only do it if @group is an SMT group and has exactly one busy CPU. Larger
  * imbalances in the number of CPUs are dealt with in find_busiest_group().
  *
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
  * proceed.
  *
  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
@@ -11195,13 +11297,15 @@ more_balance:
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
-                       raw_spin_rq_unlock_irqrestore(busiest, flags);
 
+                       preempt_disable();
+                       raw_spin_rq_unlock_irqrestore(busiest, flags);
                        if (active_balance) {
                                stop_one_cpu_nowait(cpu_of(busiest),
                                        active_load_balance_cpu_stop, busiest,
                                        &busiest->active_balance_work);
                        }
+                       preempt_enable();
                }
        } else {
                sd->nr_balance_failed = 0;
@@ -11509,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
  *   anywhere yet.
  */
-
 static inline int find_new_ilb(void)
 {
-       int ilb;
        const struct cpumask *hk_mask;
+       int ilb_cpu;
 
        hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
 
-       for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+       for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
 
-               if (ilb == smp_processor_id())
+               if (ilb_cpu == smp_processor_id())
                        continue;
 
-               if (idle_cpu(ilb))
-                       return ilb;
+               if (idle_cpu(ilb_cpu))
+                       return ilb_cpu;
        }
 
-       return nr_cpu_ids;
+       return -1;
 }
 
 /*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  */
 static void kick_ilb(unsigned int flags)
 {
@@ -11552,8 +11659,7 @@ static void kick_ilb(unsigned int flags)
                nohz.next_balance = jiffies+1;
 
        ilb_cpu = find_new_ilb();
-
-       if (ilb_cpu >= nr_cpu_ids)
+       if (ilb_cpu < 0)
                return;
 
        /*
@@ -11566,7 +11672,7 @@ static void kick_ilb(unsigned int flags)
 
        /*
         * This way we generate an IPI on the target CPU which
-        * is idle. And the softirq performing nohz idle load balance
+        * is idle, and the softirq performing NOHZ idle load balancing
         * will be run before returning from the IPI.
         */
        smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11595,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq)
 
        /*
         * None are in tickless mode and hence no need for NOHZ idle load
-        * balancing.
+        * balancing:
         */
        if (likely(!atomic_read(&nohz.nr_cpus)))
                return;
@@ -11617,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq)
        sd = rcu_dereference(rq->sd);
        if (sd) {
                /*
-                * If there's a CFS task and the current CPU has reduced
-                * capacity; kick the ILB to see if there's a better CPU to run
-                * on.
+                * If there's a runnable CFS task and the current CPU has reduced
+                * capacity, kick the ILB to see if there's a better CPU to run on:
                 */
                if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
                        flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11671,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq)
        if (sds) {
                /*
                 * If there is an imbalance between LLC domains (IOW we could
-                * increase the overall cache use), we need some less-loaded LLC
-                * domain to pull some load. Likewise, we may need to spread
+                * increase the overall cache utilization), we need a less-loaded LLC
+                * domain to pull some load from. Likewise, we may need to spread
                 * load within the current LLC domain (e.g. packed SMT cores but
                 * other CPUs are idle). We can't really know from here how busy
-                * the others are - so just get a nohz balance going if it looks
+                * the others are - so just get a NOHZ balance going if it looks
                 * like this LLC domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
@@ -11945,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 }
 
 /*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not be stopped on this
+ * CPU yet. nohz.idle_cpus_mask is updated only when the tick is stopped and is
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs entering/exiting idle, to avoid bottlenecks due to a
+ * high idle entry/exit rate (usec). So it is possible that _nohz_idle_balance()
+ * is called from this function on (this) CPU that is not yet in the mask.
+ * That's OK, because the goal of nohz_run_idle_balance() is to run the ILB only
+ * to update the blocked load of already-idle CPUs, without waking up one of
+ * those idle CPUs, and outside the preempt-disable / IRQs-off phase of the
+ * local CPU about to enter idle, because that can take a long time.
  */
 void nohz_run_idle_balance(int cpu)
 {
@@ -12391,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
                if (p->prio > oldprio)
                        resched_curr(rq);
        } else
-               check_preempt_curr(rq, p, 0);
+               wakeup_preempt(rq, p, 0);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12493,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
                if (task_current(rq, p))
                        resched_curr(rq);
                else
-                       check_preempt_curr(rq, p, 0);
+                       wakeup_preempt(rq, p, 0);
        }
 }
 
@@ -12852,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = {
        .yield_task             = yield_task_fair,
        .yield_to_task          = yield_to_task_fair,
 
-       .check_preempt_curr     = check_preempt_wakeup,
+       .wakeup_preempt         = check_preempt_wakeup_fair,
 
        .pick_next_task         = __pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,