Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel...
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d1a765cdf6e41933a54d51a368840232e53f033d..2048138ce54b574a3ba56b9f6bf7b1cefac1fd32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,8 +51,6 @@
 
 #include <asm/switch_to.h>
 
-#include <linux/sched/cond_resched.h>
-
 #include "sched.h"
 #include "stats.h"
 #include "autogroup.h"
@@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
 unsigned int sysctl_sched_base_slice                   = 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
 
-/*
- * After fork, child runs first. If set to 0 (default) then
- * parent will (try to) run first.
- */
-unsigned int sysctl_sched_child_runs_first __read_mostly;
-
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
 int sched_thermal_decay_shift;
@@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
 
 #ifdef CONFIG_SYSCTL
 static struct ctl_table sched_fair_sysctls[] = {
-       {
-               .procname       = "sched_child_runs_first",
-               .data           = &sysctl_sched_child_runs_first,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec,
-       },
 #ifdef CONFIG_CFS_BANDWIDTH
        {
                .procname       = "sched_cfs_bandwidth_slice_us",
@@ -664,6 +649,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
        cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;
 }
 
+/*
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
+ * For this to be so, the result of this function must have a left bias.
+ */
 u64 avg_vruntime(struct cfs_rq *cfs_rq)
 {
        struct sched_entity *curr = cfs_rq->curr;
@@ -677,8 +666,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
                load += weight;
        }
 
-       if (load)
+       if (load) {
+               /* sign flips effective floor / ceil */
+               if (avg < 0)
+                       avg -= (load - 1);
                avg = div_s64(avg, load);
+       }
 
        return cfs_rq->min_vruntime + avg;
 }
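
The "avg < 0" adjustment above exists because C's signed division truncates toward zero, while this weighted average needs to round toward negative infinity so that an entity sitting exactly at the average stays eligible (the "left bias" the comment asks for). A minimal standalone sketch of that arithmetic, in plain userspace C with illustrative values rather than kernel code:

/* Standalone sketch (not kernel code): floor division for signed numerators,
 * mirroring the "avg < 0" adjustment in the hunk above. */
#include <stdio.h>
#include <stdint.h>

static int64_t div_floor(int64_t avg, int64_t load)
{
	if (avg < 0)
		avg -= (load - 1);	/* shift so truncation acts as a floor */
	return avg / load;		/* C truncates toward zero */
}

int main(void)
{
	/* -7 / 2 truncates to -3, but the floor is -4 */
	printf("trunc: %lld floor: %lld\n",
	       (long long)(-7 / 2), (long long)div_floor(-7, 2));
	return 0;
}

With the adjustment the quotient is always the floor, so the returned average vruntime never rounds past an entity that is exactly at the average.
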
@@ -864,14 +857,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
  *
  * Which allows an EDF like search on (sub)trees.
  */
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq)
 {
        struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
        struct sched_entity *curr = cfs_rq->curr;
        struct sched_entity *best = NULL;
+       struct sched_entity *best_left = NULL;
 
        if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
                curr = NULL;
+       best = curr;
 
        /*
         * Once selected, run a task until it either becomes non-eligible or
@@ -892,33 +887,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
                }
 
                /*
-                * If this entity has an earlier deadline than the previous
-                * best, take this one. If it also has the earliest deadline
-                * of its subtree, we're done.
+                * Now we heap search eligible trees for the best (min_)deadline
                 */
-               if (!best || deadline_gt(deadline, best, se)) {
+               if (!best || deadline_gt(deadline, best, se))
                        best = se;
-                       if (best->deadline == best->min_deadline)
-                               break;
-               }
 
                /*
-                * If the earlest deadline in this subtree is in the fully
-                * eligible left half of our space, go there.
+                * Every se in a left branch is eligible, keep track of the
+                * branch with the best min_deadline
                 */
+               if (node->rb_left) {
+                       struct sched_entity *left = __node_2_se(node->rb_left);
+
+                       if (!best_left || deadline_gt(min_deadline, best_left, left))
+                               best_left = left;
+
+                       /*
+                        * min_deadline is in the left branch. rb_left and all
+                        * descendants are eligible, so immediately switch to the second
+                        * loop.
+                        */
+                       if (left->min_deadline == se->min_deadline)
+                               break;
+               }
+
+               /* min_deadline is at this node, no need to look right */
+               if (se->deadline == se->min_deadline)
+                       break;
+
+               /* else min_deadline is in the right branch. */
+               node = node->rb_right;
+       }
+
+       /*
+        * We ran into an eligible node which is itself the best.
+        * (Or nr_running == 0 and both are NULL)
+        */
+       if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0)
+               return best;
+
+       /*
+        * Now best_left and all of its children are eligible, and we are just
+        * looking for deadline == min_deadline
+        */
+       node = &best_left->run_node;
+       while (node) {
+               struct sched_entity *se = __node_2_se(node);
+
+               /* min_deadline is the current node */
+               if (se->deadline == se->min_deadline)
+                       return se;
+
+               /* min_deadline is in the left branch */
                if (node->rb_left &&
                    __node_2_se(node->rb_left)->min_deadline == se->min_deadline) {
                        node = node->rb_left;
                        continue;
                }
 
+               /* else min_deadline is in the right branch */
                node = node->rb_right;
        }
+       return NULL;
+}
 
-       if (!best || (curr && deadline_gt(deadline, best, curr)))
-               best = curr;
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
+{
+       struct sched_entity *se = __pick_eevdf(cfs_rq);
 
-       if (unlikely(!best)) {
+       if (!se) {
                struct sched_entity *left = __pick_first_entity(cfs_rq);
                if (left) {
                        pr_err("EEVDF scheduling fail, picking leftmost\n");
@@ -926,7 +963,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
                }
        }
 
-       return best;
+       return se;
 }
 
 #ifdef CONFIG_SCHED_DEBUG
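
The two-phase pick above relies on each rbtree node caching the smallest deadline in its subtree; once an eligible subtree is chosen, the owner of that deadline is reached by following whichever child shares the cached value. A minimal standalone sketch of that augmentation and descent, using a toy binary tree in userspace C rather than the kernel's augmented rbtree:

/* Standalone sketch (not kernel code): the min_deadline augmentation used by
 * the EEVDF pick. Each node caches the minimum deadline of its subtree, so a
 * lookup can descend straight to the entity holding that deadline. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct node {
	uint64_t deadline;
	uint64_t min_deadline;		/* min over this node and its children */
	struct node *left, *right;
};

static uint64_t min3(uint64_t a, uint64_t b, uint64_t c)
{
	uint64_t m = a < b ? a : b;
	return m < c ? m : c;
}

/* Recompute the cached minimum, roughly what the kernel's augmented-rbtree
 * callback recomputes after inserts and rotations. */
static void update_min(struct node *n)
{
	uint64_t l = n->left  ? n->left->min_deadline  : UINT64_MAX;
	uint64_t r = n->right ? n->right->min_deadline : UINT64_MAX;

	n->min_deadline = min3(n->deadline, l, r);
}

/* Descend to the node that owns the subtree's min_deadline, as the second
 * loop in __pick_eevdf() does. */
static struct node *pick_min_deadline(struct node *n)
{
	while (n) {
		if (n->deadline == n->min_deadline)
			return n;
		if (n->left && n->left->min_deadline == n->min_deadline)
			n = n->left;
		else
			n = n->right;
	}
	return NULL;
}

int main(void)
{
	struct node a = { .deadline = 30 }, b = { .deadline = 10 }, c = { .deadline = 20 };

	a.left = &b; a.right = &c;
	update_min(&b); update_min(&c); update_min(&a);

	printf("picked deadline: %llu\n",
	       (unsigned long long)pick_min_deadline(&a)->deadline);	/* prints 10 */
	return 0;
}
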
@@ -2847,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p)
        }
 
        /* Cannot migrate task to CPU-less node */
-       if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
-               int near_nid = max_nid;
-               int distance, near_distance = INT_MAX;
-
-               for_each_node_state(nid, N_CPU) {
-                       distance = node_distance(max_nid, nid);
-                       if (distance < near_distance) {
-                               near_nid = nid;
-                               near_distance = distance;
-                       }
-               }
-               max_nid = near_nid;
-       }
+       max_nid = numa_nearest_node(max_nid, N_CPU);
 
        if (ng) {
                numa_group_count_active_nodes(ng);
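
numa_nearest_node() replaces the removed open-coded loop; conceptually it is still an arg-min over node_distance() restricted to nodes that have CPUs. A standalone sketch with a toy distance table (illustrative data only, not the kernel implementation):

/* Standalone sketch (not kernel code): nearest-node-with-CPUs selection. */
#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Toy node_distance() matrix; node 1 is CPU-less in this example. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 40 },
	{ 20, 10, 20, 30 },
	{ 30, 20, 10, 20 },
	{ 40, 30, 20, 10 },
};
static const int node_has_cpu[NR_NODES] = { 1, 0, 1, 1 };

static int nearest_node_with_cpu(int from)
{
	int near_nid = from, near_distance = INT_MAX;

	for (int nid = 0; nid < NR_NODES; nid++) {
		if (!node_has_cpu[nid])
			continue;
		if (dist[from][nid] < near_distance) {
			near_nid = nid;
			near_distance = dist[from][nid];
		}
	}
	return near_nid;
}

int main(void)
{
	/* Preferred node 1 has no CPUs; fall back to the closest node that does. */
	printf("nearest CPU node to 1: %d\n", nearest_node_with_cpu(1));	/* node 0, distance 20 */
	return 0;
}
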
@@ -3130,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
        p->mm->numa_scan_offset = 0;
 }
 
-static bool vma_is_accessed(struct vm_area_struct *vma)
+static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
 {
        unsigned long pids;
        /*
@@ -3142,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma)
        if (READ_ONCE(current->mm->numa_scan_seq) < 2)
                return true;
 
-       pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
-       return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+       pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
+       if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+               return true;
+
+       /*
+        * Complete a scan that has already started regardless of PID access, or
+        * some VMAs may never be scanned in multi-threaded applications:
+        */
+       if (mm->numa_scan_offset > vma->vm_start) {
+               trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
+               return true;
+       }
+
+       return false;
 }
 
 #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
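
vma_is_accessed() answers "has this PID faulted in the VMA recently?" by hashing the PID down to ilog2(BITS_PER_LONG) bits and testing that bit in the union of the two pids_active[] windows. A standalone sketch of that mapping (userspace C; the multiplicative constant is assumed to be the kernel's 32-bit golden-ratio value, BITS_PER_LONG is assumed to be 64, and a fixed-width type stands in for unsigned long):

/* Standalone sketch (not kernel code): how a PID maps to one bit of the
 * pids_active[] window. */
#include <stdio.h>
#include <stdint.h>

#define ILOG2_BPL	6		/* ilog2(64) */

static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	/* Multiplicative hash, keeping the top 'bits' bits of the product. */
	return (val * 0x61C88647u) >> (32 - bits);
}

int main(void)
{
	uint64_t pids_active[2] = { 0, 0 };
	uint32_t pid = 1234;
	unsigned int bit = hash_32(pid, ILOG2_BPL);	/* always < 64 */

	pids_active[1] |= 1ULL << bit;			/* record a fault by this PID */

	uint64_t window = pids_active[0] | pids_active[1];
	printf("pid %u -> bit %u, accessed: %d\n",
	       pid, bit, !!(window & (1ULL << bit)));
	return 0;
}
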
@@ -3163,6 +3200,8 @@ static void task_numa_work(struct callback_head *work)
        unsigned long nr_pte_updates = 0;
        long pages, virtpages;
        struct vma_iterator vmi;
+       bool vma_pids_skipped;
+       bool vma_pids_forced = false;
 
        SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
 
@@ -3205,7 +3244,6 @@ static void task_numa_work(struct callback_head *work)
         */
        p->node_stamp += 2 * TICK_NSEC;
 
-       start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
        virtpages = pages * 8;     /* Scan up to this much virtual space */
@@ -3215,6 +3253,16 @@ static void task_numa_work(struct callback_head *work)
 
        if (!mmap_read_trylock(mm))
                return;
+
+       /*
+        * VMAs are skipped if the current PID has not trapped a fault within
+        * the VMA recently. Allow scanning to be forced if there is no
+        * suitable VMA remaining.
+        */
+       vma_pids_skipped = false;
+
+retry_pids:
+       start = mm->numa_scan_offset;
        vma_iter_init(&vmi, mm, start);
        vma = vma_next(&vmi);
        if (!vma) {
@@ -3227,6 +3275,7 @@ static void task_numa_work(struct callback_head *work)
        do {
                if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                        is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
                        continue;
                }
 
@@ -3237,15 +3286,19 @@ static void task_numa_work(struct callback_head *work)
                 * as migrating the pages will be of marginal benefit.
                 */
                if (!vma->vm_mm ||
-                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
                        continue;
+               }
 
                /*
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
-               if (!vma_is_accessible(vma))
+               if (!vma_is_accessible(vma)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
                        continue;
+               }
 
                /* Initialise new per-VMA NUMAB state. */
                if (!vma->numab_state) {
@@ -3258,8 +3311,15 @@ static void task_numa_work(struct callback_head *work)
                                msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
 
                        /* Reset happens after 4 times scan delay of scan start */
-                       vma->numab_state->next_pid_reset =  vma->numab_state->next_scan +
+                       vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
                                msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+
+                       /*
+                        * Ensure prev_scan_seq does not match numa_scan_seq,
+                        * to prevent VMAs being skipped prematurely on the
+                        * first scan:
+                        */
+                       vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
                }
 
                /*
@@ -3267,23 +3327,35 @@ static void task_numa_work(struct callback_head *work)
                 * delay the scan for new VMAs.
                 */
                if (mm->numa_scan_seq && time_before(jiffies,
-                                               vma->numab_state->next_scan))
+                                               vma->numab_state->next_scan)) {
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
                        continue;
+               }
+
+               /* RESET access PIDs regularly for old VMAs. */
+               if (mm->numa_scan_seq &&
+                               time_after(jiffies, vma->numab_state->pids_active_reset)) {
+                       vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
+                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+                       vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
+                       vma->numab_state->pids_active[1] = 0;
+               }
 
-               /* Do not scan the VMA if task has not accessed */
-               if (!vma_is_accessed(vma))
+               /* Do not rescan VMAs twice within the same sequence. */
+               if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
+                       mm->numa_scan_offset = vma->vm_end;
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
                        continue;
+               }
 
                /*
-                * RESET access PIDs regularly for old VMAs. Resetting after checking
-                * vma for recent access to avoid clearing PID info before access..
+                * Do not scan the VMA if task has not accessed it, unless no other
+                * VMA candidate exists.
                 */
-               if (mm->numa_scan_seq &&
-                               time_after(jiffies, vma->numab_state->next_pid_reset)) {
-                       vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
-                               msecs_to_jiffies(VMA_PID_RESET_PERIOD);
-                       vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
-                       vma->numab_state->access_pids[1] = 0;
+               if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
+                       vma_pids_skipped = true;
+                       trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
+                       continue;
                }
 
                do {
@@ -3310,8 +3382,28 @@ static void task_numa_work(struct callback_head *work)
 
                        cond_resched();
                } while (end != vma->vm_end);
+
+               /* VMA scan is complete, do not scan until next sequence. */
+               vma->numab_state->prev_scan_seq = mm->numa_scan_seq;
+
+               /*
+                * Only force scan within one VMA at a time, to limit the
+                * cost of scanning a potentially uninteresting VMA.
+                */
+               if (vma_pids_forced)
+                       break;
        } for_each_vma(vmi, vma);
 
+       /*
+        * If no VMAs are remaining and VMAs were skipped due to the PID
+        * not accessing the VMA previously, then force a scan to ensure
+        * forward progress:
+        */
+       if (!vma && !vma_pids_forced && vma_pids_skipped) {
+               vma_pids_forced = true;
+               goto retry_pids;
+       }
+
 out:
        /*
         * It is possible to reach the end of the VMA list but the last few
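
The retry_pids/vma_pids_forced logic above gives the scanner a two-pass shape: the first pass honours the per-VMA PID filter, and only if every remaining candidate was skipped for that reason does a second pass force a single VMA to be scanned so forward progress is guaranteed. A simplified standalone sketch of that control flow (userspace C; the "scanned nothing" test stands in for the kernel's check that the VMA iterator was exhausted):

/* Standalone sketch (not kernel code): two-pass NUMA VMA scan with a forced
 * second pass limited to one VMA. */
#include <stdio.h>
#include <stdbool.h>

#define NR_VMAS 4

static bool pid_recently_faulted[NR_VMAS] = { false, false, false, false };

int main(void)
{
	bool vma_pids_forced = false;
	bool vma_pids_skipped;
	int i, scanned = 0;

retry_pids:
	vma_pids_skipped = false;
	for (i = 0; i < NR_VMAS; i++) {
		if (!vma_pids_forced && !pid_recently_faulted[i]) {
			vma_pids_skipped = true;	/* NUMAB_SKIP_PID_INACTIVE */
			continue;
		}
		scanned++;
		if (vma_pids_forced)
			break;			/* force-scan at most one VMA per pass */
	}

	if (!scanned && !vma_pids_forced && vma_pids_skipped) {
		vma_pids_forced = true;
		goto retry_pids;
	}

	printf("scanned %d VMA(s), forced=%d\n", scanned, vma_pids_forced);
	return 0;
}
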
@@ -3605,6 +3697,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                 */
                deadline = div_s64(deadline * old_weight, weight);
                se->deadline = se->vruntime + deadline;
+               if (se != cfs_rq->curr)
+                       min_deadline_cb_propagate(&se->run_node, NULL);
        }
 
 #ifdef CONFIG_SMP
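
The deadline rescaling above keeps the remaining virtual slice consistent when an entity's weight changes: the span deadline - vruntime shrinks in proportion to old_weight / weight. A worked standalone example with illustrative numbers (not kernel code):

/* Standalone sketch (not kernel code): rescaling the remaining virtual
 * deadline on reweight. Doubling the weight halves the remaining vslice. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int64_t vruntime   = 1000000;
	int64_t deadline   = 1000000 + 6000000;	/* vruntime + 6ms worth of vslice */
	int64_t old_weight = 1024;		/* nice 0 */
	int64_t new_weight = 2048;		/* twice the weight */

	int64_t vslice = deadline - vruntime;

	vslice   = (vslice * old_weight) / new_weight;	/* div_s64() in the kernel */
	deadline = vruntime + vslice;

	printf("new vslice: %lld ns, new deadline: %lld\n",
	       (long long)vslice, (long long)deadline);	/* 3000000, 4000000 */
	return 0;
}
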
@@ -3888,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  */
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
 {
-       long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+       long delta;
+       u64 now;
 
        /*
         * No need to update load_avg for root_task_group as it is not used.
@@ -3896,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
        if (cfs_rq->tg == &root_task_group)
                return;
 
+       /*
+        * For migration-heavy workloads, access to tg->load_avg can be
+        * unbounded. Limit the update rate to at most once per ms.
+        */
+       now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
+       if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
+               return;
+
+       delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
        if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
+               cfs_rq->last_update_tg_load_avg = now;
        }
 }
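
The new last_update_tg_load_avg check is a plain rate limiter: skip the cross-CPU tg->load_avg update unless at least a millisecond has passed since the last one. A standalone sketch of the same pattern, with a monotonic clock standing in for sched_clock_cpu() (userspace C, illustrative only):

/* Standalone sketch (not kernel code): "at most once per ms" update filter. */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define NSEC_PER_MSEC	1000000ULL

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t last_update = 0;
	int updates = 0;

	for (int i = 0; i < 1000000; i++) {
		uint64_t now = now_ns();

		if (now - last_update < NSEC_PER_MSEC)
			continue;	/* too soon, skip the shared-counter write */

		updates++;		/* the expensive shared update would live here */
		last_update = now;
	}

	printf("%d rate-limited updates\n", updates);
	return 0;
}
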
 
@@ -4572,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p)
        return max(task_util(p), _task_util_est(p));
 }
 
-#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-                                            unsigned long uclamp_min,
-                                            unsigned long uclamp_max)
-{
-       return clamp(task_util_est(p), uclamp_min, uclamp_max);
-}
-#else
-static inline unsigned long uclamp_task_util(struct task_struct *p,
-                                            unsigned long uclamp_min,
-                                            unsigned long uclamp_max)
-{
-       return task_util_est(p);
-}
-#endif
-
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
                                    struct task_struct *p)
 {
@@ -4691,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
         * To avoid overestimation of actual task utilization, skip updates if
         * we cannot grant there is idle time in this CPU.
         */
-       if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+       if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
                return;
 
        /*
@@ -4739,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util,
                return fits;
 
        /*
-        * We must use capacity_orig_of() for comparing against uclamp_min and
+        * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
         * uclamp_max. We only care about capacity pressure (by using
         * capacity_of()) for comparing against the real util.
         *
         * If a task is boosted to 1024 for example, we don't want a tiny
         * pressure to skew the check whether it fits a CPU or not.
         *
-        * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+        * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
         * should fit a little cpu even if there's some pressure.
         *
         * Only exception is for thermal pressure since it has a direct impact
@@ -4758,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util,
         * For uclamp_max, we can tolerate a drop in performance level as the
         * goal is to cap the task. So it's okay if it's getting less.
         */
-       capacity_orig = capacity_orig_of(cpu);
+       capacity_orig = arch_scale_cpu_capacity(cpu);
        capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
 
        /*
@@ -4878,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 
 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 {
-       return true;
+       return !cfs_rq->nr_running;
 }
 
 #define UPDATE_TG      0x0
@@ -4919,10 +5008,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-       u64 vslice = calc_delta_fair(se->slice, se);
-       u64 vruntime = avg_vruntime(cfs_rq);
+       u64 vslice, vruntime = avg_vruntime(cfs_rq);
        s64 lag = 0;
 
+       se->slice = sysctl_sched_base_slice;
+       vslice = calc_delta_fair(se->slice, se);
+
        /*
         * Due to how V is constructed as the weighted average of entities,
         * adding tasks with positive lag, or removing tasks with negative lag
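
place_entity() now (re)sets se->slice to the base slice and converts it to virtual time with calc_delta_fair(), which scales wall-clock time by NICE_0_LOAD / weight, so heavier entities get a shorter vslice. A standalone sketch of that scaling with illustrative weights (a simplification of the kernel's fixed-point arithmetic, not the real implementation):

/* Standalone sketch (not kernel code): wall-clock slice to virtual slice. */
#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD	1024ULL

static uint64_t calc_delta_fair_sketch(uint64_t delta, uint64_t weight)
{
	return delta * NICE_0_LOAD / weight;
}

int main(void)
{
	uint64_t base_slice = 750000;	/* sysctl_sched_base_slice, in ns */

	printf("weight 1024 vslice: %llu ns\n",
	       (unsigned long long)calc_delta_fair_sketch(base_slice, 1024));	/* 750000 */
	printf("weight 2048 vslice: %llu ns\n",
	       (unsigned long long)calc_delta_fair_sketch(base_slice, 2048));	/* 375000 */
	return 0;
}
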
@@ -5211,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+pick_next_entity(struct cfs_rq *cfs_rq)
 {
        /*
         * Enabling NEXT_BUDDY will affect latency but not fairness.
@@ -5755,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
 
 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 {
-       struct cfs_rq *local_unthrottle = NULL;
        int this_cpu = smp_processor_id();
        u64 runtime, remaining = 1;
        bool throttled = false;
-       struct cfs_rq *cfs_rq;
+       struct cfs_rq *cfs_rq, *tmp;
        struct rq_flags rf;
        struct rq *rq;
+       LIST_HEAD(local_unthrottle);
 
        rcu_read_lock();
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -5777,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
-#ifdef CONFIG_SMP
                /* Already queued for async unthrottle */
                if (!list_empty(&cfs_rq->throttled_csd_list))
                        goto next;
-#endif
 
                /* By the above checks, this should never be true */
                SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
@@ -5798,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 
                /* we check whether we're throttled above */
                if (cfs_rq->runtime_remaining > 0) {
-                       if (cpu_of(rq) != this_cpu ||
-                           SCHED_WARN_ON(local_unthrottle))
+                       if (cpu_of(rq) != this_cpu) {
                                unthrottle_cfs_rq_async(cfs_rq);
-                       else
-                               local_unthrottle = cfs_rq;
+                       } else {
+                               /*
+                                * We currently only expect to be unthrottling
+                                * a single cfs_rq locally.
+                                */
+                               SCHED_WARN_ON(!list_empty(&local_unthrottle));
+                               list_add_tail(&cfs_rq->throttled_csd_list,
+                                             &local_unthrottle);
+                       }
                } else {
                        throttled = true;
                }
@@ -5810,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
 next:
                rq_unlock_irqrestore(rq, &rf);
        }
-       rcu_read_unlock();
 
-       if (local_unthrottle) {
-               rq = cpu_rq(this_cpu);
+       list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
+                                throttled_csd_list) {
+               struct rq *rq = rq_of(cfs_rq);
+
                rq_lock_irqsave(rq, &rf);
-               if (cfs_rq_throttled(local_unthrottle))
-                       unthrottle_cfs_rq(local_unthrottle);
+
+               list_del_init(&cfs_rq->throttled_csd_list);
+
+               if (cfs_rq_throttled(cfs_rq))
+                       unthrottle_cfs_rq(cfs_rq);
+
                rq_unlock_irqrestore(rq, &rf);
        }
+       SCHED_WARN_ON(!list_empty(&local_unthrottle));
+
+       rcu_read_unlock();
 
        return throttled;
 }
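
distribute_cfs_runtime() now parks cfs_rqs that must be unthrottled locally on an on-stack list while walking the shared throttled list, and processes them afterwards. A standalone sketch of that defer-to-a-local-list pattern (userspace C with a hand-rolled singly linked list standing in for the kernel's list_head):

/* Standalone sketch (not kernel code): defer local work to an on-stack list. */
#include <stdio.h>
#include <stddef.h>

struct item {
	int id;
	struct item *next;	/* stand-in for throttled_csd_list linkage */
};

int main(void)
{
	struct item a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct item *shared[] = { &a, &b, &c };
	struct item *local_unthrottle = NULL;	/* on-stack list head */

	/* First pass: walk the shared set, but defer items that are local. */
	for (size_t i = 0; i < 3; i++) {
		if (shared[i]->id == 2) {	/* "belongs to this CPU" */
			shared[i]->next = local_unthrottle;
			local_unthrottle = shared[i];
		}
	}

	/* Second pass: process and unlink the deferred items. */
	while (local_unthrottle) {
		struct item *it = local_unthrottle;

		local_unthrottle = it->next;
		it->next = NULL;
		printf("unthrottle item %d locally\n", it->id);
	}
	return 0;
}
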
@@ -6148,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
        cfs_rq->runtime_enabled = 0;
        INIT_LIST_HEAD(&cfs_rq->throttled_list);
-#ifdef CONFIG_SMP
        INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
-#endif
 }
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -7108,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
        int i, cpu, idle_cpu = -1, nr = INT_MAX;
        struct sched_domain_shared *sd_share;
-       struct rq *this_rq = this_rq();
-       int this = smp_processor_id();
-       struct sched_domain *this_sd = NULL;
-       u64 time = 0;
 
        cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-       if (sched_feat(SIS_PROP) && !has_idle_core) {
-               u64 avg_cost, avg_idle, span_avg;
-               unsigned long now = jiffies;
-
-               this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-               if (!this_sd)
-                       return -1;
-
-               /*
-                * If we're busy, the assumption that the last idle period
-                * predicts the future is flawed; age away the remaining
-                * predicted idle time.
-                */
-               if (unlikely(this_rq->wake_stamp < now)) {
-                       while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-                               this_rq->wake_stamp++;
-                               this_rq->wake_avg_idle >>= 1;
-                       }
-               }
-
-               avg_idle = this_rq->wake_avg_idle;
-               avg_cost = this_sd->avg_scan_cost + 1;
-
-               span_avg = sd->span_weight * avg_idle;
-               if (span_avg > 4*avg_cost)
-                       nr = div_u64(span_avg, avg_cost);
-               else
-                       nr = 4;
-
-               time = cpu_clock(this);
-       }
-
        if (sched_feat(SIS_UTIL)) {
                sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
                if (sd_share) {
@@ -7158,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                }
        }
 
+       if (static_branch_unlikely(&sched_cluster_active)) {
+               struct sched_group *sg = sd->groups;
+
+               if (sg->flags & SD_CLUSTER) {
+                       for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+                               if (!cpumask_test_cpu(cpu, cpus))
+                                       continue;
+
+                               if (has_idle_core) {
+                                       i = select_idle_core(p, cpu, cpus, &idle_cpu);
+                                       if ((unsigned int)i < nr_cpumask_bits)
+                                               return i;
+                               } else {
+                                       if (--nr <= 0)
+                                               return -1;
+                                       idle_cpu = __select_idle_cpu(cpu, p);
+                                       if ((unsigned int)idle_cpu < nr_cpumask_bits)
+                                               return idle_cpu;
+                               }
+                       }
+                       cpumask_andnot(cpus, cpus, sched_group_span(sg));
+               }
+       }
+
        for_each_cpu_wrap(cpu, cpus, target + 1) {
                if (has_idle_core) {
                        i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7165,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                                return i;
 
                } else {
-                       if (!--nr)
+                       if (--nr <= 0)
                                return -1;
                        idle_cpu = __select_idle_cpu(cpu, p);
                        if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7176,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
        if (has_idle_core)
                set_idle_cores(target, false);
 
-       if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-               time = cpu_clock(this) - time;
-
-               /*
-                * Account for the scan cost of wakeups against the average
-                * idle time.
-                */
-               this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-               update_avg(&this_sd->avg_scan_cost, time);
-       }
-
        return idle_cpu;
 }
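
The cluster-aware branch above probes CPUs in the wakee's own cluster before the rest of the LLC, then masks the cluster out so no CPU is probed twice. A standalone sketch of that scan order (userspace C with toy masks, not the kernel's cpumask API):

/* Standalone sketch (not kernel code): cluster-first idle CPU scan. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

static bool cpu_idle[NR_CPUS]   = { false, false, false, false, false, true, false, false };
static bool in_cluster[NR_CPUS] = { true,  true,  true,  true,  false, false, false, false };

static int scan(const bool *allowed, bool skip_cluster)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!allowed[cpu] || (skip_cluster && in_cluster[cpu]))
			continue;
		if (cpu_idle[cpu])
			return cpu;
	}
	return -1;
}

int main(void)
{
	bool llc[NR_CPUS] = { true, true, true, true, true, true, true, true };

	int cpu = scan(in_cluster, false);	/* the wakee's cluster first */
	if (cpu < 0)
		cpu = scan(llc, true);		/* then the rest of the LLC */

	printf("picked CPU %d\n", cpu);		/* 5: first idle CPU outside the busy cluster */
	return 0;
}
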
 
@@ -7227,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
                 * Look for the CPU with best capacity.
                 */
                else if (fits < 0)
-                       cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
+                       cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
 
                /*
                 * First, select CPU which fits better (-1 being better than 0).
@@ -7267,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        bool has_idle_core = false;
        struct sched_domain *sd;
        unsigned long task_util, util_min, util_max;
-       int i, recent_used_cpu;
+       int i, recent_used_cpu, prev_aff = -1;
 
        /*
         * On asymmetric system, update task utilization because we will check
@@ -7294,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         */
        if (prev != target && cpus_share_cache(prev, target) &&
            (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-           asym_fits_cpu(task_util, util_min, util_max, prev))
-               return prev;
+           asym_fits_cpu(task_util, util_min, util_max, prev)) {
+
+               if (!static_branch_unlikely(&sched_cluster_active) ||
+                   cpus_share_resources(prev, target))
+                       return prev;
+
+               prev_aff = prev;
+       }
 
        /*
         * Allow a per-cpu kthread to stack with the wakee if the
@@ -7322,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
            (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
            cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
            asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-               return recent_used_cpu;
+
+               if (!static_branch_unlikely(&sched_cluster_active) ||
+                   cpus_share_resources(recent_used_cpu, target))
+                       return recent_used_cpu;
+
+       } else {
+               recent_used_cpu = -1;
        }
 
        /*
@@ -7363,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
+       /*
+        * For cluster machines which share a lower-level resource (such as an
+        * L2 cache or the LLC tag), we prefer to find an idle CPU in the
+        * target's cluster first. But prev_cpu or recent_used_cpu may also be
+        * good candidates; use them if possible when no idle CPU was found in
+        * select_idle_cpu().
+        */
+       if ((unsigned int)prev_aff < nr_cpumask_bits)
+               return prev_aff;
+       if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+               return recent_used_cpu;
+
        return target;
 }
 
@@ -7469,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
                util = max(util, util_est);
        }
 
-       return min(util, capacity_orig_of(cpu));
+       return min(util, arch_scale_cpu_capacity(cpu));
 }
 
 unsigned long cpu_util_cfs(int cpu)
@@ -7621,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd,
 {
        unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
        unsigned long busy_time = eenv->pd_busy_time;
+       unsigned long energy;
 
        if (dst_cpu >= 0)
                busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
 
-       return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+       energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
+
+       trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
+
+       return energy;
 }
 
 /*
@@ -7700,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        target = prev_cpu;
 
        sync_entity_load_avg(&p->se);
-       if (!uclamp_task_util(p, p_util_min, p_util_max))
+       if (!task_util_est(p) && p_util_min == 0)
                goto unlock;
 
        eenv_task_busy_time(&eenv, p, prev_cpu);
@@ -7708,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
        for (; pd; pd = pd->next) {
                unsigned long util_min = p_util_min, util_max = p_util_max;
                unsigned long cpu_cap, cpu_thermal_cap, util;
-               unsigned long cur_delta, max_spare_cap = 0;
+               long prev_spare_cap = -1, max_spare_cap = -1;
                unsigned long rq_util_min, rq_util_max;
-               unsigned long prev_spare_cap = 0;
+               unsigned long cur_delta, base_energy;
                int max_spare_cap_cpu = -1;
-               unsigned long base_energy;
                int fits, max_fits = -1;
 
                cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
@@ -7775,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                                prev_spare_cap = cpu_cap;
                                prev_fits = fits;
                        } else if ((fits > max_fits) ||
-                                  ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
+                                  ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
                                /*
                                 * Find the CPU with the maximum spare capacity
                                 * among the remaining CPUs in the performance
@@ -7787,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                        }
                }
 
-               if (max_spare_cap_cpu < 0 && prev_spare_cap == 0)
+               if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
                        continue;
 
                eenv_pd_busy_time(&eenv, cpus, p);
@@ -7795,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                base_energy = compute_energy(&eenv, pd, cpus, p, -1);
 
                /* Evaluate the energy impact of using prev_cpu. */
-               if (prev_spare_cap > 0) {
+               if (prev_spare_cap > -1) {
                        prev_delta = compute_energy(&eenv, pd, cpus, p,
                                                    prev_cpu);
                        /* CPU utilization has changed */
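
The spare-capacity bookkeeping above switches from unsigned values with "0 means no candidate" to signed values with "-1 means no candidate", so a CPU whose spare capacity is genuinely zero can still be evaluated. A standalone illustration of the sentinel change (userspace C):

/* Standalone sketch (not kernel code): why -1 is a better "none" sentinel. */
#include <stdio.h>

int main(void)
{
	long max_spare_cap = -1;		/* no candidate yet */
	unsigned long spare_of_cpu = 0;		/* a fully busy but still usable CPU */

	if ((long)spare_of_cpu > max_spare_cap)
		max_spare_cap = spare_of_cpu;	/* now a real candidate, spare == 0 */

	printf("candidate found: %s (spare=%ld)\n",
	       max_spare_cap < 0 ? "no" : "yes", max_spare_cap);
	return 0;
}
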
@@ -7996,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
 {
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
@@ -8009,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
        /*
         * This is possible from callers such as attach_tasks(), in which we
-        * unconditionally check_preempt_curr() after an enqueue (which may have
+        * unconditionally wakeup_preempt() after an enqueue (which may have
         * lead to a throttle).  This both saves work and prevents false
         * next-buddy nomination below.
         */
@@ -8101,7 +8205,7 @@ again:
                                goto again;
                }
 
-               se = pick_next_entity(cfs_rq, curr);
+               se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
 
@@ -8164,7 +8268,7 @@ again:
                        }
                }
 
-               se = pick_next_entity(cfs_rq, curr);
+               se = pick_next_entity(cfs_rq);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
 
@@ -8203,7 +8307,7 @@ simple:
                put_prev_task(rq, prev);
 
        do {
-               se = pick_next_entity(cfs_rq, NULL);
+               se = pick_next_entity(cfs_rq);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);
@@ -8916,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
 
        WARN_ON_ONCE(task_rq(p) != rq);
        activate_task(rq, p, ENQUEUE_NOCLOCK);
-       check_preempt_curr(rq, p, 0);
+       wakeup_preempt(rq, p, 0);
 }
 
 /*
@@ -9256,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
        unsigned long capacity = scale_rt_capacity(cpu);
        struct sched_group *sdg = sd->groups;
 
-       cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
-
        if (!capacity)
                capacity = 1;
 
@@ -9333,7 +9435,7 @@ static inline int
 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 {
        return ((rq->cpu_capacity * sd->imbalance_pct) <
-                               (rq->cpu_capacity_orig * 100));
+                               (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
 }
 
 /*
@@ -9344,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 {
        return rq->misfit_task_load &&
-               (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+               (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
                 check_cpu_capacity(rq, sd));
 }
 
@@ -9496,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
  * can only do it if @group is an SMT group and has exactly one busy CPU. Larger
  * imbalances in the number of CPUs are dealt with in find_busiest_group().
  *
- * If we are balancing load within an SMT core, or at DIE domain level, always
+ * If we are balancing load within an SMT core, or at PKG domain level, always
  * proceed.
  *
  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
@@ -11195,13 +11297,15 @@ more_balance:
                                busiest->push_cpu = this_cpu;
                                active_balance = 1;
                        }
-                       raw_spin_rq_unlock_irqrestore(busiest, flags);
 
+                       preempt_disable();
+                       raw_spin_rq_unlock_irqrestore(busiest, flags);
                        if (active_balance) {
                                stop_one_cpu_nowait(cpu_of(busiest),
                                        active_load_balance_cpu_stop, busiest,
                                        &busiest->active_balance_work);
                        }
+                       preempt_enable();
                }
        } else {
                sd->nr_balance_failed = 0;
@@ -11509,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq)
 
 #ifdef CONFIG_NO_HZ_COMMON
 /*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * NOHZ idle load balancing (ILB) details:
+ *
+ * - When one of the busy CPUs notices that there may be an idle rebalancing
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
+ *
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
  *   anywhere yet.
  */
-
 static inline int find_new_ilb(void)
 {
-       int ilb;
        const struct cpumask *hk_mask;
+       int ilb_cpu;
 
        hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
 
-       for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
+       for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
 
-               if (ilb == smp_processor_id())
+               if (ilb_cpu == smp_processor_id())
                        continue;
 
-               if (idle_cpu(ilb))
-                       return ilb;
+               if (idle_cpu(ilb_cpu))
+                       return ilb_cpu;
        }
 
-       return nr_cpu_ids;
+       return -1;
 }
 
 /*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
+ * SMP function call (IPI).
+ *
+ * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
  */
 static void kick_ilb(unsigned int flags)
 {
@@ -11552,8 +11659,7 @@ static void kick_ilb(unsigned int flags)
                nohz.next_balance = jiffies+1;
 
        ilb_cpu = find_new_ilb();
-
-       if (ilb_cpu >= nr_cpu_ids)
+       if (ilb_cpu < 0)
                return;
 
        /*
@@ -11566,7 +11672,7 @@ static void kick_ilb(unsigned int flags)
 
        /*
         * This way we generate an IPI on the target CPU which
-        * is idle. And the softirq performing nohz idle load balance
+        * is idle, and the softirq performing NOHZ idle load balancing
         * will be run before returning from the IPI.
         */
        smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
@@ -11595,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq)
 
        /*
         * None are in tickless mode and hence no need for NOHZ idle load
-        * balancing.
+        * balancing:
         */
        if (likely(!atomic_read(&nohz.nr_cpus)))
                return;
@@ -11617,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq)
        sd = rcu_dereference(rq->sd);
        if (sd) {
                /*
-                * If there's a CFS task and the current CPU has reduced
-                * capacity; kick the ILB to see if there's a better CPU to run
-                * on.
+                * If there's a runnable CFS task and the current CPU has reduced
+                * capacity, kick the ILB to see if there's a better CPU to run on:
                 */
                if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
                        flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
@@ -11671,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq)
        if (sds) {
                /*
                 * If there is an imbalance between LLC domains (IOW we could
-                * increase the overall cache use), we need some less-loaded LLC
-                * domain to pull some load. Likewise, we may need to spread
+                * increase the overall cache utilization), we need a less-loaded LLC
+                * domain to pull some load from. Likewise, we may need to spread
                 * load within the current LLC domain (e.g. packed SMT cores but
                 * other CPUs are idle). We can't really know from here how busy
-                * the others are - so just get a nohz balance going if it looks
+                * the others are - so just get a NOHZ balance going if it looks
                 * like this LLC domain has tasks we could move.
                 */
                nr_busy = atomic_read(&sds->nr_busy_cpus);
@@ -11945,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 }
 
 /*
- * Check if we need to run the ILB for updating blocked load before entering
- * idle state.
+ * Check if we need to directly run the ILB for updating blocked load before
+ * entering idle state. Here we run ILB directly without issuing IPIs.
+ *
+ * Note that when this function is called, the tick may not be stopped on this
+ * CPU yet. nohz.idle_cpus_mask is updated only when the tick is stopped and is
+ * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
+ * don't align with CPUs entering/exiting idle, to avoid bottlenecks due to a
+ * high idle entry/exit rate (usec). So it is possible that _nohz_idle_balance()
+ * is called from this function on (this) CPU that is not yet in the mask.
+ * That's OK, because the goal of nohz_run_idle_balance() is to run the ILB only
+ * to update the blocked load of already-idle CPUs, without waking up one of
+ * those idle CPUs, and outside the preempt-disable / IRQs-off phase of the
+ * local CPU about to enter idle, because that can take a long time.
  */
 void nohz_run_idle_balance(int cpu)
 {
@@ -12391,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
                if (p->prio > oldprio)
                        resched_curr(rq);
        } else
-               check_preempt_curr(rq, p, 0);
+               wakeup_preempt(rq, p, 0);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -12493,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
                if (task_current(rq, p))
                        resched_curr(rq);
                else
-                       check_preempt_curr(rq, p, 0);
+                       wakeup_preempt(rq, p, 0);
        }
 }
 
@@ -12852,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = {
        .yield_task             = yield_task_fair,
        .yield_to_task          = yield_to_task_fair,
 
-       .check_preempt_curr     = check_preempt_wakeup,
+       .wakeup_preempt         = check_preempt_wakeup_fair,
 
        .pick_next_task         = __pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,