sched/core: Avoid obvious double update_rq_clock warning
authorHao Jia <jiahao.os@bytedance.com>
Sat, 30 Apr 2022 08:58:42 +0000 (16:58 +0800)
committerPeter Zijlstra <peterz@infradead.org>
Wed, 11 May 2022 14:27:11 +0000 (16:27 +0200)
When we use raw_spin_rq_lock() to acquire the rq lock and have to
update the rq clock while holding the lock, the kernel may issue
a WARN_DOUBLE_CLOCK warning.

Since we directly use raw_spin_rq_lock() to acquire rq lock instead of
rq_lock(), there is no corresponding change to rq->clock_update_flags.
In particular, we have obtained the rq lock of other CPUs, the
rq->clock_update_flags of this CPU may be RQCF_UPDATED at this time, and
then calling update_rq_clock() will trigger the WARN_DOUBLE_CLOCK warning.

So we need to clear RQCF_UPDATED of rq->clock_update_flags to avoid
the WARN_DOUBLE_CLOCK warning.

For the sched_rt_period_timer() and migrate_task_rq_dl() cases
we simply replace raw_spin_rq_lock()/raw_spin_rq_unlock() with
rq_lock()/rq_unlock().

For the {pull,push}_{rt,dl}_task() cases, we add the
double_rq_clock_clear_update() function to clear RQCF_UPDATED of
rq->clock_update_flags, and call double_rq_clock_clear_update()
before double_lock_balance()/double_rq_lock() returns to avoid the
WARN_DOUBLE_CLOCK warning.

Some call trace reports:
Call Trace 1:
 <IRQ>
 sched_rt_period_timer+0x10f/0x3a0
 ? enqueue_top_rt_rq+0x110/0x110
 __hrtimer_run_queues+0x1a9/0x490
 hrtimer_interrupt+0x10b/0x240
 __sysvec_apic_timer_interrupt+0x8a/0x250
 sysvec_apic_timer_interrupt+0x9a/0xd0
 </IRQ>
 <TASK>
 asm_sysvec_apic_timer_interrupt+0x12/0x20

Call Trace 2:
 <TASK>
 activate_task+0x8b/0x110
 push_rt_task.part.108+0x241/0x2c0
 push_rt_tasks+0x15/0x30
 finish_task_switch+0xaa/0x2e0
 ? __switch_to+0x134/0x420
 __schedule+0x343/0x8e0
 ? hrtimer_start_range_ns+0x101/0x340
 schedule+0x4e/0xb0
 do_nanosleep+0x8e/0x160
 hrtimer_nanosleep+0x89/0x120
 ? hrtimer_init_sleeper+0x90/0x90
 __x64_sys_nanosleep+0x96/0xd0
 do_syscall_64+0x34/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 3:
 <TASK>
 deactivate_task+0x93/0xe0
 pull_rt_task+0x33e/0x400
 balance_rt+0x7e/0x90
 __schedule+0x62f/0x8e0
 do_task_dead+0x3f/0x50
 do_exit+0x7b8/0xbb0
 do_group_exit+0x2d/0x90
 get_signal+0x9df/0x9e0
 ? preempt_count_add+0x56/0xa0
 ? __remove_hrtimer+0x35/0x70
 arch_do_signal_or_restart+0x36/0x720
 ? nanosleep_copyout+0x39/0x50
 ? do_nanosleep+0x131/0x160
 ? audit_filter_inodes+0xf5/0x120
 exit_to_user_mode_prepare+0x10f/0x1e0
 syscall_exit_to_user_mode+0x17/0x30
 do_syscall_64+0x40/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Call Trace 4:
 update_rq_clock+0x128/0x1a0
 migrate_task_rq_dl+0xec/0x310
 set_task_cpu+0x84/0x1e4
 try_to_wake_up+0x1d8/0x5c0
 wake_up_process+0x1c/0x30
 hrtimer_wakeup+0x24/0x3c
 __hrtimer_run_queues+0x114/0x270
 hrtimer_interrupt+0xe8/0x244
 arch_timer_handler_phys+0x30/0x50
 handle_percpu_devid_irq+0x88/0x140
 generic_handle_domain_irq+0x40/0x60
 gic_handle_irq+0x48/0xe0
 call_on_irq_stack+0x2c/0x60
 do_interrupt_handler+0x80/0x84

Steps to reproduce:
1. Enable CONFIG_SCHED_DEBUG when compiling the kernel
2. echo 1 > /sys/kernel/debug/clear_warn_once
   echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features
   echo "NO_RT_PUSH_IPI" > /sys/kernel/debug/sched/features
3. Run some rt/dl tasks that periodically work and sleep, e.g.
Create 2*n rt or dl (90% running) tasks via rt-app (on a system
with n CPUs), and Dietmar Eggemann reports Call Trace 4 when running
on PREEMPT_RT kernel.

Signed-off-by: Hao Jia <jiahao.os@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://lore.kernel.org/r/20220430085843.62939-2-jiahao.os@bytedance.com
kernel/sched/core.c
kernel/sched/deadline.c
kernel/sched/rt.c
kernel/sched/sched.h

index 48cfad152b86823b73cb9cd5d652db7982d90799..2c9f5e97682fbcb9cff6ed6bda356f6afc09b88f 100644 (file)
@@ -613,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
                swap(rq1, rq2);
 
        raw_spin_rq_lock(rq1);
-       if (__rq_lockp(rq1) == __rq_lockp(rq2))
-               return;
+       if (__rq_lockp(rq1) != __rq_lockp(rq2))
+               raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
 
-       raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+       double_rq_clock_clear_update(rq1, rq2);
 }
 #endif
 
index 6ae423627a7a7c5e832de8ae9854bb58463138c7..0ad2818e77a2b58ee4de286a068b4646af4d7891 100644 (file)
@@ -1830,6 +1830,7 @@ out:
 
 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
 {
+       struct rq_flags rf;
        struct rq *rq;
 
        if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1841,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
         * from try_to_wake_up(). Hence, p->pi_lock is locked, but
         * rq->lock is not... So, lock it
         */
-       raw_spin_rq_lock(rq);
+       rq_lock(rq, &rf);
        if (p->dl.dl_non_contending) {
                update_rq_clock(rq);
                sub_running_bw(&p->dl, &rq->dl);
@@ -1857,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
                        put_task_struct(p);
        }
        sub_rq_bw(&p->dl, &rq->dl);
-       raw_spin_rq_unlock(rq);
+       rq_unlock(rq, &rf);
 }
 
 static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
index a32c46889af89900de533c90011fd8c6c116fa10..7891c0f0e1ff732eff0aafd61057b528158a80c4 100644 (file)
@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                int enqueue = 0;
                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
                struct rq *rq = rq_of_rt_rq(rt_rq);
+               struct rq_flags rf;
                int skip;
 
                /*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                if (skip)
                        continue;
 
-               raw_spin_rq_lock(rq);
+               rq_lock(rq, &rf);
                update_rq_clock(rq);
 
                if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 
                if (enqueue)
                        sched_rt_rq_enqueue(rt_rq);
-               raw_spin_rq_unlock(rq);
+               rq_unlock(rq, &rf);
        }
 
        if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
index 7f338c53ce42cfd80ea235817119240bebc9ce43..fe4d1acb7e381159b73fb497e62219fa81cc6484 100644 (file)
@@ -2474,6 +2474,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
 }
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+       rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+       /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+       rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
 
 #ifdef CONFIG_SMP
 
@@ -2539,14 +2557,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
 {
-       if (__rq_lockp(this_rq) == __rq_lockp(busiest))
-               return 0;
-
-       if (likely(raw_spin_rq_trylock(busiest)))
+       if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+           likely(raw_spin_rq_trylock(busiest))) {
+               double_rq_clock_clear_update(this_rq, busiest);
                return 0;
+       }
 
        if (rq_order_less(this_rq, busiest)) {
                raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+               double_rq_clock_clear_update(this_rq, busiest);
                return 0;
        }
 
@@ -2640,6 +2659,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
        BUG_ON(rq1 != rq2);
        raw_spin_rq_lock(rq1);
        __acquire(rq2->lock);   /* Fake it out ;) */
+       double_rq_clock_clear_update(rq1, rq2);
 }
 
 /*