Merge tag 'x86-core-2023-06-26' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/tip/tip

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)
diff --cc arch/x86/include/asm/cpu.h
Simple merge
diff --cc arch/x86/include/asm/smp.h

index 42060775a3d06bd8ed6cd73b0a3bf29a9582d1ba,5906aa914220526be7f79d3bfac99abceff97fa2..600cf25dbfc64b0f43a2257cabb50b5c0ce43883
--- 1/arch/x86/include/asm/smp.h
--- 2/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@@ -126,7 -130,10 +126,9 @@@ void native_play_dead(void)
   void play_dead_common(void);
   void wbinvd_on_cpu(int cpu);
   int wbinvd_on_all_cpus(void);
- -void cond_wakeup_cpu0(void);
   
+ void smp_kick_mwait_play_dead(void);
+ 
   void native_smp_send_reschedule(int cpu);
   void native_send_call_func_ipi(const struct cpumask *mask);
   void native_send_call_func_single_ipi(int cpu);
diff --cc arch/x86/kernel/smp.c
Simple merge
diff --cc arch/x86/kernel/smpboot.c

index 8de80608fd01cb3efb677817620008f901eb232b,4ee43396b9102481d6de4957b08d8d3f2f06e8ec..8779a7ed3e8776217240a0d1bd09ef85958365eb
--- 1/arch/x86/kernel/smpboot.c
--- 2/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@@ -103,12 -102,20 +104,26 @@@ EXPORT_PER_CPU_SYMBOL(cpu_die_map)
   DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
   EXPORT_PER_CPU_SYMBOL(cpu_info);
   
+ +/* CPUs which are the primary SMT threads */
+ +struct cpumask __cpu_primary_thread_mask __read_mostly;
+ +
+ +/* Representing CPUs for which sibling maps can be computed */
+ +static cpumask_var_t cpu_sibling_setup_mask;
+ +
+ struct mwait_cpu_dead {
+       unsigned int    control;
+       unsigned int    status;
+ };
+ 
+ #define CPUDEAD_MWAIT_WAIT    0xDEADBEEF
+ #define CPUDEAD_MWAIT_KEXEC_HLT       0x4A17DEAD
+ 
+ /*
+  * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+  * that it's unlikely to be touched by other CPUs.
+  */
+ static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+ 
   /* Logical package management. We might want to allocate that dynamically */
   unsigned int __max_logical_packages __read_mostly;
   EXPORT_SYMBOL(__max_logical_packages);
@@@ -161,19 -169,31 +176,23 @@@ static inline void smpboot_restore_warm
   
   }
   
- -/*
- - * Report back to the Boot Processor during boot time or to the caller processor
- - * during CPU online.
- - */
- -static void smp_callin(void)
+ +/* Run the next set of setup steps for the upcoming CPU */
+ +static void ap_starting(void)
   {
- -      int cpuid;
+ +      int cpuid = smp_processor_id();
   
+       /* Mop up eventual mwait_play_dead() wreckage */
+       this_cpu_write(mwait_cpu_dead.status, 0);
+       this_cpu_write(mwait_cpu_dead.control, 0);
+ 
         /*
- -       * If waken up by an INIT in an 82489DX configuration
- -       * cpu_callout_mask guarantees we don't get here before
- -       * an INIT_deassert IPI reaches our local APIC, so it is
- -       * now safe to touch our local APIC.
- -       */
- -      cpuid = smp_processor_id();
- -
- -      /*
- -       * the boot CPU has finished the init stage and is spinning
- -       * on callin_map until we finish. We are free to set up this
- -       * CPU, first the APIC. (this is probably redundant on most
- -       * boards)
+ +       * If woken up by an INIT in an 82489DX configuration the alive
+ +       * synchronization guarantees that the CPU does not reach this
+ +       * point before an INIT_deassert IPI reaches the local APIC, so it
+ +       * is now safe to touch the local APIC.
+ +       *
+ +       * Set up this CPU, first the APIC, which is probably redundant on
+ +       * most boards.
          */
         apic_ap_setup();
   
@@@ -826,21 -814,53 +845,16 @@@ static void __init smp_quirk_init_udela
   }
   
   /*
- - * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
- - * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
- - * won't ... remember to clear down the APIC, etc later.
+ + * Wake up AP by INIT, INIT, STARTUP sequence.
    */
- static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
- -int
- -wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
- -{
- -      u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
- -      unsigned long send_status, accept_status = 0;
- -      int maxlvt;
- -
- -      /* Target chip */
- -      /* Boot on the stack */
- -      /* Kick the second */
- -      apic_icr_write(APIC_DM_NMI | dm, apicid);
- -
- -      pr_debug("Waiting for send to finish...\n");
- -      send_status = safe_apic_wait_icr_idle();
- -
- -      /*
- -       * Give the other CPU some time to accept the IPI.
- -       */
- -      udelay(200);
- -      if (APIC_INTEGRATED(boot_cpu_apic_version)) {
- -              maxlvt = lapic_get_maxlvt();
- -              if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
- -                      apic_write(APIC_ESR, 0);
- -              accept_status = (apic_read(APIC_ESR) & 0xEF);
- -      }
- -      pr_debug("NMI sent\n");
- -
- -      if (send_status)
- -              pr_err("APIC never delivered???\n");
- -      if (accept_status)
- -              pr_err("APIC delivery error (%lx)\n", accept_status);
- -
- -      return (send_status | accept_status);
- -}
- -
+ static void send_init_sequence(int phys_apicid)
   {
-       unsigned long send_status = 0, accept_status = 0;
-       int maxlvt, num_starts, j;
- 
-       preempt_disable();
-       maxlvt = lapic_get_maxlvt();
+       int maxlvt = lapic_get_maxlvt();
   
-       /*
-        * Be paranoid about clearing APIC errors.
-        */
+       /* Be paranoid about clearing APIC errors. */
         if (APIC_INTEGRATED(boot_cpu_apic_version)) {
-               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
+               /* Due to the Pentium erratum 3AP.  */
+               if (maxlvt > 3)
                         apic_write(APIC_ESR, 0);
                 apic_read(APIC_ESR);
         }
@@@ -861,14 -871,20 +865,22 @@@
   
         udelay(init_udelay);
   
-       pr_debug("Deasserting INIT\n");
- 
-       /* Target chip */
-       /* Send IPI */
+       /* Deassert INIT on the target CPU */
         apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
+       safe_apic_wait_icr_idle();
+ }
   
-       pr_debug("Waiting for send to finish...\n");
-       send_status = safe_apic_wait_icr_idle();
+ /*
+  * Wake up AP by INIT, INIT, STARTUP sequence.
+  */
+ static int wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
+ {
+       unsigned long send_status = 0, accept_status = 0;
- -      int num_starts, j, maxlvt = lapic_get_maxlvt();
++      int num_starts, j, maxlvt;
+ 
++      preempt_disable();
++      maxlvt = lapic_get_maxlvt();
+       send_init_sequence(phys_apicid);
   
         mb();
   
@@@ -1651,11 -1840,60 +1679,58 @@@ static inline void mwait_play_dead(void
                  * case where we return around the loop.
                  */
                 mb();
-               clflush(mwait_ptr);
+               clflush(md);
                 mb();
-               __monitor(mwait_ptr, 0, 0);
+               __monitor(md, 0, 0);
                 mb();
                 __mwait(eax, 0);
- -
- -              cond_wakeup_cpu0();
+ 
+               if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
+                       /*
+                        * Kexec is about to happen. Don't go back into mwait() as
+                        * the kexec kernel might overwrite text and data including
+                        * page tables and stack. So mwait() would resume when the
+                        * monitor cache line is written to and then the CPU goes
+                        * south due to overwritten text, page tables and stack.
+                        *
+                        * Note: This does _NOT_ protect against a stray MCE, NMI,
+                        * SMI. They will resume execution at the instruction
+                        * following the HLT instruction and run into the problem
+                        * which this is trying to prevent.
+                        */
+                       WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
+                       while(1)
+                               native_halt();
+               }
+       }
+ }
+ 
+ /*
+  * Kick all "offline" CPUs out of mwait on kexec(). See comment in
+  * mwait_play_dead().
+  */
+ void smp_kick_mwait_play_dead(void)
+ {
+       u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
+       struct mwait_cpu_dead *md;
+       unsigned int cpu, i;
+ 
+       for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
+               md = per_cpu_ptr(&mwait_cpu_dead, cpu);
+ 
+               /* Does it sit in mwait_play_dead() ? */
+               if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
+                       continue;
+ 
+               /* Wait up to 5ms */
+               for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
+                       /* Bring it out of mwait */
+                       WRITE_ONCE(md->control, newstate);
+                       udelay(5);
+               }
+ 
+               if (READ_ONCE(md->status) != newstate)
+                       pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
         }
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 26 Jun 2023 21:45:53 +0000 (14:45 -0700)
		1	2
arch/x86/include/asm/cpu.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/smp.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/smp.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/smpboot.c	patch \|	diff1 \|	diff2 \|	blob \| history