kernel/smp.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Generic helpers for smp ipi calls
   4  *
   5  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
   6  */
   7
   8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   9
  10 #include <linux/irq_work.h>
  11 #include <linux/rcupdate.h>
  12 #include <linux/rculist.h>
  13 #include <linux/kernel.h>
  14 #include <linux/export.h>
  15 #include <linux/percpu.h>
  16 #include <linux/init.h>
  17 #include <linux/interrupt.h>
  18 #include <linux/gfp.h>
  19 #include <linux/smp.h>
  20 #include <linux/cpu.h>
  21 #include <linux/sched.h>
  22 #include <linux/sched/idle.h>
  23 #include <linux/hypervisor.h>
  24 #include <linux/sched/clock.h>
  25 #include <linux/nmi.h>
  26 #include <linux/sched/debug.h>
  27 #include <linux/jump_label.h>
  28
  29 #include <trace/events/ipi.h>
  30 #define CREATE_TRACE_POINTS
  31 #include <trace/events/csd.h>
  32 #undef CREATE_TRACE_POINTS
  33
  34 #include "smpboot.h"
  35 #include "sched/smp.h"
  36
  37 #define CSD_TYPE(_csd)  ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
  38
  39 struct call_function_data {
  40         call_single_data_t      __percpu *csd;
  41         cpumask_var_t           cpumask;
  42         cpumask_var_t           cpumask_ipi;
  43 };
  44
  45 static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
  46
  47 static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
  48
  49 static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
  50
  51 static void __flush_smp_call_function_queue(bool warn_cpu_offline);
  52
  53 int smpcfd_prepare_cpu(unsigned int cpu)
  54 {
  55         struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
  56
  57         if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
  58                                      cpu_to_node(cpu)))
  59                 return -ENOMEM;
  60         if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
  61                                      cpu_to_node(cpu))) {
  62                 free_cpumask_var(cfd->cpumask);
  63                 return -ENOMEM;
  64         }
  65         cfd->csd = alloc_percpu(call_single_data_t);
  66         if (!cfd->csd) {
  67                 free_cpumask_var(cfd->cpumask);
  68                 free_cpumask_var(cfd->cpumask_ipi);
  69                 return -ENOMEM;
  70         }
  71
  72         return 0;
  73 }
  74
  75 int smpcfd_dead_cpu(unsigned int cpu)
  76 {
  77         struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
  78
  79         free_cpumask_var(cfd->cpumask);
  80         free_cpumask_var(cfd->cpumask_ipi);
  81         free_percpu(cfd->csd);
  82         return 0;
  83 }
  84
  85 int smpcfd_dying_cpu(unsigned int cpu)
  86 {
  87         /*
  88          * The IPIs for the smp-call-function callbacks queued by other
  89          * CPUs might arrive late, either due to hardware latencies or
  90          * because this CPU disabled interrupts (inside stop-machine)
  91          * before the IPIs were sent. So flush out any pending callbacks
  92          * explicitly (without waiting for the IPIs to arrive), to
  93          * ensure that the outgoing CPU doesn't go offline with work
  94          * still pending.
  95          */
  96         __flush_smp_call_function_queue(false);
  97         irq_work_run();
  98         return 0;
  99 }
 100
 101 void __init call_function_init(void)
 102 {
 103         int i;
 104
 105         for_each_possible_cpu(i)
 106                 init_llist_head(&per_cpu(call_single_queue, i));
 107
 108         smpcfd_prepare_cpu(smp_processor_id());
 109 }
 110
 111 static __always_inline void
 112 send_call_function_single_ipi(int cpu)
 113 {
 114         if (call_function_single_prep_ipi(cpu)) {
 115                 trace_ipi_send_cpu(cpu, _RET_IP_,
 116                                    generic_smp_call_function_single_interrupt);
 117                 arch_send_call_function_single_ipi(cpu);
 118         }
 119 }
 120
 121 static __always_inline void
 122 send_call_function_ipi_mask(struct cpumask *mask)
 123 {
 124         trace_ipi_send_cpumask(mask, _RET_IP_,
 125                                generic_smp_call_function_single_interrupt);
 126         arch_send_call_function_ipi_mask(mask);
 127 }
 128
 129 static __always_inline void
 130 csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
 131 {
 132         trace_csd_function_entry(func, csd);
 133         func(info);
 134         trace_csd_function_exit(func, csd);
 135 }
 136
 137 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 138
 139 static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
 140
 141 /*
 142  * Parse the csdlock_debug= kernel boot parameter.
 143  *
 144  * If you need to restore the old "ext" value that once provided
 145  * additional debugging information, reapply the following commits:
 146  *
 147  * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 148  * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 149  */
 150 static int __init csdlock_debug(char *str)
 151 {
 152         int ret;
 153         unsigned int val = 0;
 154
 155         ret = get_option(&str, &val);
 156         if (ret) {
 157                 if (val)
 158                         static_branch_enable(&csdlock_debug_enabled);
 159                 else
 160                         static_branch_disable(&csdlock_debug_enabled);
 161         }
 162
 163         return 1;
 164 }
 165 __setup("csdlock_debug=", csdlock_debug);
 166
 167 static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
 168 static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
 169 static DEFINE_PER_CPU(void *, cur_csd_info);
 170
 171 static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
 172 module_param(csd_lock_timeout, ulong, 0444);
 173 static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
 174 module_param(panic_on_ipistall, int, 0444);
 175
 176 static atomic_t csd_bug_count = ATOMIC_INIT(0);
 177
 178 /* Record current CSD work for current CPU, NULL to erase. */
 179 static void __csd_lock_record(call_single_data_t *csd)
 180 {
 181         if (!csd) {
 182                 smp_mb(); /* NULL cur_csd after unlock. */
 183                 __this_cpu_write(cur_csd, NULL);
 184                 return;
 185         }
 186         __this_cpu_write(cur_csd_func, csd->func);
 187         __this_cpu_write(cur_csd_info, csd->info);
 188         smp_wmb(); /* func and info before csd. */
 189         __this_cpu_write(cur_csd, csd);
 190         smp_mb(); /* Update cur_csd before function call. */
 191                   /* Or before unlock, as the case may be. */
 192 }
 193
 194 static __always_inline void csd_lock_record(call_single_data_t *csd)
 195 {
 196         if (static_branch_unlikely(&csdlock_debug_enabled))
 197                 __csd_lock_record(csd);
 198 }
 199
 200 static int csd_lock_wait_getcpu(call_single_data_t *csd)
 201 {
 202         unsigned int csd_type;
 203
 204         csd_type = CSD_TYPE(csd);
 205         if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
 206                 return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */
 207         return -1;
 208 }
 209
 210 /*
 211  * Complain if too much time spent waiting.  Note that only
 212  * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 213  * so waiting on other types gets much less information.
 214  */
 215 static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
 216 {
 217         int cpu = -1;
 218         int cpux;
 219         bool firsttime;
 220         u64 ts2, ts_delta;
 221         call_single_data_t *cpu_cur_csd;
 222         unsigned int flags = READ_ONCE(csd->node.u_flags);
 223         unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC;
 224
 225         if (!(flags & CSD_FLAG_LOCK)) {
 226                 if (!unlikely(*bug_id))
 227                         return true;
 228                 cpu = csd_lock_wait_getcpu(csd);
 229                 pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
 230                          *bug_id, raw_smp_processor_id(), cpu);
 231                 return true;
 232         }
 233
 234         ts2 = sched_clock();
 235         /* How long since we last checked for a stuck CSD lock.*/
 236         ts_delta = ts2 - *ts1;
 237         if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0))
 238                 return false;
 239
 240         firsttime = !*bug_id;
 241         if (firsttime)
 242                 *bug_id = atomic_inc_return(&csd_bug_count);
 243         cpu = csd_lock_wait_getcpu(csd);
 244         if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
 245                 cpux = 0;
 246         else
 247                 cpux = cpu;
 248         cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
 249         /* How long since this CSD lock was stuck. */
 250         ts_delta = ts2 - ts0;
 251         pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
 252                  firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta,
 253                  cpu, csd->func, csd->info);
 254         /*
 255          * If the CSD lock is still stuck after 5 minutes, it is unlikely
 256          * to become unstuck. Use a signed comparison to avoid triggering
 257          * on underflows when the TSC is out of sync between sockets.
 258          */
 259         BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
 260         if (cpu_cur_csd && csd != cpu_cur_csd) {
 261                 pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
 262                          *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
 263                          READ_ONCE(per_cpu(cur_csd_info, cpux)));
 264         } else {
 265                 pr_alert("\tcsd: CSD lock (#%d) %s.\n",
 266                          *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
 267         }
 268         if (cpu >= 0) {
 269                 if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
 270                         dump_cpu_task(cpu);
 271                 if (!cpu_cur_csd) {
 272                         pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
 273                         arch_send_call_function_single_ipi(cpu);
 274                 }
 275         }
 276         if (firsttime)
 277                 dump_stack();
 278         *ts1 = ts2;
 279
 280         return false;
 281 }
 282
 283 /*
 284  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
 285  *
 286  * For non-synchronous ipi calls the csd can still be in use by the
 287  * previous function call. For multi-cpu calls its even more interesting
 288  * as we'll have to ensure no other cpu is observing our csd.
 289  */
 290 static void __csd_lock_wait(call_single_data_t *csd)
 291 {
 292         int bug_id = 0;
 293         u64 ts0, ts1;
 294
 295         ts1 = ts0 = sched_clock();
 296         for (;;) {
 297                 if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id))
 298                         break;
 299                 cpu_relax();
 300         }
 301         smp_acquire__after_ctrl_dep();
 302 }
 303
 304 static __always_inline void csd_lock_wait(call_single_data_t *csd)
 305 {
 306         if (static_branch_unlikely(&csdlock_debug_enabled)) {
 307                 __csd_lock_wait(csd);
 308                 return;
 309         }
 310
 311         smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 312 }
 313 #else
 314 static void csd_lock_record(call_single_data_t *csd)
 315 {
 316 }
 317
 318 static __always_inline void csd_lock_wait(call_single_data_t *csd)
 319 {
 320         smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
 321 }
 322 #endif
 323
 324 static __always_inline void csd_lock(call_single_data_t *csd)
 325 {
 326         csd_lock_wait(csd);
 327         csd->node.u_flags |= CSD_FLAG_LOCK;
 328
 329         /*
 330          * prevent CPU from reordering the above assignment
 331          * to ->flags with any subsequent assignments to other
 332          * fields of the specified call_single_data_t structure:
 333          */
 334         smp_wmb();
 335 }
 336
 337 static __always_inline void csd_unlock(call_single_data_t *csd)
 338 {
 339         WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));
 340
 341         /*
 342          * ensure we're all done before releasing data:
 343          */
 344         smp_store_release(&csd->node.u_flags, 0);
 345 }
 346
 347 static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
 348
 349 void __smp_call_single_queue(int cpu, struct llist_node *node)
 350 {
 351         /*
 352          * We have to check the type of the CSD before queueing it, because
 353          * once queued it can have its flags cleared by
 354          *   flush_smp_call_function_queue()
 355          * even if we haven't sent the smp_call IPI yet (e.g. the stopper
 356          * executes migration_cpu_stop() on the remote CPU).
 357          */
 358         if (trace_csd_queue_cpu_enabled()) {
 359                 call_single_data_t *csd;
 360                 smp_call_func_t func;
 361
 362                 csd = container_of(node, call_single_data_t, node.llist);
 363                 func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
 364                         sched_ttwu_pending : csd->func;
 365
 366                 trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
 367         }
 368
 369         /*
 370          * The list addition should be visible to the target CPU when it pops
 371          * the head of the list to pull the entry off it in the IPI handler
 372          * because of normal cache coherency rules implied by the underlying
 373          * llist ops.
 374          *
 375          * If IPIs can go out of order to the cache coherency protocol
 376          * in an architecture, sufficient synchronisation should be added
 377          * to arch code to make it appear to obey cache coherency WRT
 378          * locking and barrier primitives. Generic code isn't really
 379          * equipped to do the right thing...
 380          */
 381         if (llist_add(node, &per_cpu(call_single_queue, cpu)))
 382                 send_call_function_single_ipi(cpu);
 383 }
 384
 385 /*
 386  * Insert a previously allocated call_single_data_t element
 387  * for execution on the given CPU. data must already have
 388  * ->func, ->info, and ->flags set.
 389  */
 390 static int generic_exec_single(int cpu, call_single_data_t *csd)
 391 {
 392         if (cpu == smp_processor_id()) {
 393                 smp_call_func_t func = csd->func;
 394                 void *info = csd->info;
 395                 unsigned long flags;
 396
 397                 /*
 398                  * We can unlock early even for the synchronous on-stack case,
 399                  * since we're doing this from the same CPU..
 400                  */
 401                 csd_lock_record(csd);
 402                 csd_unlock(csd);
 403                 local_irq_save(flags);
 404                 csd_do_func(func, info, NULL);
 405                 csd_lock_record(NULL);
 406                 local_irq_restore(flags);
 407                 return 0;
 408         }
 409
 410         if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
 411                 csd_unlock(csd);
 412                 return -ENXIO;
 413         }
 414
 415         __smp_call_single_queue(cpu, &csd->node.llist);
 416
 417         return 0;
 418 }
 419
 420 /**
 421  * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 422  *
 423  * Invoked by arch to handle an IPI for call function single.
 424  * Must be called with interrupts disabled.
 425  */
 426 void generic_smp_call_function_single_interrupt(void)
 427 {
 428         __flush_smp_call_function_queue(true);
 429 }
 430
 431 /**
 432  * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 433  *
 434  * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 435  *                    offline CPU. Skip this check if set to 'false'.
 436  *
 437  * Flush any pending smp-call-function callbacks queued on this CPU. This is
 438  * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 439  * to ensure that all pending IPI callbacks are run before it goes completely
 440  * offline.
 441  *
 442  * Loop through the call_single_queue and run all the queued callbacks.
 443  * Must be called with interrupts disabled.
 444  */
 445 static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 446 {
 447         call_single_data_t *csd, *csd_next;
 448         struct llist_node *entry, *prev;
 449         struct llist_head *head;
 450         static bool warned;
 451         atomic_t *tbt;
 452
 453         lockdep_assert_irqs_disabled();
 454
 455         /* Allow waiters to send backtrace NMI from here onwards */
 456         tbt = this_cpu_ptr(&trigger_backtrace);
 457         atomic_set_release(tbt, 1);
 458
 459         head = this_cpu_ptr(&call_single_queue);
 460         entry = llist_del_all(head);
 461         entry = llist_reverse_order(entry);
 462
 463         /* There shouldn't be any pending callbacks on an offline CPU. */
 464         if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
 465                      !warned && entry != NULL)) {
 466                 warned = true;
 467                 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
 468
 469                 /*
 470                  * We don't have to use the _safe() variant here
 471                  * because we are not invoking the IPI handlers yet.
 472                  */
 473                 llist_for_each_entry(csd, entry, node.llist) {
 474                         switch (CSD_TYPE(csd)) {
 475                         case CSD_TYPE_ASYNC:
 476                         case CSD_TYPE_SYNC:
 477                         case CSD_TYPE_IRQ_WORK:
 478                                 pr_warn("IPI callback %pS sent to offline CPU\n",
 479                                         csd->func);
 480                                 break;
 481
 482                         case CSD_TYPE_TTWU:
 483                                 pr_warn("IPI task-wakeup sent to offline CPU\n");
 484                                 break;
 485
 486                         default:
 487                                 pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
 488                                         CSD_TYPE(csd));
 489                                 break;
 490                         }
 491                 }
 492         }
 493
 494         /*
 495          * First; run all SYNC callbacks, people are waiting for us.
 496          */
 497         prev = NULL;
 498         llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 499                 /* Do we wait until *after* callback? */
 500                 if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
 501                         smp_call_func_t func = csd->func;
 502                         void *info = csd->info;
 503
 504                         if (prev) {
 505                                 prev->next = &csd_next->node.llist;
 506                         } else {
 507                                 entry = &csd_next->node.llist;
 508                         }
 509
 510                         csd_lock_record(csd);
 511                         csd_do_func(func, info, csd);
 512                         csd_unlock(csd);
 513                         csd_lock_record(NULL);
 514                 } else {
 515                         prev = &csd->node.llist;
 516                 }
 517         }
 518
 519         if (!entry)
 520                 return;
 521
 522         /*
 523          * Second; run all !SYNC callbacks.
 524          */
 525         prev = NULL;
 526         llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
 527                 int type = CSD_TYPE(csd);
 528
 529                 if (type != CSD_TYPE_TTWU) {
 530                         if (prev) {
 531                                 prev->next = &csd_next->node.llist;
 532                         } else {
 533                                 entry = &csd_next->node.llist;
 534                         }
 535
 536                         if (type == CSD_TYPE_ASYNC) {
 537                                 smp_call_func_t func = csd->func;
 538                                 void *info = csd->info;
 539
 540                                 csd_lock_record(csd);
 541                                 csd_unlock(csd);
 542                                 csd_do_func(func, info, csd);
 543                                 csd_lock_record(NULL);
 544                         } else if (type == CSD_TYPE_IRQ_WORK) {
 545                                 irq_work_single(csd);
 546                         }
 547
 548                 } else {
 549                         prev = &csd->node.llist;
 550                 }
 551         }
 552
 553         /*
 554          * Third; only CSD_TYPE_TTWU is left, issue those.
 555          */
 556         if (entry) {
 557                 csd = llist_entry(entry, typeof(*csd), node.llist);
 558                 csd_do_func(sched_ttwu_pending, entry, csd);
 559         }
 560 }
 561
 562
 563 /**
 564  * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 565  *                                 from task context (idle, migration thread)
 566  *
 567  * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 568  * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 569  * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 570  * handle queued SMP function calls before scheduling.
 571  *
 572  * The migration thread has to ensure that an eventually pending wakeup has
 573  * been handled before it migrates a task.
 574  */
 575 void flush_smp_call_function_queue(void)
 576 {
 577         unsigned int was_pending;
 578         unsigned long flags;
 579
 580         if (llist_empty(this_cpu_ptr(&call_single_queue)))
 581                 return;
 582
 583         local_irq_save(flags);
 584         /* Get the already pending soft interrupts for RT enabled kernels */
 585         was_pending = local_softirq_pending();
 586         __flush_smp_call_function_queue(true);
 587         if (local_softirq_pending())
 588                 do_softirq_post_smp_call_flush(was_pending);
 589
 590         local_irq_restore(flags);
 591 }
 592
 593 /*
 594  * smp_call_function_single - Run a function on a specific CPU
 595  * @func: The function to run. This must be fast and non-blocking.
 596  * @info: An arbitrary pointer to pass to the function.
 597  * @wait: If true, wait until function has completed on other CPUs.
 598  *
 599  * Returns 0 on success, else a negative status code.
 600  */
 601 int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
 602                              int wait)
 603 {
 604         call_single_data_t *csd;
 605         call_single_data_t csd_stack = {
 606                 .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
 607         };
 608         int this_cpu;
 609         int err;
 610
 611         /*
 612          * prevent preemption and reschedule on another processor,
 613          * as well as CPU removal
 614          */
 615         this_cpu = get_cpu();
 616
 617         /*
 618          * Can deadlock when called with interrupts disabled.
 619          * We allow cpu's that are not yet online though, as no one else can
 620          * send smp call function interrupt to this cpu and as such deadlocks
 621          * can't happen.
 622          */
 623         WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
 624                      && !oops_in_progress);
 625
 626         /*
 627          * When @wait we can deadlock when we interrupt between llist_add() and
 628          * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
 629          * csd_lock() on because the interrupt context uses the same csd
 630          * storage.
 631          */
 632         WARN_ON_ONCE(!in_task());
 633
 634         csd = &csd_stack;
 635         if (!wait) {
 636                 csd = this_cpu_ptr(&csd_data);
 637                 csd_lock(csd);
 638         }
 639
 640         csd->func = func;
 641         csd->info = info;
 642 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 643         csd->node.src = smp_processor_id();
 644         csd->node.dst = cpu;
 645 #endif
 646
 647         err = generic_exec_single(cpu, csd);
 648
 649         if (wait)
 650                 csd_lock_wait(csd);
 651
 652         put_cpu();
 653
 654         return err;
 655 }
 656 EXPORT_SYMBOL(smp_call_function_single);
 657
 658 /**
 659  * smp_call_function_single_async() - Run an asynchronous function on a
 660  *                               specific CPU.
 661  * @cpu: The CPU to run on.
 662  * @csd: Pre-allocated and setup data structure
 663  *
 664  * Like smp_call_function_single(), but the call is asynchonous and
 665  * can thus be done from contexts with disabled interrupts.
 666  *
 667  * The caller passes his own pre-allocated data structure
 668  * (ie: embedded in an object) and is responsible for synchronizing it
 669  * such that the IPIs performed on the @csd are strictly serialized.
 670  *
 671  * If the function is called with one csd which has not yet been
 672  * processed by previous call to smp_call_function_single_async(), the
 673  * function will return immediately with -EBUSY showing that the csd
 674  * object is still in progress.
 675  *
 676  * NOTE: Be careful, there is unfortunately no current debugging facility to
 677  * validate the correctness of this serialization.
 678  *
 679  * Return: %0 on success or negative errno value on error
 680  */
 681 int smp_call_function_single_async(int cpu, call_single_data_t *csd)
 682 {
 683         int err = 0;
 684
 685         preempt_disable();
 686
 687         if (csd->node.u_flags & CSD_FLAG_LOCK) {
 688                 err = -EBUSY;
 689                 goto out;
 690         }
 691
 692         csd->node.u_flags = CSD_FLAG_LOCK;
 693         smp_wmb();
 694
 695         err = generic_exec_single(cpu, csd);
 696
 697 out:
 698         preempt_enable();
 699
 700         return err;
 701 }
 702 EXPORT_SYMBOL_GPL(smp_call_function_single_async);
 703
 704 /*
 705  * smp_call_function_any - Run a function on any of the given cpus
 706  * @mask: The mask of cpus it can run on.
 707  * @func: The function to run. This must be fast and non-blocking.
 708  * @info: An arbitrary pointer to pass to the function.
 709  * @wait: If true, wait until function has completed.
 710  *
 711  * Returns 0 on success, else a negative status code (if no cpus were online).
 712  *
 713  * Selection preference:
 714  *      1) current cpu if in @mask
 715  *      2) any cpu of current node if in @mask
 716  *      3) any other online cpu in @mask
 717  */
 718 int smp_call_function_any(const struct cpumask *mask,
 719                           smp_call_func_t func, void *info, int wait)
 720 {
 721         unsigned int cpu;
 722         const struct cpumask *nodemask;
 723         int ret;
 724
 725         /* Try for same CPU (cheapest) */
 726         cpu = get_cpu();
 727         if (cpumask_test_cpu(cpu, mask))
 728                 goto call;
 729
 730         /* Try for same node. */
 731         nodemask = cpumask_of_node(cpu_to_node(cpu));
 732         for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
 733              cpu = cpumask_next_and(cpu, nodemask, mask)) {
 734                 if (cpu_online(cpu))
 735                         goto call;
 736         }
 737
 738         /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
 739         cpu = cpumask_any_and(mask, cpu_online_mask);
 740 call:
 741         ret = smp_call_function_single(cpu, func, info, wait);
 742         put_cpu();
 743         return ret;
 744 }
 745 EXPORT_SYMBOL_GPL(smp_call_function_any);
 746
 747 /*
 748  * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 749  *
 750  * %SCF_WAIT:           Wait until function execution is completed
 751  * %SCF_RUN_LOCAL:      Run also locally if local cpu is set in cpumask
 752  */
 753 #define SCF_WAIT        (1U << 0)
 754 #define SCF_RUN_LOCAL   (1U << 1)
 755
 756 static void smp_call_function_many_cond(const struct cpumask *mask,
 757                                         smp_call_func_t func, void *info,
 758                                         unsigned int scf_flags,
 759                                         smp_cond_func_t cond_func)
 760 {
 761         int cpu, last_cpu, this_cpu = smp_processor_id();
 762         struct call_function_data *cfd;
 763         bool wait = scf_flags & SCF_WAIT;
 764         int nr_cpus = 0;
 765         bool run_remote = false;
 766         bool run_local = false;
 767
 768         lockdep_assert_preemption_disabled();
 769
 770         /*
 771          * Can deadlock when called with interrupts disabled.
 772          * We allow cpu's that are not yet online though, as no one else can
 773          * send smp call function interrupt to this cpu and as such deadlocks
 774          * can't happen.
 775          */
 776         if (cpu_online(this_cpu) && !oops_in_progress &&
 777             !early_boot_irqs_disabled)
 778                 lockdep_assert_irqs_enabled();
 779
 780         /*
 781          * When @wait we can deadlock when we interrupt between llist_add() and
 782          * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
 783          * csd_lock() on because the interrupt context uses the same csd
 784          * storage.
 785          */
 786         WARN_ON_ONCE(!in_task());
 787
 788         /* Check if we need local execution. */
 789         if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
 790                 run_local = true;
 791
 792         /* Check if we need remote execution, i.e., any CPU excluding this one. */
 793         cpu = cpumask_first_and(mask, cpu_online_mask);
 794         if (cpu == this_cpu)
 795                 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
 796         if (cpu < nr_cpu_ids)
 797                 run_remote = true;
 798
 799         if (run_remote) {
 800                 cfd = this_cpu_ptr(&cfd_data);
 801                 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
 802                 __cpumask_clear_cpu(this_cpu, cfd->cpumask);
 803
 804                 cpumask_clear(cfd->cpumask_ipi);
 805                 for_each_cpu(cpu, cfd->cpumask) {
 806                         call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
 807
 808                         if (cond_func && !cond_func(cpu, info)) {
 809                                 __cpumask_clear_cpu(cpu, cfd->cpumask);
 810                                 continue;
 811                         }
 812
 813                         csd_lock(csd);
 814                         if (wait)
 815                                 csd->node.u_flags |= CSD_TYPE_SYNC;
 816                         csd->func = func;
 817                         csd->info = info;
 818 #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
 819                         csd->node.src = smp_processor_id();
 820                         csd->node.dst = cpu;
 821 #endif
 822                         trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
 823
 824                         if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
 825                                 __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
 826                                 nr_cpus++;
 827                                 last_cpu = cpu;
 828                         }
 829                 }
 830
 831                 /*
 832                  * Choose the most efficient way to send an IPI. Note that the
 833                  * number of CPUs might be zero due to concurrent changes to the
 834                  * provided mask.
 835                  */
 836                 if (nr_cpus == 1)
 837                         send_call_function_single_ipi(last_cpu);
 838                 else if (likely(nr_cpus > 1))
 839                         send_call_function_ipi_mask(cfd->cpumask_ipi);
 840         }
 841
 842         if (run_local && (!cond_func || cond_func(this_cpu, info))) {
 843                 unsigned long flags;
 844
 845                 local_irq_save(flags);
 846                 csd_do_func(func, info, NULL);
 847                 local_irq_restore(flags);
 848         }
 849
 850         if (run_remote && wait) {
 851                 for_each_cpu(cpu, cfd->cpumask) {
 852                         call_single_data_t *csd;
 853
 854                         csd = per_cpu_ptr(cfd->csd, cpu);
 855                         csd_lock_wait(csd);
 856                 }
 857         }
 858 }
 859
 860 /**
 861  * smp_call_function_many(): Run a function on a set of CPUs.
 862  * @mask: The set of cpus to run on (only runs on online subset).
 863  * @func: The function to run. This must be fast and non-blocking.
 864  * @info: An arbitrary pointer to pass to the function.
 865  * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
 866  *        (atomically) until function has completed on other CPUs. If
 867  *        %SCF_RUN_LOCAL is set, the function will also be run locally
 868  *        if the local CPU is set in the @cpumask.
 869  *
 870  * If @wait is true, then returns once @func has returned.
 871  *
 872  * You must not call this function with disabled interrupts or from a
 873  * hardware interrupt handler or from a bottom half handler. Preemption
 874  * must be disabled when calling this function.
 875  */
 876 void smp_call_function_many(const struct cpumask *mask,
 877                             smp_call_func_t func, void *info, bool wait)
 878 {
 879         smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL);
 880 }
 881 EXPORT_SYMBOL(smp_call_function_many);
 882
 883 /**
 884  * smp_call_function(): Run a function on all other CPUs.
 885  * @func: The function to run. This must be fast and non-blocking.
 886  * @info: An arbitrary pointer to pass to the function.
 887  * @wait: If true, wait (atomically) until function has completed
 888  *        on other CPUs.
 889  *
 890  * Returns 0.
 891  *
 892  * If @wait is true, then returns once @func has returned; otherwise
 893  * it returns just before the target cpu calls @func.
 894  *
 895  * You must not call this function with disabled interrupts or from a
 896  * hardware interrupt handler or from a bottom half handler.
 897  */
 898 void smp_call_function(smp_call_func_t func, void *info, int wait)
 899 {
 900         preempt_disable();
 901         smp_call_function_many(cpu_online_mask, func, info, wait);
 902         preempt_enable();
 903 }
 904 EXPORT_SYMBOL(smp_call_function);
 905
/*
 * Configured maximum number of CPUs to activate. Defaults to NR_CPUS; the
 * "nosmp" and "maxcpus=" early parameters overwrite it, and smp_init()
 * passes it to bringup_nonboot_cpus() and smp_cpus_done().
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
 909
 910
 911 /*
 912  * Setup routine for controlling SMP activation
 913  *
 914  * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 915  * activation entirely (the MPS table probe still happens, though).
 916  *
 917  * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 918  * greater than 0, limits the maximum number of CPUs activated in
 919  * SMP mode to <NUM>.
 920  */
 921
/* Default no-op; declared __weak so an architecture can supply its own. */
void __weak __init arch_disable_smp_support(void) { }
 923
/*
 * Handler for the "nosmp" early parameter: zero setup_max_cpus and let the
 * architecture disable its SMP support. @str is not used. Always returns 0.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);
 933
 934 /* this is hard limit */
 935 static int __init nrcpus(char *str)
 936 {
 937         int nr_cpus;
 938
 939         if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids)
 940                 set_nr_cpu_ids(nr_cpus);
 941
 942         return 0;
 943 }
 944
 945 early_param("nr_cpus", nrcpus);
 946
 947 static int __init maxcpus(char *str)
 948 {
 949         get_option(&str, &setup_max_cpus);
 950         if (setup_max_cpus == 0)
 951                 arch_disable_smp_support();
 952
 953         return 0;
 954 }
 955
 956 early_param("maxcpus", maxcpus);
 957
#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/*
 * Number of possible processor ids. Starts out as NR_CPUS. The variable is
 * only defined here when NR_CPUS > 1 and CONFIG_FORCE_NR_CPUS is not set.
 */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif
 963
 964 /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
 965 void __init setup_nr_cpu_ids(void)
 966 {
 967         set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1);
 968 }
 969
 970 /* Called by boot processor to activate the rest. */
 971 void __init smp_init(void)
 972 {
 973         int num_nodes, num_cpus;
 974
 975         idle_threads_init();
 976         cpuhp_threads_init();
 977
 978         pr_info("Bringing up secondary CPUs ...\n");
 979
 980         bringup_nonboot_cpus(setup_max_cpus);
 981
 982         num_nodes = num_online_nodes();
 983         num_cpus  = num_online_cpus();
 984         pr_info("Brought up %d node%s, %d CPU%s\n",
 985                 num_nodes, (num_nodes > 1 ? "s" : ""),
 986                 num_cpus,  (num_cpus  > 1 ? "s" : ""));
 987
 988         /* Any cleanup work */
 989         smp_cpus_done(setup_max_cpus);
 990 }
 991
 992 /*
 993  * on_each_cpu_cond(): Call a function on each processor for which
 994  * the supplied function cond_func returns true, optionally waiting
 995  * for all the required CPUs to finish. This may include the local
 996  * processor.
 997  * @cond_func:  A callback function that is passed a cpu id and
 998  *              the info parameter. The function is called
 999  *              with preemption disabled. The function should
1000  *              return a blooean value indicating whether to IPI
1001  *              the specified CPU.
1002  * @func:       The function to run on all applicable CPUs.
1003  *              This must be fast and non-blocking.
1004  * @info:       An arbitrary pointer to pass to both functions.
1005  * @wait:       If true, wait (atomically) until function has
1006  *              completed on other CPUs.
1007  *
1008  * Preemption is disabled to protect against CPUs going offline but not online.
1009  * CPUs going online during the call will not be seen or sent an IPI.
1010  *
1011  * You must not call this function with disabled interrupts or
1012  * from a hardware interrupt handler or from a bottom half handler.
1013  */
1014 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
1015                            void *info, bool wait, const struct cpumask *mask)
1016 {
1017         unsigned int scf_flags = SCF_RUN_LOCAL;
1018
1019         if (wait)
1020                 scf_flags |= SCF_WAIT;
1021
1022         preempt_disable();
1023         smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
1024         preempt_enable();
1025 }
1026 EXPORT_SYMBOL(on_each_cpu_cond_mask);
1027
/* Empty callback that kick_all_cpus_sync() runs on the other CPUs. */
static void do_nothing(void *unused)
{
}
1031
/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all other cpus. The execution
 * of the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* wait == 1: do not return until do_nothing() ran on the other CPUs. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
1050
1051 /**
1052  * wake_up_all_idle_cpus - break all cpus out of idle
1053  * wake_up_all_idle_cpus try to break all cpus which is in idle state even
1054  * including idle polling cpus, for non-idle cpus, we will do nothing
1055  * for them.
1056  */
1057 void wake_up_all_idle_cpus(void)
1058 {
1059         int cpu;
1060
1061         for_each_possible_cpu(cpu) {
1062                 preempt_disable();
1063                 if (cpu != smp_processor_id() && cpu_online(cpu))
1064                         wake_up_if_idle(cpu);
1065                 preempt_enable();
1066         }
1067 }
1068 EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
1069
/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct queued on the target CPU
 * @done: &completion signalled once @func has returned
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: CPU number passed to hypervisor_pin_vcpu() around the call
 *       (%-1 for no pinning)
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
        struct work_struct      work;
        struct completion       done;
        int                     (*func)(void *);
        void                    *data;
        int                     ret;
        int                     cpu;
};
1091
1092 static void smp_call_on_cpu_callback(struct work_struct *work)
1093 {
1094         struct smp_call_on_cpu_struct *sscs;
1095
1096         sscs = container_of(work, struct smp_call_on_cpu_struct, work);
1097         if (sscs->cpu >= 0)
1098                 hypervisor_pin_vcpu(sscs->cpu);
1099         sscs->ret = sscs->func(sscs->data);
1100         if (sscs->cpu >= 0)
1101                 hypervisor_pin_vcpu(-1);
1102
1103         complete(&sscs->done);
1104 }
1105
1106 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
1107 {
1108         struct smp_call_on_cpu_struct sscs = {
1109                 .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
1110                 .func = func,
1111                 .data = par,
1112                 .cpu  = phys ? cpu : -1,
1113         };
1114
1115         INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);
1116
1117         if (cpu >= nr_cpu_ids || !cpu_online(cpu))
1118                 return -ENXIO;
1119
1120         queue_work_on(cpu, system_wq, &sscs.work);
1121         wait_for_completion(&sscs.done);
1122
1123         return sscs.ret;
1124 }
1125 EXPORT_SYMBOL_GPL(smp_call_on_cpu);