kernel/rcupreempt.c

   1 /*
   2  * Read-Copy Update mechanism for mutual exclusion, realtime implementation
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17  *
  18  * Copyright IBM Corporation, 2006
  19  *
  20  * Authors: Paul E. McKenney <paulmck@us.ibm.com>
  21  *              With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
  22  *              for pushing me away from locks and towards counters, and
  23  *              to Suparna Bhattacharya for pushing me completely away
  24  *              from atomic instructions on the read side.
  25  *
  26  * Papers:  http://www.rdrop.com/users/paulmck/RCU
  27  *
  28  * Design Document: http://lwn.net/Articles/253651/
  29  *
  30  * For detailed explanation of Read-Copy Update mechanism see -
  31  *              Documentation/RCU/ *.txt
  32  *
  33  */
  34 #include <linux/types.h>
  35 #include <linux/kernel.h>
  36 #include <linux/init.h>
  37 #include <linux/spinlock.h>
  38 #include <linux/smp.h>
  39 #include <linux/rcupdate.h>
  40 #include <linux/interrupt.h>
  41 #include <linux/sched.h>
  42 #include <asm/atomic.h>
  43 #include <linux/bitops.h>
  44 #include <linux/module.h>
  45 #include <linux/completion.h>
  46 #include <linux/moduleparam.h>
  47 #include <linux/percpu.h>
  48 #include <linux/notifier.h>
  49 #include <linux/rcupdate.h>
  50 #include <linux/cpu.h>
  51 #include <linux/random.h>
  52 #include <linux/delay.h>
  53 #include <linux/byteorder/swabb.h>
  54 #include <linux/cpumask.h>
  55 #include <linux/rcupreempt_trace.h>
  56
  57 /*
  58  * Macro that prevents the compiler from reordering accesses, but does
  59  * absolutely -nothing- to prevent CPUs from reordering.  This is used
  60  * only to mediate communication between mainline code and hardware
  61  * interrupt and NMI handlers.
      *
      * The expansion dereferences a volatile-qualified pointer to x, so the
      * result is an lvalue: ACCESS_ONCE(x) may be read, assigned to, or
      * incremented in place.
  62  */
  63 #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
  64
  65 /*
  66  * PREEMPT_RCU data structures.
  67  */
  68
  69 /*
  70  * GP_STAGES specifies the number of times the state machine has
  71  * to go through all the rcu_try_flip_states (see below)
  72  * in a single Grace Period.
  73  *
  74  * GP in GP_STAGES stands for Grace Period ;)
  75  */
  76 #define GP_STAGES    2
     /*
      * Per-CPU RCU state: the callback lists for each stage of processing
      * and this CPU's pair of read-side counters.
      */
  77 struct rcu_data {
  78         spinlock_t      lock;           /* Protect rcu_data fields. */
  79         long            completed;      /* Number of last completed batch. */
  80         int             waitlistcount;  /* Non-empty waitlist[] stages. */
  81         struct tasklet_struct rcu_tasklet; /* NOTE(review): unused in the code visible here. */
  82         struct rcu_head *nextlist;      /* Callbacks queued by call_rcu(). */
  83         struct rcu_head **nexttail;     /* Tail pointer for nextlist. */
  84         struct rcu_head *waitlist[GP_STAGES];  /* Advanced by __rcu_advance_callbacks(). */
  85         struct rcu_head **waittail[GP_STAGES]; /* Tail pointers for waitlist[]. */
  86         struct rcu_head *donelist;      /* Invoked by rcu_process_callbacks(). */
  87         struct rcu_head **donetail;     /* Tail pointer for donelist. */
  88         long rcu_flipctr[2];            /* Reader counts, indexed by (completed & 1). */
  89 #ifdef CONFIG_RCU_TRACE
  90         struct rcupreempt_trace trace;
  91 #endif /* #ifdef CONFIG_RCU_TRACE */
  92 };
  93
  94 /*
  95  * States for rcu_try_flip() and friends.
  96  */
  97
     /*
      * rcu_try_flip() steps through these states in the order listed and
      * then wraps back to idle.
      * NOTE(review): rcu_try_flip_state_names[] lists its strings in this
      * same order -- presumably it is indexed by these values, so keep the
      * two in sync.
      */
  98 enum rcu_try_flip_states {
  99
 100         /*
 101          * Stay here if nothing is happening. Flip the counter if something
 102          * starts happening. Denoted by "I"
 103          */
 104         rcu_try_flip_idle_state,
 105
 106         /*
 107          * Wait here for all CPUs to notice that the counter has flipped. This
 108          * prevents the old set of counters from ever being incremented once
 109          * we leave this state, which in turn is necessary because we cannot
 110          * test any individual counter for zero -- we can only check the sum.
 111          * Denoted by "A".
 112          */
 113         rcu_try_flip_waitack_state,
 114
 115         /*
 116          * Wait here for the sum of the old per-CPU counters to reach zero.
 117          * Denoted by "Z".
 118          */
 119         rcu_try_flip_waitzero_state,
 120
 121         /*
 122          * Wait here for each of the other CPUs to execute a memory barrier.
 123          * This is necessary to ensure that these other CPUs really have
 124          * completed executing their RCU read-side critical sections, despite
 125          * their CPUs wildly reordering memory. Denoted by "M".
 126          */
 127         rcu_try_flip_waitmb_state,
 128 };
 129
     /*
      * Global grace-period state.  ->completed is incremented by
      * rcu_try_flip_idle() on every counter flip, and its low bit selects
      * the rcu_flipctr[] element that new readers increment.
      */
 130 struct rcu_ctrlblk {
 131         spinlock_t      fliplock;       /* Protect state-machine transitions. */
 132         long            completed;      /* Number of last completed batch. */
 133         enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
 134                                                         the rcu state machine */
 135 };
 136
     /* One rcu_data per CPU; see RCU_DATA_ME() and RCU_DATA_CPU(). */
 137 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
     /* The single control block: starts idle with no batches completed. */
 138 static struct rcu_ctrlblk rcu_ctrlblk = {
 139         .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 140         .completed = 0,
 141         .rcu_try_flip_state = rcu_try_flip_idle_state,
 142 };
 143
 144
 145 #ifdef CONFIG_RCU_TRACE
     /* Printable names for enum rcu_try_flip_states, listed in enum order. */
 146 static char *rcu_try_flip_state_names[] =
 147         { "idle", "waitack", "waitzero", "waitmb" };
 148 #endif /* #ifdef CONFIG_RCU_TRACE */
 149
     /*
      * CPUs taking part in grace-period detection.  In CONFIG_HOTPLUG_CPU
      * builds it is set by rcu_online_cpu() and cleared by rcu_offline_cpu(),
      * both under rcu_ctrlblk.fliplock.
      */
 150 static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
 151
 152 /*
 153  * Enum and per-CPU flag to determine when each CPU has seen
 154  * the most recent counter flip.
      *
      * The "only ... can update" notes below say who may move the flag
      * -out of- each state: rcu_try_flip_idle() sets rcu_flipped, and
      * __rcu_advance_callbacks() on the CPU concerned (or rcu_offline_cpu()
      * on behalf of a dead CPU) sets it back to rcu_flip_seen.
 155  */
 156
 157 enum rcu_flip_flag_values {
 158         rcu_flip_seen,          /* Steady/initial state, last flip seen. */
 159                                 /* Only GP detector can update. */
 160         rcu_flipped             /* Flip just completed, need confirmation. */
 161                                 /* Only corresponding CPU can update. */
 162 };
 163 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
 164                                                                 = rcu_flip_seen;
 165
 166 /*
 167  * Enum and per-CPU flag to determine when each CPU has executed the
 168  * needed memory barrier to fence in memory references from its last RCU
 169  * read-side critical section in the just-completed grace period.
      *
      * The "only ... can update" notes below say who may move the flag
      * -out of- each state: rcu_try_flip_waitzero() sets rcu_mb_needed,
      * and rcu_check_mb() sets it back to rcu_mb_done.
 170  */
 171
 172 enum rcu_mb_flag_values {
 173         rcu_mb_done,            /* Steady/initial state, no mb()s required. */
 174                                 /* Only GP detector can update. */
 175         rcu_mb_needed           /* Flip just completed, need an mb(). */
 176                                 /* Only corresponding CPU can update. */
 177 };
 178 static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
 179                                                                 = rcu_mb_done;
 180
 181 /*
 182  * RCU_DATA_ME: find the current CPU's rcu_data structure.
 183  * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
      *
      * Both yield a pointer.  RCU_DATA_ME() goes through __get_cpu_var(), so
      * its result only stays "the current CPU's" while the caller cannot be
      * migrated (NOTE(review): confirm each caller pins itself first).
 184  */
 185 #define RCU_DATA_ME()           (&__get_cpu_var(rcu_data))
 186 #define RCU_DATA_CPU(cpu)       (&per_cpu(rcu_data, cpu))
 187
 188 /*
 189  * Helper macro for tracing when only the CPU number is cached in a local
 190  * variable.  No trailing semicolon here or below: call sites supply it.
 191  */
 192 #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))
 193
 194 /*
 195  * Helper macro for tracing when the appropriate rcu_data is not
 196  * cached in a local variable.
 197  */
 198 #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace))
 199
 200 /*
 201  * Helper macro for tracing when the appropriate rcu_data is pointed
 202  * to by a local variable.
 203  */
 204 #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace))
 205
 206 /*
 207  * Report the number of RCU batches completed so far (debug/statistics).
 208  */
 209 long rcu_batches_completed(void)
 210 {
 211         long nbatches = rcu_ctrlblk.completed;
 212
 213         return nbatches;
 214 }
 215 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 216 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 217
     /*
      * Enter an RCU read-side critical section for the current task.
      *
      * A nested call only bumps current->rcu_read_lock_nesting.  The
      * outermost call also increments this CPU's rcu_flipctr[] element
      * selected by the low bit of rcu_ctrlblk.completed, and records that
      * index in current->rcu_flipctr_idx for the matching
      * __rcu_read_unlock().
      */
 218 void __rcu_read_lock(void)
 219 {
 220         int idx;
 221         struct task_struct *t = current;
 222         int nesting;
 223
 224         nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 225         if (nesting != 0) {
 226
 227                 /* An earlier rcu_read_lock() covers us, just count it. */
 228
 229                 t->rcu_read_lock_nesting = nesting + 1;
 230
 231         } else {
 232                 unsigned long flags;
 233
 234                 /*
 235                  * We disable interrupts for the following reasons:
 236                  * - If we get scheduling clock interrupt here, and we
 237                  *   end up acking the counter flip, it's like a promise
 238                  *   that we will never increment the old counter again.
 239                  *   Thus we will break that promise if that
 240                  *   scheduling clock interrupt happens between the time
 241                  *   we pick the .completed field and the time that we
 242                  *   increment our counter.
 243                  *
 244                  * - We don't want to be preempted out here.
 245                  *
 246                  * NMIs can still occur, of course, and might themselves
 247                  * contain rcu_read_lock().
 248                  */
 249
 250                 local_irq_save(flags);
 251
 252                 /*
 253                  * Outermost nesting of rcu_read_lock(), so increment
 254                  * the current counter for the current CPU.  Use volatile
 255                  * casts to prevent the compiler from reordering.
 256                  */
 257
 258                 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
 259                 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
 260
 261                 /*
 262                  * Now that the per-CPU counter has been incremented, we
 263                  * are protected from races with rcu_read_lock() invoked
 264                  * from NMI handlers on this CPU.  We can therefore safely
 265                  * increment the nesting counter, relieving further NMIs
 266                  * of the need to increment the per-CPU counter.
 267                  */
 268
 269                 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
 270
 271                 /*
 272                  * Now that we have prevented any NMIs from storing
 273                  * to the ->rcu_flipctr_idx, we can safely use it to
 274                  * remember which counter to decrement in the matching
 275                  * rcu_read_unlock().
 276                  */
 277
 278                 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
 279                 local_irq_restore(flags);
 280         }
 281 }
 282 EXPORT_SYMBOL_GPL(__rcu_read_lock);
 283
     /*
      * Leave an RCU read-side critical section for the current task.
      *
      * A nested call only decrements current->rcu_read_lock_nesting.  The
      * outermost call also decrements the rcu_flipctr[] element whose index
      * __rcu_read_lock() saved in current->rcu_flipctr_idx; the decrement is
      * applied to whichever CPU the task is running on at that point.
      */
 284 void __rcu_read_unlock(void)
 285 {
 286         int idx;
 287         struct task_struct *t = current;
 288         int nesting;
 289
 290         nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
 291         if (nesting > 1) {
 292
 293                 /*
 294                  * We are still protected by the enclosing rcu_read_lock(),
 295                  * so simply decrement the counter.
 296                  */
 297
 298                 t->rcu_read_lock_nesting = nesting - 1;
 299
 300         } else {
 301                 unsigned long flags;
 302
 303                 /*
 304                  * Disable local interrupts to prevent the grace-period
 305                  * detection state machine from seeing us half-done.
 306                  * NMIs can still occur, of course, and might themselves
 307                  * contain rcu_read_lock() and rcu_read_unlock().
 308                  */
 309
 310                 local_irq_save(flags);
 311
 312                 /*
 313                  * Outermost nesting of rcu_read_unlock(), so we must
 314                  * decrement the current counter for the current CPU.
 315                  * This must be done carefully, because NMIs can
 316                  * occur at any point in this code, and any rcu_read_lock()
 317                  * and rcu_read_unlock() pairs in the NMI handlers
 318                  * must interact non-destructively with this code.
 319                  * Lots of volatile casts, and -very- careful ordering.
 320                  *
 321                  * Changes to this code, including this one, must be
 322                  * inspected, validated, and tested extremely carefully!!!
 323                  */
 324
 325                 /*
 326                  * First, pick up the index.
 327                  */
 328
 329                 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
 330
 331                 /*
 332                  * Now that we have fetched the counter index, it is
 333                  * safe to decrement the per-task RCU nesting counter.
 334                  * After this, any interrupts or NMIs will increment and
 335                  * decrement the per-CPU counters.
 336                  */
 337                 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
 338
 339                 /*
 340                  * The task's nesting count has now been decremented.
 341                  * NMIs that occur after that store will route their
 342                  * rcu_read_lock() calls through the outermost ("else")
 343                  * path, and will thus increment the per-CPU counter on
 344                  * their own.  They will also clobber ->rcu_flipctr_idx,
 345                  * but that is OK, since we have already fetched it.
 346                  */
 347
 348                 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
 349                 local_irq_restore(flags);
 350         }
 351 }
 352 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 353
 354 /*
 355  * If a global counter flip has occurred since the last time that we
 356  * advanced callbacks, advance them.  Hardware interrupts must be
 357  * disabled when calling this function.
 358  */
 359 static void __rcu_advance_callbacks(struct rcu_data *rdp)
 360 {
 361         int cpu;
 362         int i;
 363         int wlc = 0;    /* Counts the wait stages left non-empty. */
 364
             /* A flip happened since the last advance: shift each list a stage. */
 365         if (rdp->completed != rcu_ctrlblk.completed) {
                     /* Append the last wait stage to the done list. */
 366                 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
 367                         *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
 368                         rdp->donetail = rdp->waittail[GP_STAGES - 1];
 369                         RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
 370                 }
                     /* Move the remaining stages up by one, last stage first. */
 371                 for (i = GP_STAGES - 2; i >= 0; i--) {
 372                         if (rdp->waitlist[i] != NULL) {
 373                                 rdp->waitlist[i + 1] = rdp->waitlist[i];
 374                                 rdp->waittail[i + 1] = rdp->waittail[i];
 375                                 wlc++;
 376                         } else {
 377                                 rdp->waitlist[i + 1] = NULL;
 378                                 rdp->waittail[i + 1] =
 379                                         &rdp->waitlist[i + 1];
 380                         }
 381                 }
                     /* Newly queued callbacks enter the first wait stage. */
 382                 if (rdp->nextlist != NULL) {
 383                         rdp->waitlist[0] = rdp->nextlist;
 384                         rdp->waittail[0] = rdp->nexttail;
 385                         wlc++;
 386                         rdp->nextlist = NULL;
 387                         rdp->nexttail = &rdp->nextlist;
 388                         RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
 389                 } else {
 390                         rdp->waitlist[0] = NULL;
 391                         rdp->waittail[0] = &rdp->waitlist[0];
 392                 }
 393                 rdp->waitlistcount = wlc;
 394                 rdp->completed = rcu_ctrlblk.completed;
 395         }
 396
 397         /*
 398          * Check to see if this CPU needs to report that it has seen
 399          * the most recent counter flip, thereby declaring that all
 400          * subsequent rcu_read_lock() invocations will respect this flip.
 401          */
 402
             /* Note: the flag checked is that of the running CPU, not of rdp. */
 403         cpu = raw_smp_processor_id();
 404         if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 405                 smp_mb();  /* Subsequent counter accesses must see new value */
 406                 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
 407                 smp_mb();  /* Subsequent RCU read-side critical sections */
 408                            /*  seen -after- acknowledgement. */
 409         }
 410 }
 411
 412 /*
 413  * Get here when RCU is idle.  Decide whether we need to
 414  * move out of idle state, and return non-zero if so.
 415  * "Straightforward" approach for the moment, might later
 416  * use callback-list lengths, grace-period duration, or
 417  * some such to determine when to exit idle state.
 418  * Might also need a pre-idle test that does not acquire
 419  * the lock, but let's get the simple case working first...
 420  */
 421
     /* Called from rcu_try_flip() with rcu_ctrlblk.fliplock held. */
 422 static int
 423 rcu_try_flip_idle(void)
 424 {
 425         int cpu;
 426
 427         RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
             /* Stay idle unless rcu_pending() reports work for this CPU. */
 428         if (!rcu_pending(smp_processor_id())) {
 429                 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
 430                 return 0;
 431         }
 432
 433         /*
 434          * Do the flip.
 435          */
 436
 437         RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
 438         rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
 439
 440         /*
 441          * Need a memory barrier so that other CPUs see the new
 442          * counter value before they see the subsequent change of all
 443          * the rcu_flip_flag instances to rcu_flipped.
 444          */
 445
 446         smp_mb();       /* see above block comment. */
 447
 448         /* Now ask each CPU for acknowledgement of the flip. */
 449
 450         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 451                 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 452
 453         return 1;
 454 }
 455
 456 /*
 457  * Return 1 once every CPU in rcu_cpu_online_map has acknowledged the
 458  * counter flip, and 0 while any acknowledgement is still outstanding.
 459  */
 460 static int
 461 rcu_try_flip_waitack(void)
 462 {
 463         int cpu;
 464
 465         RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 466         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 467                 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen)
 468                         goto still_waiting;
 469
 470         /*
 471          * Keep the flag checks above from bleeding into the later wait
 472          * for the sum of the counters to reach zero.
 473          */
 474         smp_mb();
 475         RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
 476         return 1;
 477 still_waiting:
 478         RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 479         return 0;
 480 }
 481
 482 /*
 483  * Wait for the collective ``last'' counter to reach zero, then tell all
 484  * CPUs to do an end-of-grace-period memory barrier.  Returns 1 once the
 485  * sum was zero and the barriers have been requested, 0 otherwise.
 486  * Called from rcu_try_flip() with rcu_ctrlblk.fliplock held.
 487  */
 488 static int
 489 rcu_try_flip_waitzero(void)
 490 {
 491         int cpu;
 492         int lastidx = !(rcu_ctrlblk.completed & 0x1);
 493         long sum = 0;   /* Same width as rcu_flipctr[], so no truncation. */
 494
 495         /* Check to see if the sum of the "last" counters is zero. */
 496         RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
 497         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 498                 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
 499         if (sum != 0) {
 500                 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
 501                 return 0;
 502         }
 503
 504         /*
 505          * This ensures that the other CPUs see the call for
 506          * memory barriers -after- the sum to zero has been
 507          * detected here.
 508          */
 509         smp_mb();
 510
 511         /* Call for a memory barrier from each CPU. */
 512         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 513                 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 514
 515         RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 516         return 1;
 517 }
 518
 519 /*
 520  * Return 1 once every CPU in rcu_cpu_online_map has done its
 521  * end-of-grace-period memory barrier, and 0 while any still owes one.
 522  */
 523 static int
 524 rcu_try_flip_waitmb(void)
 525 {
 526         int cpu;
 527
 528         RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 529         for_each_cpu_mask(cpu, rcu_cpu_online_map)
 530                 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done)
 531                         goto still_waiting;
 532         smp_mb(); /* Ensure that the above checks precede any following flip. */
 533         RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
 534         return 1;
 535
 536 still_waiting:
 537         RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 538         return 0;
 539 }
 540
 541 /*
 542  * Take at most one step through the grace-period state machine:
 543  * idle -> waitack -> waitzero -> waitmb -> idle.  A step is taken only
 544  * when the handler for the current state reports success.
 545  *
 546  * One flip is -not- a grace period; a grace period is the interval
 547  * spanned by at least GP_STAGES consecutive flips.
 548  *
 549  * If the flip lock is contended, give up right away rather than wait.
 550  *
 551  * Anyone running this CONFIG_PREEMPT_RCU implementation on a large SMP
 552  * might want a hierarchical organization of the per-CPU-counter pairs.
 553  */
 554 static void rcu_try_flip(void)
 555 {
 556         unsigned long flags;
 557         enum rcu_try_flip_states *statep = &rcu_ctrlblk.rcu_try_flip_state;
 558
 559         RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
 560         /* State transitions are serialized by the flip lock. */
 561         if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
 562                 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
 563                 return;
 564         }
 565
 566         /* Run the handler for the current state; advance if it succeeds. */
 567         switch (*statep) {
 568         case rcu_try_flip_idle_state:
 569                 if (rcu_try_flip_idle())
 570                         *statep = rcu_try_flip_waitack_state;
 571                 break;
 572         case rcu_try_flip_waitack_state:
 573                 if (rcu_try_flip_waitack())
 574                         *statep = rcu_try_flip_waitzero_state;
 575                 break;
 576         case rcu_try_flip_waitzero_state:
 577                 if (rcu_try_flip_waitzero())
 578                         *statep = rcu_try_flip_waitmb_state;
 579                 break;
 580         case rcu_try_flip_waitmb_state:
 581                 if (rcu_try_flip_waitmb())
 582                         *statep = rcu_try_flip_idle_state;
 583                 break;
 584         }
 585
 586         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 587 }
 588
 589 /*
 590  * If the grace-period machinery has asked CPU @cpu for a memory barrier,
 591  * execute one and acknowledge it.  The barrier ensures that prior RCU
 592  * read-side critical sections have committed their counter manipulations
 593  * and memory references before the grace period is declared complete.
 594  */
 595 static void rcu_check_mb(int cpu)
 596 {
 597         if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_needed)
 598                 return;
 599         smp_mb();  /* Ensure RCU read-side accesses are visible. */
 600         per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
 601 }
 602
 603 /* RCU housekeeping for @cpu: mb ack, flip attempt, callback advance. */
 604 void rcu_check_callbacks(int cpu, int user)
 605 {
 606         unsigned long flags;
 607         int have_done;
 608         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 609
 610         rcu_check_mb(cpu);
 611         if (rdp->completed == rcu_ctrlblk.completed)
 612                 rcu_try_flip();
 613         spin_lock_irqsave(&rdp->lock, flags);
 614         RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 615         __rcu_advance_callbacks(rdp);
 616         have_done = (rdp->donelist != NULL);
 617         spin_unlock_irqrestore(&rdp->lock, flags);
 618         if (have_done)  /* Callbacks are ready; @user is unused here. */
 619                 raise_softirq(RCU_SOFTIRQ);
 620 }
 621
 622 /*
 623  * For dynticks: finish RCU processing for @cpu before it goes idle.
 624  */
 625 void rcu_advance_callbacks(int cpu, int user)
 626 {
 627         unsigned long flags;
 628         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 629         int lagging = (rdp->completed != rcu_ctrlblk.completed);
 630         if (!lagging) { /* Caught up: try a flip; @user is unused. */
 631                 rcu_try_flip();
 632                 lagging = (rdp->completed != rcu_ctrlblk.completed);
 633         }
 634         if (!lagging)
 635                 return;
 636         spin_lock_irqsave(&rdp->lock, flags);
 637         RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
 638         __rcu_advance_callbacks(rdp);
 639         spin_unlock_irqrestore(&rdp->lock, flags);
 640 }
 641
 642 #ifdef CONFIG_HOTPLUG_CPU
     /*
      * Append the list (srclist, srctail) to the list ending at dsttail and
      * leave the source list empty.  When srclist is NULL this just
      * NULL-terminates the destination.  All four arguments are lvalues
      * that the macro assigns to, and each is evaluated more than once.
      */
 643 #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
 644                 *dsttail = srclist; \
 645                 if (srclist != NULL) { \
 646                         dsttail = srctail; \
 647                         srclist = NULL; \
 648                         srctail = &srclist;\
 649                 } \
 650         } while (0)
 651
 652 /*
 653  * Detach dead CPU @cpu from RCU: collect its callbacks in order, clear
 654  * its pending flip/mb acknowledgements, fold its rcu_flipctr[] counts
 655  * into the current CPU, and requeue the callbacks on the current CPU.
 656  */
 657 void rcu_offline_cpu(int cpu)
 658 {
 659         int i;
 660         struct rcu_head *list = NULL;
 661         unsigned long flags;
 662         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 663         struct rcu_head **tail = &list;
 664
 665         /*
 666          * Remove all callbacks from the newly dead CPU, retaining order.
 667          * Otherwise rcu_barrier() will fail
 668          */
 669         spin_lock_irqsave(&rdp->lock, flags);
 670         rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
 671         for (i = GP_STAGES - 1; i >= 0; i--)
 672                 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
 673                                                 list, tail);
 674         rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
 675         rdp->waitlistcount = 0;   /* Reset while still holding ->lock. */
 676         spin_unlock_irqrestore(&rdp->lock, flags);
 677
 678         /* Disengage the newly dead CPU from the grace-period computation. */
 679         spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 680         rcu_check_mb(cpu);
 681         if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
 682                 smp_mb();  /* Subsequent counter accesses must see new value */
 683                 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
 684                 smp_mb();  /* Subsequent RCU read-side critical sections */
 685                            /*  seen -after- acknowledgement. */
 686         }
 687
 688         RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
 689         RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
 690         RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
 691         RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
 692         cpu_clear(cpu, rcu_cpu_online_map);
 693         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 694
 695         /*
 696          * Place the removed callbacks on the current CPU's queue.
 697          * Make them all start a new grace period: simple approach,
 698          * in theory could starve a given set of callbacks, but
 699          * you would need to be doing some serious CPU hotplugging
 700          * to make this happen.  If this becomes a problem, adding
 701          * a synchronize_rcu() to the hotplug path would be a simple
 702          * fix.
 703          */
 704         local_irq_save(flags);  /* No migration until rdp is locked. */
 705         rdp = RCU_DATA_ME();
 706         spin_lock(&rdp->lock);
 707         *rdp->nexttail = list;
 708         if (list)
 709                 rdp->nexttail = tail;
 710         spin_unlock_irqrestore(&rdp->lock, flags);
 711 }
 712
     /*
      * Add @cpu to rcu_cpu_online_map so that the grace-period state machine
      * starts including it.  The map is updated under rcu_ctrlblk.fliplock.
      */
 713 void __devinit rcu_online_cpu(int cpu)
 714 {
 715         unsigned long flags;
 716
 717         spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
 718         cpu_set(cpu, rcu_cpu_online_map);
 719         spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
 720 }
 721
 722 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 723
     /* Without CONFIG_HOTPLUG_CPU, both hotplug hooks are empty stubs. */
 724 void rcu_offline_cpu(int cpu)
 725 {
 726 }
 727
 728 void __devinit rcu_online_cpu(int cpu)
 729 {
 730 }
 731
 732 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 733
 734 /* Detach this CPU's donelist and invoke every callback on it. */
 735 static void rcu_process_callbacks(struct softirq_action *unused)
 736 {
 737         unsigned long flags;
 738         struct rcu_head *cb, *next;
 739         struct rcu_data *rdp = RCU_DATA_ME();
 740
 741         spin_lock_irqsave(&rdp->lock, flags);
 742         cb = rdp->donelist;
 743         if (cb != NULL) {
 744                 /* Leave an empty list behind for later batches. */
 745                 rdp->donelist = NULL;
 746                 rdp->donetail = &rdp->donelist;
 747                 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
 748         }
 749         spin_unlock_irqrestore(&rdp->lock, flags);
 750         /* ->next is read before func() runs, so func() may reuse cb. */
 751         for (; cb != NULL; cb = next) {
 752                 next = cb->next;
 753                 cb->func(cb);
 754                 RCU_TRACE_ME(rcupreempt_trace_invoke);
 755         }
 756 }
 757
     /*
      * Queue @head on the current CPU's nextlist, with @func recorded as the
      * function that rcu_process_callbacks() will later invoke on it once
      * the callback has been advanced to the done list.
      *
      * Interrupts are disabled before RCU_DATA_ME() is evaluated, so the
      * rcu_data that gets locked belongs to the CPU this code is running on.
      */
 758 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 759 {
 760         unsigned long flags;
 761         struct rcu_data *rdp;
 762
 763         head->func = func;
 764         head->next = NULL;
 765         local_irq_save(flags);
 766         rdp = RCU_DATA_ME();
 767         spin_lock(&rdp->lock);
             /* Catch up with any flip first, so @head joins the newest batch. */
 768         __rcu_advance_callbacks(rdp);
 769         *rdp->nexttail = head;
 770         rdp->nexttail = &head->next;
 771         RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
 772         spin_unlock(&rdp->lock);
 773         local_irq_restore(flags);
 774 }
 775 EXPORT_SYMBOL_GPL(call_rcu);
 776
 777 /*
 778  * Wait until all currently running preempt_disable() code segments
 779  * (including hardware-irq-disable segments) complete.  Note that
 780  * in -rt this does -not- necessarily result in all currently executing
 781  * interrupt -handlers- having completed.
 782  */
 783 void __synchronize_sched(void)
 784 {
 785         cpumask_t oldmask;
 786         int cpu;
 787
             /* Remember our affinity; fall back to "any CPU" if unavailable. */
 788         if (sched_getaffinity(0, &oldmask) < 0)
 789                 oldmask = cpu_possible_map;
             /*
              * Bind to each online CPU in turn and call schedule() there.
              * NOTE(review): sched_setaffinity() return values are ignored.
              */
 790         for_each_online_cpu(cpu) {
 791                 sched_setaffinity(0, cpumask_of_cpu(cpu));
 792                 schedule();
 793         }
             /* Put the original affinity mask back. */
 794         sched_setaffinity(0, oldmask);
 795 }
 796 EXPORT_SYMBOL_GPL(__synchronize_sched);
 797
 798 /*
 799  * Report whether the given CPU still has RCU callbacks queued on any of
 800  * its lists, meaning that RCU-related work remains for it even if none
 801  * is needed right now.  Returns 1 if so, 0 otherwise.  Assumes that
 802  * notifiers take care of outstanding requests from the RCU core.
 803  * Internal to the RCU implementation; -not- part of the exported API.
 804  */
 805 int rcu_needs_cpu(int cpu)
 806 {
 807         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 808
 809         if (rdp->donelist != NULL || rdp->waitlistcount != 0)
 810                 return 1;
 811         if (rdp->nextlist != NULL)
 812                 return 1;
 813         return 0;
 814 }
 815
 816 /*
 817  * Return 1 if RCU has anything for the given CPU to do, 0 otherwise.
 818  * Work exists when the CPU has callbacks queued, owes the grace-period
 819  * machinery an acknowledgement, or has not yet caught up with the global
 820  * batch number.
 821  */
 822 int rcu_pending(int cpu)
 823 {
 824         struct rcu_data *rdp = RCU_DATA_CPU(cpu);
 825         int busy;
 826
 827         /* The CPU has at least one callback queued somewhere. */
 828         busy = rdp->donelist != NULL ||
 829                rdp->waitlistcount != 0 ||
 830                rdp->nextlist != NULL;
 831
 832         /* The RCU core needs an acknowledgement from this CPU. */
 833         busy = busy ||
 834                per_cpu(rcu_flip_flag, cpu) == rcu_flipped ||
 835                per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed;
 836
 837         /* This CPU has fallen behind the global grace-period number. */
 838         busy = busy || rdp->completed != rcu_ctrlblk.completed;
 839
 840         return busy;
 841 }
 842
 843 /*
 844  * CPU notifier callback: calls rcu_online_cpu() for a CPU that is
 845  * being prepared, and rcu_offline_cpu() for one that is dead or whose
 846  * bring-up was canceled.  Other actions are ignored.  Returns NOTIFY_OK.
 847  */
 848 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 849                                 unsigned long action, void *hcpu)
 850 {
 851         long cpu = (long)hcpu;
 852
 853         if (action == CPU_UP_PREPARE ||
 854             action == CPU_UP_PREPARE_FROZEN) {
 855                 rcu_online_cpu(cpu);
 856         } else if (action == CPU_UP_CANCELED ||
 857                    action == CPU_UP_CANCELED_FROZEN ||
 858                    action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 859                 rcu_offline_cpu(cpu);
 860         }
 861
 862         return NOTIFY_OK;
 863 }
 864
 865 static struct notifier_block __cpuinitdata rcu_nb = {
 866         .notifier_call = rcu_cpu_notify,        /* registered by __rcu_init() */
 867 };
 868
 869 /*
 870  * Boot-time setup for preemptible RCU: reset the per-CPU data of every
 871  * possible CPU, register the CPU notifier, run that notifier for the
 872  * CPUs already online, and open RCU_SOFTIRQ with rcu_process_callbacks.
 873  */
 874 void __init __rcu_init(void)
 875 {
 876         struct rcu_data *data;
 877         int stage;
 878         int cpu;
 879
 880         printk(KERN_NOTICE "Preemptible RCU implementation.\n");
 881         for_each_possible_cpu(cpu) {
 882                 data = RCU_DATA_CPU(cpu);
 883                 spin_lock_init(&data->lock);
 884                 data->completed = 0;
 885                 data->waitlistcount = 0;
 886                 data->nextlist = NULL;
 887                 data->nexttail = &data->nextlist;
 888                 for (stage = 0; stage < GP_STAGES; stage++) {
 889                         data->waitlist[stage] = NULL;
 890                         data->waittail[stage] = &data->waitlist[stage];
 891                 }
 892                 data->donelist = NULL;
 893                 data->donetail = &data->donelist;
 894                 data->rcu_flipctr[0] = 0;
 895                 data->rcu_flipctr[1] = 0;
 896         }
 897         register_cpu_notifier(&rcu_nb);
 898
 899         /*
 900          * No CPU-hotplug protection is needed for the loop below:
 901          * a) a CPU coming online meanwhile merely gets rcu_online_cpu()
 902          *    called twice, which sets its bit in rcu_cpu_online_map;
 903          * b) no CPU can go offline yet, because the user has no access
 904          *    to the sysfs interface and the system is not suspended.
 905          */
 906         for_each_online_cpu(cpu)
 907                 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)cpu);
 908
 909         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
 910 }
 911
 912 /*
 913  * Deprecated wrapper for synchronize_rcu(); use that or synchronize_sched().
 914  */
 915 void synchronize_kernel(void)
 916 {
 917         synchronize_rcu();
 918 }
 919
 920 #ifdef CONFIG_RCU_TRACE
 921 long *rcupreempt_flipctr(int cpu)       /* only built with CONFIG_RCU_TRACE */
 922 {
 923         return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; /* address of element 0 */
 924 }
 925 EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
 926
 927 int rcupreempt_flip_flag(int cpu)       /* only built with CONFIG_RCU_TRACE */
 928 {
 929         return per_cpu(rcu_flip_flag, cpu);     /* that CPU's rcu_flip_flag */
 930 }
 931 EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
 932
 933 int rcupreempt_mb_flag(int cpu)         /* only built with CONFIG_RCU_TRACE */
 934 {
 935         return per_cpu(rcu_mb_flag, cpu);       /* that CPU's rcu_mb_flag */
 936 }
 937 EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
 938
 939 char *rcupreempt_try_flip_state_name(void) /* only built with CONFIG_RCU_TRACE */
 940 {
 941         return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; /* name table lookup */
 942 }
 943 EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
 944
 945 /* Trace accessor for the given CPU; only built with CONFIG_RCU_TRACE. */
 946 struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
 947 {
 948         /* The trace record is a member of the per-CPU rcu_data. */
 949         return &RCU_DATA_CPU(cpu)->trace;
 950 }
 951 EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
 952
 953 #endif /* #ifdef CONFIG_RCU_TRACE */