kernel/rcutree.c

   1 /*
   2  * Read-Copy Update mechanism for mutual exclusion
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17  *
  18  * Copyright IBM Corporation, 2008
  19  *
  20  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  21  *          Manfred Spraul <manfred@colorfullife.com>
  22  *          Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
  23  *
  24  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  25  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  26  *
  27  * For detailed explanation of Read-Copy Update mechanism see -
  28  *      Documentation/RCU
  29  */
  30 #include <linux/types.h>
  31 #include <linux/kernel.h>
  32 #include <linux/init.h>
  33 #include <linux/spinlock.h>
  34 #include <linux/smp.h>
  35 #include <linux/rcupdate.h>
  36 #include <linux/interrupt.h>
  37 #include <linux/sched.h>
  38 #include <linux/nmi.h>
  39 #include <linux/atomic.h>
  40 #include <linux/bitops.h>
  41 #include <linux/export.h>
  42 #include <linux/completion.h>
  43 #include <linux/moduleparam.h>
  44 #include <linux/percpu.h>
  45 #include <linux/notifier.h>
  46 #include <linux/cpu.h>
  47 #include <linux/mutex.h>
  48 #include <linux/time.h>
  49 #include <linux/kernel_stat.h>
  50 #include <linux/wait.h>
  51 #include <linux/kthread.h>
  52 #include <linux/prefetch.h>
  53 #include <linux/delay.h>
  54 #include <linux/stop_machine.h>
  55 #include <linux/random.h>
  56
  57 #include "rcutree.h"
  58 #include <trace/events/rcu.h>
  59
  60 #include "rcu.h"
  61
/* Data structures. */

/*
 * Lock-class keys, one array element per level (RCU_NUM_LVLS entries each).
 * NOTE(review): presumably these give the rcu_node locks and the
 * force-quiescent-state locks of each level their own lockdep class; the
 * code that registers them is not in this part of the file -- confirm.
 */
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
  66
/*
 * Static initializer for an rcu_state structure.
 *
 * @sname: flavor name.  The variable being initialized must be called
 *         sname##_state, because the initializer takes the addresses of
 *         that variable's own fields; #sname also becomes ->name.
 * @sabbr: value stored in ->abbr.
 * @cr:    value stored in ->call.
 *
 * ->gpnum and ->completed both start at 0UL - 300UL, i.e. 300 counts
 * short of unsigned wraparound, and are equal, so no grace period is in
 * progress initially (see rcu_gp_in_progress()).
 * NOTE(review): presumably the near-wrap start value is there to exercise
 * counter wraparound soon after boot -- confirm.
 */
#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
        .level = { &sname##_state.node[0] }, \
        .call = cr, \
        .fqs_state = RCU_GP_IDLE, \
        .gpnum = 0UL - 300UL, \
        .completed = 0UL - 300UL, \
        .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
        .orphan_nxttail = &sname##_state.orphan_nxtlist, \
        .orphan_donetail = &sname##_state.orphan_donelist, \
        .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
        .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
        .name = #sname, \
        .abbr = sabbr, \
}
  81
/* RCU-sched flavor: global state plus one rcu_data structure per CPU. */
struct rcu_state rcu_sched_state =
        RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);

/* RCU-bh flavor: global state plus one rcu_data structure per CPU. */
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);

/*
 * NOTE(review): rcu_state is only declared here and is not assigned in
 * this part of the file; presumably it is pointed at the default flavor
 * elsewhere.  rcu_struct_flavors is an (initially empty) list head,
 * presumably linking every registered rcu_state -- confirm.
 */
static struct rcu_state *rcu_state;
LIST_HEAD(rcu_struct_flavors);
  91
/*
 * Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time.
 * The parameter is registered with mode 0444, so it is exposed read-only.
 */
static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
/* Number of levels in the rcu_node tree; starts at the build-time value. */
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
        NUM_RCU_LVL_0,
        NUM_RCU_LVL_1,
        NUM_RCU_LVL_2,
        NUM_RCU_LVL_3,
        NUM_RCU_LVL_4,
};
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 104
/*
 * The rcu_scheduler_active variable transitions from zero to one just
 * before the first task is spawned.  So when this variable is zero, RCU
 * can assume that there is but one task, allowing RCU to (for example)
 * optimize synchronize_sched() to a simple barrier().  When this variable
 * is one, RCU must actually do all the hard work required to detect real
 * grace periods.  This variable is also used to suppress boot-time false
 * positives from lockdep-RCU error checking.
 *
 * The variable is exported (GPL-only) so that code outside this file can
 * read it; the store that sets it to one is not in this part of the file.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 116
/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks.  So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one.  We also
 * currently delay invocation of any RCU callbacks until after this point.
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 *
 * rcu_lockdep_current_cpu_online() also reads this variable: while it is
 * still zero, every CPU is reported as online.
 */
static int rcu_scheduler_fully_active __read_mostly;
 130
#ifdef CONFIG_RCU_BOOST

/*
 * Control variables for per-CPU and per-rcu_node kthreads.  These
 * handle all flavors of RCU.
 *
 * Only the task pointer is file-local; the status, loop-count and
 * has-work variables are non-static per-CPU variables.
 * NOTE(review): their exact meaning is set by the kthread code, which is
 * not in this part of the file -- going by the names only, they hold the
 * kthread's task_struct, its run state, a loop counter and a "work
 * pending" flag.  Confirm before relying on this.
 */
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);

#endif /* #ifdef CONFIG_RCU_BOOST */
 143
/*
 * Forward declarations of file-local helpers.  They are declared outside
 * the CONFIG_RCU_BOOST block above, so they exist in every configuration;
 * their definitions are not in this part of the file.
 */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 147
/*
 * Track the rcutorture test sequence number and the update version
 * number within a given test.  The rcutorture_testseq is incremented
 * on every rcutorture module load and unload, so has an odd value
 * when a test is running.  The rcutorture_vernum is set to zero
 * when rcutorture starts and is incremented on each rcutorture update.
 * These variables enable correlating rcutorture output with the
 * RCU tracing information.
 *
 * Writers in this file: rcutorture_record_test_transition() bumps
 * rcutorture_testseq and zeroes rcutorture_vernum, and
 * rcutorture_record_progress() bumps rcutorture_vernum.  Neither takes
 * a lock.
 */
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;
 159
 160 /*
 161  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
 162  * permit this function to be invoked without holding the root rcu_node
 163  * structure's ->lock, but of course results can be subject to change.
 164  */
 165 static int rcu_gp_in_progress(struct rcu_state *rsp)
 166 {
 167         return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
 168 }
 169
 170 /*
 171  * Note a quiescent state.  Because we do not need to know
 172  * how many quiescent states passed, just if there was at least
 173  * one since the start of the grace period, this just sets a flag.
 174  * The caller must have disabled preemption.
 175  */
 176 void rcu_sched_qs(int cpu)
 177 {
 178         struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 179
 180         if (rdp->passed_quiesce == 0)
 181                 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
 182         rdp->passed_quiesce = 1;
 183 }
 184
 185 void rcu_bh_qs(int cpu)
 186 {
 187         struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 188
 189         if (rdp->passed_quiesce == 0)
 190                 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
 191         rdp->passed_quiesce = 1;
 192 }
 193
/*
 * Note a context switch.  This is a quiescent state for RCU-sched,
 * and requires special handling for preemptible RCU.
 * The caller must have disabled preemption.
 *
 * @cpu: CPU doing the context switch; handed to both rcu_sched_qs() and
 *       rcu_preempt_note_context_switch().
 *
 * The two trace_rcu_utilization() events bracket the work so that its
 * duration shows up in traces.
 */
void rcu_note_context_switch(int cpu)
{
        trace_rcu_utilization("Start context switch");
        rcu_sched_qs(cpu);
        rcu_preempt_note_context_switch(cpu);
        trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 207
/*
 * Per-CPU dyntick-idle state.  Each CPU starts with ->dynticks equal to
 * 1, an odd value, which rcu_is_cpu_idle() treats as "not idle", and with
 * ->dynticks_nesting equal to DYNTICK_TASK_EXIT_IDLE, the same value that
 * rcu_eqs_exit() installs when leaving an extended quiescent state.
 */
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
        .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
        .dynticks = ATOMIC_INIT(1),
};
 212
/*
 * Callback-batching limits.  All three are registered as module
 * parameters with mode 0444, so they are exposed read-only.
 */
static long blimit = 10;        /* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000;    /* If this many pending, ignore blimit. */
static long qlowmark = 100;     /* Once only this many pending, use blimit. */

module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);

/*
 * Force-quiescent-state timing, in jiffies.  Both default to
 * RCU_JIFFIES_TILL_FORCE_QS and, unlike the limits above, are registered
 * with mode 0644 (writable by the owner).
 * NOTE(review): going by the names, the first is the delay before the
 * first forcing attempt of a grace period and the second the delay
 * between later attempts; the code that consumes them is not in this
 * part of the file -- confirm.
 */
static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;

module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
 226
/*
 * Forward declarations of file-local functions.  force_quiescent_state()
 * is called by the wrappers below before its definition appears; the
 * definitions themselves are not in this part of the file.
 */
static void rcu_start_gp(struct rcu_state *rsp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
 231
/*
 * Return the number of RCU-sched batches processed thus far for debug & stats.
 *
 * This is an unlocked read of rcu_sched_state.completed, returned as a
 * long.  The counter starts at 0UL - 300UL (see RCU_STATE_INITIALIZER),
 * so early values are negative when viewed as a long; only differences
 * between two samples are meaningful.
 */
long rcu_batches_completed_sched(void)
{
        return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
 240
/*
 * Return the number of RCU BH batches processed thus far for debug & stats.
 *
 * This is an unlocked read of rcu_bh_state.completed, returned as a long.
 * The counter starts at 0UL - 300UL (see RCU_STATE_INITIALIZER), so only
 * differences between two samples are meaningful.
 */
long rcu_batches_completed_bh(void)
{
        return rcu_bh_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 249
/*
 * Force a quiescent state for RCU BH.
 *
 * Exported wrapper that applies the file-local force_quiescent_state()
 * to the RCU-bh flavor's state structure.
 */
void rcu_bh_force_quiescent_state(void)
{
        force_quiescent_state(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
 258
/*
 * Record the number of times rcutorture tests have been initiated and
 * terminated.  This information allows the debugfs tracing stats to be
 * correlated to the rcutorture messages, even when the rcutorture module
 * is being repeatedly loaded and unloaded.  In other words, we cannot
 * store this state in rcutorture itself.
 *
 * Each call advances rcutorture_testseq by one and restarts
 * rcutorture_vernum from zero.  No locking is done here.
 */
void rcutorture_record_test_transition(void)
{
        rcutorture_testseq++;
        rcutorture_vernum = 0;
}
EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
 272
/*
 * Record the number of writer passes through the current rcutorture test.
 * This is also used to correlate debugfs tracing stats with the rcutorture
 * messages.
 *
 * @vernum: not used -- the global counter is simply incremented by one
 *          on every call, whatever value the caller passes.
 */
void rcutorture_record_progress(unsigned long vernum)
{
        rcutorture_vernum++;
}
EXPORT_SYMBOL_GPL(rcutorture_record_progress);
 283
/*
 * Force a quiescent state for RCU-sched.
 *
 * Exported wrapper that applies the file-local force_quiescent_state()
 * to the RCU-sched flavor's state structure.
 */
void rcu_sched_force_quiescent_state(void)
{
        force_quiescent_state(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
 292
 293 /*
 294  * Does the CPU have callbacks ready to be invoked?
 295  */
 296 static int
 297 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 298 {
 299         return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
 300                rdp->nxttail[RCU_DONE_TAIL] != NULL;
 301 }
 302
 303 /*
 304  * Does the current CPU require a not-yet-started grace period?
 305  * The caller must have disabled interrupts to prevent races with
 306  * normal callback registry.
 307  */
 308 static int
 309 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 310 {
 311         int i;
 312
 313         if (rcu_gp_in_progress(rsp))
 314                 return 0;  /* No, a grace period is already in progress. */
 315         if (rcu_nocb_needs_gp(rsp))
 316                 return 1;  /* Yes, a no-CBs CPU needs one. */
 317         if (!rdp->nxttail[RCU_NEXT_TAIL])
 318                 return 0;  /* No, this is a no-CBs (or offline) CPU. */
 319         if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
 320                 return 1;  /* Yes, this CPU has newly registered callbacks. */
 321         for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
 322                 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
 323                     ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
 324                                  rdp->nxtcompleted[i]))
 325                         return 1;  /* Yes, CBs for future grace period. */
 326         return 0; /* No grace period needed. */
 327 }
 328
 329 /*
 330  * Return the root node of the specified rcu_state structure.
 331  */
 332 static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 333 {
 334         return &rsp->node[0];
 335 }
 336
/*
 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
 *
 * Do the accounting for entry into an extended quiescent state: trace
 * the transition, let rcu_prepare_for_idle() run, and increment
 * ->dynticks (which must end up even) between full memory barriers.
 * The caller must have disabled interrupts and must already have
 * updated ->dynticks_nesting.
 *
 * @rdtp:   this CPU's rcu_dynticks structure.
 * @oldval: ->dynticks_nesting as it was before the caller's update; used
 *          only for tracing.
 * @user:   when false, the current task is expected to be the idle task
 *          and a one-time warning (plus ftrace dump) is issued if it is
 *          not; when true that check is skipped.
 */
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
                                bool user)
{
        trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
        if (!user && !is_idle_task(current)) {
                struct task_struct *idle = idle_task(smp_processor_id());

                trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
                ftrace_dump(DUMP_ORIG);
                WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
                          current->pid, current->comm,
                          idle->pid, idle->comm); /* must be idle task! */
        }
        rcu_prepare_for_idle(smp_processor_id());
        /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
        smp_mb__before_atomic_inc();  /* See above. */
        atomic_inc(&rdtp->dynticks);
        smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
        /* An even ->dynticks value is what marks the CPU as idle. */
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);

        /*
         * It is illegal to enter an extended quiescent state while
         * in an RCU read-side critical section.
         */
        rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
                           "Illegal idle entry in RCU read-side critical section.");
        rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
                           "Illegal idle entry in RCU-bh read-side critical section.");
        rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
                           "Illegal idle entry in RCU-sched read-side critical section.");
}
 375
 376 /*
 377  * Enter an RCU extended quiescent state, which can be either the
 378  * idle loop or adaptive-tickless usermode execution.
 379  */
 380 static void rcu_eqs_enter(bool user)
 381 {
 382         long long oldval;
 383         struct rcu_dynticks *rdtp;
 384
 385         rdtp = &__get_cpu_var(rcu_dynticks);
 386         oldval = rdtp->dynticks_nesting;
 387         WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
 388         if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
 389                 rdtp->dynticks_nesting = 0;
 390         else
 391                 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
 392         rcu_eqs_enter_common(rdtp, oldval, user);
 393 }
 394
 395 /**
 396  * rcu_idle_enter - inform RCU that current CPU is entering idle
 397  *
 398  * Enter idle mode, in other words, -leave- the mode in which RCU
 399  * read-side critical sections can occur.  (Though RCU read-side
 400  * critical sections can occur in irq handlers in idle, a possibility
 401  * handled by irq_enter() and irq_exit().)
 402  *
 403  * We crowbar the ->dynticks_nesting field to zero to allow for
 404  * the possibility of usermode upcalls having messed up our count
 405  * of interrupt nesting level during the prior busy period.
 406  */
 407 void rcu_idle_enter(void)
 408 {
 409         unsigned long flags;
 410
 411         local_irq_save(flags);
 412         rcu_eqs_enter(false);
 413         local_irq_restore(flags);
 414 }
 415 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 416
 417 #ifdef CONFIG_RCU_USER_QS
 418 /**
 419  * rcu_user_enter - inform RCU that we are resuming userspace.
 420  *
 421  * Enter RCU idle mode right before resuming userspace.  No use of RCU
 422  * is permitted between this call and rcu_user_exit(). This way the
 423  * CPU doesn't need to maintain the tick for RCU maintenance purposes
 424  * when the CPU runs in userspace.
 425  */
 426 void rcu_user_enter(void)
 427 {
 428         rcu_eqs_enter(1);
 429 }
 430
 431 /**
 432  * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
 433  * after the current irq returns.
 434  *
 435  * This is similar to rcu_user_enter() but in the context of a non-nesting
 436  * irq. After this call, RCU enters into idle mode when the interrupt
 437  * returns.
 438  */
 439 void rcu_user_enter_after_irq(void)
 440 {
 441         unsigned long flags;
 442         struct rcu_dynticks *rdtp;
 443
 444         local_irq_save(flags);
 445         rdtp = &__get_cpu_var(rcu_dynticks);
 446         /* Ensure this irq is interrupting a non-idle RCU state.  */
 447         WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
 448         rdtp->dynticks_nesting = 1;
 449         local_irq_restore(flags);
 450 }
 451 #endif /* CONFIG_RCU_USER_QS */
 452
 453 /**
 454  * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
 455  *
 456  * Exit from an interrupt handler, which might possibly result in entering
 457  * idle mode, in other words, leaving the mode in which read-side critical
 458  * sections can occur.
 459  *
 460  * This code assumes that the idle loop never does anything that might
 461  * result in unbalanced calls to irq_enter() and irq_exit().  If your
 462  * architecture violates this assumption, RCU will give you what you
 463  * deserve, good and hard.  But very infrequently and irreproducibly.
 464  *
 465  * Use things like work queues to work around this limitation.
 466  *
 467  * You have been warned.
 468  */
 469 void rcu_irq_exit(void)
 470 {
 471         unsigned long flags;
 472         long long oldval;
 473         struct rcu_dynticks *rdtp;
 474
 475         local_irq_save(flags);
 476         rdtp = &__get_cpu_var(rcu_dynticks);
 477         oldval = rdtp->dynticks_nesting;
 478         rdtp->dynticks_nesting--;
 479         WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
 480         if (rdtp->dynticks_nesting)
 481                 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
 482         else
 483                 rcu_eqs_enter_common(rdtp, oldval, true);
 484         local_irq_restore(flags);
 485 }
 486
 487 /*
 488  * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
 489  *
 490  * If the new value of the ->dynticks_nesting counter was previously zero,
 491  * we really have exited idle, and must do the appropriate accounting.
 492  * The caller must have disabled interrupts.
 493  */
 494 static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
 495                                int user)
 496 {
 497         smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
 498         atomic_inc(&rdtp->dynticks);
 499         /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
 500         smp_mb__after_atomic_inc();  /* See above. */
 501         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 502         rcu_cleanup_after_idle(smp_processor_id());
 503         trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
 504         if (!user && !is_idle_task(current)) {
 505                 struct task_struct *idle = idle_task(smp_processor_id());
 506
 507                 trace_rcu_dyntick("Error on exit: not idle task",
 508                                   oldval, rdtp->dynticks_nesting);
 509                 ftrace_dump(DUMP_ORIG);
 510                 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 511                           current->pid, current->comm,
 512                           idle->pid, idle->comm); /* must be idle task! */
 513         }
 514 }
 515
 516 /*
 517  * Exit an RCU extended quiescent state, which can be either the
 518  * idle loop or adaptive-tickless usermode execution.
 519  */
 520 static void rcu_eqs_exit(bool user)
 521 {
 522         struct rcu_dynticks *rdtp;
 523         long long oldval;
 524
 525         rdtp = &__get_cpu_var(rcu_dynticks);
 526         oldval = rdtp->dynticks_nesting;
 527         WARN_ON_ONCE(oldval < 0);
 528         if (oldval & DYNTICK_TASK_NEST_MASK)
 529                 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
 530         else
 531                 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 532         rcu_eqs_exit_common(rdtp, oldval, user);
 533 }
 534
/**
 * rcu_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * When no task-level nesting is recorded, rcu_eqs_exit() crowbars the
 * ->dynticks_nesting field to DYNTICK_TASK_EXIT_IDLE to allow for the
 * possibility of usermode upcalls messing up our count of interrupt
 * nesting level during the busy period that is just now starting.
 *
 * Interrupts are disabled around the call to rcu_eqs_exit().
 */
void rcu_idle_exit(void)
{
        unsigned long flags;

        local_irq_save(flags);
        rcu_eqs_exit(false);
        local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
 555
 556 #ifdef CONFIG_RCU_USER_QS
 557 /**
 558  * rcu_user_exit - inform RCU that we are exiting userspace.
 559  *
 560  * Exit RCU idle mode while entering the kernel because it can
 561  * run a RCU read side critical section anytime.
 562  */
 563 void rcu_user_exit(void)
 564 {
 565         rcu_eqs_exit(1);
 566 }
 567
/**
 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
 * idle mode after the current non-nesting irq returns.
 *
 * This is similar to rcu_user_exit() but in the context of an irq.
 * This is called when the irq has interrupted a userspace RCU idle mode
 * context. When the current non-nesting interrupt returns after this call,
 * the CPU won't restore the RCU idle mode.
 *
 * DYNTICK_TASK_EXIT_IDLE is added to ->dynticks_nesting, so the decrement
 * done by the matching rcu_irq_exit() leaves the counter nonzero and the
 * extended quiescent state is not re-entered.
 */
void rcu_user_exit_after_irq(void)
{
        unsigned long flags;
        struct rcu_dynticks *rdtp;

        local_irq_save(flags);
        rdtp = &__get_cpu_var(rcu_dynticks);
        /* Ensure we are interrupting an RCU idle mode. */
        WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
        rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
        local_irq_restore(flags);
}
 589 #endif /* CONFIG_RCU_USER_QS */
 590
 591 /**
 592  * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
 593  *
 594  * Enter an interrupt handler, which might possibly result in exiting
 595  * idle mode, in other words, entering the mode in which read-side critical
 596  * sections can occur.
 597  *
 598  * Note that the Linux kernel is fully capable of entering an interrupt
 599  * handler that it never exits, for example when doing upcalls to
 600  * user mode!  This code assumes that the idle loop never does upcalls to
 601  * user mode.  If your architecture does do upcalls from the idle loop (or
 602  * does anything else that results in unbalanced calls to the irq_enter()
 603  * and irq_exit() functions), RCU will give you what you deserve, good
 604  * and hard.  But very infrequently and irreproducibly.
 605  *
 606  * Use things like work queues to work around this limitation.
 607  *
 608  * You have been warned.
 609  */
 610 void rcu_irq_enter(void)
 611 {
 612         unsigned long flags;
 613         struct rcu_dynticks *rdtp;
 614         long long oldval;
 615
 616         local_irq_save(flags);
 617         rdtp = &__get_cpu_var(rcu_dynticks);
 618         oldval = rdtp->dynticks_nesting;
 619         rdtp->dynticks_nesting++;
 620         WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
 621         if (oldval)
 622                 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
 623         else
 624                 rcu_eqs_exit_common(rdtp, oldval, true);
 625         local_irq_restore(flags);
 626 }
 627
/**
 * rcu_nmi_enter - inform RCU of entry to NMI context
 *
 * If this is not a nested NMI (->dynticks_nmi_nesting is zero) and
 * ->dynticks is already odd, the CPU is already marked non-idle and
 * there is nothing to do.  Otherwise ->dynticks_nmi_nesting is bumped
 * and ->dynticks is incremented (and must end up odd) to let the RCU
 * grace-period handling know that the CPU is active.
 */
void rcu_nmi_enter(void)
{
        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);

        if (rdtp->dynticks_nmi_nesting == 0 &&
            (atomic_read(&rdtp->dynticks) & 0x1))
                return;
        rdtp->dynticks_nmi_nesting++;
        smp_mb__before_atomic_inc();  /* Force delay from prior write. */
        atomic_inc(&rdtp->dynticks);
        /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
        smp_mb__after_atomic_inc();  /* See above. */
        WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
 649
/**
 * rcu_nmi_exit - inform RCU of exit from NMI context
 *
 * If rcu_nmi_enter() did not count this NMI (->dynticks_nmi_nesting is
 * zero), or if the count is still nonzero after the decrement done
 * here, there is nothing more to do.  Otherwise ->dynticks is
 * incremented (and must end up even) to let the RCU grace-period
 * handling know that the CPU is no longer active.
 */
void rcu_nmi_exit(void)
{
        struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);

        if (rdtp->dynticks_nmi_nesting == 0 ||
            --rdtp->dynticks_nmi_nesting != 0)
                return;
        /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
        smp_mb__before_atomic_inc();  /* See above. */
        atomic_inc(&rdtp->dynticks);
        smp_mb__after_atomic_inc();  /* Force delay to next write. */
        WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
 670
 671 /**
 672  * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
 673  *
 674  * If the current CPU is in its idle loop and is neither in an interrupt
 675  * or NMI handler, return true.
 676  */
 677 int rcu_is_cpu_idle(void)
 678 {
 679         int ret;
 680
 681         preempt_disable();
 682         ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
 683         preempt_enable();
 684         return ret;
 685 }
 686 EXPORT_SYMBOL(rcu_is_cpu_idle);
 687
 688 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 689
 690 /*
 691  * Is the current CPU online?  Disable preemption to avoid false positives
 692  * that could otherwise happen due to the current CPU number being sampled,
 693  * this task being preempted, its old CPU being taken offline, resuming
 694  * on some other CPU, then determining that its old CPU is now offline.
 695  * It is OK to use RCU on an offline processor during initial boot, hence
 696  * the check for rcu_scheduler_fully_active.  Note also that it is OK
 697  * for a CPU coming online to use RCU for one jiffy prior to marking itself
 698  * online in the cpu_online_mask.  Similarly, it is OK for a CPU going
 699  * offline to continue to use RCU for one jiffy after marking itself
 700  * offline in the cpu_online_mask.  This leniency is necessary given the
 701  * non-atomic nature of the online and offline processing, for example,
 702  * the fact that a CPU enters the scheduler after completing the CPU_DYING
 703  * notifiers.
 704  *
 705  * This is also why RCU internally marks CPUs online during the
 706  * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
 707  *
 708  * Disable checking if in an NMI handler because we cannot safely report
 709  * errors from NMI handlers anyway.
 710  */
 711 bool rcu_lockdep_current_cpu_online(void)
 712 {
 713         struct rcu_data *rdp;
 714         struct rcu_node *rnp;
 715         bool ret;
 716
 717         if (in_nmi())
 718                 return 1;
 719         preempt_disable();
 720         rdp = &__get_cpu_var(rcu_sched_data);
 721         rnp = rdp->mynode;
 722         ret = (rdp->grpmask & rnp->qsmaskinit) ||
 723               !rcu_scheduler_fully_active;
 724         preempt_enable();
 725         return ret;
 726 }
 727 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 728
 729 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
 730
 731 /**
 732  * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
 733  *
 734  * If the current CPU is idle or running at a first-level (not nested)
 735  * interrupt from idle, return true.  The caller must have at least
 736  * disabled preemption.
 737  */
 738 static int rcu_is_cpu_rrupt_from_idle(void)
 739 {
 740         return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
 741 }
 742
 743 /*
 744  * Snapshot the specified CPU's dynticks counter so that we can later
 745  * credit them with an implicit quiescent state.  Return 1 if this CPU
 746  * is in dynticks idle mode, which is an extended quiescent state.
 747  */
 748 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 749 {
 750         rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
 751         return (rdp->dynticks_snap & 0x1) == 0;
 752 }
 753
 754 /*
 755  * Return true if the specified CPU has passed through a quiescent
 756  * state by virtue of being in or having passed through an dynticks
 757  * idle state since the last call to dyntick_save_progress_counter()
 758  * for this same CPU, or by virtue of having been offline.
 759  */
 760 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 761 {
 762         unsigned int curr;
 763         unsigned int snap;
 764
 765         curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
 766         snap = (unsigned int)rdp->dynticks_snap;
 767
 768         /*
 769          * If the CPU passed through or entered a dynticks idle phase with
 770          * no active irq/NMI handlers, then we can safely pretend that the CPU
 771          * already acknowledged the request to pass through a quiescent
 772          * state.  Either way, that CPU cannot possibly be in an RCU
 773          * read-side critical section that started before the beginning
 774          * of the current RCU grace period.
 775          */
 776         if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
 777                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
 778                 rdp->dynticks_fqs++;
 779                 return 1;
 780         }
 781
 782         /*
 783          * Check for the CPU being offline, but only if the grace period
 784          * is old enough.  We don't need to worry about the CPU changing
 785          * state: If we see it offline even once, it has been through a
 786          * quiescent state.
 787          *
 788          * The reason for insisting that the grace period be at least
 789          * one jiffy old is that CPUs that are not quite online and that
 790          * have just gone offline can still execute RCU read-side critical
 791          * sections.
 792          */
 793         if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
 794                 return 0;  /* Grace period is not old enough. */
 795         barrier();
 796         if (cpu_is_offline(rdp->cpu)) {
 797                 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
 798                 rdp->offline_fqs++;
 799                 return 1;
 800         }
 801         return 0;
 802 }
 803
 804 static void record_gp_stall_check_time(struct rcu_state *rsp)
 805 {
 806         rsp->gp_start = jiffies;
 807         rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
 808 }
 809
 810 /*
 811  * Dump stacks of all tasks running on stalled CPUs.  This is a fallback
 812  * for architectures that do not implement trigger_all_cpu_backtrace().
 813  * The NMI-triggered stack traces are more accurate because they are
 814  * printed by the target CPU.
 815  */
 816 static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
 817 {
 818         int cpu;
 819         unsigned long flags;
 820         struct rcu_node *rnp;
 821
 822         rcu_for_each_leaf_node(rsp, rnp) {
 823                 raw_spin_lock_irqsave(&rnp->lock, flags);
 824                 if (rnp->qsmask != 0) {
 825                         for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
 826                                 if (rnp->qsmask & (1UL << cpu))
 827                                         dump_cpu_task(rnp->grplo + cpu);
 828                 }
 829                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 830         }
 831 }
 832
 833 static void print_other_cpu_stall(struct rcu_state *rsp)
 834 {
 835         int cpu;
 836         long delta;
 837         unsigned long flags;
 838         int ndetected = 0;
 839         struct rcu_node *rnp = rcu_get_root(rsp);
 840         long totqlen = 0;
 841
 842         /* Only let one CPU complain about others per time interval. */
 843
 844         raw_spin_lock_irqsave(&rnp->lock, flags);
 845         delta = jiffies - rsp->jiffies_stall;
 846         if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
 847                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 848                 return;
 849         }
 850         rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
 851         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 852
 853         /*
 854          * OK, time to rat on our buddy...
 855          * See Documentation/RCU/stallwarn.txt for info on how to debug
 856          * RCU CPU stall warnings.
 857          */
 858         printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
 859                rsp->name);
 860         print_cpu_stall_info_begin();
 861         rcu_for_each_leaf_node(rsp, rnp) {
 862                 raw_spin_lock_irqsave(&rnp->lock, flags);
 863                 ndetected += rcu_print_task_stall(rnp);
 864                 if (rnp->qsmask != 0) {
 865                         for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
 866                                 if (rnp->qsmask & (1UL << cpu)) {
 867                                         print_cpu_stall_info(rsp,
 868                                                              rnp->grplo + cpu);
 869                                         ndetected++;
 870                                 }
 871                 }
 872                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
 873         }
 874
 875         /*
 876          * Now rat on any tasks that got kicked up to the root rcu_node
 877          * due to CPU offlining.
 878          */
 879         rnp = rcu_get_root(rsp);
 880         raw_spin_lock_irqsave(&rnp->lock, flags);
 881         ndetected += rcu_print_task_stall(rnp);
 882         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 883
 884         print_cpu_stall_info_end();
 885         for_each_possible_cpu(cpu)
 886                 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
 887         pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
 888                smp_processor_id(), (long)(jiffies - rsp->gp_start),
 889                rsp->gpnum, rsp->completed, totqlen);
 890         if (ndetected == 0)
 891                 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
 892         else if (!trigger_all_cpu_backtrace())
 893                 rcu_dump_cpu_stacks(rsp);
 894
 895         /* Complain about tasks blocking the grace period. */
 896
 897         rcu_print_detail_task_stall(rsp);
 898
 899         force_quiescent_state(rsp);  /* Kick them all. */
 900 }
 901
 902 static void print_cpu_stall(struct rcu_state *rsp)
 903 {
 904         int cpu;
 905         unsigned long flags;
 906         struct rcu_node *rnp = rcu_get_root(rsp);
 907         long totqlen = 0;
 908
 909         /*
 910          * OK, time to rat on ourselves...
 911          * See Documentation/RCU/stallwarn.txt for info on how to debug
 912          * RCU CPU stall warnings.
 913          */
 914         printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
 915         print_cpu_stall_info_begin();
 916         print_cpu_stall_info(rsp, smp_processor_id());
 917         print_cpu_stall_info_end();
 918         for_each_possible_cpu(cpu)
 919                 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
 920         pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
 921                 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
 922         if (!trigger_all_cpu_backtrace())
 923                 dump_stack();
 924
 925         raw_spin_lock_irqsave(&rnp->lock, flags);
 926         if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
 927                 rsp->jiffies_stall = jiffies +
 928                                      3 * rcu_jiffies_till_stall_check() + 3;
 929         raw_spin_unlock_irqrestore(&rnp->lock, flags);
 930
 931         set_need_resched();  /* kick ourselves to get things going. */
 932 }
 933
 934 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 935 {
 936         unsigned long j;
 937         unsigned long js;
 938         struct rcu_node *rnp;
 939
 940         if (rcu_cpu_stall_suppress)
 941                 return;
 942         j = ACCESS_ONCE(jiffies);
 943         js = ACCESS_ONCE(rsp->jiffies_stall);
 944         rnp = rdp->mynode;
 945         if (rcu_gp_in_progress(rsp) &&
 946             (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
 947
 948                 /* We haven't checked in, so go dump stack. */
 949                 print_cpu_stall(rsp);
 950
 951         } else if (rcu_gp_in_progress(rsp) &&
 952                    ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
 953
 954                 /* They had a few time units to dump stack, so complain. */
 955                 print_other_cpu_stall(rsp);
 956         }
 957 }
 958
 959 /**
 960  * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
 961  *
 962  * Set the stall-warning timeout way off into the future, thus preventing
 963  * any RCU CPU stall-warning messages from appearing in the current set of
 964  * RCU grace periods.
 965  *
 966  * The caller must disable hard irqs.
 967  */
 968 void rcu_cpu_stall_reset(void)
 969 {
 970         struct rcu_state *rsp;
 971
 972         for_each_rcu_flavor(rsp)
 973                 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
 974 }
 975
 976 /*
 977  * Update CPU-local rcu_data state to record the newly noticed grace period.
 978  * This is used both when we started the grace period and when we notice
 979  * that someone else started the grace period.  The caller must hold the
 980  * ->lock of the leaf rcu_node structure corresponding to the current CPU,
 981  *  and must have irqs disabled.
 982  */
 983 static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 984 {
 985         if (rdp->gpnum != rnp->gpnum) {
 986                 /*
 987                  * If the current grace period is waiting for this CPU,
 988                  * set up to detect a quiescent state, otherwise don't
 989                  * go looking for one.
 990                  */
 991                 rdp->gpnum = rnp->gpnum;
 992                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
 993                 rdp->passed_quiesce = 0;
 994                 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 995                 zero_cpu_stall_ticks(rdp);
 996         }
 997 }
 998
 999 static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
1000 {
1001         unsigned long flags;
1002         struct rcu_node *rnp;
1003
1004         local_irq_save(flags);
1005         rnp = rdp->mynode;
1006         if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
1007             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1008                 local_irq_restore(flags);
1009                 return;
1010         }
1011         __note_new_gpnum(rsp, rnp, rdp);
1012         raw_spin_unlock_irqrestore(&rnp->lock, flags);
1013 }
1014
1015 /*
1016  * Did someone else start a new RCU grace period start since we last
1017  * checked?  Update local state appropriately if so.  Must be called
1018  * on the CPU corresponding to rdp.
1019  */
1020 static int
1021 check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
1022 {
1023         unsigned long flags;
1024         int ret = 0;
1025
1026         local_irq_save(flags);
1027         if (rdp->gpnum != rsp->gpnum) {
1028                 note_new_gpnum(rsp, rdp);
1029                 ret = 1;
1030         }
1031         local_irq_restore(flags);
1032         return ret;
1033 }
1034
1035 /*
1036  * Initialize the specified rcu_data structure's callback list to empty.
1037  */
1038 static void init_callback_list(struct rcu_data *rdp)
1039 {
1040         int i;
1041
1042         if (init_nocb_callback_list(rdp))
1043                 return;
1044         rdp->nxtlist = NULL;
1045         for (i = 0; i < RCU_NEXT_SIZE; i++)
1046                 rdp->nxttail[i] = &rdp->nxtlist;
1047 }
1048
1049 /*
1050  * Determine the value that ->completed will have at the end of the
1051  * next subsequent grace period.  This is used to tag callbacks so that
1052  * a CPU can invoke callbacks in a timely fashion even if that CPU has
1053  * been dyntick-idle for an extended period with callbacks under the
1054  * influence of RCU_FAST_NO_HZ.
1055  *
1056  * The caller must hold rnp->lock with interrupts disabled.
1057  */
1058 static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1059                                        struct rcu_node *rnp)
1060 {
1061         /*
1062          * If RCU is idle, we just wait for the next grace period.
1063          * But we can only be sure that RCU is idle if we are looking
1064          * at the root rcu_node structure -- otherwise, a new grace
1065          * period might have started, but just not yet gotten around
1066          * to initializing the current non-root rcu_node structure.
1067          */
1068         if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1069                 return rnp->completed + 1;
1070
1071         /*
1072          * Otherwise, wait for a possible partial grace period and
1073          * then the subsequent full grace period.
1074          */
1075         return rnp->completed + 2;
1076 }
1077
1078 /*
1079  * Trace-event helper function for rcu_start_future_gp() and
1080  * rcu_nocb_wait_gp().
1081  */
1082 static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1083                                 unsigned long c, char *s)
1084 {
1085         trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1086                                       rnp->completed, c, rnp->level,
1087                                       rnp->grplo, rnp->grphi, s);
1088 }
1089
1090 /*
1091  * Start some future grace period, as needed to handle newly arrived
1092  * callbacks.  The required future grace periods are recorded in each
1093  * rcu_node structure's ->need_future_gp field.
1094  *
1095  * The caller must hold the specified rcu_node structure's ->lock.
1096  */
1097 static unsigned long __maybe_unused
1098 rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1099 {
1100         unsigned long c;
1101         int i;
1102         struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1103
1104         /*
1105          * Pick up grace-period number for new callbacks.  If this
1106          * grace period is already marked as needed, return to the caller.
1107          */
1108         c = rcu_cbs_completed(rdp->rsp, rnp);
1109         trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1110         if (rnp->need_future_gp[c & 0x1]) {
1111                 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1112                 return c;
1113         }
1114
1115         /*
1116          * If either this rcu_node structure or the root rcu_node structure
1117          * believe that a grace period is in progress, then we must wait
1118          * for the one following, which is in "c".  Because our request
1119          * will be noticed at the end of the current grace period, we don't
1120          * need to explicitly start one.
1121          */
1122         if (rnp->gpnum != rnp->completed ||
1123             ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1124                 rnp->need_future_gp[c & 0x1]++;
1125                 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1126                 return c;
1127         }
1128
1129         /*
1130          * There might be no grace period in progress.  If we don't already
1131          * hold it, acquire the root rcu_node structure's lock in order to
1132          * start one (if needed).
1133          */
1134         if (rnp != rnp_root)
1135                 raw_spin_lock(&rnp_root->lock);
1136
1137         /*
1138          * Get a new grace-period number.  If there really is no grace
1139          * period in progress, it will be smaller than the one we obtained
1140          * earlier.  Adjust callbacks as needed.  Note that even no-CBs
1141          * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1142          */
1143         c = rcu_cbs_completed(rdp->rsp, rnp_root);
1144         for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1145                 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1146                         rdp->nxtcompleted[i] = c;
1147
1148         /*
1149          * If the needed for the required grace period is already
1150          * recorded, trace and leave.
1151          */
1152         if (rnp_root->need_future_gp[c & 0x1]) {
1153                 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1154                 goto unlock_out;
1155         }
1156
1157         /* Record the need for the future grace period. */
1158         rnp_root->need_future_gp[c & 0x1]++;
1159
1160         /* If a grace period is not already in progress, start one. */
1161         if (rnp_root->gpnum != rnp_root->completed) {
1162                 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1163         } else {
1164                 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1165                 rcu_start_gp(rdp->rsp);
1166         }
1167 unlock_out:
1168         if (rnp != rnp_root)
1169                 raw_spin_unlock(&rnp_root->lock);
1170         return c;
1171 }
1172
1173 /*
1174  * Clean up any old requests for the just-ended grace period.  Also return
1175  * whether any additional grace periods have been requested.  Also invoke
1176  * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1177  * waiting for this grace period to complete.
1178  */
1179 static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1180 {
1181         int c = rnp->completed;
1182         int needmore;
1183         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1184
1185         rcu_nocb_gp_cleanup(rsp, rnp);
1186         rnp->need_future_gp[c & 0x1] = 0;
1187         needmore = rnp->need_future_gp[(c + 1) & 0x1];
1188         trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1189         return needmore;
1190 }
1191
1192 /*
1193  * If there is room, assign a ->completed number to any callbacks on
1194  * this CPU that have not already been assigned.  Also accelerate any
1195  * callbacks that were previously assigned a ->completed number that has
1196  * since proven to be too conservative, which can happen if callbacks get
1197  * assigned a ->completed number while RCU is idle, but with reference to
1198  * a non-root rcu_node structure.  This function is idempotent, so it does
1199  * not hurt to call it repeatedly.
1200  *
1201  * The caller must hold rnp->lock with interrupts disabled.
1202  */
1203 static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1204                                struct rcu_data *rdp)
1205 {
1206         unsigned long c;
1207         int i;
1208
1209         /* If the CPU has no callbacks, nothing to do. */
1210         if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1211                 return;
1212
1213         /*
1214          * Starting from the sublist containing the callbacks most
1215          * recently assigned a ->completed number and working down, find the
1216          * first sublist that is not assignable to an upcoming grace period.
1217          * Such a sublist has something in it (first two tests) and has
1218          * a ->completed number assigned that will complete sooner than
1219          * the ->completed number for newly arrived callbacks (last test).
1220          *
1221          * The key point is that any later sublist can be assigned the
1222          * same ->completed number as the newly arrived callbacks, which
1223          * means that the callbacks in any of these later sublist can be
1224          * grouped into a single sublist, whether or not they have already
1225          * been assigned a ->completed number.
1226          */
1227         c = rcu_cbs_completed(rsp, rnp);
1228         for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1229                 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1230                     !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1231                         break;
1232
1233         /*
1234          * If there are no sublist for unassigned callbacks, leave.
1235          * At the same time, advance "i" one sublist, so that "i" will
1236          * index into the sublist where all the remaining callbacks should
1237          * be grouped into.
1238          */
1239         if (++i >= RCU_NEXT_TAIL)
1240                 return;
1241
1242         /*
1243          * Assign all subsequent callbacks' ->completed number to the next
1244          * full grace period and group them all in the sublist initially
1245          * indexed by "i".
1246          */
1247         for (; i <= RCU_NEXT_TAIL; i++) {
1248                 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1249                 rdp->nxtcompleted[i] = c;
1250         }
1251
1252         /* Trace depending on how much we were able to accelerate. */
1253         if (!*rdp->nxttail[RCU_WAIT_TAIL])
1254                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1255         else
1256                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1257 }
1258
1259 /*
1260  * Move any callbacks whose grace period has completed to the
1261  * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1262  * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1263  * sublist.  This function is idempotent, so it does not hurt to
1264  * invoke it repeatedly.  As long as it is not invoked -too- often...
1265  *
1266  * The caller must hold rnp->lock with interrupts disabled.
1267  */
1268 static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1269                             struct rcu_data *rdp)
1270 {
1271         int i, j;
1272
1273         /* If the CPU has no callbacks, nothing to do. */
1274         if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1275                 return;
1276
1277         /*
1278          * Find all callbacks whose ->completed numbers indicate that they
1279          * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1280          */
1281         for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1282                 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1283                         break;
1284                 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1285         }
1286         /* Clean up any sublist tail pointers that were misordered above. */
1287         for (j = RCU_WAIT_TAIL; j < i; j++)
1288                 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1289
1290         /* Copy down callbacks to fill in empty sublists. */
1291         for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1292                 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1293                         break;
1294                 rdp->nxttail[j] = rdp->nxttail[i];
1295                 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1296         }
1297
1298         /* Classify any remaining callbacks. */
1299         rcu_accelerate_cbs(rsp, rnp, rdp);
1300 }
1301
1302 /*
1303  * Advance this CPU's callbacks, but only if the current grace period
1304  * has ended.  This may be called only from the CPU to whom the rdp
1305  * belongs.  In addition, the corresponding leaf rcu_node structure's
1306  * ->lock must be held by the caller, with irqs disabled.
1307  */
1308 static void
1309 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1310 {
1311         /* Did another grace period end? */
1312         if (rdp->completed == rnp->completed) {
1313
1314                 /* No, so just accelerate recent callbacks. */
1315                 rcu_accelerate_cbs(rsp, rnp, rdp);
1316
1317         } else {
1318
1319                 /* Advance callbacks. */
1320                 rcu_advance_cbs(rsp, rnp, rdp);
1321
1322                 /* Remember that we saw this grace-period completion. */
1323                 rdp->completed = rnp->completed;
1324                 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
1325
1326                 /*
1327                  * If we were in an extended quiescent state, we may have
1328                  * missed some grace periods that others CPUs handled on
1329                  * our behalf. Catch up with this state to avoid noting
1330                  * spurious new grace periods.  If another grace period
1331                  * has started, then rnp->gpnum will have advanced, so
1332                  * we will detect this later on.  Of course, any quiescent
1333                  * states we found for the old GP are now invalid.
1334                  */
1335                 if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
1336                         rdp->gpnum = rdp->completed;
1337                         rdp->passed_quiesce = 0;
1338                 }
1339
1340                 /*
1341                  * If RCU does not need a quiescent state from this CPU,
1342                  * then make sure that this CPU doesn't go looking for one.
1343                  */
1344                 if ((rnp->qsmask & rdp->grpmask) == 0)
1345                         rdp->qs_pending = 0;
1346         }
1347 }
1348
1349 /*
1350  * Advance this CPU's callbacks, but only if the current grace period
1351  * has ended.  This may be called only from the CPU to whom the rdp
1352  * belongs.
1353  */
1354 static void
1355 rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
1356 {
1357         unsigned long flags;
1358         struct rcu_node *rnp;
1359
1360         local_irq_save(flags);
1361         rnp = rdp->mynode;
1362         if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
1363             !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
1364                 local_irq_restore(flags);
1365                 return;
1366         }
1367         __rcu_process_gp_end(rsp, rnp, rdp);
1368         raw_spin_unlock_irqrestore(&rnp->lock, flags);
1369 }
1370
/*
 * Per-CPU part of grace-period initialization for the running CPU.
 * The caller must hold the ->lock of the leaf rcu_node structure that
 * corresponds to this CPU.
 */
static void
rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
        /*
         * First account for the grace period that just ended, which
         * advances this CPU's callbacks...
         */
        __rcu_process_gp_end(rsp, rnp, rdp);

        /*
         * ...then pick up the new grace period so that this CPU will
         * detect its next quiescent state.
         */
        __note_new_gpnum(rsp, rnp, rdp);
}
1385
1386 /*
1387  * Initialize a new grace period.
1388  */
1389 static int rcu_gp_init(struct rcu_state *rsp)
1390 {
1391         struct rcu_data *rdp;
1392         struct rcu_node *rnp = rcu_get_root(rsp);
1393
1394         raw_spin_lock_irq(&rnp->lock);
1395         rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1396
1397         if (rcu_gp_in_progress(rsp)) {
1398                 /* Grace period already in progress, don't start another.  */
1399                 raw_spin_unlock_irq(&rnp->lock);
1400                 return 0;
1401         }
1402
1403         /* Advance to a new grace period and initialize state. */
1404         rsp->gpnum++;
1405         trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1406         record_gp_stall_check_time(rsp);
1407         raw_spin_unlock_irq(&rnp->lock);
1408
1409         /* Exclude any concurrent CPU-hotplug operations. */
1410         mutex_lock(&rsp->onoff_mutex);
1411
1412         /*
1413          * Set the quiescent-state-needed bits in all the rcu_node
1414          * structures for all currently online CPUs in breadth-first order,
1415          * starting from the root rcu_node structure, relying on the layout
1416          * of the tree within the rsp->node[] array.  Note that other CPUs
1417          * will access only the leaves of the hierarchy, thus seeing that no
1418          * grace period is in progress, at least until the corresponding
1419          * leaf node has been initialized.  In addition, we have excluded
1420          * CPU-hotplug operations.
1421          *
1422          * The grace period cannot complete until the initialization
1423          * process finishes, because this kthread handles both.
1424          */
1425         rcu_for_each_node_breadth_first(rsp, rnp) {
1426                 raw_spin_lock_irq(&rnp->lock);
1427                 rdp = this_cpu_ptr(rsp->rda);
1428                 rcu_preempt_check_blocked_tasks(rnp);
1429                 rnp->qsmask = rnp->qsmaskinit;
1430                 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1431                 WARN_ON_ONCE(rnp->completed != rsp->completed);
1432                 ACCESS_ONCE(rnp->completed) = rsp->completed;
1433                 if (rnp == rdp->mynode)
1434                         rcu_start_gp_per_cpu(rsp, rnp, rdp);
1435                 rcu_preempt_boost_start_gp(rnp);
1436                 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1437                                             rnp->level, rnp->grplo,
1438                                             rnp->grphi, rnp->qsmask);
1439                 raw_spin_unlock_irq(&rnp->lock);
1440 #ifdef CONFIG_PROVE_RCU_DELAY
1441                 if ((random32() % (rcu_num_nodes * 8)) == 0)
1442                         schedule_timeout_uninterruptible(2);
1443 #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1444                 cond_resched();
1445         }
1446
1447         mutex_unlock(&rsp->onoff_mutex);
1448         return 1;
1449 }
1450
1451 /*
1452  * Do one round of quiescent-state forcing.
1453  */
1454 int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1455 {
1456         int fqs_state = fqs_state_in;
1457         struct rcu_node *rnp = rcu_get_root(rsp);
1458
1459         rsp->n_force_qs++;
1460         if (fqs_state == RCU_SAVE_DYNTICK) {
1461                 /* Collect dyntick-idle snapshots. */
1462                 force_qs_rnp(rsp, dyntick_save_progress_counter);
1463                 fqs_state = RCU_FORCE_QS;
1464         } else {
1465                 /* Handle dyntick-idle and offline CPUs. */
1466                 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1467         }
1468         /* Clear flag to prevent immediate re-entry. */
1469         if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1470                 raw_spin_lock_irq(&rnp->lock);
1471                 rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
1472                 raw_spin_unlock_irq(&rnp->lock);
1473         }
1474         return fqs_state;
1475 }
1476
1477 /*
1478  * Clean up after the old grace period.
1479  */
1480 static void rcu_gp_cleanup(struct rcu_state *rsp)
1481 {
1482         unsigned long gp_duration;
1483         int nocb = 0;
1484         struct rcu_data *rdp;
1485         struct rcu_node *rnp = rcu_get_root(rsp);
1486
1487         raw_spin_lock_irq(&rnp->lock);
1488         gp_duration = jiffies - rsp->gp_start;
1489         if (gp_duration > rsp->gp_max)
1490                 rsp->gp_max = gp_duration;
1491
1492         /*
1493          * We know the grace period is complete, but to everyone else
1494          * it appears to still be ongoing.  But it is also the case
1495          * that to everyone else it looks like there is nothing that
1496          * they can do to advance the grace period.  It is therefore
1497          * safe for us to drop the lock in order to mark the grace
1498          * period as completed in all of the rcu_node structures.
1499          */
1500         raw_spin_unlock_irq(&rnp->lock);
1501
1502         /*
1503          * Propagate new ->completed value to rcu_node structures so
1504          * that other CPUs don't have to wait until the start of the next
1505          * grace period to process their callbacks.  This also avoids
1506          * some nasty RCU grace-period initialization races by forcing
1507          * the end of the current grace period to be completely recorded in
1508          * all of the rcu_node structures before the beginning of the next
1509          * grace period is recorded in any of the rcu_node structures.
1510          */
1511         rcu_for_each_node_breadth_first(rsp, rnp) {
1512                 raw_spin_lock_irq(&rnp->lock);
1513                 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1514                 rdp = this_cpu_ptr(rsp->rda);
1515                 if (rnp == rdp->mynode)
1516                         __rcu_process_gp_end(rsp, rnp, rdp);
1517                 nocb += rcu_future_gp_cleanup(rsp, rnp);
1518                 raw_spin_unlock_irq(&rnp->lock);
1519                 cond_resched();
1520         }
1521         rnp = rcu_get_root(rsp);
1522         raw_spin_lock_irq(&rnp->lock);
1523         rcu_nocb_gp_set(rnp, nocb);
1524
1525         rsp->completed = rsp->gpnum; /* Declare grace period done. */
1526         trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1527         rsp->fqs_state = RCU_GP_IDLE;
1528         rdp = this_cpu_ptr(rsp->rda);
1529         rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */
1530         if (cpu_needs_another_gp(rsp, rdp))
1531                 rsp->gp_flags = 1;
1532         raw_spin_unlock_irq(&rnp->lock);
1533 }
1534
1535 /*
1536  * Body of kthread that handles grace periods.
      *
      * @arg is the rcu_state structure whose grace periods this kthread
      * drives.  The function never returns; each pass of the outer loop
      * handles one grace period in three phases:
      *
      *  1. Start: sleep on rsp->gp_wq until RCU_GP_FLAG_INIT is set in
      *     rsp->gp_flags, then call rcu_gp_init().  Repeat until the flag
      *     is set and rcu_gp_init() returns nonzero.
      *  2. Forcing: wait on rsp->gp_wq for at most j jiffies at a time,
      *     calling rcu_gp_fqs() on timeout or when RCU_GP_FLAG_FQS is set,
      *     until the root rcu_node has an empty ->qsmask and
      *     rcu_preempt_blocked_readers_cgp() is false for it.
      *  3. End: call rcu_gp_cleanup().
      *
      * Both waits are interruptible, so every wakeup re-checks its
      * condition and stray signals are discarded with flush_signals().
1537  */
1538 static int __noreturn rcu_gp_kthread(void *arg)
1539 {
1540         int fqs_state;
1541         unsigned long j;
1542         int ret;
1543         struct rcu_state *rsp = arg;
1544         struct rcu_node *rnp = rcu_get_root(rsp);
1545
1546         for (;;) {
1547
1548                 /* Handle grace-period start. */
1549                 for (;;) {
1550                         wait_event_interruptible(rsp->gp_wq,
1551                                                  rsp->gp_flags &
1552                                                  RCU_GP_FLAG_INIT);
1553                         if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
1554                             rcu_gp_init(rsp))
1555                                 break;
                              /* No grace period was started: yield, drop signals, retry. */
1556                         cond_resched();
1557                         flush_signals(current);
1558                 }
1559
1560                 /* Handle quiescent-state forcing. */
1561                 fqs_state = RCU_SAVE_DYNTICK;
1562                 j = jiffies_till_first_fqs;
                      /* Cap the first delay at HZ jiffies and write the cap back. */
1563                 if (j > HZ) {
1564                         j = HZ;
1565                         jiffies_till_first_fqs = HZ;
1566                 }
1567                 for (;;) {
                              /* Record when the next forcing pass is due. */
1568                         rsp->jiffies_force_qs = jiffies + j;
1569                         ret = wait_event_interruptible_timeout(rsp->gp_wq,
1570                                         (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
1571                                         (!ACCESS_ONCE(rnp->qsmask) &&
1572                                          !rcu_preempt_blocked_readers_cgp(rnp)),
1573                                         j);
1574                         /* If grace period done, leave loop. */
1575                         if (!ACCESS_ONCE(rnp->qsmask) &&
1576                             !rcu_preempt_blocked_readers_cgp(rnp))
1577                                 break;
1578                         /* If time for quiescent-state forcing, do it. */
                              /* ret == 0 means the wait timed out with its condition false. */
1579                         if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
1580                                 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1581                                 cond_resched();
1582                         } else {
1583                                 /* Deal with stray signal. */
1584                                 cond_resched();
1585                                 flush_signals(current);
1586                         }
                              /*
                               * Later delays come from jiffies_till_next_fqs, clamped
                               * to the range [1, HZ]; the clamped value is written back.
                               */
1587                         j = jiffies_till_next_fqs;
1588                         if (j > HZ) {
1589                                 j = HZ;
1590                                 jiffies_till_next_fqs = HZ;
1591                         } else if (j < 1) {
1592                                 j = 1;
1593                                 jiffies_till_next_fqs = 1;
1594                         }
1595                 }
1596
1597                 /* Handle grace-period end. */
1598                 rcu_gp_cleanup(rsp);
1599         }
1600 }
1601
1602 /*
1603  * Start a new RCU grace period if warranted, re-initializing the hierarchy
1604  * in preparation for detecting the next grace period.  The caller must hold
1605  * the root node's ->lock and hard irqs must be disabled.
1606  *
1607  * Note that it is legal for a dying CPU (which is marked as offline) to
1608  * invoke this function.  This can happen when the dying CPU reports its
1609  * quiescent state.
      *
      * This function does not initialize the grace period itself.  It only
      * sets rsp->gp_flags to RCU_GP_FLAG_INIT and wakes the waiter on
      * rsp->gp_wq, which is where rcu_gp_kthread() sleeps; that kthread
      * then calls rcu_gp_init().  The root ->lock is still held on return.
1610  */
1611 static void
1612 rcu_start_gp(struct rcu_state *rsp)
1613 {
1614         struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1615         struct rcu_node *rnp = rcu_get_root(rsp);
1616
1617         /*
1618          * If there is no grace period in progress right now, any
1619          * callbacks we have up to this point will be satisfied by the
1620          * next grace period.  Also, advancing the callbacks reduces the
1621          * probability of false positives from cpu_needs_another_gp()
1622          * resulting in pointless grace periods.  So, advance callbacks!
1623          */
1624         rcu_advance_cbs(rsp, rnp, rdp);
1625
1626         if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1627                 /*
1628                  * Either we have not yet spawned the grace-period
1629                  * task, this CPU does not need another grace period,
1630                  * or a grace period is already in progress.
1631                  * Either way, don't start a new grace period.
1632                  */
1633                 return;
1634         }
              /* Plain assignment: any other bits in ->gp_flags are cleared. */
1635         rsp->gp_flags = RCU_GP_FLAG_INIT;
1636
1637         /* Ensure that CPU is aware of completion of last grace period. */
1638         __rcu_process_gp_end(rsp, rdp->mynode, rdp);
1639
1640         /* Wake up rcu_gp_kthread() to start the grace period. */
1641         wake_up(&rsp->gp_wq);
1642 }
1643
1644 /*
1645  * Report a full set of quiescent states to the specified rcu_state
1646  * data structure.  This function itself only warns (once) if no grace
1647  * period is in progress, releases the root rcu_node's ->lock, and wakes
1648  * the waiter on rsp->gp_wq.  Note that the caller must hold the root
1649  * rcu_node's ->lock, which is released before return.
      *
      * @flags is the saved irq state from the caller's lock acquisition; it
      * is restored when the lock is dropped.  rcu_gp_kthread() sleeps on
      * rsp->gp_wq and, once woken with the root ->qsmask empty, performs
      * the end-of-grace-period work via rcu_gp_cleanup().
1650  */
1651 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1652         __releases(rcu_get_root(rsp)->lock)
1653 {
1654         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1655         raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1656         wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
1657 }
1658
1659 /*
1660  * Similar to rcu_report_qs_rdp(), for which it is a helper function.
1661  * Allows quiescent states for a group of CPUs to be reported at one go
1662  * to the specified rcu_node structure, though all the CPUs in the group
1663  * must be represented by the same rcu_node structure (which need not be
1664  * a leaf rcu_node structure, though it often will be).  That structure's
1665  * lock must be held upon entry, and it is released before return.
      *
      * @mask:  bits to clear in rnp->qsmask.
      * @rsp:   rcu_state being reported to (used for tracing and for the
      *         final rcu_report_qs_rsp() call).
      * @rnp:   rcu_node at which to start; its ->lock is held by the caller.
      * @flags: saved irq state from the caller's lock acquisition.
      *
      * The walk is hand-over-hand: each level's lock is dropped before the
      * parent's lock is taken, so at most one rcu_node lock is held at a time.
1666  */
1667 static void
1668 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1669                   struct rcu_node *rnp, unsigned long flags)
1670         __releases(rnp->lock)
1671 {
1672         struct rcu_node *rnp_c;
1673
1674         /* Walk up the rcu_node hierarchy. */
1675         for (;;) {
1676                 if (!(rnp->qsmask & mask)) {
1677
1678                         /* Our bit has already been cleared, so done. */
1679                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
1680                         return;
1681                 }
1682                 rnp->qsmask &= ~mask;
1683                 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1684                                                  mask, rnp->qsmask, rnp->level,
1685                                                  rnp->grplo, rnp->grphi,
1686                                                  !!rnp->gp_tasks);
1687                 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
1688
1689                         /* Other bits still set at this level, so done. */
1690                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
1691                         return;
1692                 }
                      /* This level is finished: its ->grpmask is cleared in the parent next. */
1693                 mask = rnp->grpmask;
1694                 if (rnp->parent == NULL) {
1695
1696                         /* No more levels.  Exit loop holding root lock. */
1697
1698                         break;
1699                 }
1700                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1701                 rnp_c = rnp;
1702                 rnp = rnp->parent;
1703                 raw_spin_lock_irqsave(&rnp->lock, flags);
                      /* The child's ->qsmask was zero when unlocked; warn if it is not now. */
1704                 WARN_ON_ONCE(rnp_c->qsmask);
1705         }
1706
1707         /*
1708          * Get here if we are the last CPU to pass through a quiescent
1709          * state for this grace period.  Invoke rcu_report_qs_rsp(),
1710          * which drops the root lock and wakes the waiter on rsp->gp_wq.
1711          */
1712         rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
1713 }
1714
1715 /*
1716  * Record a quiescent state for the specified CPU to that CPU's rcu_data
1717  * structure.  This must be either called from the specified CPU, or
1718  * called when the specified CPU is known to be offline (and when it is
1719  * also known that no other CPU is concurrently trying to help the offline
1720  * CPU).  The grace-period numbers in @rdp and its leaf rcu_node are
1721  * compared to make sure we are still in the grace period of interest.
1722  * We don't want to end the current grace period based on quiescent
      * states detected in an earlier grace period!
      *
      * @cpu: not referenced in the function body; the CPU is identified
      *       by @rdp.
      * @rsp: rcu_state passed on to rcu_accelerate_cbs() and
      *       rcu_report_qs_rnp().
      * @rdp: rcu_data of the CPU whose quiescent state is being reported.
      *
      * Acquires rdp->mynode->lock; it is released on every path, either
      * directly or by rcu_report_qs_rnp().
1723  */
1724 static void
1725 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1726 {
1727         unsigned long flags;
1728         unsigned long mask;
1729         struct rcu_node *rnp;
1730
1731         rnp = rdp->mynode;
1732         raw_spin_lock_irqsave(&rnp->lock, flags);
1733         if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
1734             rnp->completed == rnp->gpnum) {
1735
1736                 /*
1737                  * The grace period in which this quiescent state was
1738                  * recorded has ended, so don't report it upwards.
1739                  * We will instead need a new quiescent state that lies
1740                  * within the current grace period.
1741                  */
1742                 rdp->passed_quiesce = 0;        /* need qs for new gp. */
1743                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1744                 return;
1745         }
1746         mask = rdp->grpmask;
1747         if ((rnp->qsmask & mask) == 0) {
                      /* This CPU's bit is already clear in the leaf: nothing to report. */
1748                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1749         } else {
1750                 rdp->qs_pending = 0;
1751
1752                 /*
1753                  * This GP can't end until cpu checks in, so all of our
1754                  * callbacks can be processed during the next GP.
1755                  */
1756                 rcu_accelerate_cbs(rsp, rnp, rdp);
1757
1758                 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1759         }
1760 }
1761
1762 /*
1763  * Check to see if there is a new grace period of which this CPU
1764  * is not yet aware, and if so, set up local rcu_data state for it.
1765  * Otherwise, see if this CPU has just passed through its first
1766  * quiescent state for this grace period, and record that fact if so.
      *
      * The checks run in order and each one returns early:
      * check_for_new_grace_period() returned nonzero, ->qs_pending is
      * clear, or ->passed_quiesce is clear.  Only when all three pass is
      * rcu_report_qs_rdp() called, and it re-validates under the leaf
      * rcu_node's ->lock.
1767  */
1768 static void
1769 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1770 {
1771         /* If there is now a new grace period, record and return. */
1772         if (check_for_new_grace_period(rsp, rdp))
1773                 return;
1774
1775         /*
1776          * Does this CPU still need to do its part for current grace period?
1777          * If no, return and let the other CPUs do their part as well.
1778          */
1779         if (!rdp->qs_pending)
1780                 return;
1781
1782         /*
1783          * Was there a quiescent state since the beginning of the grace
1784          * period? If no, then exit and wait for the next call.
1785          */
1786         if (!rdp->passed_quiesce)
1787                 return;
1788
1789         /*
1790          * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1791          * judge of that).
1792          */
1793         rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
1794 }
1795
1796 #ifdef CONFIG_HOTPLUG_CPU
1797
1798 /*
1799  * Send the specified CPU's RCU callbacks to the orphanage.  The
1800  * specified CPU must be offline, and the caller must hold the
1801  * ->orphan_lock.
      *
      * @cpu and @rnp are not referenced in the function body; the CPU is
      * identified through @rdp.  Callbacks are moved from @rdp's ->nxtlist
      * onto two lists in @rsp: those past ->nxttail[RCU_DONE_TAIL] go to
      * the list ending at ->orphan_nxttail, and those before it go to the
      * list ending at ->orphan_donetail.  On return @rdp's callback list
      * has been reset by init_callback_list().
1802  */
1803 static void
1804 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1805                           struct rcu_node *rnp, struct rcu_data *rdp)
1806 {
1807         /* No-CBs CPUs do not have orphanable callbacks. */
1808         if (is_nocb_cpu(rdp->cpu))
1809                 return;
1810
1811         /*
1812          * Orphan the callbacks.  First adjust the counts.  This is safe
1813          * because _rcu_barrier() excludes CPU-hotplug operations, so it
1814          * cannot be running now.  Thus no memory barrier is required.
1815          */
1816         if (rdp->nxtlist != NULL) {
1817                 rsp->qlen_lazy += rdp->qlen_lazy;
1818                 rsp->qlen += rdp->qlen;
1819                 rdp->n_cbs_orphaned += rdp->qlen;
1820                 rdp->qlen_lazy = 0;
1821                 ACCESS_ONCE(rdp->qlen) = 0;
1822         }
1823
1824         /*
1825          * Next, move those callbacks still needing a grace period to
1826          * the orphanage, where some other CPU will pick them up.
1827          * Some of the callbacks might have gone partway through a grace
1828          * period, but that is too bad.  They get to start over because we
1829          * cannot assume that grace periods are synchronized across CPUs.
1830          * We don't bother updating the ->nxttail[] array yet, instead
1831          * we just reset the whole thing later on.
1832          */
1833         if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) {
                      /* Append everything after the done segment, then cut it off. */
1834                 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL];
1835                 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL];
1836                 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
1837         }
1838
1839         /*
1840          * Then move the ready-to-invoke callbacks to the orphanage,
1841          * where some other CPU will pick them up.  These will not be
1842          * required to pass though another grace period: They are done.
1843          */
              /* After the cut above, ->nxtlist holds only the done segment. */
1844         if (rdp->nxtlist != NULL) {
1845                 *rsp->orphan_donetail = rdp->nxtlist;
1846                 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
1847         }
1848
1849         /* Finally, initialize the rcu_data structure's list to empty.  */
1850         init_callback_list(rdp);
1851 }
1852
1853 /*
1854  * Adopt the RCU callbacks from the specified rcu_state structure's
1855  * orphanage.  The caller must hold the ->orphan_lock.
      *
      * The adopting CPU is the one running this function (its rcu_data is
      * obtained with __this_cpu_ptr()).  Orphaned ready-to-invoke callbacks
      * are spliced in at the end of this CPU's done segment; orphaned
      * callbacks still needing a grace period are appended at the very end
      * of its list.  Both orphan lists are left empty.
1856  */
1857 static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1858 {
1859         int i;
1860         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1861
1862         /* No-CBs CPUs are handled specially. */
1863         if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1864                 return;
1865
1866         /* Do the accounting first. */
1867         rdp->qlen_lazy += rsp->qlen_lazy;
1868         rdp->qlen += rsp->qlen;
1869         rdp->n_cbs_adopted += rsp->qlen;
              /* Counts differ only if at least one orphan is non-lazy. */
1870         if (rsp->qlen_lazy != rsp->qlen)
1871                 rcu_idle_count_callbacks_posted();
1872         rsp->qlen_lazy = 0;
1873         rsp->qlen = 0;
1874
1875         /*
1876          * We do not need a memory barrier here because the only way we
1877          * can get here if there is an rcu_barrier() in flight is if
1878          * we are the task doing the rcu_barrier().
1879          */
1880
1881         /* First adopt the ready-to-invoke callbacks. */
1882         if (rsp->orphan_donelist != NULL) {
                      /*
                       * Link the orphan done list in between this CPU's done
                       * segment and whatever followed it.  Then move every
                       * tail pointer that equalled the old done tail to the
                       * new one.  The loop counts down so that
                       * ->nxttail[RCU_DONE_TAIL], the value compared against,
                       * is overwritten last.
                       */
1883                 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL];
1884                 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist;
1885                 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--)
1886                         if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
1887                                 rdp->nxttail[i] = rsp->orphan_donetail;
1888                 rsp->orphan_donelist = NULL;
1889                 rsp->orphan_donetail = &rsp->orphan_donelist;
1890         }
1891
1892         /* And then adopt the callbacks that still need a grace period. */
1893         if (rsp->orphan_nxtlist != NULL) {
1894                 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist;
1895                 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail;
1896                 rsp->orphan_nxtlist = NULL;
1897                 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
1898         }
1899 }
1900
1901 /*
1902  * Trace the fact that this CPU is going offline.
      *
      * The only action is a trace_rcu_grace_period() event tagged "cpuofl".
      * The grace-period number traced is rnp->gpnum if this CPU's bit is
      * still set in its leaf rcu_node's ->qsmask, and rnp->gpnum + 1
      * otherwise.  All locals are declared through RCU_TRACE().
      * NOTE(review): presumably RCU_TRACE() and the tracepoint compile away
      * when RCU tracing is not configured -- confirm against their
      * definitions.
1903  */
1904 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1905 {
1906         RCU_TRACE(unsigned long mask);
1907         RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda));
1908         RCU_TRACE(struct rcu_node *rnp = rdp->mynode);
1909
1910         RCU_TRACE(mask = rdp->grpmask);
1911         trace_rcu_grace_period(rsp->name,
1912                                rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1913                                "cpuofl");
1914 }
1915
1916 /*
1917  * The CPU has been completely removed, and some other CPU is reporting
1918  * this fact from process context.  Do the remainder of the cleanup,
1919  * including orphaning the outgoing CPU's RCU callbacks, and also
1920  * adopting them.  There can only be one CPU hotplug operation at a time,
1921  * so no other CPU can be attempting to update rcu_cpu_kthread_task.
      *
      * @cpu: the CPU that has gone offline.
      * @rsp: the rcu_state being cleaned up for that CPU.
      *
      * Locks are taken in this order: rsp->onoff_mutex, then
      * rsp->orphan_lock (disabling irqs), then individual rcu_node ->lock
      * fields while walking up from the outgoing CPU's leaf.  Non-leaf locks
      * are dropped inside the loop.  The leaf lock is deliberately kept
      * held until after ->orphan_lock has been released.
1922  */
1923 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1924 {
1925         unsigned long flags;
1926         unsigned long mask;
1927         int need_report = 0;
1928         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1929         struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
1930
1931         /* Adjust any no-longer-needed kthreads. */
1932         rcu_boost_kthread_setaffinity(rnp, -1);
1933
1934         /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1935
1936         /* Exclude any attempts to start a new grace period. */
1937         mutex_lock(&rsp->onoff_mutex);
1938         raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1939
1940         /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1941         rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
1942         rcu_adopt_orphan_cbs(rsp);
1943
1944         /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1945         mask = rdp->grpmask;    /* rnp->grplo is constant. */
1946         do {
1947                 raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
1948                 rnp->qsmaskinit &= ~mask;
1949                 if (rnp->qsmaskinit != 0) {
                              /* Other bits remain at this level: stop climbing. */
1950                         if (rnp != rdp->mynode)
1951                                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1952                         break;
1953                 }
                      /* This level is now empty, so its bit is cleared in the parent. */
1954                 if (rnp == rdp->mynode)
1955                         need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1956                 else
1957                         raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1958                 mask = rnp->grpmask;
1959                 rnp = rnp->parent;
1960         } while (rnp != NULL);
1961
1962         /*
1963          * We still hold the leaf rcu_node structure lock here, and
1964          * irqs are still disabled.  The reason for this subterfuge is
1965          * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1966          * held leads to deadlock.
1967          */
1968         raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1969         rnp = rdp->mynode;
              /*
               * NOTE(review): presumably rcu_report_unblock_qs_rnp() releases
               * the leaf lock and restores irqs from flags, mirroring the
               * else branch -- confirm against its definition.
               */
1970         if (need_report & RCU_OFL_TASKS_NORM_GP)
1971                 rcu_report_unblock_qs_rnp(rnp, flags);
1972         else
1973                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1974         if (need_report & RCU_OFL_TASKS_EXP_GP)
1975                 rcu_report_exp_rnp(rsp, rnp, true);
              /* The orphaning above should have left this CPU with no callbacks. */
1976         WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1977                   "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1978                   cpu, rdp->qlen, rdp->nxtlist);
1979         init_callback_list(rdp);
1980         /* Disallow further callbacks on this CPU. */
1981         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1982         mutex_unlock(&rsp->onoff_mutex);
1983 }
1984
1985 #else /* #ifdef CONFIG_HOTPLUG_CPU */
1986
1987 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1988 {
              /* Stub used when CONFIG_HOTPLUG_CPU is not defined: intentionally empty. */
1989 }
1990
1991 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1992 {
              /* Stub used when CONFIG_HOTPLUG_CPU is not defined: intentionally empty. */
1993 }
1994
1995 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1996
1997 /*
1998  * Invoke any RCU callbacks that have made it to the end of their grace
1999  * period.  Throttle as specified by rdp->blimit.
      *
      * The work is done in three steps:
      *  1. With irqs disabled, detach the done segment (everything before
      *     ->nxttail[RCU_DONE_TAIL]) from rdp->nxtlist onto a private list.
      *  2. With irqs enabled again, invoke callbacks from the private list,
      *     stopping early once the batch limit is reached and the CPU has
      *     other work to do.
      *  3. With irqs disabled, put any uninvoked callbacks back at the
      *     front of rdp->nxtlist and update the counters and limits.
      *
      * If ready callbacks remain afterwards, RCU core processing is
      * re-invoked via invoke_rcu_core().
2000  */
2001 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2002 {
2003         unsigned long flags;
2004         struct rcu_head *next, *list, **tail;
2005         long bl, count, count_lazy;
2006         int i;
2007
2008         /* If no callbacks are ready, just return. */
2009         if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
2010                 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
2011                 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
2012                                     need_resched(), is_idle_task(current),
2013                                     rcu_is_callbacks_kthread());
2014                 return;
2015         }
2016
2017         /*
2018          * Extract the list of ready callbacks, disabling interrupts to
2019          * prevent races with call_rcu() from interrupt handlers.
2020          */
2021         local_irq_save(flags);
2022         WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2023         bl = rdp->blimit;
2024         trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
2025         list = rdp->nxtlist;
2026         rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
2027         *rdp->nxttail[RCU_DONE_TAIL] = NULL;
              /* Remember the end of the detached list for requeueing later. */
2028         tail = rdp->nxttail[RCU_DONE_TAIL];
              /*
               * Point every tail that referenced the old done tail at the
               * list head.  The loop counts down so that index
               * RCU_DONE_TAIL, the value compared against, changes last.
               */
2029         for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
2030                 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
2031                         rdp->nxttail[i] = &rdp->nxtlist;
2032         local_irq_restore(flags);
2033
2034         /* Invoke callbacks. */
2035         count = count_lazy = 0;
2036         while (list) {
                      /* Fetch ->next before __rcu_reclaim() is handed this callback. */
2037                 next = list->next;
2038                 prefetch(next);
2039                 debug_rcu_head_unqueue(list);
2040                 if (__rcu_reclaim(rsp->name, list))
2041                         count_lazy++;
2042                 list = next;
2043                 /* Stop only if limit reached and CPU has something to do. */
2044                 if (++count >= bl &&
2045                     (need_resched() ||
2046                      (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2047                         break;
2048         }
2049
2050         local_irq_save(flags);
2051         trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
2052                             is_idle_task(current),
2053                             rcu_is_callbacks_kthread());
2054
2055         /* Update count, and requeue any remaining callbacks. */
2056         if (list != NULL) {
                      /*
                       * Put the leftovers in front of whatever is queued now.
                       * Leading tail pointers that still reference the list
                       * head are moved to the end of the requeued callbacks.
                       */
2057                 *tail = rdp->nxtlist;
2058                 rdp->nxtlist = list;
2059                 for (i = 0; i < RCU_NEXT_SIZE; i++)
2060                         if (&rdp->nxtlist == rdp->nxttail[i])
2061                                 rdp->nxttail[i] = tail;
2062                         else
2063                                 break;
2064         }
2065         smp_mb(); /* List handling before counting for rcu_barrier(). */
2066         rdp->qlen_lazy -= count_lazy;
2067         ACCESS_ONCE(rdp->qlen) -= count;
2068         rdp->n_cbs_invoked += count;
2069
2070         /* Reinstate batch limit if we have worked down the excess. */
2071         if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
2072                 rdp->blimit = blimit;
2073
2074         /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
2075         if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
2076                 rdp->qlen_last_fqs_check = 0;
2077                 rdp->n_force_qs_snap = rsp->n_force_qs;
2078         } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
2079                 rdp->qlen_last_fqs_check = rdp->qlen;
              /* An empty list and a zero ->qlen must go together. */
2080         WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
2081
2082         local_irq_restore(flags);
2083
2084         /* Re-invoke RCU core processing if there are callbacks remaining. */
2085         if (cpu_has_callbacks_ready_to_invoke(rdp))
2086                 invoke_rcu_core();
2087 }
2088
2089 /*
2090  * Check to see if this CPU is in a non-context-switch quiescent state
2091  * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
2092  * Also schedule RCU core processing.
2093  *
2094  * This function must be called from hardirq context.  It is normally
2095  * invoked from the scheduling-clock interrupt.  If rcu_pending returns
2096  * false, there is no point in invoking rcu_check_callbacks().
      *
      * @cpu:  passed to the quiescent-state helpers and to rcu_pending().
      * @user: nonzero if the interrupt was taken from user mode.
      *
      * NOTE(review): the body invokes the RCU core only when rcu_pending()
      * returns true.  The last sentence of the paragraph above may be meant
      * to describe that -- confirm.
2097  */
2098 void rcu_check_callbacks(int cpu, int user)
2099 {
2100         trace_rcu_utilization("Start scheduler-tick");
2101         increment_cpu_stall_ticks();
2102         if (user || rcu_is_cpu_rrupt_from_idle()) {
2103
2104                 /*
2105                  * Get here if this CPU took its interrupt from user
2106                  * mode or from the idle loop, and if this is not a
2107                  * nested interrupt.  In this case, the CPU is in
2108                  * a quiescent state, so note it.
2109                  *
2110                  * No memory barrier is required here because both
2111                  * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
2112                  * variables that other CPUs neither access nor modify,
2113                  * at least not while the corresponding CPU is online.
2114                  */
2115
2116                 rcu_sched_qs(cpu);
2117                 rcu_bh_qs(cpu);
2118
2119         } else if (!in_softirq()) {
2120
2121                 /*
2122                  * Get here if this CPU did not take its interrupt from
2123                  * softirq, in other words, if it is not interrupting
2124                  * a rcu_bh read-side critical section.  The CPU is thus
2125                  * in an rcu_bh quiescent state, so note it.
2126                  */
2127
2128                 rcu_bh_qs(cpu);
2129         }
2130         rcu_preempt_check_callbacks(cpu);
2131         if (rcu_pending(cpu))
2132                 invoke_rcu_core();
2133         trace_rcu_utilization("End scheduler-tick");
2134 }
2135
2136 /*
2137  * Scan the leaf rcu_node structures, processing dyntick state for any that
2138  * have not yet encountered a quiescent state, using the function specified.
2139  * Also initiate boosting for any threads blocked on the root rcu_node.
2140  *
2141  * The caller must have suppressed start of new grace periods.
      *
      * @rsp: rcu_state whose leaves are scanned.
      * @f:   called, with the leaf's ->lock held, on the rcu_data of each
      *       CPU whose bit is still set in the leaf's ->qsmask.  A nonzero
      *       return adds that CPU's bit to the set reported through
      *       rcu_report_qs_rnp().
      *
      * The scan stops as soon as rcu_gp_in_progress() reports that no grace
      * period is in progress.
2142  */
2143 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2144 {
2145         unsigned long bit;
2146         int cpu;
2147         unsigned long flags;
2148         unsigned long mask;
2149         struct rcu_node *rnp;
2150
2151         rcu_for_each_leaf_node(rsp, rnp) {
2152                 cond_resched();
2153                 mask = 0;
2154                 raw_spin_lock_irqsave(&rnp->lock, flags);
2155                 if (!rcu_gp_in_progress(rsp)) {
2156                         raw_spin_unlock_irqrestore(&rnp->lock, flags);
2157                         return;
2158                 }
2159                 if (rnp->qsmask == 0) {
2160                         rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
2161                         continue;
2162                 }
                      /* Bit 0 of ->qsmask is CPU ->grplo; each later CPU is the next bit. */
2163                 cpu = rnp->grplo;
2164                 bit = 1;
2165                 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2166                         if ((rnp->qsmask & bit) != 0 &&
2167                             f(per_cpu_ptr(rsp->rda, cpu)))
2168                                 mask |= bit;
2169                 }
2170                 if (mask != 0) {
2171
2172                         /* rcu_report_qs_rnp() releases rnp->lock. */
2173                         rcu_report_qs_rnp(mask, rsp, rnp, flags);
2174                         continue;
2175                 }
2176                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2177         }
2178         rnp = rcu_get_root(rsp);
              /* The root's ->qsmask is read here before its lock is taken. */
2179         if (rnp->qsmask == 0) {
2180                 raw_spin_lock_irqsave(&rnp->lock, flags);
2181                 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
2182         }
2183 }
2184
2185 /*
2186  * Force quiescent states on reluctant CPUs, and also detect which
2187  * CPUs are in dyntick-idle mode.
      *
      * This function does not do the forcing itself.  It sets
      * RCU_GP_FLAG_FQS in rsp->gp_flags and wakes the waiter on rsp->gp_wq;
      * rcu_gp_kthread() tests that flag and calls rcu_gp_fqs().
      *
      * To limit contention, callers climb from their own leaf rcu_node to
      * the root, taking each level's ->fqslock with a trylock and releasing
      * the previous level's only after trying the next.  A caller gives up,
      * incrementing rsp->n_force_qs_lh, if the flag is already set or a
      * trylock fails.
2188  */
2189 static void force_quiescent_state(struct rcu_state *rsp)
2190 {
2191         unsigned long flags;
2192         bool ret;
2193         struct rcu_node *rnp;
2194         struct rcu_node *rnp_old = NULL;
2195
2196         /* Funnel through hierarchy to reduce memory contention. */
2197         rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
2198         for (; rnp != NULL; rnp = rnp->parent) {
                      /* ret is true if we must give up; rnp->fqslock is then not held. */
2199                 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
2200                       !raw_spin_trylock(&rnp->fqslock);
2201                 if (rnp_old != NULL)
2202                         raw_spin_unlock(&rnp_old->fqslock);
2203                 if (ret) {
2204                         rsp->n_force_qs_lh++;
2205                         return;
2206                 }
2207                 rnp_old = rnp;
2208         }
2209         /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
2210
2211         /* Reached the root of the rcu_node tree, acquire lock. */
2212         raw_spin_lock_irqsave(&rnp_old->lock, flags);
2213         raw_spin_unlock(&rnp_old->fqslock);
              /* Re-check under the root lock before setting the flag. */
2214         if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2215                 rsp->n_force_qs_lh++;
2216                 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2217                 return;  /* Someone beat us to it. */
2218         }
2219         rsp->gp_flags |= RCU_GP_FLAG_FQS;
2220         raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2221         wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
2222 }
2223
2224 /*
2225  * This does the RCU core processing work for the specified rcu_state
2226  * and rcu_data structures.  This may be called only from the CPU to
2227  * whom the rdp belongs.
      *
      * The rcu_data is the running CPU's (obtained with __this_cpu_ptr()).
      * In order, the function notes the end of a grace period, reports any
      * recent quiescent state, requests a new grace period if this CPU
      * needs one, and then invokes callbacks that are ready.
2228  */
2229 static void
2230 __rcu_process_callbacks(struct rcu_state *rsp)
2231 {
2232         unsigned long flags;
2233         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2234
2235         WARN_ON_ONCE(rdp->beenonline == 0);
2236
2237         /* Handle the end of a grace period that some other CPU ended.  */
2238         rcu_process_gp_end(rsp, rdp);
2239
2240         /* Update RCU state based on any recent quiescent states. */
2241         rcu_check_quiescent_state(rsp, rdp);
2242
2243         /* Does this CPU require a not-yet-started grace period? */
              /*
               * The check is made with irqs off but without the root lock,
               * which is taken only if a grace period appears to be needed.
               * rcu_start_gp() requires that lock and repeats the check.
               */
2244         local_irq_save(flags);
2245         if (cpu_needs_another_gp(rsp, rdp)) {
2246                 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2247                 rcu_start_gp(rsp);
2248                 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2249         } else {
2250                 local_irq_restore(flags);
2251         }
2252
2253         /* If there are callbacks ready, invoke them. */
2254         if (cpu_has_callbacks_ready_to_invoke(rdp))
2255                 invoke_rcu_callbacks(rsp, rdp);
2256 }
2257
2258 /*
2259  * Do RCU core processing for the current CPU.
      *
      * Does nothing if the current CPU is offline.  Otherwise calls
      * __rcu_process_callbacks() once for each RCU flavor, bracketed by
      * "Start RCU core" / "End RCU core" utilization trace events.  The
      * @unused argument is ignored.
2260  */
2261 static void rcu_process_callbacks(struct softirq_action *unused)
2262 {
2263         struct rcu_state *rsp;
2264
2265         if (cpu_is_offline(smp_processor_id()))
2266                 return;
2267         trace_rcu_utilization("Start RCU core");
2268         for_each_rcu_flavor(rsp)
2269                 __rcu_process_callbacks(rsp);
2270         trace_rcu_utilization("End RCU core");
2271 }
2272
2273 /*
2274  * Schedule RCU callback invocation.  If the specified type of RCU
2275  * does not support RCU priority boosting, just do a direct call,
2276  * otherwise wake up the per-CPU kernel kthread.  Note that because we
2277  * are running on the current CPU with interrupts disabled, the
2278  * rcu_cpu_kthread_task cannot disappear out from under us.
      *
      * Nothing is done at all while rcu_scheduler_fully_active is zero.
2279  */
2280 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2281 {
2282         if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active)))
2283                 return;
              /* rsp->boost clear: run the batch directly in this context. */
2284         if (likely(!rsp->boost)) {
2285                 rcu_do_batch(rsp, rdp);
2286                 return;
2287         }
2288         invoke_rcu_callbacks_kthread();
2289 }
2290
2291 static void invoke_rcu_core(void)
2292 {
2293         raise_softirq(RCU_SOFTIRQ);
2294 }
2295
2296 /*
2297  * Handle any core-RCU processing required by a call_rcu() invocation.
2298  */
2299 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2300                             struct rcu_head *head, unsigned long flags)
2301 {
2302         /*
2303          * If called from an extended quiescent state, invoke the RCU
2304          * core in order to force a re-evaluation of RCU's idleness.
2305          */
2306         if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
2307                 invoke_rcu_core();
2308
2309         /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
2310         if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
2311                 return;
2312
2313         /*
2314          * Force the grace period if too many callbacks or too long waiting.
2315          * Enforce hysteresis, and don't invoke force_quiescent_state()
2316          * if some other CPU has recently done so.  Also, don't bother
2317          * invoking force_quiescent_state() if the newly enqueued callback
2318          * is the only one waiting for a grace period to complete.
2319          */
2320         if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
2321
2322                 /* Are we ignoring a completed grace period? */
2323                 rcu_process_gp_end(rsp, rdp);
2324                 check_for_new_grace_period(rsp, rdp);
2325
2326                 /* Start a new grace period if one not already started. */
2327                 if (!rcu_gp_in_progress(rsp)) {
2328                         struct rcu_node *rnp_root = rcu_get_root(rsp);
2329
2330                         raw_spin_lock(&rnp_root->lock);
2331                         rcu_start_gp(rsp);
2332                         raw_spin_unlock(&rnp_root->lock);
2333                 } else {
2334                         /* Give the grace period a kick. */
2335                         rdp->blimit = LONG_MAX;
2336                         if (rsp->n_force_qs == rdp->n_force_qs_snap &&
2337                             *rdp->nxttail[RCU_DONE_TAIL] != head)
2338                                 force_quiescent_state(rsp);
2339                         rdp->n_force_qs_snap = rsp->n_force_qs;
2340                         rdp->qlen_last_fqs_check = rdp->qlen;
2341                 }
2342         }
2343 }
2344
2345 /*
2346  * Helper function for call_rcu() and friends.  The cpu argument will
2347  * normally be -1, indicating "currently running CPU".  It may specify
2348  * a CPU only if that CPU is a no-CBs CPU.  Currently, only _rcu_barrier()
2349  * is expected to specify a CPU.
2350  */
2351 static void
2352 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2353            struct rcu_state *rsp, int cpu, bool lazy)
2354 {
2355         unsigned long flags;
2356         struct rcu_data *rdp;
2357
2358         WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2359         debug_rcu_head_queue(head);
2360         head->func = func;
2361         head->next = NULL;
2362
2363         /*
2364          * Opportunistically note grace-period endings and beginnings.
2365          * Note that we might see a beginning right after we see an
2366          * end, but never vice versa, since this CPU has to pass through
2367          * a quiescent state betweentimes.
2368          */
2369         local_irq_save(flags);
2370         rdp = this_cpu_ptr(rsp->rda);
2371
2372         /* Add the callback to our list. */
2373         if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2374                 int offline;
2375
2376                 if (cpu != -1)
2377                         rdp = per_cpu_ptr(rsp->rda, cpu);
2378                 offline = !__call_rcu_nocb(rdp, head, lazy);
2379                 WARN_ON_ONCE(offline);
2380                 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2381                 local_irq_restore(flags);
2382                 return;
2383         }
2384         ACCESS_ONCE(rdp->qlen)++;
2385         if (lazy)
2386                 rdp->qlen_lazy++;
2387         else
2388                 rcu_idle_count_callbacks_posted();
2389         smp_mb();  /* Count before adding callback for rcu_barrier(). */
2390         *rdp->nxttail[RCU_NEXT_TAIL] = head;
2391         rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
2392
2393         if (__is_kfree_rcu_offset((unsigned long)func))
2394                 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
2395                                          rdp->qlen_lazy, rdp->qlen);
2396         else
2397                 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
2398
2399         /* Go handle any RCU core processing required. */
2400         __call_rcu_core(rsp, rdp, head, flags);
2401         local_irq_restore(flags);
2402 }
2403
2404 /*
2405  * Queue an RCU-sched callback for invocation after a grace period.
2406  */
2407 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2408 {
2409         __call_rcu(head, func, &rcu_sched_state, -1, 0);
2410 }
2411 EXPORT_SYMBOL_GPL(call_rcu_sched);
2412
2413 /*
2414  * Queue an RCU callback for invocation after a quicker grace period.
2415  */
2416 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2417 {
2418         __call_rcu(head, func, &rcu_bh_state, -1, 0);
2419 }
2420 EXPORT_SYMBOL_GPL(call_rcu_bh);
2421
/*
 * A context switch is a grace period for both RCU-sched and RCU-bh, so
 * a blocking grace-period wait is automatically a grace period if only
 * one CPU is online at any point in time during the execution of
 * synchronize_sched() or synchronize_rcu_bh().  Returns nonzero in
 * that case.  Occasionally reporting multiple CPUs when there was only
 * ever one is harmless: it merely costs some overhead, and RCU still
 * operates correctly.
 */
static inline int rcu_blocking_is_gp(void)
{
        int single_cpu;

        might_sleep();  /* Check for RCU read-side critical section. */

        /* Sample the online-CPU count with preemption disabled. */
        preempt_disable();
        single_cpu = num_online_cpus() <= 1;
        preempt_enable();

        return single_cpu;
}
2441
2442 /**
2443  * synchronize_sched - wait until an rcu-sched grace period has elapsed.
2444  *
2445  * Control will return to the caller some time after a full rcu-sched
2446  * grace period has elapsed, in other words after all currently executing
2447  * rcu-sched read-side critical sections have completed.   These read-side
2448  * critical sections are delimited by rcu_read_lock_sched() and
2449  * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(),
2450  * local_irq_disable(), and so on may be used in place of
2451  * rcu_read_lock_sched().
2452  *
2453  * This means that all preempt_disable code sequences, including NMI and
2454  * non-threaded hardware-interrupt handlers, in progress on entry will
2455  * have completed before this primitive returns.  However, this does not
2456  * guarantee that softirq handlers will have completed, since in some
2457  * kernels, these handlers can run in process context, and can block.
2458  *
2459  * Note that this guarantee implies further memory-ordering guarantees.
2460  * On systems with more than one CPU, when synchronize_sched() returns,
2461  * each CPU is guaranteed to have executed a full memory barrier since the
2462  * end of its last RCU-sched read-side critical section whose beginning
2463  * preceded the call to synchronize_sched().  In addition, each CPU having
2464  * an RCU read-side critical section that extends beyond the return from
2465  * synchronize_sched() is guaranteed to have executed a full memory barrier
2466  * after the beginning of synchronize_sched() and before the beginning of
2467  * that RCU read-side critical section.  Note that these guarantees include
2468  * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2469  * that are executing in the kernel.
2470  *
2471  * Furthermore, if CPU A invoked synchronize_sched(), which returned
2472  * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2473  * to have executed a full memory barrier during the execution of
2474  * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2475  * again only if the system has more than one CPU).
2476  *
2477  * This primitive provides the guarantees made by the (now removed)
2478  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
2479  * guarantees that rcu_read_lock() sections will have completed.
2480  * In "classic RCU", these two guarantees happen to be one and
2481  * the same, but can differ in realtime RCU implementations.
2482  */
2483 void synchronize_sched(void)
2484 {
2485         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
2486                            !lock_is_held(&rcu_lock_map) &&
2487                            !lock_is_held(&rcu_sched_lock_map),
2488                            "Illegal synchronize_sched() in RCU-sched read-side critical section");
2489         if (rcu_blocking_is_gp())
2490                 return;
2491         if (rcu_expedited)
2492                 synchronize_sched_expedited();
2493         else
2494                 wait_rcu_gp(call_rcu_sched);
2495 }
2496 EXPORT_SYMBOL_GPL(synchronize_sched);
2497
2498 /**
2499  * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
2500  *
2501  * Control will return to the caller some time after a full rcu_bh grace
2502  * period has elapsed, in other words after all currently executing rcu_bh
2503  * read-side critical sections have completed.  RCU read-side critical
2504  * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2505  * and may be nested.
2506  *
2507  * See the description of synchronize_sched() for more detailed information
2508  * on memory ordering guarantees.
2509  */
2510 void synchronize_rcu_bh(void)
2511 {
2512         rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
2513                            !lock_is_held(&rcu_lock_map) &&
2514                            !lock_is_held(&rcu_sched_lock_map),
2515                            "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2516         if (rcu_blocking_is_gp())
2517                 return;
2518         if (rcu_expedited)
2519                 synchronize_rcu_bh_expedited();
2520         else
2521                 wait_rcu_gp(call_rcu_bh);
2522 }
2523 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2524
/*
 * Per-CPU function handed to try_stop_cpus() by
 * synchronize_sched_expedited().  @data is unused; always returns 0.
 */
static int synchronize_sched_expedited_cpu_stop(void *data)
{
        /*
         * Each affected CPU must execute a full memory barrier somewhere
         * between the call to try_stop_cpus() and its return.
         *
         * The current cpu_stop implementation already guarantees this
         * by the time we get here, so the smp_mb() below is not strictly
         * required.  It is kept as documentation and as insurance
         * against future changes to cpu_stop.
         */
        smp_mb(); /* See above comment block. */

        return 0;
}
2541
2542 /**
2543  * synchronize_sched_expedited - Brute-force RCU-sched grace period
2544  *
2545  * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
2546  * approach to force the grace period to end quickly.  This consumes
2547  * significant time on all CPUs and is unfriendly to real-time workloads,
2548  * so is thus not recommended for any sort of common-case code.  In fact,
2549  * if you are using synchronize_sched_expedited() in a loop, please
2550  * restructure your code to batch your updates, and then use a single
2551  * synchronize_sched() instead.
2552  *
2553  * Note that it is illegal to call this function while holding any lock
2554  * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
2555  * to call this function from a CPU-hotplug notifier.  Failing to observe
2556  * these restrictions will result in deadlock.
2557  *
2558  * This implementation can be thought of as an application of ticket
2559  * locking to RCU, with sync_sched_expedited_started and
2560  * sync_sched_expedited_done taking on the roles of the halves
2561  * of the ticket-lock word.  Each task atomically increments
2562  * sync_sched_expedited_started upon entry, snapshotting the old value,
2563  * then attempts to stop all the CPUs.  If this succeeds, then each
2564  * CPU will have executed a context switch, resulting in an RCU-sched
2565  * grace period.  We are then done, so we use atomic_cmpxchg() to
2566  * update sync_sched_expedited_done to match our snapshot -- but
2567  * only if someone else has not already advanced past our snapshot.
2568  *
2569  * On the other hand, if try_stop_cpus() fails, we check the value
2570  * of sync_sched_expedited_done.  If it has advanced past our
2571  * initial snapshot, then someone else must have forced a grace period
2572  * some time after we took our snapshot.  In this case, our work is
2573  * done for us, and we can simply return.  Otherwise, we try again,
2574  * but keep our initial snapshot for purposes of checking for someone
2575  * doing our work for us.
2576  *
2577  * If we fail too many times in a row, we fall back to synchronize_sched().
2578  */
2579 void synchronize_sched_expedited(void)
2580 {
2581         long firstsnap, s, snap;
2582         int trycount = 0;
2583         struct rcu_state *rsp = &rcu_sched_state;
2584
2585         /*
2586          * If we are in danger of counter wrap, just do synchronize_sched().
2587          * By allowing sync_sched_expedited_started to advance no more than
2588          * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2589          * that more than 3.5 billion CPUs would be required to force a
2590          * counter wrap on a 32-bit system.  Quite a few more CPUs would of
2591          * course be required on a 64-bit system.
2592          */
2593         if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2594                          (ulong)atomic_long_read(&rsp->expedited_done) +
2595                          ULONG_MAX / 8)) {
2596                 synchronize_sched();
2597                 atomic_long_inc(&rsp->expedited_wrap);
2598                 return;
2599         }
2600
2601         /*
2602          * Take a ticket.  Note that atomic_inc_return() implies a
2603          * full memory barrier.
2604          */
2605         snap = atomic_long_inc_return(&rsp->expedited_start);
2606         firstsnap = snap;
2607         get_online_cpus();
2608         WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2609
2610         /*
2611          * Each pass through the following loop attempts to force a
2612          * context switch on each CPU.
2613          */
2614         while (try_stop_cpus(cpu_online_mask,
2615                              synchronize_sched_expedited_cpu_stop,
2616                              NULL) == -EAGAIN) {
2617                 put_online_cpus();
2618                 atomic_long_inc(&rsp->expedited_tryfail);
2619
2620                 /* Check to see if someone else did our work for us. */
2621                 s = atomic_long_read(&rsp->expedited_done);
2622                 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2623                         /* ensure test happens before caller kfree */
2624                         smp_mb__before_atomic_inc(); /* ^^^ */
2625                         atomic_long_inc(&rsp->expedited_workdone1);
2626                         return;
2627                 }
2628
2629                 /* No joy, try again later.  Or just synchronize_sched(). */
2630                 if (trycount++ < 10) {
2631                         udelay(trycount * num_online_cpus());
2632                 } else {
2633                         wait_rcu_gp(call_rcu_sched);
2634                         atomic_long_inc(&rsp->expedited_normal);
2635                         return;
2636                 }
2637
2638                 /* Recheck to see if someone else did our work for us. */
2639                 s = atomic_long_read(&rsp->expedited_done);
2640                 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2641                         /* ensure test happens before caller kfree */
2642                         smp_mb__before_atomic_inc(); /* ^^^ */
2643                         atomic_long_inc(&rsp->expedited_workdone2);
2644                         return;
2645                 }
2646
2647                 /*
2648                  * Refetching sync_sched_expedited_started allows later
2649                  * callers to piggyback on our grace period.  We retry
2650                  * after they started, so our grace period works for them,
2651                  * and they started after our first try, so their grace
2652                  * period works for us.
2653                  */
2654                 get_online_cpus();
2655                 snap = atomic_long_read(&rsp->expedited_start);
2656                 smp_mb(); /* ensure read is before try_stop_cpus(). */
2657         }
2658         atomic_long_inc(&rsp->expedited_stoppedcpus);
2659
2660         /*
2661          * Everyone up to our most recent fetch is covered by our grace
2662          * period.  Update the counter, but only if our work is still
2663          * relevant -- which it won't be if someone who started later
2664          * than we did already did their update.
2665          */
2666         do {
2667                 atomic_long_inc(&rsp->expedited_done_tries);
2668                 s = atomic_long_read(&rsp->expedited_done);
2669                 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2670                         /* ensure test happens before caller kfree */
2671                         smp_mb__before_atomic_inc(); /* ^^^ */
2672                         atomic_long_inc(&rsp->expedited_done_lost);
2673                         break;
2674                 }
2675         } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2676         atomic_long_inc(&rsp->expedited_done_exit);
2677
2678         put_online_cpus();
2679 }
2680 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2681
2682 /*
2683  * Check to see if there is any immediate RCU-related work to be done
2684  * by the current CPU, for the specified type of RCU, returning 1 if so.
2685  * The checks are in order of increasing expense: checks that can be
2686  * carried out against CPU-local state are performed first.  However,
2687  * we must check for CPU stalls first, else we might not get a chance.
2688  */
2689 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2690 {
2691         struct rcu_node *rnp = rdp->mynode;
2692
2693         rdp->n_rcu_pending++;
2694
2695         /* Check for CPU stalls, if enabled. */
2696         check_cpu_stall(rsp, rdp);
2697
2698         /* Is the RCU core waiting for a quiescent state from this CPU? */
2699         if (rcu_scheduler_fully_active &&
2700             rdp->qs_pending && !rdp->passed_quiesce) {
2701                 rdp->n_rp_qs_pending++;
2702         } else if (rdp->qs_pending && rdp->passed_quiesce) {
2703                 rdp->n_rp_report_qs++;
2704                 return 1;
2705         }
2706
2707         /* Does this CPU have callbacks ready to invoke? */
2708         if (cpu_has_callbacks_ready_to_invoke(rdp)) {
2709                 rdp->n_rp_cb_ready++;
2710                 return 1;
2711         }
2712
2713         /* Has RCU gone idle with this CPU needing another grace period? */
2714         if (cpu_needs_another_gp(rsp, rdp)) {
2715                 rdp->n_rp_cpu_needs_gp++;
2716                 return 1;
2717         }
2718
2719         /* Has another RCU grace period completed?  */
2720         if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
2721                 rdp->n_rp_gp_completed++;
2722                 return 1;
2723         }
2724
2725         /* Has a new RCU grace period started? */
2726         if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
2727                 rdp->n_rp_gp_started++;
2728                 return 1;
2729         }
2730
2731         /* nothing to do */
2732         rdp->n_rp_need_nothing++;
2733         return 0;
2734 }
2735
2736 /*
2737  * Check to see if there is any immediate RCU-related work to be done
2738  * by the current CPU, returning 1 if so.  This function is part of the
2739  * RCU implementation; it is -not- an exported member of the RCU API.
2740  */
2741 static int rcu_pending(int cpu)
2742 {
2743         struct rcu_state *rsp;
2744
2745         for_each_rcu_flavor(rsp)
2746                 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2747                         return 1;
2748         return 0;
2749 }
2750
2751 /*
2752  * Return true if the specified CPU has any callback.  If all_lazy is
2753  * non-NULL, store an indication of whether all callbacks are lazy.
2754  * (If there are no callbacks, all of them are deemed to be lazy.)
2755  */
2756 static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2757 {
2758         bool al = true;
2759         bool hc = false;
2760         struct rcu_data *rdp;
2761         struct rcu_state *rsp;
2762
2763         for_each_rcu_flavor(rsp) {
2764                 rdp = per_cpu_ptr(rsp->rda, cpu);
2765                 if (rdp->qlen != rdp->qlen_lazy)
2766                         al = false;
2767                 if (rdp->nxtlist)
2768                         hc = true;
2769         }
2770         if (all_lazy)
2771                 *all_lazy = al;
2772         return hc;
2773 }
2774
2775 /*
2776  * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
2777  * the compiler is expected to optimize this away.
2778  */
2779 static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2780                                int cpu, unsigned long done)
2781 {
2782         trace_rcu_barrier(rsp->name, s, cpu,
2783                           atomic_read(&rsp->barrier_cpu_count), done);
2784 }
2785
2786 /*
2787  * RCU callback function for _rcu_barrier().  If we are last, wake
2788  * up the task executing _rcu_barrier().
2789  */
2790 static void rcu_barrier_callback(struct rcu_head *rhp)
2791 {
2792         struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
2793         struct rcu_state *rsp = rdp->rsp;
2794
2795         if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2796                 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2797                 complete(&rsp->barrier_completion);
2798         } else {
2799                 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2800         }
2801 }
2802
2803 /*
2804  * Called with preemption disabled, and from cross-cpu IRQ context.
2805  */
2806 static void rcu_barrier_func(void *type)
2807 {
2808         struct rcu_state *rsp = type;
2809         struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2810
2811         _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2812         atomic_inc(&rsp->barrier_cpu_count);
2813         rsp->call(&rdp->barrier_head, rcu_barrier_callback);
2814 }
2815
2816 /*
2817  * Orchestrate the specified type of RCU barrier, waiting for all
2818  * RCU callbacks of the specified type to complete.
2819  */
2820 static void _rcu_barrier(struct rcu_state *rsp)
2821 {
2822         int cpu;
2823         struct rcu_data *rdp;
2824         unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2825         unsigned long snap_done;
2826
2827         _rcu_barrier_trace(rsp, "Begin", -1, snap);
2828
2829         /* Take mutex to serialize concurrent rcu_barrier() requests. */
2830         mutex_lock(&rsp->barrier_mutex);
2831
2832         /*
2833          * Ensure that all prior references, including to ->n_barrier_done,
2834          * are ordered before the _rcu_barrier() machinery.
2835          */
2836         smp_mb();  /* See above block comment. */
2837
2838         /*
2839          * Recheck ->n_barrier_done to see if others did our work for us.
2840          * This means checking ->n_barrier_done for an even-to-odd-to-even
2841          * transition.  The "if" expression below therefore rounds the old
2842          * value up to the next even number and adds two before comparing.
2843          */
2844         snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2845         _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2846         if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2847                 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2848                 smp_mb(); /* caller's subsequent code after above check. */
2849                 mutex_unlock(&rsp->barrier_mutex);
2850                 return;
2851         }
2852
2853         /*
2854          * Increment ->n_barrier_done to avoid duplicate work.  Use
2855          * ACCESS_ONCE() to prevent the compiler from speculating
2856          * the increment to precede the early-exit check.
2857          */
2858         ACCESS_ONCE(rsp->n_barrier_done)++;
2859         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2860         _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2861         smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2862
2863         /*
2864          * Initialize the count to one rather than to zero in order to
2865          * avoid a too-soon return to zero in case of a short grace period
2866          * (or preemption of this task).  Exclude CPU-hotplug operations
2867          * to ensure that no offline CPU has callbacks queued.
2868          */
2869         init_completion(&rsp->barrier_completion);
2870         atomic_set(&rsp->barrier_cpu_count, 1);
2871         get_online_cpus();
2872
2873         /*
2874          * Force each CPU with callbacks to register a new callback.
2875          * When that callback is invoked, we will know that all of the
2876          * corresponding CPU's preceding callbacks have been invoked.
2877          */
2878         for_each_possible_cpu(cpu) {
2879                 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2880                         continue;
2881                 rdp = per_cpu_ptr(rsp->rda, cpu);
2882                 if (is_nocb_cpu(cpu)) {
2883                         _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2884                                            rsp->n_barrier_done);
2885                         atomic_inc(&rsp->barrier_cpu_count);
2886                         __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2887                                    rsp, cpu, 0);
2888                 } else if (ACCESS_ONCE(rdp->qlen)) {
2889                         _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2890                                            rsp->n_barrier_done);
2891                         smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2892                 } else {
2893                         _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2894                                            rsp->n_barrier_done);
2895                 }
2896         }
2897         put_online_cpus();
2898
2899         /*
2900          * Now that we have an rcu_barrier_callback() callback on each
2901          * CPU, and thus each counted, remove the initial count.
2902          */
2903         if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2904                 complete(&rsp->barrier_completion);
2905
2906         /* Increment ->n_barrier_done to prevent duplicate work. */
2907         smp_mb(); /* Keep increment after above mechanism. */
2908         ACCESS_ONCE(rsp->n_barrier_done)++;
2909         WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2910         _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2911         smp_mb(); /* Keep increment before caller's subsequent code. */
2912
2913         /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2914         wait_for_completion(&rsp->barrier_completion);
2915
2916         /* Other rcu_barrier() invocations can now safely proceed. */
2917         mutex_unlock(&rsp->barrier_mutex);
2918 }
2919
2920 /**
2921  * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
2922  */
2923 void rcu_barrier_bh(void)
2924 {
2925         _rcu_barrier(&rcu_bh_state);
2926 }
2927 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2928
2929 /**
2930  * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
2931  */
2932 void rcu_barrier_sched(void)
2933 {
2934         _rcu_barrier(&rcu_sched_state);
2935 }
2936 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2937
2938 /*
2939  * Do boot-time initialization of a CPU's per-CPU RCU data.
2940  */
2941 static void __init
2942 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2943 {
2944         unsigned long flags;
2945         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2946         struct rcu_node *rnp = rcu_get_root(rsp);
2947
2948         /* Set up local state, ensuring consistent view of global state. */
2949         raw_spin_lock_irqsave(&rnp->lock, flags);
2950         rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2951         init_callback_list(rdp);
2952         rdp->qlen_lazy = 0;
2953         ACCESS_ONCE(rdp->qlen) = 0;
2954         rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2955         WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2956         WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2957         rdp->cpu = cpu;
2958         rdp->rsp = rsp;
2959         rcu_boot_init_nocb_percpu_data(rdp);
2960         raw_spin_unlock_irqrestore(&rnp->lock, flags);
2961 }
2962
2963 /*
2964  * Initialize a CPU's per-CPU RCU data.  Note that only one online or
2965  * offline event can be happening at a given time.  Note also that we
2966  * can accept some slop in the rsp->completed access due to the fact
2967  * that this CPU cannot possibly have any RCU callbacks in flight yet.
      *
      * @cpu:         CPU whose rcu_data, looked up in rsp->rda, is set up.
      * @rsp:         RCU flavor state that the CPU is being added to.
      * @preemptible: Value stored verbatim in rdp->preemptible.
      *
      * The whole function runs under rsp->onoff_mutex.  Interrupts are
      * disabled when the root rcu_node lock is taken and stay disabled,
      * across the per-node lock/unlock pairs of the bitmask walk, until
      * the final local_irq_restore().
2968  */
2969 static void __cpuinit
2970 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2971 {
2972         unsigned long flags;
2973         unsigned long mask;
2974         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2975         struct rcu_node *rnp = rcu_get_root(rsp);
2976
2977         /* Exclude new grace periods. */
2978         mutex_lock(&rsp->onoff_mutex);
2979
2980         /* Set up local state, ensuring consistent view of global state. */
2981         raw_spin_lock_irqsave(&rnp->lock, flags);
2982         rdp->beenonline = 1;     /* We have now been online. */
2983         rdp->preemptible = preemptible;
2984         rdp->qlen_last_fqs_check = 0;
2985         rdp->n_force_qs_snap = rsp->n_force_qs;
2986         rdp->blimit = blimit;
2987         init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
2988         rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
             /*
              * Force the dynticks counter to an odd value: clear bit 0, then
              * add one.  NOTE(review): presumably odd means "not dyntick-idle";
              * confirm against the dynticks code.
              */
2989         atomic_set(&rdp->dynticks->dynticks,
2990                    (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2991         raw_spin_unlock(&rnp->lock);            /* irqs remain disabled. */
2992
2993         /* Add CPU to rcu_node bitmasks. */
             /*
              * Start at this CPU's leaf node with the CPU's own bit, then climb
              * towards the root, each time carrying the child node's ->grpmask
              * as the bit to set in its parent.
              */
2994         rnp = rdp->mynode;
2995         mask = rdp->grpmask;
2996         do {
2997                 /* Exclude any attempts to start a new GP on small systems. */
2998                 raw_spin_lock(&rnp->lock);      /* irqs already disabled. */
2999                 rnp->qsmaskinit |= mask;
3000                 mask = rnp->grpmask;    /* Bit to set in the parent next. */
3001                 if (rnp == rdp->mynode) {
3002                         /*
3003                          * If there is a grace period in progress, we will
3004                          * set up to wait for it next time we run the
3005                          * RCU core code.
                              *
                              * Both ->gpnum and ->completed are seeded from the
                              * leaf's ->completed, and no quiescent state is
                              * marked as passed or pending.
3006                          */
3007                         rdp->gpnum = rnp->completed;
3008                         rdp->completed = rnp->completed;
3009                         rdp->passed_quiesce = 0;
3010                         rdp->qs_pending = 0;
3011                         trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
3012                 }
3013                 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
3014                 rnp = rnp->parent;
                     /*
                      * Stop once past the root, or as soon as a parent already
                      * has this subtree's bit set in ->qsmaskinit.
                      */
3015         } while (rnp != NULL && !(rnp->qsmaskinit & mask));
3016         local_irq_restore(flags);
3017
3018         mutex_unlock(&rsp->onoff_mutex);
3019 }
3020
     /*
      * Set up the per-CPU RCU data of @cpu for every RCU flavor visited by
      * for_each_rcu_flavor().  Only the flavor whose name is exactly
      * "rcu_preempt" is initialized as preemptible.
      */
3021 static void __cpuinit rcu_prepare_cpu(int cpu)
3022 {
3023         struct rcu_state *rsp;
3024
3025         for_each_rcu_flavor(rsp) {
3026                 int preemptible = !strcmp(rsp->name, "rcu_preempt");

3027                 rcu_init_percpu_data(cpu, rsp, preemptible);
                 }
3028 }
3029
3030 /*
3031  * CPU-hotplug notifier callback for RCU.
      *
      * @self:   Notifier block; not used here.
      * @action: Hotplug event being reported.
      * @hcpu:   Number of the affected CPU, passed as a pointer-sized value.
      *
      * Events that are not listed in the switch are ignored.  Always returns
      * NOTIFY_OK.
3032  */
3033 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
3034                                     unsigned long action, void *hcpu)
3035 {
3036         long cpu = (long)hcpu;
3037         struct rcu_node *rnp = per_cpu_ptr(rcu_state->rda, cpu)->mynode;
3039         struct rcu_state *rsp;
3040
3041         trace_rcu_utilization("Start CPU hotplug");
3042         switch (action) {
3043         case CPU_UP_PREPARE:
3044         case CPU_UP_PREPARE_FROZEN:
                     /* Incoming CPU: set up its per-CPU data, then its kthreads. */
3045                 rcu_prepare_cpu(cpu);
3046                 rcu_prepare_kthreads(cpu);
3047                 break;
3048         case CPU_ONLINE:
3049         case CPU_DOWN_FAILED:
                     /* The CPU is, or stays, online: pass -1 instead of a CPU. */
3050                 rcu_boost_kthread_setaffinity(rnp, -1);
3051                 break;
3052         case CPU_DOWN_PREPARE:
                     /* The CPU is about to go away: pass its number along. */
3053                 rcu_boost_kthread_setaffinity(rnp, cpu);
3054                 break;
3055         case CPU_DYING:
3056         case CPU_DYING_FROZEN:
3057                 /*
3058                  * Every other CPU is "stopped" at this point, so any data
3059                  * may be touched without risk of corruption.  The dying
3060                  * CPU's callbacks go to an arbitrarily chosen online CPU.
3061                  */
3062                 for_each_rcu_flavor(rsp) {
3063                         rcu_cleanup_dying_cpu(rsp);
                         }
3064                 break;
3065         case CPU_DEAD:
3066         case CPU_DEAD_FROZEN:
3067         case CPU_UP_CANCELED:
3068         case CPU_UP_CANCELED_FROZEN:
                     /* The CPU is gone, or never made it up: clean up after it. */
3069                 for_each_rcu_flavor(rsp) {
3070                         rcu_cleanup_dead_cpu(cpu, rsp);
                         }
3071                 break;
3072         default:
3073                 break;
3074         }
3075         trace_rcu_utilization("End CPU hotplug");
3076         return NOTIFY_OK;
3077 }
3078
3079 /*
3080  * Spawn the kthread that handles grace periods, one per RCU flavor, and
      * then spawn that flavor's no-CBs kthreads.
      *
      * Each new task is published in rsp->gp_kthread while holding the
      * flavor's root rcu_node lock with interrupts disabled.  A failure to
      * create the kthread is fatal (BUG_ON).  Always returns 0; registered
      * as an early_initcall.
3081  */
3082 static int __init rcu_spawn_gp_kthread(void)
3083 {
3084         unsigned long flags;
3085         struct rcu_node *rnp;
3086         struct rcu_state *rsp;
3087         struct task_struct *t;
3088
3089         for_each_rcu_flavor(rsp) {
                     /*
                      * The third argument of kthread_run() is a printf-style
                      * format, so the flavor name must be passed as data,
                      * never as the format string itself.
                      */
3090                 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3091                 BUG_ON(IS_ERR(t));
3092                 rnp = rcu_get_root(rsp);
3093                 raw_spin_lock_irqsave(&rnp->lock, flags);
3094                 rsp->gp_kthread = t;
3095                 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3096                 rcu_spawn_nocb_kthreads(rsp);
3097         }
3098         return 0;
3099 }
3100 early_initcall(rcu_spawn_gp_kthread);
3101
3102 /*
3103  * This function is invoked towards the end of the scheduler's initialization
3104  * process.  Before this is called, the idle task might contain
3105  * RCU read-side critical sections (during which time, this idle
3106  * task is booting the system).  After this function is called, the
3107  * idle tasks are prohibited from containing RCU read-side critical
3108  * sections.  This function also enables RCU lockdep checking.
      *
      * The two preconditions below are only checked with WARN_ON();
      * rcu_scheduler_active is set whether or not they hold.
3109  */
3110 void rcu_scheduler_starting(void)
3111 {
             /* Exactly one CPU is expected to be online at this point. */
3112         WARN_ON(num_online_cpus() != 1);
             /* No context switch is expected to have been counted yet. */
3113         WARN_ON(nr_context_switches() > 0);
3114         rcu_scheduler_active = 1;
3115 }
3116
3117 /*
3118  * Compute the per-level fanout, either using the exact fanout specified
3119  * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
      *
      * Element 0 of rsp->levelspread[] describes the root of the rcu_node
      * tree and element rcu_num_lvls - 1 describes the leaf level, which is
      * the level numbering rcu_init_one() relies on when it derives each
      * node's CPU range and parent from these values.
3120  */
3121 #ifdef CONFIG_RCU_FANOUT_EXACT
3122 static void __init rcu_init_levelspread(struct rcu_state *rsp)
3123 {
3124         int i;
3125
             /*
              * The leaf level gets the leaf fanout and every level above it
              * gets CONFIG_RCU_FANOUT.  In a single-level tree the loop body
              * never runs, so the root, being the leaf level, keeps the leaf
              * fanout.
              */
3126         rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3127         for (i = rcu_num_lvls - 2; i >= 0; i--)
3128                 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3129 }
3130 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3131 static void __init rcu_init_levelspread(struct rcu_state *rsp)
3132 {
3133         int ccur;
3134         int cprv;
3135         int i;
3136
             /*
              * Walk from the leaves up to the root.  At each level, spread the
              * entities of the level below (CPUs for the leaf level) evenly
              * over this level's nodes, rounding up.
              */
3137         cprv = nr_cpu_ids;
3138         for (i = rcu_num_lvls - 1; i >= 0; i--) {
3139                 ccur = rsp->levelcnt[i];
3140                 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
3141                 cprv = ccur;
3142         }
3143 }
3144 #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
3145
3146 /*
3147  * Helper function for rcu_init() that initializes one rcu_state structure.
      *
      * @rsp: RCU flavor state to initialize.
      * @rda: Per-CPU rcu_data of that flavor; recorded in rsp->rda.
      *
      * Fills in the level-tracking arrays, initializes every rcu_node of the
      * tree (locks, lockdep classes, CPU range, position below its parent),
      * attaches each possible CPU to its leaf node and finally adds the
      * flavor to the rcu_struct_flavors list.  Panics if rcu_num_lvls exceeds
      * RCU_NUM_LVLS.
3148  */
3149 static void __init rcu_init_one(struct rcu_state *rsp,
3150                 struct rcu_data __percpu *rda)
3151 {
             /* Lockdep names, one per tree level; never modified at run time. */
3152         static const char * const buf[] = { "rcu_node_0",
3153                                             "rcu_node_1",
3154                                             "rcu_node_2",
3155                                             "rcu_node_3" };  /* Match MAX_RCU_LVLS */
3156         static const char * const fqs[] = { "rcu_node_fqs_0",
3157                                             "rcu_node_fqs_1",
3158                                             "rcu_node_fqs_2",
3159                                             "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
3160         int cpustride = 1;
3161         int i;
3162         int j;
3163         struct rcu_node *rnp;
3164
             /* Both name tables are indexed by level, so both must be big enough. */
3165         BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
             BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(fqs));  /* Fix fqs[] init! */
3166
3167         /* Silence gcc 4.8 warning about array index out of range. */
3168         if (rcu_num_lvls > RCU_NUM_LVLS)
3169                 panic("rcu_init_one: rcu_num_lvls overflow");
3170
3171         /* Initialize the level-tracking arrays. */
3172
3173         for (i = 0; i < rcu_num_lvls; i++)
3174                 rsp->levelcnt[i] = num_rcu_lvl[i];
             /*
              * Each level starts right after the previous one.  ->level[0] is
              * not written here; presumably it is preset by the rcu_state
              * initializer (TODO confirm).
              */
3175         for (i = 1; i < rcu_num_lvls; i++)
3176                 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3177         rcu_init_levelspread(rsp);
3178
3179         /* Initialize the elements themselves, starting from the leaves. */
3180
3181         for (i = rcu_num_lvls - 1; i >= 0; i--) {
                     /* Number of CPUs covered by one node at this level. */
3182                 cpustride *= rsp->levelspread[i];
3183                 rnp = rsp->level[i];
3184                 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
3185                         raw_spin_lock_init(&rnp->lock);
3186                         lockdep_set_class_and_name(&rnp->lock,
3187                                                    &rcu_node_class[i], buf[i]);
3188                         raw_spin_lock_init(&rnp->fqslock);
3189                         lockdep_set_class_and_name(&rnp->fqslock,
3190                                                    &rcu_fqs_class[i], fqs[i]);
3191                         rnp->gpnum = rsp->gpnum;
3192                         rnp->completed = rsp->completed;
3193                         rnp->qsmask = 0;
3194                         rnp->qsmaskinit = 0;
3195                         rnp->grplo = j * cpustride;
3196                         rnp->grphi = (j + 1) * cpustride - 1;
                             /* Clamp the last node's range to the CPU limit. */
3197                         if (rnp->grphi >= NR_CPUS)
3198                                 rnp->grphi = NR_CPUS - 1;
3199                         if (i == 0) {
                                     /* Level 0 is the root: no parent, no bit. */
3200                                 rnp->grpnum = 0;
3201                                 rnp->grpmask = 0;
3202                                 rnp->parent = NULL;
3203                         } else {
                                     /* Position below, and bit within, the parent. */
3204                                 rnp->grpnum = j % rsp->levelspread[i - 1];
3205                                 rnp->grpmask = 1UL << rnp->grpnum;
3206                                 rnp->parent = rsp->level[i - 1] +
3207                                               j / rsp->levelspread[i - 1];
3208                         }
3209                         rnp->level = i;
3210                         INIT_LIST_HEAD(&rnp->blkd_tasks);
3211                         rcu_init_one_nocb(rnp);
3212                 }
3213         }
3214
3215         rsp->rda = rda;
3216         init_waitqueue_head(&rsp->gp_wq);
             /*
              * Attach every possible CPU to the leaf whose range contains it,
              * advancing through the leaves as the CPU numbers grow.
              */
3217         rnp = rsp->level[rcu_num_lvls - 1];
3218         for_each_possible_cpu(i) {
3219                 while (i > rnp->grphi)
3220                         rnp++;
3221                 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
3222                 rcu_boot_init_percpu_data(i, rsp);
3223         }
3224         list_add(&rsp->flavors, &rcu_struct_flavors);
3225 }
3226
3227 /*
3228  * Derive the shape of the rcu_node tree from the boot-time values of
3229  * nr_cpu_ids and rcu_fanout_leaf.  The compile-time definitions in
3230  * rcutree.h remain necessary because they size the ->node array in the
      * rcu_state structure; this function only updates rcu_num_lvls,
      * num_rcu_lvl[] and rcu_num_nodes, and leaves all three untouched when
      * it bails out early.
3231  */
3232 static void __init rcu_init_geometry(void)
3233 {
3234         int lvl;
3235         int k;
3236         int ncpus = nr_cpu_ids;
             int total = 0;
3237         int rcu_capacity[MAX_RCU_LVLS + 1];
3238
3239         /* Boot-time values equal the build-time ones: nothing to adjust. */
3240         if (ncpus == NR_CPUS &&
3241             rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
3242                 return;
3243
3244         /*
3245          * rcu_capacity[l] is the number of CPUs that can be handled by an
3246          * rcu_node tree with l levels.  The zero-level entry is there
3247          * only to make the arithmetic below easier.
3248          */
3249         rcu_capacity[0] = 1;
3250         rcu_capacity[1] = rcu_fanout_leaf;
3251         for (lvl = 2; lvl <= MAX_RCU_LVLS; lvl++)
3252                 rcu_capacity[lvl] = CONFIG_RCU_FANOUT * rcu_capacity[lvl - 1];
3253
3254         /*
3255          * Sanity limits: the boot parameter may raise the leaf fanout but
3256          * not lower it, the leaf fanout must fit in the bits of an
3257          * rcu_node mask, and the deepest possible tree must still hold
3258          * all CPUs.  On violation, complain and keep the compile-time
3259          * geometry.
3260          */
3262         if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
3263             rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
3264             ncpus > rcu_capacity[MAX_RCU_LVLS]) {
3265                 WARN_ON(1);
3266                 return;
3267         }
3268
3269         /*
              * Pick the shallowest tree that holds all CPUs, then size each of
              * its levels and zero the entries beyond it.
              */
3270         for (lvl = 1; lvl <= MAX_RCU_LVLS; lvl++) {
3271                 if (ncpus > rcu_capacity[lvl])
3272                         continue;
3273                 for (k = 0; k <= MAX_RCU_LVLS; k++) {
3274                         if (k <= lvl)
3275                                 num_rcu_lvl[k] =
3276                                         DIV_ROUND_UP(ncpus, rcu_capacity[lvl - k]);
3277                         else
                                     num_rcu_lvl[k] = 0;
                     }
3278                 rcu_num_lvls = lvl;
                     break;
3279         }
3280
3281         /*
              * Total number of rcu_node structures.  The entry just past the
              * last level was computed with a capacity of 1, so it equals the
              * CPU count and is taken back out of the sum.
              */
3283         for (lvl = 0; lvl <= MAX_RCU_LVLS; lvl++)
3284                 total += num_rcu_lvl[lvl];
3285         rcu_num_nodes = total - ncpus;
3286 }
3287
     /*
      * Boot-time initialization of RCU: announce the configuration, compute
      * the tree geometry, initialize the rcu_sched and rcu_bh flavors (plus
      * whatever __rcu_init_preempt() sets up), register the RCU softirq
      * handler and the CPU-hotplug notifier, and run the CPU_UP_PREPARE
      * handling by hand for every CPU that is already online.
      */
3288 void __init rcu_init(void)
3289 {
3290         int cpu;
3291
3292         rcu_bootup_announce();
3293         rcu_init_geometry();
3294         rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3295         rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3296         __rcu_init_preempt();
3297         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3298
3299         /*
3300          * No protection against CPU hotplug is needed here: this runs
3301          * early in boot, before either interrupts or the scheduler are
3302          * operational.
3303          */
3304         cpu_notifier(rcu_cpu_notify, 0);
3305         for_each_online_cpu(cpu) {
                     void *hcpu = (void *)(long)cpu;

3306                 rcu_cpu_notify(NULL, CPU_UP_PREPARE, hcpu);
                 }
3307 }
3308
3309 #include "rcutree_plugin.h"