/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms, units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length.
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches field)
 *
 * On SMP systems the value of this is multiplied by the log2 of the
 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
 */
const_debug unsigned int sysctl_sched_latency = 20000000ULL;
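/*
 * Illustrative example (assuming the defaults above, single CPU): with
 * 4 runnable nice-0 tasks the 20ms latency period is split into ~5ms
 * slices, so every task runs once per ~20ms. The period itself is only
 * stretched once more tasks are runnable than sysctl_sched_nr_latency
 * allows - see __sched_period() below.
 */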
/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;
/*
 * Number of tasks a latency period is divided among before the period
 * itself gets stretched - i.e. sysctl_sched_latency/sysctl_sched_nr_latency
 * is the minimal preemption granularity (20ms/20 = 1 msec by default).
 * (default: 20)
 */
const_debug unsigned int sysctl_sched_nr_latency = 20;
/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;
/*
 * SCHED_BATCH wake-up granularity.
 * (default: 25 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL;
/*
 * SCHED_OTHER wake-up granularity.
 * (default: 2 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL;
extern struct sched_class fair_sched_class;
/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

#else	/* CONFIG_FAIR_GROUP_SCHED */

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#endif	/* CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}
/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
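/*
 * Illustrative note: comparing the *signed* difference instead of the raw
 * u64 values keeps these helpers correct across vruntime wrap-around.
 * E.g. with min_vruntime = ULLONG_MAX - 10 and vruntime = 5 (just wrapped),
 * (s64)(5 - (ULLONG_MAX - 10)) is a small positive number (16), so vruntime
 * is still treated as the later of the two values.
 */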
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return se->vruntime - cfs_rq->min_vruntime;
}
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	s64 key = entity_key(cfs_rq, se);
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node)
		cfs_rq->rb_leftmost = rb_next(&se->run_node);

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rb_leftmost;
}

static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
}
static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct sched_entity *se = NULL;
	struct rb_node *parent;

	/* Walk down the rightmost spine of the tree: */
	while (*link) {
		parent = *link;
		se = rb_entry(parent, struct sched_entity, run_node);
		link = &parent->rb_right;
	}

	return se;
}
/**************************************************************
 * Scheduling class statistics methods:
 */

static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sysctl_sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period *= nr_running;
		do_div(period, nr_latency);
	}

	return period;
}
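/*
 * Illustrative example (assuming the defaults): with 40 runnable tasks and
 * nr_latency = 20, the period becomes 20ms * 40 / 20 = 40ms, i.e. the
 * latency target is stretched so that no slice drops below the ~1ms
 * minimal granularity.
 */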
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 period = __sched_period(cfs_rq->nr_running);

	period *= se->load.weight;
	do_div(period, cfs_rq->load.weight);

	return period;
}
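/*
 * Illustrative example: each entity gets a share of the period proportional
 * to its load weight. With one nice-0 task (weight 1024) and one nice-5
 * task (weight 335, per the prio_to_weight table) in a 20ms period, the
 * nice-0 task's slice is roughly 20ms * 1024 / 1359 ~= 15ms and the nice-5
 * task's ~5ms.
 */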
static u64 __sched_vslice(unsigned long nr_running)
{
	unsigned long period = sysctl_sched_latency;
	unsigned long nr_latency = sysctl_sched_nr_latency;

	if (unlikely(nr_running > nr_latency))
		nr_running = nr_latency;

	period /= nr_running;

	return (u64)period;
}
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;
	u64 vruntime;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = delta_exec;
	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
							&curr->load);
	}
	curr->vruntime += delta_exec_weighted;

	/*
	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
	 * value tracking the leftmost vruntime in the tree.
	 */
	if (first_fair(cfs_rq)) {
		vruntime = min_vruntime(curr->vruntime,
				__pick_next_entity(cfs_rq)->vruntime);
	} else
		vruntime = curr->vruntime;

	cfs_rq->min_vruntime =
		max_vruntime(cfs_rq->min_vruntime, vruntime);
}
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;
}
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
}
static inline unsigned long
calc_weighted(unsigned long delta, struct sched_entity *se)
{
	unsigned long weight = se->load.weight;

	if (unlikely(weight != NICE_0_LOAD))
		return (u64)delta * se->load.weight >> NICE_0_SHIFT;
	else
		return delta;
}
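/*
 * Illustrative example: NICE_0_LOAD is 1 << NICE_0_SHIFT (1024), so the
 * shift above scales delta by weight/1024. A weight of 2048 doubles the
 * accounted delta and a weight of 512 halves it, while nice-0 entities
 * fall through and are charged the plain delta.
 */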
/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}
static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->wait_max, max(se->wait_max,
			rq_of(cfs_rq)->clock - se->wait_start));
	schedstat_set(se->wait_start, 0);
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}
/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock;
}
/*
 * We are descheduling a task - update its stats:
 */
static inline void
update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
/**************************************************
 * Scheduling class queueing methods:
 */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	cfs_rq->nr_running++;
	se->on_rq = 1;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	cfs_rq->nr_running--;
	se->on_rq = 0;
}
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	if (se->sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->sleep_max))
			se->sleep_max = delta;

		se->sleep_start = 0;
		se->sum_sleep_runtime += delta;
	}
	if (se->block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->block_max))
			se->block_max = delta;

		se->block_start = 0;
		se->sum_sleep_runtime += delta;

		/*
		 * Blocking time is in units of nanosecs, so shift by 20 to
		 * get a milliseconds-range estimation of the amount of
		 * time that the task spent sleeping:
		 */
		if (unlikely(prof_on == SLEEP_PROFILING)) {
			struct task_struct *tsk = task_of(se);

			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
				     delta >> 20);
		}
	}
#endif
}
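/*
 * Illustrative note on the shift above: 1 << 20 = 1048576 ~= 10^6, so
 * "delta >> 20" approximates nanoseconds-to-milliseconds with a cheap
 * shift instead of a divide. E.g. a 5,000,000ns (5ms) block time maps
 * to bucket 5000000 >> 20 = 4.
 */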
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 vruntime;

	vruntime = cfs_rq->min_vruntime;

	if (sched_feat(USE_TREE_AVG)) {
		struct sched_entity *last = __pick_last_entity(cfs_rq);
		if (last) {
			vruntime += last->vruntime;
			vruntime >>= 1;
		}
	} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
		vruntime += __sched_vslice(cfs_rq->nr_running)/2;

	if (initial && sched_feat(START_DEBIT))
		vruntime += __sched_vslice(cfs_rq->nr_running + 1);

	if (!initial) {
		if (sched_feat(NEW_FAIR_SLEEPERS))
			vruntime -= sysctl_sched_latency;

		/* ensure an entity never gains time by being placed backwards: */
		vruntime = max_t(s64, vruntime, se->vruntime);
	}

	se->vruntime = vruntime;
}
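/*
 * Illustrative example (defaults, START_DEBIT and NEW_FAIR_SLEEPERS on):
 * a newly forked task is placed one extra vslice to the right of
 * min_vruntime, so it waits up to one slice before it runs for the first
 * time, while a waking sleeper is credited up to sysctl_sched_latency of
 * virtual time - clamped by the max_t() above so its vruntime never moves
 * backwards.
 */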
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
	/*
	 * Update the fair clock.
	 */
	update_curr(cfs_rq);

	if (wakeup) {
		place_entity(cfs_rq, se, 0);
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	account_entity_enqueue(cfs_rq, se);
}
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
	update_stats_dequeue(cfs_rq, se);
	if (sleep) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	account_entity_dequeue(cfs_rq, se);
}
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	unsigned long ideal_runtime, delta_exec;

	ideal_runtime = sched_slice(cfs_rq, curr);
	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
	if (delta_exec > ideal_runtime)
		resched_task(rq_of(cfs_rq)->curr);
}
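/*
 * Illustrative example: prev_sum_exec_runtime is snapshotted in
 * set_next_entity() below, so delta_exec is the time the task has run
 * since it was last picked. With two equal nice-0 tasks and the default
 * 20ms latency, sched_slice() returns ~10ms and the tick handler requests
 * a reschedule once the current task has run that long.
 */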
static void
set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* 'current' is not kept within the tree. */
	if (se->on_rq) {
		/*
		 * Any task has to be enqueued before it gets to execute on
		 * a CPU. So account for the time it spent waiting on the
		 * runqueue.
		 */
		update_stats_wait_end(cfs_rq, se);
		__dequeue_entity(cfs_rq, se);
	}

	update_stats_curr_start(cfs_rq, se);
	cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
	/*
	 * Track our maximum slice length, if the CPU's load is at
	 * least twice that of our own weight (i.e. don't track it
	 * when there are only lesser-weight tasks around):
	 */
	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
		se->slice_max = max(se->slice_max,
			se->sum_exec_runtime - se->prev_sum_exec_runtime);
	}
#endif
	se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_next_entity(cfs_rq);

	set_next_entity(cfs_rq, se);

	return se;
}
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
	/*
	 * If still on the runqueue then deactivate_task()
	 * was not called and update_curr() has to be done:
	 */
	if (prev->on_rq)
		update_curr(cfs_rq);

	update_stats_curr_end(cfs_rq, prev);

	check_spread(cfs_rq, prev);
	if (prev->on_rq) {
		update_stats_wait_start(cfs_rq, prev);
		/* Put 'current' back into the tree. */
		__enqueue_entity(cfs_rq, prev);
	}
	cfs_rq->curr = NULL;
}
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	if (cfs_rq->nr_running > 1)
		check_preempt_tick(cfs_rq, curr);
}
/**************************************************
 * CFS operations on tasks:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 * another cpu ('this_cpu')
 */
static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return cfs_rq->tg->cfs_rq[this_cpu];
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) tasks belong to the same group ? */
static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
{
	if (curr->se.cfs_rq == p->se.cfs_rq)
		return 1;

	return 0;
}
#else	/* CONFIG_FAIR_GROUP_SCHED */

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
{
	return &cpu_rq(this_cpu)->cfs;
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
{
	return 1;
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, wakeup);
	}
}
/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		dequeue_entity(cfs_rq, se, sleep);
		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight)
			break;
	}
}
/*
 * sched_yield() support is very simple - we dequeue and enqueue.
 *
 * If compat_yield is turned on then we requeue to the end of the tree.
 */
static void yield_task_fair(struct rq *rq)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
	struct sched_entity *rightmost, *se = &rq->curr->se;

	/*
	 * Are we the only task in the tree?
	 */
	if (unlikely(cfs_rq->nr_running == 1))
		return;

	if (likely(!sysctl_sched_compat_yield)) {
		__update_rq_clock(rq);
		/*
		 * Dequeue and enqueue the task to update its
		 * position within the tree:
		 */
		dequeue_entity(cfs_rq, se, 0);
		enqueue_entity(cfs_rq, se, 0);

		return;
	}
	/*
	 * Find the rightmost entry in the rbtree:
	 */
	rightmost = __pick_last_entity(cfs_rq);
	/*
	 * Already in the rightmost position?
	 */
	if (unlikely(rightmost->vruntime < se->vruntime))
		return;

	/*
	 * Minimally necessary key value to be last in the tree:
	 * Upon rescheduling, sched_class::put_prev_task() will place
	 * 'current' within the tree based on its new key value.
	 */
	se->vruntime = rightmost->vruntime + 1;
}
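/*
 * Illustrative note: the default path above only re-sorts 'current' by its
 * up-to-date vruntime, so a yielding task may well be picked again right
 * away. With sysctl_sched_compat_yield=1 the task's vruntime is bumped just
 * past the rightmost entity, reproducing the old scheduler's aggressive
 * "go to the back of the queue" yield behaviour.
 */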
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
	struct task_struct *curr = rq->curr;
	struct cfs_rq *cfs_rq = task_cfs_rq(curr), *pcfs_rq;
	struct sched_entity *se = &curr->se, *pse = &p->se;

	if (unlikely(rt_prio(p->prio))) {
		update_rq_clock(rq);
		update_curr(cfs_rq);
		resched_task(curr);
		return;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		pcfs_rq = cfs_rq_of(pse);

		if (cfs_rq == pcfs_rq) {
			s64 delta = se->vruntime - pse->vruntime;

			if (delta > (s64)sysctl_sched_wakeup_granularity)
				resched_task(curr);
			break;
		}
#ifdef CONFIG_FAIR_GROUP_SCHED
		pse = pse->parent;
#endif
	}
}
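/*
 * Illustrative example (default 2ms wakeup granularity): the woken task
 * preempts 'current' only when current's vruntime is more than 2ms of
 * virtual time ahead of the woken task's. Wakeups that would win by less
 * than the granularity simply wait in the tree, which damps over-scheduling
 * between tasks that wake each other at a high rate.
 */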
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (unlikely(!cfs_rq->nr_running))
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	return task_of(se);
}
/*
 * Account for a descheduled task:
 */
static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
{
	struct sched_entity *se = &prev->se;
	struct cfs_rq *cfs_rq;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		put_prev_entity(cfs_rq, se);
	}
}
/**************************************************
 * Fair scheduling class load-balancing methods:
 */

/*
 * Load-balancing iterator. Note: while the runqueue stays locked
 * during the whole iteration, the current task might be
 * dequeued so the iterator has to be dequeue-safe. Here we
 * achieve that by always pre-iterating before returning
 * the current task:
 */
static inline struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
{
	struct task_struct *p;

	if (!curr)
		return NULL;

	p = rb_entry(curr, struct task_struct, se.run_node);
	cfs_rq->rb_load_balance_curr = rb_next(curr);

	return p;
}
static struct task_struct *load_balance_start_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
}

static struct task_struct *load_balance_next_fair(void *arg)
{
	struct cfs_rq *cfs_rq = arg;

	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr;
	struct task_struct *p;

	if (!cfs_rq->nr_running)
		return MAX_PRIO;

	curr = cfs_rq->curr;
	if (!curr)
		curr = __pick_next_entity(cfs_rq);

	p = task_of(curr);

	return p->prio;
}
#endif
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
		  unsigned long max_nr_move, unsigned long max_load_move,
		  struct sched_domain *sd, enum cpu_idle_type idle,
		  int *all_pinned, int *this_best_prio)
{
	struct cfs_rq *busy_cfs_rq;
	unsigned long load_moved, total_nr_moved = 0, nr_moved;
	long rem_load_move = max_load_move;
	struct rq_iterator cfs_rq_iterator;

	cfs_rq_iterator.start = load_balance_start_fair;
	cfs_rq_iterator.next = load_balance_next_fair;

	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
		struct cfs_rq *this_cfs_rq;
		long imbalance;
		unsigned long maxload;

		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);

		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
		if (imbalance <= 0)
			continue;

		/* Don't pull more than imbalance/2 */
		imbalance /= 2;
		maxload = min(rem_load_move, imbalance);

		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
		/* pass busy_cfs_rq argument into
		 * load_balance_[start|next]_fair iterators
		 */
		cfs_rq_iterator.arg = busy_cfs_rq;
		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
				max_nr_move, maxload, sd, idle, all_pinned,
				&load_moved, this_best_prio, &cfs_rq_iterator);

		total_nr_moved += nr_moved;
		max_nr_move -= nr_moved;
		rem_load_move -= load_moved;

		if (max_nr_move <= 0 || rem_load_move <= 0)
			break;
	}

	return max_load_move - rem_load_move;
}
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &curr->se;

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		entity_tick(cfs_rq, se);
	}
}

#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
/*
 * Share the fairness runtime between parent and child, thus the
 * total amount of pressure for CPU stays equal - new tasks
 * get a chance to run but frequent forkers are not allowed to
 * monopolize the CPU. Note: the parent runqueue is locked,
 * the child is not running yet.
 */
static void task_new_fair(struct rq *rq, struct task_struct *p)
{
	struct cfs_rq *cfs_rq = task_cfs_rq(p);
	struct sched_entity *se = &p->se, *curr = cfs_rq->curr;

	sched_info_queued(p);

	update_curr(cfs_rq);
	place_entity(cfs_rq, se, 1);

	if (sysctl_sched_child_runs_first &&
			curr->vruntime < se->vruntime) {
		/*
		 * Upon rescheduling, sched_class::put_prev_task() will place
		 * 'current' within the tree based on its new key value.
		 */
		swap(curr->vruntime, se->vruntime);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	check_spread(cfs_rq, curr);
	__enqueue_entity(cfs_rq, se);
	account_entity_enqueue(cfs_rq, se);
	resched_task(rq->curr);
}
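/*
 * Illustrative example: place_entity(..., 1) initially puts the child to
 * the right of min_vruntime (START_DEBIT). With the default
 * sysctl_sched_child_runs_first=1, if the parent's vruntime is smaller the
 * two values are swapped, so the child sorts ahead of the parent and runs
 * first while the parent absorbs the debit.
 */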
/* Account for a task changing its policy or group.
 *
 * This routine is mostly called to set cfs_rq->curr field when a task
 * migrates between groups/classes.
 */
static void set_curr_task_fair(struct rq *rq)
{
	struct sched_entity *se = &rq->curr->se;

	for_each_sched_entity(se)
		set_next_entity(cfs_rq_of(se), se);
}
/*
 * All the scheduling class methods:
 */
struct sched_class fair_sched_class __read_mostly = {
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

	.load_balance		= load_balance_fair,

	.set_curr_task		= set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_new		= task_new_fair,
};
#ifdef CONFIG_SCHED_DEBUG
static void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

#ifdef CONFIG_FAIR_GROUP_SCHED
	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
#endif
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
}
#endif