sched/numa: Set the scan rate proportional to the memory usage of the task being...
[deliverable/linux.git] / kernel / sched / fair.c
bf0f6f24
IM
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
21805085
PZ
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
bf0f6f24
IM
21 */
22
9745512c 23#include <linux/latencytop.h>
1983a922 24#include <linux/sched.h>
3436ae12 25#include <linux/cpumask.h>
029632fb
PZ
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
cbee9f88 29#include <linux/mempolicy.h>
e14808b4 30#include <linux/migrate.h>
cbee9f88 31#include <linux/task_work.h>
029632fb
PZ
32
33#include <trace/events/sched.h>
34
35#include "sched.h"
9745512c 36
bf0f6f24 37/*
21805085 38 * Targeted preemption latency for CPU-bound tasks:
864616ee 39 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 40 *
21805085 41 * NOTE: this latency value is not the same as the concept of
d274a4ce
IM
42 * 'timeslice length' - timeslices in CFS are of variable length
43 * and have no persistent notion like in traditional, time-slice
44 * based scheduling concepts.
bf0f6f24 45 *
d274a4ce
IM
46 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
bf0f6f24 48 */
21406928
MG
49unsigned int sysctl_sched_latency = 6000000ULL;
50unsigned int normalized_sysctl_sched_latency = 6000000ULL;
2bd8e6d4 51
1983a922
CE
52/*
53 * The initial- and re-scaling of tunables is configurable
54 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
55 *
56 * Options are:
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
60 */
61enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
63
2bd8e6d4 64/*
b2be5e96 65 * Minimal preemption granularity for CPU-bound tasks:
864616ee 66 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
2bd8e6d4 67 */
0bf377bb
IM
68unsigned int sysctl_sched_min_granularity = 750000ULL;
69unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
21805085
PZ
70
71/*
b2be5e96
PZ
72 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
73 */
0bf377bb 74static unsigned int sched_nr_latency = 8;
b2be5e96
PZ
75
76/*
2bba22c5 77 * After fork, child runs first. If set to 0 (default) then
b2be5e96 78 * parent will (try to) run first.
21805085 79 */
2bba22c5 80unsigned int sysctl_sched_child_runs_first __read_mostly;
bf0f6f24 81
bf0f6f24
IM
82/*
83 * SCHED_OTHER wake-up granularity.
172e082a 84 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24
IM
85 *
86 * This option delays the preemption effects of decoupled workloads
87 * and reduces their over-scheduling. Synchronous workloads will still
88 * have immediate wakeup/sleep latencies.
89 */
172e082a 90unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
0bcdcf28 91unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
bf0f6f24 92
da84d961
IM
93const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
94
a7a4f8a7
PT
95/*
96 * The exponential sliding window over which load is averaged for shares
97 * distribution.
98 * (default: 10msec)
99 */
100unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
101
ec12cb7f
PT
102#ifdef CONFIG_CFS_BANDWIDTH
103/*
104 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105 * each time a cfs_rq requests quota.
106 *
107 * Note: in the case that the slice exceeds the runtime remaining (either due
108 * to consumption or the quota being specified to be smaller than the slice)
109 * we will always only issue the remaining available time.
110 *
111 * default: 5 msec, units: microseconds
112 */
113unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
114#endif
115
8527632d
PG
116static inline void update_load_add(struct load_weight *lw, unsigned long inc)
117{
118 lw->weight += inc;
119 lw->inv_weight = 0;
120}
121
122static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
123{
124 lw->weight -= dec;
125 lw->inv_weight = 0;
126}
127
128static inline void update_load_set(struct load_weight *lw, unsigned long w)
129{
130 lw->weight = w;
131 lw->inv_weight = 0;
132}
133
029632fb
PZ
134/*
135 * Increase the granularity value when there are more CPUs,
136 * because with more CPUs the 'effective latency' as visible
137 * to users decreases. But the relationship is not linear,
138 * so pick a second-best guess by going with the log2 of the
139 * number of CPUs.
140 *
141 * This idea comes from the SD scheduler of Con Kolivas:
142 */
143static int get_update_sysctl_factor(void)
144{
145 unsigned int cpus = min_t(int, num_online_cpus(), 8);
146 unsigned int factor;
147
148 switch (sysctl_sched_tunable_scaling) {
149 case SCHED_TUNABLESCALING_NONE:
150 factor = 1;
151 break;
152 case SCHED_TUNABLESCALING_LINEAR:
153 factor = cpus;
154 break;
155 case SCHED_TUNABLESCALING_LOG:
156 default:
157 factor = 1 + ilog2(cpus);
158 break;
159 }
160
161 return factor;
162}
163
164static void update_sysctl(void)
165{
166 unsigned int factor = get_update_sysctl_factor();
167
168#define SET_SYSCTL(name) \
169 (sysctl_##name = (factor) * normalized_sysctl_##name)
170 SET_SYSCTL(sched_min_granularity);
171 SET_SYSCTL(sched_latency);
172 SET_SYSCTL(sched_wakeup_granularity);
173#undef SET_SYSCTL
174}
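/*
 * Illustrative, standalone user-space sketch (not part of fair.c): with the
 * default SCHED_TUNABLESCALING_LOG policy the effective tunables are
 * normalized_value * (1 + ilog2(min(ncpus, 8))).  The ex_ prefixed names are
 * invented for the example, and ilog2 is open-coded since the kernel helper
 * is not available in user space.
 */
#include <stdio.h>

static unsigned int ex_ilog2(unsigned int x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned int ncpus;

        for (ncpus = 1; ncpus <= 16; ncpus *= 2) {
                unsigned int cpus = ncpus < 8 ? ncpus : 8;
                unsigned int factor = 1 + ex_ilog2(cpus);

                /* e.g. 8 CPUs: factor 4 -> 24ms latency, 3ms min granularity */
                printf("ncpus=%2u factor=%u latency=%uns min_granularity=%uns\n",
                       ncpus, factor, 6000000u * factor, 750000u * factor);
        }
        return 0;
}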
175
176void sched_init_granularity(void)
177{
178 update_sysctl();
179}
180
181#if BITS_PER_LONG == 32
182# define WMULT_CONST (~0UL)
183#else
184# define WMULT_CONST (1UL << 32)
185#endif
186
187#define WMULT_SHIFT 32
188
189/*
190 * Shift right and round:
191 */
192#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
193
194/*
195 * delta *= weight / lw
196 */
197static unsigned long
198calc_delta_mine(unsigned long delta_exec, unsigned long weight,
199 struct load_weight *lw)
200{
201 u64 tmp;
202
203 /*
204 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
205 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
206 * 2^SCHED_LOAD_RESOLUTION.
207 */
208 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
209 tmp = (u64)delta_exec * scale_load_down(weight);
210 else
211 tmp = (u64)delta_exec;
212
213 if (!lw->inv_weight) {
214 unsigned long w = scale_load_down(lw->weight);
215
216 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
217 lw->inv_weight = 1;
218 else if (unlikely(!w))
219 lw->inv_weight = WMULT_CONST;
220 else
221 lw->inv_weight = WMULT_CONST / w;
222 }
223
224 /*
225 * Check whether we'd overflow the 64-bit multiplication:
226 */
227 if (unlikely(tmp > WMULT_CONST))
228 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
229 WMULT_SHIFT/2);
230 else
231 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
232
233 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
234}
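/*
 * A small user-space sketch (not kernel code) of the arithmetic above:
 * delta * weight / lw is evaluated as delta * weight * (2^32 / lw) >> 32,
 * with SRR() rounding the final shift.  The 64-bit overflow guard and the
 * SCHED_LOAD_RESOLUTION scaling of the real code are omitted, so keep the
 * inputs small; the EX_ and ex_ names are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_SRR(x, y)    (((x) + (1ULL << ((y) - 1))) >> (y))

static uint64_t ex_calc_delta(uint64_t delta_exec, uint64_t weight,
                              uint64_t lw_weight)
{
        uint64_t inv_weight = 0xffffffffULL / lw_weight;        /* ~2^32 / lw */

        return EX_SRR(delta_exec * weight * inv_weight, 32);
}

int main(void)
{
        /* a nice-0 task (weight 1024) on a 3072-weight queue gets ~1/3 of 6ms */
        printf("%llu ns\n",
               (unsigned long long)ex_calc_delta(6000000, 1024, 3072));
        return 0;
}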
235
236
237const struct sched_class fair_sched_class;
a4c2f00f 238
bf0f6f24
IM
239/**************************************************************
240 * CFS operations on generic schedulable entities:
241 */
242
62160e3f 243#ifdef CONFIG_FAIR_GROUP_SCHED
bf0f6f24 244
62160e3f 245/* cpu runqueue to which this cfs_rq is attached */
bf0f6f24
IM
246static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
247{
62160e3f 248 return cfs_rq->rq;
bf0f6f24
IM
249}
250
62160e3f
IM
251/* An entity is a task if it doesn't "own" a runqueue */
252#define entity_is_task(se) (!se->my_q)
bf0f6f24 253
8f48894f
PZ
254static inline struct task_struct *task_of(struct sched_entity *se)
255{
256#ifdef CONFIG_SCHED_DEBUG
257 WARN_ON_ONCE(!entity_is_task(se));
258#endif
259 return container_of(se, struct task_struct, se);
260}
261
b758149c
PZ
262/* Walk up scheduling entities hierarchy */
263#define for_each_sched_entity(se) \
264 for (; se; se = se->parent)
265
266static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
267{
268 return p->se.cfs_rq;
269}
270
271/* runqueue on which this entity is (to be) queued */
272static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
273{
274 return se->cfs_rq;
275}
276
277/* runqueue "owned" by this group */
278static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
279{
280 return grp->my_q;
281}
282
aff3e498
PT
283static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
284 int force_update);
9ee474f5 285
3d4b47b4
PZ
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288 if (!cfs_rq->on_list) {
67e86250
PT
289 /*
290 * Ensure we either appear before our parent (if already
291 * enqueued) or force our parent to appear after us when it is
292 * enqueued. The fact that we always enqueue bottom-up
293 * reduces this to two cases.
294 */
295 if (cfs_rq->tg->parent &&
296 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
297 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
298 &rq_of(cfs_rq)->leaf_cfs_rq_list);
299 } else {
300 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
3d4b47b4 301 &rq_of(cfs_rq)->leaf_cfs_rq_list);
67e86250 302 }
3d4b47b4
PZ
303
304 cfs_rq->on_list = 1;
9ee474f5 305 /* We should have no load, but we need to update last_decay. */
aff3e498 306 update_cfs_rq_blocked_load(cfs_rq, 0);
3d4b47b4
PZ
307 }
308}
309
310static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
311{
312 if (cfs_rq->on_list) {
313 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
314 cfs_rq->on_list = 0;
315 }
316}
317
b758149c
PZ
318/* Iterate through all leaf cfs_rq's on a runqueue */
319#define for_each_leaf_cfs_rq(rq, cfs_rq) \
320 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
321
322/* Do the two (enqueued) entities belong to the same group ? */
323static inline int
324is_same_group(struct sched_entity *se, struct sched_entity *pse)
325{
326 if (se->cfs_rq == pse->cfs_rq)
327 return 1;
328
329 return 0;
330}
331
332static inline struct sched_entity *parent_entity(struct sched_entity *se)
333{
334 return se->parent;
335}
336
464b7527
PZ
337/* return depth at which a sched entity is present in the hierarchy */
338static inline int depth_se(struct sched_entity *se)
339{
340 int depth = 0;
341
342 for_each_sched_entity(se)
343 depth++;
344
345 return depth;
346}
347
348static void
349find_matching_se(struct sched_entity **se, struct sched_entity **pse)
350{
351 int se_depth, pse_depth;
352
353 /*
354 * A preemption test can only be made between sibling entities that are in
355 * the same cfs_rq, i.e. that have a common parent. Walk up the hierarchy
356 * of both tasks until we find ancestors that are siblings of a common
357 * parent.
358 */
359
360 /* First walk up until both entities are at same depth */
361 se_depth = depth_se(*se);
362 pse_depth = depth_se(*pse);
363
364 while (se_depth > pse_depth) {
365 se_depth--;
366 *se = parent_entity(*se);
367 }
368
369 while (pse_depth > se_depth) {
370 pse_depth--;
371 *pse = parent_entity(*pse);
372 }
373
374 while (!is_same_group(*se, *pse)) {
375 *se = parent_entity(*se);
376 *pse = parent_entity(*pse);
377 }
378}
379
8f48894f
PZ
380#else /* !CONFIG_FAIR_GROUP_SCHED */
381
382static inline struct task_struct *task_of(struct sched_entity *se)
383{
384 return container_of(se, struct task_struct, se);
385}
bf0f6f24 386
62160e3f
IM
387static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
388{
389 return container_of(cfs_rq, struct rq, cfs);
bf0f6f24
IM
390}
391
392#define entity_is_task(se) 1
393
b758149c
PZ
394#define for_each_sched_entity(se) \
395 for (; se; se = NULL)
bf0f6f24 396
b758149c 397static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
bf0f6f24 398{
b758149c 399 return &task_rq(p)->cfs;
bf0f6f24
IM
400}
401
b758149c
PZ
402static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
403{
404 struct task_struct *p = task_of(se);
405 struct rq *rq = task_rq(p);
406
407 return &rq->cfs;
408}
409
410/* runqueue "owned" by this group */
411static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
412{
413 return NULL;
414}
415
3d4b47b4
PZ
416static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
417{
418}
419
420static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
421{
422}
423
b758149c
PZ
424#define for_each_leaf_cfs_rq(rq, cfs_rq) \
425 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
426
427static inline int
428is_same_group(struct sched_entity *se, struct sched_entity *pse)
429{
430 return 1;
431}
432
433static inline struct sched_entity *parent_entity(struct sched_entity *se)
434{
435 return NULL;
436}
437
464b7527
PZ
438static inline void
439find_matching_se(struct sched_entity **se, struct sched_entity **pse)
440{
441}
442
b758149c
PZ
443#endif /* CONFIG_FAIR_GROUP_SCHED */
444
6c16a6dc
PZ
445static __always_inline
446void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
bf0f6f24
IM
447
448/**************************************************************
449 * Scheduling class tree data structure manipulation methods:
450 */
451
1bf08230 452static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
02e0431a 453{
1bf08230 454 s64 delta = (s64)(vruntime - max_vruntime);
368059a9 455 if (delta > 0)
1bf08230 456 max_vruntime = vruntime;
02e0431a 457
1bf08230 458 return max_vruntime;
02e0431a
PZ
459}
460
0702e3eb 461static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
b0ffd246
PZ
462{
463 s64 delta = (s64)(vruntime - min_vruntime);
464 if (delta < 0)
465 min_vruntime = vruntime;
466
467 return min_vruntime;
468}
469
54fdc581
FC
470static inline int entity_before(struct sched_entity *a,
471 struct sched_entity *b)
472{
473 return (s64)(a->vruntime - b->vruntime) < 0;
474}
475
1af5f730
PZ
476static void update_min_vruntime(struct cfs_rq *cfs_rq)
477{
478 u64 vruntime = cfs_rq->min_vruntime;
479
480 if (cfs_rq->curr)
481 vruntime = cfs_rq->curr->vruntime;
482
483 if (cfs_rq->rb_leftmost) {
484 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
485 struct sched_entity,
486 run_node);
487
e17036da 488 if (!cfs_rq->curr)
1af5f730
PZ
489 vruntime = se->vruntime;
490 else
491 vruntime = min_vruntime(vruntime, se->vruntime);
492 }
493
1bf08230 494 /* ensure we never gain time by being placed backwards. */
1af5f730 495 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
3fe1698b
PZ
496#ifndef CONFIG_64BIT
497 smp_wmb();
498 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
499#endif
1af5f730
PZ
500}
501
bf0f6f24
IM
502/*
503 * Enqueue an entity into the rb-tree:
504 */
0702e3eb 505static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
506{
507 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
508 struct rb_node *parent = NULL;
509 struct sched_entity *entry;
bf0f6f24
IM
510 int leftmost = 1;
511
512 /*
513 * Find the right place in the rbtree:
514 */
515 while (*link) {
516 parent = *link;
517 entry = rb_entry(parent, struct sched_entity, run_node);
518 /*
519 * We don't care about collisions. Nodes with
520 * the same key stay together.
521 */
2bd2d6f2 522 if (entity_before(se, entry)) {
bf0f6f24
IM
523 link = &parent->rb_left;
524 } else {
525 link = &parent->rb_right;
526 leftmost = 0;
527 }
528 }
529
530 /*
531 * Maintain a cache of leftmost tree entries (it is frequently
532 * used):
533 */
1af5f730 534 if (leftmost)
57cb499d 535 cfs_rq->rb_leftmost = &se->run_node;
bf0f6f24
IM
536
537 rb_link_node(&se->run_node, parent, link);
538 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
539}
540
0702e3eb 541static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 542{
3fe69747
PZ
543 if (cfs_rq->rb_leftmost == &se->run_node) {
544 struct rb_node *next_node;
3fe69747
PZ
545
546 next_node = rb_next(&se->run_node);
547 cfs_rq->rb_leftmost = next_node;
3fe69747 548 }
e9acbff6 549
bf0f6f24 550 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
551}
552
029632fb 553struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
bf0f6f24 554{
f4b6755f
PZ
555 struct rb_node *left = cfs_rq->rb_leftmost;
556
557 if (!left)
558 return NULL;
559
560 return rb_entry(left, struct sched_entity, run_node);
bf0f6f24
IM
561}
562
ac53db59
RR
563static struct sched_entity *__pick_next_entity(struct sched_entity *se)
564{
565 struct rb_node *next = rb_next(&se->run_node);
566
567 if (!next)
568 return NULL;
569
570 return rb_entry(next, struct sched_entity, run_node);
571}
572
573#ifdef CONFIG_SCHED_DEBUG
029632fb 574struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
aeb73b04 575{
7eee3e67 576 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
aeb73b04 577
70eee74b
BS
578 if (!last)
579 return NULL;
7eee3e67
IM
580
581 return rb_entry(last, struct sched_entity, run_node);
aeb73b04
PZ
582}
583
bf0f6f24
IM
584/**************************************************************
585 * Scheduling class statistics methods:
586 */
587
acb4a848 588int sched_proc_update_handler(struct ctl_table *table, int write,
8d65af78 589 void __user *buffer, size_t *lenp,
b2be5e96
PZ
590 loff_t *ppos)
591{
8d65af78 592 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
acb4a848 593 int factor = get_update_sysctl_factor();
b2be5e96
PZ
594
595 if (ret || !write)
596 return ret;
597
598 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
599 sysctl_sched_min_granularity);
600
acb4a848
CE
601#define WRT_SYSCTL(name) \
602 (normalized_sysctl_##name = sysctl_##name / (factor))
603 WRT_SYSCTL(sched_min_granularity);
604 WRT_SYSCTL(sched_latency);
605 WRT_SYSCTL(sched_wakeup_granularity);
acb4a848
CE
606#undef WRT_SYSCTL
607
b2be5e96
PZ
608 return 0;
609}
610#endif
647e7cac 611
a7be37ac 612/*
f9c0b095 613 * delta /= w
a7be37ac
PZ
614 */
615static inline unsigned long
616calc_delta_fair(unsigned long delta, struct sched_entity *se)
617{
f9c0b095
PZ
618 if (unlikely(se->load.weight != NICE_0_LOAD))
619 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
a7be37ac
PZ
620
621 return delta;
622}
623
647e7cac
IM
624/*
625 * The idea is to set a period in which each task runs once.
626 *
532b1858 627 * When there are too many tasks (sched_nr_latency) we have to stretch
647e7cac
IM
628 * this period because otherwise the slices get too small.
629 *
630 * p = (nr <= nl) ? l : l*nr/nl
631 */
4d78e7b6
PZ
632static u64 __sched_period(unsigned long nr_running)
633{
634 u64 period = sysctl_sched_latency;
b2be5e96 635 unsigned long nr_latency = sched_nr_latency;
4d78e7b6
PZ
636
637 if (unlikely(nr_running > nr_latency)) {
4bf0b771 638 period = sysctl_sched_min_granularity;
4d78e7b6 639 period *= nr_running;
4d78e7b6
PZ
640 }
641
642 return period;
643}
644
647e7cac
IM
645/*
646 * We calculate the wall-time slice from the period by taking a part
647 * proportional to the weight.
648 *
f9c0b095 649 * s = p*P[w/rw]
647e7cac 650 */
6d0f0ebd 651static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
21805085 652{
0a582440 653 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
f9c0b095 654
0a582440 655 for_each_sched_entity(se) {
6272d68c 656 struct load_weight *load;
3104bf03 657 struct load_weight lw;
6272d68c
LM
658
659 cfs_rq = cfs_rq_of(se);
660 load = &cfs_rq->load;
f9c0b095 661
0a582440 662 if (unlikely(!se->on_rq)) {
3104bf03 663 lw = cfs_rq->load;
0a582440
MG
664
665 update_load_add(&lw, se->load.weight);
666 load = &lw;
667 }
668 slice = calc_delta_mine(slice, se->load.weight, load);
669 }
670 return slice;
bf0f6f24
IM
671}
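/*
 * Worked example (standalone user-space sketch, not kernel code) of the two
 * formulas above, using the default unscaled tunables: latency 6ms,
 * min_granularity 0.75ms, sched_nr_latency 8, and weight 1024 for a nice-0
 * task.  The ex_ prefixed helpers are invented for the example.
 */
#include <stdio.h>

static unsigned long long ex_sched_period(unsigned long nr_running)
{
        unsigned long long period = 6000000ULL;         /* sysctl_sched_latency */

        if (nr_running > 8)                             /* sched_nr_latency */
                period = 750000ULL * nr_running;        /* min_granularity * nr */
        return period;
}

static unsigned long long ex_sched_slice(unsigned long nr_running,
                                         unsigned long weight,
                                         unsigned long rq_weight)
{
        /* s = p * w / rw */
        return ex_sched_period(nr_running) * weight / rq_weight;
}

int main(void)
{
        /* 3 nice-0 tasks: 6ms period, 2ms slice each */
        printf("%llu\n", ex_sched_slice(3, 1024, 3 * 1024));
        /* 16 nice-0 tasks: the period stretches to 12ms, the slice stays at 0.75ms */
        printf("%llu\n", ex_sched_slice(16, 1024, 16 * 1024));
        return 0;
}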
672
647e7cac 673/*
660cc00f 674 * We calculate the vruntime slice of a to-be-inserted task.
647e7cac 675 *
f9c0b095 676 * vs = s/w
647e7cac 677 */
f9c0b095 678static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
67e9fb2a 679{
f9c0b095 680 return calc_delta_fair(sched_slice(cfs_rq, se), se);
a7be37ac
PZ
681}
682
a75cdaa9
AS
683#ifdef CONFIG_SMP
684static inline void __update_task_entity_contrib(struct sched_entity *se);
685
686/* Give a new task full initial runnable-average values so its load is weighted heavily during its infancy */
687void init_task_runnable_average(struct task_struct *p)
688{
689 u32 slice;
690
691 p->se.avg.decay_count = 0;
692 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
693 p->se.avg.runnable_avg_sum = slice;
694 p->se.avg.runnable_avg_period = slice;
695 __update_task_entity_contrib(&p->se);
696}
697#else
698void init_task_runnable_average(struct task_struct *p)
699{
700}
701#endif
702
bf0f6f24
IM
703/*
704 * Update the current task's runtime statistics. Skip current tasks that
705 * are not in our scheduling class.
706 */
707static inline void
8ebc91d9
IM
708__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
709 unsigned long delta_exec)
bf0f6f24 710{
bbdba7c0 711 unsigned long delta_exec_weighted;
bf0f6f24 712
41acab88
LDM
713 schedstat_set(curr->statistics.exec_max,
714 max((u64)delta_exec, curr->statistics.exec_max));
bf0f6f24
IM
715
716 curr->sum_exec_runtime += delta_exec;
7a62eabc 717 schedstat_add(cfs_rq, exec_clock, delta_exec);
a7be37ac 718 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
88ec22d3 719
e9acbff6 720 curr->vruntime += delta_exec_weighted;
1af5f730 721 update_min_vruntime(cfs_rq);
bf0f6f24
IM
722}
723
b7cc0896 724static void update_curr(struct cfs_rq *cfs_rq)
bf0f6f24 725{
429d43bc 726 struct sched_entity *curr = cfs_rq->curr;
78becc27 727 u64 now = rq_clock_task(rq_of(cfs_rq));
bf0f6f24
IM
728 unsigned long delta_exec;
729
730 if (unlikely(!curr))
731 return;
732
733 /*
734 * Get the amount of time the current task was running
735 * since the last time we changed load (this cannot
736 * overflow on 32 bits):
737 */
8ebc91d9 738 delta_exec = (unsigned long)(now - curr->exec_start);
34f28ecd
PZ
739 if (!delta_exec)
740 return;
bf0f6f24 741
8ebc91d9
IM
742 __update_curr(cfs_rq, curr, delta_exec);
743 curr->exec_start = now;
d842de87
SV
744
745 if (entity_is_task(curr)) {
746 struct task_struct *curtask = task_of(curr);
747
f977bb49 748 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
d842de87 749 cpuacct_charge(curtask, delta_exec);
f06febc9 750 account_group_exec_runtime(curtask, delta_exec);
d842de87 751 }
ec12cb7f
PT
752
753 account_cfs_rq_runtime(cfs_rq, delta_exec);
bf0f6f24
IM
754}
755
756static inline void
5870db5b 757update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 758{
78becc27 759 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
bf0f6f24
IM
760}
761
bf0f6f24
IM
762/*
763 * Task is being enqueued - update stats:
764 */
d2417e5a 765static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 766{
bf0f6f24
IM
767 /*
768 * Are we enqueueing a waiting task? (for current tasks
769 * a dequeue/enqueue event is a NOP)
770 */
429d43bc 771 if (se != cfs_rq->curr)
5870db5b 772 update_stats_wait_start(cfs_rq, se);
bf0f6f24
IM
773}
774
bf0f6f24 775static void
9ef0a961 776update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 777{
41acab88 778 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
78becc27 779 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
41acab88
LDM
780 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
781 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
78becc27 782 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
768d0c27
PZ
783#ifdef CONFIG_SCHEDSTATS
784 if (entity_is_task(se)) {
785 trace_sched_stat_wait(task_of(se),
78becc27 786 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
768d0c27
PZ
787 }
788#endif
41acab88 789 schedstat_set(se->statistics.wait_start, 0);
bf0f6f24
IM
790}
791
792static inline void
19b6a2e3 793update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 794{
bf0f6f24
IM
795 /*
796 * Mark the end of the wait period if dequeueing a
797 * waiting task:
798 */
429d43bc 799 if (se != cfs_rq->curr)
9ef0a961 800 update_stats_wait_end(cfs_rq, se);
bf0f6f24
IM
801}
802
803/*
804 * We are picking a new current task - update its stats:
805 */
806static inline void
79303e9e 807update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
808{
809 /*
810 * We are starting a new run period:
811 */
78becc27 812 se->exec_start = rq_clock_task(rq_of(cfs_rq));
bf0f6f24
IM
813}
814
bf0f6f24
IM
815/**************************************************
816 * Scheduling class queueing methods:
817 */
818
cbee9f88
PZ
819#ifdef CONFIG_NUMA_BALANCING
820/*
598f0ec0
MG
821 * Approximate time to scan a full NUMA task in ms. The task scan period is
822 * calculated based on the task's virtual memory size and
823 * numa_balancing_scan_size.
cbee9f88 824 */
598f0ec0
MG
825unsigned int sysctl_numa_balancing_scan_period_min = 1000;
826unsigned int sysctl_numa_balancing_scan_period_max = 60000;
827unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
6e5fb223
PZ
828
829/* Portion of address space to scan in MB */
830unsigned int sysctl_numa_balancing_scan_size = 256;
cbee9f88 831
4b96a29b
PZ
832/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
833unsigned int sysctl_numa_balancing_scan_delay = 1000;
834
598f0ec0
MG
835static unsigned int task_nr_scan_windows(struct task_struct *p)
836{
837 unsigned long rss = 0;
838 unsigned long nr_scan_pages;
839
840 /*
841 * Calculations are based on RSS, since non-present and empty pages are
842 * skipped by the PTE scanner and NUMA hinting faults should be trapped
843 * based on resident pages.
844 */
845 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
846 rss = get_mm_rss(p->mm);
847 if (!rss)
848 rss = nr_scan_pages;
849
850 rss = round_up(rss, nr_scan_pages);
851 return rss / nr_scan_pages;
852}
853
854/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
855#define MAX_SCAN_WINDOW 2560
856
857static unsigned int task_scan_min(struct task_struct *p)
858{
859 unsigned int scan, floor;
860 unsigned int windows = 1;
861
862 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
863 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
864 floor = 1000 / windows;
865
866 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
867 return max_t(unsigned int, floor, scan);
868}
869
870static unsigned int task_scan_max(struct task_struct *p)
871{
872 unsigned int smin = task_scan_min(p);
873 unsigned int smax;
874
875 /* Watch for min being lower than max due to floor calculations */
876 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
877 return max(smin, smax);
878}
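/*
 * Worked example (standalone user-space sketch, not kernel code) of how the
 * scan period scales with a task's memory usage, using the default sysctls
 * above: scan_size 256MB, scan_period_min 1000ms, scan_period_max 60000ms.
 * Everything is expressed in MB instead of pages to keep the sketch short;
 * the EX_ and ex_ names are invented for the example.
 */
#include <stdio.h>

#define EX_SCAN_SIZE_MB         256
#define EX_PERIOD_MIN_MS        1000
#define EX_PERIOD_MAX_MS        60000
#define EX_MAX_SCAN_WINDOW      2560    /* MB/sec cap */

static unsigned long ex_nr_scan_windows(unsigned long rss_mb)
{
        if (!rss_mb)
                rss_mb = EX_SCAN_SIZE_MB;
        /* round up to a whole number of scan windows */
        return (rss_mb + EX_SCAN_SIZE_MB - 1) / EX_SCAN_SIZE_MB;
}

int main(void)
{
        unsigned long rss_mb;

        for (rss_mb = 256; rss_mb <= 16384; rss_mb *= 4) {
                unsigned long windows = ex_nr_scan_windows(rss_mb);
                unsigned long floor = 1000 / (EX_MAX_SCAN_WINDOW / EX_SCAN_SIZE_MB);
                unsigned long smin = EX_PERIOD_MIN_MS / windows;
                unsigned long smax = EX_PERIOD_MAX_MS / windows;

                if (smin < floor)
                        smin = floor;
                if (smax < smin)
                        smax = smin;
                /* bigger tasks get shorter periods, i.e. a faster scan rate */
                printf("rss=%5luMB scan_period_min=%lums scan_period_max=%lums\n",
                       rss_mb, smin, smax);
        }
        return 0;
}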
879
cbee9f88
PZ
880static void task_numa_placement(struct task_struct *p)
881{
2832bc19 882 int seq;
cbee9f88 883
2832bc19
HD
884 if (!p->mm) /* for example, ksmd faulting in a user's mm */
885 return;
886 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
cbee9f88
PZ
887 if (p->numa_scan_seq == seq)
888 return;
889 p->numa_scan_seq = seq;
598f0ec0 890 p->numa_scan_period_max = task_scan_max(p);
cbee9f88
PZ
891
892 /* FIXME: Scheduling placement policy hints go here */
893}
894
895/*
896 * Got a PROT_NONE fault for a page on @node.
897 */
b8593bfd 898void task_numa_fault(int node, int pages, bool migrated)
cbee9f88
PZ
899{
900 struct task_struct *p = current;
901
10e84b97 902 if (!numabalancing_enabled)
1a687c2e
MG
903 return;
904
cbee9f88
PZ
905 /* FIXME: Allocate task-specific structure for placement policy here */
906
fb003b80 907 /*
b8593bfd
MG
908 * If pages are properly placed (did not migrate) then scan slower.
909 * This is reset periodically in case of phase changes.
fb003b80 910 */
598f0ec0
MG
911 if (!migrated) {
912 /* Initialise if necessary */
913 if (!p->numa_scan_period_max)
914 p->numa_scan_period_max = task_scan_max(p);
915
916 p->numa_scan_period = min(p->numa_scan_period_max,
917 p->numa_scan_period + 10);
918 }
fb003b80 919
cbee9f88
PZ
920 task_numa_placement(p);
921}
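/*
 * Illustrative user-space sketch (not kernel code) of the back-off above:
 * every fault on a properly placed page nudges the scan period up by 10ms,
 * saturating at the task's scan_period_max, so well-placed tasks scan less
 * often.  The starting values below are just plausible examples.
 */
#include <stdio.h>

int main(void)
{
        unsigned int period = 1000;     /* e.g. task_scan_min() for a small task */
        unsigned int period_max = 6000; /* e.g. task_scan_max() for the same task */
        int i;

        for (i = 0; i < 1000; i++) {
                /* the !migrated case of task_numa_fault() */
                period = period + 10 < period_max ? period + 10 : period_max;
        }
        printf("scan period after 1000 well-placed faults: %ums\n", period);
        return 0;
}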
922
6e5fb223
PZ
923static void reset_ptenuma_scan(struct task_struct *p)
924{
925 ACCESS_ONCE(p->mm->numa_scan_seq)++;
926 p->mm->numa_scan_offset = 0;
927}
928
cbee9f88
PZ
929/*
930 * The expensive part of numa migration is done from task_work context.
931 * Triggered from task_tick_numa().
932 */
933void task_numa_work(struct callback_head *work)
934{
935 unsigned long migrate, next_scan, now = jiffies;
936 struct task_struct *p = current;
937 struct mm_struct *mm = p->mm;
6e5fb223 938 struct vm_area_struct *vma;
9f40604c 939 unsigned long start, end;
598f0ec0 940 unsigned long nr_pte_updates = 0;
9f40604c 941 long pages;
cbee9f88
PZ
942
943 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
944
945 work->next = work; /* protect against double add */
946 /*
947 * Who cares about NUMA placement when they're dying.
948 *
949 * NOTE: make sure not to dereference p->mm before this check,
950 * exit_task_work() happens _after_ exit_mm() so we could be called
951 * without p->mm even though we still had it when we enqueued this
952 * work.
953 */
954 if (p->flags & PF_EXITING)
955 return;
956
7e8d16b6
MG
957 if (!mm->numa_next_reset || !mm->numa_next_scan) {
958 mm->numa_next_scan = now +
959 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
960 mm->numa_next_reset = now +
961 msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
962 }
963
b8593bfd
MG
964 /*
965 * Reset the scan period if enough time has gone by. The objective is that
966 * scanning will be reduced if pages are properly placed. As tasks
967 * can enter different phases this needs to be re-examined. Lacking
968 * proper tracking of reference behaviour, this blunt hammer is used.
969 */
970 migrate = mm->numa_next_reset;
971 if (time_after(now, migrate)) {
598f0ec0 972 p->numa_scan_period = task_scan_min(p);
b8593bfd
MG
973 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
974 xchg(&mm->numa_next_reset, next_scan);
975 }
976
cbee9f88
PZ
977 /*
978 * Enforce the maximal scan/migration frequency.
979 */
980 migrate = mm->numa_next_scan;
981 if (time_before(now, migrate))
982 return;
983
598f0ec0
MG
984 if (p->numa_scan_period == 0) {
985 p->numa_scan_period_max = task_scan_max(p);
986 p->numa_scan_period = task_scan_min(p);
987 }
cbee9f88 988
fb003b80 989 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88
PZ
990 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
991 return;
992
19a78d11
PZ
993 /*
994 * Delay this task enough that another task of this mm will likely win
995 * the next time around.
996 */
997 p->node_stamp += 2 * TICK_NSEC;
998
9f40604c
MG
999 start = mm->numa_scan_offset;
1000 pages = sysctl_numa_balancing_scan_size;
1001 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1002 if (!pages)
1003 return;
cbee9f88 1004
6e5fb223 1005 down_read(&mm->mmap_sem);
9f40604c 1006 vma = find_vma(mm, start);
6e5fb223
PZ
1007 if (!vma) {
1008 reset_ptenuma_scan(p);
9f40604c 1009 start = 0;
6e5fb223
PZ
1010 vma = mm->mmap;
1011 }
9f40604c 1012 for (; vma; vma = vma->vm_next) {
6e5fb223
PZ
1013 if (!vma_migratable(vma))
1014 continue;
1015
1016 /* Skip small VMAs. They are not likely to be of relevance */
221392c3 1017 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
6e5fb223
PZ
1018 continue;
1019
9f40604c
MG
1020 do {
1021 start = max(start, vma->vm_start);
1022 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1023 end = min(end, vma->vm_end);
598f0ec0
MG
1024 nr_pte_updates += change_prot_numa(vma, start, end);
1025
1026 /*
1027 * Scan sysctl_numa_balancing_scan_size but ensure that
1028 * at least one PTE is updated so that unused virtual
1029 * address space is quickly skipped.
1030 */
1031 if (nr_pte_updates)
1032 pages -= (end - start) >> PAGE_SHIFT;
6e5fb223 1033
9f40604c
MG
1034 start = end;
1035 if (pages <= 0)
1036 goto out;
1037 } while (end != vma->vm_end);
cbee9f88 1038 }
6e5fb223 1039
9f40604c 1040out:
6e5fb223 1041 /*
c69307d5
PZ
1042 * It is possible to reach the end of the VMA list but the last few
1043 * VMAs are not guaranteed to be migratable. If they are not, we
1044 * would find the !migratable VMA on the next scan but not reset the
1045 * scanner to the start, so check it now.
6e5fb223
PZ
1046 */
1047 if (vma)
9f40604c 1048 mm->numa_scan_offset = start;
6e5fb223
PZ
1049 else
1050 reset_ptenuma_scan(p);
1051 up_read(&mm->mmap_sem);
cbee9f88
PZ
1052}
1053
1054/*
1055 * Drive the periodic memory faults..
1056 */
1057void task_tick_numa(struct rq *rq, struct task_struct *curr)
1058{
1059 struct callback_head *work = &curr->numa_work;
1060 u64 period, now;
1061
1062 /*
1063 * We don't care about NUMA placement if we don't have memory.
1064 */
1065 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1066 return;
1067
1068 /*
1069 * Using runtime rather than walltime has the dual advantage that
1070 * we (mostly) drive the selection from busy threads and that the
1071 * task needs to have done some actual work before we bother with
1072 * NUMA placement.
1073 */
1074 now = curr->se.sum_exec_runtime;
1075 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1076
1077 if (now - curr->node_stamp > period) {
4b96a29b 1078 if (!curr->node_stamp)
598f0ec0 1079 curr->numa_scan_period = task_scan_min(curr);
19a78d11 1080 curr->node_stamp += period;
cbee9f88
PZ
1081
1082 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1083 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1084 task_work_add(curr, work, true);
1085 }
1086 }
1087}
1088#else
1089static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1090{
1091}
1092#endif /* CONFIG_NUMA_BALANCING */
1093
30cfdcfc
DA
1094static void
1095account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1096{
1097 update_load_add(&cfs_rq->load, se->load.weight);
c09595f6 1098 if (!parent_entity(se))
029632fb 1099 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
367456c7
PZ
1100#ifdef CONFIG_SMP
1101 if (entity_is_task(se))
eb95308e 1102 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
367456c7 1103#endif
30cfdcfc 1104 cfs_rq->nr_running++;
30cfdcfc
DA
1105}
1106
1107static void
1108account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1109{
1110 update_load_sub(&cfs_rq->load, se->load.weight);
c09595f6 1111 if (!parent_entity(se))
029632fb 1112 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
367456c7 1113 if (entity_is_task(se))
b87f1724 1114 list_del_init(&se->group_node);
30cfdcfc 1115 cfs_rq->nr_running--;
30cfdcfc
DA
1116}
1117
3ff6dcac
YZ
1118#ifdef CONFIG_FAIR_GROUP_SCHED
1119# ifdef CONFIG_SMP
cf5f0acf
PZ
1120static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1121{
1122 long tg_weight;
1123
1124 /*
1125 * Use this CPU's actual weight instead of the last load_contribution
1126 * to gain a more accurate current total weight. See
1127 * update_cfs_rq_load_contribution().
1128 */
bf5b986e 1129 tg_weight = atomic_long_read(&tg->load_avg);
82958366 1130 tg_weight -= cfs_rq->tg_load_contrib;
cf5f0acf
PZ
1131 tg_weight += cfs_rq->load.weight;
1132
1133 return tg_weight;
1134}
1135
6d5ab293 1136static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac 1137{
cf5f0acf 1138 long tg_weight, load, shares;
3ff6dcac 1139
cf5f0acf 1140 tg_weight = calc_tg_weight(tg, cfs_rq);
6d5ab293 1141 load = cfs_rq->load.weight;
3ff6dcac 1142
3ff6dcac 1143 shares = (tg->shares * load);
cf5f0acf
PZ
1144 if (tg_weight)
1145 shares /= tg_weight;
3ff6dcac
YZ
1146
1147 if (shares < MIN_SHARES)
1148 shares = MIN_SHARES;
1149 if (shares > tg->shares)
1150 shares = tg->shares;
1151
1152 return shares;
1153}
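/*
 * Worked example (plain user-space arithmetic, not kernel code) of the
 * shares calculation above: a group with tg->shares = 1024 whose queue on
 * this CPU carries 512 of the group's total weight of 2048 gets
 * 1024 * 512 / 2048 = 256, clamped to [MIN_SHARES, tg->shares].
 */
#include <stdio.h>

int main(void)
{
        long tg_shares = 1024;          /* the group's configured shares */
        long cfs_rq_load = 512;         /* this CPU's portion of the group load */
        long tg_weight = 2048;          /* group load summed over all CPUs */
        long shares = tg_shares * cfs_rq_load / tg_weight;

        if (shares < 2)                 /* MIN_SHARES */
                shares = 2;
        if (shares > tg_shares)
                shares = tg_shares;
        printf("per-cpu group entity weight: %ld\n", shares);  /* 256 */
        return 0;
}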
3ff6dcac 1154# else /* CONFIG_SMP */
6d5ab293 1155static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac
YZ
1156{
1157 return tg->shares;
1158}
3ff6dcac 1159# endif /* CONFIG_SMP */
2069dd75
PZ
1160static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1161 unsigned long weight)
1162{
19e5eebb
PT
1163 if (se->on_rq) {
1164 /* commit outstanding execution time */
1165 if (cfs_rq->curr == se)
1166 update_curr(cfs_rq);
2069dd75 1167 account_entity_dequeue(cfs_rq, se);
19e5eebb 1168 }
2069dd75
PZ
1169
1170 update_load_set(&se->load, weight);
1171
1172 if (se->on_rq)
1173 account_entity_enqueue(cfs_rq, se);
1174}
1175
82958366
PT
1176static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1177
6d5ab293 1178static void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
1179{
1180 struct task_group *tg;
1181 struct sched_entity *se;
3ff6dcac 1182 long shares;
2069dd75 1183
2069dd75
PZ
1184 tg = cfs_rq->tg;
1185 se = tg->se[cpu_of(rq_of(cfs_rq))];
64660c86 1186 if (!se || throttled_hierarchy(cfs_rq))
2069dd75 1187 return;
3ff6dcac
YZ
1188#ifndef CONFIG_SMP
1189 if (likely(se->load.weight == tg->shares))
1190 return;
1191#endif
6d5ab293 1192 shares = calc_cfs_shares(cfs_rq, tg);
2069dd75
PZ
1193
1194 reweight_entity(cfs_rq_of(se), se, shares);
1195}
1196#else /* CONFIG_FAIR_GROUP_SCHED */
6d5ab293 1197static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
1198{
1199}
1200#endif /* CONFIG_FAIR_GROUP_SCHED */
1201
141965c7 1202#ifdef CONFIG_SMP
5b51f2f8
PT
1203/*
1204 * We choose a half-life close to 1 scheduling period.
1205 * Note: The tables below are dependent on this value.
1206 */
1207#define LOAD_AVG_PERIOD 32
1208#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1209#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
1210
1211/* Precomputed fixed inverse multiplies for multiplication by y^n */
1212static const u32 runnable_avg_yN_inv[] = {
1213 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1214 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1215 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1216 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1217 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1218 0x85aac367, 0x82cd8698,
1219};
1220
1221/*
1222 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1223 * over-estimates when re-combining.
1224 */
1225static const u32 runnable_avg_yN_sum[] = {
1226 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1227 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1228 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1229};
1230
9d85f21c
PT
1231/*
1232 * Approximate:
1233 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1234 */
1235static __always_inline u64 decay_load(u64 val, u64 n)
1236{
5b51f2f8
PT
1237 unsigned int local_n;
1238
1239 if (!n)
1240 return val;
1241 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1242 return 0;
1243
1244 /* after bounds checking we can collapse to 32-bit */
1245 local_n = n;
1246
1247 /*
1248 * As y^PERIOD = 1/2, we can combine
1249 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1250 * With a look-up table which covers k^n (n<PERIOD)
1251 *
1252 * To achieve constant time decay_load.
1253 */
1254 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1255 val >>= local_n / LOAD_AVG_PERIOD;
1256 local_n %= LOAD_AVG_PERIOD;
9d85f21c
PT
1257 }
1258
5b51f2f8
PT
1259 val *= runnable_avg_yN_inv[local_n];
1260 /* We don't use SRR here since we always want to round down. */
1261 return val >> 32;
1262}
1263
1264/*
1265 * For updates fully spanning n periods, the contribution to runnable
1266 * average will be: \Sum 1024*y^n
1267 *
1268 * We can compute this reasonably efficiently by combining:
1269 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1270 */
1271static u32 __compute_runnable_contrib(u64 n)
1272{
1273 u32 contrib = 0;
1274
1275 if (likely(n <= LOAD_AVG_PERIOD))
1276 return runnable_avg_yN_sum[n];
1277 else if (unlikely(n >= LOAD_AVG_MAX_N))
1278 return LOAD_AVG_MAX;
1279
1280 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1281 do {
1282 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1283 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1284
1285 n -= LOAD_AVG_PERIOD;
1286 } while (n > LOAD_AVG_PERIOD);
1287
1288 contrib = decay_load(contrib, n);
1289 return contrib + runnable_avg_yN_sum[n];
9d85f21c
PT
1290}
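/*
 * Illustrative user-space sketch (not kernel code, link with -lm): with
 * y^32 = 1/2, decaying a contribution over n periods multiplies it by
 * y^n = (1/2)^(n/32) * y^(n%32), which decay_load() performs in fixed
 * point via runnable_avg_yN_inv[].  Floating point is used here only to
 * show the factors the table approximates.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y such that y^32 = 0.5 */
        unsigned int n;

        /* factors 1.0, 0.5, 0.25, 0.125, 0.0625 for 0..128 periods (~ms) */
        for (n = 0; n <= 128; n += 32)
                printf("decay over %3u periods: factor %.4f\n", n, pow(y, n));
        return 0;
}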
1291
1292/*
1293 * We can represent the historical contribution to runnable average as the
1294 * coefficients of a geometric series. To do this we sub-divide our runnable
1295 * history into segments of approximately 1ms (1024us); label the segment that
1296 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1297 *
1298 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1299 * p0 p1 p2
1300 * (now) (~1ms ago) (~2ms ago)
1301 *
1302 * Let u_i denote the fraction of p_i that the entity was runnable.
1303 *
1304 * We then designate the fractions u_i as our co-efficients, yielding the
1305 * following representation of historical load:
1306 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1307 *
1308 * We choose y based on the width of a reasonable scheduling period, fixing:
1309 * y^32 = 0.5
1310 *
1311 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1312 * approximately half as much as the contribution to load within the last ms
1313 * (u_0).
1314 *
1315 * When a period "rolls over" and we have new u_0`, multiplying the previous
1316 * sum again by y is sufficient to update:
1317 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1318 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1319 */
1320static __always_inline int __update_entity_runnable_avg(u64 now,
1321 struct sched_avg *sa,
1322 int runnable)
1323{
5b51f2f8
PT
1324 u64 delta, periods;
1325 u32 runnable_contrib;
9d85f21c
PT
1326 int delta_w, decayed = 0;
1327
1328 delta = now - sa->last_runnable_update;
1329 /*
1330 * This should only happen when time goes backwards, which it
1331 * unfortunately does during sched clock init when we swap over to TSC.
1332 */
1333 if ((s64)delta < 0) {
1334 sa->last_runnable_update = now;
1335 return 0;
1336 }
1337
1338 /*
1339 * Use 1024ns as the unit of measurement since it's a reasonable
1340 * approximation of 1us and fast to compute.
1341 */
1342 delta >>= 10;
1343 if (!delta)
1344 return 0;
1345 sa->last_runnable_update = now;
1346
1347 /* delta_w is the amount already accumulated against our next period */
1348 delta_w = sa->runnable_avg_period % 1024;
1349 if (delta + delta_w >= 1024) {
1350 /* period roll-over */
1351 decayed = 1;
1352
1353 /*
1354 * Now that we know we're crossing a period boundary, figure
1355 * out how much from delta we need to complete the current
1356 * period and accrue it.
1357 */
1358 delta_w = 1024 - delta_w;
5b51f2f8
PT
1359 if (runnable)
1360 sa->runnable_avg_sum += delta_w;
1361 sa->runnable_avg_period += delta_w;
1362
1363 delta -= delta_w;
1364
1365 /* Figure out how many additional periods this update spans */
1366 periods = delta / 1024;
1367 delta %= 1024;
1368
1369 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1370 periods + 1);
1371 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1372 periods + 1);
1373
1374 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1375 runnable_contrib = __compute_runnable_contrib(periods);
1376 if (runnable)
1377 sa->runnable_avg_sum += runnable_contrib;
1378 sa->runnable_avg_period += runnable_contrib;
9d85f21c
PT
1379 }
1380
1381 /* Remainder of delta accrued against u_0` */
1382 if (runnable)
1383 sa->runnable_avg_sum += delta;
1384 sa->runnable_avg_period += delta;
1385
1386 return decayed;
1387}
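/*
 * Small user-space sketch (not kernel code, link with -lm) of the geometric
 * series described in the comment above: an always-runnable entity
 * accumulates 1024 + 1024*y + 1024*y^2 + ..., which converges close to
 * LOAD_AVG_MAX (47742; the kernel's fixed-point tables round down slightly),
 * while an entity runnable half the time converges to about half of that.
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);
        double always = 0.0, half = 0.0;
        int period;

        for (period = 0; period < 345; period++) {      /* LOAD_AVG_MAX_N */
                always = always * y + 1024.0;           /* runnable the whole period */
                half = half * y + 512.0;                /* runnable half of it */
        }
        printf("always runnable: ~%.0f, half runnable: ~%.0f\n", always, half);
        return 0;
}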
1388
9ee474f5 1389/* Synchronize an entity's decay with its parenting cfs_rq.*/
aff3e498 1390static inline u64 __synchronize_entity_decay(struct sched_entity *se)
9ee474f5
PT
1391{
1392 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1393 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1394
1395 decays -= se->avg.decay_count;
1396 if (!decays)
aff3e498 1397 return 0;
9ee474f5
PT
1398
1399 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1400 se->avg.decay_count = 0;
aff3e498
PT
1401
1402 return decays;
9ee474f5
PT
1403}
1404
c566e8e9
PT
1405#ifdef CONFIG_FAIR_GROUP_SCHED
1406static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1407 int force_update)
1408{
1409 struct task_group *tg = cfs_rq->tg;
bf5b986e 1410 long tg_contrib;
c566e8e9
PT
1411
1412 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1413 tg_contrib -= cfs_rq->tg_load_contrib;
1414
bf5b986e
AS
1415 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1416 atomic_long_add(tg_contrib, &tg->load_avg);
c566e8e9
PT
1417 cfs_rq->tg_load_contrib += tg_contrib;
1418 }
1419}
8165e145 1420
bb17f655
PT
1421/*
1422 * Aggregate cfs_rq runnable averages into an equivalent task_group
1423 * representation for computing load contributions.
1424 */
1425static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1426 struct cfs_rq *cfs_rq)
1427{
1428 struct task_group *tg = cfs_rq->tg;
1429 long contrib;
1430
1431 /* The fraction of a cpu used by this cfs_rq */
1432 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1433 sa->runnable_avg_period + 1);
1434 contrib -= cfs_rq->tg_runnable_contrib;
1435
1436 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1437 atomic_add(contrib, &tg->runnable_avg);
1438 cfs_rq->tg_runnable_contrib += contrib;
1439 }
1440}
1441
8165e145
PT
1442static inline void __update_group_entity_contrib(struct sched_entity *se)
1443{
1444 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1445 struct task_group *tg = cfs_rq->tg;
bb17f655
PT
1446 int runnable_avg;
1447
8165e145
PT
1448 u64 contrib;
1449
1450 contrib = cfs_rq->tg_load_contrib * tg->shares;
bf5b986e
AS
1451 se->avg.load_avg_contrib = div_u64(contrib,
1452 atomic_long_read(&tg->load_avg) + 1);
bb17f655
PT
1453
1454 /*
1455 * For group entities we need to compute a correction term in the case
1456 * that they are consuming <1 cpu so that we would contribute the same
1457 * load as a task of equal weight.
1458 *
1459 * Explicitly co-ordinating this measurement would be expensive, but
1460 * fortunately the sum of each cpu's contribution forms a usable
1461 * lower-bound on the true value.
1462 *
1463 * Consider the aggregate of 2 contributions. Either they are disjoint
1464 * (and the sum represents true value) or they are disjoint and we are
1465 * understating by the aggregate of their overlap.
1466 *
1467 * Extending this to N cpus, for a given overlap, the maximum amount we
1468 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1469 * cpus that overlap for this interval and w_i is the interval width.
1470 *
1471 * On a small machine the first term is well-bounded, which bounds the
1472 * total error since w_i is a subset of the period. Whereas on a
1473 * larger machine, while this first term can be larger, if w_i is of
1474 * consequential size it is guaranteed to see n_i*w_i quickly converge
1475 * to our upper bound of 1-cpu.
1476 */
1477 runnable_avg = atomic_read(&tg->runnable_avg);
1478 if (runnable_avg < NICE_0_LOAD) {
1479 se->avg.load_avg_contrib *= runnable_avg;
1480 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1481 }
8165e145 1482}
c566e8e9
PT
1483#else
1484static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1485 int force_update) {}
bb17f655
PT
1486static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1487 struct cfs_rq *cfs_rq) {}
8165e145 1488static inline void __update_group_entity_contrib(struct sched_entity *se) {}
c566e8e9
PT
1489#endif
1490
8165e145
PT
1491static inline void __update_task_entity_contrib(struct sched_entity *se)
1492{
1493 u32 contrib;
1494
1495 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1496 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1497 contrib /= (se->avg.runnable_avg_period + 1);
1498 se->avg.load_avg_contrib = scale_load(contrib);
1499}
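/*
 * Worked example (plain user-space arithmetic, not kernel code) of the
 * contribution above: a nice-0 task (unscaled weight 1024) whose
 * runnable_avg_sum has converged to half of its runnable_avg_period
 * contributes roughly half of its weight to the runqueue load average.
 */
#include <stdio.h>

int main(void)
{
        unsigned int weight = 1024;             /* nice-0, unscaled */
        unsigned int avg_sum = 23871;           /* runnable ~50% of the time */
        unsigned int avg_period = 47742;        /* fully built-up history */
        unsigned int contrib = avg_sum * weight / (avg_period + 1);

        printf("load_avg_contrib ~= %u\n", contrib);    /* ~511 */
        return 0;
}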
1500
2dac754e
PT
1501/* Compute the current contribution to load_avg by se, return any delta */
1502static long __update_entity_load_avg_contrib(struct sched_entity *se)
1503{
1504 long old_contrib = se->avg.load_avg_contrib;
1505
8165e145
PT
1506 if (entity_is_task(se)) {
1507 __update_task_entity_contrib(se);
1508 } else {
bb17f655 1509 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
8165e145
PT
1510 __update_group_entity_contrib(se);
1511 }
2dac754e
PT
1512
1513 return se->avg.load_avg_contrib - old_contrib;
1514}
1515
9ee474f5
PT
1516static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1517 long load_contrib)
1518{
1519 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1520 cfs_rq->blocked_load_avg -= load_contrib;
1521 else
1522 cfs_rq->blocked_load_avg = 0;
1523}
1524
f1b17280
PT
1525static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1526
9d85f21c 1527/* Update a sched_entity's runnable average */
9ee474f5
PT
1528static inline void update_entity_load_avg(struct sched_entity *se,
1529 int update_cfs_rq)
9d85f21c 1530{
2dac754e
PT
1531 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1532 long contrib_delta;
f1b17280 1533 u64 now;
2dac754e 1534
f1b17280
PT
1535 /*
1536 * For a group entity we need to use their owned cfs_rq_clock_task() in
1537 * case they are the parent of a throttled hierarchy.
1538 */
1539 if (entity_is_task(se))
1540 now = cfs_rq_clock_task(cfs_rq);
1541 else
1542 now = cfs_rq_clock_task(group_cfs_rq(se));
1543
1544 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
2dac754e
PT
1545 return;
1546
1547 contrib_delta = __update_entity_load_avg_contrib(se);
9ee474f5
PT
1548
1549 if (!update_cfs_rq)
1550 return;
1551
2dac754e
PT
1552 if (se->on_rq)
1553 cfs_rq->runnable_load_avg += contrib_delta;
9ee474f5
PT
1554 else
1555 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1556}
1557
1558/*
1559 * Decay the load contributed by all blocked children and account this so that
1560 * their contribution may be appropriately discounted when they wake up.
1561 */
aff3e498 1562static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
9ee474f5 1563{
f1b17280 1564 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
9ee474f5
PT
1565 u64 decays;
1566
1567 decays = now - cfs_rq->last_decay;
aff3e498 1568 if (!decays && !force_update)
9ee474f5
PT
1569 return;
1570
2509940f
AS
1571 if (atomic_long_read(&cfs_rq->removed_load)) {
1572 unsigned long removed_load;
1573 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
aff3e498
PT
1574 subtract_blocked_load_contrib(cfs_rq, removed_load);
1575 }
9ee474f5 1576
aff3e498
PT
1577 if (decays) {
1578 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1579 decays);
1580 atomic64_add(decays, &cfs_rq->decay_counter);
1581 cfs_rq->last_decay = now;
1582 }
c566e8e9
PT
1583
1584 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
9d85f21c 1585}
18bf2805
BS
1586
1587static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1588{
78becc27 1589 __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
bb17f655 1590 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
18bf2805 1591}
2dac754e
PT
1592
1593/* Add the load generated by se into cfs_rq's child load-average */
1594static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
1595 struct sched_entity *se,
1596 int wakeup)
2dac754e 1597{
aff3e498
PT
1598 /*
1599 * We track migrations using entity decay_count <= 0; on a wake-up
1600 * migration we use a negative decay count to track the remote decays
1601 * accumulated while sleeping.
a75cdaa9
AS
1602 *
1603 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
1604 * are seen by enqueue_entity_load_avg() as a migration with an already
1605 * constructed load_avg_contrib.
aff3e498
PT
1606 */
1607 if (unlikely(se->avg.decay_count <= 0)) {
78becc27 1608 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
aff3e498
PT
1609 if (se->avg.decay_count) {
1610 /*
1611 * In a wake-up migration we have to approximate the
1612 * time sleeping. This is because we can't synchronize
1613 * clock_task between the two cpus, and it is not
1614 * guaranteed to be read-safe. Instead, we can
1615 * approximate this using our carried decays, which are
1616 * explicitly atomically readable.
1617 */
1618 se->avg.last_runnable_update -= (-se->avg.decay_count)
1619 << 20;
1620 update_entity_load_avg(se, 0);
1621 /* Indicate that we're now synchronized and on-rq */
1622 se->avg.decay_count = 0;
1623 }
9ee474f5
PT
1624 wakeup = 0;
1625 } else {
282cf499
AS
1626 /*
1627 * Task re-woke on same cpu (or else migrate_task_rq_fair()
1628 * would have made count negative); we must be careful to avoid
1629 * double-accounting blocked time after synchronizing decays.
1630 */
1631 se->avg.last_runnable_update += __synchronize_entity_decay(se)
1632 << 20;
9ee474f5
PT
1633 }
1634
aff3e498
PT
1635 /* migrated tasks did not contribute to our blocked load */
1636 if (wakeup) {
9ee474f5 1637 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
aff3e498
PT
1638 update_entity_load_avg(se, 0);
1639 }
9ee474f5 1640
2dac754e 1641 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
aff3e498
PT
1642 /* we force update consideration on load-balancer moves */
1643 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2dac754e
PT
1644}
1645
9ee474f5
PT
1646/*
1647 * Remove se's load from this cfs_rq child load-average. If the entity is
1648 * transitioning to a blocked state we track its projected decay using
1649 * blocked_load_avg.
1650 */
2dac754e 1651static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
1652 struct sched_entity *se,
1653 int sleep)
2dac754e 1654{
9ee474f5 1655 update_entity_load_avg(se, 1);
aff3e498
PT
1656 /* we force update consideration on load-balancer moves */
1657 update_cfs_rq_blocked_load(cfs_rq, !sleep);
9ee474f5 1658
2dac754e 1659 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
9ee474f5
PT
1660 if (sleep) {
1661 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1662 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1663 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
2dac754e 1664}
642dbc39
VG
1665
1666/*
1667 * Update the rq's load with the elapsed running time before entering
1668 * idle. If the last scheduled task is not a CFS task, idle_enter will
1669 * be the only way to update the runnable statistic.
1670 */
1671void idle_enter_fair(struct rq *this_rq)
1672{
1673 update_rq_runnable_avg(this_rq, 1);
1674}
1675
1676/*
1677 * Update the rq's load with the elapsed idle time before a task is
1678 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
1679 * be the only way to update the runnable statistic.
1680 */
1681void idle_exit_fair(struct rq *this_rq)
1682{
1683 update_rq_runnable_avg(this_rq, 0);
1684}
1685
9d85f21c 1686#else
9ee474f5
PT
1687static inline void update_entity_load_avg(struct sched_entity *se,
1688 int update_cfs_rq) {}
18bf2805 1689static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2dac754e 1690static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
1691 struct sched_entity *se,
1692 int wakeup) {}
2dac754e 1693static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
1694 struct sched_entity *se,
1695 int sleep) {}
aff3e498
PT
1696static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1697 int force_update) {}
9d85f21c
PT
1698#endif
1699
2396af69 1700static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1701{
bf0f6f24 1702#ifdef CONFIG_SCHEDSTATS
e414314c
PZ
1703 struct task_struct *tsk = NULL;
1704
1705 if (entity_is_task(se))
1706 tsk = task_of(se);
1707
41acab88 1708 if (se->statistics.sleep_start) {
78becc27 1709 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
bf0f6f24
IM
1710
1711 if ((s64)delta < 0)
1712 delta = 0;
1713
41acab88
LDM
1714 if (unlikely(delta > se->statistics.sleep_max))
1715 se->statistics.sleep_max = delta;
bf0f6f24 1716
8c79a045 1717 se->statistics.sleep_start = 0;
41acab88 1718 se->statistics.sum_sleep_runtime += delta;
9745512c 1719
768d0c27 1720 if (tsk) {
e414314c 1721 account_scheduler_latency(tsk, delta >> 10, 1);
768d0c27
PZ
1722 trace_sched_stat_sleep(tsk, delta);
1723 }
bf0f6f24 1724 }
41acab88 1725 if (se->statistics.block_start) {
78becc27 1726 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
bf0f6f24
IM
1727
1728 if ((s64)delta < 0)
1729 delta = 0;
1730
41acab88
LDM
1731 if (unlikely(delta > se->statistics.block_max))
1732 se->statistics.block_max = delta;
bf0f6f24 1733
8c79a045 1734 se->statistics.block_start = 0;
41acab88 1735 se->statistics.sum_sleep_runtime += delta;
30084fbd 1736
e414314c 1737 if (tsk) {
8f0dfc34 1738 if (tsk->in_iowait) {
41acab88
LDM
1739 se->statistics.iowait_sum += delta;
1740 se->statistics.iowait_count++;
768d0c27 1741 trace_sched_stat_iowait(tsk, delta);
8f0dfc34
AV
1742 }
1743
b781a602
AV
1744 trace_sched_stat_blocked(tsk, delta);
1745
e414314c
PZ
1746 /*
1747 * Blocking time is in units of nanosecs, so shift by
1748 * 20 to get a milliseconds-range estimation of the
1749 * amount of time that the task spent sleeping:
1750 */
1751 if (unlikely(prof_on == SLEEP_PROFILING)) {
1752 profile_hits(SLEEP_PROFILING,
1753 (void *)get_wchan(tsk),
1754 delta >> 20);
1755 }
1756 account_scheduler_latency(tsk, delta >> 10, 0);
30084fbd 1757 }
bf0f6f24
IM
1758 }
1759#endif
1760}
1761
ddc97297
PZ
1762static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1763{
1764#ifdef CONFIG_SCHED_DEBUG
1765 s64 d = se->vruntime - cfs_rq->min_vruntime;
1766
1767 if (d < 0)
1768 d = -d;
1769
1770 if (d > 3*sysctl_sched_latency)
1771 schedstat_inc(cfs_rq, nr_spread_over);
1772#endif
1773}
1774
aeb73b04
PZ
1775static void
1776place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1777{
1af5f730 1778 u64 vruntime = cfs_rq->min_vruntime;
94dfb5e7 1779
2cb8600e
PZ
1780 /*
1781 * The 'current' period is already promised to the current tasks,
1782 * however the extra weight of the new task will slow them down a
1783 * little; place the new task so that it fits in the slot that
1784 * stays open at the end.
1785 */
94dfb5e7 1786 if (initial && sched_feat(START_DEBIT))
f9c0b095 1787 vruntime += sched_vslice(cfs_rq, se);
aeb73b04 1788
a2e7a7eb 1789 /* sleeps up to a single latency don't count. */
5ca9880c 1790 if (!initial) {
a2e7a7eb 1791 unsigned long thresh = sysctl_sched_latency;
a7be37ac 1792
a2e7a7eb
MG
1793 /*
1794 * Halve their sleep time's effect, to allow
1795 * for a gentler effect of sleepers:
1796 */
1797 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1798 thresh >>= 1;
51e0304c 1799
a2e7a7eb 1800 vruntime -= thresh;
aeb73b04
PZ
1801 }
1802
b5d9d734 1803 /* ensure we never gain time by being placed backwards. */
16c8f1c7 1804 se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b04
PZ
1805}
1806
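/*
 * Illustrative sketch, not part of fair.c: a standalone model of the
 * !initial (wakeup) path of place_entity() above, assuming the default
 * sysctl_sched_latency of 6ms and the GENTLE_FAIR_SLEEPERS feature. The
 * helper name and plain C types are hypothetical stand-ins for the kernel's
 * se/cfs_rq fields.
 */
static unsigned long long place_sleeper_model(unsigned long long min_vruntime,
					      unsigned long long se_vruntime,
					      int gentle_sleepers)
{
	unsigned long long thresh = 6000000ULL;		/* sysctl_sched_latency */
	unsigned long long vruntime = min_vruntime;

	if (gentle_sleepers)
		thresh >>= 1;				/* halve the sleeper credit */

	vruntime -= thresh;				/* at most ~3ms of credit */

	/* ensure we never gain time by being placed backwards */
	return se_vruntime > vruntime ? se_vruntime : vruntime;
}
/*
 * e.g. with min_vruntime at 100ms: a long sleeper (old vruntime 10ms) is
 * placed at 97ms, while a task that barely slept (99ms) keeps its own 99ms.
 */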
d3d9dc33
PT
1807static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1808
bf0f6f24 1809static void
88ec22d3 1810enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 1811{
88ec22d3
PZ
1812 /*
1813 * Update the normalized vruntime before updating min_vruntime
0fc576d5 1814 * through calling update_curr().
88ec22d3 1815 */
371fd7e7 1816 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
88ec22d3
PZ
1817 se->vruntime += cfs_rq->min_vruntime;
1818
bf0f6f24 1819 /*
a2a2d680 1820 * Update run-time statistics of the 'current'.
bf0f6f24 1821 */
b7cc0896 1822 update_curr(cfs_rq);
f269ae04 1823 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
17bc14b7
LT
1824 account_entity_enqueue(cfs_rq, se);
1825 update_cfs_shares(cfs_rq);
bf0f6f24 1826
88ec22d3 1827 if (flags & ENQUEUE_WAKEUP) {
aeb73b04 1828 place_entity(cfs_rq, se, 0);
2396af69 1829 enqueue_sleeper(cfs_rq, se);
e9acbff6 1830 }
bf0f6f24 1831
d2417e5a 1832 update_stats_enqueue(cfs_rq, se);
ddc97297 1833 check_spread(cfs_rq, se);
83b699ed
SV
1834 if (se != cfs_rq->curr)
1835 __enqueue_entity(cfs_rq, se);
2069dd75 1836 se->on_rq = 1;
3d4b47b4 1837
d3d9dc33 1838 if (cfs_rq->nr_running == 1) {
3d4b47b4 1839 list_add_leaf_cfs_rq(cfs_rq);
d3d9dc33
PT
1840 check_enqueue_throttle(cfs_rq);
1841 }
bf0f6f24
IM
1842}
1843
2c13c919 1844static void __clear_buddies_last(struct sched_entity *se)
2002c695 1845{
2c13c919
RR
1846 for_each_sched_entity(se) {
1847 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1848 if (cfs_rq->last == se)
1849 cfs_rq->last = NULL;
1850 else
1851 break;
1852 }
1853}
2002c695 1854
2c13c919
RR
1855static void __clear_buddies_next(struct sched_entity *se)
1856{
1857 for_each_sched_entity(se) {
1858 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1859 if (cfs_rq->next == se)
1860 cfs_rq->next = NULL;
1861 else
1862 break;
1863 }
2002c695
PZ
1864}
1865
ac53db59
RR
1866static void __clear_buddies_skip(struct sched_entity *se)
1867{
1868 for_each_sched_entity(se) {
1869 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1870 if (cfs_rq->skip == se)
1871 cfs_rq->skip = NULL;
1872 else
1873 break;
1874 }
1875}
1876
a571bbea
PZ
1877static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1878{
2c13c919
RR
1879 if (cfs_rq->last == se)
1880 __clear_buddies_last(se);
1881
1882 if (cfs_rq->next == se)
1883 __clear_buddies_next(se);
ac53db59
RR
1884
1885 if (cfs_rq->skip == se)
1886 __clear_buddies_skip(se);
a571bbea
PZ
1887}
1888
6c16a6dc 1889static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 1890
bf0f6f24 1891static void
371fd7e7 1892dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 1893{
a2a2d680
DA
1894 /*
1895 * Update run-time statistics of the 'current'.
1896 */
1897 update_curr(cfs_rq);
17bc14b7 1898 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
a2a2d680 1899
19b6a2e3 1900 update_stats_dequeue(cfs_rq, se);
371fd7e7 1901 if (flags & DEQUEUE_SLEEP) {
67e9fb2a 1902#ifdef CONFIG_SCHEDSTATS
bf0f6f24
IM
1903 if (entity_is_task(se)) {
1904 struct task_struct *tsk = task_of(se);
1905
1906 if (tsk->state & TASK_INTERRUPTIBLE)
78becc27 1907 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
bf0f6f24 1908 if (tsk->state & TASK_UNINTERRUPTIBLE)
78becc27 1909 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
bf0f6f24 1910 }
db36cc7d 1911#endif
67e9fb2a
PZ
1912 }
1913
2002c695 1914 clear_buddies(cfs_rq, se);
4793241b 1915
83b699ed 1916 if (se != cfs_rq->curr)
30cfdcfc 1917 __dequeue_entity(cfs_rq, se);
17bc14b7 1918 se->on_rq = 0;
30cfdcfc 1919 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
1920
1921 /*
1922 * Normalize the entity after updating the min_vruntime because the
1923 * update can refer to the ->curr item and we need to reflect this
1924 * movement in our normalized position.
1925 */
371fd7e7 1926 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 1927 se->vruntime -= cfs_rq->min_vruntime;
1e876231 1928
d8b4986d
PT
1929 /* return excess runtime on last dequeue */
1930 return_cfs_rq_runtime(cfs_rq);
1931
1e876231 1932 update_min_vruntime(cfs_rq);
17bc14b7 1933 update_cfs_shares(cfs_rq);
bf0f6f24
IM
1934}
1935
1936/*
1937 * Preempt the current task with a newly woken task if needed:
1938 */
7c92e54f 1939static void
2e09bf55 1940check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 1941{
11697830 1942 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
1943 struct sched_entity *se;
1944 s64 delta;
11697830 1945
6d0f0ebd 1946 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 1947 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 1948 if (delta_exec > ideal_runtime) {
bf0f6f24 1949 resched_task(rq_of(cfs_rq)->curr);
a9f3e2b5
MG
1950 /*
1951 * The current task ran long enough, ensure it doesn't get
1952 * re-elected due to buddy favours.
1953 */
1954 clear_buddies(cfs_rq, curr);
f685ceac
MG
1955 return;
1956 }
1957
1958 /*
1959 * Ensure that a task that missed wakeup preemption by a
1960 * narrow margin doesn't have to wait for a full slice.
1961 * This also mitigates buddy induced latencies under load.
1962 */
f685ceac
MG
1963 if (delta_exec < sysctl_sched_min_granularity)
1964 return;
1965
f4cfb33e
WX
1966 se = __pick_first_entity(cfs_rq);
1967 delta = curr->vruntime - se->vruntime;
f685ceac 1968
f4cfb33e
WX
1969 if (delta < 0)
1970 return;
d7d82944 1971
f4cfb33e
WX
1972 if (delta > ideal_runtime)
1973 resched_task(rq_of(cfs_rq)->curr);
bf0f6f24
IM
1974}
1975
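/*
 * Illustrative sketch, not part of fair.c: the two tick-preemption tests of
 * check_preempt_tick() above, reduced to plain arithmetic. The parameters
 * are hypothetical stand-ins; in the kernel ideal_runtime comes from
 * sched_slice() and vdiff compares curr against the leftmost rbtree entity.
 */
static int tick_should_preempt_model(unsigned long long delta_exec,	/* ran this slice */
				     unsigned long long ideal_runtime,	/* sched_slice() */
				     long long vdiff,	/* curr->vruntime - leftmost->vruntime */
				     unsigned long long min_granularity)
{
	/* 1) the current task has consumed its whole slice */
	if (delta_exec > ideal_runtime)
		return 1;

	/* don't preempt over-eagerly for very short runs */
	if (delta_exec < min_granularity)
		return 0;

	/* 2) the leftmost waiter lags curr by more than a full slice */
	if (vdiff > (long long)ideal_runtime)
		return 1;

	return 0;
}
/*
 * e.g. with a 4ms slice and 0.75ms granularity, a task that has run only 2ms
 * but whose vruntime is already 5ms ahead of the leftmost waiter is preempted.
 */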
83b699ed 1976static void
8494f412 1977set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1978{
83b699ed
SV
1979 /* 'current' is not kept within the tree. */
1980 if (se->on_rq) {
1981 /*
1982 * Any task has to be enqueued before it gets to execute on
1983 * a CPU. So account for the time it spent waiting on the
1984 * runqueue.
1985 */
1986 update_stats_wait_end(cfs_rq, se);
1987 __dequeue_entity(cfs_rq, se);
1988 }
1989
79303e9e 1990 update_stats_curr_start(cfs_rq, se);
429d43bc 1991 cfs_rq->curr = se;
eba1ed4b
IM
1992#ifdef CONFIG_SCHEDSTATS
1993 /*
1994 * Track our maximum slice length, if the CPU's load is at
1995 * least twice that of our own weight (i.e. don't track it
1996 * when there are only lesser-weight tasks around):
1997 */
495eca49 1998 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
41acab88 1999 se->statistics.slice_max = max(se->statistics.slice_max,
eba1ed4b
IM
2000 se->sum_exec_runtime - se->prev_sum_exec_runtime);
2001 }
2002#endif
4a55b450 2003 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
2004}
2005
3f3a4904
PZ
2006static int
2007wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2008
ac53db59
RR
2009/*
2010 * Pick the next process, keeping these things in mind, in this order:
2011 * 1) keep things fair between processes/task groups
2012 * 2) pick the "next" process, since someone really wants that to run
2013 * 3) pick the "last" process, for cache locality
2014 * 4) do not run the "skip" process, if something else is available
2015 */
f4b6755f 2016static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
aa2ac252 2017{
ac53db59 2018 struct sched_entity *se = __pick_first_entity(cfs_rq);
f685ceac 2019 struct sched_entity *left = se;
f4b6755f 2020
ac53db59
RR
2021 /*
2022 * Avoid running the skip buddy, if running something else can
2023 * be done without getting too unfair.
2024 */
2025 if (cfs_rq->skip == se) {
2026 struct sched_entity *second = __pick_next_entity(se);
2027 if (second && wakeup_preempt_entity(second, left) < 1)
2028 se = second;
2029 }
aa2ac252 2030
f685ceac
MG
2031 /*
2032 * Prefer last buddy, try to return the CPU to a preempted task.
2033 */
2034 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2035 se = cfs_rq->last;
2036
ac53db59
RR
2037 /*
2038 * Someone really wants this to run. If it's not unfair, run it.
2039 */
2040 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2041 se = cfs_rq->next;
2042
f685ceac 2043 clear_buddies(cfs_rq, se);
4793241b
PZ
2044
2045 return se;
aa2ac252
PZ
2046}
2047
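/*
 * Illustrative sketch, not part of fair.c: the fairness bound that gates all
 * of the buddy overrides in pick_next_entity() above. A buddy candidate
 * (skip replacement, last or next) is only taken instead of the leftmost
 * entity if it is not more than one virtual wakeup granularity ahead of it;
 * the names are hypothetical and vgran stands for the scaled
 * sysctl_sched_wakeup_granularity.
 */
static int buddy_acceptable_model(long long buddy_vruntime,
				  long long leftmost_vruntime,
				  long long vgran)
{
	/* mirrors wakeup_preempt_entity(buddy, leftmost) < 1 */
	return buddy_vruntime - leftmost_vruntime <= vgran;
}
/*
 * The overrides apply in increasing priority: first dodge the skip buddy,
 * then prefer the last buddy, and finally the next buddy wins if it also
 * passes this test.
 */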
d3d9dc33
PT
2048static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2049
ab6cde26 2050static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
2051{
2052 /*
2053 * If still on the runqueue then deactivate_task()
2054 * was not called and update_curr() has to be done:
2055 */
2056 if (prev->on_rq)
b7cc0896 2057 update_curr(cfs_rq);
bf0f6f24 2058
d3d9dc33
PT
2059 /* throttle cfs_rqs exceeding runtime */
2060 check_cfs_rq_runtime(cfs_rq);
2061
ddc97297 2062 check_spread(cfs_rq, prev);
30cfdcfc 2063 if (prev->on_rq) {
5870db5b 2064 update_stats_wait_start(cfs_rq, prev);
30cfdcfc
DA
2065 /* Put 'current' back into the tree. */
2066 __enqueue_entity(cfs_rq, prev);
9d85f21c 2067 /* in !on_rq case, update occurred at dequeue */
9ee474f5 2068 update_entity_load_avg(prev, 1);
30cfdcfc 2069 }
429d43bc 2070 cfs_rq->curr = NULL;
bf0f6f24
IM
2071}
2072
8f4d37ec
PZ
2073static void
2074entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 2075{
bf0f6f24 2076 /*
30cfdcfc 2077 * Update run-time statistics of the 'current'.
bf0f6f24 2078 */
30cfdcfc 2079 update_curr(cfs_rq);
bf0f6f24 2080
9d85f21c
PT
2081 /*
2082 * Ensure that runnable average is periodically updated.
2083 */
9ee474f5 2084 update_entity_load_avg(curr, 1);
aff3e498 2085 update_cfs_rq_blocked_load(cfs_rq, 1);
bf0bd948 2086 update_cfs_shares(cfs_rq);
9d85f21c 2087
8f4d37ec
PZ
2088#ifdef CONFIG_SCHED_HRTICK
2089 /*
2090 * queued ticks are scheduled to match the slice, so don't bother
2091 * validating it and just reschedule.
2092 */
983ed7a6
HH
2093 if (queued) {
2094 resched_task(rq_of(cfs_rq)->curr);
2095 return;
2096 }
8f4d37ec
PZ
2097 /*
2098 * don't let the period tick interfere with the hrtick preemption
2099 */
2100 if (!sched_feat(DOUBLE_TICK) &&
2101 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2102 return;
2103#endif
2104
2c2efaed 2105 if (cfs_rq->nr_running > 1)
2e09bf55 2106 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
2107}
2108
ab84d31e
PT
2109
2110/**************************************************
2111 * CFS bandwidth control machinery
2112 */
2113
2114#ifdef CONFIG_CFS_BANDWIDTH
029632fb
PZ
2115
2116#ifdef HAVE_JUMP_LABEL
c5905afb 2117static struct static_key __cfs_bandwidth_used;
029632fb
PZ
2118
2119static inline bool cfs_bandwidth_used(void)
2120{
c5905afb 2121 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
2122}
2123
2124void account_cfs_bandwidth_used(int enabled, int was_enabled)
2125{
2126 /* only need to count groups transitioning between enabled/!enabled */
2127 if (enabled && !was_enabled)
c5905afb 2128 static_key_slow_inc(&__cfs_bandwidth_used);
029632fb 2129 else if (!enabled && was_enabled)
c5905afb 2130 static_key_slow_dec(&__cfs_bandwidth_used);
029632fb
PZ
2131}
2132#else /* HAVE_JUMP_LABEL */
2133static bool cfs_bandwidth_used(void)
2134{
2135 return true;
2136}
2137
2138void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2139#endif /* HAVE_JUMP_LABEL */
2140
ab84d31e
PT
2141/*
2142 * default period for cfs group bandwidth.
2143 * default: 0.1s, units: nanoseconds
2144 */
2145static inline u64 default_cfs_period(void)
2146{
2147 return 100000000ULL;
2148}
ec12cb7f
PT
2149
2150static inline u64 sched_cfs_bandwidth_slice(void)
2151{
2152 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2153}
2154
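/*
 * Illustrative sketch, not part of fair.c: the quota/period/slice arithmetic
 * implied by the defaults above (100ms period, and what I believe is the 5ms
 * default value of sysctl_sched_cfs_bandwidth_slice). The helper is a
 * hypothetical stand-in, purely to make the units explicit.
 */
static unsigned long long max_cpu_share_permille(unsigned long long quota_ns,
						 unsigned long long period_ns)
{
	/* a group with 25ms of quota per 100ms period may use at most 250/1000 of a cpu */
	return quota_ns * 1000ULL / period_ns;
}
/*
 * With a 5ms slice, such a group tops up its per-cpu runtime in at most
 * 25ms / 5ms = 5 chunks before it runs dry and is throttled for the rest of
 * the period.
 */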
a9cf55b2
PT
2155/*
2156 * Replenish runtime according to assigned quota and update expiration time.
2157 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2158 * additional synchronization around rq->lock.
2159 *
2160 * requires cfs_b->lock
2161 */
029632fb 2162void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2
PT
2163{
2164 u64 now;
2165
2166 if (cfs_b->quota == RUNTIME_INF)
2167 return;
2168
2169 now = sched_clock_cpu(smp_processor_id());
2170 cfs_b->runtime = cfs_b->quota;
2171 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2172}
2173
029632fb
PZ
2174static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2175{
2176 return &tg->cfs_bandwidth;
2177}
2178
f1b17280
PT
2179/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2180static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2181{
2182 if (unlikely(cfs_rq->throttle_count))
2183 return cfs_rq->throttled_clock_task;
2184
78becc27 2185 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
f1b17280
PT
2186}
2187
85dac906
PT
2188/* returns 0 on failure to allocate runtime */
2189static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f
PT
2190{
2191 struct task_group *tg = cfs_rq->tg;
2192 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b2 2193 u64 amount = 0, min_amount, expires;
ec12cb7f
PT
2194
2195 /* note: this is a positive sum as runtime_remaining <= 0 */
2196 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2197
2198 raw_spin_lock(&cfs_b->lock);
2199 if (cfs_b->quota == RUNTIME_INF)
2200 amount = min_amount;
58088ad0 2201 else {
a9cf55b2
PT
2202 /*
2203 * If the bandwidth pool has become inactive, then at least one
2204 * period must have elapsed since the last consumption.
2205 * Refresh the global state and ensure bandwidth timer becomes
2206 * active.
2207 */
2208 if (!cfs_b->timer_active) {
2209 __refill_cfs_bandwidth_runtime(cfs_b);
58088ad0 2210 __start_cfs_bandwidth(cfs_b);
a9cf55b2 2211 }
58088ad0
PT
2212
2213 if (cfs_b->runtime > 0) {
2214 amount = min(cfs_b->runtime, min_amount);
2215 cfs_b->runtime -= amount;
2216 cfs_b->idle = 0;
2217 }
ec12cb7f 2218 }
a9cf55b2 2219 expires = cfs_b->runtime_expires;
ec12cb7f
PT
2220 raw_spin_unlock(&cfs_b->lock);
2221
2222 cfs_rq->runtime_remaining += amount;
a9cf55b2
PT
2223 /*
2224 * we may have advanced our local expiration to account for allowed
2225 * spread between our sched_clock and the one on which runtime was
2226 * issued.
2227 */
2228 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2229 cfs_rq->runtime_expires = expires;
85dac906
PT
2230
2231 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
2232}
2233
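/*
 * Illustrative sketch, not part of fair.c: how assign_cfs_rq_runtime() above
 * tops up a cfs_rq's local runtime from the global pool. The struct is a
 * hypothetical stand-in for the relevant cfs_b/cfs_rq fields, expiry
 * handling is omitted, and quota == ~0ULL plays the role of RUNTIME_INF.
 */
struct bw_model {
	unsigned long long quota;		/* cfs_b->quota */
	unsigned long long global_runtime;	/* cfs_b->runtime */
	long long local_runtime;		/* cfs_rq->runtime_remaining */
};

static int bw_assign_model(struct bw_model *m, unsigned long long slice_ns)
{
	/* a positive sum, since local_runtime <= 0 when we get here */
	unsigned long long want = slice_ns - m->local_runtime;
	unsigned long long amount = 0;

	if (m->quota == ~0ULL) {
		amount = want;			/* unconstrained group */
	} else if (m->global_runtime > 0) {
		amount = m->global_runtime < want ? m->global_runtime : want;
		m->global_runtime -= amount;
	}

	m->local_runtime += amount;
	return m->local_runtime > 0;		/* 0 tells the caller to throttle */
}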
a9cf55b2
PT
2234/*
2235 * Note: This depends on the synchronization provided by sched_clock and the
2236 * fact that rq->clock snapshots this value.
2237 */
2238static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f 2239{
a9cf55b2 2240 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
a9cf55b2
PT
2241
2242 /* if the deadline is ahead of our clock, nothing to do */
78becc27 2243 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
ec12cb7f
PT
2244 return;
2245
a9cf55b2
PT
2246 if (cfs_rq->runtime_remaining < 0)
2247 return;
2248
2249 /*
2250 * If the local deadline has passed we have to consider the
2251 * possibility that our sched_clock is 'fast' and the global deadline
2252 * has not truly expired.
2253 *
2254 * Fortunately we can determine whether this is the case by checking
2255 * whether the global deadline has advanced.
2256 */
2257
2258 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2259 /* extend local deadline, drift is bounded above by 2 ticks */
2260 cfs_rq->runtime_expires += TICK_NSEC;
2261 } else {
2262 /* global deadline is ahead, expiration has passed */
2263 cfs_rq->runtime_remaining = 0;
2264 }
2265}
2266
2267static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2268 unsigned long delta_exec)
2269{
2270 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 2271 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
2272 expire_cfs_rq_runtime(cfs_rq);
2273
2274 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
2275 return;
2276
85dac906
PT
2277 /*
2278 * if we're unable to extend our runtime we resched so that the active
2279 * hierarchy can be throttled
2280 */
2281 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2282 resched_task(rq_of(cfs_rq)->curr);
ec12cb7f
PT
2283}
2284
6c16a6dc
PZ
2285static __always_inline
2286void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
ec12cb7f 2287{
56f570e5 2288 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
2289 return;
2290
2291 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2292}
2293
85dac906
PT
2294static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2295{
56f570e5 2296 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
2297}
2298
64660c86
PT
2299/* check whether cfs_rq, or any parent, is throttled */
2300static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2301{
56f570e5 2302 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
2303}
2304
2305/*
2306 * Ensure that neither of the group entities corresponding to src_cpu or
2307 * dest_cpu are members of a throttled hierarchy when performing group
2308 * load-balance operations.
2309 */
2310static inline int throttled_lb_pair(struct task_group *tg,
2311 int src_cpu, int dest_cpu)
2312{
2313 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2314
2315 src_cfs_rq = tg->cfs_rq[src_cpu];
2316 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2317
2318 return throttled_hierarchy(src_cfs_rq) ||
2319 throttled_hierarchy(dest_cfs_rq);
2320}
2321
2322/* updated child weight may affect parent so we have to do this bottom up */
2323static int tg_unthrottle_up(struct task_group *tg, void *data)
2324{
2325 struct rq *rq = data;
2326 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2327
2328 cfs_rq->throttle_count--;
2329#ifdef CONFIG_SMP
2330 if (!cfs_rq->throttle_count) {
f1b17280 2331 /* adjust cfs_rq_clock_task() */
78becc27 2332 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
f1b17280 2333 cfs_rq->throttled_clock_task;
64660c86
PT
2334 }
2335#endif
2336
2337 return 0;
2338}
2339
2340static int tg_throttle_down(struct task_group *tg, void *data)
2341{
2342 struct rq *rq = data;
2343 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2344
82958366
PT
2345 /* group is entering throttled state, stop time */
2346 if (!cfs_rq->throttle_count)
78becc27 2347 cfs_rq->throttled_clock_task = rq_clock_task(rq);
64660c86
PT
2348 cfs_rq->throttle_count++;
2349
2350 return 0;
2351}
2352
d3d9dc33 2353static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
2354{
2355 struct rq *rq = rq_of(cfs_rq);
2356 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2357 struct sched_entity *se;
2358 long task_delta, dequeue = 1;
2359
2360 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2361
f1b17280 2362 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
2363 rcu_read_lock();
2364 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2365 rcu_read_unlock();
85dac906
PT
2366
2367 task_delta = cfs_rq->h_nr_running;
2368 for_each_sched_entity(se) {
2369 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2370 /* throttled entity or throttle-on-deactivate */
2371 if (!se->on_rq)
2372 break;
2373
2374 if (dequeue)
2375 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2376 qcfs_rq->h_nr_running -= task_delta;
2377
2378 if (qcfs_rq->load.weight)
2379 dequeue = 0;
2380 }
2381
2382 if (!se)
2383 rq->nr_running -= task_delta;
2384
2385 cfs_rq->throttled = 1;
78becc27 2386 cfs_rq->throttled_clock = rq_clock(rq);
85dac906
PT
2387 raw_spin_lock(&cfs_b->lock);
2388 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2389 raw_spin_unlock(&cfs_b->lock);
2390}
2391
029632fb 2392void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
2393{
2394 struct rq *rq = rq_of(cfs_rq);
2395 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2396 struct sched_entity *se;
2397 int enqueue = 1;
2398 long task_delta;
2399
22b958d8 2400 se = cfs_rq->tg->se[cpu_of(rq)];
671fd9da
PT
2401
2402 cfs_rq->throttled = 0;
1a55af2e
FW
2403
2404 update_rq_clock(rq);
2405
671fd9da 2406 raw_spin_lock(&cfs_b->lock);
78becc27 2407 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
671fd9da
PT
2408 list_del_rcu(&cfs_rq->throttled_list);
2409 raw_spin_unlock(&cfs_b->lock);
2410
64660c86
PT
2411 /* update hierarchical throttle state */
2412 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2413
671fd9da
PT
2414 if (!cfs_rq->load.weight)
2415 return;
2416
2417 task_delta = cfs_rq->h_nr_running;
2418 for_each_sched_entity(se) {
2419 if (se->on_rq)
2420 enqueue = 0;
2421
2422 cfs_rq = cfs_rq_of(se);
2423 if (enqueue)
2424 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2425 cfs_rq->h_nr_running += task_delta;
2426
2427 if (cfs_rq_throttled(cfs_rq))
2428 break;
2429 }
2430
2431 if (!se)
2432 rq->nr_running += task_delta;
2433
2434 /* determine whether we need to wake up potentially idle cpu */
2435 if (rq->curr == rq->idle && rq->cfs.nr_running)
2436 resched_task(rq->curr);
2437}
2438
2439static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2440 u64 remaining, u64 expires)
2441{
2442 struct cfs_rq *cfs_rq;
2443 u64 runtime = remaining;
2444
2445 rcu_read_lock();
2446 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2447 throttled_list) {
2448 struct rq *rq = rq_of(cfs_rq);
2449
2450 raw_spin_lock(&rq->lock);
2451 if (!cfs_rq_throttled(cfs_rq))
2452 goto next;
2453
2454 runtime = -cfs_rq->runtime_remaining + 1;
2455 if (runtime > remaining)
2456 runtime = remaining;
2457 remaining -= runtime;
2458
2459 cfs_rq->runtime_remaining += runtime;
2460 cfs_rq->runtime_expires = expires;
2461
2462 /* we check whether we're throttled above */
2463 if (cfs_rq->runtime_remaining > 0)
2464 unthrottle_cfs_rq(cfs_rq);
2465
2466next:
2467 raw_spin_unlock(&rq->lock);
2468
2469 if (!remaining)
2470 break;
2471 }
2472 rcu_read_unlock();
2473
2474 return remaining;
2475}
2476
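/*
 * Illustrative sketch, not part of fair.c: the distribution policy of
 * distribute_cfs_runtime() above, over an array of per-rq runtime deficits
 * instead of the throttled list. Each throttled cfs_rq gets just enough
 * runtime to reach +1ns, so that as many of them as possible can be
 * unthrottled from a limited pool. Array and parameter names are
 * hypothetical.
 */
static unsigned long long distribute_model(long long *deficits, int nr,
					    unsigned long long remaining)
{
	int i;

	for (i = 0; i < nr && remaining; i++) {
		unsigned long long runtime;

		if (deficits[i] > 0)		/* no longer throttled */
			continue;

		runtime = -deficits[i] + 1;	/* just enough to go positive */
		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;

		deficits[i] += runtime;		/* > 0 here means: unthrottle */
	}

	return remaining;
}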
58088ad0
PT
2477/*
2478 * Responsible for refilling a task_group's bandwidth and unthrottling its
2479 * cfs_rqs as appropriate. If there has been no activity within the last
2480 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2481 * used to track this state.
2482 */
2483static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2484{
671fd9da
PT
2485 u64 runtime, runtime_expires;
2486 int idle = 1, throttled;
58088ad0
PT
2487
2488 raw_spin_lock(&cfs_b->lock);
2489 /* no need to continue the timer with no bandwidth constraint */
2490 if (cfs_b->quota == RUNTIME_INF)
2491 goto out_unlock;
2492
671fd9da
PT
2493 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2494 /* idle depends on !throttled (for the case of a large deficit) */
2495 idle = cfs_b->idle && !throttled;
e8da1b18 2496 cfs_b->nr_periods += overrun;
671fd9da 2497
a9cf55b2
PT
2498 /* if we're going inactive then everything else can be deferred */
2499 if (idle)
2500 goto out_unlock;
2501
2502 __refill_cfs_bandwidth_runtime(cfs_b);
2503
671fd9da
PT
2504 if (!throttled) {
2505 /* mark as potentially idle for the upcoming period */
2506 cfs_b->idle = 1;
2507 goto out_unlock;
2508 }
2509
e8da1b18
NR
2510 /* account preceding periods in which throttling occurred */
2511 cfs_b->nr_throttled += overrun;
2512
671fd9da
PT
2513 /*
2514 * There are throttled entities so we must first use the new bandwidth
2515 * to unthrottle them before making it generally available. This
2516 * ensures that all existing debts will be paid before a new cfs_rq is
2517 * allowed to run.
2518 */
2519 runtime = cfs_b->runtime;
2520 runtime_expires = cfs_b->runtime_expires;
2521 cfs_b->runtime = 0;
2522
2523 /*
2524 * This check is repeated as we are holding onto the new bandwidth
2525 * while we unthrottle. This can potentially race with an unthrottled
2526 * group trying to acquire new bandwidth from the global pool.
2527 */
2528 while (throttled && runtime > 0) {
2529 raw_spin_unlock(&cfs_b->lock);
2530 /* we can't nest cfs_b->lock while distributing bandwidth */
2531 runtime = distribute_cfs_runtime(cfs_b, runtime,
2532 runtime_expires);
2533 raw_spin_lock(&cfs_b->lock);
2534
2535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2536 }
58088ad0 2537
671fd9da
PT
2538 /* return (any) remaining runtime */
2539 cfs_b->runtime = runtime;
2540 /*
2541 * While we are ensured activity in the period following an
2542 * unthrottle, this also covers the case in which the new bandwidth is
2543 * insufficient to cover the existing bandwidth deficit. (Forcing the
2544 * timer to remain active while there are any throttled entities.)
2545 */
2546 cfs_b->idle = 0;
58088ad0
PT
2547out_unlock:
2548 if (idle)
2549 cfs_b->timer_active = 0;
2550 raw_spin_unlock(&cfs_b->lock);
2551
2552 return idle;
2553}
d3d9dc33 2554
d8b4986d
PT
2555/* a cfs_rq won't donate quota below this amount */
2556static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2557/* minimum remaining period time to redistribute slack quota */
2558static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2559/* how long we wait to gather additional slack before distributing */
2560static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2561
2562/* are we near the end of the current quota period? */
2563static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2564{
2565 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2566 u64 remaining;
2567
2568 /* if the call-back is running a quota refresh is already occurring */
2569 if (hrtimer_callback_running(refresh_timer))
2570 return 1;
2571
2572 /* is a quota refresh about to occur? */
2573 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2574 if (remaining < min_expire)
2575 return 1;
2576
2577 return 0;
2578}
2579
2580static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2581{
2582 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2583
2584 /* if there's a quota refresh soon don't bother with slack */
2585 if (runtime_refresh_within(cfs_b, min_left))
2586 return;
2587
2588 start_bandwidth_timer(&cfs_b->slack_timer,
2589 ns_to_ktime(cfs_bandwidth_slack_period));
2590}
2591
2592/* we know any runtime found here is valid as update_curr() precedes return */
2593static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2594{
2595 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2596 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2597
2598 if (slack_runtime <= 0)
2599 return;
2600
2601 raw_spin_lock(&cfs_b->lock);
2602 if (cfs_b->quota != RUNTIME_INF &&
2603 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2604 cfs_b->runtime += slack_runtime;
2605
2606 /* we are under rq->lock, defer unthrottling using a timer */
2607 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2608 !list_empty(&cfs_b->throttled_cfs_rq))
2609 start_cfs_slack_bandwidth(cfs_b);
2610 }
2611 raw_spin_unlock(&cfs_b->lock);
2612
2613 /* even if it's not valid for return we don't want to try again */
2614 cfs_rq->runtime_remaining -= slack_runtime;
2615}
2616
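/*
 * Illustrative sketch, not part of fair.c: the slack-return rule in
 * __return_cfs_rq_runtime() above, ignoring the quota/expiry checks. When a
 * cfs_rq dequeues its last task it keeps min_cfs_rq_runtime (1ms) for a
 * quick restart and hands the rest back to the global pool; the names below
 * are hypothetical stand-ins.
 */
static long long return_slack_model(long long local_runtime_ns,
				    unsigned long long *global_runtime_ns)
{
	const long long min_keep = 1000000LL;		/* min_cfs_rq_runtime */
	long long slack = local_runtime_ns - min_keep;

	if (slack <= 0)
		return local_runtime_ns;		/* nothing to give back */

	*global_runtime_ns += slack;
	return local_runtime_ns - slack;		/* i.e. keep min_keep */
}
/*
 * e.g. a cfs_rq going idle with 4ms of local runtime returns 3ms to the pool
 * and keeps 1ms.
 */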
2617static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2618{
56f570e5
PT
2619 if (!cfs_bandwidth_used())
2620 return;
2621
fccfdc6f 2622 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
2623 return;
2624
2625 __return_cfs_rq_runtime(cfs_rq);
2626}
2627
2628/*
2629 * This is done with a timer (instead of inline with bandwidth return) since
2630 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2631 */
2632static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2633{
2634 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2635 u64 expires;
2636
2637 /* confirm we're still not at a refresh boundary */
2638 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2639 return;
2640
2641 raw_spin_lock(&cfs_b->lock);
2642 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2643 runtime = cfs_b->runtime;
2644 cfs_b->runtime = 0;
2645 }
2646 expires = cfs_b->runtime_expires;
2647 raw_spin_unlock(&cfs_b->lock);
2648
2649 if (!runtime)
2650 return;
2651
2652 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2653
2654 raw_spin_lock(&cfs_b->lock);
2655 if (expires == cfs_b->runtime_expires)
2656 cfs_b->runtime = runtime;
2657 raw_spin_unlock(&cfs_b->lock);
2658}
2659
d3d9dc33
PT
2660/*
2661 * When a group wakes up we want to make sure that its quota is not already
2662 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2663 * runtime, as update_curr() throttling cannot trigger until it's on-rq.
2664 */
2665static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2666{
56f570e5
PT
2667 if (!cfs_bandwidth_used())
2668 return;
2669
d3d9dc33
PT
2670 /* an active group must be handled by the update_curr()->put() path */
2671 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2672 return;
2673
2674 /* ensure the group is not already throttled */
2675 if (cfs_rq_throttled(cfs_rq))
2676 return;
2677
2678 /* update runtime allocation */
2679 account_cfs_rq_runtime(cfs_rq, 0);
2680 if (cfs_rq->runtime_remaining <= 0)
2681 throttle_cfs_rq(cfs_rq);
2682}
2683
2684/* conditionally throttle active cfs_rq's from put_prev_entity() */
2685static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2686{
56f570e5
PT
2687 if (!cfs_bandwidth_used())
2688 return;
2689
d3d9dc33
PT
2690 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2691 return;
2692
2693 /*
2694 * it's possible for a throttled entity to be forced into a running
2695 * state (e.g. set_curr_task); in this case we're finished.
2696 */
2697 if (cfs_rq_throttled(cfs_rq))
2698 return;
2699
2700 throttle_cfs_rq(cfs_rq);
2701}
029632fb 2702
029632fb
PZ
2703static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2704{
2705 struct cfs_bandwidth *cfs_b =
2706 container_of(timer, struct cfs_bandwidth, slack_timer);
2707 do_sched_cfs_slack_timer(cfs_b);
2708
2709 return HRTIMER_NORESTART;
2710}
2711
2712static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2713{
2714 struct cfs_bandwidth *cfs_b =
2715 container_of(timer, struct cfs_bandwidth, period_timer);
2716 ktime_t now;
2717 int overrun;
2718 int idle = 0;
2719
2720 for (;;) {
2721 now = hrtimer_cb_get_time(timer);
2722 overrun = hrtimer_forward(timer, now, cfs_b->period);
2723
2724 if (!overrun)
2725 break;
2726
2727 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2728 }
2729
2730 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2731}
2732
2733void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2734{
2735 raw_spin_lock_init(&cfs_b->lock);
2736 cfs_b->runtime = 0;
2737 cfs_b->quota = RUNTIME_INF;
2738 cfs_b->period = ns_to_ktime(default_cfs_period());
2739
2740 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2741 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2742 cfs_b->period_timer.function = sched_cfs_period_timer;
2743 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2744 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2745}
2746
2747static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2748{
2749 cfs_rq->runtime_enabled = 0;
2750 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2751}
2752
2753/* requires cfs_b->lock, may release to reprogram timer */
2754void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2755{
2756 /*
2757 * The timer may be active because we're trying to set a new bandwidth
2758 * period or because we're racing with the tear-down path
2759 * (timer_active==0 becomes visible before the hrtimer call-back
2760 * terminates). In either case we ensure that it's re-programmed.
2761 */
2762 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2763 raw_spin_unlock(&cfs_b->lock);
2764 /* ensure cfs_b->lock is available while we wait */
2765 hrtimer_cancel(&cfs_b->period_timer);
2766
2767 raw_spin_lock(&cfs_b->lock);
2768 /* if someone else restarted the timer then we're done */
2769 if (cfs_b->timer_active)
2770 return;
2771 }
2772
2773 cfs_b->timer_active = 1;
2774 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2775}
2776
2777static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2778{
2779 hrtimer_cancel(&cfs_b->period_timer);
2780 hrtimer_cancel(&cfs_b->slack_timer);
2781}
2782
38dc3348 2783static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb
PZ
2784{
2785 struct cfs_rq *cfs_rq;
2786
2787 for_each_leaf_cfs_rq(rq, cfs_rq) {
2788 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2789
2790 if (!cfs_rq->runtime_enabled)
2791 continue;
2792
2793 /*
2794 * clock_task is not advancing so we just need to make sure
2795 * there's some valid quota amount
2796 */
2797 cfs_rq->runtime_remaining = cfs_b->quota;
2798 if (cfs_rq_throttled(cfs_rq))
2799 unthrottle_cfs_rq(cfs_rq);
2800 }
2801}
2802
2803#else /* CONFIG_CFS_BANDWIDTH */
f1b17280
PT
2804static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2805{
78becc27 2806 return rq_clock_task(rq_of(cfs_rq));
f1b17280
PT
2807}
2808
2809static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2810 unsigned long delta_exec) {}
d3d9dc33
PT
2811static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2812static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6c16a6dc 2813static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
2814
2815static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2816{
2817 return 0;
2818}
64660c86
PT
2819
2820static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2821{
2822 return 0;
2823}
2824
2825static inline int throttled_lb_pair(struct task_group *tg,
2826 int src_cpu, int dest_cpu)
2827{
2828 return 0;
2829}
029632fb
PZ
2830
2831void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2832
2833#ifdef CONFIG_FAIR_GROUP_SCHED
2834static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
2835#endif
2836
029632fb
PZ
2837static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2838{
2839 return NULL;
2840}
2841static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
a4c96ae3 2842static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
2843
2844#endif /* CONFIG_CFS_BANDWIDTH */
2845
bf0f6f24
IM
2846/**************************************************
2847 * CFS operations on tasks:
2848 */
2849
8f4d37ec
PZ
2850#ifdef CONFIG_SCHED_HRTICK
2851static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2852{
8f4d37ec
PZ
2853 struct sched_entity *se = &p->se;
2854 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2855
2856 WARN_ON(task_rq(p) != rq);
2857
b39e66ea 2858 if (cfs_rq->nr_running > 1) {
8f4d37ec
PZ
2859 u64 slice = sched_slice(cfs_rq, se);
2860 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2861 s64 delta = slice - ran;
2862
2863 if (delta < 0) {
2864 if (rq->curr == p)
2865 resched_task(p);
2866 return;
2867 }
2868
2869 /*
2870 * Don't schedule slices shorter than 10000ns, that just
2871 * doesn't make sense. Rely on vruntime for fairness.
2872 */
31656519 2873 if (rq->curr != p)
157124c1 2874 delta = max_t(s64, 10000LL, delta);
8f4d37ec 2875
31656519 2876 hrtick_start(rq, delta);
8f4d37ec
PZ
2877 }
2878}
a4c2f00f
PZ
2879
2880/*
2881 * called from enqueue/dequeue and updates the hrtick when the
2882 * current task is from our class and nr_running is low enough
2883 * to matter.
2884 */
2885static void hrtick_update(struct rq *rq)
2886{
2887 struct task_struct *curr = rq->curr;
2888
b39e66ea 2889 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
2890 return;
2891
2892 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2893 hrtick_start_fair(rq, curr);
2894}
55e12e5e 2895#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
2896static inline void
2897hrtick_start_fair(struct rq *rq, struct task_struct *p)
2898{
2899}
a4c2f00f
PZ
2900
2901static inline void hrtick_update(struct rq *rq)
2902{
2903}
8f4d37ec
PZ
2904#endif
2905
bf0f6f24
IM
2906/*
2907 * The enqueue_task method is called before nr_running is
2908 * increased. Here we update the fair scheduling stats and
2909 * then put the task into the rbtree:
2910 */
ea87bb78 2911static void
371fd7e7 2912enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
2913{
2914 struct cfs_rq *cfs_rq;
62fb1851 2915 struct sched_entity *se = &p->se;
bf0f6f24
IM
2916
2917 for_each_sched_entity(se) {
62fb1851 2918 if (se->on_rq)
bf0f6f24
IM
2919 break;
2920 cfs_rq = cfs_rq_of(se);
88ec22d3 2921 enqueue_entity(cfs_rq, se, flags);
85dac906
PT
2922
2923 /*
2924 * end evaluation on encountering a throttled cfs_rq
2925 *
2926 * note: in the case of encountering a throttled cfs_rq we will
2927 * post the final h_nr_running increment below.
2928 */
2929 if (cfs_rq_throttled(cfs_rq))
2930 break;
953bfcd1 2931 cfs_rq->h_nr_running++;
85dac906 2932
88ec22d3 2933 flags = ENQUEUE_WAKEUP;
bf0f6f24 2934 }
8f4d37ec 2935
2069dd75 2936 for_each_sched_entity(se) {
0f317143 2937 cfs_rq = cfs_rq_of(se);
953bfcd1 2938 cfs_rq->h_nr_running++;
2069dd75 2939
85dac906
PT
2940 if (cfs_rq_throttled(cfs_rq))
2941 break;
2942
17bc14b7 2943 update_cfs_shares(cfs_rq);
9ee474f5 2944 update_entity_load_avg(se, 1);
2069dd75
PZ
2945 }
2946
18bf2805
BS
2947 if (!se) {
2948 update_rq_runnable_avg(rq, rq->nr_running);
85dac906 2949 inc_nr_running(rq);
18bf2805 2950 }
a4c2f00f 2951 hrtick_update(rq);
bf0f6f24
IM
2952}
2953
2f36825b
VP
2954static void set_next_buddy(struct sched_entity *se);
2955
bf0f6f24
IM
2956/*
2957 * The dequeue_task method is called before nr_running is
2958 * decreased. We remove the task from the rbtree and
2959 * update the fair scheduling stats:
2960 */
371fd7e7 2961static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
2962{
2963 struct cfs_rq *cfs_rq;
62fb1851 2964 struct sched_entity *se = &p->se;
2f36825b 2965 int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24
IM
2966
2967 for_each_sched_entity(se) {
2968 cfs_rq = cfs_rq_of(se);
371fd7e7 2969 dequeue_entity(cfs_rq, se, flags);
85dac906
PT
2970
2971 /*
2972 * end evaluation on encountering a throttled cfs_rq
2973 *
2974 * note: in the case of encountering a throttled cfs_rq we will
2975 * post the final h_nr_running decrement below.
2976 */
2977 if (cfs_rq_throttled(cfs_rq))
2978 break;
953bfcd1 2979 cfs_rq->h_nr_running--;
2069dd75 2980
bf0f6f24 2981 /* Don't dequeue parent if it has other entities besides us */
2f36825b
VP
2982 if (cfs_rq->load.weight) {
2983 /*
2984 * Bias pick_next to pick a task from this cfs_rq, as
2985 * p is sleeping when it is within its sched_slice.
2986 */
2987 if (task_sleep && parent_entity(se))
2988 set_next_buddy(parent_entity(se));
9598c82d
PT
2989
2990 /* avoid re-evaluating load for this entity */
2991 se = parent_entity(se);
bf0f6f24 2992 break;
2f36825b 2993 }
371fd7e7 2994 flags |= DEQUEUE_SLEEP;
bf0f6f24 2995 }
8f4d37ec 2996
2069dd75 2997 for_each_sched_entity(se) {
0f317143 2998 cfs_rq = cfs_rq_of(se);
953bfcd1 2999 cfs_rq->h_nr_running--;
2069dd75 3000
85dac906
PT
3001 if (cfs_rq_throttled(cfs_rq))
3002 break;
3003
17bc14b7 3004 update_cfs_shares(cfs_rq);
9ee474f5 3005 update_entity_load_avg(se, 1);
2069dd75
PZ
3006 }
3007
18bf2805 3008 if (!se) {
85dac906 3009 dec_nr_running(rq);
18bf2805
BS
3010 update_rq_runnable_avg(rq, 1);
3011 }
a4c2f00f 3012 hrtick_update(rq);
bf0f6f24
IM
3013}
3014
e7693a36 3015#ifdef CONFIG_SMP
029632fb
PZ
3016/* Used instead of source_load when we know the type == 0 */
3017static unsigned long weighted_cpuload(const int cpu)
3018{
b92486cb 3019 return cpu_rq(cpu)->cfs.runnable_load_avg;
029632fb
PZ
3020}
3021
3022/*
3023 * Return a low guess at the load of a migration-source cpu weighted
3024 * according to the scheduling class and "nice" value.
3025 *
3026 * We want to under-estimate the load of migration sources, to
3027 * balance conservatively.
3028 */
3029static unsigned long source_load(int cpu, int type)
3030{
3031 struct rq *rq = cpu_rq(cpu);
3032 unsigned long total = weighted_cpuload(cpu);
3033
3034 if (type == 0 || !sched_feat(LB_BIAS))
3035 return total;
3036
3037 return min(rq->cpu_load[type-1], total);
3038}
3039
3040/*
3041 * Return a high guess at the load of a migration-target cpu weighted
3042 * according to the scheduling class and "nice" value.
3043 */
3044static unsigned long target_load(int cpu, int type)
3045{
3046 struct rq *rq = cpu_rq(cpu);
3047 unsigned long total = weighted_cpuload(cpu);
3048
3049 if (type == 0 || !sched_feat(LB_BIAS))
3050 return total;
3051
3052 return max(rq->cpu_load[type-1], total);
3053}
3054
3055static unsigned long power_of(int cpu)
3056{
3057 return cpu_rq(cpu)->cpu_power;
3058}
3059
3060static unsigned long cpu_avg_load_per_task(int cpu)
3061{
3062 struct rq *rq = cpu_rq(cpu);
3063 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
b92486cb 3064 unsigned long load_avg = rq->cfs.runnable_load_avg;
029632fb
PZ
3065
3066 if (nr_running)
b92486cb 3067 return load_avg / nr_running;
029632fb
PZ
3068
3069 return 0;
3070}
3071
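/*
 * Illustrative sketch, not part of fair.c: the conservative biasing done by
 * source_load()/target_load() above, assuming type != 0 and LB_BIAS. The
 * migration source is under-estimated (min of the decayed cpu_load and the
 * instantaneous runnable load), the target is over-estimated (max), so the
 * balancer errs on the side of not moving tasks. Parameter names are
 * hypothetical.
 */
static unsigned long biased_source_load(unsigned long cpu_load_idx,	/* rq->cpu_load[type-1] */
					unsigned long runnable_load)	/* weighted_cpuload() */
{
	return cpu_load_idx < runnable_load ? cpu_load_idx : runnable_load;
}

static unsigned long biased_target_load(unsigned long cpu_load_idx,
					unsigned long runnable_load)
{
	return cpu_load_idx > runnable_load ? cpu_load_idx : runnable_load;
}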
62470419
MW
3072static void record_wakee(struct task_struct *p)
3073{
3074 /*
3075 * Rough decay (wiping) for cost saving, don't worry
3076 * about the boundary, really active task won't care
3077 * about the loss.
3078 */
3079 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3080 current->wakee_flips = 0;
3081 current->wakee_flip_decay_ts = jiffies;
3082 }
3083
3084 if (current->last_wakee != p) {
3085 current->last_wakee = p;
3086 current->wakee_flips++;
3087 }
3088}
098fb9db 3089
74f8e4b2 3090static void task_waking_fair(struct task_struct *p)
88ec22d3
PZ
3091{
3092 struct sched_entity *se = &p->se;
3093 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3fe1698b
PZ
3094 u64 min_vruntime;
3095
3096#ifndef CONFIG_64BIT
3097 u64 min_vruntime_copy;
88ec22d3 3098
3fe1698b
PZ
3099 do {
3100 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3101 smp_rmb();
3102 min_vruntime = cfs_rq->min_vruntime;
3103 } while (min_vruntime != min_vruntime_copy);
3104#else
3105 min_vruntime = cfs_rq->min_vruntime;
3106#endif
88ec22d3 3107
3fe1698b 3108 se->vruntime -= min_vruntime;
62470419 3109 record_wakee(p);
88ec22d3
PZ
3110}
3111
bb3469ac 3112#ifdef CONFIG_FAIR_GROUP_SCHED
f5bfb7d9
PZ
3113/*
3114 * effective_load() calculates the load change as seen from the root_task_group
3115 *
3116 * Adding load to a group doesn't make a group heavier, but can cause movement
3117 * of group shares between cpus. Assuming the shares were perfectly aligned one
3118 * can calculate the shift in shares.
cf5f0acf
PZ
3119 *
3120 * Calculate the effective load difference if @wl is added (subtracted) to @tg
3121 * on this @cpu and results in a total addition (subtraction) of @wg to the
3122 * total group weight.
3123 *
3124 * Given a runqueue weight distribution (rw_i) we can compute a shares
3125 * distribution (s_i) using:
3126 *
3127 * s_i = rw_i / \Sum rw_j (1)
3128 *
3129 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3130 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3131 * shares distribution (s_i):
3132 *
3133 * rw_i = { 2, 4, 1, 0 }
3134 * s_i = { 2/7, 4/7, 1/7, 0 }
3135 *
3136 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3137 * task used to run on and the CPU the waker is running on), we need to
3138 * compute the effect of waking a task on either CPU and, in case of a sync
3139 * wakeup, compute the effect of the current task going to sleep.
3140 *
3141 * So for a change of @wl to the local @cpu with an overall group weight change
3142 * of @wl we can compute the new shares distribution (s'_i) using:
3143 *
3144 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3145 *
3146 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3147 * differences in waking a task to CPU 0. The additional task changes the
3148 * weight and shares distributions like:
3149 *
3150 * rw'_i = { 3, 4, 1, 0 }
3151 * s'_i = { 3/8, 4/8, 1/8, 0 }
3152 *
3153 * We can then compute the difference in effective weight by using:
3154 *
3155 * dw_i = S * (s'_i - s_i) (3)
3156 *
3157 * Where 'S' is the group weight as seen by its parent.
3158 *
3159 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3160 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3161 * 4/7) times the weight of the group.
f5bfb7d9 3162 */
2069dd75 3163static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
bb3469ac 3164{
4be9daaa 3165 struct sched_entity *se = tg->se[cpu];
f1d239f7 3166
cf5f0acf 3167 if (!tg->parent) /* the trivial, non-cgroup case */
f1d239f7
PZ
3168 return wl;
3169
4be9daaa 3170 for_each_sched_entity(se) {
cf5f0acf 3171 long w, W;
4be9daaa 3172
977dda7c 3173 tg = se->my_q->tg;
bb3469ac 3174
cf5f0acf
PZ
3175 /*
3176 * W = @wg + \Sum rw_j
3177 */
3178 W = wg + calc_tg_weight(tg, se->my_q);
4be9daaa 3179
cf5f0acf
PZ
3180 /*
3181 * w = rw_i + @wl
3182 */
3183 w = se->my_q->load.weight + wl;
940959e9 3184
cf5f0acf
PZ
3185 /*
3186 * wl = S * s'_i; see (2)
3187 */
3188 if (W > 0 && w < W)
3189 wl = (w * tg->shares) / W;
977dda7c
PT
3190 else
3191 wl = tg->shares;
940959e9 3192
cf5f0acf
PZ
3193 /*
3194 * Per the above, wl is the new se->load.weight value; since
3195 * those are clipped to [MIN_SHARES, ...) do so now. See
3196 * calc_cfs_shares().
3197 */
977dda7c
PT
3198 if (wl < MIN_SHARES)
3199 wl = MIN_SHARES;
cf5f0acf
PZ
3200
3201 /*
3202 * wl = dw_i = S * (s'_i - s_i); see (3)
3203 */
977dda7c 3204 wl -= se->load.weight;
cf5f0acf
PZ
3205
3206 /*
3207 * Recursively apply this logic to all parent groups to compute
3208 * the final effective load change on the root group. Since
3209 * only the @tg group gets extra weight, all parent groups can
3210 * only redistribute existing shares. @wl is the shift in shares
3211 * resulting from this level per the above.
3212 */
4be9daaa 3213 wg = 0;
4be9daaa 3214 }
bb3469ac 3215
4be9daaa 3216 return wl;
bb3469ac
PZ
3217}
3218#else
4be9daaa 3219
83378269
PZ
3220static inline unsigned long effective_load(struct task_group *tg, int cpu,
3221 unsigned long wl, unsigned long wg)
4be9daaa 3222{
83378269 3223 return wl;
bb3469ac 3224}
4be9daaa 3225
bb3469ac
PZ
3226#endif
3227
62470419
MW
3228static int wake_wide(struct task_struct *p)
3229{
7d9ffa89 3230 int factor = this_cpu_read(sd_llc_size);
62470419
MW
3231
3232 /*
3233 * This is the switching frequency: a high flip count can mean many
3234 * distinct wakees or rapid switching. Using the LLC size as the factor
3235 * automatically adjusts the threshold, so a bigger node leads to more pull.
3236 */
3237 if (p->wakee_flips > factor) {
3238 /*
3239 * The wakee is somewhat hot and needs a certain amount of cpu
3240 * resource, so if the waker is far hotter, prefer to leave
3241 * it alone.
3242 */
3243 if (current->wakee_flips > (factor * p->wakee_flips))
3244 return 1;
3245 }
3246
3247 return 0;
3248}
3249
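/*
 * Illustrative sketch, not part of fair.c: wake_wide() above as a pure
 * predicate. 'factor' stands for the LLC size read through sd_llc_size, and
 * the flip counts are the roughly per-second decayed wakee_flips values kept
 * by record_wakee(); all names here are hypothetical stand-ins.
 */
static int wake_wide_model(unsigned int waker_flips, unsigned int wakee_flips,
			   unsigned int factor)
{
	/* the wakee switches partners often ... */
	if (wakee_flips > factor &&
	    /* ... and the waker switches partners far more often still */
	    waker_flips > factor * wakee_flips)
		return 1;	/* spread them: skip the affine wakeup */

	return 0;
}
/*
 * e.g. with an 8-cpu LLC (factor == 8), the affine path is only skipped when
 * the wakee has flipped more than 8 times since the last decay and the waker
 * has flipped more than 8 times as often as the wakee.
 */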
c88d5910 3250static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
098fb9db 3251{
e37b6a7b 3252 s64 this_load, load;
c88d5910 3253 int idx, this_cpu, prev_cpu;
098fb9db 3254 unsigned long tl_per_task;
c88d5910 3255 struct task_group *tg;
83378269 3256 unsigned long weight;
b3137bc8 3257 int balanced;
098fb9db 3258
62470419
MW
3259 /*
3260 * If we wake multiple tasks be careful to not bounce
3261 * ourselves around too much.
3262 */
3263 if (wake_wide(p))
3264 return 0;
3265
c88d5910
PZ
3266 idx = sd->wake_idx;
3267 this_cpu = smp_processor_id();
3268 prev_cpu = task_cpu(p);
3269 load = source_load(prev_cpu, idx);
3270 this_load = target_load(this_cpu, idx);
098fb9db 3271
b3137bc8
MG
3272 /*
3273 * If sync wakeup then subtract the (maximum possible)
3274 * effect of the currently running task from the load
3275 * of the current CPU:
3276 */
83378269
PZ
3277 if (sync) {
3278 tg = task_group(current);
3279 weight = current->se.load.weight;
3280
c88d5910 3281 this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269
PZ
3282 load += effective_load(tg, prev_cpu, 0, -weight);
3283 }
b3137bc8 3284
83378269
PZ
3285 tg = task_group(p);
3286 weight = p->se.load.weight;
b3137bc8 3287
71a29aa7
PZ
3288 /*
3289 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d5910
PZ
3290 * due to the sync cause above having dropped this_load to 0, we'll
3291 * always have an imbalance, but there's really nothing you can do
3292 * about that, so that's good too.
71a29aa7
PZ
3293 *
3294 * Otherwise check if either cpus are near enough in load to allow this
3295 * task to be woken on this_cpu.
3296 */
e37b6a7b
PT
3297 if (this_load > 0) {
3298 s64 this_eff_load, prev_eff_load;
e51fd5e2
PZ
3299
3300 this_eff_load = 100;
3301 this_eff_load *= power_of(prev_cpu);
3302 this_eff_load *= this_load +
3303 effective_load(tg, this_cpu, weight, weight);
3304
3305 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3306 prev_eff_load *= power_of(this_cpu);
3307 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3308
3309 balanced = this_eff_load <= prev_eff_load;
3310 } else
3311 balanced = true;
b3137bc8 3312
098fb9db 3313 /*
4ae7d5ce
IM
3314 * If the currently running task will sleep within
3315 * a reasonable amount of time then attract this newly
3316 * woken task:
098fb9db 3317 */
2fb7635c
PZ
3318 if (sync && balanced)
3319 return 1;
098fb9db 3320
41acab88 3321 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db
IM
3322 tl_per_task = cpu_avg_load_per_task(this_cpu);
3323
c88d5910
PZ
3324 if (balanced ||
3325 (this_load <= load &&
3326 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
098fb9db
IM
3327 /*
3328 * This domain has SD_WAKE_AFFINE and
3329 * p is cache cold in this domain, and
3330 * there is no bad imbalance.
3331 */
c88d5910 3332 schedstat_inc(sd, ttwu_move_affine);
41acab88 3333 schedstat_inc(p, se.statistics.nr_wakeups_affine);
098fb9db
IM
3334
3335 return 1;
3336 }
3337 return 0;
3338}
3339
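/*
 * Illustrative sketch, not part of fair.c: the "balanced" test in
 * wake_affine() above. The sync subtraction and the effective_load() terms
 * are assumed to already be folded into this_load and prev_load, and all
 * parameter names are hypothetical. Pulling the wakee is balanced when the
 * load it would see on this cpu, scaled by prev_cpu's power, does not exceed
 * prev_cpu's load scaled by this cpu's power plus half the imbalance margin.
 */
static int wake_affine_balanced_model(unsigned long long this_load,
				      unsigned long long prev_load,
				      unsigned long prev_cpu_power,
				      unsigned long this_cpu_power,
				      unsigned int imbalance_pct)	/* e.g. 125 */
{
	unsigned long long this_eff, prev_eff;

	if (this_load == 0)
		return 1;		/* idle this_cpu: always balanced */

	this_eff = 100ULL * prev_cpu_power * this_load;
	prev_eff = (100ULL + (imbalance_pct - 100) / 2) * this_cpu_power * prev_load;

	return this_eff <= prev_eff;
}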
aaee1203
PZ
3340/*
3341 * find_idlest_group finds and returns the least busy CPU group within the
3342 * domain.
3343 */
3344static struct sched_group *
78e7ed53 3345find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5158f4e4 3346 int this_cpu, int load_idx)
e7693a36 3347{
b3bd3de6 3348 struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203 3349 unsigned long min_load = ULONG_MAX, this_load = 0;
aaee1203 3350 int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a36 3351
aaee1203
PZ
3352 do {
3353 unsigned long load, avg_load;
3354 int local_group;
3355 int i;
e7693a36 3356
aaee1203
PZ
3357 /* Skip over this group if it has no CPUs allowed */
3358 if (!cpumask_intersects(sched_group_cpus(group),
fa17b507 3359 tsk_cpus_allowed(p)))
aaee1203
PZ
3360 continue;
3361
3362 local_group = cpumask_test_cpu(this_cpu,
3363 sched_group_cpus(group));
3364
3365 /* Tally up the load of all CPUs in the group */
3366 avg_load = 0;
3367
3368 for_each_cpu(i, sched_group_cpus(group)) {
3369 /* Bias balancing toward cpus of our domain */
3370 if (local_group)
3371 load = source_load(i, load_idx);
3372 else
3373 load = target_load(i, load_idx);
3374
3375 avg_load += load;
3376 }
3377
3378 /* Adjust by relative CPU power of the group */
9c3f75cb 3379 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
aaee1203
PZ
3380
3381 if (local_group) {
3382 this_load = avg_load;
aaee1203
PZ
3383 } else if (avg_load < min_load) {
3384 min_load = avg_load;
3385 idlest = group;
3386 }
3387 } while (group = group->next, group != sd->groups);
3388
3389 if (!idlest || 100*this_load < imbalance*min_load)
3390 return NULL;
3391 return idlest;
3392}
3393
3394/*
3395 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3396 */
3397static int
3398find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3399{
3400 unsigned long load, min_load = ULONG_MAX;
3401 int idlest = -1;
3402 int i;
3403
3404 /* Traverse only the allowed CPUs */
fa17b507 3405 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
aaee1203
PZ
3406 load = weighted_cpuload(i);
3407
3408 if (load < min_load || (load == min_load && i == this_cpu)) {
3409 min_load = load;
3410 idlest = i;
e7693a36
GH
3411 }
3412 }
3413
aaee1203
PZ
3414 return idlest;
3415}
e7693a36 3416
a50bde51
PZ
3417/*
3418 * Try and locate an idle CPU in the sched_domain.
3419 */
99bd5e2f 3420static int select_idle_sibling(struct task_struct *p, int target)
a50bde51 3421{
99bd5e2f 3422 struct sched_domain *sd;
37407ea7 3423 struct sched_group *sg;
e0a79f52 3424 int i = task_cpu(p);
a50bde51 3425
e0a79f52
MG
3426 if (idle_cpu(target))
3427 return target;
99bd5e2f
SS
3428
3429 /*
e0a79f52 3430 * If the previous cpu is cache affine and idle, don't be stupid.
99bd5e2f 3431 */
e0a79f52
MG
3432 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3433 return i;
a50bde51
PZ
3434
3435 /*
37407ea7 3436 * Otherwise, iterate the domains and find an eligible idle cpu.
a50bde51 3437 */
518cd623 3438 sd = rcu_dereference(per_cpu(sd_llc, target));
970e1789 3439 for_each_lower_domain(sd) {
37407ea7
LT
3440 sg = sd->groups;
3441 do {
3442 if (!cpumask_intersects(sched_group_cpus(sg),
3443 tsk_cpus_allowed(p)))
3444 goto next;
3445
3446 for_each_cpu(i, sched_group_cpus(sg)) {
e0a79f52 3447 if (i == target || !idle_cpu(i))
37407ea7
LT
3448 goto next;
3449 }
970e1789 3450
37407ea7
LT
3451 target = cpumask_first_and(sched_group_cpus(sg),
3452 tsk_cpus_allowed(p));
3453 goto done;
3454next:
3455 sg = sg->next;
3456 } while (sg != sd->groups);
3457 }
3458done:
a50bde51
PZ
3459 return target;
3460}
3461
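/*
 * Illustrative sketch, not part of fair.c: the search order implemented by
 * select_idle_sibling() above, expressed over a flat array of cpu idleness
 * flags instead of the sched_domain/sched_group topology. llc_cpus[] lists
 * the cpus sharing a last-level cache with 'target'; everything here is a
 * hypothetical stand-in.
 */
static int pick_idle_sibling_model(int target, int target_idle,
				   int prev, int prev_idle, int prev_affine,
				   const int *llc_cpus, const int *llc_idle,
				   int llc_nr)
{
	int i;

	/* 1) the balance target is already idle: use it */
	if (target_idle)
		return target;

	/* 2) the previous cpu is cache-affine and idle: reuse its warm cache */
	if (prev != target && prev_affine && prev_idle)
		return prev;

	/*
	 * 3) otherwise scan the last-level-cache domain. The real code walks
	 * it group by group and only picks from a group whose cpus are all
	 * idle; this flat scan just looks for any idle sibling.
	 */
	for (i = 0; i < llc_nr; i++) {
		if (llc_cpus[i] != target && llc_idle[i])
			return llc_cpus[i];
	}

	return target;			/* nothing better found; stay put */
}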
aaee1203
PZ
3462/*
3463 * select_task_rq_fair: balance the current task (running on cpu) in domains
3464 * that have the relevant sd_flag set. In practice, this is SD_BALANCE_WAKE,
3465 * SD_BALANCE_FORK and SD_BALANCE_EXEC.
3466 *
3467 * Balance, ie. select the least loaded group.
3468 *
3469 * Returns the target CPU number, or the same CPU if no balancing is needed.
3470 *
3471 * preempt must be disabled.
3472 */
0017d735 3473static int
7608dec2 3474select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
aaee1203 3475{
29cd8bae 3476 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d5910
PZ
3477 int cpu = smp_processor_id();
3478 int prev_cpu = task_cpu(p);
3479 int new_cpu = cpu;
99bd5e2f 3480 int want_affine = 0;
5158f4e4 3481 int sync = wake_flags & WF_SYNC;
c88d5910 3482
29baa747 3483 if (p->nr_cpus_allowed == 1)
76854c7e
MG
3484 return prev_cpu;
3485
0763a660 3486 if (sd_flag & SD_BALANCE_WAKE) {
fa17b507 3487 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
c88d5910
PZ
3488 want_affine = 1;
3489 new_cpu = prev_cpu;
3490 }
aaee1203 3491
dce840a0 3492 rcu_read_lock();
aaee1203 3493 for_each_domain(cpu, tmp) {
e4f42888
PZ
3494 if (!(tmp->flags & SD_LOAD_BALANCE))
3495 continue;
3496
fe3bcfe1 3497 /*
99bd5e2f
SS
3498 * If both cpu and prev_cpu are part of this domain,
3499 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 3500 */
99bd5e2f
SS
3501 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3502 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3503 affine_sd = tmp;
29cd8bae 3504 break;
f03542a7 3505 }
29cd8bae 3506
f03542a7 3507 if (tmp->flags & sd_flag)
29cd8bae
PZ
3508 sd = tmp;
3509 }
3510
8b911acd 3511 if (affine_sd) {
f03542a7 3512 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
dce840a0
PZ
3513 prev_cpu = cpu;
3514
3515 new_cpu = select_idle_sibling(p, prev_cpu);
3516 goto unlock;
8b911acd 3517 }
e7693a36 3518
aaee1203 3519 while (sd) {
5158f4e4 3520 int load_idx = sd->forkexec_idx;
aaee1203 3521 struct sched_group *group;
c88d5910 3522 int weight;
098fb9db 3523
0763a660 3524 if (!(sd->flags & sd_flag)) {
aaee1203
PZ
3525 sd = sd->child;
3526 continue;
3527 }
098fb9db 3528
5158f4e4
PZ
3529 if (sd_flag & SD_BALANCE_WAKE)
3530 load_idx = sd->wake_idx;
098fb9db 3531
5158f4e4 3532 group = find_idlest_group(sd, p, cpu, load_idx);
aaee1203
PZ
3533 if (!group) {
3534 sd = sd->child;
3535 continue;
3536 }
4ae7d5ce 3537
d7c33c49 3538 new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203
PZ
3539 if (new_cpu == -1 || new_cpu == cpu) {
3540 /* Now try balancing at a lower domain level of cpu */
3541 sd = sd->child;
3542 continue;
e7693a36 3543 }
aaee1203
PZ
3544
3545 /* Now try balancing at a lower domain level of new_cpu */
3546 cpu = new_cpu;
669c55e9 3547 weight = sd->span_weight;
aaee1203
PZ
3548 sd = NULL;
3549 for_each_domain(cpu, tmp) {
669c55e9 3550 if (weight <= tmp->span_weight)
aaee1203 3551 break;
0763a660 3552 if (tmp->flags & sd_flag)
aaee1203
PZ
3553 sd = tmp;
3554 }
3555 /* while loop will break here if sd == NULL */
e7693a36 3556 }
dce840a0
PZ
3557unlock:
3558 rcu_read_unlock();
e7693a36 3559
c88d5910 3560 return new_cpu;
e7693a36 3561}
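/*
 * Illustrative userspace sketch (not kernel code) of the fork/exec
 * balancing walk above: find_idlest_group() picks the least loaded
 * group at the current domain level, find_idlest_cpu() picks a cpu in
 * it, and the walk then continues one level down from that cpu.  The
 * two-group topology and the per-cpu loads below are invented.
 */
#include <stdio.h>

#define NR_CPUS 4

static unsigned long cpu_load[NR_CPUS] = { 900, 300, 650, 100 };

/* one domain level: two groups of two cpus each */
static const int groups[2][2] = { { 0, 1 }, { 2, 3 } };

static int idlest_cpu_in_group(const int *grp, int len)
{
	int i, best = grp[0];

	for (i = 1; i < len; i++)
		if (cpu_load[grp[i]] < cpu_load[best])
			best = grp[i];
	return best;
}

int main(void)
{
	unsigned long load0 = cpu_load[0] + cpu_load[1];	/* 1200 */
	unsigned long load1 = cpu_load[2] + cpu_load[3];	/*  750 */
	const int *idlest_group = (load0 <= load1) ? groups[0] : groups[1];

	/* group {2,3} wins, and cpu 3 (load 100) is the idlest within it */
	printf("child lands on cpu %d\n", idlest_cpu_in_group(idlest_group, 2));
	return 0;
}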
0a74bef8
PT
3562
3563/*
3564 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3565 * cfs_rq_of(p) references at time of call are still valid and identify the
3566 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3567 * other assumptions, including the state of rq->lock, should be made.
3568 */
3569static void
3570migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3571{
aff3e498
PT
3572 struct sched_entity *se = &p->se;
3573 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3574
3575 /*
3576 * Load tracking: accumulate removed load so that it can be processed
3577 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3578 * to blocked load iff they have a positive decay-count. It can never
3579 * be negative here since on-rq tasks have decay-count == 0.
3580 */
3581 if (se->avg.decay_count) {
3582 se->avg.decay_count = -__synchronize_entity_decay(se);
2509940f
AS
3583 atomic_long_add(se->avg.load_avg_contrib,
3584 &cfs_rq->removed_load);
aff3e498 3585 }
0a74bef8 3586}
e7693a36
GH
3587#endif /* CONFIG_SMP */
3588
e52fb7c0
PZ
3589static unsigned long
3590wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336
PZ
3591{
3592 unsigned long gran = sysctl_sched_wakeup_granularity;
3593
3594 /*
e52fb7c0
PZ
3595	 * Since it's curr that is running now, convert the gran from real-time
3596	 * to virtual-time in its units.
13814d42
MG
3597 *
3598	 * By using 'se' instead of 'curr' we penalize light tasks, so
3599	 * they get preempted more easily. That is, if 'se' < 'curr' then
3600	 * the resulting gran will be larger, therefore penalizing the
3601	 * lighter task; if, on the other hand, 'se' > 'curr' then the
3602	 * resulting gran will be smaller, again penalizing the lighter task.
3603 *
3604 * This is especially important for buddies when the leftmost
3605 * task is higher priority than the buddy.
0bbd3336 3606 */
f4ad9bd2 3607 return calc_delta_fair(gran, se);
0bbd3336
PZ
3608}
3609
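/*
 * Illustrative userspace sketch (not kernel code): how a wakeup
 * granularity expressed in nanoseconds becomes larger in the waking
 * task's virtual-time units when that task is light.  The weights
 * (1024 for nice 0, 335 for nice +5) follow prio_to_weight[]; the 1 ms
 * granularity is only an example value.
 */
#include <stdio.h>

#define NICE_0_LOAD 1024UL

static unsigned long virtual_gran(unsigned long gran_ns, unsigned long se_weight)
{
	/* mirrors the calc_delta_fair() scaling: delta * NICE_0_LOAD / weight */
	return gran_ns * NICE_0_LOAD / se_weight;
}

int main(void)
{
	unsigned long gran = 1000000UL;		/* 1 ms in ns */

	/* nice 0 waker: granularity unchanged */
	printf("nice  0: %lu ns\n", virtual_gran(gran, 1024));
	/* nice +5 waker (weight 335): needs roughly 3x the vruntime lead */
	printf("nice +5: %lu ns\n", virtual_gran(gran, 335));
	return 0;
}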
464b7527
PZ
3610/*
3611 * Should 'se' preempt 'curr'.
3612 *
3613 * |s1
3614 * |s2
3615 * |s3
3616 * g
3617 * |<--->|c
3618 *
3619 * w(c, s1) = -1
3620 * w(c, s2) = 0
3621 * w(c, s3) = 1
3622 *
3623 */
3624static int
3625wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3626{
3627 s64 gran, vdiff = curr->vruntime - se->vruntime;
3628
3629 if (vdiff <= 0)
3630 return -1;
3631
e52fb7c0 3632 gran = wakeup_gran(curr, se);
464b7527
PZ
3633 if (vdiff > gran)
3634 return 1;
3635
3636 return 0;
3637}
3638
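/*
 * Illustrative userspace sketch (not kernel code) of the three
 * outcomes pictured in the diagram above: with curr at vruntime 1000
 * and a granularity of 100, a waking entity that is ahead of curr
 * (s1), within one gran behind it (s2), or more than one gran behind
 * it (s3) yields -1, 0 and 1 respectively.  All numbers are invented.
 */
#include <stdio.h>

typedef long long s64;

static int preempt_decision(s64 curr_vruntime, s64 se_vruntime, s64 gran)
{
	s64 vdiff = curr_vruntime - se_vruntime;

	if (vdiff <= 0)
		return -1;		/* se already ahead: no preemption */
	if (vdiff > gran)
		return 1;		/* far enough behind: preempt */
	return 0;			/* within the granularity: leave curr alone */
}

int main(void)
{
	printf("s1: %d\n", preempt_decision(1000, 1100, 100));	/* -1 */
	printf("s2: %d\n", preempt_decision(1000,  950, 100));	/*  0 */
	printf("s3: %d\n", preempt_decision(1000,  800, 100));	/*  1 */
	return 0;
}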
02479099
PZ
3639static void set_last_buddy(struct sched_entity *se)
3640{
69c80f3e
VP
3641 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3642 return;
3643
3644 for_each_sched_entity(se)
3645 cfs_rq_of(se)->last = se;
02479099
PZ
3646}
3647
3648static void set_next_buddy(struct sched_entity *se)
3649{
69c80f3e
VP
3650 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3651 return;
3652
3653 for_each_sched_entity(se)
3654 cfs_rq_of(se)->next = se;
02479099
PZ
3655}
3656
ac53db59
RR
3657static void set_skip_buddy(struct sched_entity *se)
3658{
69c80f3e
VP
3659 for_each_sched_entity(se)
3660 cfs_rq_of(se)->skip = se;
ac53db59
RR
3661}
3662
bf0f6f24
IM
3663/*
3664 * Preempt the current task with a newly woken task if needed:
3665 */
5a9b86f6 3666static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
3667{
3668 struct task_struct *curr = rq->curr;
8651a86c 3669 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 3670 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 3671 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 3672 int next_buddy_marked = 0;
bf0f6f24 3673
4ae7d5ce
IM
3674 if (unlikely(se == pse))
3675 return;
3676
5238cdd3 3677 /*
ddcdf6e7 3678 * This is possible from callers such as move_task(), in which we
5238cdd3
PT
3679	 * unconditionally check_preempt_curr() after an enqueue (which may have
3680	 * led to a throttle). This both saves work and prevents false
3681 * next-buddy nomination below.
3682 */
3683 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3684 return;
3685
2f36825b 3686 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 3687 set_next_buddy(pse);
2f36825b
VP
3688 next_buddy_marked = 1;
3689 }
57fdc26d 3690
aec0a514
BR
3691 /*
3692 * We can come here with TIF_NEED_RESCHED already set from new task
3693 * wake up path.
5238cdd3
PT
3694 *
3695 * Note: this also catches the edge-case of curr being in a throttled
3696 * group (e.g. via set_curr_task), since update_curr() (in the
3697 * enqueue of curr) will have resulted in resched being set. This
3698 * prevents us from potentially nominating it as a false LAST_BUDDY
3699 * below.
aec0a514
BR
3700 */
3701 if (test_tsk_need_resched(curr))
3702 return;
3703
a2f5c9ab
DH
3704 /* Idle tasks are by definition preempted by non-idle tasks. */
3705 if (unlikely(curr->policy == SCHED_IDLE) &&
3706 likely(p->policy != SCHED_IDLE))
3707 goto preempt;
3708
91c234b4 3709 /*
a2f5c9ab
DH
3710 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3711 * is driven by the tick):
91c234b4 3712 */
8ed92e51 3713 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 3714 return;
bf0f6f24 3715
464b7527 3716 find_matching_se(&se, &pse);
9bbd7374 3717 update_curr(cfs_rq_of(se));
002f128b 3718 BUG_ON(!pse);
2f36825b
VP
3719 if (wakeup_preempt_entity(se, pse) == 1) {
3720 /*
3721 * Bias pick_next to pick the sched entity that is
3722 * triggering this preemption.
3723 */
3724 if (!next_buddy_marked)
3725 set_next_buddy(pse);
3a7e73a2 3726 goto preempt;
2f36825b 3727 }
464b7527 3728
3a7e73a2 3729 return;
a65ac745 3730
3a7e73a2
PZ
3731preempt:
3732 resched_task(curr);
3733 /*
3734 * Only set the backward buddy when the current task is still
3735 * on the rq. This can happen when a wakeup gets interleaved
3736 * with schedule on the ->pre_schedule() or idle_balance()
3737	 * point, either of which can drop the rq lock.
3738 *
3739 * Also, during early boot the idle thread is in the fair class,
3740	 * for obvious reasons it's a bad idea to schedule back to it.
3741 */
3742 if (unlikely(!se->on_rq || curr == rq->idle))
3743 return;
3744
3745 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3746 set_last_buddy(se);
bf0f6f24
IM
3747}
3748
fb8d4724 3749static struct task_struct *pick_next_task_fair(struct rq *rq)
bf0f6f24 3750{
8f4d37ec 3751 struct task_struct *p;
bf0f6f24
IM
3752 struct cfs_rq *cfs_rq = &rq->cfs;
3753 struct sched_entity *se;
3754
36ace27e 3755 if (!cfs_rq->nr_running)
bf0f6f24
IM
3756 return NULL;
3757
3758 do {
9948f4b2 3759 se = pick_next_entity(cfs_rq);
f4b6755f 3760 set_next_entity(cfs_rq, se);
bf0f6f24
IM
3761 cfs_rq = group_cfs_rq(se);
3762 } while (cfs_rq);
3763
8f4d37ec 3764 p = task_of(se);
b39e66ea
MG
3765 if (hrtick_enabled(rq))
3766 hrtick_start_fair(rq, p);
8f4d37ec
PZ
3767
3768 return p;
bf0f6f24
IM
3769}
3770
3771/*
3772 * Account for a descheduled task:
3773 */
31ee529c 3774static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
3775{
3776 struct sched_entity *se = &prev->se;
3777 struct cfs_rq *cfs_rq;
3778
3779 for_each_sched_entity(se) {
3780 cfs_rq = cfs_rq_of(se);
ab6cde26 3781 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
3782 }
3783}
3784
ac53db59
RR
3785/*
3786 * sched_yield() is very simple
3787 *
3788 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3789 */
3790static void yield_task_fair(struct rq *rq)
3791{
3792 struct task_struct *curr = rq->curr;
3793 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3794 struct sched_entity *se = &curr->se;
3795
3796 /*
3797 * Are we the only task in the tree?
3798 */
3799 if (unlikely(rq->nr_running == 1))
3800 return;
3801
3802 clear_buddies(cfs_rq, se);
3803
3804 if (curr->policy != SCHED_BATCH) {
3805 update_rq_clock(rq);
3806 /*
3807 * Update run-time statistics of the 'current'.
3808 */
3809 update_curr(cfs_rq);
916671c0
MG
3810 /*
3811 * Tell update_rq_clock() that we've just updated,
3812 * so we don't do microscopic update in schedule()
3813 * and double the fastpath cost.
3814 */
3815 rq->skip_clock_update = 1;
ac53db59
RR
3816 }
3817
3818 set_skip_buddy(se);
3819}
3820
d95f4122
MG
3821static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3822{
3823 struct sched_entity *se = &p->se;
3824
5238cdd3
PT
3825 /* throttled hierarchies are not runnable */
3826 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
3827 return false;
3828
3829 /* Tell the scheduler that we'd really like pse to run next. */
3830 set_next_buddy(se);
3831
d95f4122
MG
3832 yield_task_fair(rq);
3833
3834 return true;
3835}
3836
681f3e68 3837#ifdef CONFIG_SMP
bf0f6f24 3838/**************************************************
e9c84cb8
PZ
3839 * Fair scheduling class load-balancing methods.
3840 *
3841 * BASICS
3842 *
3843 * The purpose of load-balancing is to achieve the same basic fairness the
3844 * per-cpu scheduler provides, namely provide a proportional amount of compute
3845 * time to each task. This is expressed in the following equation:
3846 *
3847 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3848 *
3849 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3850 * W_i,0 is defined as:
3851 *
3852 * W_i,0 = \Sum_j w_i,j (2)
3853 *
3854 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3855 * is derived from the nice value as per prio_to_weight[].
3856 *
3857 * The weight average is an exponential decay average of the instantaneous
3858 * weight:
3859 *
3860 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3861 *
3862 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3863 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3864 * can also include other factors [XXX].
3865 *
3866 * To achieve this balance we define a measure of imbalance which follows
3867 * directly from (1):
3868 *
3869 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3870 *
3871 * We then move tasks around to minimize the imbalance. In the continuous
3872 * function space it is obvious this converges; in the discrete case we get
3873 * a few fun cases generally called infeasible weight scenarios.
3874 *
3875 * [XXX expand on:
3876 * - infeasible weights;
3877 * - local vs global optima in the discrete case. ]
3878 *
3879 *
3880 * SCHED DOMAINS
3881 *
3882 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3883 * for all i,j solution, we create a tree of cpus that follows the hardware
3884 * topology where each level pairs two lower groups (or better). This results
3885 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3886 * tree to only the first of the previous level and we decrease the frequency
3887 * of load-balance at each level inv. proportional to the number of cpus in
3888 * the groups.
3889 *
3890 * This yields:
3891 *
3892 * log_2 n 1 n
3893 * \Sum { --- * --- * 2^i } = O(n) (5)
3894 * i = 0 2^i 2^i
3895 * `- size of each group
3896 * | | `- number of cpus doing load-balance
3897 * | `- freq
3898 * `- sum over all levels
3899 *
3900 * Coupled with a limit on how many tasks we can migrate every balance pass,
3901 * this makes (5) the runtime complexity of the balancer.
3902 *
3903 * An important property here is that each CPU is still (indirectly) connected
3904 * to every other cpu in at most O(log n) steps:
3905 *
3906 * The adjacency matrix of the resulting graph is given by:
3907 *
3908 * log_2 n
3909 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3910 * k = 0
3911 *
3912 * And you'll find that:
3913 *
3914 * A^(log_2 n)_i,j != 0 for all i,j (7)
3915 *
3916 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3917 * The task movement gives a factor of O(m), giving a convergence complexity
3918 * of:
3919 *
3920 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3921 *
3922 *
3923 * WORK CONSERVING
3924 *
3925 * In order to avoid CPUs going idle while there's still work to do, new idle
3926 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3927 * tree itself instead of relying on other CPUs to bring it work.
3928 *
3929 * This adds some complexity to both (5) and (8) but it reduces the total idle
3930 * time.
3931 *
3932 * [XXX more?]
3933 *
3934 *
3935 * CGROUPS
3936 *
3937 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3938 *
3939 * s_k,i
3940 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3941 * S_k
3942 *
3943 * Where
3944 *
3945 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3946 *
3947 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3948 *
3949 * The big problem is S_k; it's a global sum needed to compute a local (W_i)
3950 * property.
3951 *
3952 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3953 * rewrite all of this once again.]
3954 */
bf0f6f24 3955
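/*
 * Illustrative userspace sketch (not kernel code) of equations (2)-(4)
 * above, with invented numbers: cpu i runs three nice-0 tasks and cpu j
 * runs one, both with a compute capacity of one "power" unit, and the
 * weight average takes a single decay step with n = 3.
 */
#include <stdio.h>

int main(void)
{
	/* (2) instantaneous weights: 1024 per nice-0 task */
	double w_i = 3 * 1024.0, w_j = 1 * 1024.0;
	double p_i = 1.0, p_j = 1.0;

	/* (3) one step of the decay average with n = 3: 7/8 old + 1/8 new */
	double w_i_prev = 2048.0;
	double w_i_avg = (7.0 / 8.0) * w_i_prev + (1.0 / 8.0) * w_i;

	/* (4) imbalance between cpu i and cpu j */
	double avg = (w_i / p_i + w_j / p_j) / 2.0;
	double hi  = (w_i / p_i > avg) ? w_i / p_i : avg;
	double lo  = (w_j / p_j < avg) ? w_j / p_j : avg;

	printf("W'_i = %.0f, imb_i,j = %.0f\n", w_i_avg, hi - lo);
	return 0;
}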
ed387b78
HS
3956static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3957
ddcdf6e7 3958#define LBF_ALL_PINNED 0x01
367456c7 3959#define LBF_NEED_BREAK 0x02
6263322c
PZ
3960#define LBF_DST_PINNED 0x04
3961#define LBF_SOME_PINNED 0x08
ddcdf6e7
PZ
3962
3963struct lb_env {
3964 struct sched_domain *sd;
3965
ddcdf6e7 3966 struct rq *src_rq;
85c1e7da 3967 int src_cpu;
ddcdf6e7
PZ
3968
3969 int dst_cpu;
3970 struct rq *dst_rq;
3971
88b8dac0
SV
3972 struct cpumask *dst_grpmask;
3973 int new_dst_cpu;
ddcdf6e7 3974 enum cpu_idle_type idle;
bd939f45 3975 long imbalance;
b9403130
MW
3976 /* The set of CPUs under consideration for load-balancing */
3977 struct cpumask *cpus;
3978
ddcdf6e7 3979 unsigned int flags;
367456c7
PZ
3980
3981 unsigned int loop;
3982 unsigned int loop_break;
3983 unsigned int loop_max;
ddcdf6e7
PZ
3984};
3985
1e3c88bd 3986/*
ddcdf6e7 3987 * move_task - move a task from one runqueue to another runqueue.
1e3c88bd
PZ
3988 * Both runqueues must be locked.
3989 */
ddcdf6e7 3990static void move_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 3991{
ddcdf6e7
PZ
3992 deactivate_task(env->src_rq, p, 0);
3993 set_task_cpu(p, env->dst_cpu);
3994 activate_task(env->dst_rq, p, 0);
3995 check_preempt_curr(env->dst_rq, p, 0);
1e3c88bd
PZ
3996}
3997
029632fb
PZ
3998/*
3999 * Is this task likely cache-hot:
4000 */
4001static int
4002task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
4003{
4004 s64 delta;
4005
4006 if (p->sched_class != &fair_sched_class)
4007 return 0;
4008
4009 if (unlikely(p->policy == SCHED_IDLE))
4010 return 0;
4011
4012 /*
4013 * Buddy candidates are cache hot:
4014 */
4015 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4016 (&p->se == cfs_rq_of(&p->se)->next ||
4017 &p->se == cfs_rq_of(&p->se)->last))
4018 return 1;
4019
4020 if (sysctl_sched_migration_cost == -1)
4021 return 1;
4022 if (sysctl_sched_migration_cost == 0)
4023 return 0;
4024
4025 delta = now - p->se.exec_start;
4026
4027 return delta < (s64)sysctl_sched_migration_cost;
4028}
4029
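/*
 * Illustrative userspace sketch (not kernel code) of the final check in
 * task_hot() above: a task whose last run ended within
 * sysctl_sched_migration_cost nanoseconds of "now" is treated as cache
 * hot.  The 0.5 ms cost used here is only an example setting.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool roughly_cache_hot(int64_t now_ns, int64_t exec_start_ns,
			      int64_t migration_cost_ns)
{
	return (now_ns - exec_start_ns) < migration_cost_ns;
}

int main(void)
{
	int64_t cost = 500000;		/* example: 0.5 ms */

	/* ran 100 us ago: hot (1); ran 2 ms ago: cold (0) */
	printf("%d %d\n", roughly_cache_hot(10000000, 9900000, cost),
			  roughly_cache_hot(10000000, 8000000, cost));
	return 0;
}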
1e3c88bd
PZ
4030/*
4031 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
4032 */
4033static
8e45cb54 4034int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd
PZ
4035{
4036 int tsk_cache_hot = 0;
4037 /*
4038 * We do not migrate tasks that are:
d3198084 4039 * 1) throttled_lb_pair, or
1e3c88bd 4040 * 2) cannot be migrated to this CPU due to cpus_allowed, or
d3198084
JK
4041 * 3) running (obviously), or
4042 * 4) are cache-hot on their current CPU.
1e3c88bd 4043 */
d3198084
JK
4044 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4045 return 0;
4046
ddcdf6e7 4047 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
e02e60c1 4048 int cpu;
88b8dac0 4049
41acab88 4050 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
88b8dac0 4051
6263322c
PZ
4052 env->flags |= LBF_SOME_PINNED;
4053
88b8dac0
SV
4054 /*
4055 * Remember if this task can be migrated to any other cpu in
4056 * our sched_group. We may want to revisit it if we couldn't
4057 * meet load balance goals by pulling other tasks on src_cpu.
4058 *
4059 * Also avoid computing new_dst_cpu if we have already computed
4060 * one in current iteration.
4061 */
6263322c 4062 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
88b8dac0
SV
4063 return 0;
4064
e02e60c1
JK
4065		/* Prevent re-selecting dst_cpu via env's cpus */
4066 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4067 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6263322c 4068 env->flags |= LBF_DST_PINNED;
e02e60c1
JK
4069 env->new_dst_cpu = cpu;
4070 break;
4071 }
88b8dac0 4072 }
e02e60c1 4073
1e3c88bd
PZ
4074 return 0;
4075 }
88b8dac0
SV
4076
4077	/* Record that we found at least one task that could run on dst_cpu */
8e45cb54 4078 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 4079
ddcdf6e7 4080 if (task_running(env->src_rq, p)) {
41acab88 4081 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1e3c88bd
PZ
4082 return 0;
4083 }
4084
4085 /*
4086 * Aggressive migration if:
4087 * 1) task is cache cold, or
4088 * 2) too many balance attempts have failed.
4089 */
4090
78becc27 4091 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
1e3c88bd 4092 if (!tsk_cache_hot ||
8e45cb54 4093 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4e2dcb73 4094
1e3c88bd 4095 if (tsk_cache_hot) {
8e45cb54 4096 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
41acab88 4097 schedstat_inc(p, se.statistics.nr_forced_migrations);
1e3c88bd 4098 }
4e2dcb73 4099
1e3c88bd
PZ
4100 return 1;
4101 }
4102
4e2dcb73
ZH
4103 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4104 return 0;
1e3c88bd
PZ
4105}
4106
897c395f
PZ
4107/*
4108 * move_one_task tries to move exactly one task from busiest to this_rq, as
4109 * part of active balancing operations within "domain".
4110 * Returns 1 if successful and 0 otherwise.
4111 *
4112 * Called with both runqueues locked.
4113 */
8e45cb54 4114static int move_one_task(struct lb_env *env)
897c395f
PZ
4115{
4116 struct task_struct *p, *n;
897c395f 4117
367456c7 4118 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
367456c7
PZ
4119 if (!can_migrate_task(p, env))
4120 continue;
897c395f 4121
367456c7
PZ
4122 move_task(p, env);
4123 /*
4124 * Right now, this is only the second place move_task()
4125 * is called, so we can safely collect move_task()
4126 * stats here rather than inside move_task().
4127 */
4128 schedstat_inc(env->sd, lb_gained[env->idle]);
4129 return 1;
897c395f 4130 }
897c395f
PZ
4131 return 0;
4132}
4133
367456c7
PZ
4134static unsigned long task_h_load(struct task_struct *p);
4135
eb95308e
PZ
4136static const unsigned int sched_nr_migrate_break = 32;
4137
5d6523eb 4138/*
bd939f45 4139 * move_tasks tries to move up to imbalance weighted load from busiest to
5d6523eb
PZ
4140 * this_rq, as part of a balancing operation within domain "sd".
4141 * Returns 1 if successful and 0 otherwise.
4142 *
4143 * Called with both runqueues locked.
4144 */
4145static int move_tasks(struct lb_env *env)
1e3c88bd 4146{
5d6523eb
PZ
4147 struct list_head *tasks = &env->src_rq->cfs_tasks;
4148 struct task_struct *p;
367456c7
PZ
4149 unsigned long load;
4150 int pulled = 0;
1e3c88bd 4151
bd939f45 4152 if (env->imbalance <= 0)
5d6523eb 4153 return 0;
1e3c88bd 4154
5d6523eb
PZ
4155 while (!list_empty(tasks)) {
4156 p = list_first_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 4157
367456c7
PZ
4158 env->loop++;
4159 /* We've more or less seen every task there is, call it quits */
5d6523eb 4160 if (env->loop > env->loop_max)
367456c7 4161 break;
5d6523eb
PZ
4162
4163 /* take a breather every nr_migrate tasks */
367456c7 4164 if (env->loop > env->loop_break) {
eb95308e 4165 env->loop_break += sched_nr_migrate_break;
8e45cb54 4166 env->flags |= LBF_NEED_BREAK;
ee00e66f 4167 break;
a195f004 4168 }
1e3c88bd 4169
d3198084 4170 if (!can_migrate_task(p, env))
367456c7
PZ
4171 goto next;
4172
4173 load = task_h_load(p);
5d6523eb 4174
eb95308e 4175 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
367456c7
PZ
4176 goto next;
4177
bd939f45 4178 if ((load / 2) > env->imbalance)
367456c7 4179 goto next;
1e3c88bd 4180
ddcdf6e7 4181 move_task(p, env);
ee00e66f 4182 pulled++;
bd939f45 4183 env->imbalance -= load;
1e3c88bd
PZ
4184
4185#ifdef CONFIG_PREEMPT
ee00e66f
PZ
4186 /*
4187 * NEWIDLE balancing is a source of latency, so preemptible
4188 * kernels will stop after the first task is pulled to minimize
4189 * the critical section.
4190 */
5d6523eb 4191 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 4192 break;
1e3c88bd
PZ
4193#endif
4194
ee00e66f
PZ
4195 /*
4196 * We only want to steal up to the prescribed amount of
4197 * weighted load.
4198 */
bd939f45 4199 if (env->imbalance <= 0)
ee00e66f 4200 break;
367456c7
PZ
4201
4202 continue;
4203next:
5d6523eb 4204 list_move_tail(&p->se.group_node, tasks);
1e3c88bd 4205 }
5d6523eb 4206
1e3c88bd 4207 /*
ddcdf6e7
PZ
4208 * Right now, this is one of only two places move_task() is called,
4209 * so we can safely collect move_task() stats here rather than
4210 * inside move_task().
1e3c88bd 4211 */
8e45cb54 4212 schedstat_add(env->sd, lb_gained[env->idle], pulled);
1e3c88bd 4213
5d6523eb 4214 return pulled;
1e3c88bd
PZ
4215}
4216
230059de 4217#ifdef CONFIG_FAIR_GROUP_SCHED
9e3081ca
PZ
4218/*
4219 * update tg->load_weight by folding this cpu's load_avg
4220 */
48a16753 4221static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
9e3081ca 4222{
48a16753
PT
4223 struct sched_entity *se = tg->se[cpu];
4224 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
9e3081ca 4225
48a16753
PT
4226 /* throttled entities do not contribute to load */
4227 if (throttled_hierarchy(cfs_rq))
4228 return;
9e3081ca 4229
aff3e498 4230 update_cfs_rq_blocked_load(cfs_rq, 1);
9e3081ca 4231
82958366
PT
4232 if (se) {
4233 update_entity_load_avg(se, 1);
4234 /*
4235 * We pivot on our runnable average having decayed to zero for
4236 * list removal. This generally implies that all our children
4237 * have also been removed (modulo rounding error or bandwidth
4238 * control); however, such cases are rare and we can fix these
4239 * at enqueue.
4240 *
4241 * TODO: fix up out-of-order children on enqueue.
4242 */
4243 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4244 list_del_leaf_cfs_rq(cfs_rq);
4245 } else {
48a16753 4246 struct rq *rq = rq_of(cfs_rq);
82958366
PT
4247 update_rq_runnable_avg(rq, rq->nr_running);
4248 }
9e3081ca
PZ
4249}
4250
48a16753 4251static void update_blocked_averages(int cpu)
9e3081ca 4252{
9e3081ca 4253 struct rq *rq = cpu_rq(cpu);
48a16753
PT
4254 struct cfs_rq *cfs_rq;
4255 unsigned long flags;
9e3081ca 4256
48a16753
PT
4257 raw_spin_lock_irqsave(&rq->lock, flags);
4258 update_rq_clock(rq);
9763b67f
PZ
4259 /*
4260 * Iterates the task_group tree in a bottom up fashion, see
4261 * list_add_leaf_cfs_rq() for details.
4262 */
64660c86 4263 for_each_leaf_cfs_rq(rq, cfs_rq) {
48a16753
PT
4264 /*
4265 * Note: We may want to consider periodically releasing
4266		 * rq->lock around these updates so that creating many task
4267 * groups does not result in continually extending hold time.
4268 */
4269 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
64660c86 4270 }
48a16753
PT
4271
4272 raw_spin_unlock_irqrestore(&rq->lock, flags);
9e3081ca
PZ
4273}
4274
9763b67f 4275/*
68520796 4276 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9763b67f
PZ
4277 * This needs to be done in a top-down fashion because the load of a child
4278 * group is a fraction of its parent's load.
4279 */
68520796 4280static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9763b67f 4281{
68520796
VD
4282 struct rq *rq = rq_of(cfs_rq);
4283 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
a35b6466 4284 unsigned long now = jiffies;
68520796 4285 unsigned long load;
a35b6466 4286
68520796 4287 if (cfs_rq->last_h_load_update == now)
a35b6466
PZ
4288 return;
4289
68520796
VD
4290 cfs_rq->h_load_next = NULL;
4291 for_each_sched_entity(se) {
4292 cfs_rq = cfs_rq_of(se);
4293 cfs_rq->h_load_next = se;
4294 if (cfs_rq->last_h_load_update == now)
4295 break;
4296 }
a35b6466 4297
68520796 4298 if (!se) {
7e3115ef 4299 cfs_rq->h_load = cfs_rq->runnable_load_avg;
68520796
VD
4300 cfs_rq->last_h_load_update = now;
4301 }
4302
4303 while ((se = cfs_rq->h_load_next) != NULL) {
4304 load = cfs_rq->h_load;
4305 load = div64_ul(load * se->avg.load_avg_contrib,
4306 cfs_rq->runnable_load_avg + 1);
4307 cfs_rq = group_cfs_rq(se);
4308 cfs_rq->h_load = load;
4309 cfs_rq->last_h_load_update = now;
4310 }
9763b67f
PZ
4311}
4312
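/*
 * Illustrative userspace sketch (not kernel code) of the top-down
 * h_load computation above and of task_h_load(): each level scales the
 * parent's h_load by the child entity's share of the parent's runnable
 * load.  The group and task contributions below are invented numbers.
 */
#include <stdio.h>

int main(void)
{
	/* root cfs_rq: h_load is simply its runnable load */
	unsigned long long root_runnable = 2048, root_h_load = 2048;

	/* a group entity contributing 512 of those 2048 */
	unsigned long long grp_contrib = 512;
	unsigned long long grp_h_load =
		root_h_load * grp_contrib / (root_runnable + 1);

	/* a task contributing 256 of the group's 512 of runnable load */
	unsigned long long grp_runnable = 512, task_contrib = 256;
	unsigned long long task_load =
		task_contrib * grp_h_load / (grp_runnable + 1);

	printf("group h_load=%llu, task h_load=%llu\n", grp_h_load, task_load);
	return 0;
}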
367456c7 4313static unsigned long task_h_load(struct task_struct *p)
230059de 4314{
367456c7 4315 struct cfs_rq *cfs_rq = task_cfs_rq(p);
230059de 4316
68520796 4317 update_cfs_rq_h_load(cfs_rq);
a003a25b
AS
4318 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4319 cfs_rq->runnable_load_avg + 1);
230059de
PZ
4320}
4321#else
48a16753 4322static inline void update_blocked_averages(int cpu)
9e3081ca
PZ
4323{
4324}
4325
367456c7 4326static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 4327{
a003a25b 4328 return p->se.avg.load_avg_contrib;
1e3c88bd 4329}
230059de 4330#endif
1e3c88bd 4331
1e3c88bd 4332/********** Helpers for find_busiest_group ************************/
1e3c88bd
PZ
4333/*
4334 * sg_lb_stats - stats of a sched_group required for load_balancing
4335 */
4336struct sg_lb_stats {
4337 unsigned long avg_load; /*Avg load across the CPUs of the group */
4338 unsigned long group_load; /* Total load over the CPUs of the group */
1e3c88bd 4339 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
56cf515b 4340 unsigned long load_per_task;
3ae11c90 4341 unsigned long group_power;
147c5fc2
PZ
4342 unsigned int sum_nr_running; /* Nr tasks running in the group */
4343 unsigned int group_capacity;
4344 unsigned int idle_cpus;
4345 unsigned int group_weight;
1e3c88bd 4346 int group_imb; /* Is there an imbalance in the group ? */
fab47622 4347 int group_has_capacity; /* Is there extra capacity in the group? */
1e3c88bd
PZ
4348};
4349
56cf515b
JK
4350/*
4351 * sd_lb_stats - Structure to store the statistics of a sched_domain
4352 * during load balancing.
4353 */
4354struct sd_lb_stats {
4355 struct sched_group *busiest; /* Busiest group in this sd */
4356 struct sched_group *local; /* Local group in this sd */
4357 unsigned long total_load; /* Total load of all groups in sd */
4358 unsigned long total_pwr; /* Total power of all groups in sd */
4359 unsigned long avg_load; /* Average load across all groups in sd */
4360
56cf515b 4361 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
147c5fc2 4362 struct sg_lb_stats local_stat; /* Statistics of the local group */
56cf515b
JK
4363};
4364
147c5fc2
PZ
4365static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4366{
4367 /*
4368 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
4369 * local_stat because update_sg_lb_stats() does a full clear/assignment.
4370 * We must however clear busiest_stat::avg_load because
4371 * update_sd_pick_busiest() reads this before assignment.
4372 */
4373 *sds = (struct sd_lb_stats){
4374 .busiest = NULL,
4375 .local = NULL,
4376 .total_load = 0UL,
4377 .total_pwr = 0UL,
4378 .busiest_stat = {
4379 .avg_load = 0UL,
4380 },
4381 };
4382}
4383
1e3c88bd
PZ
4384/**
4385 * get_sd_load_idx - Obtain the load index for a given sched domain.
4386 * @sd: The sched_domain whose load_idx is to be obtained.
4387 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
e69f6186
YB
4388 *
4389 * Return: The load index.
1e3c88bd
PZ
4390 */
4391static inline int get_sd_load_idx(struct sched_domain *sd,
4392 enum cpu_idle_type idle)
4393{
4394 int load_idx;
4395
4396 switch (idle) {
4397 case CPU_NOT_IDLE:
4398 load_idx = sd->busy_idx;
4399 break;
4400
4401 case CPU_NEWLY_IDLE:
4402 load_idx = sd->newidle_idx;
4403 break;
4404 default:
4405 load_idx = sd->idle_idx;
4406 break;
4407 }
4408
4409 return load_idx;
4410}
4411
15f803c9 4412static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
1e3c88bd 4413{
1399fa78 4414 return SCHED_POWER_SCALE;
1e3c88bd
PZ
4415}
4416
4417unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4418{
4419 return default_scale_freq_power(sd, cpu);
4420}
4421
15f803c9 4422static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
1e3c88bd 4423{
669c55e9 4424 unsigned long weight = sd->span_weight;
1e3c88bd
PZ
4425 unsigned long smt_gain = sd->smt_gain;
4426
4427 smt_gain /= weight;
4428
4429 return smt_gain;
4430}
4431
4432unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4433{
4434 return default_scale_smt_power(sd, cpu);
4435}
4436
15f803c9 4437static unsigned long scale_rt_power(int cpu)
1e3c88bd
PZ
4438{
4439 struct rq *rq = cpu_rq(cpu);
b654f7de 4440 u64 total, available, age_stamp, avg;
1e3c88bd 4441
b654f7de
PZ
4442 /*
4443 * Since we're reading these variables without serialization make sure
4444 * we read them once before doing sanity checks on them.
4445 */
4446 age_stamp = ACCESS_ONCE(rq->age_stamp);
4447 avg = ACCESS_ONCE(rq->rt_avg);
4448
78becc27 4449 total = sched_avg_period() + (rq_clock(rq) - age_stamp);
aa483808 4450
b654f7de 4451 if (unlikely(total < avg)) {
aa483808
VP
4452 /* Ensures that power won't end up being negative */
4453 available = 0;
4454 } else {
b654f7de 4455 available = total - avg;
aa483808 4456 }
1e3c88bd 4457
1399fa78
NR
4458 if (unlikely((s64)total < SCHED_POWER_SCALE))
4459 total = SCHED_POWER_SCALE;
1e3c88bd 4460
1399fa78 4461 total >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4462
4463 return div_u64(available, total);
4464}
4465
4466static void update_cpu_power(struct sched_domain *sd, int cpu)
4467{
669c55e9 4468 unsigned long weight = sd->span_weight;
1399fa78 4469 unsigned long power = SCHED_POWER_SCALE;
1e3c88bd
PZ
4470 struct sched_group *sdg = sd->groups;
4471
1e3c88bd
PZ
4472 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4473 if (sched_feat(ARCH_POWER))
4474 power *= arch_scale_smt_power(sd, cpu);
4475 else
4476 power *= default_scale_smt_power(sd, cpu);
4477
1399fa78 4478 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4479 }
4480
9c3f75cb 4481 sdg->sgp->power_orig = power;
9d5efe05
SV
4482
4483 if (sched_feat(ARCH_POWER))
4484 power *= arch_scale_freq_power(sd, cpu);
4485 else
4486 power *= default_scale_freq_power(sd, cpu);
4487
1399fa78 4488 power >>= SCHED_POWER_SHIFT;
9d5efe05 4489
1e3c88bd 4490 power *= scale_rt_power(cpu);
1399fa78 4491 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4492
4493 if (!power)
4494 power = 1;
4495
e51fd5e2 4496 cpu_rq(cpu)->cpu_power = power;
9c3f75cb 4497 sdg->sgp->power = power;
1e3c88bd
PZ
4498}
4499
029632fb 4500void update_group_power(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
4501{
4502 struct sched_domain *child = sd->child;
4503 struct sched_group *group, *sdg = sd->groups;
863bffc8 4504 unsigned long power, power_orig;
4ec4412e
VG
4505 unsigned long interval;
4506
4507 interval = msecs_to_jiffies(sd->balance_interval);
4508 interval = clamp(interval, 1UL, max_load_balance_interval);
4509 sdg->sgp->next_update = jiffies + interval;
1e3c88bd
PZ
4510
4511 if (!child) {
4512 update_cpu_power(sd, cpu);
4513 return;
4514 }
4515
863bffc8 4516 power_orig = power = 0;
1e3c88bd 4517
74a5ce20
PZ
4518 if (child->flags & SD_OVERLAP) {
4519 /*
4520 * SD_OVERLAP domains cannot assume that child groups
4521 * span the current group.
4522 */
4523
863bffc8
PZ
4524 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4525 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
4526
4527 power_orig += sg->sgp->power_orig;
4528 power += sg->sgp->power;
4529 }
74a5ce20
PZ
4530 } else {
4531 /*
4532 * !SD_OVERLAP domains can assume that child groups
4533 * span the current group.
4534 */
4535
4536 group = child->groups;
4537 do {
863bffc8 4538 power_orig += group->sgp->power_orig;
74a5ce20
PZ
4539 power += group->sgp->power;
4540 group = group->next;
4541 } while (group != child->groups);
4542 }
1e3c88bd 4543
863bffc8
PZ
4544 sdg->sgp->power_orig = power_orig;
4545 sdg->sgp->power = power;
1e3c88bd
PZ
4546}
4547
9d5efe05
SV
4548/*
4549 * Try and fix up capacity for tiny siblings, this is needed when
4550 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4551 * which on its own isn't powerful enough.
4552 *
4553 * See update_sd_pick_busiest() and check_asym_packing().
4554 */
4555static inline int
4556fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4557{
4558 /*
1399fa78 4559 * Only siblings can have significantly less than SCHED_POWER_SCALE
9d5efe05 4560 */
a6c75f2f 4561 if (!(sd->flags & SD_SHARE_CPUPOWER))
9d5efe05
SV
4562 return 0;
4563
4564 /*
4565 * If ~90% of the cpu_power is still there, we're good.
4566 */
9c3f75cb 4567 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
9d5efe05
SV
4568 return 1;
4569
4570 return 0;
4571}
4572
30ce5dab
PZ
4573/*
4574 * Group imbalance indicates (and tries to solve) the problem where balancing
4575 * groups is inadequate due to tsk_cpus_allowed() constraints.
4576 *
4577 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
4578 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
4579 * Something like:
4580 *
4581 * { 0 1 2 3 } { 4 5 6 7 }
4582 * * * * *
4583 *
4584 * If we were to balance group-wise we'd place two tasks in the first group and
4585 * two tasks in the second group. Clearly this is undesired as it will overload
4586 * cpu 3 and leave one of the cpus in the second group unused.
4587 *
4588 * The current solution to this issue is detecting the skew in the first group
6263322c
PZ
4589 * by noticing the lower domain failed to reach balance and had difficulty
4590 * moving tasks due to affinity constraints.
30ce5dab
PZ
4591 *
4592 * When this is so detected, this group becomes a candidate for busiest; see
4593 * update_sd_pick_busiest(). And calculate_imbalance() and
6263322c 4594 * find_busiest_group() avoid some of the usual balance conditions to allow it
30ce5dab
PZ
4595 * to create an effective group imbalance.
4596 *
4597 * This is a somewhat tricky proposition since the next run might not find the
4598 * group imbalance and decide the groups need to be balanced again. A most
4599 * subtle and fragile situation.
4600 */
4601
6263322c 4602static inline int sg_imbalanced(struct sched_group *group)
30ce5dab 4603{
6263322c 4604 return group->sgp->imbalance;
30ce5dab
PZ
4605}
4606
b37d9316
PZ
4607/*
4608 * Compute the group capacity.
4609 *
c61037e9
PZ
4610 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
4611 * first dividing out the smt factor and computing the actual number of cores
4612 * and then limiting the capacity (in power units) with that.
b37d9316
PZ
4613 */
4614static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4615{
c61037e9
PZ
4616 unsigned int capacity, smt, cpus;
4617 unsigned int power, power_orig;
4618
4619 power = group->sgp->power;
4620 power_orig = group->sgp->power_orig;
4621 cpus = group->group_weight;
b37d9316 4622
c61037e9
PZ
4623 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4624 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4625 capacity = cpus / smt; /* cores */
b37d9316 4626
c61037e9 4627 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
b37d9316
PZ
4628 if (!capacity)
4629 capacity = fix_small_capacity(env->sd, group);
4630
4631 return capacity;
4632}
4633
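/*
 * Illustrative userspace sketch (not kernel code) of the phantom-core
 * problem sg_capacity() above avoids.  Assume a group of 8 SMT-2 cores
 * whose per-core power_orig is 1178 (the usual smt_gain default in this
 * era): rounding the group power straight to SCHED_POWER_SCALE units
 * would report 9 "cores", while dividing out the SMT factor first
 * gives the real 8.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE	1024U
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 16;			/* 8 cores x 2 threads */
	unsigned int power_orig = 8 * 1178;	/* 9424 */
	unsigned int power = power_orig;	/* no rt pressure in this example */

	unsigned int naive = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
	unsigned int capacity = cpus / smt;	/* whole cores */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("naive=%u corrected=%u\n", naive, capacity);	/* 9 vs 8 */
	return 0;
}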
1e3c88bd
PZ
4634/**
4635 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 4636 * @env: The load balancing environment.
1e3c88bd 4637 * @group: sched_group whose statistics are to be updated.
1e3c88bd 4638 * @load_idx: Load index of sched_domain of this_cpu for load calc.
1e3c88bd 4639 * @local_group: Does group contain this_cpu.
1e3c88bd
PZ
4640 * @sgs: variable to hold the statistics for this group.
4641 */
bd939f45
PZ
4642static inline void update_sg_lb_stats(struct lb_env *env,
4643 struct sched_group *group, int load_idx,
23f0d209 4644 int local_group, struct sg_lb_stats *sgs)
1e3c88bd 4645{
30ce5dab
PZ
4646 unsigned long nr_running;
4647 unsigned long load;
bd939f45 4648 int i;
1e3c88bd 4649
b72ff13c
PZ
4650 memset(sgs, 0, sizeof(*sgs));
4651
b9403130 4652 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
1e3c88bd
PZ
4653 struct rq *rq = cpu_rq(i);
4654
e44bc5c5
PZ
4655 nr_running = rq->nr_running;
4656
1e3c88bd 4657 /* Bias balancing toward cpus of our domain */
6263322c 4658 if (local_group)
04f733b4 4659 load = target_load(i, load_idx);
6263322c 4660 else
1e3c88bd 4661 load = source_load(i, load_idx);
1e3c88bd
PZ
4662
4663 sgs->group_load += load;
e44bc5c5 4664 sgs->sum_nr_running += nr_running;
1e3c88bd 4665 sgs->sum_weighted_load += weighted_cpuload(i);
aae6d3dd
SS
4666 if (idle_cpu(i))
4667 sgs->idle_cpus++;
1e3c88bd
PZ
4668 }
4669
1e3c88bd 4670 /* Adjust by relative CPU power of the group */
3ae11c90
PZ
4671 sgs->group_power = group->sgp->power;
4672 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
1e3c88bd 4673
dd5feea1 4674 if (sgs->sum_nr_running)
38d0f770 4675 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bd 4676
aae6d3dd 4677 sgs->group_weight = group->group_weight;
fab47622 4678
b37d9316
PZ
4679 sgs->group_imb = sg_imbalanced(group);
4680 sgs->group_capacity = sg_capacity(env, group);
4681
fab47622
NR
4682 if (sgs->group_capacity > sgs->sum_nr_running)
4683 sgs->group_has_capacity = 1;
1e3c88bd
PZ
4684}
4685
532cb4c4
MN
4686/**
4687 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 4688 * @env: The load balancing environment.
532cb4c4
MN
4689 * @sds: sched_domain statistics
4690 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 4691 * @sgs: sched_group statistics
532cb4c4
MN
4692 *
4693 * Determine if @sg is a busier group than the previously selected
4694 * busiest group.
e69f6186
YB
4695 *
4696 * Return: %true if @sg is a busier group than the previously selected
4697 * busiest group. %false otherwise.
532cb4c4 4698 */
bd939f45 4699static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
4700 struct sd_lb_stats *sds,
4701 struct sched_group *sg,
bd939f45 4702 struct sg_lb_stats *sgs)
532cb4c4 4703{
56cf515b 4704 if (sgs->avg_load <= sds->busiest_stat.avg_load)
532cb4c4
MN
4705 return false;
4706
4707 if (sgs->sum_nr_running > sgs->group_capacity)
4708 return true;
4709
4710 if (sgs->group_imb)
4711 return true;
4712
4713 /*
4714 * ASYM_PACKING needs to move all the work to the lowest
4715 * numbered CPUs in the group, therefore mark all groups
4716 * higher than ourself as busy.
4717 */
bd939f45
PZ
4718 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4719 env->dst_cpu < group_first_cpu(sg)) {
532cb4c4
MN
4720 if (!sds->busiest)
4721 return true;
4722
4723 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4724 return true;
4725 }
4726
4727 return false;
4728}
4729
1e3c88bd 4730/**
461819ac 4731 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 4732 * @env: The load balancing environment.
1e3c88bd
PZ
4733 * @balance: Should we balance.
4734 * @sds: variable to hold the statistics for this sched_domain.
4735 */
bd939f45 4736static inline void update_sd_lb_stats(struct lb_env *env,
23f0d209 4737 struct sd_lb_stats *sds)
1e3c88bd 4738{
bd939f45
PZ
4739 struct sched_domain *child = env->sd->child;
4740 struct sched_group *sg = env->sd->groups;
56cf515b 4741 struct sg_lb_stats tmp_sgs;
1e3c88bd
PZ
4742 int load_idx, prefer_sibling = 0;
4743
4744 if (child && child->flags & SD_PREFER_SIBLING)
4745 prefer_sibling = 1;
4746
bd939f45 4747 load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bd
PZ
4748
4749 do {
56cf515b 4750 struct sg_lb_stats *sgs = &tmp_sgs;
1e3c88bd
PZ
4751 int local_group;
4752
bd939f45 4753 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
56cf515b
JK
4754 if (local_group) {
4755 sds->local = sg;
4756 sgs = &sds->local_stat;
b72ff13c
PZ
4757
4758 if (env->idle != CPU_NEWLY_IDLE ||
4759 time_after_eq(jiffies, sg->sgp->next_update))
4760 update_group_power(env->sd, env->dst_cpu);
56cf515b 4761 }
1e3c88bd 4762
56cf515b 4763 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
1e3c88bd 4764
b72ff13c
PZ
4765 if (local_group)
4766 goto next_group;
4767
1e3c88bd
PZ
4768 /*
4769 * In case the child domain prefers tasks go to siblings
532cb4c4 4770 * first, lower the sg capacity to one so that we'll try
75dd321d
NR
4771 * and move all the excess tasks away. We lower the capacity
4772 * of a group only if the local group has the capacity to fit
4773 * these excess tasks, i.e. nr_running < group_capacity. The
4774 * extra check prevents the case where you always pull from the
4775		 * heaviest group when it is already under-utilized (possible
4776		 * when a large-weight task outweighs the tasks on the system).
1e3c88bd 4777 */
b72ff13c
PZ
4778 if (prefer_sibling && sds->local &&
4779 sds->local_stat.group_has_capacity)
147c5fc2 4780 sgs->group_capacity = min(sgs->group_capacity, 1U);
1e3c88bd 4781
b72ff13c 4782 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
532cb4c4 4783 sds->busiest = sg;
56cf515b 4784 sds->busiest_stat = *sgs;
1e3c88bd
PZ
4785 }
4786
b72ff13c
PZ
4787next_group:
4788 /* Now, start updating sd_lb_stats */
4789 sds->total_load += sgs->group_load;
4790 sds->total_pwr += sgs->group_power;
4791
532cb4c4 4792 sg = sg->next;
bd939f45 4793 } while (sg != env->sd->groups);
532cb4c4
MN
4794}
4795
532cb4c4
MN
4796/**
4797 * check_asym_packing - Check to see if the group is packed into the
4798 * sched domain.
4799 *
4800 * This is primarily intended to be used at the sibling level. Some
4801 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4802 * case of POWER7, it can move to lower SMT modes only when higher
4803 * threads are idle. When in lower SMT modes, the threads will
4804 * perform better since they share less core resources. Hence when we
4805 * have idle threads, we want them to be the higher ones.
4806 *
4807 * This packing function is run on idle threads. It checks to see if
4808 * the busiest CPU in this domain (core in the P7 case) has a higher
4809 * CPU number than the packing function is being run on. Here we are
4810 * assuming a lower CPU number will be equivalent to a lower SMT thread
4811 * number.
4812 *
e69f6186 4813 * Return: 1 when packing is required and a task should be moved to
b6b12294
MN
4814 * this CPU. The amount of the imbalance is returned in env->imbalance.
4815 *
cd96891d 4816 * @env: The load balancing environment.
532cb4c4 4817 * @sds: Statistics of the sched_domain which is to be packed
532cb4c4 4818 */
bd939f45 4819static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c4
MN
4820{
4821 int busiest_cpu;
4822
bd939f45 4823 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
4824 return 0;
4825
4826 if (!sds->busiest)
4827 return 0;
4828
4829 busiest_cpu = group_first_cpu(sds->busiest);
bd939f45 4830 if (env->dst_cpu > busiest_cpu)
532cb4c4
MN
4831 return 0;
4832
bd939f45 4833 env->imbalance = DIV_ROUND_CLOSEST(
3ae11c90
PZ
4834 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
4835 SCHED_POWER_SCALE);
bd939f45 4836
532cb4c4 4837 return 1;
1e3c88bd
PZ
4838}
4839
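/*
 * Illustrative userspace sketch (not kernel code) of the packing rule
 * check_asym_packing() above implements: an idle low-numbered SMT
 * thread pulls work from a busier higher-numbered sibling, never the
 * other way around.  The cpu numbers, load and power are invented.
 */
#include <stdbool.h>
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static bool should_pack(int dst_cpu, int busiest_first_cpu,
			unsigned long busiest_avg_load,
			unsigned long busiest_power,
			unsigned long *imbalance)
{
	if (dst_cpu > busiest_first_cpu)
		return false;		/* only pack toward lower cpu numbers */

	/* scale the group's avg_load back to plain load units, rounded */
	*imbalance = (busiest_avg_load * busiest_power + SCHED_POWER_SCALE / 2)
			/ SCHED_POWER_SCALE;
	return true;
}

int main(void)
{
	unsigned long imb = 0;

	/* cpu 0 is idle, cpu 1 (power 589) runs everything: pull it over */
	if (should_pack(0, 1, 1536, 589, &imb))
		printf("pack onto cpu 0, imbalance=%lu\n", imb);
	return 0;
}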
4840/**
4841 * fix_small_imbalance - Calculate the minor imbalance that exists
4842 * amongst the groups of a sched_domain, during
4843 * load balancing.
cd96891d 4844 * @env: The load balancing environment.
1e3c88bd 4845 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 4846 */
bd939f45
PZ
4847static inline
4848void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd
PZ
4849{
4850 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4851 unsigned int imbn = 2;
dd5feea1 4852 unsigned long scaled_busy_load_per_task;
56cf515b 4853 struct sg_lb_stats *local, *busiest;
1e3c88bd 4854
56cf515b
JK
4855 local = &sds->local_stat;
4856 busiest = &sds->busiest_stat;
1e3c88bd 4857
56cf515b
JK
4858 if (!local->sum_nr_running)
4859 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4860 else if (busiest->load_per_task > local->load_per_task)
4861 imbn = 1;
dd5feea1 4862
56cf515b
JK
4863 scaled_busy_load_per_task =
4864 (busiest->load_per_task * SCHED_POWER_SCALE) /
3ae11c90 4865 busiest->group_power;
56cf515b 4866
3029ede3
VD
4867 if (busiest->avg_load + scaled_busy_load_per_task >=
4868 local->avg_load + (scaled_busy_load_per_task * imbn)) {
56cf515b 4869 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
4870 return;
4871 }
4872
4873 /*
4874 * OK, we don't have enough imbalance to justify moving tasks,
4875 * however we may be able to increase total CPU power used by
4876 * moving them.
4877 */
4878
3ae11c90 4879 pwr_now += busiest->group_power *
56cf515b 4880 min(busiest->load_per_task, busiest->avg_load);
3ae11c90 4881 pwr_now += local->group_power *
56cf515b 4882 min(local->load_per_task, local->avg_load);
1399fa78 4883 pwr_now /= SCHED_POWER_SCALE;
1e3c88bd
PZ
4884
4885 /* Amount of load we'd subtract */
56cf515b 4886 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
3ae11c90 4887 busiest->group_power;
56cf515b 4888 if (busiest->avg_load > tmp) {
3ae11c90 4889 pwr_move += busiest->group_power *
56cf515b
JK
4890 min(busiest->load_per_task,
4891 busiest->avg_load - tmp);
4892 }
1e3c88bd
PZ
4893
4894 /* Amount of load we'd add */
3ae11c90 4895 if (busiest->avg_load * busiest->group_power <
56cf515b 4896 busiest->load_per_task * SCHED_POWER_SCALE) {
3ae11c90
PZ
4897 tmp = (busiest->avg_load * busiest->group_power) /
4898 local->group_power;
56cf515b
JK
4899 } else {
4900 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
3ae11c90 4901 local->group_power;
56cf515b 4902 }
3ae11c90
PZ
4903 pwr_move += local->group_power *
4904 min(local->load_per_task, local->avg_load + tmp);
1399fa78 4905 pwr_move /= SCHED_POWER_SCALE;
1e3c88bd
PZ
4906
4907 /* Move if we gain throughput */
4908 if (pwr_move > pwr_now)
56cf515b 4909 env->imbalance = busiest->load_per_task;
1e3c88bd
PZ
4910}
4911
4912/**
4913 * calculate_imbalance - Calculate the amount of imbalance present within the
4914 * groups of a given sched_domain during load balance.
bd939f45 4915 * @env: load balance environment
1e3c88bd 4916 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 4917 */
bd939f45 4918static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 4919{
dd5feea1 4920 unsigned long max_pull, load_above_capacity = ~0UL;
56cf515b
JK
4921 struct sg_lb_stats *local, *busiest;
4922
4923 local = &sds->local_stat;
56cf515b 4924 busiest = &sds->busiest_stat;
dd5feea1 4925
56cf515b 4926 if (busiest->group_imb) {
30ce5dab
PZ
4927 /*
4928 * In the group_imb case we cannot rely on group-wide averages
4929 * to ensure cpu-load equilibrium, look at wider averages. XXX
4930 */
56cf515b
JK
4931 busiest->load_per_task =
4932 min(busiest->load_per_task, sds->avg_load);
dd5feea1
SS
4933 }
4934
1e3c88bd
PZ
4935 /*
4936 * In the presence of smp nice balancing, certain scenarios can have
4937	 * max load less than avg load (as we skip the groups at or below
4938	 * its cpu_power while calculating max_load).
4939 */
b1885550
VD
4940 if (busiest->avg_load <= sds->avg_load ||
4941 local->avg_load >= sds->avg_load) {
bd939f45
PZ
4942 env->imbalance = 0;
4943 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
4944 }
4945
56cf515b 4946 if (!busiest->group_imb) {
dd5feea1
SS
4947 /*
4948 * Don't want to pull so many tasks that a group would go idle.
30ce5dab
PZ
4949 * Except of course for the group_imb case, since then we might
4950 * have to drop below capacity to reach cpu-load equilibrium.
dd5feea1 4951 */
56cf515b
JK
4952 load_above_capacity =
4953 (busiest->sum_nr_running - busiest->group_capacity);
dd5feea1 4954
1399fa78 4955 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
3ae11c90 4956 load_above_capacity /= busiest->group_power;
dd5feea1
SS
4957 }
4958
4959 /*
4960 * We're trying to get all the cpus to the average_load, so we don't
4961 * want to push ourselves above the average load, nor do we wish to
4962 * reduce the max loaded cpu below the average load. At the same time,
4963 * we also don't want to reduce the group load below the group capacity
4964 * (so that we can implement power-savings policies etc). Thus we look
4965 * for the minimum possible imbalance.
dd5feea1 4966 */
30ce5dab 4967 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
1e3c88bd
PZ
4968
4969 /* How much load to actually move to equalise the imbalance */
56cf515b 4970 env->imbalance = min(
3ae11c90
PZ
4971 max_pull * busiest->group_power,
4972 (sds->avg_load - local->avg_load) * local->group_power
56cf515b 4973 ) / SCHED_POWER_SCALE;
1e3c88bd
PZ
4974
4975 /*
4976 * if *imbalance is less than the average load per runnable task
25985edc 4977 * there is no guarantee that any tasks will be moved so we'll have
1e3c88bd
PZ
4978 * a think about bumping its value to force at least one task to be
4979 * moved
4980 */
56cf515b 4981 if (env->imbalance < busiest->load_per_task)
bd939f45 4982 return fix_small_imbalance(env, sds);
1e3c88bd 4983}
fab47622 4984
1e3c88bd
PZ
4985/******* find_busiest_group() helpers end here *********************/
4986
4987/**
4988 * find_busiest_group - Returns the busiest group within the sched_domain
4989 * if there is an imbalance. If there isn't an imbalance, and
4990 * the user has opted for power-savings, it returns a group whose
4991 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4992 * such a group exists.
4993 *
4994 * Also calculates the amount of weighted load which should be moved
4995 * to restore balance.
4996 *
cd96891d 4997 * @env: The load balancing environment.
1e3c88bd 4998 *
e69f6186 4999 * Return: - The busiest group if imbalance exists.
1e3c88bd
PZ
5000 * - If no imbalance and user has opted for power-savings balance,
5001 * return the least loaded group whose CPUs can be
5002 * put to idle by rebalancing its tasks onto our group.
5003 */
56cf515b 5004static struct sched_group *find_busiest_group(struct lb_env *env)
1e3c88bd 5005{
56cf515b 5006 struct sg_lb_stats *local, *busiest;
1e3c88bd
PZ
5007 struct sd_lb_stats sds;
5008
147c5fc2 5009 init_sd_lb_stats(&sds);
1e3c88bd
PZ
5010
5011 /*
5012	 * Compute the various statistics relevant for load balancing at
5013 * this level.
5014 */
23f0d209 5015 update_sd_lb_stats(env, &sds);
56cf515b
JK
5016 local = &sds.local_stat;
5017 busiest = &sds.busiest_stat;
1e3c88bd 5018
bd939f45
PZ
5019 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
5020 check_asym_packing(env, &sds))
532cb4c4
MN
5021 return sds.busiest;
5022
cc57aa8f 5023 /* There is no busy sibling group to pull tasks from */
56cf515b 5024 if (!sds.busiest || busiest->sum_nr_running == 0)
1e3c88bd
PZ
5025 goto out_balanced;
5026
1399fa78 5027 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
b0432d8f 5028
866ab43e
PZ
5029 /*
5030 * If the busiest group is imbalanced the below checks don't
30ce5dab 5031 * work because they assume all things are equal, which typically
866ab43e
PZ
5032 * isn't true due to cpus_allowed constraints and the like.
5033 */
56cf515b 5034 if (busiest->group_imb)
866ab43e
PZ
5035 goto force_balance;
5036
cc57aa8f 5037 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
56cf515b
JK
5038 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
5039 !busiest->group_has_capacity)
fab47622
NR
5040 goto force_balance;
5041
cc57aa8f
PZ
5042 /*
5043 * If the local group is more busy than the selected busiest group
5044 * don't try and pull any tasks.
5045 */
56cf515b 5046 if (local->avg_load >= busiest->avg_load)
1e3c88bd
PZ
5047 goto out_balanced;
5048
cc57aa8f
PZ
5049 /*
5050 * Don't pull any tasks if this group is already above the domain
5051 * average load.
5052 */
56cf515b 5053 if (local->avg_load >= sds.avg_load)
1e3c88bd
PZ
5054 goto out_balanced;
5055
bd939f45 5056 if (env->idle == CPU_IDLE) {
aae6d3dd
SS
5057 /*
5058		 * This cpu is idle. If the busiest group doesn't
5059		 * have more tasks than the number of available cpus and
5060		 * there is no imbalance between this and the busiest group
5061		 * wrt idle cpus, it is balanced.
5062 */
56cf515b
JK
5063 if ((local->idle_cpus < busiest->idle_cpus) &&
5064 busiest->sum_nr_running <= busiest->group_weight)
aae6d3dd 5065 goto out_balanced;
c186fafe
PZ
5066 } else {
5067 /*
5068 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
5069 * imbalance_pct to be conservative.
5070 */
56cf515b
JK
5071 if (100 * busiest->avg_load <=
5072 env->sd->imbalance_pct * local->avg_load)
c186fafe 5073 goto out_balanced;
aae6d3dd 5074 }
1e3c88bd 5075
fab47622 5076force_balance:
1e3c88bd 5077 /* Looks like there is an imbalance. Compute it */
bd939f45 5078 calculate_imbalance(env, &sds);
1e3c88bd
PZ
5079 return sds.busiest;
5080
5081out_balanced:
bd939f45 5082 env->imbalance = 0;
1e3c88bd
PZ
5083 return NULL;
5084}
5085
5086/*
5087 * find_busiest_queue - find the busiest runqueue among the cpus in group.
5088 */
bd939f45 5089static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 5090 struct sched_group *group)
1e3c88bd
PZ
5091{
5092 struct rq *busiest = NULL, *rq;
95a79b80 5093 unsigned long busiest_load = 0, busiest_power = 1;
1e3c88bd
PZ
5094 int i;
5095
6906a408 5096 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
1e3c88bd 5097 unsigned long power = power_of(i);
1399fa78
NR
5098 unsigned long capacity = DIV_ROUND_CLOSEST(power,
5099 SCHED_POWER_SCALE);
1e3c88bd
PZ
5100 unsigned long wl;
5101
9d5efe05 5102 if (!capacity)
bd939f45 5103 capacity = fix_small_capacity(env->sd, group);
9d5efe05 5104
1e3c88bd 5105 rq = cpu_rq(i);
6e40f5bb 5106 wl = weighted_cpuload(i);
1e3c88bd 5107
6e40f5bb
TG
5108 /*
5109 * When comparing with imbalance, use weighted_cpuload()
5110 * which is not scaled with the cpu power.
5111 */
bd939f45 5112 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
1e3c88bd
PZ
5113 continue;
5114
6e40f5bb
TG
5115 /*
5116 * For the load comparisons with the other cpu's, consider
5117 * the weighted_cpuload() scaled with the cpu power, so that
5118 * the load can be moved away from the cpu that is potentially
5119 * running at a lower capacity.
95a79b80
JK
5120 *
5121 * Thus we're looking for max(wl_i / power_i), crosswise
5122 * multiplication to rid ourselves of the division works out
5123 * to: wl_i * power_j > wl_j * power_i; where j is our
5124 * previous maximum.
6e40f5bb 5125 */
95a79b80
JK
5126 if (wl * busiest_power > busiest_load * power) {
5127 busiest_load = wl;
5128 busiest_power = power;
1e3c88bd
PZ
5129 busiest = rq;
5130 }
5131 }
5132
5133 return busiest;
5134}
5135
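/*
 * Illustrative userspace sketch (not kernel code) of the division-free
 * comparison find_busiest_queue() above uses: to decide whether
 * wl_i / power_i exceeds wl_j / power_j it checks
 * wl_i * power_j > wl_j * power_i instead.  The loads and powers below
 * are invented.
 */
#include <stdio.h>

int main(void)
{
	/* current maximum: cpu A */
	unsigned long wl_a = 2048, power_a = 1024;	/* ratio 2.0 */
	/* candidate: cpu B, weaker but relatively more loaded */
	unsigned long wl_b = 1536, power_b = 512;	/* ratio 3.0 */

	if (wl_b * power_a > wl_a * power_b)
		printf("cpu B is busier (%lu > %lu)\n",
		       wl_b * power_a, wl_a * power_b);
	return 0;
}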
5136/*
5137 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
5138 * so long as it is large enough.
5139 */
5140#define MAX_PINNED_INTERVAL 512
5141
5142/* Working cpumask for load_balance and load_balance_newidle. */
e6252c3e 5143DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
1e3c88bd 5144
bd939f45 5145static int need_active_balance(struct lb_env *env)
1af3ed3d 5146{
bd939f45
PZ
5147 struct sched_domain *sd = env->sd;
5148
5149 if (env->idle == CPU_NEWLY_IDLE) {
532cb4c4
MN
5150
5151 /*
5152 * ASYM_PACKING needs to force migrate tasks from busy but
5153 * higher numbered CPUs in order to pack all tasks in the
5154 * lowest numbered CPUs.
5155 */
bd939f45 5156 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
532cb4c4 5157 return 1;
1af3ed3d
PZ
5158 }
5159
5160 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5161}
5162
969c7921
TH
5163static int active_load_balance_cpu_stop(void *data);
5164
23f0d209
JK
5165static int should_we_balance(struct lb_env *env)
5166{
5167 struct sched_group *sg = env->sd->groups;
5168 struct cpumask *sg_cpus, *sg_mask;
5169 int cpu, balance_cpu = -1;
5170
5171 /*
5172 * In the newly idle case, we will allow all the cpu's
5173 * to do the newly idle load balance.
5174 */
5175 if (env->idle == CPU_NEWLY_IDLE)
5176 return 1;
5177
5178 sg_cpus = sched_group_cpus(sg);
5179 sg_mask = sched_group_mask(sg);
5180 /* Try to find first idle cpu */
5181 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
5182 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
5183 continue;
5184
5185 balance_cpu = cpu;
5186 break;
5187 }
5188
5189 if (balance_cpu == -1)
5190 balance_cpu = group_balance_cpu(sg);
5191
5192 /*
5193 * First idle cpu or the first cpu(busiest) in this sched group
5194 * is eligible for doing load balancing at this and above domains.
5195 */
b0cff9d8 5196 return balance_cpu == env->dst_cpu;
23f0d209
JK
5197}
5198
1e3c88bd
PZ
5199/*
5200 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5201 * tasks if there is an imbalance.
5202 */
5203static int load_balance(int this_cpu, struct rq *this_rq,
5204 struct sched_domain *sd, enum cpu_idle_type idle,
23f0d209 5205 int *continue_balancing)
1e3c88bd 5206{
88b8dac0 5207 int ld_moved, cur_ld_moved, active_balance = 0;
6263322c 5208 struct sched_domain *sd_parent = sd->parent;
1e3c88bd 5209 struct sched_group *group;
1e3c88bd
PZ
5210 struct rq *busiest;
5211 unsigned long flags;
e6252c3e 5212 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
1e3c88bd 5213
8e45cb54
PZ
5214 struct lb_env env = {
5215 .sd = sd,
ddcdf6e7
PZ
5216 .dst_cpu = this_cpu,
5217 .dst_rq = this_rq,
88b8dac0 5218 .dst_grpmask = sched_group_cpus(sd->groups),
8e45cb54 5219 .idle = idle,
eb95308e 5220 .loop_break = sched_nr_migrate_break,
b9403130 5221 .cpus = cpus,
8e45cb54
PZ
5222 };
5223
cfc03118
JK
5224 /*
5225 * For NEWLY_IDLE load_balancing, we don't need to consider
5226 * other cpus in our group
5227 */
e02e60c1 5228 if (idle == CPU_NEWLY_IDLE)
cfc03118 5229 env.dst_grpmask = NULL;
cfc03118 5230
1e3c88bd
PZ
5231 cpumask_copy(cpus, cpu_active_mask);
5232
1e3c88bd
PZ
5233 schedstat_inc(sd, lb_count[idle]);
5234
5235redo:
23f0d209
JK
5236 if (!should_we_balance(&env)) {
5237 *continue_balancing = 0;
1e3c88bd 5238 goto out_balanced;
23f0d209 5239 }
1e3c88bd 5240
23f0d209 5241 group = find_busiest_group(&env);
1e3c88bd
PZ
5242 if (!group) {
5243 schedstat_inc(sd, lb_nobusyg[idle]);
5244 goto out_balanced;
5245 }
5246
b9403130 5247 busiest = find_busiest_queue(&env, group);
1e3c88bd
PZ
5248 if (!busiest) {
5249 schedstat_inc(sd, lb_nobusyq[idle]);
5250 goto out_balanced;
5251 }
5252
78feefc5 5253 BUG_ON(busiest == env.dst_rq);
1e3c88bd 5254
bd939f45 5255 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
1e3c88bd
PZ
5256
5257 ld_moved = 0;
5258 if (busiest->nr_running > 1) {
5259 /*
5260 * Attempt to move tasks. If find_busiest_group has found
5261 * an imbalance but busiest->nr_running <= 1, the group is
5262 * still unbalanced. ld_moved simply stays zero, so it is
5263 * correctly treated as an imbalance.
5264 */
8e45cb54 5265 env.flags |= LBF_ALL_PINNED;
c82513e5
PZ
5266 env.src_cpu = busiest->cpu;
5267 env.src_rq = busiest;
5268 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb54 5269
5d6523eb 5270more_balance:
1e3c88bd 5271 local_irq_save(flags);
78feefc5 5272 double_rq_lock(env.dst_rq, busiest);
88b8dac0
SV
5273
5274 /*
5275 * cur_ld_moved - load moved in current iteration
5276 * ld_moved - cumulative load moved across iterations
5277 */
5278 cur_ld_moved = move_tasks(&env);
5279 ld_moved += cur_ld_moved;
78feefc5 5280 double_rq_unlock(env.dst_rq, busiest);
1e3c88bd
PZ
5281 local_irq_restore(flags);
5282
5283 /*
5284 * some other cpu did the load balance for us.
5285 */
88b8dac0
SV
5286 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5287 resched_cpu(env.dst_cpu);
5288
f1cd0858
JK
5289 if (env.flags & LBF_NEED_BREAK) {
5290 env.flags &= ~LBF_NEED_BREAK;
5291 goto more_balance;
5292 }
5293
88b8dac0
SV
5294 /*
5295 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5296 * us and move them to an alternate dst_cpu in our sched_group
5297 * where they can run. The upper limit on how many times we
5298 * iterate on the same src_cpu depends on the number of cpus in our
5299 * sched_group.
5300 *
5301 * This changes load balance semantics a bit on who can move
5302 * load to a given_cpu. In addition to the given_cpu itself
5303 * (or an ilb_cpu acting on its behalf where given_cpu is
5304 * nohz-idle), we now have balance_cpu in a position to move
5305 * load to given_cpu. In rare situations, this may cause
5306 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5307 * _independently_ and at the _same_ time to move some load to
5308 * given_cpu) causing excess load to be moved to given_cpu.
5309 * This however should not happen so much in practice and
5310 * moreover subsequent load balance cycles should correct the
5311 * excess load moved.
5312 */
6263322c 5313 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
88b8dac0 5314
7aff2e3a
VD
5315 /* Prevent to re-select dst_cpu via env's cpus */
5316 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5317
78feefc5 5318 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0 5319 env.dst_cpu = env.new_dst_cpu;
6263322c 5320 env.flags &= ~LBF_DST_PINNED;
88b8dac0
SV
5321 env.loop = 0;
5322 env.loop_break = sched_nr_migrate_break;
e02e60c1 5323
88b8dac0
SV
5324 /*
5325 * Go back to "more_balance" rather than "redo" since we
5326 * need to continue with same src_cpu.
5327 */
5328 goto more_balance;
5329 }
1e3c88bd 5330
6263322c
PZ
5331 /*
5332 * We failed to reach balance because of affinity.
5333 */
5334 if (sd_parent) {
5335 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
5336
5337 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5338 *group_imbalance = 1;
5339 } else if (*group_imbalance)
5340 *group_imbalance = 0;
5341 }
5342
1e3c88bd 5343 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 5344 if (unlikely(env.flags & LBF_ALL_PINNED)) {
1e3c88bd 5345 cpumask_clear_cpu(cpu_of(busiest), cpus);
bbf18b19
PN
5346 if (!cpumask_empty(cpus)) {
5347 env.loop = 0;
5348 env.loop_break = sched_nr_migrate_break;
1e3c88bd 5349 goto redo;
bbf18b19 5350 }
1e3c88bd
PZ
5351 goto out_balanced;
5352 }
5353 }
5354
5355 if (!ld_moved) {
5356 schedstat_inc(sd, lb_failed[idle]);
58b26c4c
VP
5357 /*
5358 * Increment the failure counter only on periodic balance.
5359 * We do not want newidle balance, which can be very
5360 * frequent, pollute the failure counter causing
5361 * excessive cache_hot migrations and active balances.
5362 */
5363 if (idle != CPU_NEWLY_IDLE)
5364 sd->nr_balance_failed++;
1e3c88bd 5365
bd939f45 5366 if (need_active_balance(&env)) {
1e3c88bd
PZ
5367 raw_spin_lock_irqsave(&busiest->lock, flags);
5368
969c7921
TH
5369 /* don't kick the active_load_balance_cpu_stop,
5370 * if the curr task on busiest cpu can't be
5371 * moved to this_cpu
1e3c88bd
PZ
5372 */
5373 if (!cpumask_test_cpu(this_cpu,
fa17b507 5374 tsk_cpus_allowed(busiest->curr))) {
1e3c88bd
PZ
5375 raw_spin_unlock_irqrestore(&busiest->lock,
5376 flags);
8e45cb54 5377 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
5378 goto out_one_pinned;
5379 }
5380
969c7921
TH
5381 /*
5382 * ->active_balance synchronizes accesses to
5383 * ->active_balance_work. Once set, it's cleared
5384 * only after active load balance is finished.
5385 */
1e3c88bd
PZ
5386 if (!busiest->active_balance) {
5387 busiest->active_balance = 1;
5388 busiest->push_cpu = this_cpu;
5389 active_balance = 1;
5390 }
5391 raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c7921 5392
bd939f45 5393 if (active_balance) {
969c7921
TH
5394 stop_one_cpu_nowait(cpu_of(busiest),
5395 active_load_balance_cpu_stop, busiest,
5396 &busiest->active_balance_work);
bd939f45 5397 }
1e3c88bd
PZ
5398
5399 /*
5400 * We've kicked active balancing, reset the failure
5401 * counter.
5402 */
5403 sd->nr_balance_failed = sd->cache_nice_tries+1;
5404 }
5405 } else
5406 sd->nr_balance_failed = 0;
5407
5408 if (likely(!active_balance)) {
5409 /* We were unbalanced, so reset the balancing interval */
5410 sd->balance_interval = sd->min_interval;
5411 } else {
5412 /*
5413 * If we've begun active balancing, start to back off. This
5414 * case may not be covered by the all_pinned logic if there
5415 * is only 1 task on the busy runqueue (because we don't call
5416 * move_tasks).
5417 */
5418 if (sd->balance_interval < sd->max_interval)
5419 sd->balance_interval *= 2;
5420 }
5421
1e3c88bd
PZ
5422 goto out;
5423
5424out_balanced:
5425 schedstat_inc(sd, lb_balanced[idle]);
5426
5427 sd->nr_balance_failed = 0;
5428
5429out_one_pinned:
5430 /* tune up the balancing interval */
8e45cb54 5431 if (((env.flags & LBF_ALL_PINNED) &&
5b54b56b 5432 sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bd
PZ
5433 (sd->balance_interval < sd->max_interval))
5434 sd->balance_interval *= 2;
5435
46e49b38 5436 ld_moved = 0;
1e3c88bd 5437out:
1e3c88bd
PZ
5438 return ld_moved;
5439}
5440
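/*
 * Illustrative sketch, not part of fair.c: the interval handling at the
 * end of load_balance() above amounts to an exponential backoff.  The
 * helper below condenses that policy; the min/max bounds are passed in
 * rather than taken from a sched_domain.
 */
static unsigned long next_balance_interval(unsigned long interval,
                                           unsigned long min_interval,
                                           unsigned long max_interval,
                                           int made_progress)
{
        if (made_progress)
                return min_interval;    /* we were unbalanced and fixed it */
        if (interval < max_interval)
                return interval * 2;    /* pinned or active balance: back off */
        return interval;                /* already at the cap */
}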
1e3c88bd
PZ
5441/*
5442 * idle_balance is called by schedule() if this_cpu is about to become
5443 * idle. Attempts to pull tasks from other CPUs.
5444 */
029632fb 5445void idle_balance(int this_cpu, struct rq *this_rq)
1e3c88bd
PZ
5446{
5447 struct sched_domain *sd;
5448 int pulled_task = 0;
5449 unsigned long next_balance = jiffies + HZ;
9bd721c5 5450 u64 curr_cost = 0;
1e3c88bd 5451
78becc27 5452 this_rq->idle_stamp = rq_clock(this_rq);
1e3c88bd
PZ
5453
5454 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5455 return;
5456
f492e12e
PZ
5457 /*
5458 * Drop the rq->lock, but keep IRQ/preempt disabled.
5459 */
5460 raw_spin_unlock(&this_rq->lock);
5461
48a16753 5462 update_blocked_averages(this_cpu);
dce840a0 5463 rcu_read_lock();
1e3c88bd
PZ
5464 for_each_domain(this_cpu, sd) {
5465 unsigned long interval;
23f0d209 5466 int continue_balancing = 1;
9bd721c5 5467 u64 t0, domain_cost;
1e3c88bd
PZ
5468
5469 if (!(sd->flags & SD_LOAD_BALANCE))
5470 continue;
5471
9bd721c5
JL
5472 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
5473 break;
5474
f492e12e 5475 if (sd->flags & SD_BALANCE_NEWIDLE) {
9bd721c5
JL
5476 t0 = sched_clock_cpu(this_cpu);
5477
1e3c88bd 5478 /* If we've pulled tasks over stop searching: */
f492e12e 5479 pulled_task = load_balance(this_cpu, this_rq,
23f0d209
JK
5480 sd, CPU_NEWLY_IDLE,
5481 &continue_balancing);
9bd721c5
JL
5482
5483 domain_cost = sched_clock_cpu(this_cpu) - t0;
5484 if (domain_cost > sd->max_newidle_lb_cost)
5485 sd->max_newidle_lb_cost = domain_cost;
5486
5487 curr_cost += domain_cost;
f492e12e 5488 }
1e3c88bd
PZ
5489
5490 interval = msecs_to_jiffies(sd->balance_interval);
5491 if (time_after(next_balance, sd->last_balance + interval))
5492 next_balance = sd->last_balance + interval;
d5ad140b
NR
5493 if (pulled_task) {
5494 this_rq->idle_stamp = 0;
1e3c88bd 5495 break;
d5ad140b 5496 }
1e3c88bd 5497 }
dce840a0 5498 rcu_read_unlock();
f492e12e
PZ
5499
5500 raw_spin_lock(&this_rq->lock);
5501
1e3c88bd
PZ
5502 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5503 /*
5504 * We are going idle. next_balance may be set based on
5505 * a busy processor. So reset next_balance.
5506 */
5507 this_rq->next_balance = next_balance;
5508 }
9bd721c5
JL
5509
5510 if (curr_cost > this_rq->max_idle_balance_cost)
5511 this_rq->max_idle_balance_cost = curr_cost;
1e3c88bd
PZ
5512}
5513
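/*
 * Illustrative sketch, not part of fair.c: idle_balance() above only keeps
 * walking domains while the expected idle time still covers the cost of
 * the newidle balances attempted so far.  The nanosecond figures in the
 * comment are invented.
 */
static int newidle_balance_worthwhile(u64 avg_idle, u64 curr_cost,
                                      u64 max_newidle_lb_cost)
{
        /*
         * e.g. avg_idle = 500000, cost so far = 150000, this domain's worst
         * observed cost = 400000: 150000 + 400000 > 500000, so stop here.
         */
        return avg_idle >= curr_cost + max_newidle_lb_cost;
}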
5514/*
969c7921
TH
5515 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5516 * running tasks off the busiest CPU onto idle CPUs. It requires at
5517 * least 1 task to be running on each physical CPU where possible, and
5518 * avoids physical / logical imbalances.
1e3c88bd 5519 */
969c7921 5520static int active_load_balance_cpu_stop(void *data)
1e3c88bd 5521{
969c7921
TH
5522 struct rq *busiest_rq = data;
5523 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 5524 int target_cpu = busiest_rq->push_cpu;
969c7921 5525 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 5526 struct sched_domain *sd;
969c7921
TH
5527
5528 raw_spin_lock_irq(&busiest_rq->lock);
5529
5530 /* make sure the requested cpu hasn't gone down in the meantime */
5531 if (unlikely(busiest_cpu != smp_processor_id() ||
5532 !busiest_rq->active_balance))
5533 goto out_unlock;
1e3c88bd
PZ
5534
5535 /* Is there any task to move? */
5536 if (busiest_rq->nr_running <= 1)
969c7921 5537 goto out_unlock;
1e3c88bd
PZ
5538
5539 /*
5540 * This condition is "impossible"; if it occurs
5541 * we need to fix it. Originally reported by
5542 * Bjorn Helgaas on a 128-cpu setup.
5543 */
5544 BUG_ON(busiest_rq == target_rq);
5545
5546 /* move a task from busiest_rq to target_rq */
5547 double_lock_balance(busiest_rq, target_rq);
1e3c88bd
PZ
5548
5549 /* Search for an sd spanning us and the target CPU. */
dce840a0 5550 rcu_read_lock();
1e3c88bd
PZ
5551 for_each_domain(target_cpu, sd) {
5552 if ((sd->flags & SD_LOAD_BALANCE) &&
5553 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5554 break;
5555 }
5556
5557 if (likely(sd)) {
8e45cb54
PZ
5558 struct lb_env env = {
5559 .sd = sd,
ddcdf6e7
PZ
5560 .dst_cpu = target_cpu,
5561 .dst_rq = target_rq,
5562 .src_cpu = busiest_rq->cpu,
5563 .src_rq = busiest_rq,
8e45cb54
PZ
5564 .idle = CPU_IDLE,
5565 };
5566
1e3c88bd
PZ
5567 schedstat_inc(sd, alb_count);
5568
8e45cb54 5569 if (move_one_task(&env))
1e3c88bd
PZ
5570 schedstat_inc(sd, alb_pushed);
5571 else
5572 schedstat_inc(sd, alb_failed);
5573 }
dce840a0 5574 rcu_read_unlock();
1e3c88bd 5575 double_unlock_balance(busiest_rq, target_rq);
969c7921
TH
5576out_unlock:
5577 busiest_rq->active_balance = 0;
5578 raw_spin_unlock_irq(&busiest_rq->lock);
5579 return 0;
1e3c88bd
PZ
5580}
5581
3451d024 5582#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
5583/*
5584 * idle load balancing details
83cd4fe2
VP
5585 * - When one of the busy CPUs notices that there may be an idle rebalancing
5586 * needed, they will kick the idle load balancer, which then does idle
5587 * load balancing for all the idle CPUs.
5588 */
1e3c88bd 5589static struct {
83cd4fe2 5590 cpumask_var_t idle_cpus_mask;
0b005cf5 5591 atomic_t nr_cpus;
83cd4fe2
VP
5592 unsigned long next_balance; /* in jiffy units */
5593} nohz ____cacheline_aligned;
1e3c88bd 5594
8e7fbcbc 5595static inline int find_new_ilb(int call_cpu)
1e3c88bd 5596{
0b005cf5 5597 int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bd 5598
786d6dc7
SS
5599 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5600 return ilb;
5601
5602 return nr_cpu_ids;
1e3c88bd 5603}
1e3c88bd 5604
83cd4fe2
VP
5605/*
5606 * Kick a CPU to do the nohz balancing if it is time for it. We pick the
5607 * nohz_load_balancer CPU (if there is one), otherwise we fall back to any
5608 * idle CPU (if there is one).
5609 */
5610static void nohz_balancer_kick(int cpu)
5611{
5612 int ilb_cpu;
5613
5614 nohz.next_balance++;
5615
0b005cf5 5616 ilb_cpu = find_new_ilb(cpu);
83cd4fe2 5617
0b005cf5
SS
5618 if (ilb_cpu >= nr_cpu_ids)
5619 return;
83cd4fe2 5620
cd490c5b 5621 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
1c792db7
SS
5622 return;
5623 /*
5624 * Use smp_send_reschedule() instead of resched_cpu().
5625 * This way we generate a sched IPI on the target cpu which
5626 * is idle. And the softirq performing nohz idle load balance
5627 * will be run before returning from the IPI.
5628 */
5629 smp_send_reschedule(ilb_cpu);
83cd4fe2
VP
5630 return;
5631}
5632
c1cc017c 5633static inline void nohz_balance_exit_idle(int cpu)
71325960
SS
5634{
5635 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5636 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5637 atomic_dec(&nohz.nr_cpus);
5638 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5639 }
5640}
5641
69e1e811
SS
5642static inline void set_cpu_sd_state_busy(void)
5643{
5644 struct sched_domain *sd;
69e1e811 5645
69e1e811 5646 rcu_read_lock();
424c93fe 5647 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
25f55d9d
VG
5648
5649 if (!sd || !sd->nohz_idle)
5650 goto unlock;
5651 sd->nohz_idle = 0;
5652
5653 for (; sd; sd = sd->parent)
69e1e811 5654 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 5655unlock:
69e1e811
SS
5656 rcu_read_unlock();
5657}
5658
5659void set_cpu_sd_state_idle(void)
5660{
5661 struct sched_domain *sd;
69e1e811 5662
69e1e811 5663 rcu_read_lock();
424c93fe 5664 sd = rcu_dereference_check_sched_domain(this_rq()->sd);
25f55d9d
VG
5665
5666 if (!sd || sd->nohz_idle)
5667 goto unlock;
5668 sd->nohz_idle = 1;
5669
5670 for (; sd; sd = sd->parent)
69e1e811 5671 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 5672unlock:
69e1e811
SS
5673 rcu_read_unlock();
5674}
5675
1e3c88bd 5676/*
c1cc017c 5677 * This routine will record that the cpu is going idle with tick stopped.
0b005cf5 5678 * This info will be used in performing idle load balancing in the future.
1e3c88bd 5679 */
c1cc017c 5680void nohz_balance_enter_idle(int cpu)
1e3c88bd 5681{
71325960
SS
5682 /*
5683 * If this cpu is going down, then nothing needs to be done.
5684 */
5685 if (!cpu_active(cpu))
5686 return;
5687
c1cc017c
AS
5688 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5689 return;
1e3c88bd 5690
c1cc017c
AS
5691 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5692 atomic_inc(&nohz.nr_cpus);
5693 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
1e3c88bd 5694}
71325960 5695
0db0628d 5696static int sched_ilb_notifier(struct notifier_block *nfb,
71325960
SS
5697 unsigned long action, void *hcpu)
5698{
5699 switch (action & ~CPU_TASKS_FROZEN) {
5700 case CPU_DYING:
c1cc017c 5701 nohz_balance_exit_idle(smp_processor_id());
71325960
SS
5702 return NOTIFY_OK;
5703 default:
5704 return NOTIFY_DONE;
5705 }
5706}
1e3c88bd
PZ
5707#endif
5708
5709static DEFINE_SPINLOCK(balancing);
5710
49c022e6
PZ
5711/*
5712 * Scale the max load_balance interval with the number of CPUs in the system.
5713 * This trades load-balance latency on larger machines for less cross talk.
5714 */
029632fb 5715void update_max_interval(void)
49c022e6
PZ
5716{
5717 max_load_balance_interval = HZ*num_online_cpus()/10;
5718}
5719
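/*
 * Illustrative sketch, not part of fair.c: the scaling above, worked out
 * with an assumed HZ of 1000.  8 online CPUs give 1000 * 8 / 10 = 800
 * jiffies (0.8s) as the cap, 64 CPUs give 6400 jiffies (6.4s).
 */
static unsigned long example_max_lb_interval(unsigned long hz,
                                             unsigned int online_cpus)
{
        return hz * online_cpus / 10;
}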
1e3c88bd
PZ
5720/*
5721 * It checks each scheduling domain to see if it is due to be balanced,
5722 * and initiates a balancing operation if so.
5723 *
b9b0853a 5724 * Balancing parameters are set up in init_sched_domains.
1e3c88bd
PZ
5725 */
5726static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5727{
23f0d209 5728 int continue_balancing = 1;
1e3c88bd
PZ
5729 struct rq *rq = cpu_rq(cpu);
5730 unsigned long interval;
04f733b4 5731 struct sched_domain *sd;
1e3c88bd
PZ
5732 /* Earliest time when we have to do rebalance again */
5733 unsigned long next_balance = jiffies + 60*HZ;
5734 int update_next_balance = 0;
f48627e6
JL
5735 int need_serialize, need_decay = 0;
5736 u64 max_cost = 0;
1e3c88bd 5737
48a16753 5738 update_blocked_averages(cpu);
2069dd75 5739
dce840a0 5740 rcu_read_lock();
1e3c88bd 5741 for_each_domain(cpu, sd) {
f48627e6
JL
5742 /*
5743 * Decay the newidle max times here because this is a regular
5744 * visit to all the domains. Decay ~1% per second.
5745 */
5746 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
5747 sd->max_newidle_lb_cost =
5748 (sd->max_newidle_lb_cost * 253) / 256;
5749 sd->next_decay_max_lb_cost = jiffies + HZ;
5750 need_decay = 1;
5751 }
5752 max_cost += sd->max_newidle_lb_cost;
5753
1e3c88bd
PZ
5754 if (!(sd->flags & SD_LOAD_BALANCE))
5755 continue;
5756
f48627e6
JL
5757 /*
5758 * Stop the load balance at this level. There is another
5759 * CPU in our sched group which is doing load balancing more
5760 * actively.
5761 */
5762 if (!continue_balancing) {
5763 if (need_decay)
5764 continue;
5765 break;
5766 }
5767
1e3c88bd
PZ
5768 interval = sd->balance_interval;
5769 if (idle != CPU_IDLE)
5770 interval *= sd->busy_factor;
5771
5772 /* scale ms to jiffies */
5773 interval = msecs_to_jiffies(interval);
49c022e6 5774 interval = clamp(interval, 1UL, max_load_balance_interval);
1e3c88bd
PZ
5775
5776 need_serialize = sd->flags & SD_SERIALIZE;
5777
5778 if (need_serialize) {
5779 if (!spin_trylock(&balancing))
5780 goto out;
5781 }
5782
5783 if (time_after_eq(jiffies, sd->last_balance + interval)) {
23f0d209 5784 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
1e3c88bd 5785 /*
6263322c 5786 * The LBF_DST_PINNED logic could have changed
de5eb2dd
JK
5787 * env->dst_cpu, so we can't know our idle
5788 * state even if we migrated tasks. Update it.
1e3c88bd 5789 */
de5eb2dd 5790 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
1e3c88bd
PZ
5791 }
5792 sd->last_balance = jiffies;
5793 }
5794 if (need_serialize)
5795 spin_unlock(&balancing);
5796out:
5797 if (time_after(next_balance, sd->last_balance + interval)) {
5798 next_balance = sd->last_balance + interval;
5799 update_next_balance = 1;
5800 }
f48627e6
JL
5801 }
5802 if (need_decay) {
1e3c88bd 5803 /*
f48627e6
JL
5804 * Ensure the rq-wide value also decays but keep it at a
5805 * reasonable floor to avoid funnies with rq->avg_idle.
1e3c88bd 5806 */
f48627e6
JL
5807 rq->max_idle_balance_cost =
5808 max((u64)sysctl_sched_migration_cost, max_cost);
1e3c88bd 5809 }
dce840a0 5810 rcu_read_unlock();
1e3c88bd
PZ
5811
5812 /*
5813 * next_balance will be updated only when there is a need.
5815 * When the cpu is attached to a null domain, for example, it will not be
5815 * updated.
5816 */
5817 if (likely(update_next_balance))
5818 rq->next_balance = next_balance;
5819}
5820
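/*
 * Illustrative sketch, not part of fair.c: the "decay ~1% per second" step
 * in rebalance_domains() multiplies max_newidle_lb_cost by 253/256 about
 * once per second, so a cost of 100000ns drops to roughly 98828ns after
 * one step and takes on the order of a minute to halve.
 */
static u64 decay_newidle_cost(u64 cost)
{
        return cost * 253 / 256;
}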
3451d024 5821#ifdef CONFIG_NO_HZ_COMMON
1e3c88bd 5822/*
3451d024 5823 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
1e3c88bd
PZ
5824 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5825 */
83cd4fe2
VP
5826static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5827{
5828 struct rq *this_rq = cpu_rq(this_cpu);
5829 struct rq *rq;
5830 int balance_cpu;
5831
1c792db7
SS
5832 if (idle != CPU_IDLE ||
5833 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5834 goto end;
83cd4fe2
VP
5835
5836 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8a6d42d1 5837 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
83cd4fe2
VP
5838 continue;
5839
5840 /*
5841 * If this cpu gets work to do, stop the load balancing
5842 * work being done for other cpus. Next load
5843 * balancing owner will pick it up.
5844 */
1c792db7 5845 if (need_resched())
83cd4fe2 5846 break;
83cd4fe2 5847
5ed4f1d9
VG
5848 rq = cpu_rq(balance_cpu);
5849
5850 raw_spin_lock_irq(&rq->lock);
5851 update_rq_clock(rq);
5852 update_idle_cpu_load(rq);
5853 raw_spin_unlock_irq(&rq->lock);
83cd4fe2
VP
5854
5855 rebalance_domains(balance_cpu, CPU_IDLE);
5856
83cd4fe2
VP
5857 if (time_after(this_rq->next_balance, rq->next_balance))
5858 this_rq->next_balance = rq->next_balance;
5859 }
5860 nohz.next_balance = this_rq->next_balance;
1c792db7
SS
5861end:
5862 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
83cd4fe2
VP
5863}
5864
5865/*
0b005cf5
SS
5866 * Current heuristic for kicking the idle load balancer in the presence
5867 * of an idle cpu in the system:
5868 * - This rq has more than one task.
5869 * - At any scheduler domain level, this cpu's scheduler group has multiple
5870 * busy cpus exceeding the group's power.
5871 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5872 * domain span are idle.
83cd4fe2
VP
5873 */
5874static inline int nohz_kick_needed(struct rq *rq, int cpu)
5875{
5876 unsigned long now = jiffies;
0b005cf5 5877 struct sched_domain *sd;
83cd4fe2 5878
1c792db7 5879 if (unlikely(idle_cpu(cpu)))
83cd4fe2
VP
5880 return 0;
5881
1c792db7
SS
5882 /*
5883 * We may have recently been in ticked or tickless idle mode. At the first
5884 * busy tick after returning from idle, we will update the busy stats.
5885 */
69e1e811 5886 set_cpu_sd_state_busy();
c1cc017c 5887 nohz_balance_exit_idle(cpu);
0b005cf5
SS
5888
5889 /*
5890 * None are in tickless mode and hence no need for NOHZ idle load
5891 * balancing.
5892 */
5893 if (likely(!atomic_read(&nohz.nr_cpus)))
5894 return 0;
1c792db7
SS
5895
5896 if (time_before(now, nohz.next_balance))
83cd4fe2
VP
5897 return 0;
5898
0b005cf5
SS
5899 if (rq->nr_running >= 2)
5900 goto need_kick;
83cd4fe2 5901
067491b7 5902 rcu_read_lock();
0b005cf5
SS
5903 for_each_domain(cpu, sd) {
5904 struct sched_group *sg = sd->groups;
5905 struct sched_group_power *sgp = sg->sgp;
5906 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
83cd4fe2 5907
0b005cf5 5908 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
067491b7 5909 goto need_kick_unlock;
0b005cf5
SS
5910
5911 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5912 && (cpumask_first_and(nohz.idle_cpus_mask,
5913 sched_domain_span(sd)) < cpu))
067491b7 5914 goto need_kick_unlock;
0b005cf5
SS
5915
5916 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5917 break;
83cd4fe2 5918 }
067491b7 5919 rcu_read_unlock();
83cd4fe2 5920 return 0;
067491b7
PZ
5921
5922need_kick_unlock:
5923 rcu_read_unlock();
0b005cf5
SS
5924need_kick:
5925 return 1;
83cd4fe2
VP
5926}
5927#else
5928static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5929#endif
5930
5931/*
5932 * run_rebalance_domains is triggered when needed from the scheduler tick.
5933 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5934 */
1e3c88bd
PZ
5935static void run_rebalance_domains(struct softirq_action *h)
5936{
5937 int this_cpu = smp_processor_id();
5938 struct rq *this_rq = cpu_rq(this_cpu);
6eb57e0d 5939 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
5940 CPU_IDLE : CPU_NOT_IDLE;
5941
5942 rebalance_domains(this_cpu, idle);
5943
1e3c88bd 5944 /*
83cd4fe2 5945 * If this cpu has a pending nohz_balance_kick, then do the
1e3c88bd
PZ
5946 * balancing on behalf of the other idle cpus whose ticks are
5947 * stopped.
5948 */
83cd4fe2 5949 nohz_idle_balance(this_cpu, idle);
1e3c88bd
PZ
5950}
5951
5952static inline int on_null_domain(int cpu)
5953{
90a6501f 5954 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
1e3c88bd
PZ
5955}
5956
5957/*
5958 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 5959 */
029632fb 5960void trigger_load_balance(struct rq *rq, int cpu)
1e3c88bd 5961{
1e3c88bd
PZ
5962 /* Don't need to rebalance while attached to NULL domain */
5963 if (time_after_eq(jiffies, rq->next_balance) &&
5964 likely(!on_null_domain(cpu)))
5965 raise_softirq(SCHED_SOFTIRQ);
3451d024 5966#ifdef CONFIG_NO_HZ_COMMON
1c792db7 5967 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
83cd4fe2
VP
5968 nohz_balancer_kick(cpu);
5969#endif
1e3c88bd
PZ
5970}
5971
0bcdcf28
CE
5972static void rq_online_fair(struct rq *rq)
5973{
5974 update_sysctl();
5975}
5976
5977static void rq_offline_fair(struct rq *rq)
5978{
5979 update_sysctl();
a4c96ae3
PB
5980
5981 /* Ensure any throttled groups are reachable by pick_next_task */
5982 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
5983}
5984
55e12e5e 5985#endif /* CONFIG_SMP */
e1d1484f 5986
bf0f6f24
IM
5987/*
5988 * scheduler tick hitting a task of our scheduling class:
5989 */
8f4d37ec 5990static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
5991{
5992 struct cfs_rq *cfs_rq;
5993 struct sched_entity *se = &curr->se;
5994
5995 for_each_sched_entity(se) {
5996 cfs_rq = cfs_rq_of(se);
8f4d37ec 5997 entity_tick(cfs_rq, se, queued);
bf0f6f24 5998 }
18bf2805 5999
10e84b97 6000 if (numabalancing_enabled)
cbee9f88 6001 task_tick_numa(rq, curr);
3d59eebc 6002
18bf2805 6003 update_rq_runnable_avg(rq, 1);
bf0f6f24
IM
6004}
6005
6006/*
cd29fe6f
PZ
6007 * called on fork with the child task as argument from the parent's context
6008 * - child not yet on the tasklist
6009 * - preemption disabled
bf0f6f24 6010 */
cd29fe6f 6011static void task_fork_fair(struct task_struct *p)
bf0f6f24 6012{
4fc420c9
DN
6013 struct cfs_rq *cfs_rq;
6014 struct sched_entity *se = &p->se, *curr;
00bf7bfc 6015 int this_cpu = smp_processor_id();
cd29fe6f
PZ
6016 struct rq *rq = this_rq();
6017 unsigned long flags;
6018
05fa785c 6019 raw_spin_lock_irqsave(&rq->lock, flags);
bf0f6f24 6020
861d034e
PZ
6021 update_rq_clock(rq);
6022
4fc420c9
DN
6023 cfs_rq = task_cfs_rq(current);
6024 curr = cfs_rq->curr;
6025
6c9a27f5
DN
6026 /*
6027 * Not only the cpu but also the task_group of the parent might have
6028 * been changed after parent->se.parent,cfs_rq were copied to
6029 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
6030 * of child point to valid ones.
6031 */
6032 rcu_read_lock();
6033 __set_task_cpu(p, this_cpu);
6034 rcu_read_unlock();
bf0f6f24 6035
7109c442 6036 update_curr(cfs_rq);
cd29fe6f 6037
b5d9d734
MG
6038 if (curr)
6039 se->vruntime = curr->vruntime;
aeb73b04 6040 place_entity(cfs_rq, se, 1);
4d78e7b6 6041
cd29fe6f 6042 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 6043 /*
edcb60a3
IM
6044 * Upon rescheduling, sched_class::put_prev_task() will place
6045 * 'current' within the tree based on its new key value.
6046 */
4d78e7b6 6047 swap(curr->vruntime, se->vruntime);
aec0a514 6048 resched_task(rq->curr);
4d78e7b6 6049 }
bf0f6f24 6050
88ec22d3
PZ
6051 se->vruntime -= cfs_rq->min_vruntime;
6052
05fa785c 6053 raw_spin_unlock_irqrestore(&rq->lock, flags);
bf0f6f24
IM
6054}
6055
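/*
 * Illustrative sketch, not part of fair.c: task_fork_fair() above leaves
 * the child's vruntime relative to the old runqueue's min_vruntime, and
 * the later enqueue re-bases it on the destination runqueue.  The numbers
 * in the comment are invented.
 */
static u64 rebase_vruntime(u64 vruntime, u64 old_min, u64 new_min)
{
        /*
         * e.g. vruntime = 1000500 on a queue whose min_vruntime is 1000000
         * becomes 2000500 on a queue whose min_vruntime is 2000000: the
         * 500ns of lag relative to min_vruntime is preserved.
         */
        return vruntime - old_min + new_min;
}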
cb469845
SR
6056/*
6057 * Priority of the task has changed. Check to see if we preempt
6058 * the current task.
6059 */
da7a735e
PZ
6060static void
6061prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 6062{
da7a735e
PZ
6063 if (!p->se.on_rq)
6064 return;
6065
cb469845
SR
6066 /*
6067 * Reschedule if we are currently running on this runqueue and
6068 * our priority decreased, or if we are not currently running on
6069 * this runqueue and our priority is higher than the current's
6070 */
da7a735e 6071 if (rq->curr == p) {
cb469845
SR
6072 if (p->prio > oldprio)
6073 resched_task(rq->curr);
6074 } else
15afe09b 6075 check_preempt_curr(rq, p, 0);
cb469845
SR
6076}
6077
da7a735e
PZ
6078static void switched_from_fair(struct rq *rq, struct task_struct *p)
6079{
6080 struct sched_entity *se = &p->se;
6081 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6082
6083 /*
6084 * Ensure the task's vruntime is normalized, so that when its
6085 * switched back to the fair class the enqueue_entity(.flags=0) will
6086 * do the right thing.
6087 *
6088 * If it was on_rq, then the dequeue_entity(.flags=0) will already
6089 * have normalized the vruntime, if it was !on_rq, then only when
6090 * the task is sleeping will it still have non-normalized vruntime.
6091 */
6092 if (!se->on_rq && p->state != TASK_RUNNING) {
6093 /*
6094 * Fix up our vruntime so that the current sleep doesn't
6095 * cause 'unlimited' sleep bonus.
6096 */
6097 place_entity(cfs_rq, se, 0);
6098 se->vruntime -= cfs_rq->min_vruntime;
6099 }
9ee474f5 6100
141965c7 6101#ifdef CONFIG_SMP
9ee474f5
PT
6102 /*
6103 * Remove our load from contribution when we leave sched_fair
6104 * and ensure we don't carry in an old decay_count if we
6105 * switch back.
6106 */
87e3c8ae
KT
6107 if (se->avg.decay_count) {
6108 __synchronize_entity_decay(se);
6109 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
9ee474f5
PT
6110 }
6111#endif
da7a735e
PZ
6112}
6113
cb469845
SR
6114/*
6115 * We switched to the sched_fair class.
6116 */
da7a735e 6117static void switched_to_fair(struct rq *rq, struct task_struct *p)
cb469845 6118{
da7a735e
PZ
6119 if (!p->se.on_rq)
6120 return;
6121
cb469845
SR
6122 /*
6123 * We were most likely switched from sched_rt, so
6124 * kick off the schedule if running, otherwise just see
6125 * if we can still preempt the current task.
6126 */
da7a735e 6127 if (rq->curr == p)
cb469845
SR
6128 resched_task(rq->curr);
6129 else
15afe09b 6130 check_preempt_curr(rq, p, 0);
cb469845
SR
6131}
6132
83b699ed
SV
6133/* Account for a task changing its policy or group.
6134 *
6135 * This routine is mostly called to set cfs_rq->curr field when a task
6136 * migrates between groups/classes.
6137 */
6138static void set_curr_task_fair(struct rq *rq)
6139{
6140 struct sched_entity *se = &rq->curr->se;
6141
ec12cb7f
PT
6142 for_each_sched_entity(se) {
6143 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6144
6145 set_next_entity(cfs_rq, se);
6146 /* ensure bandwidth has been allocated on our new cfs_rq */
6147 account_cfs_rq_runtime(cfs_rq, 0);
6148 }
83b699ed
SV
6149}
6150
029632fb
PZ
6151void init_cfs_rq(struct cfs_rq *cfs_rq)
6152{
6153 cfs_rq->tasks_timeline = RB_ROOT;
029632fb
PZ
6154 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6155#ifndef CONFIG_64BIT
6156 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
6157#endif
141965c7 6158#ifdef CONFIG_SMP
9ee474f5 6159 atomic64_set(&cfs_rq->decay_counter, 1);
2509940f 6160 atomic_long_set(&cfs_rq->removed_load, 0);
9ee474f5 6161#endif
029632fb
PZ
6162}
6163
810b3817 6164#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 6165static void task_move_group_fair(struct task_struct *p, int on_rq)
810b3817 6166{
aff3e498 6167 struct cfs_rq *cfs_rq;
b2b5ce02
PZ
6168 /*
6169 * If the task was not on the rq at the time of this cgroup movement
6170 * it must have been asleep, sleeping tasks keep their ->vruntime
6171 * absolute on their old rq until wakeup (needed for the fair sleeper
6172 * bonus in place_entity()).
6173 *
6174 * If it was on the rq, we've just 'preempted' it, which does convert
6175 * ->vruntime to a relative base.
6176 *
6177 * Make sure both cases convert their relative position when migrating
6178 * to another cgroup's rq. This does somewhat interfere with the
6179 * fair sleeper stuff for the first placement, but who cares.
6180 */
7ceff013
DN
6181 /*
6182 * When !on_rq, vruntime of the task has usually NOT been normalized.
6183 * But there are some cases where it has already been normalized:
6184 *
6185 * - Moving a forked child which is waiting for being woken up by
6186 * wake_up_new_task().
62af3783
DN
6187 * - Moving a task which has been woken up by try_to_wake_up() and
6188 * waiting for actually being woken up by sched_ttwu_pending().
7ceff013
DN
6189 *
6190 * To prevent boost or penalty in the new cfs_rq caused by delta
6191 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
6192 */
62af3783 6193 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7ceff013
DN
6194 on_rq = 1;
6195
b2b5ce02
PZ
6196 if (!on_rq)
6197 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
6198 set_task_rq(p, task_cpu(p));
aff3e498
PT
6199 if (!on_rq) {
6200 cfs_rq = cfs_rq_of(&p->se);
6201 p->se.vruntime += cfs_rq->min_vruntime;
6202#ifdef CONFIG_SMP
6203 /*
6204 * migrate_task_rq_fair() will have removed our previous
6205 * contribution, but we must synchronize for ongoing future
6206 * decay.
6207 */
6208 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
6209 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
6210#endif
6211 }
810b3817 6212}
029632fb
PZ
6213
6214void free_fair_sched_group(struct task_group *tg)
6215{
6216 int i;
6217
6218 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
6219
6220 for_each_possible_cpu(i) {
6221 if (tg->cfs_rq)
6222 kfree(tg->cfs_rq[i]);
6223 if (tg->se)
6224 kfree(tg->se[i]);
6225 }
6226
6227 kfree(tg->cfs_rq);
6228 kfree(tg->se);
6229}
6230
6231int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6232{
6233 struct cfs_rq *cfs_rq;
6234 struct sched_entity *se;
6235 int i;
6236
6237 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
6238 if (!tg->cfs_rq)
6239 goto err;
6240 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
6241 if (!tg->se)
6242 goto err;
6243
6244 tg->shares = NICE_0_LOAD;
6245
6246 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
6247
6248 for_each_possible_cpu(i) {
6249 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
6250 GFP_KERNEL, cpu_to_node(i));
6251 if (!cfs_rq)
6252 goto err;
6253
6254 se = kzalloc_node(sizeof(struct sched_entity),
6255 GFP_KERNEL, cpu_to_node(i));
6256 if (!se)
6257 goto err_free_rq;
6258
6259 init_cfs_rq(cfs_rq);
6260 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
6261 }
6262
6263 return 1;
6264
6265err_free_rq:
6266 kfree(cfs_rq);
6267err:
6268 return 0;
6269}
6270
6271void unregister_fair_sched_group(struct task_group *tg, int cpu)
6272{
6273 struct rq *rq = cpu_rq(cpu);
6274 unsigned long flags;
6275
6276 /*
6277 * Only empty task groups can be destroyed; so we can speculatively
6278 * check on_list without danger of it being re-added.
6279 */
6280 if (!tg->cfs_rq[cpu]->on_list)
6281 return;
6282
6283 raw_spin_lock_irqsave(&rq->lock, flags);
6284 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6285 raw_spin_unlock_irqrestore(&rq->lock, flags);
6286}
6287
6288void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6289 struct sched_entity *se, int cpu,
6290 struct sched_entity *parent)
6291{
6292 struct rq *rq = cpu_rq(cpu);
6293
6294 cfs_rq->tg = tg;
6295 cfs_rq->rq = rq;
029632fb
PZ
6296 init_cfs_rq_runtime(cfs_rq);
6297
6298 tg->cfs_rq[cpu] = cfs_rq;
6299 tg->se[cpu] = se;
6300
6301 /* se could be NULL for root_task_group */
6302 if (!se)
6303 return;
6304
6305 if (!parent)
6306 se->cfs_rq = &rq->cfs;
6307 else
6308 se->cfs_rq = parent->my_q;
6309
6310 se->my_q = cfs_rq;
6311 update_load_set(&se->load, 0);
6312 se->parent = parent;
6313}
6314
6315static DEFINE_MUTEX(shares_mutex);
6316
6317int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6318{
6319 int i;
6320 unsigned long flags;
6321
6322 /*
6323 * We can't change the weight of the root cgroup.
6324 */
6325 if (!tg->se[0])
6326 return -EINVAL;
6327
6328 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6329
6330 mutex_lock(&shares_mutex);
6331 if (tg->shares == shares)
6332 goto done;
6333
6334 tg->shares = shares;
6335 for_each_possible_cpu(i) {
6336 struct rq *rq = cpu_rq(i);
6337 struct sched_entity *se;
6338
6339 se = tg->se[i];
6340 /* Propagate contribution to hierarchy */
6341 raw_spin_lock_irqsave(&rq->lock, flags);
71b1da46
FW
6342
6343 /* Possible calls to update_curr() need rq clock */
6344 update_rq_clock(rq);
17bc14b7 6345 for_each_sched_entity(se)
029632fb
PZ
6346 update_cfs_shares(group_cfs_rq(se));
6347 raw_spin_unlock_irqrestore(&rq->lock, flags);
6348 }
6349
6350done:
6351 mutex_unlock(&shares_mutex);
6352 return 0;
6353}
6354#else /* CONFIG_FAIR_GROUP_SCHED */
6355
6356void free_fair_sched_group(struct task_group *tg) { }
6357
6358int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6359{
6360 return 1;
6361}
6362
6363void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6364
6365#endif /* CONFIG_FAIR_GROUP_SCHED */
6366
810b3817 6367
6d686f45 6368static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
6369{
6370 struct sched_entity *se = &task->se;
0d721cea
PW
6371 unsigned int rr_interval = 0;
6372
6373 /*
6374 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6375 * idle runqueue:
6376 */
0d721cea 6377 if (rq->cfs.load.weight)
a59f4e07 6378 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
6379
6380 return rr_interval;
6381}
6382
bf0f6f24
IM
6383/*
6384 * All the scheduling class methods:
6385 */
029632fb 6386const struct sched_class fair_sched_class = {
5522d5d5 6387 .next = &idle_sched_class,
bf0f6f24
IM
6388 .enqueue_task = enqueue_task_fair,
6389 .dequeue_task = dequeue_task_fair,
6390 .yield_task = yield_task_fair,
d95f4122 6391 .yield_to_task = yield_to_task_fair,
bf0f6f24 6392
2e09bf55 6393 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24
IM
6394
6395 .pick_next_task = pick_next_task_fair,
6396 .put_prev_task = put_prev_task_fair,
6397
681f3e68 6398#ifdef CONFIG_SMP
4ce72a2c 6399 .select_task_rq = select_task_rq_fair,
0a74bef8 6400 .migrate_task_rq = migrate_task_rq_fair,
141965c7 6401
0bcdcf28
CE
6402 .rq_online = rq_online_fair,
6403 .rq_offline = rq_offline_fair,
88ec22d3
PZ
6404
6405 .task_waking = task_waking_fair,
681f3e68 6406#endif
bf0f6f24 6407
83b699ed 6408 .set_curr_task = set_curr_task_fair,
bf0f6f24 6409 .task_tick = task_tick_fair,
cd29fe6f 6410 .task_fork = task_fork_fair,
cb469845
SR
6411
6412 .prio_changed = prio_changed_fair,
da7a735e 6413 .switched_from = switched_from_fair,
cb469845 6414 .switched_to = switched_to_fair,
810b3817 6415
0d721cea
PW
6416 .get_rr_interval = get_rr_interval_fair,
6417
810b3817 6418#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 6419 .task_move_group = task_move_group_fair,
810b3817 6420#endif
bf0f6f24
IM
6421};
6422
6423#ifdef CONFIG_SCHED_DEBUG
029632fb 6424void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 6425{
bf0f6f24
IM
6426 struct cfs_rq *cfs_rq;
6427
5973e5b9 6428 rcu_read_lock();
c3b64f1e 6429 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5cef9eca 6430 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 6431 rcu_read_unlock();
bf0f6f24
IM
6432}
6433#endif
029632fb
PZ
6434
6435__init void init_sched_fair_class(void)
6436{
6437#ifdef CONFIG_SMP
6438 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6439
3451d024 6440#ifdef CONFIG_NO_HZ_COMMON
554cecaf 6441 nohz.next_balance = jiffies;
029632fb 6442 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
71325960 6443 cpu_notifier(sched_ilb_notifier, 0);
029632fb
PZ
6444#endif
6445#endif /* SMP */
6446
6447}