#ifdef CONFIG_SCHEDSTATS
/*
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 14

static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);

	if (mask_str == NULL)
		return -ENOMEM;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int dcount = 0;
#endif

		/* runqueue-specific stats */
		seq_printf(seq,
			   "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
			   cpu, rq->yld_both_empty,
			   rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
			   rq->sched_switch, rq->sched_count, rq->sched_goidle,
			   rq->ttwu_count, rq->ttwu_local,
			   rq->rq_cpu_time,
			   rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/* domain-specific stats */
		preempt_disable();
		for_each_domain(cpu, sd) {
			enum cpu_idle_type itype;

			cpumask_scnprintf(mask_str, mask_len,
					  sched_domain_span(sd));
			seq_printf(seq, "domain%d %s", dcount++, mask_str);
			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
			     itype++) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
					   sd->lb_count[itype],
					   sd->lb_balanced[itype],
					   sd->lb_failed[itype],
					   sd->lb_imbalance[itype],
					   sd->lb_gained[itype],
					   sd->lb_hot_gained[itype],
					   sd->lb_nobusyq[itype],
					   sd->lb_nobusyg[itype]);
			}
			seq_printf(seq,
				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
				   sd->alb_count, sd->alb_failed, sd->alb_pushed,
				   sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
				   sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
				   sd->ttwu_wake_remote, sd->ttwu_move_affine,
				   sd->ttwu_move_balance);
		}
		preempt_enable();
#endif
	}
	kfree(mask_str);
	return 0;
}
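
/*
 * Illustrative layout of the resulting /proc/schedstat output (field names
 * shown in place of actual counter values; derived from the format strings
 * above, not sampled from a running kernel):
 *
 *	version 14
 *	timestamp <jiffies>
 *	cpu<N> <yld_both_empty> <yld_act_empty> <yld_exp_empty> <yld_count>
 *	       <sched_switch> <sched_count> <sched_goidle> <ttwu_count>
 *	       <ttwu_local> <rq_cpu_time> <run_delay> <pcount>
 *	domain<N> <cpumask> ...eight lb_* counters per cpu_idle_type, then
 *	          the alb_*, sbe_*, sbf_* and ttwu_* domain counters...
 *
 * one "cpu" line per online cpu, followed (on SMP) by one "domain" line
 * per sched_domain attached to that cpu.
 */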

static int schedstat_open(struct inode *inode, struct file *file)
{
	/*
	 * Preallocate a seq_file buffer big enough for the whole report:
	 * roughly one extra page per 32 online cpus, so the output does
	 * not overflow the default single page buffer on large machines.
	 */
	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
	char *buf = kmalloc(size, GFP_KERNEL);
	struct seq_file *m;
	int res;

	if (!buf)
		return -ENOMEM;
	res = single_open(file, show_schedstat, NULL);
	if (!res) {
		m = file->private_data;
		m->buf = buf;
		m->size = size;
	} else
		kfree(buf);
	return res;
}
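
/*
 * Userspace consumes this with a plain read of /proc/schedstat (e.g.
 * "cat /proc/schedstat"); per the SCHEDSTAT_VERSION comment above, tools
 * should check the leading "version" line before parsing the rest.
 */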

static const struct file_operations proc_schedstat_operations = {
	.open    = schedstat_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	return 0;
}
module_init(proc_schedstat_init);

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
	if (rq) {
		rq->rq_sched_info.run_delay += delta;
		rq->rq_sched_info.pcount++;
	}
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_cpu_time += delta;
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)	do { var = (val); } while (0)
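
/*
 * Typical usage at the call sites (illustrative; the real callers live in
 * the scheduler proper, e.g. sched.c):
 *
 *	schedstat_inc(rq, yld_count);
 *	schedstat_add(sd, lb_imbalance[idle], imbalance);
 *
 * All three macros compile away entirely when CONFIG_SCHEDSTATS is off,
 * via the empty definitions below.
 */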
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field)	do { } while (0)
# define schedstat_add(rq, field, amt)	do { } while (0)
# define schedstat_set(var, val)	do { } while (0)
#endif

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
	t->sched_info.last_queued = 0;
}

/*
 * Called when a process is dequeued from the active array and given
 * the cpu. We should note that with the exception of interactive
 * tasks, the expired queue will become the active queue after the active
 * queue is empty, without explicitly dequeuing and requeuing tasks in the
 * expired queue. (Interactive tasks may be requeued directly to the
 * active queue, thus delaying tasks in the expired queue from running;
 * see scheduler_tick()).
 *
 * Though we are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu, we call this routine
 * from dequeue_task() to account for possible rq->clock skew across cpus. The
 * delta taken on each cpu would annul the skew.
 */
static inline void sched_info_dequeued(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (unlikely(sched_info_on()))
		if (t->sched_info.last_queued)
			delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;

	rq_sched_info_dequeued(task_rq(t), delta);
}

/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)
		delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;

	rq_sched_info_arrive(task_rq(t), delta);
}
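
/*
 * A sketch of the resulting run_delay bookkeeping (clock values are made
 * up for illustration):
 *
 *	clock 100: task queued    -> last_queued = 100
 *	clock 130: task dequeued  -> run_delay += 30, last_queued reset
 *	clock 150: task requeued  -> last_queued = 150
 *	clock 170: task gets cpu  -> run_delay += 20, last_arrival = 170
 *
 * run_delay thus accumulates only the time spent runnable but waiting,
 * with each delta taken against the clock of the cpu doing the update.
 */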

/*
 * Called when a process is queued into either the active or expired
 * array. The time is noted and later used to determine how long the
 * task had to wait to reach the cpu. Since the expired queue will
 * become the active queue after the active queue is empty, without
 * dequeuing and requeuing any tasks, we are interested in queuing to
 * either. It is unusual but not impossible for tasks to be dequeued
 * and immediately requeued in the same or another array: this can
 * happen in sched_yield(), set_user_nice(), and even load_balance()
 * as it moves tasks from runqueue to runqueue.
 *
 * This function is only called from enqueue_task(), and it only updates
 * the timestamp if it is not already set. It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = task_rq(t)->clock;
}

/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily. Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

	rq_sched_info_depart(task_rq(t), delta);

	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu. It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)
		sched_info_depart(prev);

	if (next != rq->idle)
		sched_info_arrive(next);
}
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(prev, next);
}
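
/*
 * Note: sched_info_switch() is driven from the context-switch path in the
 * scheduler core (with the runqueue lock held), so the depart/arrive pair
 * above runs once per actual switch.
 */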
#else
#define sched_info_queued(t)		do { } while (0)
#define sched_info_reset_dequeued(t)	do { } while (0)
#define sched_info_dequeued(t)		do { } while (0)
#define sched_info_switch(t, next)	do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

/*
 * The following are functions that support scheduler-internal time accounting.
 * These functions are generally called at the timer tick. None of this depends
 * on CONFIG_SCHEDSTATS.
 */

/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the utime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the utime field there.
 */
static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct thread_group_cputimer *cputimer;

	/* tsk == current, ensure it is safe to use ->signal */
	if (unlikely(tsk->exit_state))
		return;

	cputimer = &tsk->signal->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.utime =
		cputime_add(cputimer->cputime.utime, cputime);
	spin_unlock(&cputimer->lock);
}

/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the stime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the stime field there.
 */
static inline void account_group_system_time(struct task_struct *tsk,
					     cputime_t cputime)
{
	struct thread_group_cputimer *cputimer;

	/* tsk == current, ensure it is safe to use ->signal */
	if (unlikely(tsk->exit_state))
		return;

	cputimer = &tsk->signal->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.stime =
		cputime_add(cputimer->cputime.stime, cputime);
	spin_unlock(&cputimer->lock);
}

/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @ns:		Time value by which to increment the sum_exec_runtime field
 *		of the thread_group_cputime structure.
 *
 * If thread group time is being maintained, get the structure for the
 * running CPU and update the sum_exec_runtime field there.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
					      unsigned long long ns)
{
	struct thread_group_cputimer *cputimer;
	struct signal_struct *sig;

	sig = tsk->signal;
	/* see __exit_signal()->task_rq_unlock_wait() */
	barrier();
	if (unlikely(!sig))
		return;

	cputimer = &sig->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.sum_exec_runtime += ns;
	spin_unlock(&cputimer->lock);
}
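
/*
 * For reference, the expected call sites (a sketch, not verbatim kernel
 * code): the timer tick feeds account_group_user_time() and
 * account_group_system_time() through account_user_time() and
 * account_system_time(), while the scheduler's update_curr() path accounts
 * runtime roughly as
 *
 *	curr->sum_exec_runtime += delta_exec;
 *	account_group_exec_runtime(task_of(curr), delta_exec);
 *
 * All three helpers are cheap no-ops unless a process-wide POSIX CPU timer
 * has set cputimer->running for the thread group.
 */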