#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get the old
 * or the new value, with a side effect of accounting a slice of irq time
 * to the wrong task when an irq is in progress while we read rq->clock.
 * That is a worthy compromise in place of having locks on each irq in
 * account_system_time.
 */
DEFINE_PER_CPU(u64, cpu_hardirq_time);
DEFINE_PER_CPU(u64, cpu_softirq_time);

static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}
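
/*
 * Usage sketch (illustrative; exact call sites vary by arch and kernel
 * version): architecture code is expected to flip this switch once it
 * trusts sched_clock() to be fast and monotonic enough for irq time
 * accounting, e.g. something like:
 *
 *	if (sched_clock_stable())
 *		enable_sched_clock_irqtime();
 *
 * sched_clock_stable() is used here only as a plausible gate; x86 does
 * this from its TSC setup code.
 */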

#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* CONFIG_64BIT */

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	unsigned long flags;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to the ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special task
	 * that does not consume any time, but still wants to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
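
/*
 * Timing sketch (illustrative): because this hook runs on both irq entry
 * and exit, each call accounts the delta since the previous call. On
 * irq_enter() the delta covers the interrupted context (task or softirq)
 * and is tagged per the still-unmodified preempt counts; on irq_exit()
 * hardirq_count() is still elevated, so the delta covers the hardirq
 * itself and lands in cpu_hardirq_time.
 */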

static int irqtime_account_hi_update(void)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_hardirq_time);
	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
}

static int irqtime_account_si_update(void)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_softirq_time);
	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cpuacct_account_field(p, index, tmp);
}
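
/*
 * Illustrative note: the percpu kernel_cpustat update feeds the
 * system-wide counters behind /proc/stat, while cpuacct_account_field()
 * mirrors the same delta into the task's cpuacct cgroup hierarchy, so
 * per-cgroup usage stays consistent with the global counters.
 */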

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		cpustat[CPUTIME_NICE] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
	} else {
		cpustat[CPUTIME_USER] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
	}
}

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @index: index of the cpustat field that has to be updated
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
			   cputime_t cputime_scaled, int index)
{
	/* Add system time to process. */
	p->stime += cputime;
	p->stimescaled += cputime_scaled;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	__account_system_time(p, cputime, cputime_scaled, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
	else
		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
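
/*
 * Illustrative: rq->nr_iowait counts tasks that went to sleep on this
 * runqueue with in_iowait set (see io_schedule()), so idle ticks that
 * elapse while at least one such sleeper exists are charged to iowait
 * rather than plain idle. This is what makes the "wa" column in
 * top/vmstat differ from "id".
 */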

static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;
		unsigned long steal_jiffies;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		/*
		 * steal is in nsecs but our caller is expecting steal
		 * time in jiffies. Let's cast the result to jiffies
		 * granularity and account the rest on the next rounds.
		 */
		steal_jiffies = nsecs_to_jiffies(steal);
		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);

		account_steal_time(jiffies_to_cputime(steal_jiffies));
		return steal_jiffies;
	}
#endif
	return false;
}
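
/*
 * Worked example (illustrative, HZ=250 so one jiffy is 4ms): if the
 * hypervisor reports 9ms of new steal since prev_steal_time,
 * nsecs_to_jiffies(9000000) yields 2 jiffies; prev_steal_time advances
 * by jiffies_to_nsecs(2) = 8ms and the leftover 1ms stays pending, to
 * be folded into a later tick.
 */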

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	cputime_t utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += task_sched_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}
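
/*
 * Note (illustrative): the totals are re-initialized from sig->*time at
 * the top of every pass, so a seqretry cannot double-count threads; the
 * second pass (nextseq = 1) makes read_seqbegin_or_lock_irqsave() take
 * stats_lock exclusively, guaranteeing forward progress under heavy
 * update traffic.
 */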

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 * @ticks: number of ticks to account
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on
 * irq/softirq, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq, int ticks)
{
	cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
	u64 cputime = (__force u64) cputime_one_jiffy;
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	if (steal_account_process_tick())
		return;

	cputime *= ticks;
	scaled *= ticks;

	if (irqtime_account_hi_update()) {
		cpustat[CPUTIME_IRQ] += cputime;
	} else if (irqtime_account_si_update()) {
		cpustat[CPUTIME_SOFTIRQ] += cputime;
	} else if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime, scaled);
	} else if (p == rq->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime, scaled);
	} else {
		__account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
	}
}
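
/*
 * Example walk-through (illustrative): a tick that lands while this CPU
 * has unaccounted hardirq time is folded into CPUTIME_IRQ regardless of
 * which task is current; only when both the irq and softirq deltas are
 * already up to date does the tick fall through to the usual
 * user/idle/guest/system demultiplexing.
 */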

static void irqtime_account_idle_ticks(int ticks)
{
	struct rq *rq = this_rq();

	irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING

#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	vtime_account_user(prev);
#endif
	arch_vtime_task_switch(prev);
}
#endif

/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * have another meaning of the idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_common_account_irq_enter(struct task_struct *tsk)
{
	if (!in_interrupt()) {
		/*
		 * If we interrupted user, context_tracking_in_user()
		 * is 1 because the context tracking doesn't hook on
		 * irq entry/exit. This way we know if we need to flush
		 * user time on kernel entry.
		 */
		if (context_tracking_in_user()) {
			vtime_account_user(tsk);
			return;
		}

		if (is_idle_task(tsk)) {
			vtime_account_idle(tsk);
			return;
		}
	}
	vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	struct rq *rq = this_rq();

	if (vtime_accounting_cpu_enabled())
		return;

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq, 1);
		return;
	}

	if (steal_account_process_tick())
		return;

	if (user_tick)
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
				    one_jiffy_scaled);
	else
		account_idle_time(cputime_one_jiffy);
}

/*
 * Account multiple ticks of steal time.
 * @ticks: number of stolen ticks
 */
void account_steal_ticks(unsigned long ticks)
{
	account_steal_time(jiffies_to_cputime(ticks));
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of idle ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	account_idle_time(jiffies_to_cputime(ticks));
}

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow
 * by losing precision when the numbers are big.
 */
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return (__force cputime_t) scaled;
}
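
/*
 * Why the loop preserves the result (illustrative): the balancing step
 * (stime <<= 1, rtime >>= 1) keeps the product stime * rtime unchanged,
 * and the drop_precision step halves rtime and total together, keeping
 * the rtime/total ratio. Worked example: stime = 3, rtime = 2^33,
 * total = 2^33. Two drop_precision passes leave rtime = 2^31 and
 * total = 2^31, and the final 32x32 multiply yields 3 * 2^31 / 2^31 = 3,
 * exactly stime * rtime / total.
 */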

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on whether the random scheduling
 * timeslices of a task happen to be interrupted by the timer or not.
 * Depending on these circumstances, the number of these interrupts may
 * be over- or under-estimated, matching the real user and system cputime
 * with a variable precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct prev_cputime *prev,
			   cputime_t *ut, cputime_t *st)
{
	cputime_t rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	stime = scale_stime((__force u64)stime, (__force u64)rtime,
			    (__force u64)(stime + utime));

	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

update:
	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}
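
/*
 * Worked example (illustrative): with prev = {stime 6, utime 3} and a
 * new sample of rtime = 10 whose tick ratio scales to stime = 5, the
 * first clamp lifts stime back to 6, giving utime = 10 - 6 = 4. Both
 * fields moved forward and stime + utime still equals rtime.
 */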

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static cputime_t vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_before(now, (unsigned long)tsk->vtime_snap))
		return 0;

	return jiffies_to_cputime(now - tsk->vtime_snap);
}

static cputime_t get_vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);
	unsigned long delta = now - tsk->vtime_snap;

	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
	tsk->vtime_snap = now;

	return jiffies_to_cputime(delta);
}
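
/*
 * Note (illustrative): vtime_delta() is the read-only peek used by
 * lockless readers and by writers to skip empty updates, while
 * get_vtime_delta() consumes the delta by advancing vtime_snap. Both
 * work at jiffies granularity; READ_ONCE(jiffies) is needed because
 * jiffies is advanced by the timekeeping CPU and may be read here from
 * another CPU between updates.
 */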

static void __vtime_account_system(struct task_struct *tsk)
{
	cputime_t delta_cpu = get_vtime_delta(tsk);

	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
}

void vtime_account_system(struct task_struct *tsk)
{
	if (!vtime_delta(tsk))
		return;

	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	if (context_tracking_in_user())
		tsk->vtime_snap_whence = VTIME_USER;
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_account_user(struct task_struct *tsk)
{
	cputime_t delta_cpu;

	write_seqcount_begin(&tsk->vtime_seqcount);
	tsk->vtime_snap_whence = VTIME_SYS;
	if (vtime_delta(tsk)) {
		delta_cpu = get_vtime_delta(tsk);
		account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
	}
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	tsk->vtime_snap_whence = VTIME_USER;
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	/*
	 * The flags must be updated under the lock, together with the
	 * vtime_snap flush and update. That enforces the right ordering
	 * and update-sequence synchronization against the reader
	 * (task_gtime()), which can thus safely catch up with a tickless
	 * delta.
	 */
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	current->flags |= PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	current->flags &= ~PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
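
/*
 * Illustrative: while PF_VCPU is set, account_system_time() (see above)
 * diverts the accumulated delta to account_guest_time(), so guest CPU
 * time shows up in p->gtime and CPUTIME_GUEST rather than as plain
 * system time; virtualization code (e.g. KVM) brackets guest entry and
 * exit with these helpers.
 */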

void vtime_account_idle(struct task_struct *tsk)
{
	cputime_t delta_cpu = get_vtime_delta(tsk);

	account_idle_time(delta_cpu);
}

void arch_vtime_task_switch(struct task_struct *prev)
{
	write_seqcount_begin(&prev->vtime_seqcount);
	prev->vtime_snap_whence = VTIME_INACTIVE;
	write_seqcount_end(&prev->vtime_seqcount);

	write_seqcount_begin(&current->vtime_seqcount);
	current->vtime_snap_whence = VTIME_SYS;
	current->vtime_snap = jiffies;
	write_seqcount_end(&current->vtime_seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&t->vtime_seqcount);
	t->vtime_snap_whence = VTIME_SYS;
	t->vtime_snap = jiffies;
	write_seqcount_end(&t->vtime_seqcount);
	local_irq_restore(flags);
}

cputime_t task_gtime(struct task_struct *t)
{
	unsigned int seq;
	cputime_t gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		gtime = t->gtime;
		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
			gtime += vtime_delta(t);

	} while (read_seqcount_retry(&t->vtime_seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
static void
fetch_task_cputime(struct task_struct *t,
		   cputime_t *u_dst, cputime_t *s_dst,
		   cputime_t *u_src, cputime_t *s_src,
		   cputime_t *udelta, cputime_t *sdelta)
{
	unsigned int seq;
	unsigned long long delta;

	do {
		*udelta = 0;
		*sdelta = 0;

		seq = read_seqcount_begin(&t->vtime_seqcount);

		if (u_dst)
			*u_dst = *u_src;
		if (s_dst)
			*s_dst = *s_src;

		/* Task is sleeping, nothing to add */
		if (t->vtime_snap_whence == VTIME_INACTIVE ||
		    is_idle_task(t))
			continue;

		delta = vtime_delta(t);

		/*
		 * Task runs either in user or kernel space, add pending nohz
		 * time to the right place.
		 */
		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
			*udelta = delta;
		} else {
			if (t->vtime_snap_whence == VTIME_SYS)
				*sdelta = delta;
		}
	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
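
/*
 * Note (illustrative): *udelta and *sdelta are reset at the top of every
 * pass, so a seqcount retry can never leave a stale delta behind; and
 * because this is a do-while loop, the `continue` taken for sleeping
 * tasks still reaches the read_seqcount_retry() check at the bottom
 * rather than skipping it.
 */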

void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
	cputime_t udelta, sdelta;

	if (!vtime_accounting_enabled()) {
		if (utime)
			*utime = t->utime;
		if (stime)
			*stime = t->stime;
		return;
	}

	fetch_task_cputime(t, utime, stime, &t->utime,
			   &t->stime, &udelta, &sdelta);
	if (utime)
		*utime += udelta;
	if (stime)
		*stime += sdelta;
}

void task_cputime_scaled(struct task_struct *t,
			 cputime_t *utimescaled, cputime_t *stimescaled)
{
	cputime_t udelta, sdelta;

	if (!vtime_accounting_enabled()) {
		if (utimescaled)
			*utimescaled = t->utimescaled;
		if (stimescaled)
			*stimescaled = t->stimescaled;
		return;
	}

	fetch_task_cputime(t, utimescaled, stimescaled,
			   &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
	if (utimescaled)
		*utimescaled += cputime_to_scaled(udelta);
	if (stimescaled)
		*stimescaled += cputime_to_scaled(sdelta);
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */