/*
 * Performance counter core code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}

u64 __weak hw_perf_save_disable(void)		{ return 0; }
void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
				  struct perf_cpu_context *cpuctx,
				  struct perf_counter_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_counter_print_debug(void)	{ }
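
/*
 * Note: if an architecture does not override the weak hooks above,
 * hw_perf_counter_init() returns NULL and the PMU-control hooks are
 * no-ops, so only the software counters defined further down in this
 * file are usable on such an architecture.
 */
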
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *group_leader = counter->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling counter,
	 * add it straight to the context's counter list, or to the group
	 * leader's sibling list:
	 */
	if (counter->group_leader == counter)
		list_add_tail(&counter->list_entry, &ctx->counter_list);
	else
		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}

static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *sibling, *tmp;

	list_del_init(&counter->list_entry);

	/*
	 * If this was a group counter with sibling counters then
	 * upgrade the siblings to singleton counters by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp,
				 &counter->sibling_list, list_entry) {

		list_del_init(&sibling->list_entry);
		list_add_tail(&sibling->list_entry, &ctx->counter_list);
		sibling->group_leader = sibling;
	}
}

static void
counter_sched_out(struct perf_counter *counter,
		  struct perf_cpu_context *cpuctx,
		  struct perf_counter_context *ctx)
{
	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
		return;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	counter->hw_ops->disable(counter);
	counter->oncpu = -1;

	if (!is_software_counter(counter))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_counter *group_counter,
		struct perf_cpu_context *cpuctx,
		struct perf_counter_context *ctx)
{
	struct perf_counter *counter;

	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
		return;

	counter_sched_out(group_counter, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_out(counter, cpuctx, ctx);

	if (group_counter->hw_event.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	unsigned long flags;
	u64 perf_flags;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	counter_sched_out(counter, cpuctx, ctx);

	counter->task = NULL;
	ctx->nr_counters--;

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();
	list_del_counter(counter, ctx);
	hw_perf_restore(perf_flags);

	if (!ctx->task) {
		/*
		 * Allow more per task counters with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_counters - ctx->nr_counters,
			    perf_max_counters - perf_reserved_percpu);
	}

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex and ctx->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can remove the counter safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		ctx->nr_counters--;
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance counter
 */
static void __perf_counter_disable(void *info)
{
	struct perf_counter *counter = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = counter->ctx;
	unsigned long flags;

	/*
	 * If this is a per-task counter, need to check whether this
	 * counter's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	/*
	 * If the counter is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
		if (counter == counter->group_leader)
			group_sched_out(counter, cpuctx, ctx);
		else
			counter_sched_out(counter, cpuctx, ctx);
		counter->state = PERF_COUNTER_STATE_OFF;
	}

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

static void perf_counter_disable(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the counter on the cpu that it's on
		 */
		smp_call_function_single(counter->cpu, __perf_counter_disable,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_disable, counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the counter is still active, we need to retry the cross-call.
	 */
	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
		counter->state = PERF_COUNTER_STATE_OFF;

	spin_unlock_irq(&ctx->lock);
}

/*
 * Disable a counter and all its children.
 */
static void perf_counter_disable_family(struct perf_counter *counter)
{
	struct perf_counter *child;

	perf_counter_disable(counter);

	/*
	 * Lock the mutex to protect the list of children
	 */
	mutex_lock(&counter->mutex);
	list_for_each_entry(child, &counter->child_list, child_list)
		perf_counter_disable(child);
	mutex_unlock(&counter->mutex);
}

static int
counter_sched_in(struct perf_counter *counter,
		 struct perf_cpu_context *cpuctx,
		 struct perf_counter_context *ctx,
		 int cpu)
{
	if (counter->state <= PERF_COUNTER_STATE_OFF)
		return 0;

	counter->state = PERF_COUNTER_STATE_ACTIVE;
	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (counter->hw_ops->enable(counter)) {
		counter->state = PERF_COUNTER_STATE_INACTIVE;
		counter->oncpu = -1;
		return -EAGAIN;
	}

	if (!is_software_counter(counter))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (counter->hw_event.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

/*
 * Return 1 for a group consisting entirely of software counters,
 * 0 if the group contains any hardware counters.
 */
static int is_software_only_group(struct perf_counter *leader)
{
	struct perf_counter *counter;

	if (!is_software_counter(leader))
		return 0;
	list_for_each_entry(counter, &leader->sibling_list, list_entry)
		if (!is_software_counter(counter))
			return 0;
	return 1;
}

/*
 * Work out whether we can put this counter group on the CPU now.
 */
static int group_can_go_on(struct perf_counter *counter,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software counters can always go on.
	 */
	if (is_software_only_group(counter))
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * counters can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * counters on the CPU, it can't go on.
	 */
	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_counter *leader = counter->group_leader;
	int cpu = smp_processor_id();
	unsigned long flags;
	u64 perf_flags;
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();

	list_add_counter(counter, ctx);
	ctx->nr_counters++;

	/*
	 * Don't put the counter on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive counter can't go on if there are already active
	 * hardware counters, and no hardware counter can go on if there
	 * is already an exclusive counter on.
	 */
	if (!group_can_go_on(counter, cpuctx, 1))
		err = -EEXIST;
	else
		err = counter_sched_in(counter, cpuctx, ctx, cpu);

	if (err) {
		/*
		 * This counter couldn't go on. If it is in a group
		 * then we have to pull the whole group off.
		 * If the counter group is pinned then put it in error state.
		 */
		if (leader != counter)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->hw_event.pinned)
			leader->state = PERF_COUNTER_STATE_ERROR;
	}

	if (!err && !ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

unlock:
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added,
	 * we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can add the counter safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&counter->list_entry)) {
		list_add_counter(counter, ctx);
		ctx->nr_counters++;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance counter
 */
static void __perf_counter_enable(void *info)
{
	struct perf_counter *counter = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_counter *leader = counter->group_leader;
	unsigned long flags;
	int err;

	/*
	 * If this is a per-task counter, need to check whether this
	 * counter's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
		goto unlock;
	counter->state = PERF_COUNTER_STATE_INACTIVE;

	/*
	 * If the counter is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(counter, cpuctx, 1))
		err = -EEXIST;
	else
		err = counter_sched_in(counter, cpuctx, ctx,
				       smp_processor_id());

	if (err) {
		/*
		 * If this counter can't go on and it's part of a
		 * group, then the whole group has to come off.
		 */
		if (leader != counter)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->hw_event.pinned)
			leader->state = PERF_COUNTER_STATE_ERROR;
	}

unlock:
	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

static void perf_counter_enable(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Enable the counter on the cpu that it's on
		 */
		smp_call_function_single(counter->cpu, __perf_counter_enable,
					 counter, 1);
		return;
	}

	spin_lock_irq(&ctx->lock);
	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
		goto out;

	/*
	 * If the counter is in error state, clear that first.
	 * That way, if we see the counter in error state below, we
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
	if (counter->state == PERF_COUNTER_STATE_ERROR)
		counter->state = PERF_COUNTER_STATE_OFF;

retry:
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_counter_enable, counter);

	spin_lock_irq(&ctx->lock);

	/*
	 * If the context is active and the counter is still off,
	 * we need to retry the cross-call.
	 */
	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (counter->state == PERF_COUNTER_STATE_OFF)
		counter->state = PERF_COUNTER_STATE_INACTIVE;
out:
	spin_unlock_irq(&ctx->lock);
}

/*
 * Enable a counter and all its children.
 */
static void perf_counter_enable_family(struct perf_counter *counter)
{
	struct perf_counter *child;

	perf_counter_enable(counter);

	/*
	 * Lock the mutex to protect the list of children
	 */
	mutex_lock(&counter->mutex);
	list_for_each_entry(child, &counter->child_list, child_list)
		perf_counter_enable(child);
	mutex_unlock(&counter->mutex);
}

void __perf_counter_sched_out(struct perf_counter_context *ctx,
			      struct perf_cpu_context *cpuctx)
{
	struct perf_counter *counter;
	u64 flags;

	spin_lock(&ctx->lock);
	ctx->is_active = 0;
	if (likely(!ctx->nr_counters))
		goto out;

	flags = hw_perf_save_disable();
	if (ctx->nr_active) {
		list_for_each_entry(counter, &ctx->counter_list, list_entry)
			group_sched_out(counter, cpuctx, ctx);
	}
	hw_perf_restore(flags);
out:
	spin_unlock(&ctx->lock);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;

	if (likely(!cpuctx->task_ctx))
		return;

	__perf_counter_sched_out(ctx, cpuctx);

	cpuctx->task_ctx = NULL;
}

static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
}

static int
group_sched_in(struct perf_counter *group_counter,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx,
	       int cpu)
{
	struct perf_counter *counter, *partial_group;
	int ret;

	if (group_counter->state == PERF_COUNTER_STATE_OFF)
		return 0;

	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;

	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
			partial_group = counter;
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
		if (counter == partial_group)
			break;
		counter_sched_out(counter, cpuctx, ctx);
	}
	counter_sched_out(group_counter, cpuctx, ctx);

	return -EAGAIN;
}

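/*
 * Note: __perf_counter_sched_in() below makes two passes over the
 * context's counter list: pinned groups are scheduled first (and moved
 * to ERROR state if they cannot get onto the PMU), then the remaining
 * groups only fill whatever hardware room is left, tracked via the
 * can_add_hw flag.
 */
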
static void
__perf_counter_sched_in(struct perf_counter_context *ctx,
			struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_counter *counter;
	u64 flags;
	int can_add_hw = 1;

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_counters))
		goto out;

	flags = hw_perf_save_disable();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state <= PERF_COUNTER_STATE_OFF ||
		    !counter->hw_event.pinned)
			continue;
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		if (group_can_go_on(counter, cpuctx, 1))
			group_sched_in(counter, cpuctx, ctx, cpu);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
			counter->state = PERF_COUNTER_STATE_ERROR;
	}

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		/*
		 * Ignore counters in OFF or ERROR state, and
		 * ignore pinned counters since we did them already.
		 */
		if (counter->state <= PERF_COUNTER_STATE_OFF ||
		    counter->hw_event.pinned)
			continue;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of counters:
		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
			if (group_sched_in(counter, cpuctx, ctx, cpu))
				can_add_hw = 0;
		}
	}
	hw_perf_restore(flags);
out:
	spin_unlock(&ctx->lock);
}

/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;

	__perf_counter_sched_in(ctx, cpuctx, cpu);
	cpuctx->task_ctx = ctx;
}

static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_counter_context *ctx = &cpuctx->ctx;

	__perf_counter_sched_in(ctx, cpuctx, cpu);
}

int perf_counter_task_disable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	unsigned long flags;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	curr_rq_lock_irq_save(&flags);
	cpu = smp_processor_id();

	/* force the update of the task clock: */
	__task_delta_exec(curr, 1);

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Disable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state != PERF_COUNTER_STATE_ERROR)
			counter->state = PERF_COUNTER_STATE_OFF;
	}

	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	curr_rq_unlock_irq_restore(&flags);

	return 0;
}

int perf_counter_task_enable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	unsigned long flags;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	curr_rq_lock_irq_save(&flags);
	cpu = smp_processor_id();

	/* force the update of the task clock: */
	__task_delta_exec(curr, 1);

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Enable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state > PERF_COUNTER_STATE_OFF)
			continue;
		counter->state = PERF_COUNTER_STATE_INACTIVE;
		counter->hw_event.disabled = 0;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);

	curr_rq_unlock_irq_restore(&flags);

	return 0;
}

/*
 * Round-robin a context's counters:
 */
static void rotate_ctx(struct perf_counter_context *ctx)
{
	struct perf_counter *counter;
	u64 perf_flags;

	if (!ctx->nr_counters)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Rotate the first entry last (works just fine for group counters too):
	 */
	perf_flags = hw_perf_save_disable();
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		list_del(&counter->list_entry);
		list_add_tail(&counter->list_entry, &ctx->counter_list);
		break;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);
}

void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	const int rotate_percpu = 0;

	if (rotate_percpu)
		perf_counter_cpu_sched_out(cpuctx);
	perf_counter_task_sched_out(curr, cpu);

	if (rotate_percpu)
		rotate_ctx(&cpuctx->ctx);
	rotate_ctx(ctx);

	if (rotate_percpu)
		perf_counter_cpu_sched_in(cpuctx, cpu);
	perf_counter_task_sched_in(curr, cpu);
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __read(void *info)
{
	struct perf_counter *counter = info;
	unsigned long flags;

	curr_rq_lock_irq_save(&flags);
	counter->hw_ops->read(counter);
	curr_rq_unlock_irq_restore(&flags);
}

static u64 perf_counter_read(struct perf_counter *counter)
{
	/*
	 * If counter is enabled and currently active on a CPU, update the
	 * value in the counter structure:
	 */
	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		smp_call_function_single(counter->oncpu,
					 __read, counter, 1);
	}

	return atomic64_read(&counter->count);
}

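/*
 * Each counter carries two perf_data buffers: IRQ/NMI context appends
 * records to ->irqdata while readers drain ->usrdata; the switch
 * helpers below swap the two pointers so a reader can consume what the
 * IRQ side has produced so far.
 */
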
/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task) {
		if (cpuctx->task_ctx != ctx)
			return;
		spin_lock(&ctx->lock);
	}

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	if (ctx->task)
		spin_unlock(&ctx->lock);
}

static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;
	struct task_struct *task = ctx->task;

	if (!task) {
		smp_call_function_single(counter->cpu,
					 __perf_switch_irq_data,
					 counter, 1);
		return counter->usrdata;
	}

retry:
	spin_lock_irq(&ctx->lock);
	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
		counter->irqdata = counter->usrdata;
		counter->usrdata = oldirqdata;
		spin_unlock_irq(&ctx->lock);
		return oldirqdata;
	}
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
	/* Might have failed, because task was scheduled out */
	if (counter->irqdata == oldirqdata)
		goto retry;

	return counter->usrdata;
}

static void put_context(struct perf_counter_context *ctx)
{
	if (ctx->task)
		put_task_struct(ctx->task);
}

static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct task_struct *task;

	/*
	 * If cpu is not a wildcard then this is a percpu counter:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU counter: */
		if (!capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow to attach a counter to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	ctx = &task->perf_counter_ctx;

	/* Reuse ptrace permission checks for now. */
	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		put_context(ctx);
		return ERR_PTR(-EACCES);
	}

	return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_counter *counter = file->private_data;
	struct perf_counter_context *ctx = counter->ctx;

	file->private_data = NULL;

	mutex_lock(&ctx->mutex);
	mutex_lock(&counter->mutex);

	perf_counter_remove_from_context(counter);
	put_context(ctx);

	mutex_unlock(&counter->mutex);
	mutex_unlock(&ctx->mutex);

	kfree(counter);

	return 0;
}

/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
	u64 cntval;

	if (count != sizeof(cntval))
		return -EINVAL;

	/*
	 * Return end-of-file for a read on a counter that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
	if (counter->state == PERF_COUNTER_STATE_ERROR)
		return 0;

	mutex_lock(&counter->mutex);
	cntval = perf_counter_read(counter);
	mutex_unlock(&counter->mutex);

	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
	if (!usrdata->len)
		return 0;

	count = min(count, (size_t)usrdata->len);
	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
		return -EFAULT;

	/* Adjust the counters */
	usrdata->len -= count;
	if (!usrdata->len)
		usrdata->rd_idx = 0;
	else
		usrdata->rd_idx += count;

	return count;
}

static ssize_t
perf_read_irq_data(struct perf_counter	*counter,
		   char __user		*buf,
		   size_t		count,
		   int			nonblocking)
{
	struct perf_data *irqdata, *usrdata;
	DECLARE_WAITQUEUE(wait, current);
	ssize_t res, res2;

	irqdata = counter->irqdata;
	usrdata = counter->usrdata;

	if (usrdata->len + irqdata->len >= count)
		goto read_pending;

	if (nonblocking)
		return -EAGAIN;

	spin_lock_irq(&counter->waitq.lock);
	__add_wait_queue(&counter->waitq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (usrdata->len + irqdata->len >= count)
			break;

		if (signal_pending(current))
			break;

		if (counter->state == PERF_COUNTER_STATE_ERROR)
			break;

		spin_unlock_irq(&counter->waitq.lock);
		schedule();
		spin_lock_irq(&counter->waitq.lock);
	}
	__remove_wait_queue(&counter->waitq, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&counter->waitq.lock);

	if (usrdata->len + irqdata->len < count &&
	    counter->state != PERF_COUNTER_STATE_ERROR)
		return -ERESTARTSYS;
read_pending:
	mutex_lock(&counter->mutex);

	/* Drain pending data first: */
	res = perf_copy_usrdata(usrdata, buf, count);
	if (res < 0 || res == count)
		goto out;

	/* Switch irq buffer: */
	usrdata = perf_switch_irq_data(counter);
	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
	if (res2 < 0) {
		if (!res)
			res = -EFAULT;
	} else {
		res += res2;
	}
out:
	mutex_unlock(&counter->mutex);

	return res;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_counter *counter = file->private_data;

	switch (counter->hw_event.record_type) {
	case PERF_RECORD_SIMPLE:
		return perf_read_hw(counter, buf, count);

	case PERF_RECORD_IRQ:
	case PERF_RECORD_GROUP:
		return perf_read_irq_data(counter, buf, count,
					  file->f_flags & O_NONBLOCK);
	}
	return -EINVAL;
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_counter *counter = file->private_data;
	unsigned int events = 0;
	unsigned long flags;

	poll_wait(file, &counter->waitq, wait);

	spin_lock_irqsave(&counter->waitq.lock, flags);
	if (counter->usrdata->len || counter->irqdata->len)
		events |= POLLIN;
	spin_unlock_irqrestore(&counter->waitq.lock, flags);

	return events;
}

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct perf_counter *counter = file->private_data;
	int err = 0;

	switch (cmd) {
	case PERF_COUNTER_IOC_ENABLE:
		perf_counter_enable_family(counter);
		break;
	case PERF_COUNTER_IOC_DISABLE:
		perf_counter_disable_family(counter);
		break;
	default:
		err = -ENOTTY;
	}
	return err;
}

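/*
 * Example (userspace sketch, not part of this file): once a counter fd
 * has been obtained from sys_perf_counter_open(), it and all of its
 * inherited children can be toggled with:
 *
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
 */
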
static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
};

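/*
 * The software counter implementations below all follow the same
 * pattern: ->enable() snapshots a base value into hw.prev_count, and
 * every subsequent update computes the delta against that snapshot and
 * accumulates it into counter->count.
 */
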
static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();

	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
	return 0;
}

static void cpu_clock_perf_counter_update(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_read(&counter->hw.prev_count);
	atomic64_set(&counter->hw.prev_count, now);
	atomic64_add(now - prev, &counter->count);
}

static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}

static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_counter_enable,
	.disable	= cpu_clock_perf_counter_disable,
	.read		= cpu_clock_perf_counter_read,
};

/*
 * Called from within the scheduler:
 */
static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
{
	struct task_struct *curr = counter->task;
	u64 delta;

	delta = __task_delta_exec(curr, update);

	return curr->se.sum_exec_runtime + delta;
}

static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void task_clock_perf_counter_read(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 1);

	task_clock_perf_counter_update(counter, now);
}

static int task_clock_perf_counter_enable(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 0);

	atomic64_set(&counter->hw.prev_count, now);

	return 0;
}

static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 0);

	task_clock_perf_counter_update(counter, now);
}

static const struct hw_perf_counter_ops perf_ops_task_clock = {
	.enable		= task_clock_perf_counter_enable,
	.disable	= task_clock_perf_counter_disable,
	.read		= task_clock_perf_counter_read,
};

static u64 get_page_faults(void)
{
	struct task_struct *curr = current;

	return curr->maj_flt + curr->min_flt;
}

static void page_faults_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_page_faults();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void page_faults_perf_counter_read(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static int page_faults_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * page-faults is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void page_faults_perf_counter_disable(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_page_faults = {
	.enable		= page_faults_perf_counter_enable,
	.disable	= page_faults_perf_counter_disable,
	.read		= page_faults_perf_counter_read,
};

static u64 get_context_switches(void)
{
	struct task_struct *curr = current;

	return curr->nvcsw + curr->nivcsw;
}

static void context_switches_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_context_switches();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void context_switches_perf_counter_read(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * ->nvcsw + curr->nivcsw is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_context_switches = {
	.enable		= context_switches_perf_counter_enable,
	.disable	= context_switches_perf_counter_disable,
	.read		= context_switches_perf_counter_read,
};

static inline u64 get_cpu_migrations(void)
{
	return current->se.nr_migrations;
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * se.nr_migrations is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};

static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	const struct hw_perf_counter_ops *hw_ops = NULL;

	switch (counter->hw_event.type) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;
		break;
	case PERF_COUNT_TASK_CLOCK:
		hw_ops = &perf_ops_task_clock;
		break;
	case PERF_COUNT_PAGE_FAULTS:
		hw_ops = &perf_ops_page_faults;
		break;
	case PERF_COUNT_CONTEXT_SWITCHES:
		hw_ops = &perf_ops_context_switches;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		hw_ops = &perf_ops_cpu_migrations;
		break;
	default:
		break;
	}
	return hw_ops;
}

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	INIT_LIST_HEAD(&counter->child_list);

	counter->irqdata		= &counter->data[0];
	counter->usrdata		= &counter->data[1];
	counter->cpu			= cpu;
	counter->hw_event		= *hw_event;
	counter->wakeup_pending		= 0;
	counter->group_leader		= group_leader;
	counter->hw_ops			= NULL;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_ops = NULL;
	if (!hw_event->raw && hw_event->type < 0)
		hw_ops = sw_perf_counter_init(counter);
	else
		hw_ops = hw_perf_counter_init(counter);

	if (!hw_ops) {
		kfree(counter);
		return NULL;
	}
	counter->hw_ops = hw_ops;

	return counter;
}

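/*
 * Example (userspace sketch, not part of this file): open a task-clock
 * software counter for the calling task on any CPU, without a group
 * leader, then read its current value. This assumes pid 0 denotes the
 * current task and that cpu/group_fd of -1 mean "any CPU" / "no group":
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type	= PERF_COUNT_TASK_CLOCK,
 *	};
 *	u64 count;
 *	int fd = sys_perf_counter_open(&hw_event, 0, -1, -1);
 *
 *	read(fd, &count, sizeof(count));
 */
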
/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int
sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
		      pid_t pid, int cpu, int group_fd)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	ret = -EINVAL;
	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
	if (!counter)
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	mutex_unlock(&ctx->mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->counter_list);
	ctx->task = task;
}

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
		struct task_struct *parent,
		struct perf_counter_context *parent_ctx,
		struct task_struct *child,
		struct perf_counter *group_leader,
		struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, group_leader,
					   GFP_KERNEL);
	if (!child_counter)
		return NULL;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->ctx = child_ctx;
	child_counter->task = child;
	list_add_counter(child_counter, child_ctx);
	child_ctx->nr_counters++;

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	mutex_lock(&parent_counter->mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit. We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	mutex_unlock(&parent_counter->mutex);

	return child_counter;
}

static int inherit_group(struct perf_counter *parent_counter,
			 struct task_struct *parent,
			 struct perf_counter_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (!leader)
		return -ENOMEM;
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		if (!inherit_counter(sub, parent, parent_ctx,
				     child, leader, child_ctx))
			return -ENOMEM;
	}
	return 0;
}

static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 parent_val, child_val;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);

	/*
	 * Remove this counter from the parent's list
	 */
	mutex_lock(&parent_counter->mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}

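/*
 * Counter inheritance thus works in two halves: inherit_counter()
 * (run when the child context is set up) creates a child counter that
 * points back at the parent via ->parent and pins the parent's filp,
 * while sync_child_counter() (run at child exit) folds the child's
 * count back into the parent and drops that reference again.
 */
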
static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	struct perf_counter *sub, *tmp;

	/*
	 * If we do not self-reap then we have to wait for the
	 * child task to unschedule (it will happen for sure),
	 * so that its counter is at its final count. (This
	 * condition triggers rarely - child tasks usually get
	 * off their CPU before the parent has a chance to
	 * get this far into the reaping action)
	 */
	if (child != current) {
		wait_task_inactive(child, 0);
		list_del_init(&child_counter->list_entry);
	} else {
		struct perf_cpu_context *cpuctx;
		unsigned long flags;
		u64 perf_flags;

		/*
		 * Disable and unlink this counter.
		 *
		 * Be careful about zapping the list - IRQ/NMI context
		 * could still be processing it:
		 */
		curr_rq_lock_irq_save(&flags);
		perf_flags = hw_perf_save_disable();

		cpuctx = &__get_cpu_var(perf_cpu_context);

		group_sched_out(child_counter, cpuctx, child_ctx);

		list_del_init(&child_counter->list_entry);

		child_ctx->nr_counters--;

		hw_perf_restore(perf_flags);
		curr_rq_unlock_irq_restore(&flags);
	}

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
					 list_entry) {
			if (sub->parent) {
				sync_child_counter(sub, sub->parent);
				kfree(sub);
			}
		}
	}

	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
		kfree(child_counter);
}

/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we may be running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;

	child_ctx  = &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit)
			continue;

		if (inherit_group(counter, parent,
				  parent_ctx, child, child_ctx))
			break;
	}

	mutex_unlock(&parent_ctx->mutex);
}

static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);
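
/*
 * With the attribute group registered against cpu_sysdev_class above,
 * the two knobs are expected to appear as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit (the exact path
 * depends on where the cpu sysdev class is exposed in sysfs).
 */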