/*
 * Performance counter core code
 *
 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/vmstat.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_counter.h>
#include <linux/dcache.h>

#include <asm/irq_regs.h>
/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_counters __read_mostly;
static atomic_t nr_mmap_tracking __read_mostly;
static atomic_t nr_munmap_tracking __read_mostly;
static atomic_t nr_comm_tracking __read_mostly;

int sysctl_perf_counter_priv __read_mostly;		/* do we need to be privileged */
int sysctl_perf_counter_mlock __read_mostly = 512;	/* 'free' kb per user */
int sysctl_perf_counter_limit __read_mostly = 100000;	/* max NMIs per second */

/*
 * Lock for (sysadmin-configurable) counter reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);
/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
{
        return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx, int cpu)
{
        return 0;
}

void __weak perf_counter_print_debug(void)	{ }

static DEFINE_PER_CPU(int, disable_count);

void __perf_disable(void)
{
        __get_cpu_var(disable_count)++;
}

bool __perf_enable(void)
{
        return !--__get_cpu_var(disable_count);
}

void perf_disable(void)
{
        __perf_disable();
        hw_perf_disable();
}

void perf_enable(void)
{
        if (__perf_enable())
                hw_perf_enable();
}
static void get_ctx(struct perf_counter_context *ctx)
{
        atomic_inc(&ctx->refcount);
}

static void free_ctx(struct rcu_head *head)
{
        struct perf_counter_context *ctx;

        ctx = container_of(head, struct perf_counter_context, rcu_head);
        kfree(ctx);
}

static void put_ctx(struct perf_counter_context *ctx)
{
        if (atomic_dec_and_test(&ctx->refcount)) {
                if (ctx->parent_ctx)
                        put_ctx(ctx->parent_ctx);
                if (ctx->task)
                        put_task_struct(ctx->task);
                call_rcu(&ctx->rcu_head, free_ctx);
        }
}
/*
 * Add a counter to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
        struct perf_counter *group_leader = counter->group_leader;

        /*
         * Depending on whether it is a standalone or sibling counter,
         * add it straight to the context's counter list, or to the group
         * leader's sibling list:
         */
        if (group_leader == counter)
                list_add_tail(&counter->list_entry, &ctx->counter_list);
        else {
                list_add_tail(&counter->list_entry, &group_leader->sibling_list);
                group_leader->nr_siblings++;
        }

        list_add_rcu(&counter->event_entry, &ctx->event_list);
        ctx->nr_counters++;
}
/*
 * Remove a counter from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
        struct perf_counter *sibling, *tmp;

        if (list_empty(&counter->list_entry))
                return;
        ctx->nr_counters--;

        list_del_init(&counter->list_entry);
        list_del_rcu(&counter->event_entry);

        if (counter->group_leader != counter)
                counter->group_leader->nr_siblings--;

        /*
         * If this was a group counter with sibling counters then
         * upgrade the siblings to singleton counters by adding them
         * to the context list directly:
         */
        list_for_each_entry_safe(sibling, tmp,
                                 &counter->sibling_list, list_entry) {

                list_move_tail(&sibling->list_entry, &ctx->counter_list);
                sibling->group_leader = sibling;
        }
}
static void
counter_sched_out(struct perf_counter *counter,
                  struct perf_cpu_context *cpuctx,
                  struct perf_counter_context *ctx)
{
        if (counter->state != PERF_COUNTER_STATE_ACTIVE)
                return;

        counter->state = PERF_COUNTER_STATE_INACTIVE;
        counter->tstamp_stopped = ctx->time;
        counter->pmu->disable(counter);
        counter->oncpu = -1;

        if (!is_software_counter(counter))
                cpuctx->active_oncpu--;
        ctx->nr_active--;
        if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
                cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_counter *group_counter,
                struct perf_cpu_context *cpuctx,
                struct perf_counter_context *ctx)
{
        struct perf_counter *counter;

        if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
                return;

        counter_sched_out(group_counter, cpuctx, ctx);

        /*
         * Schedule out siblings (if any):
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
                counter_sched_out(counter, cpuctx, ctx);

        if (group_counter->hw_event.exclusive)
                cpuctx->exclusive = 0;
}
/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not it has been
         * scheduled out before the smp call arrived.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);
        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level.
         */
        perf_disable();

        counter_sched_out(counter, cpuctx, ctx);

        list_del_counter(counter, ctx);

        if (!ctx->task) {
                /*
                 * Allow more per task counters with respect to the
                 * reservation:
                 */
                cpuctx->max_pertask =
                        min(perf_max_counters - ctx->nr_counters,
                            perf_max_counters - perf_reserved_percpu);
        }

        perf_enable();
        spin_unlock(&ctx->lock);
}
/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 *
 * If counter->ctx is a cloned context, callers must make sure that
 * every task struct that counter->ctx->task could possibly point to
 * remains valid. This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_counter_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Per cpu counters are removed via an smp call and
                 * the removal is always successful.
                 */
                smp_call_function_single(counter->cpu,
                                         __perf_counter_remove_from_context,
                                         counter, 1);
                return;
        }

retry:
        task_oncpu_function_call(task, __perf_counter_remove_from_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the context is active we need to retry the smp call.
         */
        if (ctx->nr_active && !list_empty(&counter->list_entry)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents that this context is scheduled in so we
         * can remove the counter safely, if the call above did not
         * succeed.
         */
        if (!list_empty(&counter->list_entry)) {
                list_del_counter(counter, ctx);
        }
        spin_unlock_irq(&ctx->lock);
}
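
/*
 * Time source for the tstamp_* and total_time_* bookkeeping below:
 * the per-cpu sched_clock()-based clock, read via cpu_clock().
 */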
static inline u64 perf_clock(void)
{
        return cpu_clock(smp_processor_id());
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_counter_context *ctx)
{
        u64 now = perf_clock();

        ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for a counter.
 */
static void update_counter_times(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        u64 run_end;

        if (counter->state < PERF_COUNTER_STATE_INACTIVE)
                return;

        counter->total_time_enabled = ctx->time - counter->tstamp_enabled;

        if (counter->state == PERF_COUNTER_STATE_INACTIVE)
                run_end = counter->tstamp_stopped;
        else
                run_end = ctx->time;

        counter->total_time_running = run_end - counter->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all counters in a group.
 */
static void update_group_times(struct perf_counter *leader)
{
        struct perf_counter *counter;

        update_counter_times(leader);
        list_for_each_entry(counter, &leader->sibling_list, list_entry)
                update_counter_times(counter);
}
/*
 * Cross CPU call to disable a performance counter
 */
static void __perf_counter_disable(void *info)
{
        struct perf_counter *counter = info;
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = counter->ctx;

        /*
         * If this is a per-task counter, need to check whether this
         * counter's task is the current task on this cpu.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        spin_lock(&ctx->lock);

        /*
         * If the counter is on, turn it off.
         * If it is in error state, leave it in error state.
         */
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
                update_context_time(ctx);
                update_counter_times(counter);
                if (counter == counter->group_leader)
                        group_sched_out(counter, cpuctx, ctx);
                else
                        counter_sched_out(counter, cpuctx, ctx);
                counter->state = PERF_COUNTER_STATE_OFF;
        }

        spin_unlock(&ctx->lock);
}
/*
 * If counter->ctx is a cloned context, callers must make sure that
 * every task struct that counter->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_counter_for_each_child or perf_counter_for_each because they
 * hold the top-level counter's child_mutex, so any descendant that
 * goes to exit will block in sync_child_counter.
 * When called from perf_pending_counter it's OK because counter->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_counter_task_sched_out for this context.
 */
static void perf_counter_disable(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Disable the counter on the cpu that it's on
                 */
                smp_call_function_single(counter->cpu, __perf_counter_disable,
                                         counter, 1);
                return;
        }

retry:
        task_oncpu_function_call(task, __perf_counter_disable, counter);

        spin_lock_irq(&ctx->lock);
        /*
         * If the counter is still active, we need to retry the cross-call.
         */
        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
        if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
                update_counter_times(counter);
                counter->state = PERF_COUNTER_STATE_OFF;
        }

        spin_unlock_irq(&ctx->lock);
}
static int
counter_sched_in(struct perf_counter *counter,
                 struct perf_cpu_context *cpuctx,
                 struct perf_counter_context *ctx,
                 int cpu)
{
        if (counter->state <= PERF_COUNTER_STATE_OFF)
                return 0;

        counter->state = PERF_COUNTER_STATE_ACTIVE;
        counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
        /*
         * The new state must be visible before we turn it on in the hardware:
         */
        smp_wmb();

        if (counter->pmu->enable(counter)) {
                counter->state = PERF_COUNTER_STATE_INACTIVE;
                counter->oncpu = -1;
                return -EAGAIN;
        }

        counter->tstamp_running += ctx->time - counter->tstamp_stopped;

        if (!is_software_counter(counter))
                cpuctx->active_oncpu++;
        ctx->nr_active++;

        if (counter->hw_event.exclusive)
                cpuctx->exclusive = 1;

        return 0;
}

static int
group_sched_in(struct perf_counter *group_counter,
               struct perf_cpu_context *cpuctx,
               struct perf_counter_context *ctx,
               int cpu)
{
        struct perf_counter *counter, *partial_group;
        int ret;

        if (group_counter->state == PERF_COUNTER_STATE_OFF)
                return 0;

        ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
        if (ret)
                return ret < 0 ? ret : 0;

        group_counter->prev_state = group_counter->state;
        if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
                return -EAGAIN;

        /*
         * Schedule in siblings as one group (if any):
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
                counter->prev_state = counter->state;
                if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
                        partial_group = counter;
                        goto group_error;
                }
        }

        return 0;

group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
                if (counter == partial_group)
                        break;
                counter_sched_out(counter, cpuctx, ctx);
        }
        counter_sched_out(group_counter, cpuctx, ctx);

        return -EAGAIN;
}
/*
 * Return 1 for a group consisting entirely of software counters,
 * 0 if the group contains any hardware counters.
 */
static int is_software_only_group(struct perf_counter *leader)
{
        struct perf_counter *counter;

        if (!is_software_counter(leader))
                return 0;

        list_for_each_entry(counter, &leader->sibling_list, list_entry)
                if (!is_software_counter(counter))
                        return 0;

        return 1;
}

/*
 * Work out whether we can put this counter group on the CPU now.
 */
static int group_can_go_on(struct perf_counter *counter,
                           struct perf_cpu_context *cpuctx,
                           int can_add_hw)
{
        /*
         * Groups consisting entirely of software counters can always go on.
         */
        if (is_software_only_group(counter))
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
         * counters can go on.
         */
        if (cpuctx->exclusive)
                return 0;
        /*
         * If this group is exclusive and there are already
         * counters on the CPU, it can't go on.
         */
        if (counter->hw_event.exclusive && cpuctx->active_oncpu)
                return 0;
        /*
         * Otherwise, try to add it if all previous groups were able
         * to go on.
         */
        return can_add_hw;
}

static void add_counter_to_ctx(struct perf_counter *counter,
                               struct perf_counter_context *ctx)
{
        list_add_counter(counter, ctx);
        counter->prev_state = PERF_COUNTER_STATE_OFF;
        counter->tstamp_enabled = ctx->time;
        counter->tstamp_running = ctx->time;
        counter->tstamp_stopped = ctx->time;
}
/*
 * Cross CPU call to install and enable a performance counter
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *leader = counter->group_leader;
        int cpu = smp_processor_id();
        int err;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu. If not it has been
         * scheduled out before the smp call arrived.
         * Or possibly this is the right context but it isn't
         * on this cpu because it had no counters.
         */
        if (ctx->task && cpuctx->task_ctx != ctx) {
                if (cpuctx->task_ctx || ctx->task != current)
                        return;
                cpuctx->task_ctx = ctx;
        }

        spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);

        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non NMI based counters.
         */
        perf_disable();

        add_counter_to_ctx(counter, ctx);

        /*
         * Don't put the counter on if it is disabled or if
         * it is in a group and the group isn't on.
         */
        if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
            (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
                goto unlock;

        /*
         * An exclusive counter can't go on if there are already active
         * hardware counters, and no hardware counter can go on if there
         * is already an exclusive counter on.
         */
        if (!group_can_go_on(counter, cpuctx, 1))
                err = -EEXIST;
        else
                err = counter_sched_in(counter, cpuctx, ctx, cpu);

        if (err) {
                /*
                 * This counter couldn't go on. If it is in a group
                 * then we have to pull the whole group off.
                 * If the counter group is pinned then put it in error state.
                 */
                if (leader != counter)
                        group_sched_out(leader, cpuctx, ctx);
                if (leader->hw_event.pinned) {
                        update_group_times(leader);
                        leader->state = PERF_COUNTER_STATE_ERROR;
                }
        }

        if (!err && !ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;

unlock:
        perf_enable();

        spin_unlock(&ctx->lock);
}
/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
                        struct perf_counter *counter,
                        int cpu)
{
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Per cpu counters are installed via an smp call and
                 * the install is always successful.
                 */
                smp_call_function_single(cpu, __perf_install_in_context,
                                         counter, 1);
                return;
        }

retry:
        task_oncpu_function_call(task, __perf_install_in_context,
                                 counter);

        spin_lock_irq(&ctx->lock);
        /*
         * we need to retry the smp call.
         */
        if (ctx->is_active && list_empty(&counter->list_entry)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }

        /*
         * The lock prevents that this context is scheduled in so we
         * can add the counter safely, if the call above did not
         * succeed.
         */
        if (list_empty(&counter->list_entry))
                add_counter_to_ctx(counter, ctx);
        spin_unlock_irq(&ctx->lock);
}
/*
 * Cross CPU call to enable a performance counter
 */
static void __perf_counter_enable(void *info)
{
        struct perf_counter *counter = info;
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *leader = counter->group_leader;
        int err;

        /*
         * If this is a per-task counter, need to check whether this
         * counter's task is the current task on this cpu.
         */
        if (ctx->task && cpuctx->task_ctx != ctx) {
                if (cpuctx->task_ctx || ctx->task != current)
                        return;
                cpuctx->task_ctx = ctx;
        }

        spin_lock(&ctx->lock);
        ctx->is_active = 1;
        update_context_time(ctx);

        counter->prev_state = counter->state;
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                goto unlock;
        counter->state = PERF_COUNTER_STATE_INACTIVE;
        counter->tstamp_enabled = ctx->time - counter->total_time_enabled;

        /*
         * If the counter is in a group and isn't the group leader,
         * then don't put it on unless the group is on.
         */
        if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
                goto unlock;

        if (!group_can_go_on(counter, cpuctx, 1)) {
                err = -EEXIST;
        } else {
                perf_disable();
                if (counter == leader)
                        err = group_sched_in(counter, cpuctx, ctx,
                                             smp_processor_id());
                else
                        err = counter_sched_in(counter, cpuctx, ctx,
                                               smp_processor_id());
                perf_enable();
        }

        if (err) {
                /*
                 * If this counter can't go on and it's part of a
                 * group, then the whole group has to come off.
                 */
                if (leader != counter)
                        group_sched_out(leader, cpuctx, ctx);
                if (leader->hw_event.pinned) {
                        update_group_times(leader);
                        leader->state = PERF_COUNTER_STATE_ERROR;
                }
        }

unlock:
        spin_unlock(&ctx->lock);
}
/*
 * If counter->ctx is a cloned context, callers must make sure that
 * every task struct that counter->ctx->task could possibly point to
 * remains valid. This condition is satisfied when called through
 * perf_counter_for_each_child or perf_counter_for_each as described
 * for perf_counter_disable.
 */
static void perf_counter_enable(struct perf_counter *counter)
{
        struct perf_counter_context *ctx = counter->ctx;
        struct task_struct *task = ctx->task;

        if (!task) {
                /*
                 * Enable the counter on the cpu that it's on
                 */
                smp_call_function_single(counter->cpu, __perf_counter_enable,
                                         counter, 1);
                return;
        }

        spin_lock_irq(&ctx->lock);
        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
                goto out;

        /*
         * If the counter is in error state, clear that first.
         * That way, if we see the counter in error state below, we
         * know that it has gone back into error state, as distinct
         * from the task having been scheduled away before the
         * cross-call arrived.
         */
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                counter->state = PERF_COUNTER_STATE_OFF;

retry:
        spin_unlock_irq(&ctx->lock);
        task_oncpu_function_call(task, __perf_counter_enable, counter);

        spin_lock_irq(&ctx->lock);

        /*
         * If the context is active and the counter is still off,
         * we need to retry the cross-call.
         */
        if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
                goto retry;

        /*
         * Since we have the lock this context can't be scheduled
         * in, so we can change the state safely.
         */
        if (counter->state == PERF_COUNTER_STATE_OFF) {
                counter->state = PERF_COUNTER_STATE_INACTIVE;
                counter->tstamp_enabled =
                        ctx->time - counter->total_time_enabled;
        }
out:
        spin_unlock_irq(&ctx->lock);
}
static int perf_counter_refresh(struct perf_counter *counter, int refresh)
{
        /*
         * not supported on inherited counters
         */
        if (counter->hw_event.inherit)
                return -EINVAL;

        atomic_add(refresh, &counter->event_limit);
        perf_counter_enable(counter);

        return 0;
}
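
/*
 * Deschedule all active counters of a context from the given CPU context.
 */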
void __perf_counter_sched_out(struct perf_counter_context *ctx,
                              struct perf_cpu_context *cpuctx)
{
        struct perf_counter *counter;

        spin_lock(&ctx->lock);
        ctx->is_active = 0;
        if (likely(!ctx->nr_counters))
                goto out;
        update_context_time(ctx);

        perf_disable();
        if (ctx->nr_active) {
                list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                        if (counter != counter->group_leader)
                                counter_sched_out(counter, cpuctx, ctx);
                        else
                                group_sched_out(counter, cpuctx, ctx);
                }
        }
        perf_enable();
out:
        spin_unlock(&ctx->lock);
}

/*
 * Test whether two contexts are equivalent, i.e. whether they
 * have both been cloned from the same version of the same context
 * and they both have the same number of enabled counters.
 * If the number of enabled counters is the same, then the set
 * of enabled counters should be the same, because these are both
 * inherited contexts, therefore we can't access individual counters
 * in them directly with an fd; we can only enable/disable all
 * counters via prctl, or enable/disable all counters in a family
 * via ioctl, which will have the same effect on both contexts.
 */
static int context_equiv(struct perf_counter_context *ctx1,
                         struct perf_counter_context *ctx2)
{
        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
                && ctx1->parent_gen == ctx2->parent_gen
                && ctx1->parent_gen != ~0ull;
}
/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task,
                                 struct task_struct *next, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = task->perf_counter_ctxp;
        struct perf_counter_context *next_ctx;
        struct perf_counter_context *parent;
        struct pt_regs *regs;
        int do_switch = 1;

        regs = task_pt_regs(task);
        perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);

        if (likely(!ctx || !cpuctx->task_ctx))
                return;

        update_context_time(ctx);

        rcu_read_lock();
        parent = rcu_dereference(ctx->parent_ctx);
        next_ctx = next->perf_counter_ctxp;
        if (parent && next_ctx &&
            rcu_dereference(next_ctx->parent_ctx) == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch. We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime). It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                spin_lock(&ctx->lock);
                spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {
                        /*
                         * XXX do we need a memory barrier of sorts
                         * wrt to rcu_dereference() of perf_counter_ctxp
                         */
                        task->perf_counter_ctxp = next_ctx;
                        next->perf_counter_ctxp = ctx;
                        ctx->task = next;
                        next_ctx->task = task;
                        do_switch = 0;
                }
                spin_unlock(&next_ctx->lock);
                spin_unlock(&ctx->lock);
        }
        rcu_read_unlock();

        if (do_switch) {
                __perf_counter_sched_out(ctx, cpuctx);
                cpuctx->task_ctx = NULL;
        }
}
/*
 * Called with IRQs disabled
 */
static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);

        if (!cpuctx->task_ctx)
                return;

        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;

        __perf_counter_sched_out(ctx, cpuctx);
        cpuctx->task_ctx = NULL;
}

/*
 * Called with IRQs disabled
 */
static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
        __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
}
static void
__perf_counter_sched_in(struct perf_counter_context *ctx,
                        struct perf_cpu_context *cpuctx, int cpu)
{
        struct perf_counter *counter;
        int can_add_hw = 1;

        spin_lock(&ctx->lock);
        ctx->is_active = 1;
        if (likely(!ctx->nr_counters))
                goto out;

        ctx->timestamp = perf_clock();

        perf_disable();

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                if (counter->state <= PERF_COUNTER_STATE_OFF ||
                    !counter->hw_event.pinned)
                        continue;
                if (counter->cpu != -1 && counter->cpu != cpu)
                        continue;

                if (counter != counter->group_leader)
                        counter_sched_in(counter, cpuctx, ctx, cpu);
                else {
                        if (group_can_go_on(counter, cpuctx, 1))
                                group_sched_in(counter, cpuctx, ctx, cpu);
                }

                /*
                 * If this pinned group hasn't been scheduled,
                 * put it in error state.
                 */
                if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
                        update_group_times(counter);
                        counter->state = PERF_COUNTER_STATE_ERROR;
                }
        }

        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                /*
                 * Ignore counters in OFF or ERROR state, and
                 * ignore pinned counters since we did them already.
                 */
                if (counter->state <= PERF_COUNTER_STATE_OFF ||
                    counter->hw_event.pinned)
                        continue;

                /*
                 * Listen to the 'cpu' scheduling filter constraint
                 * of counters:
                 */
                if (counter->cpu != -1 && counter->cpu != cpu)
                        continue;

                if (counter != counter->group_leader) {
                        if (counter_sched_in(counter, cpuctx, ctx, cpu))
                                can_add_hw = 0;
                } else {
                        if (group_can_go_on(counter, cpuctx, can_add_hw)) {
                                if (group_sched_in(counter, cpuctx, ctx, cpu))
                                        can_add_hw = 0;
                        }
                }
        }
        perf_enable();
out:
        spin_unlock(&ctx->lock);
}
/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = task->perf_counter_ctxp;

        if (likely(!ctx))
                return;
        if (cpuctx->task_ctx == ctx)
                return;
        __perf_counter_sched_in(ctx, cpuctx, cpu);
        cpuctx->task_ctx = ctx;
}

static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
{
        struct perf_counter_context *ctx = &cpuctx->ctx;

        __perf_counter_sched_in(ctx, cpuctx, cpu);
}
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_counter *counter, int enable);
static void perf_log_period(struct perf_counter *counter, u64 period);
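
/*
 * Adjust the sampling period (irq_period) of frequency-based counters so
 * that they approximate hw_event.irq_freq interrupts per second, and
 * unthrottle counters that hit the NMI rate limit during the last tick.
 */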
static void perf_adjust_freq(struct perf_counter_context *ctx)
{
        struct perf_counter *counter;
        u64 interrupts, irq_period;
        u64 events, period;
        s64 delta;

        spin_lock(&ctx->lock);
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                if (counter->state != PERF_COUNTER_STATE_ACTIVE)
                        continue;

                interrupts = counter->hw.interrupts;
                counter->hw.interrupts = 0;

                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(counter, 1);
                        counter->pmu->unthrottle(counter);
                        interrupts = 2*sysctl_perf_counter_limit/HZ;
                }

                if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
                        continue;

                events = HZ * interrupts * counter->hw.irq_period;
                period = div64_u64(events, counter->hw_event.irq_freq);

                delta = (s64)(1 + period - counter->hw.irq_period);
                delta >>= 1;

                irq_period = counter->hw.irq_period + delta;

                if (!irq_period)
                        irq_period = 1;

                perf_log_period(counter, irq_period);

                counter->hw.irq_period = irq_period;
        }
        spin_unlock(&ctx->lock);
}
/*
 * Round-robin a context's counters:
 */
static void rotate_ctx(struct perf_counter_context *ctx)
{
        struct perf_counter *counter;

        if (!ctx->nr_counters)
                return;

        spin_lock(&ctx->lock);
        /*
         * Rotate the first entry last (works just fine for group counters too):
         */
        perf_disable();
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
                list_move_tail(&counter->list_entry, &ctx->counter_list);
                break;
        }
        perf_enable();

        spin_unlock(&ctx->lock);
}

void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;

        if (!atomic_read(&nr_counters))
                return;

        cpuctx = &per_cpu(perf_cpu_context, cpu);
        ctx = curr->perf_counter_ctxp;

        perf_adjust_freq(&cpuctx->ctx);
        if (ctx)
                perf_adjust_freq(ctx);

        perf_counter_cpu_sched_out(cpuctx);
        if (ctx)
                __perf_counter_task_sched_out(ctx);

        rotate_ctx(&cpuctx->ctx);
        if (ctx)
                rotate_ctx(ctx);

        perf_counter_cpu_sched_in(cpuctx, cpu);
        if (ctx)
                perf_counter_task_sched_in(curr, cpu);
}
/*
 * Cross CPU call to read the hardware counter
 */
static void __read(void *info)
{
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
        unsigned long flags;

        local_irq_save(flags);
        if (ctx->is_active)
                update_context_time(ctx);
        counter->pmu->read(counter);
        update_counter_times(counter);
        local_irq_restore(flags);
}

static u64 perf_counter_read(struct perf_counter *counter)
{
        /*
         * If counter is enabled and currently active on a CPU, update the
         * value in the counter structure:
         */
        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
                smp_call_function_single(counter->oncpu,
                                         __read, counter, 1);
        } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
                update_counter_times(counter);
        }

        return atomic64_read(&counter->count);
}
/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
                            struct task_struct *task)
{
        memset(ctx, 0, sizeof(*ctx));
        spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->counter_list);
        INIT_LIST_HEAD(&ctx->event_list);
        atomic_set(&ctx->refcount, 1);
        ctx->task = task;
}
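
/*
 * Find (and reference) the counter context for a pid/cpu pair: cpu != -1
 * selects the per-cpu context, otherwise the task's context is looked up
 * and allocated on first use. Returns an ERR_PTR() on failure.
 */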
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
        struct perf_counter_context *parent_ctx;
        struct task_struct *task;
        int err;

        /*
         * If cpu is not a wildcard then this is a percpu counter:
         */
        if (cpu != -1) {
                /* Must be root to operate on a CPU counter: */
                if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);

                if (cpu < 0 || cpu > num_possible_cpus())
                        return ERR_PTR(-EINVAL);

                /*
                 * We could be clever and allow to attach a counter to an
                 * offline CPU and activate it when the CPU comes up, but
                 * that's for later.
                 */
                if (!cpu_isset(cpu, cpu_online_map))
                        return ERR_PTR(-ENODEV);

                cpuctx = &per_cpu(perf_cpu_context, cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);

                return ctx;
        }

        rcu_read_lock();
        if (!pid)
                task = current;
        else
                task = find_task_by_vpid(pid);
        if (task)
                get_task_struct(task);
        rcu_read_unlock();

        if (!task)
                return ERR_PTR(-ESRCH);

        /*
         * Can't attach counters to a dying task.
         */
        err = -ESRCH;
        if (task->flags & PF_EXITING)
                goto errout;

        /* Reuse ptrace permission checks for now. */
        err = -EACCES;
        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto errout;

retry:
        ctx = rcu_dereference(task->perf_counter_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_counter_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed. Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so. If we locked the right context, then it
                 * can't get swapped on us any more and we can
                 * unclone it if necessary.
                 * Once it's not a clone things will be stable.
                 */
                spin_lock_irq(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
                        spin_unlock_irq(&ctx->lock);
                        goto retry;
                }
                parent_ctx = ctx->parent_ctx;
                if (parent_ctx) {
                        put_ctx(parent_ctx);
                        ctx->parent_ctx = NULL;		/* no longer a clone */
                }
                /*
                 * Get an extra reference before dropping the lock so that
                 * this context won't get freed if the task exits.
                 */
                get_ctx(ctx);
                spin_unlock_irq(&ctx->lock);
        }

        if (!ctx) {
                ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;
                __perf_counter_init_context(ctx, task);
                get_ctx(ctx);
                if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
                        /*
                         * We raced with some other task; use
                         * the context they set.
                         */
                        kfree(ctx);
                        goto retry;
                }
                get_task_struct(task);
        }

        put_task_struct(task);
        return ctx;

errout:
        put_task_struct(task);
        return ERR_PTR(err);
}
static void free_counter_rcu(struct rcu_head *head)
{
        struct perf_counter *counter;

        counter = container_of(head, struct perf_counter, rcu_head);
        kfree(counter);
}

static void perf_pending_sync(struct perf_counter *counter);

static void free_counter(struct perf_counter *counter)
{
        perf_pending_sync(counter);

        atomic_dec(&nr_counters);
        if (counter->hw_event.mmap)
                atomic_dec(&nr_mmap_tracking);
        if (counter->hw_event.munmap)
                atomic_dec(&nr_munmap_tracking);
        if (counter->hw_event.comm)
                atomic_dec(&nr_comm_tracking);

        if (counter->destroy)
                counter->destroy(counter);

        put_ctx(counter->ctx);
        call_rcu(&counter->rcu_head, free_counter_rcu);
}
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_counter *counter = file->private_data;
        struct perf_counter_context *ctx = counter->ctx;

        file->private_data = NULL;

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        perf_counter_remove_from_context(counter);
        mutex_unlock(&ctx->mutex);

        mutex_lock(&counter->owner->perf_counter_mutex);
        list_del_init(&counter->owner_entry);
        mutex_unlock(&counter->owner->perf_counter_mutex);
        put_task_struct(counter->owner);

        free_counter(counter);

        return 0;
}
/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
        u64 values[3];
        int n;

        /*
         * Return end-of-file for a read on a counter that is in
         * error state (i.e. because it was pinned but it couldn't be
         * scheduled on to the CPU at some point).
         */
        if (counter->state == PERF_COUNTER_STATE_ERROR)
                return 0;

        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        values[0] = perf_counter_read(counter);
        n = 1;
        if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = counter->total_time_enabled +
                        atomic64_read(&counter->child_total_time_enabled);
        if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = counter->total_time_running +
                        atomic64_read(&counter->child_total_time_running);
        mutex_unlock(&counter->child_mutex);

        if (count < n * sizeof(u64))
                return -EINVAL;
        count = n * sizeof(u64);

        if (copy_to_user(buf, values, count))
                return -EFAULT;

        return count;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_counter *counter = file->private_data;

        return perf_read_hw(counter, buf, count);
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
        struct perf_counter *counter = file->private_data;
        struct perf_mmap_data *data;
        unsigned int events = POLL_HUP;

        rcu_read_lock();
        data = rcu_dereference(counter->data);
        if (data)
                events = atomic_xchg(&data->poll, 0);
        rcu_read_unlock();

        poll_wait(file, &counter->waitq, wait);

        return events;
}

static void perf_counter_reset(struct perf_counter *counter)
{
        (void)perf_counter_read(counter);
        atomic64_set(&counter->count, 0);
        perf_counter_update_userpage(counter);
}
static void perf_counter_for_each_sibling(struct perf_counter *counter,
                                          void (*func)(struct perf_counter *))
{
        struct perf_counter_context *ctx = counter->ctx;
        struct perf_counter *sibling;

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        counter = counter->group_leader;

        func(counter);
        list_for_each_entry(sibling, &counter->sibling_list, list_entry)
                func(sibling);
        mutex_unlock(&ctx->mutex);
}

/*
 * Holding the top-level counter's child_mutex means that any
 * descendant process that has inherited this counter will block
 * in sync_child_counter if it goes to exit, thus satisfying the
 * task existence requirements of perf_counter_enable/disable.
 */
static void perf_counter_for_each_child(struct perf_counter *counter,
                                        void (*func)(struct perf_counter *))
{
        struct perf_counter *child;

        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        func(counter);
        list_for_each_entry(child, &counter->child_list, child_list)
                func(child);
        mutex_unlock(&counter->child_mutex);
}

static void perf_counter_for_each(struct perf_counter *counter,
                                  void (*func)(struct perf_counter *))
{
        struct perf_counter *child;

        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->child_mutex);
        perf_counter_for_each_sibling(counter, func);
        list_for_each_entry(child, &counter->child_list, child_list)
                perf_counter_for_each_sibling(child, func);
        mutex_unlock(&counter->child_mutex);
}
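
/*
 * ioctl()s operate on one counter, or with PERF_IOC_FLAG_GROUP on its
 * whole group; enable/disable/reset are also propagated to all
 * inherited child counters.
 */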
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_counter *counter = file->private_data;
        void (*func)(struct perf_counter *);
        u32 flags = arg;

        switch (cmd) {
        case PERF_COUNTER_IOC_ENABLE:
                func = perf_counter_enable;
                break;
        case PERF_COUNTER_IOC_DISABLE:
                func = perf_counter_disable;
                break;
        case PERF_COUNTER_IOC_RESET:
                func = perf_counter_reset;
                break;

        case PERF_COUNTER_IOC_REFRESH:
                return perf_counter_refresh(counter, arg);
        default:
                return -ENOTTY;
        }

        if (flags & PERF_IOC_FLAG_GROUP)
                perf_counter_for_each(counter, func);
        else
                perf_counter_for_each_child(counter, func);

        return 0;
}
int perf_counter_task_enable(void)
{
        struct perf_counter *counter;

        mutex_lock(&current->perf_counter_mutex);
        list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
                perf_counter_for_each_child(counter, perf_counter_enable);
        mutex_unlock(&current->perf_counter_mutex);

        return 0;
}

int perf_counter_task_disable(void)
{
        struct perf_counter *counter;

        mutex_lock(&current->perf_counter_mutex);
        list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
                perf_counter_for_each_child(counter, perf_counter_disable);
        mutex_unlock(&current->perf_counter_mutex);

        return 0;
}
/*
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_counter_update_userpage(struct perf_counter *counter)
{
        struct perf_mmap_data *data;
        struct perf_counter_mmap_page *userpg;

        rcu_read_lock();
        data = rcu_dereference(counter->data);
        if (!data)
                goto unlock;

        userpg = data->user_page;

        /*
         * Disable preemption so as to not let the corresponding user-space
         * spin too long if we get preempted.
         */
        preempt_disable();
        ++userpg->lock;
        barrier();
        userpg->index = counter->hw.idx;
        userpg->offset = atomic64_read(&counter->count);
        if (counter->state == PERF_COUNTER_STATE_ACTIVE)
                userpg->offset -= atomic64_read(&counter->hw.prev_count);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct perf_counter *counter = vma->vm_file->private_data;
        struct perf_mmap_data *data;
        int ret = VM_FAULT_SIGBUS;

        rcu_read_lock();
        data = rcu_dereference(counter->data);
        if (!data)
                goto unlock;

        if (vmf->pgoff == 0) {
                vmf->page = virt_to_page(data->user_page);
        } else {
                int nr = vmf->pgoff - 1;

                if ((unsigned)nr > data->nr_pages)
                        goto unlock;

                vmf->page = virt_to_page(data->data_pages[nr]);
        }
        get_page(vmf->page);
        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}
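
/*
 * Allocate the mmap() buffer backing store: one zeroed control page
 * (the perf_counter_mmap_page) plus nr_pages zeroed data pages.
 */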
static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
{
        struct perf_mmap_data *data;
        unsigned long size;
        int i;

        WARN_ON(atomic_read(&counter->mmap_count));

        size = sizeof(struct perf_mmap_data);
        size += nr_pages * sizeof(void *);

        data = kzalloc(size, GFP_KERNEL);
        if (!data)
                goto fail;

        data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
        if (!data->user_page)
                goto fail_user_page;

        for (i = 0; i < nr_pages; i++) {
                data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
                if (!data->data_pages[i])
                        goto fail_data_pages;
        }

        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);

        rcu_assign_pointer(counter->data, data);

        return 0;

fail_data_pages:
        for (i--; i >= 0; i--)
                free_page((unsigned long)data->data_pages[i]);

fail_user_page:
        free_page((unsigned long)data->user_page);

fail:
        kfree(data);

        return -ENOMEM;
}

static void __perf_mmap_data_free(struct rcu_head *rcu_head)
{
        struct perf_mmap_data *data = container_of(rcu_head,
                        struct perf_mmap_data, rcu_head);
        int i;

        free_page((unsigned long)data->user_page);
        for (i = 0; i < data->nr_pages; i++)
                free_page((unsigned long)data->data_pages[i]);
        kfree(data);
}

static void perf_mmap_data_free(struct perf_counter *counter)
{
        struct perf_mmap_data *data = counter->data;

        WARN_ON(atomic_read(&counter->mmap_count));

        rcu_assign_pointer(counter->data, NULL);
        call_rcu(&data->rcu_head, __perf_mmap_data_free);
}
static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_counter *counter = vma->vm_file->private_data;

        atomic_inc(&counter->mmap_count);
}

static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_counter *counter = vma->vm_file->private_data;

        WARN_ON_ONCE(counter->ctx->parent_ctx);
        if (atomic_dec_and_mutex_lock(&counter->mmap_count,
                                      &counter->mmap_mutex)) {
                struct user_struct *user = current_user();

                atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
                vma->vm_mm->locked_vm -= counter->data->nr_locked;
                perf_mmap_data_free(counter);
                mutex_unlock(&counter->mmap_mutex);
        }
}

static struct vm_operations_struct perf_mmap_vmops = {
        .open  = perf_mmap_open,
        .close = perf_mmap_close,
        .fault = perf_mmap_fault,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_counter *counter = file->private_data;
        struct user_struct *user = current_user();
        unsigned long vma_size;
        unsigned long nr_pages;
        unsigned long user_locked, user_lock_limit;
        unsigned long locked, lock_limit;
        long user_extra, extra;
        int ret = 0;

        if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
                return -EINVAL;

        vma_size = vma->vm_end - vma->vm_start;
        nr_pages = (vma_size / PAGE_SIZE) - 1;

        /*
         * If we have data pages ensure they're a power-of-two number, so we
         * can do bitmasks instead of modulo.
         */
        if (nr_pages != 0 && !is_power_of_2(nr_pages))
                return -EINVAL;

        if (vma_size != PAGE_SIZE * (1 + nr_pages))
                return -EINVAL;

        if (vma->vm_pgoff != 0)
                return -EINVAL;

        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
                        ret = -EINVAL;
                goto unlock;
        }

        user_extra = nr_pages + 1;
        user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm) + user_extra;

        extra = 0;
        if (user_locked > user_lock_limit)
                extra = user_locked - user_lock_limit;

        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;

        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(counter->data);
        ret = perf_mmap_data_alloc(counter, nr_pages);
        if (ret)
                goto unlock;

        atomic_set(&counter->mmap_count, 1);
        atomic_long_add(user_extra, &user->locked_vm);
        vma->vm_mm->locked_vm += extra;
        counter->data->nr_locked = extra;
unlock:
        mutex_unlock(&counter->mmap_mutex);

        vma->vm_flags &= ~VM_MAYWRITE;
        vma->vm_flags |= VM_RESERVED;
        vma->vm_ops = &perf_mmap_vmops;

        return ret;
}
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_counter *counter = filp->private_data;
        struct inode *inode = filp->f_path.dentry->d_inode;
        int retval;

        mutex_lock(&inode->i_mutex);
        retval = fasync_helper(fd, filp, on, &counter->fasync);
        mutex_unlock(&inode->i_mutex);

        if (retval < 0)
                return retval;

        return 0;
}

static const struct file_operations perf_fops = {
        .release		= perf_release,
        .read			= perf_read,
        .poll			= perf_poll,
        .unlocked_ioctl		= perf_ioctl,
        .compat_ioctl		= perf_ioctl,
        .mmap			= perf_mmap,
        .fasync			= perf_fasync,
};
/*
 * Perf counter wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */
void perf_counter_wakeup(struct perf_counter *counter)
{
        wake_up_all(&counter->waitq);

        if (counter->pending_kill) {
                kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
                counter->pending_kill = 0;
        }
}
/*
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
 *
 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 * single linked list and use cmpxchg() to add entries lockless.
 */

static void perf_pending_counter(struct perf_pending_entry *entry)
{
        struct perf_counter *counter = container_of(entry,
                        struct perf_counter, pending);

        if (counter->pending_disable) {
                counter->pending_disable = 0;
                perf_counter_disable(counter);
        }

        if (counter->pending_wakeup) {
                counter->pending_wakeup = 0;
                perf_counter_wakeup(counter);
        }
}
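
/*
 * An entry is queued iff its ->next pointer is non-NULL; PENDING_TAIL
 * terminates the per-cpu single-linked list.
 */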
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)

static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
        PENDING_TAIL,
};

static void perf_pending_queue(struct perf_pending_entry *entry,
                               void (*func)(struct perf_pending_entry *))
{
        struct perf_pending_entry **head;

        if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
                return;

        entry->func = func;

        head = &get_cpu_var(perf_pending_head);

        do {
                entry->next = *head;
        } while (cmpxchg(head, entry->next, entry) != entry->next);

        set_perf_counter_pending();

        put_cpu_var(perf_pending_head);
}
static int __perf_pending_run(void)
{
        struct perf_pending_entry *list;
        int nr = 0;

        list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
        while (list != PENDING_TAIL) {
                void (*func)(struct perf_pending_entry *);
                struct perf_pending_entry *entry = list;

                list = list->next;

                func = entry->func;
                entry->next = NULL;
                /*
                 * Ensure we observe the unqueue before we issue the wakeup,
                 * so that we won't be waiting forever.
                 * -- see perf_not_pending().
                 */
                smp_wmb();

                func(entry);
                nr++;
        }

        return nr;
}

static inline int perf_not_pending(struct perf_counter *counter)
{
        /*
         * If we flush on whatever cpu we run, there is a chance we don't
         * need to wait.
         */
        get_cpu();
        __perf_pending_run();
        put_cpu();

        /*
         * Ensure we see the proper queue state before going to sleep
         * so that we do not miss the wakeup. -- see perf_pending_handle()
         */
        smp_rmb();
        return counter->pending.next == NULL;
}

static void perf_pending_sync(struct perf_counter *counter)
{
        wait_event(counter->waitq, perf_not_pending(counter));
}

void perf_counter_do_pending(void)
{
        __perf_pending_run();
}
/*
 * Callchain support -- arch specific
 */

__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
        return NULL;
}

/*
 * Output
 */

struct perf_output_handle {
        struct perf_counter	*counter;
        struct perf_mmap_data	*data;
        unsigned int		offset;
        unsigned int		head;
        int			nmi;
        int			overflow;
        int			locked;
        unsigned long		flags;
};
static void perf_output_wakeup(struct perf_output_handle *handle)
{
        atomic_set(&handle->data->poll, POLL_IN);

        if (handle->nmi) {
                handle->counter->pending_wakeup = 1;
                perf_pending_queue(&handle->counter->pending,
                                   perf_pending_counter);
        } else
                perf_counter_wakeup(handle->counter);
}

/*
 * Curious locking construct.
 *
 * We need to ensure a later event doesn't publish a head when a former
 * event isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
static void perf_output_lock(struct perf_output_handle *handle)
{
        struct perf_mmap_data *data = handle->data;
        int cpu;

        handle->locked = 0;

        local_irq_save(handle->flags);
        cpu = smp_processor_id();

        if (in_nmi() && atomic_read(&data->lock) == cpu)
                return;

        while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                cpu_relax();

        handle->locked = 1;
}
static void perf_output_unlock(struct perf_output_handle *handle)
{
        struct perf_mmap_data *data = handle->data;
        unsigned int head;
        int cpu;

        data->done_head = data->head;

        if (!handle->locked)
                goto out;

again:
        /*
         * The xchg implies a full barrier that ensures all writes are done
         * before we publish the new head, matched by a rmb() in userspace when
         * reading this position.
         */
        while ((head = atomic_xchg(&data->done_head, 0)))
                data->user_page->data_head = head;

        /*
         * NMI can happen here, which means we can miss a done_head update.
         */

        cpu = atomic_xchg(&data->lock, -1);
        WARN_ON_ONCE(cpu != smp_processor_id());

        /*
         * Therefore we have to validate we did not indeed do so.
         */
        if (unlikely(atomic_read(&data->done_head))) {
                /*
                 * Since we had it locked, we can lock it again.
                 */
                while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                        cpu_relax();

                goto again;
        }

        if (atomic_xchg(&data->wakeup, 0))
                perf_output_wakeup(handle);
out:
        local_irq_restore(handle->flags);
}
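
/*
 * Reserve 'size' bytes in the mmap()ed data buffer; on success the handle
 * describes the reservation and perf_output_copy()/perf_output_end()
 * complete the write.
 */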
static int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_counter *counter, unsigned int size,
                             int nmi, int overflow)
{
        struct perf_mmap_data *data;
        unsigned int offset, head;

        /*
         * For inherited counters we send all the output towards the parent.
         */
        if (counter->parent)
                counter = counter->parent;

        rcu_read_lock();
        data = rcu_dereference(counter->data);
        if (!data)
                goto out;

        handle->data	 = data;
        handle->counter	 = counter;
        handle->nmi	 = nmi;
        handle->overflow = overflow;

        if (!data->nr_pages)
                goto fail;

        perf_output_lock(handle);

        do {
                offset = head = atomic_read(&data->head);
                head += size;
        } while (atomic_cmpxchg(&data->head, offset, head) != offset);

        handle->offset	= offset;
        handle->head	= head;

        if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
                atomic_set(&data->wakeup, 1);

        return 0;

fail:
        perf_output_wakeup(handle);
out:
        rcu_read_unlock();

        return -ENOSPC;
}
static void perf_output_copy(struct perf_output_handle *handle,
                             void *buf, unsigned int len)
{
        unsigned int pages_mask;
        unsigned int offset;
        unsigned int size;
        void **pages;

        offset		= handle->offset;
        pages_mask	= handle->data->nr_pages - 1;
        pages		= handle->data->data_pages;

        do {
                unsigned int page_offset;
                int nr;

                nr	    = (offset >> PAGE_SHIFT) & pages_mask;
                page_offset = offset & (PAGE_SIZE - 1);
                size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);

                memcpy(pages[nr] + page_offset, buf, size);

                len	-= size;
                buf	+= size;
                offset	+= size;
        } while (len);

        handle->offset = offset;

        /*
         * Check we didn't copy past our reservation window, taking the
         * possible unsigned int wrap into account.
         */
        WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
}

#define perf_output_put(handle, x) \
	perf_output_copy((handle), &(x), sizeof(x))
static void perf_output_end(struct perf_output_handle *handle)
{
        struct perf_counter *counter = handle->counter;
        struct perf_mmap_data *data = handle->data;

        int wakeup_events = counter->hw_event.wakeup_events;

        if (handle->overflow && wakeup_events) {
                int events = atomic_inc_return(&data->events);
                if (events >= wakeup_events) {
                        atomic_sub(wakeup_events, &data->events);
                        atomic_set(&data->wakeup, 1);
                }
        }

        perf_output_unlock(handle);
        rcu_read_unlock();
}
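
/*
 * Emit one overflow event for a counter: a perf_event_header followed by
 * the fields selected by hw_event.record_type (IP, TID, time, addr,
 * config, cpu, group readings, callchain).
 */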
static void perf_counter_output(struct perf_counter *counter,
                                int nmi, struct pt_regs *regs, u64 addr)
{
        int ret;
        u64 record_type = counter->hw_event.record_type;
        struct perf_output_handle handle;
        struct perf_event_header header;
        u64 ip;
        struct {
                u32 pid, tid;
        } tid_entry;
        struct {
                u64 event;
                u64 counter;
        } group_entry;
        struct perf_callchain_entry *callchain = NULL;
        int callchain_size = 0;
        u64 time;
        struct {
                u32 cpu, reserved;
        } cpu_entry;

        header.type = 0;
        header.size = sizeof(header);

        header.misc = PERF_EVENT_MISC_OVERFLOW;
        header.misc |= perf_misc_flags(regs);

        if (record_type & PERF_RECORD_IP) {
                ip = perf_instruction_pointer(regs);
                header.type |= PERF_RECORD_IP;
                header.size += sizeof(ip);
        }

        if (record_type & PERF_RECORD_TID) {
                /* namespace issues */
                tid_entry.pid = current->group_leader->pid;
                tid_entry.tid = current->pid;

                header.type |= PERF_RECORD_TID;
                header.size += sizeof(tid_entry);
        }

        if (record_type & PERF_RECORD_TIME) {
                /*
                 * Maybe do better on x86 and provide cpu_clock_nmi()
                 */
                time = sched_clock();

                header.type |= PERF_RECORD_TIME;
                header.size += sizeof(u64);
        }

        if (record_type & PERF_RECORD_ADDR) {
                header.type |= PERF_RECORD_ADDR;
                header.size += sizeof(u64);
        }

        if (record_type & PERF_RECORD_CONFIG) {
                header.type |= PERF_RECORD_CONFIG;
                header.size += sizeof(u64);
        }

        if (record_type & PERF_RECORD_CPU) {
                header.type |= PERF_RECORD_CPU;
                header.size += sizeof(cpu_entry);

                cpu_entry.cpu = raw_smp_processor_id();
        }

        if (record_type & PERF_RECORD_GROUP) {
                header.type |= PERF_RECORD_GROUP;
                header.size += sizeof(u64) +
                        counter->nr_siblings * sizeof(group_entry);
        }

        if (record_type & PERF_RECORD_CALLCHAIN) {
                callchain = perf_callchain(regs);

                if (callchain) {
                        callchain_size = (1 + callchain->nr) * sizeof(u64);

                        header.type |= PERF_RECORD_CALLCHAIN;
                        header.size += callchain_size;
                }
        }

        ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
        if (ret)
                return;

        perf_output_put(&handle, header);

        if (record_type & PERF_RECORD_IP)
                perf_output_put(&handle, ip);

        if (record_type & PERF_RECORD_TID)
                perf_output_put(&handle, tid_entry);

        if (record_type & PERF_RECORD_TIME)
                perf_output_put(&handle, time);

        if (record_type & PERF_RECORD_ADDR)
                perf_output_put(&handle, addr);

        if (record_type & PERF_RECORD_CONFIG)
                perf_output_put(&handle, counter->hw_event.config);

        if (record_type & PERF_RECORD_CPU)
                perf_output_put(&handle, cpu_entry);

        /*
         * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
         */
        if (record_type & PERF_RECORD_GROUP) {
                struct perf_counter *leader, *sub;
                u64 nr = counter->nr_siblings;

                perf_output_put(&handle, nr);

                leader = counter->group_leader;
                list_for_each_entry(sub, &leader->sibling_list, list_entry) {
                        if (sub != counter)
                                sub->pmu->read(sub);

                        group_entry.event = sub->hw_event.config;
                        group_entry.counter = atomic64_read(&sub->count);

                        perf_output_put(&handle, group_entry);
                }
        }

        if (callchain)
                perf_output_copy(&handle, callchain, callchain_size);

        perf_output_end(&handle);
}
struct perf_comm_event {
        struct task_struct	*task;
        char			*comm;
        int			comm_size;

        struct {
                struct perf_event_header	header;

                u32				pid;
                u32				tid;
        } event;
};

static void perf_counter_comm_output(struct perf_counter *counter,
                                     struct perf_comm_event *comm_event)
{
        struct perf_output_handle handle;
        int size = comm_event->event.header.size;
        int ret = perf_output_begin(&handle, counter, size, 0, 0);

        if (ret)
                return;

        perf_output_put(&handle, comm_event->event);
        perf_output_copy(&handle, comm_event->comm,
                                   comm_event->comm_size);
        perf_output_end(&handle);
}

static int perf_counter_comm_match(struct perf_counter *counter,
                                   struct perf_comm_event *comm_event)
{
        if (counter->hw_event.comm &&
            comm_event->event.header.type == PERF_EVENT_COMM)
                return 1;

        return 0;
}

static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
                                  struct perf_comm_event *comm_event)
{
        struct perf_counter *counter;

        if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
                if (perf_counter_comm_match(counter, comm_event))
                        perf_counter_comm_output(counter, comm_event);
        }
        rcu_read_unlock();
}
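
/*
 * Deliver a comm event to all interested counters: first those on this
 * CPU's context, then those on the current task's context.
 */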

static void perf_counter_comm_event(struct perf_comm_event *comm_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	unsigned int size;
	char *comm = comm_event->task->comm;

	size = ALIGN(strlen(comm)+1, sizeof(u64));

	comm_event->comm = comm;
	comm_event->comm_size = size;

	comm_event->event.header.size = sizeof(comm_event->event) + size;

	cpuctx = &get_cpu_var(perf_cpu_context);
	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
	put_cpu_var(perf_cpu_context);

	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
	ctx = rcu_dereference(current->perf_counter_ctxp);
	if (ctx)
		perf_counter_comm_ctx(ctx, comm_event);
	rcu_read_unlock();
}

void perf_counter_comm(struct task_struct *task)
{
	struct perf_comm_event comm_event;

	if (!atomic_read(&nr_comm_tracking))
		return;

	comm_event = (struct perf_comm_event){
		.task	= task,
		.event  = {
			.header = { .type = PERF_EVENT_COMM, },
			.pid	= task->group_leader->pid,
			.tid	= task->pid,
		},
	};

	perf_counter_comm_event(&comm_event);
}
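
/*
 * Illustrative call site (a sketch, not part of this file): the exec path
 * is expected to report comm changes for the task whose name just changed,
 * roughly along the lines of:
 *
 *	set_task_comm(current, new_name);
 *	perf_counter_comm(current);
 *
 * The nr_comm_tracking check above keeps this a cheap no-op while nobody
 * asked for comm events.
 */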

struct perf_mmap_event {
	struct file	*file;
	char		*file_name;
	int		file_size;

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;
		u64				len;
		u64				pgoff;
	} event;
};

static void perf_counter_mmap_output(struct perf_counter *counter,
				     struct perf_mmap_event *mmap_event)
{
	struct perf_output_handle handle;
	int size = mmap_event->event.header.size;
	int ret = perf_output_begin(&handle, counter, size, 0, 0);

	if (ret)
		return;

	perf_output_put(&handle, mmap_event->event);
	perf_output_copy(&handle, mmap_event->file_name,
				   mmap_event->file_size);
	perf_output_end(&handle);
}

static int perf_counter_mmap_match(struct perf_counter *counter,
				   struct perf_mmap_event *mmap_event)
{
	if (counter->hw_event.mmap &&
	    mmap_event->event.header.type == PERF_EVENT_MMAP)
		return 1;

	if (counter->hw_event.munmap &&
	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
		return 1;

	return 0;
}

static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
				  struct perf_mmap_event *mmap_event)
{
	struct perf_counter *counter;

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
		if (perf_counter_mmap_match(counter, mmap_event))
			perf_counter_mmap_output(counter, mmap_event);
	}
	rcu_read_unlock();
}

static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct file *file = mmap_event->file;
	unsigned int size;
	char tmp[16];
	char *buf = NULL;
	char *name;

	if (file) {
		buf = kzalloc(PATH_MAX, GFP_KERNEL);
		if (!buf) {
			name = strncpy(tmp, "//enomem", sizeof(tmp));
			goto got_name;
		}
		name = d_path(&file->f_path, buf, PATH_MAX);
		if (IS_ERR(name)) {
			name = strncpy(tmp, "//toolong", sizeof(tmp));
			goto got_name;
		}
	} else {
		name = strncpy(tmp, "//anon", sizeof(tmp));
		goto got_name;
	}

got_name:
	size = ALIGN(strlen(name)+1, sizeof(u64));

	mmap_event->file_name = name;
	mmap_event->file_size = size;

	mmap_event->event.header.size = sizeof(mmap_event->event) + size;

	cpuctx = &get_cpu_var(perf_cpu_context);
	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
	put_cpu_var(perf_cpu_context);

	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
	ctx = rcu_dereference(current->perf_counter_ctxp);
	if (ctx)
		perf_counter_mmap_ctx(ctx, mmap_event);
	rcu_read_unlock();

	kfree(buf);
}

void perf_counter_mmap(unsigned long addr, unsigned long len,
		       unsigned long pgoff, struct file *file)
{
	struct perf_mmap_event mmap_event;

	if (!atomic_read(&nr_mmap_tracking))
		return;

	mmap_event = (struct perf_mmap_event){
		.file   = file,
		.event  = {
			.header = { .type = PERF_EVENT_MMAP, },
			.pid	= current->group_leader->pid,
			.tid	= current->pid,
			.start  = addr,
			.len    = len,
			.pgoff  = pgoff,
		},
	};

	perf_counter_mmap_event(&mmap_event);
}

void perf_counter_munmap(unsigned long addr, unsigned long len,
			 unsigned long pgoff, struct file *file)
{
	struct perf_mmap_event mmap_event;

	if (!atomic_read(&nr_munmap_tracking))
		return;

	mmap_event = (struct perf_mmap_event){
		.file   = file,
		.event  = {
			.header = { .type = PERF_EVENT_MUNMAP, },
			.pid	= current->group_leader->pid,
			.tid	= current->pid,
			.start  = addr,
			.len    = len,
			.pgoff  = pgoff,
		},
	};

	perf_counter_mmap_event(&mmap_event);
}
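
/*
 * Illustrative call sites (a sketch, not part of this file): the mm code
 * is expected to report address-space changes from its map/unmap paths,
 * roughly as:
 *
 *	perf_counter_mmap(addr, len, pgoff, file);
 *	...
 *	perf_counter_munmap(addr, len, pgoff, file);
 *
 * Both are cheap no-ops while no counter has requested mmap/munmap
 * tracking (see the nr_mmap_tracking/nr_munmap_tracking checks above).
 */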

/*
 * Log irq_period changes so that analyzing tools can re-normalize the
 * event flow.
 */

static void perf_log_period(struct perf_counter *counter, u64 period)
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
		u64				period;
	} freq_event = {
		.header = {
			.type = PERF_EVENT_PERIOD,
			.misc = 0,
			.size = sizeof(freq_event),
		},
		.time = sched_clock(),
		.period = period,
	};

	if (counter->hw.irq_period == period)
		return;

	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
	if (ret)
		return;

	perf_output_put(&handle, freq_event);
	perf_output_end(&handle);
}

/*
 * IRQ throttle logging
 */

static void perf_log_throttle(struct perf_counter *counter, int enable)
{
	struct perf_output_handle handle;
	int ret;

	struct {
		struct perf_event_header	header;
		u64				time;
	} throttle_event = {
		.header = {
			.type = PERF_EVENT_THROTTLE + 1,
			.misc = 0,
			.size = sizeof(throttle_event),
		},
		.time = sched_clock(),
	};

	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
	if (ret)
		return;

	perf_output_put(&handle, throttle_event);
	perf_output_end(&handle);
}

/*
 * Generic counter overflow handling.
 */

int perf_counter_overflow(struct perf_counter *counter,
			  int nmi, struct pt_regs *regs, u64 addr)
{
	int events = atomic_read(&counter->event_limit);
	int throttle = counter->pmu->unthrottle != NULL;
	int ret = 0;

	if (!throttle) {
		counter->hw.interrupts++;
	} else if (counter->hw.interrupts != MAX_INTERRUPTS) {
		counter->hw.interrupts++;
		if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
			counter->hw.interrupts = MAX_INTERRUPTS;
			perf_log_throttle(counter, 0);
			ret = 1;
		}
	}

	/*
	 * XXX event_limit might not quite work as expected on inherited
	 * counters
	 */

	counter->pending_kill = POLL_IN;
	if (events && atomic_dec_and_test(&counter->event_limit)) {
		ret = 1;
		counter->pending_kill = POLL_HUP;
		if (nmi) {
			counter->pending_disable = 1;
			perf_pending_queue(&counter->pending,
					   perf_pending_counter);
		} else
			perf_counter_disable(counter);
	}

	perf_counter_output(counter, nmi, regs, addr);
	return ret;
}
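
/*
 * Worked example for the throttling check above (numbers illustrative):
 * with HZ=1000 and the default sysctl_perf_counter_limit of 100000 NMIs
 * per second, HZ * hw.interrupts first exceeds the limit once a counter
 * has taken more than 100 interrupts since it was last unthrottled; at
 * that point it is marked MAX_INTERRUPTS and a throttle event is logged.
 */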

/*
 * Generic software counter infrastructure
 */

static void perf_swcounter_update(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	u64 prev, now;
	s64 delta;

again:
	prev = atomic64_read(&hwc->prev_count);
	now = atomic64_read(&hwc->count);
	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
		goto again;

	delta = now - prev;

	atomic64_add(delta, &counter->count);
	atomic64_sub(delta, &hwc->period_left);
}

static void perf_swcounter_set_period(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	s64 left = atomic64_read(&hwc->period_left);
	s64 period = hwc->irq_period;

	if (unlikely(left <= -period)) {
		left = period;
		atomic64_set(&hwc->period_left, left);
	}

	if (unlikely(left <= 0)) {
		left += period;
		atomic64_add(period, &hwc->period_left);
	}

	atomic64_set(&hwc->prev_count, -left);
	atomic64_set(&hwc->count, -left);
}

static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
{
	enum hrtimer_restart ret = HRTIMER_RESTART;
	struct perf_counter *counter;
	struct pt_regs *regs;
	u64 period;

	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
	counter->pmu->read(counter);

	regs = get_irq_regs();
	/*
	 * In case we exclude kernel IPs or are somehow not in interrupt
	 * context, provide the next best thing, the user IP.
	 */
	if ((counter->hw_event.exclude_kernel || !regs) &&
			!counter->hw_event.exclude_user)
		regs = task_pt_regs(current);

	if (regs) {
		if (perf_counter_overflow(counter, 0, regs, 0))
			ret = HRTIMER_NORESTART;
	}

	period = max_t(u64, 10000, counter->hw.irq_period);
	hrtimer_forward_now(hrtimer, ns_to_ktime(period));

	return ret;
}

static void perf_swcounter_overflow(struct perf_counter *counter,
				    int nmi, struct pt_regs *regs, u64 addr)
{
	perf_swcounter_update(counter);
	perf_swcounter_set_period(counter);
	if (perf_counter_overflow(counter, nmi, regs, addr))
		/* soft-disable the counter */
		;
}

static int perf_swcounter_match(struct perf_counter *counter,
				enum perf_event_types type,
				u32 event, struct pt_regs *regs)
{
	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
		return 0;

	if (perf_event_raw(&counter->hw_event))
		return 0;

	if (perf_event_type(&counter->hw_event) != type)
		return 0;

	if (perf_event_id(&counter->hw_event) != event)
		return 0;

	if (counter->hw_event.exclude_user && user_mode(regs))
		return 0;

	if (counter->hw_event.exclude_kernel && !user_mode(regs))
		return 0;

	return 1;
}

static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
			       int nmi, struct pt_regs *regs, u64 addr)
{
	int neg = atomic64_add_negative(nr, &counter->hw.count);
	if (counter->hw.irq_period && !neg)
		perf_swcounter_overflow(counter, nmi, regs, addr);
}

static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
				     enum perf_event_types type, u32 event,
				     u64 nr, int nmi, struct pt_regs *regs,
				     u64 addr)
{
	struct perf_counter *counter;

	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
		if (perf_swcounter_match(counter, type, event, regs))
			perf_swcounter_add(counter, nr, nmi, regs, addr);
	}
	rcu_read_unlock();
}

static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
{
	if (in_nmi())
		return &cpuctx->recursion[3];

	if (in_irq())
		return &cpuctx->recursion[2];

	if (in_softirq())
		return &cpuctx->recursion[1];

	return &cpuctx->recursion[0];
}

static void __perf_swcounter_event(enum perf_event_types type, u32 event,
				   u64 nr, int nmi, struct pt_regs *regs,
				   u64 addr)
{
	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
	int *recursion = perf_swcounter_recursion_context(cpuctx);
	struct perf_counter_context *ctx;

	if (*recursion)
		goto out;

	(*recursion)++;
	barrier();

	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
				 nr, nmi, regs, addr);
	rcu_read_lock();
	/*
	 * doesn't really matter which of the child contexts the
	 * event ends up in.
	 */
	ctx = rcu_dereference(current->perf_counter_ctxp);
	if (ctx)
		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
	rcu_read_unlock();

	barrier();
	(*recursion)--;

out:
	put_cpu_var(perf_cpu_context);
}

void
perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
{
	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
}
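
/*
 * Illustrative usage (a sketch, not part of this file): core kernel and
 * architecture code drives software counters by calling
 * perf_swcounter_event() from the relevant event path; a fault handler,
 * for example, might report a fault with:
 *
 *	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 *
 * which feeds every matching PERF_TYPE_SOFTWARE counter in the current
 * CPU context and in the current task's context.
 */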

static void perf_swcounter_read(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}

static int perf_swcounter_enable(struct perf_counter *counter)
{
	perf_swcounter_set_period(counter);
	return 0;
}

static void perf_swcounter_disable(struct perf_counter *counter)
{
	perf_swcounter_update(counter);
}

static const struct pmu perf_ops_generic = {
	.enable		= perf_swcounter_enable,
	.disable	= perf_swcounter_disable,
	.read		= perf_swcounter_read,
};

/*
 * Software counter: cpu wall time clock
 */

static void cpu_clock_perf_counter_update(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_read(&counter->hw.prev_count);
	atomic64_set(&counter->hw.prev_count, now);
	atomic64_add(now - prev, &counter->count);
}

static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int cpu = raw_smp_processor_id();

	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swcounter_hrtimer;
	if (hwc->irq_period) {
		u64 period = max_t(u64, 10000, hwc->irq_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}
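
/*
 * Note on the period above (an observation, not original documentation):
 * the hrtimer period is clamped to at least 10000ns (10us), so a sampling
 * software clock counter fires at most ~100000 times per second, in the
 * same ballpark as the default sysctl_perf_counter_limit used for
 * hardware NMI throttling.
 */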

static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
	if (counter->hw.irq_period)
		hrtimer_cancel(&counter->hw.hrtimer);
	cpu_clock_perf_counter_update(counter);
}

static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}

static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_counter_enable,
	.disable	= cpu_clock_perf_counter_disable,
	.read		= cpu_clock_perf_counter_read,
};

/*
 * Software counter: task time clock
 */

static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_xchg(&counter->hw.prev_count, now);
	delta = now - prev;
	atomic64_add(delta, &counter->count);
}

static int task_clock_perf_counter_enable(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	u64 now;

	now = counter->ctx->time;

	atomic64_set(&hwc->prev_count, now);
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swcounter_hrtimer;
	if (hwc->irq_period) {
		u64 period = max_t(u64, 10000, hwc->irq_period);
		__hrtimer_start_range_ns(&hwc->hrtimer,
				ns_to_ktime(period), 0,
				HRTIMER_MODE_REL, 0);
	}

	return 0;
}

static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
	if (counter->hw.irq_period)
		hrtimer_cancel(&counter->hw.hrtimer);
	task_clock_perf_counter_update(counter, counter->ctx->time);
}

static void task_clock_perf_counter_read(struct perf_counter *counter)
{
	u64 time;

	if (!in_nmi()) {
		update_context_time(counter->ctx);
		time = counter->ctx->time;
	} else {
		u64 now = perf_clock();
		u64 delta = now - counter->ctx->timestamp;
		time = counter->ctx->time + delta;
	}

	task_clock_perf_counter_update(counter, time);
}

static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_counter_enable,
	.disable	= task_clock_perf_counter_disable,
	.read		= task_clock_perf_counter_read,
};

/*
 * Software counter: cpu migrations
 */

static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
	struct task_struct *curr = counter->ctx->task;

	if (curr)
		return curr->se.nr_migrations;
	return cpu_nr_migrations(smp_processor_id());
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations(counter);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
		atomic64_set(&counter->hw.prev_count,
			     get_cpu_migrations(counter));
	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct pmu perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};

#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);

extern int ftrace_profile_enable(int);
extern void ftrace_profile_disable(int);

static void tp_perf_counter_destroy(struct perf_counter *counter)
{
	ftrace_profile_disable(perf_event_id(&counter->hw_event));
}

static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
	int event_id = perf_event_id(&counter->hw_event);
	int ret;

	ret = ftrace_profile_enable(event_id);
	if (ret)
		return NULL;

	counter->destroy = tp_perf_counter_destroy;
	counter->hw.irq_period = counter->hw_event.irq_period;

	return &perf_ops_generic;
}
#else
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}
#endif

static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
{
	const struct pmu *pmu = NULL;

	/*
	 * Software counters (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (perf_event_id(&counter->hw_event)) {
	case PERF_COUNT_CPU_CLOCK:
		pmu = &perf_ops_cpu_clock;
		break;

	case PERF_COUNT_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu counter,
		 * use the cpu_clock counter instead.
		 */
		if (counter->ctx->task)
			pmu = &perf_ops_task_clock;
		else
			pmu = &perf_ops_cpu_clock;
		break;

	case PERF_COUNT_PAGE_FAULTS:
	case PERF_COUNT_PAGE_FAULTS_MIN:
	case PERF_COUNT_PAGE_FAULTS_MAJ:
	case PERF_COUNT_CONTEXT_SWITCHES:
		pmu = &perf_ops_generic;
		break;

	case PERF_COUNT_CPU_MIGRATIONS:
		if (!counter->hw_event.exclude_kernel)
			pmu = &perf_ops_cpu_migrations;
		break;
	}

	return pmu;
}

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter_context *ctx,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_counter *counter;
	struct hw_perf_counter *hwc;
	long err;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->child_mutex);
	INIT_LIST_HEAD(&counter->child_list);

	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->event_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	mutex_init(&counter->mmap_mutex);

	counter->cpu			= cpu;
	counter->hw_event		= *hw_event;
	counter->group_leader		= group_leader;
	counter->pmu			= NULL;
	counter->ctx			= ctx;
	counter->oncpu			= -1;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	pmu = NULL;

	hwc = &counter->hw;
	if (hw_event->freq && hw_event->irq_freq)
		hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
	else
		hwc->irq_period = hw_event->irq_period;

	/*
	 * we currently do not support PERF_RECORD_GROUP on inherited counters
	 */
	if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
		goto done;

	if (perf_event_raw(hw_event)) {
		pmu = hw_perf_counter_init(counter);
		goto done;
	}

	switch (perf_event_type(hw_event)) {
	case PERF_TYPE_HARDWARE:
		pmu = hw_perf_counter_init(counter);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_counter_init(counter);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_counter_init(counter);
		break;
	}
done:
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		kfree(counter);
		return ERR_PTR(err);
	}

	counter->pmu = pmu;

	atomic_inc(&nr_counters);
	if (counter->hw_event.mmap)
		atomic_inc(&nr_mmap_tracking);
	if (counter->hw_event.munmap)
		atomic_inc(&nr_munmap_tracking);
	if (counter->hw_event.comm)
		atomic_inc(&nr_comm_tracking);

	return counter;
}
/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
SYSCALL_DEFINE5(perf_counter_open,
		const struct perf_counter_hw_event __user *, hw_event_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	/* for future expandability... */
	if (flags)
		return -EINVAL;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
				     GFP_KERNEL);
	ret = PTR_ERR(counter);
	if (IS_ERR(counter))
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	counter->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_counter_mutex);
	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
	mutex_unlock(&current->perf_counter_mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_ctx(ctx);

	goto out_fput;
}
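
/*
 * Illustrative userspace usage (a sketch, not part of this file; the
 * event encoding for hw_event.config is deliberately left out):
 *
 *	struct perf_counter_hw_event hw_event = { 0 };
 *	u64 count;
 *	int fd;
 *
 *	... fill hw_event.config with the desired event encoding ...
 *	fd = syscall(__NR_perf_counter_open, &hw_event, getpid(), -1, -1, 0);
 *	if (fd >= 0) {
 *		read(fd, &count, sizeof(count));
 *		close(fd);
 *	}
 *
 * pid selects the target task (0 means the current task, -1 together with
 * a valid cpu selects a per-cpu context), cpu selects the cpu (or -1),
 * group_fd of -1 creates a new group leader, and flags must currently be 0.
 */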

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter *group_leader,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, child_ctx,
					   group_leader, GFP_KERNEL);
	if (IS_ERR(child_counter))
		return child_counter;
	get_ctx(child_ctx);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	/*
	 * Link it up in the child's context:
	 */
	add_counter_to_ctx(child_counter, child_ctx);

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
	mutex_lock(&parent_counter->child_mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
	mutex_unlock(&parent_counter->child_mutex);

	return child_counter;
}

static int inherit_group(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;
	struct perf_counter *child_ctr;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		child_ctr = inherit_counter(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 child_val;

	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);
	atomic64_add(child_counter->total_time_enabled,
		     &parent_counter->child_total_time_enabled);
	atomic64_add(child_counter->total_time_running,
		     &parent_counter->child_total_time_running);

	/*
	 * Remove this counter from the parent's list
	 */
	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
	mutex_lock(&parent_counter->child_mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->child_mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}

static void
__perf_counter_exit_task(struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;

	update_counter_times(child_counter);
	perf_counter_remove_from_context(child_counter);

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		free_counter(child_counter);
	}
}

/*
 * When a child task exits, feed back counter values to parent counters.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_counter_ctxp))
		return;

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_counter_ctxp;
	__perf_counter_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_counter_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	spin_lock(&child_ctx->lock);
	child->perf_counter_ctxp = NULL;
	if (child_ctx->parent_ctx) {
		/*
		 * This context is a clone; unclone it so it can't get
		 * swapped to another process while we're removing all
		 * the counters from it.
		 */
		put_ctx(child_ctx->parent_ctx);
		child_ctx->parent_ctx = NULL;
	}
	spin_unlock(&child_ctx->lock);
	local_irq_restore(flags);

	mutex_lock(&child_ctx->mutex);

again:
	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child_counter, child_ctx);

	/*
	 * If the last counter was a group counter, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->counter_list))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}

/*
 * free an unexposed, unused context as created by inheritance by
 * init_task below, used by fork() in case of fail.
 */
void perf_counter_free_task(struct task_struct *task)
{
	struct perf_counter_context *ctx = task->perf_counter_ctxp;
	struct perf_counter *counter, *tmp;

	if (!ctx)
		return;

	mutex_lock(&ctx->mutex);
again:
	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
		struct perf_counter *parent = counter->parent;

		if (WARN_ON_ONCE(!parent))
			continue;

		mutex_lock(&parent->child_mutex);
		list_del_init(&counter->child_list);
		mutex_unlock(&parent->child_mutex);

		fput(parent->filp);

		list_del_counter(counter, ctx);
		free_counter(counter);
	}

	if (!list_empty(&ctx->counter_list))
		goto again;

	mutex_unlock(&ctx->mutex);

	put_ctx(ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
int perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter_context *cloned_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;
	int inherited_all = 1;
	u64 cloned_gen;
	int ret = 0;

	child->perf_counter_ctxp = NULL;

	mutex_init(&child->perf_counter_mutex);
	INIT_LIST_HEAD(&child->perf_counter_list);

	if (likely(!parent->perf_counter_ctxp))
		return 0;

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning.
	 * First allocate and initialize a context for the child.
	 */

	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
	if (!child_ctx)
		return -ENOMEM;

	__perf_counter_init_context(child_ctx, child);
	child->perf_counter_ctxp = child_ctx;
	get_task_struct(child);

	/*
	 * If the parent's context is a clone, temporarily set its
	 * parent_gen to an impossible value (all 1s) so it won't get
	 * swapped under us. The rcu_read_lock makes sure that
	 * parent_ctx continues to exist even if it gets swapped to
	 * another process and then freed while we are trying to get
	 * its lock.
	 */
	rcu_read_lock();
retry:
	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */
	spin_lock_irq(&parent_ctx->lock);
	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
		spin_unlock_irq(&parent_ctx->lock);
		goto retry;
	}
	cloned_gen = parent_ctx->parent_gen;
	if (parent_ctx->parent_ctx)
		parent_ctx->parent_gen = ~0ull;
	spin_unlock_irq(&parent_ctx->lock);
	rcu_read_unlock();

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
		if (counter != counter->group_leader)
			continue;

		if (!counter->hw_event.inherit) {
			inherited_all = 0;
			continue;
		}

		ret = inherit_group(counter, parent, parent_ctx,
					     child, child_ctx);
		if (ret) {
			inherited_all = 0;
			break;
		}
	}

	if (inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of counters and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = cloned_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	/*
	 * Restore the clone status of the parent.
	 */
	if (parent_ctx->parent_ctx) {
		spin_lock_irq(&parent_ctx->lock);
		if (parent_ctx->parent_ctx)
			parent_ctx->parent_gen = cloned_gen;
		spin_unlock_irq(&parent_ctx->lock);
	}

	return ret;
}

static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	spin_lock(&perf_resource_lock);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	spin_unlock(&perf_resource_lock);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}
static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

void __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);
}

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	spin_unlock(&perf_resource_lock);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	spin_lock(&perf_resource_lock);
	perf_overcommit = val;
	spin_unlock(&perf_resource_lock);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);