2 * Performance counter core code
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
8 * For licensing details see kernel-base/COPYING
13 #include <linux/cpu.h>
14 #include <linux/smp.h>
15 #include <linux/file.h>
16 #include <linux/poll.h>
17 #include <linux/sysfs.h>
18 #include <linux/ptrace.h>
19 #include <linux/percpu.h>
20 #include <linux/vmstat.h>
21 #include <linux/hardirq.h>
22 #include <linux/rculist.h>
23 #include <linux/uaccess.h>
24 #include <linux/syscalls.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/kernel_stat.h>
27 #include <linux/perf_counter.h>
28 #include <linux/dcache.h>
30 #include <asm/irq_regs.h>
33 * Each CPU has a list of per CPU counters:
35 DEFINE_PER_CPU(struct perf_cpu_context
, perf_cpu_context
);
37 int perf_max_counters __read_mostly
= 1;
38 static int perf_reserved_percpu __read_mostly
;
39 static int perf_overcommit __read_mostly
= 1;
41 static atomic_t nr_mmap_tracking __read_mostly
;
42 static atomic_t nr_munmap_tracking __read_mostly
;
43 static atomic_t nr_comm_tracking __read_mostly
;
46 * Mutex for (sysadmin-configurable) counter reservations:
48 static DEFINE_MUTEX(perf_resource_mutex
);
51 * Architecture provided APIs - weak aliases:
53 extern __weak
const struct hw_perf_counter_ops
*
54 hw_perf_counter_init(struct perf_counter
*counter
)
59 u64 __weak
hw_perf_save_disable(void) { return 0; }
60 void __weak
hw_perf_restore(u64 ctrl
) { barrier(); }
61 void __weak
hw_perf_counter_setup(int cpu
) { barrier(); }
62 int __weak
hw_perf_group_sched_in(struct perf_counter
*group_leader
,
63 struct perf_cpu_context
*cpuctx
,
64 struct perf_counter_context
*ctx
, int cpu
)
69 void __weak
perf_counter_print_debug(void) { }
72 list_add_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
74 struct perf_counter
*group_leader
= counter
->group_leader
;
77 * Depending on whether it is a standalone or sibling counter,
78 * add it straight to the context's counter list, or to the group
79 * leader's sibling list:
81 if (counter
->group_leader
== counter
)
82 list_add_tail(&counter
->list_entry
, &ctx
->counter_list
);
84 list_add_tail(&counter
->list_entry
, &group_leader
->sibling_list
);
85 group_leader
->nr_siblings
++;
88 list_add_rcu(&counter
->event_entry
, &ctx
->event_list
);
92 list_del_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
94 struct perf_counter
*sibling
, *tmp
;
96 list_del_init(&counter
->list_entry
);
97 list_del_rcu(&counter
->event_entry
);
99 if (counter
->group_leader
!= counter
)
100 counter
->group_leader
->nr_siblings
--;
103 * If this was a group counter with sibling counters then
104 * upgrade the siblings to singleton counters by adding them
105 * to the context list directly:
107 list_for_each_entry_safe(sibling
, tmp
,
108 &counter
->sibling_list
, list_entry
) {
110 list_move_tail(&sibling
->list_entry
, &ctx
->counter_list
);
111 sibling
->group_leader
= sibling
;
116 counter_sched_out(struct perf_counter
*counter
,
117 struct perf_cpu_context
*cpuctx
,
118 struct perf_counter_context
*ctx
)
120 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
123 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
124 counter
->tstamp_stopped
= ctx
->time
;
125 counter
->hw_ops
->disable(counter
);
128 if (!is_software_counter(counter
))
129 cpuctx
->active_oncpu
--;
131 if (counter
->hw_event
.exclusive
|| !cpuctx
->active_oncpu
)
132 cpuctx
->exclusive
= 0;
136 group_sched_out(struct perf_counter
*group_counter
,
137 struct perf_cpu_context
*cpuctx
,
138 struct perf_counter_context
*ctx
)
140 struct perf_counter
*counter
;
142 if (group_counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
145 counter_sched_out(group_counter
, cpuctx
, ctx
);
148 * Schedule out siblings (if any):
150 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
)
151 counter_sched_out(counter
, cpuctx
, ctx
);
153 if (group_counter
->hw_event
.exclusive
)
154 cpuctx
->exclusive
= 0;
158 * Cross CPU call to remove a performance counter
160 * We disable the counter on the hardware level first. After that we
161 * remove it from the context list.
163 static void __perf_counter_remove_from_context(void *info
)
165 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
166 struct perf_counter
*counter
= info
;
167 struct perf_counter_context
*ctx
= counter
->ctx
;
172 * If this is a task context, we need to check whether it is
173 * the current task context of this cpu. If not it has been
174 * scheduled out before the smp call arrived.
176 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
179 spin_lock_irqsave(&ctx
->lock
, flags
);
181 counter_sched_out(counter
, cpuctx
, ctx
);
183 counter
->task
= NULL
;
187 * Protect the list operation against NMI by disabling the
188 * counters on a global level. NOP for non NMI based counters.
190 perf_flags
= hw_perf_save_disable();
191 list_del_counter(counter
, ctx
);
192 hw_perf_restore(perf_flags
);
196 * Allow more per task counters with respect to the
199 cpuctx
->max_pertask
=
200 min(perf_max_counters
- ctx
->nr_counters
,
201 perf_max_counters
- perf_reserved_percpu
);
204 spin_unlock_irqrestore(&ctx
->lock
, flags
);
209 * Remove the counter from a task's (or a CPU's) list of counters.
211 * Must be called with counter->mutex and ctx->mutex held.
213 * CPU counters are removed with a smp call. For task counters we only
214 * call when the task is on a CPU.
216 static void perf_counter_remove_from_context(struct perf_counter
*counter
)
218 struct perf_counter_context
*ctx
= counter
->ctx
;
219 struct task_struct
*task
= ctx
->task
;
223 * Per cpu counters are removed via an smp call and
224 * the removal is always sucessful.
226 smp_call_function_single(counter
->cpu
,
227 __perf_counter_remove_from_context
,
233 task_oncpu_function_call(task
, __perf_counter_remove_from_context
,
236 spin_lock_irq(&ctx
->lock
);
238 * If the context is active we need to retry the smp call.
240 if (ctx
->nr_active
&& !list_empty(&counter
->list_entry
)) {
241 spin_unlock_irq(&ctx
->lock
);
246 * The lock prevents that this context is scheduled in so we
247 * can remove the counter safely, if the call above did not
250 if (!list_empty(&counter
->list_entry
)) {
252 list_del_counter(counter
, ctx
);
253 counter
->task
= NULL
;
255 spin_unlock_irq(&ctx
->lock
);
258 static inline u64
perf_clock(void)
260 return cpu_clock(smp_processor_id());
264 * Update the record of the current time in a context.
266 static void update_context_time(struct perf_counter_context
*ctx
)
268 u64 now
= perf_clock();
270 ctx
->time
+= now
- ctx
->timestamp
;
271 ctx
->timestamp
= now
;
275 * Update the total_time_enabled and total_time_running fields for a counter.
277 static void update_counter_times(struct perf_counter
*counter
)
279 struct perf_counter_context
*ctx
= counter
->ctx
;
282 if (counter
->state
< PERF_COUNTER_STATE_INACTIVE
)
285 counter
->total_time_enabled
= ctx
->time
- counter
->tstamp_enabled
;
287 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
)
288 run_end
= counter
->tstamp_stopped
;
292 counter
->total_time_running
= run_end
- counter
->tstamp_running
;
296 * Update total_time_enabled and total_time_running for all counters in a group.
298 static void update_group_times(struct perf_counter
*leader
)
300 struct perf_counter
*counter
;
302 update_counter_times(leader
);
303 list_for_each_entry(counter
, &leader
->sibling_list
, list_entry
)
304 update_counter_times(counter
);
308 * Cross CPU call to disable a performance counter
310 static void __perf_counter_disable(void *info
)
312 struct perf_counter
*counter
= info
;
313 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
314 struct perf_counter_context
*ctx
= counter
->ctx
;
318 * If this is a per-task counter, need to check whether this
319 * counter's task is the current task on this cpu.
321 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
324 spin_lock_irqsave(&ctx
->lock
, flags
);
327 * If the counter is on, turn it off.
328 * If it is in error state, leave it in error state.
330 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
) {
331 update_context_time(ctx
);
332 update_counter_times(counter
);
333 if (counter
== counter
->group_leader
)
334 group_sched_out(counter
, cpuctx
, ctx
);
336 counter_sched_out(counter
, cpuctx
, ctx
);
337 counter
->state
= PERF_COUNTER_STATE_OFF
;
340 spin_unlock_irqrestore(&ctx
->lock
, flags
);
346 static void perf_counter_disable(struct perf_counter
*counter
)
348 struct perf_counter_context
*ctx
= counter
->ctx
;
349 struct task_struct
*task
= ctx
->task
;
353 * Disable the counter on the cpu that it's on
355 smp_call_function_single(counter
->cpu
, __perf_counter_disable
,
361 task_oncpu_function_call(task
, __perf_counter_disable
, counter
);
363 spin_lock_irq(&ctx
->lock
);
365 * If the counter is still active, we need to retry the cross-call.
367 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
368 spin_unlock_irq(&ctx
->lock
);
373 * Since we have the lock this context can't be scheduled
374 * in, so we can change the state safely.
376 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
377 update_counter_times(counter
);
378 counter
->state
= PERF_COUNTER_STATE_OFF
;
381 spin_unlock_irq(&ctx
->lock
);
385 * Disable a counter and all its children.
387 static void perf_counter_disable_family(struct perf_counter
*counter
)
389 struct perf_counter
*child
;
391 perf_counter_disable(counter
);
394 * Lock the mutex to protect the list of children
396 mutex_lock(&counter
->mutex
);
397 list_for_each_entry(child
, &counter
->child_list
, child_list
)
398 perf_counter_disable(child
);
399 mutex_unlock(&counter
->mutex
);
403 counter_sched_in(struct perf_counter
*counter
,
404 struct perf_cpu_context
*cpuctx
,
405 struct perf_counter_context
*ctx
,
408 if (counter
->state
<= PERF_COUNTER_STATE_OFF
)
411 counter
->state
= PERF_COUNTER_STATE_ACTIVE
;
412 counter
->oncpu
= cpu
; /* TODO: put 'cpu' into cpuctx->cpu */
414 * The new state must be visible before we turn it on in the hardware:
418 if (counter
->hw_ops
->enable(counter
)) {
419 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
424 counter
->tstamp_running
+= ctx
->time
- counter
->tstamp_stopped
;
426 if (!is_software_counter(counter
))
427 cpuctx
->active_oncpu
++;
430 if (counter
->hw_event
.exclusive
)
431 cpuctx
->exclusive
= 1;
437 * Return 1 for a group consisting entirely of software counters,
438 * 0 if the group contains any hardware counters.
440 static int is_software_only_group(struct perf_counter
*leader
)
442 struct perf_counter
*counter
;
444 if (!is_software_counter(leader
))
447 list_for_each_entry(counter
, &leader
->sibling_list
, list_entry
)
448 if (!is_software_counter(counter
))
455 * Work out whether we can put this counter group on the CPU now.
457 static int group_can_go_on(struct perf_counter
*counter
,
458 struct perf_cpu_context
*cpuctx
,
462 * Groups consisting entirely of software counters can always go on.
464 if (is_software_only_group(counter
))
467 * If an exclusive group is already on, no other hardware
468 * counters can go on.
470 if (cpuctx
->exclusive
)
473 * If this group is exclusive and there are already
474 * counters on the CPU, it can't go on.
476 if (counter
->hw_event
.exclusive
&& cpuctx
->active_oncpu
)
479 * Otherwise, try to add it if all previous groups were able
485 static void add_counter_to_ctx(struct perf_counter
*counter
,
486 struct perf_counter_context
*ctx
)
488 list_add_counter(counter
, ctx
);
490 counter
->prev_state
= PERF_COUNTER_STATE_OFF
;
491 counter
->tstamp_enabled
= ctx
->time
;
492 counter
->tstamp_running
= ctx
->time
;
493 counter
->tstamp_stopped
= ctx
->time
;
497 * Cross CPU call to install and enable a performance counter
499 static void __perf_install_in_context(void *info
)
501 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
502 struct perf_counter
*counter
= info
;
503 struct perf_counter_context
*ctx
= counter
->ctx
;
504 struct perf_counter
*leader
= counter
->group_leader
;
505 int cpu
= smp_processor_id();
511 * If this is a task context, we need to check whether it is
512 * the current task context of this cpu. If not it has been
513 * scheduled out before the smp call arrived.
515 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
518 spin_lock_irqsave(&ctx
->lock
, flags
);
519 update_context_time(ctx
);
522 * Protect the list operation against NMI by disabling the
523 * counters on a global level. NOP for non NMI based counters.
525 perf_flags
= hw_perf_save_disable();
527 add_counter_to_ctx(counter
, ctx
);
530 * Don't put the counter on if it is disabled or if
531 * it is in a group and the group isn't on.
533 if (counter
->state
!= PERF_COUNTER_STATE_INACTIVE
||
534 (leader
!= counter
&& leader
->state
!= PERF_COUNTER_STATE_ACTIVE
))
538 * An exclusive counter can't go on if there are already active
539 * hardware counters, and no hardware counter can go on if there
540 * is already an exclusive counter on.
542 if (!group_can_go_on(counter
, cpuctx
, 1))
545 err
= counter_sched_in(counter
, cpuctx
, ctx
, cpu
);
549 * This counter couldn't go on. If it is in a group
550 * then we have to pull the whole group off.
551 * If the counter group is pinned then put it in error state.
553 if (leader
!= counter
)
554 group_sched_out(leader
, cpuctx
, ctx
);
555 if (leader
->hw_event
.pinned
) {
556 update_group_times(leader
);
557 leader
->state
= PERF_COUNTER_STATE_ERROR
;
561 if (!err
&& !ctx
->task
&& cpuctx
->max_pertask
)
562 cpuctx
->max_pertask
--;
565 hw_perf_restore(perf_flags
);
567 spin_unlock_irqrestore(&ctx
->lock
, flags
);
571 * Attach a performance counter to a context
573 * First we add the counter to the list with the hardware enable bit
574 * in counter->hw_config cleared.
576 * If the counter is attached to a task which is on a CPU we use a smp
577 * call to enable it in the task context. The task might have been
578 * scheduled away, but we check this in the smp call again.
580 * Must be called with ctx->mutex held.
583 perf_install_in_context(struct perf_counter_context
*ctx
,
584 struct perf_counter
*counter
,
587 struct task_struct
*task
= ctx
->task
;
591 * Per cpu counters are installed via an smp call and
592 * the install is always sucessful.
594 smp_call_function_single(cpu
, __perf_install_in_context
,
599 counter
->task
= task
;
601 task_oncpu_function_call(task
, __perf_install_in_context
,
604 spin_lock_irq(&ctx
->lock
);
606 * we need to retry the smp call.
608 if (ctx
->is_active
&& list_empty(&counter
->list_entry
)) {
609 spin_unlock_irq(&ctx
->lock
);
614 * The lock prevents that this context is scheduled in so we
615 * can add the counter safely, if it the call above did not
618 if (list_empty(&counter
->list_entry
))
619 add_counter_to_ctx(counter
, ctx
);
620 spin_unlock_irq(&ctx
->lock
);
624 * Cross CPU call to enable a performance counter
626 static void __perf_counter_enable(void *info
)
628 struct perf_counter
*counter
= info
;
629 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
630 struct perf_counter_context
*ctx
= counter
->ctx
;
631 struct perf_counter
*leader
= counter
->group_leader
;
636 * If this is a per-task counter, need to check whether this
637 * counter's task is the current task on this cpu.
639 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
642 spin_lock_irqsave(&ctx
->lock
, flags
);
643 update_context_time(ctx
);
645 counter
->prev_state
= counter
->state
;
646 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
)
648 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
649 counter
->tstamp_enabled
= ctx
->time
- counter
->total_time_enabled
;
652 * If the counter is in a group and isn't the group leader,
653 * then don't put it on unless the group is on.
655 if (leader
!= counter
&& leader
->state
!= PERF_COUNTER_STATE_ACTIVE
)
658 if (!group_can_go_on(counter
, cpuctx
, 1))
661 err
= counter_sched_in(counter
, cpuctx
, ctx
,
666 * If this counter can't go on and it's part of a
667 * group, then the whole group has to come off.
669 if (leader
!= counter
)
670 group_sched_out(leader
, cpuctx
, ctx
);
671 if (leader
->hw_event
.pinned
) {
672 update_group_times(leader
);
673 leader
->state
= PERF_COUNTER_STATE_ERROR
;
678 spin_unlock_irqrestore(&ctx
->lock
, flags
);
684 static void perf_counter_enable(struct perf_counter
*counter
)
686 struct perf_counter_context
*ctx
= counter
->ctx
;
687 struct task_struct
*task
= ctx
->task
;
691 * Enable the counter on the cpu that it's on
693 smp_call_function_single(counter
->cpu
, __perf_counter_enable
,
698 spin_lock_irq(&ctx
->lock
);
699 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
)
703 * If the counter is in error state, clear that first.
704 * That way, if we see the counter in error state below, we
705 * know that it has gone back into error state, as distinct
706 * from the task having been scheduled away before the
707 * cross-call arrived.
709 if (counter
->state
== PERF_COUNTER_STATE_ERROR
)
710 counter
->state
= PERF_COUNTER_STATE_OFF
;
713 spin_unlock_irq(&ctx
->lock
);
714 task_oncpu_function_call(task
, __perf_counter_enable
, counter
);
716 spin_lock_irq(&ctx
->lock
);
719 * If the context is active and the counter is still off,
720 * we need to retry the cross-call.
722 if (ctx
->is_active
&& counter
->state
== PERF_COUNTER_STATE_OFF
)
726 * Since we have the lock this context can't be scheduled
727 * in, so we can change the state safely.
729 if (counter
->state
== PERF_COUNTER_STATE_OFF
) {
730 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
731 counter
->tstamp_enabled
=
732 ctx
->time
- counter
->total_time_enabled
;
735 spin_unlock_irq(&ctx
->lock
);
738 static void perf_counter_refresh(struct perf_counter
*counter
, int refresh
)
740 atomic_add(refresh
, &counter
->event_limit
);
741 perf_counter_enable(counter
);
745 * Enable a counter and all its children.
747 static void perf_counter_enable_family(struct perf_counter
*counter
)
749 struct perf_counter
*child
;
751 perf_counter_enable(counter
);
754 * Lock the mutex to protect the list of children
756 mutex_lock(&counter
->mutex
);
757 list_for_each_entry(child
, &counter
->child_list
, child_list
)
758 perf_counter_enable(child
);
759 mutex_unlock(&counter
->mutex
);
762 void __perf_counter_sched_out(struct perf_counter_context
*ctx
,
763 struct perf_cpu_context
*cpuctx
)
765 struct perf_counter
*counter
;
768 spin_lock(&ctx
->lock
);
770 if (likely(!ctx
->nr_counters
))
772 update_context_time(ctx
);
774 flags
= hw_perf_save_disable();
775 if (ctx
->nr_active
) {
776 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
)
777 group_sched_out(counter
, cpuctx
, ctx
);
779 hw_perf_restore(flags
);
781 spin_unlock(&ctx
->lock
);
785 * Called from scheduler to remove the counters of the current task,
786 * with interrupts disabled.
788 * We stop each counter and update the counter value in counter->count.
790 * This does not protect us against NMI, but disable()
791 * sets the disabled bit in the control field of counter _before_
792 * accessing the counter control register. If a NMI hits, then it will
793 * not restart the counter.
795 void perf_counter_task_sched_out(struct task_struct
*task
, int cpu
)
797 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
798 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
799 struct pt_regs
*regs
;
801 if (likely(!cpuctx
->task_ctx
))
804 update_context_time(ctx
);
806 regs
= task_pt_regs(task
);
807 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES
, 1, 1, regs
, 0);
808 __perf_counter_sched_out(ctx
, cpuctx
);
810 cpuctx
->task_ctx
= NULL
;
813 static void perf_counter_cpu_sched_out(struct perf_cpu_context
*cpuctx
)
815 __perf_counter_sched_out(&cpuctx
->ctx
, cpuctx
);
819 group_sched_in(struct perf_counter
*group_counter
,
820 struct perf_cpu_context
*cpuctx
,
821 struct perf_counter_context
*ctx
,
824 struct perf_counter
*counter
, *partial_group
;
827 if (group_counter
->state
== PERF_COUNTER_STATE_OFF
)
830 ret
= hw_perf_group_sched_in(group_counter
, cpuctx
, ctx
, cpu
);
832 return ret
< 0 ? ret
: 0;
834 group_counter
->prev_state
= group_counter
->state
;
835 if (counter_sched_in(group_counter
, cpuctx
, ctx
, cpu
))
839 * Schedule in siblings as one group (if any):
841 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
) {
842 counter
->prev_state
= counter
->state
;
843 if (counter_sched_in(counter
, cpuctx
, ctx
, cpu
)) {
844 partial_group
= counter
;
853 * Groups can be scheduled in as one unit only, so undo any
854 * partial group before returning:
856 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
) {
857 if (counter
== partial_group
)
859 counter_sched_out(counter
, cpuctx
, ctx
);
861 counter_sched_out(group_counter
, cpuctx
, ctx
);
867 __perf_counter_sched_in(struct perf_counter_context
*ctx
,
868 struct perf_cpu_context
*cpuctx
, int cpu
)
870 struct perf_counter
*counter
;
874 spin_lock(&ctx
->lock
);
876 if (likely(!ctx
->nr_counters
))
879 ctx
->timestamp
= perf_clock();
881 flags
= hw_perf_save_disable();
884 * First go through the list and put on any pinned groups
885 * in order to give them the best chance of going on.
887 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
888 if (counter
->state
<= PERF_COUNTER_STATE_OFF
||
889 !counter
->hw_event
.pinned
)
891 if (counter
->cpu
!= -1 && counter
->cpu
!= cpu
)
894 if (group_can_go_on(counter
, cpuctx
, 1))
895 group_sched_in(counter
, cpuctx
, ctx
, cpu
);
898 * If this pinned group hasn't been scheduled,
899 * put it in error state.
901 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
902 update_group_times(counter
);
903 counter
->state
= PERF_COUNTER_STATE_ERROR
;
907 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
909 * Ignore counters in OFF or ERROR state, and
910 * ignore pinned counters since we did them already.
912 if (counter
->state
<= PERF_COUNTER_STATE_OFF
||
913 counter
->hw_event
.pinned
)
917 * Listen to the 'cpu' scheduling filter constraint
920 if (counter
->cpu
!= -1 && counter
->cpu
!= cpu
)
923 if (group_can_go_on(counter
, cpuctx
, can_add_hw
)) {
924 if (group_sched_in(counter
, cpuctx
, ctx
, cpu
))
928 hw_perf_restore(flags
);
930 spin_unlock(&ctx
->lock
);
934 * Called from scheduler to add the counters of the current task
935 * with interrupts disabled.
937 * We restore the counter value and then enable it.
939 * This does not protect us against NMI, but enable()
940 * sets the enabled bit in the control field of counter _before_
941 * accessing the counter control register. If a NMI hits, then it will
942 * keep the counter running.
944 void perf_counter_task_sched_in(struct task_struct
*task
, int cpu
)
946 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
947 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
949 __perf_counter_sched_in(ctx
, cpuctx
, cpu
);
950 cpuctx
->task_ctx
= ctx
;
953 static void perf_counter_cpu_sched_in(struct perf_cpu_context
*cpuctx
, int cpu
)
955 struct perf_counter_context
*ctx
= &cpuctx
->ctx
;
957 __perf_counter_sched_in(ctx
, cpuctx
, cpu
);
960 int perf_counter_task_disable(void)
962 struct task_struct
*curr
= current
;
963 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
964 struct perf_counter
*counter
;
969 if (likely(!ctx
->nr_counters
))
972 local_irq_save(flags
);
973 cpu
= smp_processor_id();
975 perf_counter_task_sched_out(curr
, cpu
);
977 spin_lock(&ctx
->lock
);
980 * Disable all the counters:
982 perf_flags
= hw_perf_save_disable();
984 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
985 if (counter
->state
!= PERF_COUNTER_STATE_ERROR
) {
986 update_group_times(counter
);
987 counter
->state
= PERF_COUNTER_STATE_OFF
;
991 hw_perf_restore(perf_flags
);
993 spin_unlock_irqrestore(&ctx
->lock
, flags
);
998 int perf_counter_task_enable(void)
1000 struct task_struct
*curr
= current
;
1001 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
1002 struct perf_counter
*counter
;
1003 unsigned long flags
;
1007 if (likely(!ctx
->nr_counters
))
1010 local_irq_save(flags
);
1011 cpu
= smp_processor_id();
1013 perf_counter_task_sched_out(curr
, cpu
);
1015 spin_lock(&ctx
->lock
);
1018 * Disable all the counters:
1020 perf_flags
= hw_perf_save_disable();
1022 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
1023 if (counter
->state
> PERF_COUNTER_STATE_OFF
)
1025 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
1026 counter
->tstamp_enabled
=
1027 ctx
->time
- counter
->total_time_enabled
;
1028 counter
->hw_event
.disabled
= 0;
1030 hw_perf_restore(perf_flags
);
1032 spin_unlock(&ctx
->lock
);
1034 perf_counter_task_sched_in(curr
, cpu
);
1036 local_irq_restore(flags
);
1042 * Round-robin a context's counters:
1044 static void rotate_ctx(struct perf_counter_context
*ctx
)
1046 struct perf_counter
*counter
;
1049 if (!ctx
->nr_counters
)
1052 spin_lock(&ctx
->lock
);
1054 * Rotate the first entry last (works just fine for group counters too):
1056 perf_flags
= hw_perf_save_disable();
1057 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
1058 list_move_tail(&counter
->list_entry
, &ctx
->counter_list
);
1061 hw_perf_restore(perf_flags
);
1063 spin_unlock(&ctx
->lock
);
1066 void perf_counter_task_tick(struct task_struct
*curr
, int cpu
)
1068 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1069 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
1070 const int rotate_percpu
= 0;
1073 perf_counter_cpu_sched_out(cpuctx
);
1074 perf_counter_task_sched_out(curr
, cpu
);
1077 rotate_ctx(&cpuctx
->ctx
);
1081 perf_counter_cpu_sched_in(cpuctx
, cpu
);
1082 perf_counter_task_sched_in(curr
, cpu
);
1086 * Cross CPU call to read the hardware counter
1088 static void __read(void *info
)
1090 struct perf_counter
*counter
= info
;
1091 struct perf_counter_context
*ctx
= counter
->ctx
;
1092 unsigned long flags
;
1094 local_irq_save(flags
);
1096 update_context_time(ctx
);
1097 counter
->hw_ops
->read(counter
);
1098 update_counter_times(counter
);
1099 local_irq_restore(flags
);
1102 static u64
perf_counter_read(struct perf_counter
*counter
)
1105 * If counter is enabled and currently active on a CPU, update the
1106 * value in the counter structure:
1108 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
1109 smp_call_function_single(counter
->oncpu
,
1110 __read
, counter
, 1);
1111 } else if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
1112 update_counter_times(counter
);
1115 return atomic64_read(&counter
->count
);
1118 static void put_context(struct perf_counter_context
*ctx
)
1121 put_task_struct(ctx
->task
);
1124 static struct perf_counter_context
*find_get_context(pid_t pid
, int cpu
)
1126 struct perf_cpu_context
*cpuctx
;
1127 struct perf_counter_context
*ctx
;
1128 struct task_struct
*task
;
1131 * If cpu is not a wildcard then this is a percpu counter:
1134 /* Must be root to operate on a CPU counter: */
1135 if (!capable(CAP_SYS_ADMIN
))
1136 return ERR_PTR(-EACCES
);
1138 if (cpu
< 0 || cpu
> num_possible_cpus())
1139 return ERR_PTR(-EINVAL
);
1142 * We could be clever and allow to attach a counter to an
1143 * offline CPU and activate it when the CPU comes up, but
1146 if (!cpu_isset(cpu
, cpu_online_map
))
1147 return ERR_PTR(-ENODEV
);
1149 cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1159 task
= find_task_by_vpid(pid
);
1161 get_task_struct(task
);
1165 return ERR_PTR(-ESRCH
);
1167 ctx
= &task
->perf_counter_ctx
;
1170 /* Reuse ptrace permission checks for now. */
1171 if (!ptrace_may_access(task
, PTRACE_MODE_READ
)) {
1173 return ERR_PTR(-EACCES
);
1179 static void free_counter_rcu(struct rcu_head
*head
)
1181 struct perf_counter
*counter
;
1183 counter
= container_of(head
, struct perf_counter
, rcu_head
);
1187 static void perf_pending_sync(struct perf_counter
*counter
);
1189 static void free_counter(struct perf_counter
*counter
)
1191 perf_pending_sync(counter
);
1193 if (counter
->hw_event
.mmap
)
1194 atomic_dec(&nr_mmap_tracking
);
1195 if (counter
->hw_event
.munmap
)
1196 atomic_dec(&nr_munmap_tracking
);
1197 if (counter
->hw_event
.comm
)
1198 atomic_dec(&nr_comm_tracking
);
1200 if (counter
->destroy
)
1201 counter
->destroy(counter
);
1203 call_rcu(&counter
->rcu_head
, free_counter_rcu
);
1207 * Called when the last reference to the file is gone.
1209 static int perf_release(struct inode
*inode
, struct file
*file
)
1211 struct perf_counter
*counter
= file
->private_data
;
1212 struct perf_counter_context
*ctx
= counter
->ctx
;
1214 file
->private_data
= NULL
;
1216 mutex_lock(&ctx
->mutex
);
1217 mutex_lock(&counter
->mutex
);
1219 perf_counter_remove_from_context(counter
);
1221 mutex_unlock(&counter
->mutex
);
1222 mutex_unlock(&ctx
->mutex
);
1224 free_counter(counter
);
1231 * Read the performance counter - simple non blocking version for now
1234 perf_read_hw(struct perf_counter
*counter
, char __user
*buf
, size_t count
)
1240 * Return end-of-file for a read on a counter that is in
1241 * error state (i.e. because it was pinned but it couldn't be
1242 * scheduled on to the CPU at some point).
1244 if (counter
->state
== PERF_COUNTER_STATE_ERROR
)
1247 mutex_lock(&counter
->mutex
);
1248 values
[0] = perf_counter_read(counter
);
1250 if (counter
->hw_event
.read_format
& PERF_FORMAT_TOTAL_TIME_ENABLED
)
1251 values
[n
++] = counter
->total_time_enabled
+
1252 atomic64_read(&counter
->child_total_time_enabled
);
1253 if (counter
->hw_event
.read_format
& PERF_FORMAT_TOTAL_TIME_RUNNING
)
1254 values
[n
++] = counter
->total_time_running
+
1255 atomic64_read(&counter
->child_total_time_running
);
1256 mutex_unlock(&counter
->mutex
);
1258 if (count
< n
* sizeof(u64
))
1260 count
= n
* sizeof(u64
);
1262 if (copy_to_user(buf
, values
, count
))
1269 perf_read(struct file
*file
, char __user
*buf
, size_t count
, loff_t
*ppos
)
1271 struct perf_counter
*counter
= file
->private_data
;
1273 return perf_read_hw(counter
, buf
, count
);
1276 static unsigned int perf_poll(struct file
*file
, poll_table
*wait
)
1278 struct perf_counter
*counter
= file
->private_data
;
1279 struct perf_mmap_data
*data
;
1280 unsigned int events
;
1283 data
= rcu_dereference(counter
->data
);
1285 events
= atomic_xchg(&data
->wakeup
, 0);
1290 poll_wait(file
, &counter
->waitq
, wait
);
1295 static long perf_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
1297 struct perf_counter
*counter
= file
->private_data
;
1301 case PERF_COUNTER_IOC_ENABLE
:
1302 perf_counter_enable_family(counter
);
1304 case PERF_COUNTER_IOC_DISABLE
:
1305 perf_counter_disable_family(counter
);
1307 case PERF_COUNTER_IOC_REFRESH
:
1308 perf_counter_refresh(counter
, arg
);
1317 * Callers need to ensure there can be no nesting of this function, otherwise
1318 * the seqlock logic goes bad. We can not serialize this because the arch
1319 * code calls this from NMI context.
1321 void perf_counter_update_userpage(struct perf_counter
*counter
)
1323 struct perf_mmap_data
*data
;
1324 struct perf_counter_mmap_page
*userpg
;
1327 data
= rcu_dereference(counter
->data
);
1331 userpg
= data
->user_page
;
1334 * Disable preemption so as to not let the corresponding user-space
1335 * spin too long if we get preempted.
1340 userpg
->index
= counter
->hw
.idx
;
1341 userpg
->offset
= atomic64_read(&counter
->count
);
1342 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
)
1343 userpg
->offset
-= atomic64_read(&counter
->hw
.prev_count
);
1352 static int perf_mmap_fault(struct vm_area_struct
*vma
, struct vm_fault
*vmf
)
1354 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1355 struct perf_mmap_data
*data
;
1356 int ret
= VM_FAULT_SIGBUS
;
1359 data
= rcu_dereference(counter
->data
);
1363 if (vmf
->pgoff
== 0) {
1364 vmf
->page
= virt_to_page(data
->user_page
);
1366 int nr
= vmf
->pgoff
- 1;
1368 if ((unsigned)nr
> data
->nr_pages
)
1371 vmf
->page
= virt_to_page(data
->data_pages
[nr
]);
1373 get_page(vmf
->page
);
1381 static int perf_mmap_data_alloc(struct perf_counter
*counter
, int nr_pages
)
1383 struct perf_mmap_data
*data
;
1387 WARN_ON(atomic_read(&counter
->mmap_count
));
1389 size
= sizeof(struct perf_mmap_data
);
1390 size
+= nr_pages
* sizeof(void *);
1392 data
= kzalloc(size
, GFP_KERNEL
);
1396 data
->user_page
= (void *)get_zeroed_page(GFP_KERNEL
);
1397 if (!data
->user_page
)
1398 goto fail_user_page
;
1400 for (i
= 0; i
< nr_pages
; i
++) {
1401 data
->data_pages
[i
] = (void *)get_zeroed_page(GFP_KERNEL
);
1402 if (!data
->data_pages
[i
])
1403 goto fail_data_pages
;
1406 data
->nr_pages
= nr_pages
;
1408 rcu_assign_pointer(counter
->data
, data
);
1413 for (i
--; i
>= 0; i
--)
1414 free_page((unsigned long)data
->data_pages
[i
]);
1416 free_page((unsigned long)data
->user_page
);
1425 static void __perf_mmap_data_free(struct rcu_head
*rcu_head
)
1427 struct perf_mmap_data
*data
= container_of(rcu_head
,
1428 struct perf_mmap_data
, rcu_head
);
1431 free_page((unsigned long)data
->user_page
);
1432 for (i
= 0; i
< data
->nr_pages
; i
++)
1433 free_page((unsigned long)data
->data_pages
[i
]);
1437 static void perf_mmap_data_free(struct perf_counter
*counter
)
1439 struct perf_mmap_data
*data
= counter
->data
;
1441 WARN_ON(atomic_read(&counter
->mmap_count
));
1443 rcu_assign_pointer(counter
->data
, NULL
);
1444 call_rcu(&data
->rcu_head
, __perf_mmap_data_free
);
1447 static void perf_mmap_open(struct vm_area_struct
*vma
)
1449 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1451 atomic_inc(&counter
->mmap_count
);
1454 static void perf_mmap_close(struct vm_area_struct
*vma
)
1456 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1458 if (atomic_dec_and_mutex_lock(&counter
->mmap_count
,
1459 &counter
->mmap_mutex
)) {
1460 vma
->vm_mm
->locked_vm
-= counter
->data
->nr_pages
+ 1;
1461 perf_mmap_data_free(counter
);
1462 mutex_unlock(&counter
->mmap_mutex
);
1466 static struct vm_operations_struct perf_mmap_vmops
= {
1467 .open
= perf_mmap_open
,
1468 .close
= perf_mmap_close
,
1469 .fault
= perf_mmap_fault
,
1472 static int perf_mmap(struct file
*file
, struct vm_area_struct
*vma
)
1474 struct perf_counter
*counter
= file
->private_data
;
1475 unsigned long vma_size
;
1476 unsigned long nr_pages
;
1477 unsigned long locked
, lock_limit
;
1480 if (!(vma
->vm_flags
& VM_SHARED
) || (vma
->vm_flags
& VM_WRITE
))
1483 vma_size
= vma
->vm_end
- vma
->vm_start
;
1484 nr_pages
= (vma_size
/ PAGE_SIZE
) - 1;
1487 * If we have data pages ensure they're a power-of-two number, so we
1488 * can do bitmasks instead of modulo.
1490 if (nr_pages
!= 0 && !is_power_of_2(nr_pages
))
1493 if (vma_size
!= PAGE_SIZE
* (1 + nr_pages
))
1496 if (vma
->vm_pgoff
!= 0)
1499 mutex_lock(&counter
->mmap_mutex
);
1500 if (atomic_inc_not_zero(&counter
->mmap_count
)) {
1501 if (nr_pages
!= counter
->data
->nr_pages
)
1506 locked
= vma
->vm_mm
->locked_vm
;
1507 locked
+= nr_pages
+ 1;
1509 lock_limit
= current
->signal
->rlim
[RLIMIT_MEMLOCK
].rlim_cur
;
1510 lock_limit
>>= PAGE_SHIFT
;
1512 if ((locked
> lock_limit
) && !capable(CAP_IPC_LOCK
)) {
1517 WARN_ON(counter
->data
);
1518 ret
= perf_mmap_data_alloc(counter
, nr_pages
);
1522 atomic_set(&counter
->mmap_count
, 1);
1523 vma
->vm_mm
->locked_vm
+= nr_pages
+ 1;
1525 mutex_unlock(&counter
->mmap_mutex
);
1527 vma
->vm_flags
&= ~VM_MAYWRITE
;
1528 vma
->vm_flags
|= VM_RESERVED
;
1529 vma
->vm_ops
= &perf_mmap_vmops
;
1534 static int perf_fasync(int fd
, struct file
*filp
, int on
)
1536 struct perf_counter
*counter
= filp
->private_data
;
1537 struct inode
*inode
= filp
->f_path
.dentry
->d_inode
;
1540 mutex_lock(&inode
->i_mutex
);
1541 retval
= fasync_helper(fd
, filp
, on
, &counter
->fasync
);
1542 mutex_unlock(&inode
->i_mutex
);
1550 static const struct file_operations perf_fops
= {
1551 .release
= perf_release
,
1554 .unlocked_ioctl
= perf_ioctl
,
1555 .compat_ioctl
= perf_ioctl
,
1557 .fasync
= perf_fasync
,
1561 * Perf counter wakeup
1563 * If there's data, ensure we set the poll() state and publish everything
1564 * to user-space before waking everybody up.
1567 void perf_counter_wakeup(struct perf_counter
*counter
)
1569 struct perf_mmap_data
*data
;
1572 data
= rcu_dereference(counter
->data
);
1574 atomic_set(&data
->wakeup
, POLL_IN
);
1576 * Ensure all data writes are issued before updating the
1577 * user-space data head information. The matching rmb()
1578 * will be in userspace after reading this value.
1581 data
->user_page
->data_head
= atomic_read(&data
->head
);
1585 wake_up_all(&counter
->waitq
);
1587 if (counter
->pending_kill
) {
1588 kill_fasync(&counter
->fasync
, SIGIO
, counter
->pending_kill
);
1589 counter
->pending_kill
= 0;
1596 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
1598 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1599 * single linked list and use cmpxchg() to add entries lockless.
1602 static void perf_pending_counter(struct perf_pending_entry
*entry
)
1604 struct perf_counter
*counter
= container_of(entry
,
1605 struct perf_counter
, pending
);
1607 if (counter
->pending_disable
) {
1608 counter
->pending_disable
= 0;
1609 perf_counter_disable(counter
);
1612 if (counter
->pending_wakeup
) {
1613 counter
->pending_wakeup
= 0;
1614 perf_counter_wakeup(counter
);
1618 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1620 static DEFINE_PER_CPU(struct perf_pending_entry
*, perf_pending_head
) = {
1624 static void perf_pending_queue(struct perf_pending_entry
*entry
,
1625 void (*func
)(struct perf_pending_entry
*))
1627 struct perf_pending_entry
**head
;
1629 if (cmpxchg(&entry
->next
, NULL
, PENDING_TAIL
) != NULL
)
1634 head
= &get_cpu_var(perf_pending_head
);
1637 entry
->next
= *head
;
1638 } while (cmpxchg(head
, entry
->next
, entry
) != entry
->next
);
1640 set_perf_counter_pending();
1642 put_cpu_var(perf_pending_head
);
1645 static int __perf_pending_run(void)
1647 struct perf_pending_entry
*list
;
1650 list
= xchg(&__get_cpu_var(perf_pending_head
), PENDING_TAIL
);
1651 while (list
!= PENDING_TAIL
) {
1652 void (*func
)(struct perf_pending_entry
*);
1653 struct perf_pending_entry
*entry
= list
;
1660 * Ensure we observe the unqueue before we issue the wakeup,
1661 * so that we won't be waiting forever.
1662 * -- see perf_not_pending().
1673 static inline int perf_not_pending(struct perf_counter
*counter
)
1676 * If we flush on whatever cpu we run, there is a chance we don't
1680 __perf_pending_run();
1684 * Ensure we see the proper queue state before going to sleep
1685 * so that we do not miss the wakeup. -- see perf_pending_handle()
1688 return counter
->pending
.next
== NULL
;
1691 static void perf_pending_sync(struct perf_counter
*counter
)
1693 wait_event(counter
->waitq
, perf_not_pending(counter
));
1696 void perf_counter_do_pending(void)
1698 __perf_pending_run();
1702 * Callchain support -- arch specific
1705 __weak
struct perf_callchain_entry
*perf_callchain(struct pt_regs
*regs
)
1714 struct perf_output_handle
{
1715 struct perf_counter
*counter
;
1716 struct perf_mmap_data
*data
;
1717 unsigned int offset
;
1724 static inline void __perf_output_wakeup(struct perf_output_handle
*handle
)
1727 handle
->counter
->pending_wakeup
= 1;
1728 perf_pending_queue(&handle
->counter
->pending
,
1729 perf_pending_counter
);
1731 perf_counter_wakeup(handle
->counter
);
1734 static int perf_output_begin(struct perf_output_handle
*handle
,
1735 struct perf_counter
*counter
, unsigned int size
,
1736 int nmi
, int overflow
)
1738 struct perf_mmap_data
*data
;
1739 unsigned int offset
, head
;
1742 data
= rcu_dereference(counter
->data
);
1746 handle
->counter
= counter
;
1748 handle
->overflow
= overflow
;
1750 if (!data
->nr_pages
)
1754 offset
= head
= atomic_read(&data
->head
);
1756 } while (atomic_cmpxchg(&data
->head
, offset
, head
) != offset
);
1758 handle
->data
= data
;
1759 handle
->offset
= offset
;
1760 handle
->head
= head
;
1761 handle
->wakeup
= (offset
>> PAGE_SHIFT
) != (head
>> PAGE_SHIFT
);
1766 __perf_output_wakeup(handle
);
1773 static void perf_output_copy(struct perf_output_handle
*handle
,
1774 void *buf
, unsigned int len
)
1776 unsigned int pages_mask
;
1777 unsigned int offset
;
1781 offset
= handle
->offset
;
1782 pages_mask
= handle
->data
->nr_pages
- 1;
1783 pages
= handle
->data
->data_pages
;
1786 unsigned int page_offset
;
1789 nr
= (offset
>> PAGE_SHIFT
) & pages_mask
;
1790 page_offset
= offset
& (PAGE_SIZE
- 1);
1791 size
= min_t(unsigned int, PAGE_SIZE
- page_offset
, len
);
1793 memcpy(pages
[nr
] + page_offset
, buf
, size
);
1800 handle
->offset
= offset
;
1802 WARN_ON_ONCE(handle
->offset
> handle
->head
);
1805 #define perf_output_put(handle, x) \
1806 perf_output_copy((handle), &(x), sizeof(x))
1808 static void perf_output_end(struct perf_output_handle
*handle
)
1810 int wakeup_events
= handle
->counter
->hw_event
.wakeup_events
;
1812 if (handle
->overflow
&& wakeup_events
) {
1813 int events
= atomic_inc_return(&handle
->data
->events
);
1814 if (events
>= wakeup_events
) {
1815 atomic_sub(wakeup_events
, &handle
->data
->events
);
1816 __perf_output_wakeup(handle
);
1818 } else if (handle
->wakeup
)
1819 __perf_output_wakeup(handle
);
1823 static void perf_counter_output(struct perf_counter
*counter
,
1824 int nmi
, struct pt_regs
*regs
, u64 addr
)
1827 u64 record_type
= counter
->hw_event
.record_type
;
1828 struct perf_output_handle handle
;
1829 struct perf_event_header header
;
1838 struct perf_callchain_entry
*callchain
= NULL
;
1839 int callchain_size
= 0;
1843 header
.size
= sizeof(header
);
1845 header
.misc
= PERF_EVENT_MISC_OVERFLOW
;
1846 header
.misc
|= user_mode(regs
) ?
1847 PERF_EVENT_MISC_USER
: PERF_EVENT_MISC_KERNEL
;
1849 if (record_type
& PERF_RECORD_IP
) {
1850 ip
= instruction_pointer(regs
);
1851 header
.type
|= PERF_RECORD_IP
;
1852 header
.size
+= sizeof(ip
);
1855 if (record_type
& PERF_RECORD_TID
) {
1856 /* namespace issues */
1857 tid_entry
.pid
= current
->group_leader
->pid
;
1858 tid_entry
.tid
= current
->pid
;
1860 header
.type
|= PERF_RECORD_TID
;
1861 header
.size
+= sizeof(tid_entry
);
1864 if (record_type
& PERF_RECORD_TIME
) {
1866 * Maybe do better on x86 and provide cpu_clock_nmi()
1868 time
= sched_clock();
1870 header
.type
|= PERF_RECORD_TIME
;
1871 header
.size
+= sizeof(u64
);
1874 if (record_type
& PERF_RECORD_ADDR
) {
1875 header
.type
|= PERF_RECORD_ADDR
;
1876 header
.size
+= sizeof(u64
);
1879 if (record_type
& PERF_RECORD_GROUP
) {
1880 header
.type
|= PERF_RECORD_GROUP
;
1881 header
.size
+= sizeof(u64
) +
1882 counter
->nr_siblings
* sizeof(group_entry
);
1885 if (record_type
& PERF_RECORD_CALLCHAIN
) {
1886 callchain
= perf_callchain(regs
);
1889 callchain_size
= (1 + callchain
->nr
) * sizeof(u64
);
1891 header
.type
|= PERF_RECORD_CALLCHAIN
;
1892 header
.size
+= callchain_size
;
1896 ret
= perf_output_begin(&handle
, counter
, header
.size
, nmi
, 1);
1900 perf_output_put(&handle
, header
);
1902 if (record_type
& PERF_RECORD_IP
)
1903 perf_output_put(&handle
, ip
);
1905 if (record_type
& PERF_RECORD_TID
)
1906 perf_output_put(&handle
, tid_entry
);
1908 if (record_type
& PERF_RECORD_TIME
)
1909 perf_output_put(&handle
, time
);
1911 if (record_type
& PERF_RECORD_ADDR
)
1912 perf_output_put(&handle
, addr
);
1914 if (record_type
& PERF_RECORD_GROUP
) {
1915 struct perf_counter
*leader
, *sub
;
1916 u64 nr
= counter
->nr_siblings
;
1918 perf_output_put(&handle
, nr
);
1920 leader
= counter
->group_leader
;
1921 list_for_each_entry(sub
, &leader
->sibling_list
, list_entry
) {
1923 sub
->hw_ops
->read(sub
);
1925 group_entry
.event
= sub
->hw_event
.config
;
1926 group_entry
.counter
= atomic64_read(&sub
->count
);
1928 perf_output_put(&handle
, group_entry
);
1933 perf_output_copy(&handle
, callchain
, callchain_size
);
1935 perf_output_end(&handle
);
1942 struct perf_comm_event
{
1943 struct task_struct
*task
;
1948 struct perf_event_header header
;
1955 static void perf_counter_comm_output(struct perf_counter
*counter
,
1956 struct perf_comm_event
*comm_event
)
1958 struct perf_output_handle handle
;
1959 int size
= comm_event
->event
.header
.size
;
1960 int ret
= perf_output_begin(&handle
, counter
, size
, 0, 0);
1965 perf_output_put(&handle
, comm_event
->event
);
1966 perf_output_copy(&handle
, comm_event
->comm
,
1967 comm_event
->comm_size
);
1968 perf_output_end(&handle
);
1971 static int perf_counter_comm_match(struct perf_counter
*counter
,
1972 struct perf_comm_event
*comm_event
)
1974 if (counter
->hw_event
.comm
&&
1975 comm_event
->event
.header
.type
== PERF_EVENT_COMM
)
1981 static void perf_counter_comm_ctx(struct perf_counter_context
*ctx
,
1982 struct perf_comm_event
*comm_event
)
1984 struct perf_counter
*counter
;
1986 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
1990 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
1991 if (perf_counter_comm_match(counter
, comm_event
))
1992 perf_counter_comm_output(counter
, comm_event
);
1997 static void perf_counter_comm_event(struct perf_comm_event
*comm_event
)
1999 struct perf_cpu_context
*cpuctx
;
2001 char *comm
= comm_event
->task
->comm
;
2003 size
= ALIGN(strlen(comm
)+1, sizeof(u64
));
2005 comm_event
->comm
= comm
;
2006 comm_event
->comm_size
= size
;
2008 comm_event
->event
.header
.size
= sizeof(comm_event
->event
) + size
;
2010 cpuctx
= &get_cpu_var(perf_cpu_context
);
2011 perf_counter_comm_ctx(&cpuctx
->ctx
, comm_event
);
2012 put_cpu_var(perf_cpu_context
);
2014 perf_counter_comm_ctx(¤t
->perf_counter_ctx
, comm_event
);
2017 void perf_counter_comm(struct task_struct
*task
)
2019 struct perf_comm_event comm_event
;
2021 if (!atomic_read(&nr_comm_tracking
))
2024 comm_event
= (struct perf_comm_event
){
2027 .header
= { .type
= PERF_EVENT_COMM
, },
2028 .pid
= task
->group_leader
->pid
,
2033 perf_counter_comm_event(&comm_event
);
2040 struct perf_mmap_event
{
2046 struct perf_event_header header
;
2056 static void perf_counter_mmap_output(struct perf_counter
*counter
,
2057 struct perf_mmap_event
*mmap_event
)
2059 struct perf_output_handle handle
;
2060 int size
= mmap_event
->event
.header
.size
;
2061 int ret
= perf_output_begin(&handle
, counter
, size
, 0, 0);
2066 perf_output_put(&handle
, mmap_event
->event
);
2067 perf_output_copy(&handle
, mmap_event
->file_name
,
2068 mmap_event
->file_size
);
2069 perf_output_end(&handle
);
2072 static int perf_counter_mmap_match(struct perf_counter
*counter
,
2073 struct perf_mmap_event
*mmap_event
)
2075 if (counter
->hw_event
.mmap
&&
2076 mmap_event
->event
.header
.type
== PERF_EVENT_MMAP
)
2079 if (counter
->hw_event
.munmap
&&
2080 mmap_event
->event
.header
.type
== PERF_EVENT_MUNMAP
)
2086 static void perf_counter_mmap_ctx(struct perf_counter_context
*ctx
,
2087 struct perf_mmap_event
*mmap_event
)
2089 struct perf_counter
*counter
;
2091 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
2095 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
2096 if (perf_counter_mmap_match(counter
, mmap_event
))
2097 perf_counter_mmap_output(counter
, mmap_event
);
2102 static void perf_counter_mmap_event(struct perf_mmap_event
*mmap_event
)
2104 struct perf_cpu_context
*cpuctx
;
2105 struct file
*file
= mmap_event
->file
;
2112 buf
= kzalloc(PATH_MAX
, GFP_KERNEL
);
2114 name
= strncpy(tmp
, "//enomem", sizeof(tmp
));
2117 name
= dentry_path(file
->f_dentry
, buf
, PATH_MAX
);
2119 name
= strncpy(tmp
, "//toolong", sizeof(tmp
));
2123 name
= strncpy(tmp
, "//anon", sizeof(tmp
));
2128 size
= ALIGN(strlen(name
)+1, sizeof(u64
));
2130 mmap_event
->file_name
= name
;
2131 mmap_event
->file_size
= size
;
2133 mmap_event
->event
.header
.size
= sizeof(mmap_event
->event
) + size
;
2135 cpuctx
= &get_cpu_var(perf_cpu_context
);
2136 perf_counter_mmap_ctx(&cpuctx
->ctx
, mmap_event
);
2137 put_cpu_var(perf_cpu_context
);
2139 perf_counter_mmap_ctx(¤t
->perf_counter_ctx
, mmap_event
);
2144 void perf_counter_mmap(unsigned long addr
, unsigned long len
,
2145 unsigned long pgoff
, struct file
*file
)
2147 struct perf_mmap_event mmap_event
;
2149 if (!atomic_read(&nr_mmap_tracking
))
2152 mmap_event
= (struct perf_mmap_event
){
2155 .header
= { .type
= PERF_EVENT_MMAP
, },
2156 .pid
= current
->group_leader
->pid
,
2157 .tid
= current
->pid
,
2164 perf_counter_mmap_event(&mmap_event
);
2167 void perf_counter_munmap(unsigned long addr
, unsigned long len
,
2168 unsigned long pgoff
, struct file
*file
)
2170 struct perf_mmap_event mmap_event
;
2172 if (!atomic_read(&nr_munmap_tracking
))
2175 mmap_event
= (struct perf_mmap_event
){
2178 .header
= { .type
= PERF_EVENT_MUNMAP
, },
2179 .pid
= current
->group_leader
->pid
,
2180 .tid
= current
->pid
,
2187 perf_counter_mmap_event(&mmap_event
);
2191 * Generic counter overflow handling.
2194 int perf_counter_overflow(struct perf_counter
*counter
,
2195 int nmi
, struct pt_regs
*regs
, u64 addr
)
2197 int events
= atomic_read(&counter
->event_limit
);
2200 counter
->pending_kill
= POLL_IN
;
2201 if (events
&& atomic_dec_and_test(&counter
->event_limit
)) {
2203 counter
->pending_kill
= POLL_HUP
;
2205 counter
->pending_disable
= 1;
2206 perf_pending_queue(&counter
->pending
,
2207 perf_pending_counter
);
2209 perf_counter_disable(counter
);
2212 perf_counter_output(counter
, nmi
, regs
, addr
);
2217 * Generic software counter infrastructure
2220 static void perf_swcounter_update(struct perf_counter
*counter
)
2222 struct hw_perf_counter
*hwc
= &counter
->hw
;
2227 prev
= atomic64_read(&hwc
->prev_count
);
2228 now
= atomic64_read(&hwc
->count
);
2229 if (atomic64_cmpxchg(&hwc
->prev_count
, prev
, now
) != prev
)
2234 atomic64_add(delta
, &counter
->count
);
2235 atomic64_sub(delta
, &hwc
->period_left
);
2238 static void perf_swcounter_set_period(struct perf_counter
*counter
)
2240 struct hw_perf_counter
*hwc
= &counter
->hw
;
2241 s64 left
= atomic64_read(&hwc
->period_left
);
2242 s64 period
= hwc
->irq_period
;
2244 if (unlikely(left
<= -period
)) {
2246 atomic64_set(&hwc
->period_left
, left
);
2249 if (unlikely(left
<= 0)) {
2251 atomic64_add(period
, &hwc
->period_left
);
2254 atomic64_set(&hwc
->prev_count
, -left
);
2255 atomic64_set(&hwc
->count
, -left
);
2258 static enum hrtimer_restart
perf_swcounter_hrtimer(struct hrtimer
*hrtimer
)
2260 enum hrtimer_restart ret
= HRTIMER_RESTART
;
2261 struct perf_counter
*counter
;
2262 struct pt_regs
*regs
;
2264 counter
= container_of(hrtimer
, struct perf_counter
, hw
.hrtimer
);
2265 counter
->hw_ops
->read(counter
);
2267 regs
= get_irq_regs();
2269 * In case we exclude kernel IPs or are somehow not in interrupt
2270 * context, provide the next best thing, the user IP.
2272 if ((counter
->hw_event
.exclude_kernel
|| !regs
) &&
2273 !counter
->hw_event
.exclude_user
)
2274 regs
= task_pt_regs(current
);
2277 if (perf_counter_overflow(counter
, 0, regs
, 0))
2278 ret
= HRTIMER_NORESTART
;
2281 hrtimer_forward_now(hrtimer
, ns_to_ktime(counter
->hw
.irq_period
));
2286 static void perf_swcounter_overflow(struct perf_counter
*counter
,
2287 int nmi
, struct pt_regs
*regs
, u64 addr
)
2289 perf_swcounter_update(counter
);
2290 perf_swcounter_set_period(counter
);
2291 if (perf_counter_overflow(counter
, nmi
, regs
, addr
))
2292 /* soft-disable the counter */
2297 static int perf_swcounter_match(struct perf_counter
*counter
,
2298 enum perf_event_types type
,
2299 u32 event
, struct pt_regs
*regs
)
2301 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
2304 if (perf_event_raw(&counter
->hw_event
))
2307 if (perf_event_type(&counter
->hw_event
) != type
)
2310 if (perf_event_id(&counter
->hw_event
) != event
)
2313 if (counter
->hw_event
.exclude_user
&& user_mode(regs
))
2316 if (counter
->hw_event
.exclude_kernel
&& !user_mode(regs
))
2322 static void perf_swcounter_add(struct perf_counter
*counter
, u64 nr
,
2323 int nmi
, struct pt_regs
*regs
, u64 addr
)
2325 int neg
= atomic64_add_negative(nr
, &counter
->hw
.count
);
2326 if (counter
->hw
.irq_period
&& !neg
)
2327 perf_swcounter_overflow(counter
, nmi
, regs
, addr
);
2330 static void perf_swcounter_ctx_event(struct perf_counter_context
*ctx
,
2331 enum perf_event_types type
, u32 event
,
2332 u64 nr
, int nmi
, struct pt_regs
*regs
,
2335 struct perf_counter
*counter
;
2337 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
2341 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
2342 if (perf_swcounter_match(counter
, type
, event
, regs
))
2343 perf_swcounter_add(counter
, nr
, nmi
, regs
, addr
);
2348 static int *perf_swcounter_recursion_context(struct perf_cpu_context
*cpuctx
)
2351 return &cpuctx
->recursion
[3];
2354 return &cpuctx
->recursion
[2];
2357 return &cpuctx
->recursion
[1];
2359 return &cpuctx
->recursion
[0];
2362 static void __perf_swcounter_event(enum perf_event_types type
, u32 event
,
2363 u64 nr
, int nmi
, struct pt_regs
*regs
,
2366 struct perf_cpu_context
*cpuctx
= &get_cpu_var(perf_cpu_context
);
2367 int *recursion
= perf_swcounter_recursion_context(cpuctx
);
2375 perf_swcounter_ctx_event(&cpuctx
->ctx
, type
, event
,
2376 nr
, nmi
, regs
, addr
);
2377 if (cpuctx
->task_ctx
) {
2378 perf_swcounter_ctx_event(cpuctx
->task_ctx
, type
, event
,
2379 nr
, nmi
, regs
, addr
);
2386 put_cpu_var(perf_cpu_context
);
2390 perf_swcounter_event(u32 event
, u64 nr
, int nmi
, struct pt_regs
*regs
, u64 addr
)
2392 __perf_swcounter_event(PERF_TYPE_SOFTWARE
, event
, nr
, nmi
, regs
, addr
);
2395 static void perf_swcounter_read(struct perf_counter
*counter
)
2397 perf_swcounter_update(counter
);
2400 static int perf_swcounter_enable(struct perf_counter
*counter
)
2402 perf_swcounter_set_period(counter
);
2406 static void perf_swcounter_disable(struct perf_counter
*counter
)
2408 perf_swcounter_update(counter
);
2411 static const struct hw_perf_counter_ops perf_ops_generic
= {
2412 .enable
= perf_swcounter_enable
,
2413 .disable
= perf_swcounter_disable
,
2414 .read
= perf_swcounter_read
,
2418 * Software counter: cpu wall time clock
2421 static void cpu_clock_perf_counter_update(struct perf_counter
*counter
)
2423 int cpu
= raw_smp_processor_id();
2427 now
= cpu_clock(cpu
);
2428 prev
= atomic64_read(&counter
->hw
.prev_count
);
2429 atomic64_set(&counter
->hw
.prev_count
, now
);
2430 atomic64_add(now
- prev
, &counter
->count
);
2433 static int cpu_clock_perf_counter_enable(struct perf_counter
*counter
)
2435 struct hw_perf_counter
*hwc
= &counter
->hw
;
2436 int cpu
= raw_smp_processor_id();
2438 atomic64_set(&hwc
->prev_count
, cpu_clock(cpu
));
2439 hrtimer_init(&hwc
->hrtimer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
2440 hwc
->hrtimer
.function
= perf_swcounter_hrtimer
;
2441 if (hwc
->irq_period
) {
2442 __hrtimer_start_range_ns(&hwc
->hrtimer
,
2443 ns_to_ktime(hwc
->irq_period
), 0,
2444 HRTIMER_MODE_REL
, 0);
2450 static void cpu_clock_perf_counter_disable(struct perf_counter
*counter
)
2452 hrtimer_cancel(&counter
->hw
.hrtimer
);
2453 cpu_clock_perf_counter_update(counter
);
2456 static void cpu_clock_perf_counter_read(struct perf_counter
*counter
)
2458 cpu_clock_perf_counter_update(counter
);
2461 static const struct hw_perf_counter_ops perf_ops_cpu_clock
= {
2462 .enable
= cpu_clock_perf_counter_enable
,
2463 .disable
= cpu_clock_perf_counter_disable
,
2464 .read
= cpu_clock_perf_counter_read
,
2468 * Software counter: task time clock
2471 static void task_clock_perf_counter_update(struct perf_counter
*counter
, u64 now
)
2476 prev
= atomic64_xchg(&counter
->hw
.prev_count
, now
);
2478 atomic64_add(delta
, &counter
->count
);
2481 static int task_clock_perf_counter_enable(struct perf_counter
*counter
)
2483 struct hw_perf_counter
*hwc
= &counter
->hw
;
2486 now
= counter
->ctx
->time
;
2488 atomic64_set(&hwc
->prev_count
, now
);
2489 hrtimer_init(&hwc
->hrtimer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
2490 hwc
->hrtimer
.function
= perf_swcounter_hrtimer
;
2491 if (hwc
->irq_period
) {
2492 __hrtimer_start_range_ns(&hwc
->hrtimer
,
2493 ns_to_ktime(hwc
->irq_period
), 0,
2494 HRTIMER_MODE_REL
, 0);
2500 static void task_clock_perf_counter_disable(struct perf_counter
*counter
)
2502 hrtimer_cancel(&counter
->hw
.hrtimer
);
2503 task_clock_perf_counter_update(counter
, counter
->ctx
->time
);
2507 static void task_clock_perf_counter_read(struct perf_counter
*counter
)
2512 update_context_time(counter
->ctx
);
2513 time
= counter
->ctx
->time
;
2515 u64 now
= perf_clock();
2516 u64 delta
= now
- counter
->ctx
->timestamp
;
2517 time
= counter
->ctx
->time
+ delta
;
2520 task_clock_perf_counter_update(counter
, time
);
2523 static const struct hw_perf_counter_ops perf_ops_task_clock
= {
2524 .enable
= task_clock_perf_counter_enable
,
2525 .disable
= task_clock_perf_counter_disable
,
2526 .read
= task_clock_perf_counter_read
,
2530 * Software counter: cpu migrations
static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
	struct task_struct *curr = counter->ctx->task;

	if (curr)
		return curr->se.nr_migrations;
	return cpu_nr_migrations(smp_processor_id());
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations(counter);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
		atomic64_set(&counter->hw.prev_count,
			     get_cpu_migrations(counter));
	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};

#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
}

extern int ftrace_profile_enable(int);
extern void ftrace_profile_disable(int);

static void tp_perf_counter_destroy(struct perf_counter *counter)
{
	ftrace_profile_disable(perf_event_id(&counter->hw_event));
}

static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
	int event_id = perf_event_id(&counter->hw_event);
	int ret;

	ret = ftrace_profile_enable(event_id);
	if (ret)
		return NULL;

	counter->destroy = tp_perf_counter_destroy;
	counter->hw.irq_period = counter->hw_event.irq_period;

	return &perf_ops_generic;
}
#else
static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}
#endif

static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	const struct hw_perf_counter_ops *hw_ops = NULL;
	struct hw_perf_counter *hwc = &counter->hw;

	/*
	 * Software counters (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (perf_event_id(&counter->hw_event)) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu counter,
		 * use the cpu_clock counter instead.
		 */
		if (counter->ctx->task)
			hw_ops = &perf_ops_task_clock;
		else
			hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_PAGE_FAULTS:
	case PERF_COUNT_PAGE_FAULTS_MIN:
	case PERF_COUNT_PAGE_FAULTS_MAJ:
	case PERF_COUNT_CONTEXT_SWITCHES:
		hw_ops = &perf_ops_generic;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		if (!counter->hw_event.exclude_kernel)
			hw_ops = &perf_ops_cpu_migrations;
		break;
	}

	if (hw_ops)
		hwc->irq_period = hw_event->irq_period;

	return hw_ops;
}

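/*
 * Example of the mapping above (the scenario is illustrative, the values
 * come from the switch itself): a PERF_COUNT_TASK_CLOCK counter opened on
 * a task context (counter->ctx->task != NULL) is backed by
 * perf_ops_task_clock, the same request bound to a CPU falls back to
 * perf_ops_cpu_clock, and an irq_period of 5000 would be raised to the
 * 10000 minimum before being copied into hwc->irq_period.
 */
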
/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter_context *ctx,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;
	long err;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->event_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	mutex_init(&counter->mmap_mutex);

	INIT_LIST_HEAD(&counter->child_list);

	counter->cpu			= cpu;
	counter->hw_event		= *hw_event;
	counter->group_leader		= group_leader;
	counter->hw_ops			= NULL;
	counter->ctx			= ctx;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_ops = NULL;

	if (perf_event_raw(hw_event)) {
		hw_ops = hw_perf_counter_init(counter);
		goto done;
	}

	switch (perf_event_type(hw_event)) {
	case PERF_TYPE_HARDWARE:
		hw_ops = hw_perf_counter_init(counter);
		break;

	case PERF_TYPE_SOFTWARE:
		hw_ops = sw_perf_counter_init(counter);
		break;

	case PERF_TYPE_TRACEPOINT:
		hw_ops = tp_perf_counter_init(counter);
		break;
	}
done:
	err = 0;
	if (!hw_ops)
		err = -EINVAL;
	else if (IS_ERR(hw_ops))
		err = PTR_ERR(hw_ops);

	if (err) {
		kfree(counter);
		return ERR_PTR(err);
	}

	counter->hw_ops = hw_ops;

	if (counter->hw_event.mmap)
		atomic_inc(&nr_mmap_tracking);
	if (counter->hw_event.munmap)
		atomic_inc(&nr_munmap_tracking);
	if (counter->hw_event.comm)
		atomic_inc(&nr_comm_tracking);

	return counter;
}

/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
SYSCALL_DEFINE5(perf_counter_open,
		const struct perf_counter_hw_event __user *, hw_event_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	/* for future expandability... */
	if (flags)
		return -EINVAL;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
				     GFP_KERNEL);
	ret = PTR_ERR(counter);
	if (IS_ERR(counter))
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	mutex_unlock(&ctx->mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

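/*
 * Illustrative userspace sketch (not part of this file): a minimal
 * consumer of the syscall above, assuming the architecture wires it up
 * as __NR_perf_counter_open and that read() on the returned fd yields
 * the 64-bit counter value under the default read format - both are
 * assumptions here, not guarantees made by this code:
 *
 *	struct perf_counter_hw_event hw_event;
 *	u64 count;
 *	int fd;
 *
 *	memset(&hw_event, 0, sizeof(hw_event));
 *	... describe e.g. the task-clock software counter in hw_event ...
 *
 *	fd = syscall(__NR_perf_counter_open, &hw_event,
 *		     0, -1, -1, 0);   ... pid 0 = current task, any cpu,
 *					  no group leader, no flags ...
 *	... run the workload ...
 *	read(fd, &count, sizeof(count));
 *	close(fd);
 */
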
/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->counter_list);
	INIT_LIST_HEAD(&ctx->event_list);
	ctx->task = task;
}

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter *group_leader,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, child_ctx,
					   group_leader, GFP_KERNEL);
	if (IS_ERR(child_counter))
		return child_counter;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->task = child;
	add_counter_to_ctx(child_counter, child_ctx);

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	mutex_lock(&parent_counter->mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	mutex_unlock(&parent_counter->mutex);

	return child_counter;
}

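/*
 * Illustration of the linking above: if task A opens an inheritable
 * counter C, forks B, and B forks grandchild G, then both B's and G's
 * inherited counters have ->parent pointing at A's original counter C
 * (never at B's copy).  Every sync_child_counter() therefore folds counts
 * straight into C, and the filp reference taken above always pins that
 * single original file.
 */
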
static int inherit_group(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;
	struct perf_counter *child_ctr;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		child_ctr = inherit_counter(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 parent_val, child_val;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);
	atomic64_add(child_counter->total_time_enabled,
		     &parent_counter->child_total_time_enabled);
	atomic64_add(child_counter->total_time_running,
		     &parent_counter->child_total_time_running);

	/*
	 * Remove this counter from the parent's list
	 */
	mutex_lock(&parent_counter->mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}

static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	struct perf_counter *sub, *tmp;

	/*
	 * If we do not self-reap then we have to wait for the
	 * child task to unschedule (it will happen for sure),
	 * so that its counter is at its final count. (This
	 * condition triggers rarely - child tasks usually get
	 * off their CPU before the parent has a chance to
	 * get this far into the reaping action)
	 */
	if (child != current) {
		wait_task_inactive(child, 0);
		list_del_init(&child_counter->list_entry);
		update_counter_times(child_counter);
	} else {
		struct perf_cpu_context *cpuctx;
		unsigned long flags;
		u64 perf_flags;

		/*
		 * Disable and unlink this counter.
		 *
		 * Be careful about zapping the list - IRQ/NMI context
		 * could still be processing it:
		 */
		local_irq_save(flags);
		perf_flags = hw_perf_save_disable();

		cpuctx = &__get_cpu_var(perf_cpu_context);

		group_sched_out(child_counter, cpuctx, child_ctx);
		update_counter_times(child_counter);

		list_del_init(&child_counter->list_entry);

		child_ctx->nr_counters--;

		hw_perf_restore(perf_flags);
		local_irq_restore(flags);
	}

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
					 list_entry) {
			if (sub->parent) {
				sync_child_counter(sub, sub->parent);
				free_counter(sub);
			}
		}
		free_counter(child_counter);
	}
}

/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we may be running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;

	child_ctx  =  &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit)
			continue;

		if (inherit_group(counter, parent,
				  parent_ctx, child, child_ctx))
			break;
	}

	mutex_unlock(&parent_ctx->mutex);
}

static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}
static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);

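/*
 * The attribute group registered above lives under the cpu sysdev class
 * object; assuming the usual sysfs layout that is
 * /sys/devices/system/cpu/perf_counters/ (path inferred, not verified
 * here), e.g.:
 *
 *	# cat /sys/devices/system/cpu/perf_counters/reserve_percpu
 *	# echo 2 > /sys/devices/system/cpu/perf_counters/reserve_percpu
 *	# echo 0 > /sys/devices/system/cpu/perf_counters/overcommit
 */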