2 * Performance counter core code
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
7 * For licencing details see kernel-base/COPYING
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/perf_counter.h>
24 * Each CPU has a list of per CPU counters:
26 DEFINE_PER_CPU(struct perf_cpu_context
, perf_cpu_context
);
28 int perf_max_counters __read_mostly
;
29 static int perf_reserved_percpu __read_mostly
;
30 static int perf_overcommit __read_mostly
= 1;
33 * Mutex for (sysadmin-configurable) counter reservations:
35 static DEFINE_MUTEX(perf_resource_mutex
);
38 * Architecture provided APIs - weak aliases:
40 extern __weak
const struct hw_perf_counter_ops
*
41 hw_perf_counter_init(struct perf_counter
*counter
)
43 return ERR_PTR(-EINVAL
);
46 u64 __weak
hw_perf_save_disable(void) { return 0; }
47 void __weak
hw_perf_restore(u64 ctrl
) { }
48 void __weak
hw_perf_counter_setup(void) { }
51 list_add_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
53 struct perf_counter
*group_leader
= counter
->group_leader
;
56 * Depending on whether it is a standalone or sibling counter,
57 * add it straight to the context's counter list, or to the group
58 * leader's sibling list:
60 if (counter
->group_leader
== counter
)
61 list_add_tail(&counter
->list_entry
, &ctx
->counter_list
);
63 list_add_tail(&counter
->list_entry
, &group_leader
->sibling_list
);
67 list_del_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
69 struct perf_counter
*sibling
, *tmp
;
71 list_del_init(&counter
->list_entry
);
74 * If this was a group counter with sibling counters then
75 * upgrade the siblings to singleton counters by adding them
76 * to the context list directly:
78 list_for_each_entry_safe(sibling
, tmp
,
79 &counter
->sibling_list
, list_entry
) {
81 list_del_init(&sibling
->list_entry
);
82 list_add_tail(&sibling
->list_entry
, &ctx
->counter_list
);
83 sibling
->group_leader
= sibling
;
88 * Cross CPU call to remove a performance counter
90 * We disable the counter on the hardware level first. After that we
91 * remove it from the context list.
93 static void __perf_counter_remove_from_context(void *info
)
95 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
96 struct perf_counter
*counter
= info
;
97 struct perf_counter_context
*ctx
= counter
->ctx
;
102 * If this is a task context, we need to check whether it is
103 * the current task context of this cpu. If not it has been
104 * scheduled out before the smp call arrived.
106 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
109 spin_lock_irqsave(&ctx
->lock
, flags
);
111 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
112 counter
->hw_ops
->hw_perf_counter_disable(counter
);
113 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
115 cpuctx
->active_oncpu
--;
116 counter
->task
= NULL
;
121 * Protect the list operation against NMI by disabling the
122 * counters on a global level. NOP for non NMI based counters.
124 perf_flags
= hw_perf_save_disable();
125 list_del_counter(counter
, ctx
);
126 hw_perf_restore(perf_flags
);
130 * Allow more per task counters with respect to the
133 cpuctx
->max_pertask
=
134 min(perf_max_counters
- ctx
->nr_counters
,
135 perf_max_counters
- perf_reserved_percpu
);
138 spin_unlock_irqrestore(&ctx
->lock
, flags
);
143 * Remove the counter from a task's (or a CPU's) list of counters.
145 * Must be called with counter->mutex held.
147 * CPU counters are removed with an smp call. For task counters we only
148 * call when the task is on a CPU.
150 static void perf_counter_remove_from_context(struct perf_counter
*counter
)
152 struct perf_counter_context
*ctx
= counter
->ctx
;
153 struct task_struct
*task
= ctx
->task
;
157 * Per cpu counters are removed via an smp call and
158 * the removal is always successful.
160 smp_call_function_single(counter
->cpu
,
161 __perf_counter_remove_from_context
,
167 task_oncpu_function_call(task
, __perf_counter_remove_from_context
,
170 spin_lock_irq(&ctx
->lock
);
172 * If the context is active we need to retry the smp call.
174 if (ctx
->nr_active
&& !list_empty(&counter
->list_entry
)) {
175 spin_unlock_irq(&ctx
->lock
);
180 * The lock prevents this context from being scheduled in, so we
181 * can remove the counter safely, if the call above did not
184 if (!list_empty(&counter
->list_entry
)) {
186 list_del_counter(counter
, ctx
);
187 counter
->task
= NULL
;
189 spin_unlock_irq(&ctx
->lock
);
193 * Cross CPU call to install and enable a performance counter
195 static void __perf_install_in_context(void *info
)
197 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
198 struct perf_counter
*counter
= info
;
199 struct perf_counter_context
*ctx
= counter
->ctx
;
200 int cpu
= smp_processor_id();
205 * If this is a task context, we need to check whether it is
206 * the current task context of this cpu. If not it has been
207 * scheduled out before the smp call arrived.
209 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
212 spin_lock_irqsave(&ctx
->lock
, flags
);
215 * Protect the list operation against NMI by disabling the
216 * counters on a global level. NOP for non NMI based counters.
218 perf_flags
= hw_perf_save_disable();
219 list_add_counter(counter
, ctx
);
220 hw_perf_restore(perf_flags
);
224 if (cpuctx
->active_oncpu
< perf_max_counters
) {
225 counter
->state
= PERF_COUNTER_STATE_ACTIVE
;
226 counter
->oncpu
= cpu
;
228 cpuctx
->active_oncpu
++;
229 counter
->hw_ops
->hw_perf_counter_enable(counter
);
232 if (!ctx
->task
&& cpuctx
->max_pertask
)
233 cpuctx
->max_pertask
--;
235 spin_unlock_irqrestore(&ctx
->lock
, flags
);
239 * Attach a performance counter to a context
241 * First we add the counter to the list with the hardware enable bit
242 * in counter->hw_config cleared.
244 * If the counter is attached to a task which is on a CPU we use an smp
245 * call to enable it in the task context. The task might have been
246 * scheduled away, but we check this in the smp call again.
249 perf_install_in_context(struct perf_counter_context
*ctx
,
250 struct perf_counter
*counter
,
253 struct task_struct
*task
= ctx
->task
;
258 * Per cpu counters are installed via an smp call and
259 * the install is always successful.
261 smp_call_function_single(cpu
, __perf_install_in_context
,
266 counter
->task
= task
;
268 task_oncpu_function_call(task
, __perf_install_in_context
,
271 spin_lock_irq(&ctx
->lock
);
273 * we need to retry the smp call.
275 if (ctx
->nr_active
&& list_empty(&counter
->list_entry
)) {
276 spin_unlock_irq(&ctx
->lock
);
281 * The lock prevents this context from being scheduled in, so we
282 * can add the counter safely, if the call above did not
285 if (list_empty(&counter
->list_entry
)) {
286 list_add_counter(counter
, ctx
);
289 spin_unlock_irq(&ctx
->lock
);
293 counter_sched_out(struct perf_counter
*counter
,
294 struct perf_cpu_context
*cpuctx
,
295 struct perf_counter_context
*ctx
)
297 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
300 counter
->hw_ops
->hw_perf_counter_disable(counter
);
301 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
304 cpuctx
->active_oncpu
--;
309 group_sched_out(struct perf_counter
*group_counter
,
310 struct perf_cpu_context
*cpuctx
,
311 struct perf_counter_context
*ctx
)
313 struct perf_counter
*counter
;
315 counter_sched_out(group_counter
, cpuctx
, ctx
);
318 * Schedule out siblings (if any):
320 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
)
321 counter_sched_out(counter
, cpuctx
, ctx
);
325 * Called from scheduler to remove the counters of the current task,
326 * with interrupts disabled.
328 * We stop each counter and update the counter value in counter->count.
330 * This does not protect us against NMI, but hw_perf_counter_disable()
331 * sets the disabled bit in the control field of counter _before_
332 * accessing the counter control register. If a NMI hits, then it will
333 * not restart the counter.
335 void perf_counter_task_sched_out(struct task_struct
*task
, int cpu
)
337 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
338 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
339 struct perf_counter
*counter
;
341 if (likely(!cpuctx
->task_ctx
))
344 spin_lock(&ctx
->lock
);
345 if (ctx
->nr_active
) {
346 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
)
347 group_sched_out(counter
, cpuctx
, ctx
);
349 spin_unlock(&ctx
->lock
);
350 cpuctx
->task_ctx
= NULL
;
354 counter_sched_in(struct perf_counter
*counter
,
355 struct perf_cpu_context
*cpuctx
,
356 struct perf_counter_context
*ctx
,
359 if (counter
->state
== PERF_COUNTER_STATE_OFF
)
362 counter
->hw_ops
->hw_perf_counter_enable(counter
);
363 counter
->state
= PERF_COUNTER_STATE_ACTIVE
;
364 counter
->oncpu
= cpu
; /* TODO: put 'cpu' into cpuctx->cpu */
366 cpuctx
->active_oncpu
++;
371 group_sched_in(struct perf_counter
*group_counter
,
372 struct perf_cpu_context
*cpuctx
,
373 struct perf_counter_context
*ctx
,
376 struct perf_counter
*counter
;
378 counter_sched_in(group_counter
, cpuctx
, ctx
, cpu
);
381 * Schedule in siblings as one group (if any):
383 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
)
384 counter_sched_in(counter
, cpuctx
, ctx
, cpu
);
388 * Called from scheduler to add the counters of the current task
389 * with interrupts disabled.
391 * We restore the counter value and then enable it.
393 * This does not protect us against NMI, but hw_perf_counter_enable()
394 * sets the enabled bit in the control field of counter _before_
395 * accessing the counter control register. If a NMI hits, then it will
396 * keep the counter running.
398 void perf_counter_task_sched_in(struct task_struct
*task
, int cpu
)
400 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
401 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
402 struct perf_counter
*counter
;
404 if (likely(!ctx
->nr_counters
))
407 spin_lock(&ctx
->lock
);
408 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
409 if (ctx
->nr_active
== cpuctx
->max_pertask
)
413 * Listen to the 'cpu' scheduling filter constraint
416 if (counter
->cpu
!= -1 && counter
->cpu
!= cpu
)
419 group_sched_in(counter
, cpuctx
, ctx
, cpu
);
421 spin_unlock(&ctx
->lock
);
423 cpuctx
->task_ctx
= ctx
;
426 int perf_counter_task_disable(void)
428 struct task_struct
*curr
= current
;
429 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
430 struct perf_counter
*counter
;
434 if (likely(!ctx
->nr_counters
))
438 cpu
= smp_processor_id();
440 perf_counter_task_sched_out(curr
, cpu
);
442 spin_lock(&ctx
->lock
);
445 * Disable all the counters:
447 perf_flags
= hw_perf_save_disable();
449 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
)
450 counter
->state
= PERF_COUNTER_STATE_OFF
;
452 hw_perf_restore(perf_flags
);
454 spin_unlock(&ctx
->lock
);
461 int perf_counter_task_enable(void)
463 struct task_struct
*curr
= current
;
464 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
465 struct perf_counter
*counter
;
469 if (likely(!ctx
->nr_counters
))
473 cpu
= smp_processor_id();
475 spin_lock(&ctx
->lock
);
478 * Disable all the counters:
480 perf_flags
= hw_perf_save_disable();
482 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
483 if (counter
->state
!= PERF_COUNTER_STATE_OFF
)
485 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
487 hw_perf_restore(perf_flags
);
489 spin_unlock(&ctx
->lock
);
491 perf_counter_task_sched_in(curr
, cpu
);
498 void perf_counter_task_tick(struct task_struct
*curr
, int cpu
)
500 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
501 struct perf_counter
*counter
;
504 if (likely(!ctx
->nr_counters
))
507 perf_counter_task_sched_out(curr
, cpu
);
509 spin_lock(&ctx
->lock
);
512 * Rotate the first entry last (works just fine for group counters too):
514 perf_flags
= hw_perf_save_disable();
515 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
516 list_del(&counter
->list_entry
);
517 list_add_tail(&counter
->list_entry
, &ctx
->counter_list
);
520 hw_perf_restore(perf_flags
);
522 spin_unlock(&ctx
->lock
);
524 perf_counter_task_sched_in(curr
, cpu
);
528 * Cross CPU call to read the hardware counter
530 static void __hw_perf_counter_read(void *info
)
532 struct perf_counter
*counter
= info
;
534 counter
->hw_ops
->hw_perf_counter_read(counter
);
537 static u64
perf_counter_read(struct perf_counter
*counter
)
540 * If counter is enabled and currently active on a CPU, update the
541 * value in the counter structure:
543 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
544 smp_call_function_single(counter
->oncpu
,
545 __hw_perf_counter_read
, counter
, 1);
548 return atomic64_read(&counter
->count
);
552 * Cross CPU call to switch performance data pointers
554 static void __perf_switch_irq_data(void *info
)
556 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
557 struct perf_counter
*counter
= info
;
558 struct perf_counter_context
*ctx
= counter
->ctx
;
559 struct perf_data
*oldirqdata
= counter
->irqdata
;
562 * If this is a task context, we need to check whether it is
563 * the current task context of this cpu. If not it has been
564 * scheduled out before the smp call arrived.
567 if (cpuctx
->task_ctx
!= ctx
)
569 spin_lock(&ctx
->lock
);
572 /* Change the pointer NMI safe */
573 atomic_long_set((atomic_long_t
*)&counter
->irqdata
,
574 (unsigned long) counter
->usrdata
);
575 counter
->usrdata
= oldirqdata
;
578 spin_unlock(&ctx
->lock
);
581 static struct perf_data
*perf_switch_irq_data(struct perf_counter
*counter
)
583 struct perf_counter_context
*ctx
= counter
->ctx
;
584 struct perf_data
*oldirqdata
= counter
->irqdata
;
585 struct task_struct
*task
= ctx
->task
;
588 smp_call_function_single(counter
->cpu
,
589 __perf_switch_irq_data
,
591 return counter
->usrdata
;
595 spin_lock_irq(&ctx
->lock
);
596 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
) {
597 counter
->irqdata
= counter
->usrdata
;
598 counter
->usrdata
= oldirqdata
;
599 spin_unlock_irq(&ctx
->lock
);
602 spin_unlock_irq(&ctx
->lock
);
603 task_oncpu_function_call(task
, __perf_switch_irq_data
, counter
);
604 /* Might have failed, because task was scheduled out */
605 if (counter
->irqdata
== oldirqdata
)
608 return counter
->usrdata
;
611 static void put_context(struct perf_counter_context
*ctx
)
614 put_task_struct(ctx
->task
);
/*
 * find_get_context - look up the counter context for a (pid, cpu) pair.
 *
 * The error returns visible here are ERR_PTR(-EACCES), ERR_PTR(-EINVAL),
 * ERR_PTR(-ENODEV) and ERR_PTR(-ESRCH).
 *
 * NOTE(review): this listing has lost lines (no braces, gaps in the
 * embedded line numbers), so the branch structure and the success return
 * are not visible here - confirm against the real file before editing code.
 */
617 static struct perf_counter_context
*find_get_context(pid_t pid
, int cpu
)
619 struct perf_cpu_context
*cpuctx
;
620 struct perf_counter_context
*ctx
;
621 struct task_struct
*task
;
624 * If cpu is not a wildcard then this is a percpu counter:
627 /* Must be root to operate on a CPU counter: */
628 if (!capable(CAP_SYS_ADMIN
))
629 return ERR_PTR(-EACCES
);
/*
 * NOTE(review): wrong upper bound.  `cpu > num_possible_cpus()` lets
 * cpu == num_possible_cpus() through, and it compares a CPU id against a
 * CPU count (ids need not be dense).  The unchecked value is then passed
 * to cpu_isset() and per_cpu() below.  A bound of the form
 * `cpu >= nr_cpu_ids` looks intended - confirm and fix in the real source.
 */
631 if (cpu
< 0 || cpu
> num_possible_cpus())
632 return ERR_PTR(-EINVAL
);
635 * We could be clever and allow to attach a counter to an
636 * offline CPU and activate it when the CPU comes up, but
639 if (!cpu_isset(cpu
, cpu_online_map
))
640 return ERR_PTR(-ENODEV
);
642 cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
652 task
= find_task_by_vpid(pid
);
/* Takes a task reference; put_context() calls put_task_struct(ctx->task). */
654 get_task_struct(task
);
658 return ERR_PTR(-ESRCH
);
660 ctx
= &task
->perf_counter_ctx
;
663 /* Reuse ptrace permission checks for now. */
664 if (!ptrace_may_access(task
, PTRACE_MODE_READ
)) {
666 return ERR_PTR(-EACCES
);
673 * Called when the last reference to the file is gone.
675 static int perf_release(struct inode
*inode
, struct file
*file
)
677 struct perf_counter
*counter
= file
->private_data
;
678 struct perf_counter_context
*ctx
= counter
->ctx
;
680 file
->private_data
= NULL
;
682 mutex_lock(&counter
->mutex
);
684 perf_counter_remove_from_context(counter
);
687 mutex_unlock(&counter
->mutex
);
695 * Read the performance counter - simple non blocking version for now
698 perf_read_hw(struct perf_counter
*counter
, char __user
*buf
, size_t count
)
702 if (count
!= sizeof(cntval
))
705 mutex_lock(&counter
->mutex
);
706 cntval
= perf_counter_read(counter
);
707 mutex_unlock(&counter
->mutex
);
709 return put_user(cntval
, (u64 __user
*) buf
) ? -EFAULT
: sizeof(cntval
);
713 perf_copy_usrdata(struct perf_data
*usrdata
, char __user
*buf
, size_t count
)
718 count
= min(count
, (size_t)usrdata
->len
);
719 if (copy_to_user(buf
, usrdata
->data
+ usrdata
->rd_idx
, count
))
722 /* Adjust the counters */
723 usrdata
->len
-= count
;
727 usrdata
->rd_idx
+= count
;
733 perf_read_irq_data(struct perf_counter
*counter
,
738 struct perf_data
*irqdata
, *usrdata
;
739 DECLARE_WAITQUEUE(wait
, current
);
742 irqdata
= counter
->irqdata
;
743 usrdata
= counter
->usrdata
;
745 if (usrdata
->len
+ irqdata
->len
>= count
)
751 spin_lock_irq(&counter
->waitq
.lock
);
752 __add_wait_queue(&counter
->waitq
, &wait
);
754 set_current_state(TASK_INTERRUPTIBLE
);
755 if (usrdata
->len
+ irqdata
->len
>= count
)
758 if (signal_pending(current
))
761 spin_unlock_irq(&counter
->waitq
.lock
);
763 spin_lock_irq(&counter
->waitq
.lock
);
765 __remove_wait_queue(&counter
->waitq
, &wait
);
766 __set_current_state(TASK_RUNNING
);
767 spin_unlock_irq(&counter
->waitq
.lock
);
769 if (usrdata
->len
+ irqdata
->len
< count
)
772 mutex_lock(&counter
->mutex
);
774 /* Drain pending data first: */
775 res
= perf_copy_usrdata(usrdata
, buf
, count
);
776 if (res
< 0 || res
== count
)
779 /* Switch irq buffer: */
780 usrdata
= perf_switch_irq_data(counter
);
781 if (perf_copy_usrdata(usrdata
, buf
+ res
, count
- res
) < 0) {
788 mutex_unlock(&counter
->mutex
);
794 perf_read(struct file
*file
, char __user
*buf
, size_t count
, loff_t
*ppos
)
796 struct perf_counter
*counter
= file
->private_data
;
798 switch (counter
->hw_event
.record_type
) {
799 case PERF_RECORD_SIMPLE
:
800 return perf_read_hw(counter
, buf
, count
);
802 case PERF_RECORD_IRQ
:
803 case PERF_RECORD_GROUP
:
804 return perf_read_irq_data(counter
, buf
, count
,
805 file
->f_flags
& O_NONBLOCK
);
810 static unsigned int perf_poll(struct file
*file
, poll_table
*wait
)
812 struct perf_counter
*counter
= file
->private_data
;
813 unsigned int events
= 0;
816 poll_wait(file
, &counter
->waitq
, wait
);
818 spin_lock_irqsave(&counter
->waitq
.lock
, flags
);
819 if (counter
->usrdata
->len
|| counter
->irqdata
->len
)
821 spin_unlock_irqrestore(&counter
->waitq
.lock
, flags
);
826 static const struct file_operations perf_fops
= {
827 .release
= perf_release
,
832 static void cpu_clock_perf_counter_enable(struct perf_counter
*counter
)
836 static void cpu_clock_perf_counter_disable(struct perf_counter
*counter
)
840 static void cpu_clock_perf_counter_read(struct perf_counter
*counter
)
842 int cpu
= raw_smp_processor_id();
844 atomic64_set(&counter
->count
, cpu_clock(cpu
));
847 static const struct hw_perf_counter_ops perf_ops_cpu_clock
= {
848 .hw_perf_counter_enable
= cpu_clock_perf_counter_enable
,
849 .hw_perf_counter_disable
= cpu_clock_perf_counter_disable
,
850 .hw_perf_counter_read
= cpu_clock_perf_counter_read
,
853 static void task_clock_perf_counter_update(struct perf_counter
*counter
)
858 prev
= atomic64_read(&counter
->hw
.prev_count
);
859 now
= current
->se
.sum_exec_runtime
;
861 atomic64_set(&counter
->hw
.prev_count
, now
);
864 if (WARN_ON_ONCE(delta
< 0))
867 atomic64_add(delta
, &counter
->count
);
870 static void task_clock_perf_counter_read(struct perf_counter
*counter
)
872 task_clock_perf_counter_update(counter
);
875 static void task_clock_perf_counter_enable(struct perf_counter
*counter
)
877 atomic64_set(&counter
->hw
.prev_count
, current
->se
.sum_exec_runtime
);
880 static void task_clock_perf_counter_disable(struct perf_counter
*counter
)
882 task_clock_perf_counter_update(counter
);
885 static const struct hw_perf_counter_ops perf_ops_task_clock
= {
886 .hw_perf_counter_enable
= task_clock_perf_counter_enable
,
887 .hw_perf_counter_disable
= task_clock_perf_counter_disable
,
888 .hw_perf_counter_read
= task_clock_perf_counter_read
,
891 static const struct hw_perf_counter_ops
*
892 sw_perf_counter_init(struct perf_counter
*counter
)
894 const struct hw_perf_counter_ops
*hw_ops
= NULL
;
896 switch (counter
->hw_event
.type
) {
897 case PERF_COUNT_CPU_CLOCK
:
898 hw_ops
= &perf_ops_cpu_clock
;
900 case PERF_COUNT_TASK_CLOCK
:
901 hw_ops
= &perf_ops_task_clock
;
910 * Allocate and initialize a counter structure
912 static struct perf_counter
*
913 perf_counter_alloc(struct perf_counter_hw_event
*hw_event
,
915 struct perf_counter
*group_leader
,
918 const struct hw_perf_counter_ops
*hw_ops
;
919 struct perf_counter
*counter
;
921 counter
= kzalloc(sizeof(*counter
), gfpflags
);
926 * Single counters are their own group leaders, with an
927 * empty sibling list:
930 group_leader
= counter
;
932 mutex_init(&counter
->mutex
);
933 INIT_LIST_HEAD(&counter
->list_entry
);
934 INIT_LIST_HEAD(&counter
->sibling_list
);
935 init_waitqueue_head(&counter
->waitq
);
937 counter
->irqdata
= &counter
->data
[0];
938 counter
->usrdata
= &counter
->data
[1];
940 counter
->hw_event
= *hw_event
;
941 counter
->wakeup_pending
= 0;
942 counter
->group_leader
= group_leader
;
943 counter
->hw_ops
= NULL
;
946 if (!hw_event
->raw
&& hw_event
->type
< 0)
947 hw_ops
= sw_perf_counter_init(counter
);
949 hw_ops
= hw_perf_counter_init(counter
);
955 counter
->hw_ops
= hw_ops
;
961 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
963 * @hw_event_uptr: event type attributes for monitoring/sampling
966 * @group_fd: group leader counter fd
969 sys_perf_counter_open(struct perf_counter_hw_event
*hw_event_uptr __user
,
970 pid_t pid
, int cpu
, int group_fd
)
972 struct perf_counter
*counter
, *group_leader
;
973 struct perf_counter_hw_event hw_event
;
974 struct perf_counter_context
*ctx
;
975 struct file
*counter_file
= NULL
;
976 struct file
*group_file
= NULL
;
978 int fput_needed2
= 0;
981 if (copy_from_user(&hw_event
, hw_event_uptr
, sizeof(hw_event
)) != 0)
985 * Get the target context (task or percpu):
987 ctx
= find_get_context(pid
, cpu
);
992 * Look up the group leader (we will attach this counter to it):
995 if (group_fd
!= -1) {
997 group_file
= fget_light(group_fd
, &fput_needed
);
999 goto err_put_context
;
1000 if (group_file
->f_op
!= &perf_fops
)
1001 goto err_put_context
;
1003 group_leader
= group_file
->private_data
;
1005 * Do not allow a recursive hierarchy (this new sibling
1006 * becoming part of another group-sibling):
1008 if (group_leader
->group_leader
!= group_leader
)
1009 goto err_put_context
;
1011 * Do not allow to attach to a group in a different
1012 * task or CPU context:
1014 if (group_leader
->ctx
!= ctx
)
1015 goto err_put_context
;
1019 counter
= perf_counter_alloc(&hw_event
, cpu
, group_leader
, GFP_KERNEL
);
1021 goto err_put_context
;
1023 ret
= anon_inode_getfd("[perf_counter]", &perf_fops
, counter
, 0);
1025 goto err_free_put_context
;
1027 counter_file
= fget_light(ret
, &fput_needed2
);
1029 goto err_free_put_context
;
1031 counter
->filp
= counter_file
;
1032 perf_install_in_context(ctx
, counter
, cpu
);
1034 fput_light(counter_file
, fput_needed2
);
1037 fput_light(group_file
, fput_needed
);
1041 err_free_put_context
:
1051 * Initialize the perf_counter context in a task_struct:
1054 __perf_counter_init_context(struct perf_counter_context
*ctx
,
1055 struct task_struct
*task
)
1057 memset(ctx
, 0, sizeof(*ctx
));
1058 spin_lock_init(&ctx
->lock
);
1059 INIT_LIST_HEAD(&ctx
->counter_list
);
1064 * inherit a counter from parent task to child task:
1067 inherit_counter(struct perf_counter
*parent_counter
,
1068 struct task_struct
*parent
,
1069 struct perf_counter_context
*parent_ctx
,
1070 struct task_struct
*child
,
1071 struct perf_counter_context
*child_ctx
)
1073 struct perf_counter
*child_counter
;
1075 child_counter
= perf_counter_alloc(&parent_counter
->hw_event
,
1076 parent_counter
->cpu
, NULL
,
1082 * Link it up in the child's context:
1084 child_counter
->ctx
= child_ctx
;
1085 child_counter
->task
= child
;
1086 list_add_counter(child_counter
, child_ctx
);
1087 child_ctx
->nr_counters
++;
1089 child_counter
->parent
= parent_counter
;
1090 parent_counter
->nr_inherited
++;
1092 * inherit into child's child as well:
1094 child_counter
->hw_event
.inherit
= 1;
1097 * Get a reference to the parent filp - we will fput it
1098 * when the child counter exits. This is safe to do because
1099 * we are in the parent and we know that the filp still
1100 * exists and has a nonzero count:
1102 atomic_long_inc(&parent_counter
->filp
->f_count
);
1108 __perf_counter_exit_task(struct task_struct
*child
,
1109 struct perf_counter
*child_counter
,
1110 struct perf_counter_context
*child_ctx
)
1112 struct perf_counter
*parent_counter
;
1113 u64 parent_val
, child_val
;
1117 * Disable and unlink this counter.
1119 * Be careful about zapping the list - IRQ/NMI context
1120 * could still be processing it:
1122 local_irq_disable();
1123 perf_flags
= hw_perf_save_disable();
1125 if (child_counter
->state
== PERF_COUNTER_STATE_ACTIVE
)
1126 child_counter
->hw_ops
->hw_perf_counter_disable(child_counter
);
1127 list_del_init(&child_counter
->list_entry
);
1129 hw_perf_restore(perf_flags
);
1132 parent_counter
= child_counter
->parent
;
1134 * It can happen that parent exits first, and has counters
1135 * that are still around due to the child reference. These
1136 * counters need to be zapped - but otherwise linger.
1138 if (!parent_counter
)
1141 parent_val
= atomic64_read(&parent_counter
->count
);
1142 child_val
= atomic64_read(&child_counter
->count
);
1145 * Add back the child's count to the parent's count:
1147 atomic64_add(child_val
, &parent_counter
->count
);
1149 fput(parent_counter
->filp
);
1151 kfree(child_counter
);
1155 * When a child task exits, feed back counter values to parent counters.
1157 * Note: we are running in child context, but the PID is not hashed
1158 * anymore so new counters will not be added.
1160 void perf_counter_exit_task(struct task_struct
*child
)
1162 struct perf_counter
*child_counter
, *tmp
;
1163 struct perf_counter_context
*child_ctx
;
1165 child_ctx
= &child
->perf_counter_ctx
;
1167 if (likely(!child_ctx
->nr_counters
))
1170 list_for_each_entry_safe(child_counter
, tmp
, &child_ctx
->counter_list
,
1172 __perf_counter_exit_task(child
, child_counter
, child_ctx
);
1176 * Initialize the perf_counter context in task_struct
1178 void perf_counter_init_task(struct task_struct
*child
)
1180 struct perf_counter_context
*child_ctx
, *parent_ctx
;
1181 struct perf_counter
*counter
, *parent_counter
;
1182 struct task_struct
*parent
= current
;
1183 unsigned long flags
;
1185 child_ctx
= &child
->perf_counter_ctx
;
1186 parent_ctx
= &parent
->perf_counter_ctx
;
1188 __perf_counter_init_context(child_ctx
, child
);
1191 * This is executed from the parent task context, so inherit
1192 * counters that have been marked for cloning:
1195 if (likely(!parent_ctx
->nr_counters
))
1199 * Lock the parent list. No need to lock the child - not PID
1200 * hashed yet and not running, so nobody can access it.
1202 spin_lock_irqsave(&parent_ctx
->lock
, flags
);
1205 * We don't have to disable NMIs - we are only looking at
1206 * the list, not manipulating it:
1208 list_for_each_entry(counter
, &parent_ctx
->counter_list
, list_entry
) {
1209 if (!counter
->hw_event
.inherit
|| counter
->group_leader
!= counter
)
1213 * Instead of creating recursive hierarchies of counters,
1214 * we link inherited counters back to the original parent,
1215 * which has a filp for sure, which we use as the reference
1218 parent_counter
= counter
;
1219 if (counter
->parent
)
1220 parent_counter
= counter
->parent
;
1222 if (inherit_counter(parent_counter
, parent
,
1223 parent_ctx
, child
, child_ctx
))
1227 spin_unlock_irqrestore(&parent_ctx
->lock
, flags
);
1230 static void __cpuinit
perf_counter_init_cpu(int cpu
)
1232 struct perf_cpu_context
*cpuctx
;
1234 cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1235 __perf_counter_init_context(&cpuctx
->ctx
, NULL
);
1237 mutex_lock(&perf_resource_mutex
);
1238 cpuctx
->max_pertask
= perf_max_counters
- perf_reserved_percpu
;
1239 mutex_unlock(&perf_resource_mutex
);
1241 hw_perf_counter_setup();
1244 #ifdef CONFIG_HOTPLUG_CPU
1245 static void __perf_counter_exit_cpu(void *info
)
1247 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
1248 struct perf_counter_context
*ctx
= &cpuctx
->ctx
;
1249 struct perf_counter
*counter
, *tmp
;
1251 list_for_each_entry_safe(counter
, tmp
, &ctx
->counter_list
, list_entry
)
1252 __perf_counter_remove_from_context(counter
);
1255 static void perf_counter_exit_cpu(int cpu
)
1257 smp_call_function_single(cpu
, __perf_counter_exit_cpu
, NULL
, 1);
/*
 * Empty stand-in for perf_counter_exit_cpu(): accepts @cpu and does nothing.
 * NOTE(review): presumably the !CONFIG_HOTPLUG_CPU variant - the #else /
 * #endif lines are not visible in this listing; confirm.
 */
static inline void perf_counter_exit_cpu(int cpu)
{
}
1263 static int __cpuinit
1264 perf_cpu_notify(struct notifier_block
*self
, unsigned long action
, void *hcpu
)
1266 unsigned int cpu
= (long)hcpu
;
1270 case CPU_UP_PREPARE
:
1271 case CPU_UP_PREPARE_FROZEN
:
1272 perf_counter_init_cpu(cpu
);
1275 case CPU_DOWN_PREPARE
:
1276 case CPU_DOWN_PREPARE_FROZEN
:
1277 perf_counter_exit_cpu(cpu
);
1287 static struct notifier_block __cpuinitdata perf_cpu_nb
= {
1288 .notifier_call
= perf_cpu_notify
,
1291 static int __init
perf_counter_init(void)
1293 perf_cpu_notify(&perf_cpu_nb
, (unsigned long)CPU_UP_PREPARE
,
1294 (void *)(long)smp_processor_id());
1295 register_cpu_notifier(&perf_cpu_nb
);
/* Register perf_counter_init() to run as an early initcall. */
early_initcall(perf_counter_init);
1301 static ssize_t
perf_show_reserve_percpu(struct sysdev_class
*class, char *buf
)
1303 return sprintf(buf
, "%d\n", perf_reserved_percpu
);
1307 perf_set_reserve_percpu(struct sysdev_class
*class,
1311 struct perf_cpu_context
*cpuctx
;
1315 err
= strict_strtoul(buf
, 10, &val
);
1318 if (val
> perf_max_counters
)
1321 mutex_lock(&perf_resource_mutex
);
1322 perf_reserved_percpu
= val
;
1323 for_each_online_cpu(cpu
) {
1324 cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1325 spin_lock_irq(&cpuctx
->ctx
.lock
);
1326 mpt
= min(perf_max_counters
- cpuctx
->ctx
.nr_counters
,
1327 perf_max_counters
- perf_reserved_percpu
);
1328 cpuctx
->max_pertask
= mpt
;
1329 spin_unlock_irq(&cpuctx
->ctx
.lock
);
1331 mutex_unlock(&perf_resource_mutex
);
1336 static ssize_t
perf_show_overcommit(struct sysdev_class
*class, char *buf
)
1338 return sprintf(buf
, "%d\n", perf_overcommit
);
1342 perf_set_overcommit(struct sysdev_class
*class, const char *buf
, size_t count
)
1347 err
= strict_strtoul(buf
, 10, &val
);
1353 mutex_lock(&perf_resource_mutex
);
1354 perf_overcommit
= val
;
1355 mutex_unlock(&perf_resource_mutex
);
1360 static SYSDEV_CLASS_ATTR(
1363 perf_show_reserve_percpu
,
1364 perf_set_reserve_percpu
1367 static SYSDEV_CLASS_ATTR(
1370 perf_show_overcommit
,
1374 static struct attribute
*perfclass_attrs
[] = {
1375 &attr_reserve_percpu
.attr
,
1376 &attr_overcommit
.attr
,
1380 static struct attribute_group perfclass_attr_group
= {
1381 .attrs
= perfclass_attrs
,
1382 .name
= "perf_counters",
1385 static int __init
perf_counter_sysfs_init(void)
1387 return sysfs_create_group(&cpu_sysdev_class
.kset
.kobj
,
1388 &perfclass_attr_group
);
/* Register perf_counter_sysfs_init() to run as a device initcall. */
device_initcall(perf_counter_sysfs_init);