/*
 * Performance counter core code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/perf_counter.h>
/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);
/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	return ERR_PTR(-EINVAL);
}

u64 __weak hw_perf_save_disable(void)	{ return 0; }
void __weak hw_perf_restore(u64 ctrl)	{ }
void __weak hw_perf_counter_setup(void)	{ }
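
/*
 * Note: the weak stubs above are only the generic fallbacks; an
 * architecture with PMU support is expected to override them with real
 * implementations that return a hw_perf_counter_ops instance and
 * save/restore the global PMU enable state.
 */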
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *group_leader = counter->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling counter,
	 * add it straight to the context's counter list, or to the group
	 * leader's sibling list:
	 */
	if (counter->group_leader == counter)
		list_add_tail(&counter->list_entry, &ctx->counter_list);
	else
		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}
static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *sibling, *tmp;

	list_del_init(&counter->list_entry);

	/*
	 * If this was a group counter with sibling counters then
	 * upgrade the siblings to singleton counters by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp,
				 &counter->sibling_list, list_entry) {

		list_del_init(&sibling->list_entry);
		list_add_tail(&sibling->list_entry, &ctx->counter_list);
		sibling->group_leader = sibling;
	}
}
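
/*
 * Resulting list layout (illustrative): a group leader hangs off
 * ctx->counter_list via its list_entry, and its siblings hang off the
 * leader's sibling_list.  Deleting the leader promotes each sibling to
 * a singleton group of its own, as done above.
 */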
/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	unsigned long flags;
	u64 perf_flags;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock_irqsave(&ctx->lock, flags);

	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		counter->hw_ops->hw_perf_counter_disable(counter);
		counter->state = PERF_COUNTER_STATE_INACTIVE;
		cpuctx->active_oncpu--;
		counter->task = NULL;
	}

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();
	list_del_counter(counter, ctx);
	hw_perf_restore(perf_flags);

	/*
	 * Allow more per task counters with respect to the
	 * reservation:
	 */
	cpuctx->max_pertask =
		min(perf_max_counters - ctx->nr_counters,
		    perf_max_counters - perf_reserved_percpu);

	spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the counter safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}
/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	int cpu = smp_processor_id();
	unsigned long flags;
	u64 perf_flags;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock_irqsave(&ctx->lock, flags);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();
	list_add_counter(counter, ctx);
	hw_perf_restore(perf_flags);

	if (cpuctx->active_oncpu < perf_max_counters) {
		counter->state = PERF_COUNTER_STATE_ACTIVE;
		counter->oncpu = cpu;
		cpuctx->active_oncpu++;
		counter->hw_ops->hw_perf_counter_enable(counter);
	}

	if (!ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

	spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added
	 * we need to retry the smp call.
	 */
	if (ctx->nr_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can add the counter safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&counter->list_entry))
		list_add_counter(counter, ctx);

	spin_unlock_irq(&ctx->lock);
}
static void
counter_sched_out(struct perf_counter *counter,
		  struct perf_cpu_context *cpuctx,
		  struct perf_counter_context *ctx)
{
	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
		return;

	counter->hw_ops->hw_perf_counter_disable(counter);
	counter->state = PERF_COUNTER_STATE_INACTIVE;

	cpuctx->active_oncpu--;
}
static void
group_sched_out(struct perf_counter *group_counter,
		struct perf_cpu_context *cpuctx,
		struct perf_counter_context *ctx)
{
	struct perf_counter *counter;

	counter_sched_out(group_counter, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_out(counter, cpuctx, ctx);
}
/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but hw_perf_counter_disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!cpuctx->task_ctx))
		return;

	spin_lock(&ctx->lock);
	if (ctx->nr_active) {
		list_for_each_entry(counter, &ctx->counter_list, list_entry)
			group_sched_out(counter, cpuctx, ctx);
	}
	spin_unlock(&ctx->lock);
	cpuctx->task_ctx = NULL;
}
static void
counter_sched_in(struct perf_counter *counter,
		 struct perf_cpu_context *cpuctx,
		 struct perf_counter_context *ctx,
		 int cpu)
{
	if (counter->state == PERF_COUNTER_STATE_OFF)
		return;

	counter->hw_ops->hw_perf_counter_enable(counter);
	counter->state = PERF_COUNTER_STATE_ACTIVE;
	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */

	cpuctx->active_oncpu++;
}
static void
group_sched_in(struct perf_counter *group_counter,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx,
	       int cpu)
{
	struct perf_counter *counter;

	counter_sched_in(group_counter, cpuctx, ctx, cpu);

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_in(counter, cpuctx, ctx, cpu);
}
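
/*
 * A counter group is scheduled onto the PMU as a unit: the leader first,
 * then every sibling on the same CPU, so that the values of the group
 * members can be meaningfully compared against each other.
 */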
/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but hw_perf_counter_enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;
	struct perf_counter *counter;

	if (likely(!ctx->nr_counters))
		return;

	spin_lock(&ctx->lock);
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (ctx->nr_active == cpuctx->max_pertask)
			break;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of counters:
		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		group_sched_in(counter, cpuctx, ctx, cpu);
	}
	spin_unlock(&ctx->lock);

	cpuctx->task_ctx = ctx;
}
int perf_counter_task_disable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	cpu = smp_processor_id();

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Disable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	return 0;
}
int perf_counter_task_enable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	cpu = smp_processor_id();

	spin_lock(&ctx->lock);

	/*
	 * Enable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state != PERF_COUNTER_STATE_OFF)
			continue;
		counter->state = PERF_COUNTER_STATE_INACTIVE;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);

	return 0;
}
void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	u64 perf_flags;

	if (likely(!ctx->nr_counters))
		return;

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Rotate the first entry last (works just fine for group counters too):
	 */
	perf_flags = hw_perf_save_disable();
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		list_del(&counter->list_entry);
		list_add_tail(&counter->list_entry, &ctx->counter_list);
		break;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);
}
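
/*
 * Example of the rotation above: with counters [A, B, C] on the list and
 * only room to keep two of them on the PMU, successive ticks reorder the
 * list to [B, C, A], then [C, A, B], and so on, so that every counter
 * (or counter group) gets its share of time on the hardware.
 */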
/*
 * Cross CPU call to read the hardware counter
 */
static void __hw_perf_counter_read(void *info)
{
	struct perf_counter *counter = info;

	counter->hw_ops->hw_perf_counter_read(counter);
}

static u64 perf_counter_read(struct perf_counter *counter)
{
	/*
	 * If counter is enabled and currently active on a CPU, update the
	 * value in the counter structure:
	 */
	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		smp_call_function_single(counter->oncpu,
					 __hw_perf_counter_read, counter, 1);
	}

	return atomic64_read(&counter->count);
}
/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	spin_unlock(&ctx->lock);
}
static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;
	struct task_struct *task = ctx->task;

	if (!task) {
		smp_call_function_single(counter->cpu,
					 __perf_switch_irq_data,
					 counter, 1);
		return counter->usrdata;
	}

retry:
	spin_lock_irq(&ctx->lock);
	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
		counter->irqdata = counter->usrdata;
		counter->usrdata = oldirqdata;
		spin_unlock_irq(&ctx->lock);
		return oldirqdata;
	}
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
	/* Might have failed, because task was scheduled out */
	if (counter->irqdata == oldirqdata)
		goto retry;

	return counter->usrdata;
}
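
/*
 * counter->irqdata and counter->usrdata form a simple double buffer:
 * interrupt/NMI context fills irqdata while readers drain usrdata, and
 * perf_switch_irq_data() swaps the two pointers so the reader can drain
 * whatever the IRQ side has collected so far.
 */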
static void put_context(struct perf_counter_context *ctx)
{
	if (ctx->task)
		put_task_struct(ctx->task);
}

static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct task_struct *task;

	/*
	 * If cpu is not a wildcard then this is a percpu counter:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU counter: */
		if (!capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow to attach a counter to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		return ctx;
	}

	task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);

	if (!task)
		return ERR_PTR(-ESRCH);

	ctx = &task->perf_counter_ctx;
	ctx->task = task;

	/* Reuse ptrace permission checks for now. */
	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		put_context(ctx);
		return ERR_PTR(-EACCES);
	}

	return ctx;
}
/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_counter *counter = file->private_data;
	struct perf_counter_context *ctx = counter->ctx;

	file->private_data = NULL;

	mutex_lock(&counter->mutex);

	perf_counter_remove_from_context(counter);
	put_context(ctx);

	mutex_unlock(&counter->mutex);

	kfree(counter);

	return 0;
}
/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
	u64 cntval;

	if (count != sizeof(cntval))
		return -EINVAL;

	mutex_lock(&counter->mutex);
	cntval = perf_counter_read(counter);
	mutex_unlock(&counter->mutex);

	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}
static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
	count = min(count, (size_t)usrdata->len);
	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
		return -EFAULT;

	/* Adjust the counters */
	usrdata->len -= count;
	usrdata->rd_idx += count;

	return count;
}
static ssize_t
perf_read_irq_data(struct perf_counter *counter,
		   char __user *buf,
		   size_t count,
		   int nonblocking)
{
	struct perf_data *irqdata, *usrdata;
	DECLARE_WAITQUEUE(wait, current);
	ssize_t res;

	irqdata = counter->irqdata;
	usrdata = counter->usrdata;

	if (usrdata->len + irqdata->len >= count)
		goto read_pending;

	if (nonblocking)
		return -EAGAIN;

	spin_lock_irq(&counter->waitq.lock);
	__add_wait_queue(&counter->waitq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (usrdata->len + irqdata->len >= count)
			break;

		if (signal_pending(current))
			break;

		spin_unlock_irq(&counter->waitq.lock);
		schedule();
		spin_lock_irq(&counter->waitq.lock);
	}
	__remove_wait_queue(&counter->waitq, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&counter->waitq.lock);

	if (usrdata->len + irqdata->len < count)
		return -ERESTARTSYS;
read_pending:
	mutex_lock(&counter->mutex);

	/* Drain pending data first: */
	res = perf_copy_usrdata(usrdata, buf, count);
	if (res < 0 || res == count)
		goto out;

	/* Switch irq buffer: */
	usrdata = perf_switch_irq_data(counter);
	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
		if (!res)
			res = -EFAULT;
	} else {
		res = count;
	}
out:
	mutex_unlock(&counter->mutex);

	return res;
}
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_counter *counter = file->private_data;

	switch (counter->hw_event.record_type) {
	case PERF_RECORD_SIMPLE:
		return perf_read_hw(counter, buf, count);

	case PERF_RECORD_IRQ:
	case PERF_RECORD_GROUP:
		return perf_read_irq_data(counter, buf, count,
					  file->f_flags & O_NONBLOCK);
	}
	return -EINVAL;
}
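
/*
 * record_type selects the read(2) semantics: PERF_RECORD_SIMPLE returns a
 * single u64 snapshot of the count, while PERF_RECORD_IRQ/GROUP stream the
 * data that IRQ context queued into the counter's irqdata buffer.
 */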
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_counter *counter = file->private_data;
	unsigned int events = 0;
	unsigned long flags;

	poll_wait(file, &counter->waitq, wait);

	spin_lock_irqsave(&counter->waitq.lock, flags);
	if (counter->usrdata->len || counter->irqdata->len)
		events |= POLLIN;
	spin_unlock_irqrestore(&counter->waitq.lock, flags);

	return events;
}

static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
};
static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
}

static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
}

static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();

	atomic64_set(&counter->count, cpu_clock(cpu));
}

static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
	.hw_perf_counter_enable		= cpu_clock_perf_counter_enable,
	.hw_perf_counter_disable	= cpu_clock_perf_counter_disable,
	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
};
static void task_clock_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = current->se.sum_exec_runtime;

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;
	if (WARN_ON_ONCE(delta < 0))
		delta = 0;

	atomic64_add(delta, &counter->count);
}

static void task_clock_perf_counter_read(struct perf_counter *counter)
{
	task_clock_perf_counter_update(counter);
}

static void task_clock_perf_counter_enable(struct perf_counter *counter)
{
	atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
}

static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
	task_clock_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_task_clock = {
	.hw_perf_counter_enable		= task_clock_perf_counter_enable,
	.hw_perf_counter_disable	= task_clock_perf_counter_disable,
	.hw_perf_counter_read		= task_clock_perf_counter_read,
};
static u64 get_page_faults(void)
{
	struct task_struct *curr = current;

	return curr->maj_flt + curr->min_flt;
}

static void page_faults_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_page_faults();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;
	if (WARN_ON_ONCE(delta < 0))
		delta = 0;

	atomic64_add(delta, &counter->count);
}

static void page_faults_perf_counter_read(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static void page_faults_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * page-faults is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */
}

static void page_faults_perf_counter_disable(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_page_faults = {
	.hw_perf_counter_enable		= page_faults_perf_counter_enable,
	.hw_perf_counter_disable	= page_faults_perf_counter_disable,
	.hw_perf_counter_read		= page_faults_perf_counter_read,
};
static u64 get_context_switches(void)
{
	struct task_struct *curr = current;

	return curr->nvcsw + curr->nivcsw;
}

static void context_switches_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_context_switches();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;
	if (WARN_ON_ONCE(delta < 0))
		delta = 0;

	atomic64_add(delta, &counter->count);
}

static void context_switches_perf_counter_read(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static void context_switches_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * ->nvcsw + curr->nivcsw is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */
}

static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_context_switches = {
	.hw_perf_counter_enable		= context_switches_perf_counter_enable,
	.hw_perf_counter_disable	= context_switches_perf_counter_disable,
	.hw_perf_counter_read		= context_switches_perf_counter_read,
};
static inline u64 get_cpu_migrations(void)
{
	return current->se.nr_migrations;
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;
	if (WARN_ON_ONCE(delta < 0))
		delta = 0;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * se.nr_migrations is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.hw_perf_counter_enable		= cpu_migrations_perf_counter_enable,
	.hw_perf_counter_disable	= cpu_migrations_perf_counter_disable,
	.hw_perf_counter_read		= cpu_migrations_perf_counter_read,
};
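
/*
 * All of the software counters above share the same pattern: on enable,
 * snapshot the current per-task value into hw.prev_count; on read/disable,
 * compute delta = now - prev, roll prev forward and accumulate the delta
 * into counter->count.  That way counter->count only ever reports what
 * happened while the counter was attached.
 */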
static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	const struct hw_perf_counter_ops *hw_ops = NULL;

	switch (counter->hw_event.type) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;
		break;
	case PERF_COUNT_TASK_CLOCK:
		hw_ops = &perf_ops_task_clock;
		break;
	case PERF_COUNT_PAGE_FAULTS:
		hw_ops = &perf_ops_page_faults;
		break;
	case PERF_COUNT_CONTEXT_SWITCHES:
		hw_ops = &perf_ops_context_switches;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		hw_ops = &perf_ops_cpu_migrations;
		break;
	default:
		break;
	}
	return hw_ops;
}
/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	counter->irqdata	= &counter->data[0];
	counter->usrdata	= &counter->data[1];
	counter->cpu		= cpu;
	counter->hw_event	= *hw_event;
	counter->wakeup_pending	= 0;
	counter->group_leader	= group_leader;
	counter->hw_ops		= NULL;

	hw_ops = NULL;
	if (!hw_event->raw && hw_event->type < 0)
		hw_ops = sw_perf_counter_init(counter);
	else
		hw_ops = hw_perf_counter_init(counter);

	if (!hw_ops) {
		kfree(counter);
		return NULL;
	}
	counter->hw_ops = hw_ops;

	return counter;
}
/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int
sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
		      pid_t pid, int cpu, int group_fd)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
	}

	ret = -EINVAL;
	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
	if (!counter)
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	perf_install_in_context(ctx, counter, cpu);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}
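
/*
 * Illustrative (hypothetical) userspace usage of the syscall above -- the
 * syscall number and the exact perf_counter_hw_event layout come from the
 * uapi headers of the running kernel and are not spelled out here:
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type = PERF_COUNT_CPU_CLOCK,
 *	};
 *	int fd = syscall(__NR_perf_counter_open, &hw_event,
 *			 getpid(),	// pid: monitor this task
 *			 -1,		// cpu: any CPU (wildcard)
 *			 -1);		// group_fd: become a new group leader
 *	u64 value;
 *	read(fd, &value, sizeof(value));	// PERF_RECORD_SIMPLE read
 */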
/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	INIT_LIST_HEAD(&ctx->counter_list);
	ctx->task = task;
}
/*
 * inherit a counter from parent task to child task:
 */
static int
inherit_counter(struct perf_counter *parent_counter,
		struct task_struct *parent,
		struct perf_counter_context *parent_ctx,
		struct task_struct *child,
		struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, NULL,
					   GFP_ATOMIC);
	if (!child_counter)
		return -ENOMEM;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->ctx = child_ctx;
	child_counter->task = child;
	list_add_counter(child_counter, child_ctx);
	child_ctx->nr_counters++;

	child_counter->parent = parent_counter;
	parent_counter->nr_inherited++;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	return 0;
}
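
/*
 * With hw_event.inherit set, a counter opened on a parent task is cloned
 * into every child at fork time (see perf_counter_init_task() below) and
 * its value is folded back into the parent counter when the child exits
 * (see __perf_counter_exit_task()), so the parent's fd observes the whole
 * task hierarchy.
 */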
static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	u64 parent_val, child_val;
	u64 perf_flags;

	/*
	 * Disable and unlink this counter.
	 *
	 * Be careful about zapping the list - IRQ/NMI context
	 * could still be processing it:
	 */
	local_irq_disable();
	perf_flags = hw_perf_save_disable();

	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
	list_del_init(&child_counter->list_entry);

	hw_perf_restore(perf_flags);
	local_irq_enable();

	parent_counter = child_counter->parent;

	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (!parent_counter)
		return;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);

	fput(parent_counter->filp);

	kfree(child_counter);
}
/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we are running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}
/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter, *parent_counter;
	struct task_struct *parent = current;
	unsigned long flags;

	child_ctx = &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	spin_lock_irqsave(&parent_ctx->lock, flags);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit || counter->group_leader != counter)
			continue;

		/*
		 * Instead of creating recursive hierarchies of counters,
		 * we link inherited counters back to the original parent,
		 * which has a filp for sure, which we use as the reference
		 * count:
		 */
		parent_counter = counter;
		if (counter->parent)
			parent_counter = counter->parent;

		if (inherit_counter(parent_counter, parent,
				    parent_ctx, child, child_ctx))
			break;
	}

	spin_unlock_irqrestore(&parent_ctx->lock, flags);
}
static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup();
}
#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}
static void perf_counter_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif
static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);
static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}
static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}
static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);
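
/*
 * The two attributes above end up (assuming the usual sysdev class layout)
 * under /sys/devices/system/cpu/perf_counters/ as "reserve_percpu" and
 * "overcommit", giving the sysadmin runtime control over how many counters
 * are reserved for per-CPU use versus per-task use.
 */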