/*
 * Performance counter x86 architecture code
 *
 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_counter.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/sched.h>

#include <asm/intel_arch_perfmon.h>
#include <asm/apic.h>

static bool perf_counters_initialized __read_mostly;

/*
 * Number of (generic) HW counters:
 */
static int nr_hw_counters __read_mostly;
static u32 perf_counter_mask __read_mostly;

/* No support for fixed function counters yet */

#define MAX_HW_COUNTERS 8

struct cpu_hw_counters {
	struct perf_counter	*counters[MAX_HW_COUNTERS];
	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
	int			enable_all;
};

/*
 * Intel PerfMon v3. Used on Core2 and later.
 */
static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);

const int intel_perfmon_event_map[] =
{
	[PERF_COUNT_CYCLES]			= 0x003c,
	[PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
	[PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
	[PERF_COUNT_CACHE_MISSES]		= 0x412e,
	[PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
	[PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
};

const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
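
/*
 * Each map entry above is a raw EVENTSEL encoding of one of the Intel
 * architectural events: the low byte is the event select, the next byte
 * the unit mask. E.g. 0x412e is event 0x2e with umask 0x41 ("LLC Misses"),
 * while 0x4f2e is the same event with umask 0x4f ("LLC References").
 */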

/*
 * Setup the hardware configuration for a given hw_event_type
 */
int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
{
	struct hw_perf_counter *hwc = &counter->hw;

	if (unlikely(!perf_counters_initialized))
		return -EINVAL;

	/*
	 * Count user events, and generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * If privileged enough, count OS events too, and allow
	 * NMI events as well:
	 */
	hwc->nmi = 0;
	if (capable(CAP_SYS_ADMIN)) {
		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
		if (hw_event_type & PERF_COUNT_NMI)
			hwc->nmi = 1;
	}

	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;

	hwc->irq_period		= counter->__irq_period;
	/*
	 * Intel PMCs cannot be accessed sanely above 32-bit width,
	 * so we install an artificial 1<<31 period regardless of
	 * the generic counter period:
	 */
	if (!hwc->irq_period)
		hwc->irq_period = 0x7FFFFFFF;

	hwc->next_count = -((s32) hwc->irq_period);

	/*
	 * Negative event types mean raw encoded event+umask values:
	 */
	if (hw_event_type < 0) {
		counter->hw_event_type = -hw_event_type;
		counter->hw_event_type &= ~PERF_COUNT_NMI;
	} else {
		hw_event_type &= ~PERF_COUNT_NMI;
		if (hw_event_type >= max_intel_perfmon_events)
			return -EINVAL;
		/*
		 * The generic map:
		 */
		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
	}
	hwc->config |= counter->hw_event_type;
	counter->wakeup_pending = 0;

	return 0;
}
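
/*
 * Note on the counting scheme used throughout this file: the PMC is always
 * programmed with the negative period (hwc->next_count above, written to the
 * counter MSR in __hw_perf_counter_enable()). The counter then counts up
 * towards zero and raises a PMI once it overflows, i.e. after exactly
 * irq_period events. For example, with irq_period = 100000 the counter
 * starts at -100000 and overflows after 100000 events have been counted.
 */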

static void __hw_perf_enable_all(void)
{
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
}

void hw_perf_enable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	cpuc->enable_all = 1;
	__hw_perf_enable_all();
}

void hw_perf_disable_all(void)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);

	cpuc->enable_all = 0;
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
}
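
/*
 * MSR_CORE_PERF_GLOBAL_CTRL has one enable bit per generic counter, so
 * writing perf_counter_mask enables all of them and writing 0 disables them.
 * The per-cpu enable_all flag records which state was requested; the overflow
 * handler disables counters globally while it runs and consults this flag to
 * decide whether to re-enable them afterwards.
 */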

static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);

static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
{
	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;

	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
	wrmsr(hwc->config_base + idx, hwc->config, 0);
}

void hw_perf_counter_enable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	/* Try to get the previous counter again */
	if (test_and_set_bit(idx, cpuc->used)) {
		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
		set_bit(idx, cpuc->used);
		hwc->idx = idx;
	}

	perf_counters_lapic_init(hwc->nmi);

	wrmsr(hwc->config_base + idx,
	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);

	cpuc->counters[idx] = counter;
	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
	__hw_perf_counter_enable(hwc, idx);
}

#ifdef CONFIG_X86_64
static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
{
	atomic64_set(&counter->count, val);
}

static inline u64 atomic64_counter_read(struct perf_counter *counter)
{
	return atomic64_read(&counter->count);
}
#else
/*
 * Todo: add proper atomic64_t support to 32-bit x86:
 */
static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
{
	u32 *val32 = (void *)&val64;

	atomic_set(counter->count32 + 0, *(val32 + 0));
	atomic_set(counter->count32 + 1, *(val32 + 1));
}

static inline u64 atomic64_counter_read(struct perf_counter *counter)
{
	return atomic_read(counter->count32 + 0) |
		(u64) atomic_read(counter->count32 + 1) << 32;
}
#endif
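
/*
 * On 32-bit the two halves of the count are stored with separate atomic_set()
 * calls, so a concurrent atomic64_counter_read() can observe a torn value;
 * that is the limitation the Todo above refers to, and it is presumably
 * tolerable here because the value is only read as a snapshot.
 */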

static void __hw_perf_save_counter(struct perf_counter *counter,
				   struct hw_perf_counter *hwc, int idx)
{
	s64 raw = -1;
	s64 delta;
	int err;

	/*
	 * Get the raw hw counter value:
	 */
	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
	WARN_ON_ONCE(err);

	/*
	 * Rebase it to zero (it started counting at -irq_period),
	 * to see the delta since ->prev_count:
	 */
	delta = (s64)hwc->irq_period + (s64)(s32)raw;

	atomic64_counter_set(counter, hwc->prev_count + delta);

	/*
	 * Adjust the ->prev_count offset - if we went beyond
	 * irq_period of units, then we got an IRQ and the counter
	 * was set back to -irq_period:
	 */
	while (delta >= (s64)hwc->irq_period) {
		hwc->prev_count += hwc->irq_period;
		delta -= (s64)hwc->irq_period;
	}

	/*
	 * Calculate the next raw counter value we'll write into
	 * the counter at the next sched-in time:
	 */
	delta -= (s64)hwc->irq_period;

	hwc->next_count = (s32)delta;
}
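
/*
 * Worked example of the arithmetic above: with irq_period = 100000 the
 * counter was started at -100000. If the raw value now reads -30000, then
 * delta = 100000 + (-30000) = 70000 events occurred since the counter was
 * last programmed, and counter->count becomes prev_count + 70000. Since
 * delta < irq_period no overflow interrupt happened in between, and
 * next_count = 70000 - 100000 = -30000, so re-enabling the counter resumes
 * counting exactly where it stopped.
 */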

void perf_counter_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
	int cpu, err, idx;

	local_irq_disable();

	cpu = smp_processor_id();

	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
	WARN_ON_ONCE(err);

	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
	WARN_ON_ONCE(err);

	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
	WARN_ON_ONCE(err);

	printk(KERN_INFO "\n");
	printk(KERN_INFO "CPU#%d: ctrl:     %016llx\n", cpu, ctrl);
	printk(KERN_INFO "CPU#%d: status:   %016llx\n", cpu, status);
	printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow);

	for (idx = 0; idx < nr_hw_counters; idx++) {
		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
		WARN_ON_ONCE(err);

		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
		WARN_ON_ONCE(err);

		next_count = per_cpu(prev_next_count[idx], cpu);

		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
			cpu, idx, next_count);
	}
	local_irq_enable();
}

void hw_perf_counter_disable(struct perf_counter *counter)
{
	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
	struct hw_perf_counter *hwc = &counter->hw;
	unsigned int idx = hwc->idx;

	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
	wrmsr(hwc->config_base + idx, hwc->config, 0);

	clear_bit(idx, cpuc->used);
	cpuc->counters[idx] = NULL;
	__hw_perf_save_counter(counter, hwc, idx);
}
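
/*
 * Note that the PMC is stopped (enable bit cleared and written back) before
 * __hw_perf_save_counter() folds its final value into counter->count, so no
 * events are lost when a counter is scheduled out.
 */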

void hw_perf_counter_read(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	unsigned long addr = hwc->counter_base + hwc->idx;
	s64 offs, val = -1LL;
	s32 val32;
	int err;

	/* Careful: NMI might modify the counter offset */
	do {
		offs = hwc->prev_count;
		err = rdmsrl_safe(addr, &val);
		WARN_ON_ONCE(err);
	} while (offs != hwc->prev_count);

	val32 = (s32) val;
	val = (s64)hwc->irq_period + (s64)val32;
	atomic64_counter_set(counter, hwc->prev_count + val);
}
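
/*
 * perf_store_irq_data() below appends one u64 record to the counter's
 * current irqdata buffer; once fewer than sizeof(u64) bytes are left in
 * PERF_DATA_BUFLEN the record is dropped and accounted as an overrun instead.
 */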
static void perf_store_irq_data(struct perf_counter *counter, u64 data)
{
	struct perf_data *irqdata = counter->irqdata;

	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
		irqdata->overrun++;
	} else {
		u64 *p = (u64 *) &irqdata->data[irqdata->len];

		*p = data;
		irqdata->len += sizeof(u64);
	}
}

static void perf_save_and_restart(struct perf_counter *counter)
{
	struct hw_perf_counter *hwc = &counter->hw;
	int idx = hwc->idx;

	wrmsr(hwc->config_base + idx,
	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);

	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
		__hw_perf_save_counter(counter, hwc, idx);
		__hw_perf_counter_enable(hwc, idx);
	}
}
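
/*
 * A PERF_RECORD_GROUP leader records, for every other PERF_RECORD_SIMPLE
 * counter in its context, a (hw_event_type, count) pair into its irq data
 * when it overflows; perf_handle_group() below collects those values,
 * reading still-active counters from the hardware where needed.
 */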
static void
perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
{
	struct perf_counter_context *ctx = leader->ctx;
	struct perf_counter *counter;
	int bit;

	list_for_each_entry(counter, &ctx->counters, list) {
		if (counter->record_type != PERF_RECORD_SIMPLE ||
		    counter == leader)
			continue;

		if (counter->active) {
			/*
			 * When the counter was not in the overflow mask we
			 * have to read it from the hardware. We read it as
			 * well when it overflowed but has not been handled
			 * yet, and clear its bit in the status mask.
			 */
			bit = counter->hw.idx;
			if (!test_bit(bit, (unsigned long *) overflown) ||
			    test_bit(bit, (unsigned long *) status)) {
				clear_bit(bit, (unsigned long *) status);
				perf_save_and_restart(counter);
			}
		}
		perf_store_irq_data(leader, counter->hw_event_type);
		perf_store_irq_data(leader, atomic64_counter_read(counter));
	}
}

/*
 * This handler is triggered by the local APIC, so the APIC IRQ handling
 * rules apply:
 */
static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
{
	int bit, cpu = smp_processor_id();
	u64 ack, status;
	struct cpu_hw_counters *cpuc;

	/* Disable counters globally */
	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);

	cpuc = &per_cpu(cpu_hw_counters, cpu);

	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (!status)
		goto out;

again:
	ack = status;
	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
		struct perf_counter *counter = cpuc->counters[bit];

		clear_bit(bit, (unsigned long *) &status);
		if (!counter)
			continue;

		perf_save_and_restart(counter);

		switch (counter->record_type) {
		case PERF_RECORD_SIMPLE:
			continue;
		case PERF_RECORD_IRQ:
			perf_store_irq_data(counter, instruction_pointer(regs));
			break;
		case PERF_RECORD_GROUP:
			perf_store_irq_data(counter, counter->hw_event_type);
			perf_store_irq_data(counter,
					    atomic64_counter_read(counter));
			perf_handle_group(counter, &status, &ack);
			break;
		}
		/*
		 * From NMI context we cannot call into the scheduler to
		 * do a task wakeup - but we mark these counters as
		 * wakeup_pending and initiate a wakeup callback:
		 */
		if (nmi) {
			counter->wakeup_pending = 1;
			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
		} else {
			wake_up(&counter->waitq);
		}
	}

	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);

	/*
	 * Repeat if there is more work to be done:
	 */
	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
	if (status)
		goto again;
out:
	/*
	 * Do not re-enable when global enable is off:
	 */
	if (cpuc->enable_all)
		__hw_perf_enable_all();
}
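
/*
 * Acknowledgement protocol used above: the handled overflow bits are written
 * to MSR_CORE_PERF_GLOBAL_OVF_CTRL to clear them, and
 * MSR_CORE_PERF_GLOBAL_STATUS is then re-read so that overflows which
 * arrived while the handler was running are processed before the counters
 * are re-enabled.
 */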

void smp_perf_counter_interrupt(struct pt_regs *regs)
{
#ifdef CONFIG_X86_64
	add_pda(apic_perf_irqs, 1);
#else
	per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
#endif
	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	__smp_perf_counter_interrupt(regs, 0);
}

/*
 * This handler is triggered by NMI contexts:
 */
void perf_counter_notify(struct pt_regs *regs)
{
	struct cpu_hw_counters *cpuc;
	unsigned long flags;
	int bit, cpu;

	local_irq_save(flags);
	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_counters, cpu);

	for_each_bit(bit, cpuc->used, nr_hw_counters) {
		struct perf_counter *counter = cpuc->counters[bit];

		if (!counter)
			continue;

		if (counter->wakeup_pending) {
			counter->wakeup_pending = 0;
			wake_up(&counter->waitq);
		}
	}

	local_irq_restore(flags);
}

void __cpuinit perf_counters_lapic_init(int nmi)
{
	u32 apic_val;

	if (!perf_counters_initialized)
		return;
	/*
	 * Enable the performance counter vector in the APIC LVT:
	 */
	apic_val = apic_read(APIC_LVTERR);

	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
	if (nmi)
		apic_write(APIC_LVTPC, APIC_DM_NMI);
	else
		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
	apic_write(APIC_LVTERR, apic_val);
}

static int __kprobes
perf_counter_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	struct pt_regs *regs;

	if (likely(cmd != DIE_NMI_IPI))
		return NOTIFY_DONE;

	regs = args->regs;

	apic_write(APIC_LVTPC, APIC_DM_NMI);
	__smp_perf_counter_interrupt(regs, 1);

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
	.notifier_call		= perf_counter_nmi_handler
};
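
/*
 * CPUID leaf 0xa (read below into union cpuid10_eax) describes the
 * architectural PMU: EAX bits 7:0 hold the version id, bits 15:8 the number
 * of generic counters, bits 23:16 their bit width and bits 31:24 the length
 * of the architectural-event mask, which is how the code checks that the
 * Branch Misses Retired event (the last one used in intel_perfmon_event_map)
 * is enumerated.
 */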

void __init init_hw_perf_counters(void)
{
	union cpuid10_eax eax;
	unsigned int unused;
	unsigned int ebx;

	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
		return;

	/*
	 * Check whether the Architectural PerfMon supports
	 * the Branch Misses Retired event or not.
	 */
	cpuid(10, &(eax.full), &ebx, &unused, &unused);
	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
		return;

	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");

	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
	nr_hw_counters = eax.split.num_counters;
	if (nr_hw_counters > MAX_HW_COUNTERS) {
		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
			nr_hw_counters, MAX_HW_COUNTERS);
		nr_hw_counters = MAX_HW_COUNTERS;
	}
	perf_counter_mask = (1 << nr_hw_counters) - 1;
	perf_max_counters = nr_hw_counters;

	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);

	perf_counters_lapic_init(0);
	register_die_notifier(&perf_counter_nmi_notifier);

	perf_counters_initialized = true;
}