/*
 * Copyright 2014 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 *
 *
 * Perf_events support for Tile processor.
 *
 * This code is based upon the x86 perf event
 * code, which is:
 *
 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 * Copyright (C) 2009 Jaswinder Singh Rajput
 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 * Copyright (C) 2009 Google, Inc., Stephane Eranian
 */

#include <linux/kprobes.h>
#include <linux/kernel.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/perf_event.h>
#include <linux/atomic.h>
#include <asm/traps.h>
#include <asm/stack.h>
#include <asm/pmc.h>
#include <hv/hypervisor.h>

#define TILE_MAX_COUNTERS	4

#define PERF_COUNT_0_IDX	0
#define PERF_COUNT_1_IDX	1
#define AUX_PERF_COUNT_0_IDX	2
#define AUX_PERF_COUNT_1_IDX	3
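
/*
 * The first two indices select the base counters (SPR_PERF_COUNT_0/1);
 * the last two select the auxiliary counters (SPR_AUX_PERF_COUNT_0/1),
 * matching the switches in read_counter() and write_counter() below.
 */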

struct cpu_hw_events {
	int n_events;
	struct perf_event *events[TILE_MAX_COUNTERS];	  /* counter order */
	struct perf_event *event_list[TILE_MAX_COUNTERS]; /* enabled order */
	int assign[TILE_MAX_COUNTERS];
	unsigned long active_mask[BITS_TO_LONGS(TILE_MAX_COUNTERS)];
	unsigned long used_mask;
};

/* TILE arch specific performance monitor unit */
struct tile_pmu {
	const char *name;
	int version;
	const int *hw_events;	/* generic hw events table */
	/* generic hw cache events table */
	const int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
				 [PERF_COUNT_HW_CACHE_OP_MAX]
				 [PERF_COUNT_HW_CACHE_RESULT_MAX];
	int (*map_hw_event)(u64);	/* method used to map hw events */
	int (*map_cache_event)(u64);	/* method used to map cache events */

	u64 max_period;			/* max sampling period */
	u64 cntval_mask;		/* counter width mask */
	int cntval_bits;		/* counter width */
	int max_events;			/* max generic hw events in map */
	int num_counters;		/* number of base + aux counters */
	int num_base_counters;		/* number of base counters */
};

DEFINE_PER_CPU(u64, perf_irqs);
static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);

#define TILE_OP_UNSUPP		(-1)

#ifndef __tilegx__
/* TILEPro hardware events map */
static const int tile_hw_event_map[] = {
	[PERF_COUNT_HW_CPU_CYCLES]		= 0x01, /* ONE */
	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x06, /* MP_BUNDLE_RETIRED */
	[PERF_COUNT_HW_CACHE_REFERENCES]	= TILE_OP_UNSUPP,
	[PERF_COUNT_HW_CACHE_MISSES]		= TILE_OP_UNSUPP,
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x16, /* MP_CONDITIONAL_BRANCH_ISSUED */
	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x14, /* MP_CONDITIONAL_BRANCH_MISSPREDICT */
	[PERF_COUNT_HW_BUS_CYCLES]		= TILE_OP_UNSUPP,
};
#else
/* TILEGx hardware events map */
static const int tile_hw_event_map[] = {
	[PERF_COUNT_HW_CPU_CYCLES]		= 0x181, /* ONE */
	[PERF_COUNT_HW_INSTRUCTIONS]		= 0xdb,  /* INSTRUCTION_BUNDLE */
	[PERF_COUNT_HW_CACHE_REFERENCES]	= TILE_OP_UNSUPP,
	[PERF_COUNT_HW_CACHE_MISSES]		= TILE_OP_UNSUPP,
	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0xd9,  /* COND_BRANCH_PRED_CORRECT */
	[PERF_COUNT_HW_BRANCH_MISSES]		= 0xda,  /* COND_BRANCH_PRED_INCORRECT */
	[PERF_COUNT_HW_BUS_CYCLES]		= TILE_OP_UNSUPP,
};
#endif

#define C(x) PERF_COUNT_HW_CACHE_##x

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of -1 means
 * 'not supported', any other value means the
 * raw hw_event ID.
 */
#ifndef __tilegx__
/* TILEPro hardware cache event map */
static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
				     [PERF_COUNT_HW_CACHE_OP_MAX]
				     [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0x21, /* RD_MISS */
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0x22, /* WR_MISS */
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(L1I)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = 0x12, /* MP_ICACHE_HIT_ISSUED */
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(LL)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(DTLB)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = 0x1d, /* TLB_CNT */
		[C(RESULT_MISS)] = 0x20, /* TLB_EXCEPTION */
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(ITLB)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = 0x13, /* MP_ITLB_HIT_ISSUED */
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(BPU)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
};
#else
/* TILEGx hardware cache event map */
static const int tile_cache_event_map[PERF_COUNT_HW_CACHE_MAX]
				     [PERF_COUNT_HW_CACHE_OP_MAX]
				     [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
[C(L1D)] = {
	/*
	 * Like some other architectures (e.g. ARM), the performance
	 * counters don't differentiate between read and write
	 * accesses/misses, so this isn't strictly correct, but it's the
	 * best we can do. Writes and reads get combined.
	 */
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0x44, /* RD_MISS */
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0x45, /* WR_MISS */
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(L1I)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(LL)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(DTLB)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */
		[C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = 0x40, /* TLB_CNT */
		[C(RESULT_MISS)] = 0x43, /* TLB_EXCEPTION */
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(ITLB)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = 0xd4, /* ITLB_MISS_INT */
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
[C(BPU)] = {
	[C(OP_READ)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_WRITE)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
	[C(OP_PREFETCH)] = {
		[C(RESULT_ACCESS)] = TILE_OP_UNSUPP,
		[C(RESULT_MISS)] = TILE_OP_UNSUPP,
	},
},
};
#endif

static atomic_t tile_active_events;
static DEFINE_MUTEX(perf_intr_reserve_mutex);

static int tile_map_hw_event(u64 config);
static int tile_map_cache_event(u64 config);

static int tile_pmu_handle_irq(struct pt_regs *regs, int fault);

/*
 * To avoid new_raw_count getting larger than prev_raw_count
 * in tile_perf_event_update(), we limit the value of max_period
 * to 2^31 - 1.
 */
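/*
 * (A note on the cap, inferred from the code below: counters are
 * reloaded to 2^32 - left in tile_event_set_period(), so capping
 * left at 2^31 - 1 keeps the reloaded value's top bit set, and
 * tile_pmu_handle_irq() uses a cleared top bit to recognize counters
 * that have actually wrapped.)
 */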
static const struct tile_pmu tilepmu = {
#ifndef __tilegx__
	.name = "tilepro",
#else
	.name = "tilegx",
#endif
	.max_events = ARRAY_SIZE(tile_hw_event_map),
	.map_hw_event = tile_map_hw_event,
	.hw_events = tile_hw_event_map,
	.map_cache_event = tile_map_cache_event,
	.cache_events = &tile_cache_event_map,
	.cntval_bits = 32,
	.cntval_mask = (1ULL << 32) - 1,
	.max_period = (1ULL << 31) - 1,
	.num_counters = TILE_MAX_COUNTERS,
	.num_base_counters = TILE_BASE_COUNTERS,
};

static const struct tile_pmu *tile_pmu __read_mostly;

/*
 * Check whether perf event is enabled.
 */
int tile_perf_enabled(void)
{
	return atomic_read(&tile_active_events) != 0;
}

/*
 * Read Performance Counters.
 */
static inline u64 read_counter(int idx)
{
	u64 val = 0;

	/* __insn_mfspr() only takes an immediate argument */
	switch (idx) {
	case PERF_COUNT_0_IDX:
		val = __insn_mfspr(SPR_PERF_COUNT_0);
		break;
	case PERF_COUNT_1_IDX:
		val = __insn_mfspr(SPR_PERF_COUNT_1);
		break;
	case AUX_PERF_COUNT_0_IDX:
		val = __insn_mfspr(SPR_AUX_PERF_COUNT_0);
		break;
	case AUX_PERF_COUNT_1_IDX:
		val = __insn_mfspr(SPR_AUX_PERF_COUNT_1);
		break;
	default:
		WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX ||
			     idx < PERF_COUNT_0_IDX);
	}

	return val;
}

/*
 * Write Performance Counters.
 */
static inline void write_counter(int idx, u64 value)
{
	/* __insn_mtspr() only takes an immediate argument */
	switch (idx) {
	case PERF_COUNT_0_IDX:
		__insn_mtspr(SPR_PERF_COUNT_0, value);
		break;
	case PERF_COUNT_1_IDX:
		__insn_mtspr(SPR_PERF_COUNT_1, value);
		break;
	case AUX_PERF_COUNT_0_IDX:
		__insn_mtspr(SPR_AUX_PERF_COUNT_0, value);
		break;
	case AUX_PERF_COUNT_1_IDX:
		__insn_mtspr(SPR_AUX_PERF_COUNT_1, value);
		break;
	default:
		WARN_ON_ONCE(idx > AUX_PERF_COUNT_1_IDX ||
			     idx < PERF_COUNT_0_IDX);
	}
}

/*
 * Enable performance event by setting
 * Performance Counter Control registers.
 */
static inline void tile_pmu_enable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	unsigned long cfg, mask;
	int shift, idx = hwc->idx;

	/*
	 * prevent early activation from tile_pmu_start() in hw_perf_enable
	 */

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (idx < tile_pmu->num_base_counters)
		cfg = __insn_mfspr(SPR_PERF_COUNT_CTL);
	else
		cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL);

	switch (idx) {
	case PERF_COUNT_0_IDX:
	case AUX_PERF_COUNT_0_IDX:
		mask = TILE_EVENT_MASK;
		shift = 0;
		break;
	case PERF_COUNT_1_IDX:
	case AUX_PERF_COUNT_1_IDX:
		mask = TILE_EVENT_MASK << 16;
		shift = 16;
		break;
	default:
		WARN_ON_ONCE(idx < PERF_COUNT_0_IDX ||
			     idx > AUX_PERF_COUNT_1_IDX);
		return;
	}
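
	/*
	 * Each counter-control SPR packs the configuration for two
	 * counters: the first of the pair lives in the low 16 bits,
	 * the second in the high 16 bits (hence the shift of 16 above).
	 */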

	/* Clear mask bits to enable the event. */
	cfg &= ~mask;
	cfg |= hwc->config << shift;

	if (idx < tile_pmu->num_base_counters)
		__insn_mtspr(SPR_PERF_COUNT_CTL, cfg);
	else
		__insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg);
}

/*
 * Disable performance event by clearing
 * Performance Counter Control registers.
 */
static inline void tile_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	unsigned long cfg, mask;
	int idx = hwc->idx;

	if (idx == -1)
		return;

	if (idx < tile_pmu->num_base_counters)
		cfg = __insn_mfspr(SPR_PERF_COUNT_CTL);
	else
		cfg = __insn_mfspr(SPR_AUX_PERF_COUNT_CTL);

	switch (idx) {
	case PERF_COUNT_0_IDX:
	case AUX_PERF_COUNT_0_IDX:
		mask = TILE_PLM_MASK;
		break;
	case PERF_COUNT_1_IDX:
	case AUX_PERF_COUNT_1_IDX:
		mask = TILE_PLM_MASK << 16;
		break;
	default:
		WARN_ON_ONCE(idx < PERF_COUNT_0_IDX ||
			     idx > AUX_PERF_COUNT_1_IDX);
		return;
	}

	/* Set mask bits to disable the event. */
	cfg |= mask;

	if (idx < tile_pmu->num_base_counters)
		__insn_mtspr(SPR_PERF_COUNT_CTL, cfg);
	else
		__insn_mtspr(SPR_AUX_PERF_COUNT_CTL, cfg);
}

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the new raw counter value.
 */
static u64 tile_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - tile_pmu->cntval_bits;
	u64 prev_raw_count, new_raw_count;
	u64 oldval;
	int idx = hwc->idx;
	u64 delta;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	new_raw_count = read_counter(idx);

	oldval = local64_cmpxchg(&hwc->prev_count, prev_raw_count,
				 new_raw_count);
	if (oldval != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
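	/*
	 * Example, assuming cntval_bits == 32 so shift == 32: shifting
	 * both counts up by 32 and the difference back down discards the
	 * upper bits, so the subtraction is effectively done modulo
	 * 2^32 and a counter that wrapped past zero still yields a
	 * small, positive delta.
	 */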
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int tile_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	if (left > tile_pmu->max_period)
		left = tile_pmu->max_period;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

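	/*
	 * Example: with left == 1000 and 32-bit counters, this writes
	 * 0xfffffc18 (2^32 - 1000), so the counter overflows and raises
	 * the PMC interrupt after 1000 more events.
	 */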
	write_counter(idx, (u64)(-left) & tile_pmu->cntval_mask);

	perf_event_update_userpage(event);

	return ret;
}

/*
 * Stop the event but do not release the PMU counter
 */
static void tile_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int idx = hwc->idx;

	if (__test_and_clear_bit(idx, cpuc->active_mask)) {
		tile_pmu_disable_event(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		tile_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

/*
 * Start an event (without re-assigning counter)
 */
static void tile_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		tile_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);

	unmask_pmc_interrupts();

	tile_pmu_enable_event(event);

	perf_event_update_userpage(event);
}
645 | /* | |
646 | * Add a single event to the PMU. | |
647 | * | |
648 | * The event is added to the group of enabled events | |
649 | * but only if it can be scehduled with existing events. | |
650 | */ | |
651 | static int tile_pmu_add(struct perf_event *event, int flags) | |
652 | { | |
653 | struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); | |
654 | struct hw_perf_event *hwc; | |
655 | unsigned long mask; | |
656 | int b, max_cnt; | |
657 | ||
658 | hwc = &event->hw; | |
659 | ||
660 | /* | |
661 | * We are full. | |
662 | */ | |
663 | if (cpuc->n_events == tile_pmu->num_counters) | |
664 | return -ENOSPC; | |
665 | ||
666 | cpuc->event_list[cpuc->n_events] = event; | |
667 | cpuc->n_events++; | |
668 | ||
669 | hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; | |
670 | if (!(flags & PERF_EF_START)) | |
671 | hwc->state |= PERF_HES_ARCH; | |
672 | ||
673 | /* | |
674 | * Find first empty counter. | |
675 | */ | |
676 | max_cnt = tile_pmu->num_counters; | |
677 | mask = ~cpuc->used_mask; | |
678 | ||
679 | /* Find next free counter. */ | |
680 | b = find_next_bit(&mask, max_cnt, 0); | |
681 | ||
682 | /* Should not happen. */ | |
683 | if (WARN_ON_ONCE(b == max_cnt)) | |
684 | return -ENOSPC; | |
685 | ||
686 | /* | |
687 | * Assign counter to event. | |
688 | */ | |
689 | event->hw.idx = b; | |
690 | __set_bit(b, &cpuc->used_mask); | |
691 | ||
692 | /* | |
693 | * Start if requested. | |
694 | */ | |
695 | if (flags & PERF_EF_START) | |
696 | tile_pmu_start(event, PERF_EF_RELOAD); | |
697 | ||
698 | return 0; | |
699 | } | |

/*
 * Delete a single event from the PMU.
 *
 * The event is deleted from the group of enabled events.
 * If it is the last event, disable PMU interrupt.
 */
static void tile_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * Remove event from list, compact list if necessary.
	 */
	for (i = 0; i < cpuc->n_events; i++) {
		if (cpuc->event_list[i] == event) {
			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];
			--cpuc->n_events;
			cpuc->events[event->hw.idx] = NULL;
			__clear_bit(event->hw.idx, &cpuc->used_mask);
			tile_pmu_stop(event, PERF_EF_UPDATE);
			break;
		}
	}
	/*
	 * If there are no events left, then mask PMU interrupt.
	 */
	if (cpuc->n_events == 0)
		mask_pmc_interrupts();
	perf_event_update_userpage(event);
}

/*
 * Propagate event elapsed time into the event.
 */
static inline void tile_pmu_read(struct perf_event *event)
{
	tile_perf_event_update(event);
}

/*
 * Map generic events to Tile PMU.
 */
static int tile_map_hw_event(u64 config)
{
	if (config >= tile_pmu->max_events)
		return -EINVAL;
	return tile_pmu->hw_events[config];
}
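
/*
 * Example: PERF_COUNT_HW_CPU_CYCLES (config 0) maps through
 * tile_hw_event_map[] above to raw event 0x01 (ONE) on TILEPro and
 * 0x181 on TILEGx.
 */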

/*
 * Map generic hardware cache events to Tile PMU.
 */
static int tile_map_cache_event(u64 config)
{
	unsigned int cache_type, cache_op, cache_result;
	int code;

	if (!tile_pmu->cache_events)
		return -ENOENT;

	cache_type = (config >> 0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >> 8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	code = (*tile_pmu->cache_events)[cache_type][cache_op][cache_result];
	if (code == TILE_OP_UNSUPP)
		return -EINVAL;

	return code;
}
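
/*
 * Example: the config encodes type in bits 0-7, op in bits 8-15 and
 * result in bits 16-23, so an L1D read miss is
 * C(L1D) | (C(OP_READ) << 8) | (C(RESULT_MISS) << 16), which maps to
 * 0x21 (RD_MISS) on TILEPro and 0x44 on TILEGx.
 */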

static void tile_event_destroy(struct perf_event *event)
{
	if (atomic_dec_return(&tile_active_events) == 0)
		release_pmc_hardware();
}

static int __tile_event_init(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	int code;

	switch (attr->type) {
	case PERF_TYPE_HARDWARE:
		code = tile_pmu->map_hw_event(attr->config);
		break;
	case PERF_TYPE_HW_CACHE:
		code = tile_pmu->map_cache_event(attr->config);
		break;
	case PERF_TYPE_RAW:
		code = attr->config & TILE_EVENT_MASK;
		break;
	default:
		/* Should not happen. */
		return -EOPNOTSUPP;
	}

	if (code < 0)
		return code;

	hwc->config = code;
	hwc->idx = -1;

	if (attr->exclude_user)
		hwc->config |= TILE_CTL_EXCL_USER;

	if (attr->exclude_kernel)
		hwc->config |= TILE_CTL_EXCL_KERNEL;

	if (attr->exclude_hv)
		hwc->config |= TILE_CTL_EXCL_HV;

	if (!hwc->sample_period) {
		hwc->sample_period = tile_pmu->max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}
	event->destroy = tile_event_destroy;
	return 0;
}

static int tile_event_init(struct perf_event *event)
{
	int err = 0;
	perf_irq_t old_irq_handler = NULL;

	if (atomic_inc_return(&tile_active_events) == 1)
		old_irq_handler = reserve_pmc_hardware(tile_pmu_handle_irq);

	if (old_irq_handler) {
		pr_warn("PMC hardware busy (reserved by oprofile)\n");

		atomic_dec(&tile_active_events);
		return -EBUSY;
	}

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __tile_event_init(event);
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}
	return err;
}

static struct pmu tilera_pmu = {
	.event_init	= tile_event_init,
	.add		= tile_pmu_add,
	.del		= tile_pmu_del,

	.start		= tile_pmu_start,
	.stop		= tile_pmu_stop,

	.read		= tile_pmu_read,
};

/*
 * The PMU's IRQ handler. The PMU has two interrupts; they share this
 * handler.
 */
int tile_pmu_handle_irq(struct pt_regs *regs, int fault)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	u64 val;
	unsigned long status;
	int bit;

	__get_cpu_var(perf_irqs)++;

	if (!atomic_read(&tile_active_events))
		return 0;

	status = pmc_get_overflow();
	pmc_ack_overflow(status);

	for_each_set_bit(bit, &status, tile_pmu->num_counters) {

		event = cpuc->events[bit];

		if (!event)
			continue;

		if (!test_bit(bit, cpuc->active_mask))
			continue;

		hwc = &event->hw;

		val = tile_perf_event_update(event);
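		/*
		 * Counters are reloaded to 2^32 - left (with left capped
		 * at 2^31 - 1), so the top bit stays set until the
		 * counter wraps; a still-set top bit therefore appears
		 * to mean this counter has not overflowed yet.
		 */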
		if (val & (1ULL << (tile_pmu->cntval_bits - 1)))
			continue;

		perf_sample_data_init(&data, 0, event->hw.last_period);
		if (!tile_event_set_period(event))
			continue;

		if (perf_event_overflow(event, &data, regs))
			tile_pmu_stop(event, 0);
	}

	return 0;
}

static bool __init supported_pmu(void)
{
	tile_pmu = &tilepmu;
	return true;
}

int __init init_hw_perf_events(void)
{
	supported_pmu();
	perf_pmu_register(&tilera_pmu, "cpu", PERF_TYPE_RAW);
	return 0;
}
arch_initcall(init_hw_perf_events);

/* Callchain handling code. */

/*
 * Tile specific backtracing code for perf_events.
 */
static inline void perf_callchain(struct perf_callchain_entry *entry,
				  struct pt_regs *regs)
{
	struct KBacktraceIterator kbt;
	unsigned int i;

	/*
	 * Get the address just after the "jalr" instruction that
	 * jumps to the handler for a syscall. When we find this
	 * address in a backtrace, we silently ignore it, which gives
	 * us a one-step backtrace connection from the sys_xxx()
	 * function in the kernel to the xxx() function in libc.
	 * Otherwise, we lose the ability to properly attribute time
	 * from the libc calls to the kernel implementations, since
	 * oprofile only considers PCs from backtraces a pair at a time.
	 */
	unsigned long handle_syscall_pc = handle_syscall_link_address();

	KBacktraceIterator_init(&kbt, NULL, regs);
	kbt.profile = 1;

	/*
	 * The sample for the pc is already recorded. Now we are adding
	 * the addresses of the call sites on the stack. Our iterator
	 * starts with the frame of the (already sampled) call site. If
	 * our iterator contained a "return address" field, we could
	 * have just used it and wouldn't have needed to skip the first
	 * frame. That's in effect what the arm and x86 versions do.
	 * Instead we peel off the first iteration to get the equivalent
	 * behavior.
	 */

	if (KBacktraceIterator_end(&kbt))
		return;
	KBacktraceIterator_next(&kbt);

	/*
	 * Limit the stack depth to 16 frames for user and kernel space
	 * respectively, i.e. 32 frames in total.
	 */
	for (i = 0; i < 16; ++i) {
		unsigned long pc;
		if (KBacktraceIterator_end(&kbt))
			break;
		pc = kbt.it.pc;
		if (pc != handle_syscall_pc)
			perf_callchain_store(entry, pc);
		KBacktraceIterator_next(&kbt);
	}
}

void perf_callchain_user(struct perf_callchain_entry *entry,
			 struct pt_regs *regs)
{
	perf_callchain(entry, regs);
}

void perf_callchain_kernel(struct perf_callchain_entry *entry,
			   struct pt_regs *regs)
{
	perf_callchain(entry, regs);
}