Commit | Line | Data |
---|---|---|
4afbb24c MF |
1 | /* |
2 | * Intel Cache Quality-of-Service Monitoring (CQM) support. | |
3 | * | |
4 | * Based very, very heavily on work by Peter Zijlstra. | |
5 | */ | |
6 | ||
7 | #include <linux/perf_event.h> | |
8 | #include <linux/slab.h> | |
9 | #include <asm/cpu_device_id.h> | |
27f6d22b | 10 | #include "../perf_event.h" |
4afbb24c MF |
11 | |
12 | #define MSR_IA32_PQR_ASSOC 0x0c8f | |
13 | #define MSR_IA32_QM_CTR 0x0c8e | |
14 | #define MSR_IA32_QM_EVTSEL 0x0c8d | |
15 | ||
87f01cc2 | 16 | #define MBM_CNTR_WIDTH 24 |
e7ee3e8c VS |
17 | /* |
18 | * Guaranteed time in ms, as per the SDM, for which MBM counters will not overflow. | |
19 | */ | |
20 | #define MBM_CTR_OVERFLOW_TIME 1000 | |
87f01cc2 | 21 | |
adafa999 | 22 | static u32 cqm_max_rmid = -1; |
4afbb24c | 23 | static unsigned int cqm_l3_scale; /* supposedly cacheline size */ |
33c3cc7a | 24 | static bool cqm_enabled, mbm_enabled; |
e7ee3e8c | 25 | unsigned int mbm_socket_max; |
4afbb24c | 26 | |
bf926731 TG |
27 | /** |
28 | * struct intel_pqr_state - State cache for the PQR MSR | |
29 | * @rmid: The cached Resource Monitoring ID | |
30 | * @closid: The cached Class Of Service ID | |
31 | * @rmid_usecnt: The usage counter for rmid | |
32 | * | |
33 | * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the | |
34 | * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always | |
35 | * contains both parts, so we need to cache them. | |
36 | * | |
37 | * The cache also helps to avoid pointless updates if the value does | |
38 | * not change. | |
39 | */ | |
40 | struct intel_pqr_state { | |
b3df4ec4 | 41 | u32 rmid; |
bf926731 TG |
42 | u32 closid; |
43 | int rmid_usecnt; | |
4afbb24c MF |
44 | }; |
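/*
 * Illustrative note (not part of the original source): the cached values
 * are written back with wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid), i.e. the
 * RMID goes in the low 32 bits and the CLOSID in the high 32 bits, which
 * is how intel_cqm_event_start()/intel_cqm_event_stop() below use them.
 */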
45 | ||
9e7eaac9 | 46 | /* |
bf926731 | 47 | * The cached intel_pqr_state is strictly per CPU and can never be |
9e7eaac9 TG |
48 | * updated from a remote CPU. Both functions which modify the state |
49 | * (intel_cqm_event_start and intel_cqm_event_stop) are called with | |
50 | * interrupts disabled, which is sufficient for the protection. | |
51 | */ | |
bf926731 | 52 | static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); |
e7ee3e8c | 53 | static struct hrtimer *mbm_timers; |
33c3cc7a VS |
54 | /** |
55 | * struct sample - mbm event's (local or total) data | |
56 | * @total_bytes: #bytes since we began monitoring | |
57 | * @prev_msr: previous value of MSR | |
58 | */ | |
59 | struct sample { | |
60 | u64 total_bytes; | |
61 | u64 prev_msr; | |
62 | }; | |
63 | ||
64 | /* | |
65 | * samples profiled for total memory bandwidth type events | |
66 | */ | |
67 | static struct sample *mbm_total; | |
68 | /* | |
69 | * samples profiled for local memory bandwidth type events | |
70 | */ | |
71 | static struct sample *mbm_local; | |
4afbb24c | 72 | |
87f01cc2 TL |
73 | #define pkg_id topology_physical_package_id(smp_processor_id()) |
74 | /* | |
75 | * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array. | |
76 | * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of | |
77 | * rmids per socket, an example is given below | |
78 | * RMID1 of Socket0: vrmid = 1 | |
79 | * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1 | |
80 | * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1 | |
81 | */ | |
82 | #define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid) | |
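/*
 * Worked example (illustrative numbers, not part of the original source):
 * if cqm_max_rmid is 255 there are 256 RMIDs per socket, so RMID 5 read
 * on a CPU in package 2 indexes mbm_local[]/mbm_total[] at
 * 2 * 256 + 5 = 517.
 */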
4afbb24c | 83 | /* |
bff671db MF |
84 | * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru. |
85 | * Also protects event->hw.cqm_rmid | |
86 | * | |
87 | * Hold either for stability, both for modification of ->hw.cqm_rmid. | |
4afbb24c MF |
88 | */ |
89 | static DEFINE_MUTEX(cache_mutex); | |
bff671db | 90 | static DEFINE_RAW_SPINLOCK(cache_lock); |
4afbb24c MF |
91 | |
92 | /* | |
93 | * Groups of events that have the same target(s), one RMID per group. | |
94 | */ | |
95 | static LIST_HEAD(cache_groups); | |
96 | ||
97 | /* | |
98 | * Mask of CPUs for reading CQM values. We only need one CPU per socket. | |
99 | */ | |
100 | static cpumask_t cqm_cpumask; | |
101 | ||
102 | #define RMID_VAL_ERROR (1ULL << 63) | |
103 | #define RMID_VAL_UNAVAIL (1ULL << 62) | |
104 | ||
87f01cc2 TL |
105 | /* |
106 | * Event IDs are used to program IA32_QM_EVTSEL before reading event | |
107 | * counter from IA32_QM_CTR | |
108 | */ | |
109 | #define QOS_L3_OCCUP_EVENT_ID 0x01 | |
110 | #define QOS_MBM_TOTAL_EVENT_ID 0x02 | |
111 | #define QOS_MBM_LOCAL_EVENT_ID 0x03 | |
4afbb24c | 112 | |
bff671db MF |
113 | /* |
114 | * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). | |
115 | * | |
116 | * This rmid is always free and is guaranteed to have an associated | |
117 | * near-zero occupancy value, i.e. no cachelines are tagged with this | |
118 | * RMID, once __intel_cqm_rmid_rotate() returns. | |
119 | */ | |
adafa999 | 120 | static u32 intel_cqm_rotation_rmid; |
bff671db MF |
121 | |
122 | #define INVALID_RMID (-1) | |
123 | ||
124 | /* | |
125 | * Is @rmid valid for programming the hardware? | |
126 | * | |
127 | * rmid 0 is reserved by the hardware for all non-monitored tasks, which | |
128 | * means that we should never come across an rmid with that value. | |
129 | * Likewise, an rmid value of -1 is used to indicate "no rmid currently | |
130 | * assigned" and is used as part of the rotation code. | |
131 | */ | |
adafa999 | 132 | static inline bool __rmid_valid(u32 rmid) |
bff671db MF |
133 | { |
134 | if (!rmid || rmid == INVALID_RMID) | |
135 | return false; | |
136 | ||
137 | return true; | |
138 | } | |
139 | ||
adafa999 | 140 | static u64 __rmid_read(u32 rmid) |
4afbb24c MF |
141 | { |
142 | u64 val; | |
143 | ||
144 | /* | |
145 | * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, | |
146 | * it just says that to increase confusion. | |
147 | */ | |
148 | wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); | |
149 | rdmsrl(MSR_IA32_QM_CTR, val); | |
150 | ||
151 | /* | |
152 | * Aside from the ERROR and UNAVAIL bits, assume this thing returns | |
153 | * the number of cachelines tagged with @rmid. | |
154 | */ | |
155 | return val; | |
156 | } | |
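/*
 * Sketch of the expected caller pattern (mirrors intel_cqm_event_read()
 * and __intel_cqm_event_count() below); the raw value is only meaningful
 * when neither error bit is set:
 *
 *	u64 val = __rmid_read(rmid);
 *	if (!(val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
 *		local64_set(&event->count, val);
 */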
157 | ||
bff671db MF |
158 | enum rmid_recycle_state { |
159 | RMID_YOUNG = 0, | |
160 | RMID_AVAILABLE, | |
161 | RMID_DIRTY, | |
162 | }; | |
163 | ||
35298e55 | 164 | struct cqm_rmid_entry { |
adafa999 | 165 | u32 rmid; |
bff671db | 166 | enum rmid_recycle_state state; |
35298e55 | 167 | struct list_head list; |
bff671db | 168 | unsigned long queue_time; |
35298e55 MF |
169 | }; |
170 | ||
171 | /* | |
bff671db | 172 | * cqm_rmid_free_lru - A least recently used list of RMIDs. |
35298e55 MF |
173 | * |
174 | * Oldest entry at the head, newest (most recently used) entry at the | |
175 | * tail. This list is never traversed, it's only used to keep track of | |
176 | * the lru order. That is, we only pick entries off the head or insert | |
177 | * them on the tail. | |
178 | * | |
179 | * All entries on the list are 'free', and their RMIDs are not currently | |
180 | * in use. To mark an RMID as in use, remove its entry from the lru | |
181 | * list. | |
182 | * | |
bff671db MF |
183 | * |
184 | * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. | |
185 | * | |
186 | * This list contains RMIDs that no one is currently using but that | |
187 | * may have a non-zero occupancy value associated with them. The | |
188 | * rotation worker moves RMIDs from the limbo list to the free list once | |
189 | * the occupancy value drops below __intel_cqm_threshold. | |
190 | * | |
191 | * Both lists are protected by cache_mutex. | |
35298e55 | 192 | */ |
bff671db MF |
193 | static LIST_HEAD(cqm_rmid_free_lru); |
194 | static LIST_HEAD(cqm_rmid_limbo_lru); | |
35298e55 MF |
195 | |
196 | /* | |
197 | * We use a simple array of pointers so that we can lookup a struct | |
198 | * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() | |
199 | * and __put_rmid() from having to worry about dealing with struct | |
200 | * cqm_rmid_entry - they just deal with rmids, i.e. integers. | |
201 | * | |
202 | * Once this array is initialized it is read-only. No locks are required | |
203 | * to access it. | |
204 | * | |
205 | * All entries for all RMIDs can be looked up in this array at all | |
206 | * times. | |
207 | */ | |
208 | static struct cqm_rmid_entry **cqm_rmid_ptrs; | |
209 | ||
adafa999 | 210 | static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) |
35298e55 MF |
211 | { |
212 | struct cqm_rmid_entry *entry; | |
213 | ||
214 | entry = cqm_rmid_ptrs[rmid]; | |
215 | WARN_ON(entry->rmid != rmid); | |
216 | ||
217 | return entry; | |
218 | } | |
4afbb24c MF |
219 | |
220 | /* | |
221 | * Returns INVALID_RMID if no RMID is available. |
35298e55 MF |
222 | * |
223 | * We expect to be called with cache_mutex held. | |
4afbb24c | 224 | */ |
adafa999 | 225 | static u32 __get_rmid(void) |
4afbb24c | 226 | { |
35298e55 MF |
227 | struct cqm_rmid_entry *entry; |
228 | ||
229 | lockdep_assert_held(&cache_mutex); | |
230 | ||
bff671db MF |
231 | if (list_empty(&cqm_rmid_free_lru)) |
232 | return INVALID_RMID; | |
35298e55 | 233 | |
bff671db | 234 | entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); |
35298e55 MF |
235 | list_del(&entry->list); |
236 | ||
237 | return entry->rmid; | |
4afbb24c MF |
238 | } |
239 | ||
adafa999 | 240 | static void __put_rmid(u32 rmid) |
4afbb24c | 241 | { |
35298e55 MF |
242 | struct cqm_rmid_entry *entry; |
243 | ||
244 | lockdep_assert_held(&cache_mutex); | |
245 | ||
bff671db | 246 | WARN_ON(!__rmid_valid(rmid)); |
35298e55 MF |
247 | entry = __rmid_entry(rmid); |
248 | ||
bff671db MF |
249 | entry->queue_time = jiffies; |
250 | entry->state = RMID_YOUNG; | |
251 | ||
252 | list_add_tail(&entry->list, &cqm_rmid_limbo_lru); | |
4afbb24c MF |
253 | } |
254 | ||
ada2f634 VS |
255 | static void cqm_cleanup(void) |
256 | { | |
257 | int i; | |
258 | ||
259 | if (!cqm_rmid_ptrs) | |
260 | return; | |
261 | ||
262 | for (i = 0; i <= cqm_max_rmid; i++) | |
263 | kfree(cqm_rmid_ptrs[i]); | |
264 | ||
265 | kfree(cqm_rmid_ptrs); | |
266 | cqm_rmid_ptrs = NULL; | |
33c3cc7a | 267 | cqm_enabled = false; |
ada2f634 VS |
268 | } |
269 | ||
4afbb24c MF |
270 | static int intel_cqm_setup_rmid_cache(void) |
271 | { | |
35298e55 | 272 | struct cqm_rmid_entry *entry; |
bff671db MF |
273 | unsigned int nr_rmids; |
274 | int r = 0; | |
35298e55 | 275 | |
bff671db | 276 | nr_rmids = cqm_max_rmid + 1; |
ada2f634 | 277 | cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) * |
bff671db | 278 | nr_rmids, GFP_KERNEL); |
35298e55 | 279 | if (!cqm_rmid_ptrs) |
4afbb24c MF |
280 | return -ENOMEM; |
281 | ||
bff671db | 282 | for (; r <= cqm_max_rmid; r++) { |
35298e55 MF |
283 | struct cqm_rmid_entry *entry; |
284 | ||
285 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | |
286 | if (!entry) | |
287 | goto fail; | |
288 | ||
289 | INIT_LIST_HEAD(&entry->list); | |
290 | entry->rmid = r; | |
291 | cqm_rmid_ptrs[r] = entry; | |
292 | ||
bff671db | 293 | list_add_tail(&entry->list, &cqm_rmid_free_lru); |
35298e55 | 294 | } |
4afbb24c MF |
295 | |
296 | /* | |
297 | * RMID 0 is special and is always allocated. It's used for all | |
298 | * tasks that are not monitored. | |
299 | */ | |
35298e55 MF |
300 | entry = __rmid_entry(0); |
301 | list_del(&entry->list); | |
4afbb24c | 302 | |
bff671db MF |
303 | mutex_lock(&cache_mutex); |
304 | intel_cqm_rotation_rmid = __get_rmid(); | |
305 | mutex_unlock(&cache_mutex); | |
306 | ||
4afbb24c | 307 | return 0; |
35298e55 | 308 | |
ada2f634 VS |
309 | fail: |
310 | cqm_cleanup(); | |
35298e55 | 311 | return -ENOMEM; |
4afbb24c MF |
312 | } |
313 | ||
314 | /* | |
315 | * Determine if @a and @b measure the same set of tasks. | |
bfe1fcd2 MF |
316 | * |
317 | * If @a and @b measure the same set of tasks then we want to share a | |
318 | * single RMID. | |
4afbb24c MF |
319 | */ |
320 | static bool __match_event(struct perf_event *a, struct perf_event *b) | |
321 | { | |
bfe1fcd2 | 322 | /* Per-cpu and task events don't mix */ |
4afbb24c MF |
323 | if ((a->attach_state & PERF_ATTACH_TASK) != |
324 | (b->attach_state & PERF_ATTACH_TASK)) | |
325 | return false; | |
326 | ||
bfe1fcd2 MF |
327 | #ifdef CONFIG_CGROUP_PERF |
328 | if (a->cgrp != b->cgrp) | |
329 | return false; | |
330 | #endif | |
331 | ||
332 | /* If this is not a task event, we're machine wide */ | |
333 | if (!(b->attach_state & PERF_ATTACH_TASK)) | |
334 | return true; | |
335 | ||
336 | /* | |
337 | * Events that target same task are placed into the same cache group. | |
a223c1c7 VS |
338 | * Mark it as a multi event group, so that we update ->count |
339 | * for every event rather than just the group leader later. | |
bfe1fcd2 | 340 | */ |
a223c1c7 VS |
341 | if (a->hw.target == b->hw.target) { |
342 | b->hw.is_group_event = true; | |
bfe1fcd2 | 343 | return true; |
a223c1c7 | 344 | } |
bfe1fcd2 MF |
345 | |
346 | /* | |
347 | * Are we an inherited event? | |
348 | */ | |
349 | if (b->parent == a) | |
350 | return true; | |
351 | ||
352 | return false; | |
353 | } | |
354 | ||
355 | #ifdef CONFIG_CGROUP_PERF | |
356 | static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) | |
357 | { | |
358 | if (event->attach_state & PERF_ATTACH_TASK) | |
614e4c4e | 359 | return perf_cgroup_from_task(event->hw.target, event->ctx); |
4afbb24c | 360 | |
bfe1fcd2 | 361 | return event->cgrp; |
4afbb24c | 362 | } |
bfe1fcd2 | 363 | #endif |
4afbb24c MF |
364 | |
365 | /* | |
366 | * Determine if @a's tasks intersect with @b's tasks | |
bfe1fcd2 MF |
367 | * |
368 | * There are combinations of events that we explicitly prohibit, | |
369 | * | |
370 | * PROHIBITS | |
371 | * system-wide -> cgroup and task | |
372 | * cgroup -> system-wide | |
373 | * -> task in cgroup | |
374 | * task -> system-wide | |
375 | * -> task in cgroup | |
376 | * | |
377 | * Call this function before allocating an RMID. | |
4afbb24c MF |
378 | */ |
379 | static bool __conflict_event(struct perf_event *a, struct perf_event *b) | |
380 | { | |
bfe1fcd2 MF |
381 | #ifdef CONFIG_CGROUP_PERF |
382 | /* | |
383 | * We can have any number of cgroups but only one system-wide | |
384 | * event at a time. | |
385 | */ | |
386 | if (a->cgrp && b->cgrp) { | |
387 | struct perf_cgroup *ac = a->cgrp; | |
388 | struct perf_cgroup *bc = b->cgrp; | |
389 | ||
390 | /* | |
391 | * This condition should have been caught in | |
392 | * __match_event() and we should be sharing an RMID. | |
393 | */ | |
394 | WARN_ON_ONCE(ac == bc); | |
395 | ||
396 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | |
397 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | |
398 | return true; | |
399 | ||
400 | return false; | |
401 | } | |
402 | ||
403 | if (a->cgrp || b->cgrp) { | |
404 | struct perf_cgroup *ac, *bc; | |
405 | ||
406 | /* | |
407 | * cgroup and system-wide events are mutually exclusive | |
408 | */ | |
409 | if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || | |
410 | (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) | |
411 | return true; | |
412 | ||
413 | /* | |
414 | * Ensure neither event is part of the other's cgroup | |
415 | */ | |
416 | ac = event_to_cgroup(a); | |
417 | bc = event_to_cgroup(b); | |
418 | if (ac == bc) | |
419 | return true; | |
420 | ||
421 | /* | |
422 | * Must have cgroup and non-intersecting task events. | |
423 | */ | |
424 | if (!ac || !bc) | |
425 | return false; | |
426 | ||
427 | /* | |
428 | * We have cgroup and task events, and the task belongs | |
429 | * to a cgroup. Check for overlap. | |
430 | */ | |
431 | if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || | |
432 | cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) | |
433 | return true; | |
434 | ||
435 | return false; | |
436 | } | |
437 | #endif | |
4afbb24c MF |
438 | /* |
439 | * If one of them is not a task, same story as above with cgroups. | |
440 | */ | |
441 | if (!(a->attach_state & PERF_ATTACH_TASK) || | |
442 | !(b->attach_state & PERF_ATTACH_TASK)) | |
443 | return true; | |
444 | ||
445 | /* | |
446 | * Must be non-overlapping. | |
447 | */ | |
448 | return false; | |
449 | } | |
450 | ||
bff671db | 451 | struct rmid_read { |
adafa999 | 452 | u32 rmid; |
87f01cc2 | 453 | u32 evt_type; |
bff671db MF |
454 | atomic64_t value; |
455 | }; | |
456 | ||
457 | static void __intel_cqm_event_count(void *info); | |
87f01cc2 | 458 | static void init_mbm_sample(u32 rmid, u32 evt_type); |
2d4de837 | 459 | static void __intel_mbm_event_count(void *info); |
87f01cc2 TL |
460 | |
461 | static bool is_mbm_event(int e) | |
462 | { | |
463 | return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); | |
464 | } | |
bff671db | 465 | |
27348f38 PZ |
466 | static void cqm_mask_call(struct rmid_read *rr) |
467 | { | |
468 | if (is_mbm_event(rr->evt_type)) | |
469 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1); | |
470 | else | |
471 | on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1); | |
472 | } | |
473 | ||
bff671db MF |
474 | /* |
475 | * Exchange the RMID of a group of events. | |
476 | */ | |
adafa999 | 477 | static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) |
bff671db MF |
478 | { |
479 | struct perf_event *event; | |
bff671db | 480 | struct list_head *head = &group->hw.cqm_group_entry; |
adafa999 | 481 | u32 old_rmid = group->hw.cqm_rmid; |
bff671db MF |
482 | |
483 | lockdep_assert_held(&cache_mutex); | |
484 | ||
485 | /* | |
486 | * If our RMID is being deallocated, perform a read now. | |
487 | */ | |
488 | if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { | |
489 | struct rmid_read rr = { | |
bff671db | 490 | .rmid = old_rmid, |
27348f38 PZ |
491 | .evt_type = group->attr.config, |
492 | .value = ATOMIC64_INIT(0), | |
bff671db MF |
493 | }; |
494 | ||
27348f38 | 495 | cqm_mask_call(&rr); |
bff671db MF |
496 | local64_set(&group->count, atomic64_read(&rr.value)); |
497 | } | |
498 | ||
499 | raw_spin_lock_irq(&cache_lock); | |
500 | ||
501 | group->hw.cqm_rmid = rmid; | |
502 | list_for_each_entry(event, head, hw.cqm_group_entry) | |
503 | event->hw.cqm_rmid = rmid; | |
504 | ||
505 | raw_spin_unlock_irq(&cache_lock); | |
506 | ||
2d4de837 VS |
507 | /* |
508 | * If the allocation is for mbm, init the mbm stats. | |
509 | * Need to check if each event in the group is an mbm event | |
510 | * because there could be multiple types of events in the same group. | |
511 | */ | |
512 | if (__rmid_valid(rmid)) { | |
513 | event = group; | |
514 | if (is_mbm_event(event->attr.config)) | |
515 | init_mbm_sample(rmid, event->attr.config); | |
516 | ||
517 | list_for_each_entry(event, head, hw.cqm_group_entry) { | |
518 | if (is_mbm_event(event->attr.config)) | |
519 | init_mbm_sample(rmid, event->attr.config); | |
520 | } | |
521 | } | |
522 | ||
bff671db MF |
523 | return old_rmid; |
524 | } | |
525 | ||
526 | /* | |
527 | * If we fail to assign a new RMID for intel_cqm_rotation_rmid because | |
528 | * cachelines are still tagged with RMIDs in limbo, we progressively | |
529 | * increment the threshold until we find an RMID in limbo with <= | |
530 | * __intel_cqm_threshold lines tagged. This is designed to mitigate the | |
531 | * problem where cachelines tagged with an RMID are not steadily being | |
532 | * evicted. | |
533 | * | |
534 | * On successful rotations we decrease the threshold back towards zero. | |
535 | * | |
536 | * __intel_cqm_max_threshold provides an upper bound on the threshold, | |
537 | * and is measured in bytes because it's exposed to userland. | |
538 | */ | |
539 | static unsigned int __intel_cqm_threshold; | |
540 | static unsigned int __intel_cqm_max_threshold; | |
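/*
 * Unit note with a hypothetical example (not part of the original
 * source): __intel_cqm_max_threshold is in bytes, __intel_cqm_threshold
 * is in cqm_l3_scale units (see threshold_limit in
 * __intel_cqm_rmid_rotate() and max_recycle_threshold_store()). If
 * cqm_l3_scale were 64, a 16384-byte maximum would cap the threshold at
 * 16384 / 64 = 256.
 */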
541 | ||
542 | /* | |
543 | * Test whether an RMID has a zero occupancy value on this cpu. | |
544 | */ | |
545 | static void intel_cqm_stable(void *arg) | |
546 | { | |
547 | struct cqm_rmid_entry *entry; | |
548 | ||
549 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | |
550 | if (entry->state != RMID_AVAILABLE) | |
551 | break; | |
552 | ||
553 | if (__rmid_read(entry->rmid) > __intel_cqm_threshold) | |
554 | entry->state = RMID_DIRTY; | |
555 | } | |
556 | } | |
557 | ||
558 | /* | |
559 | * If we have group events waiting for an RMID that don't conflict with | |
560 | * events already running, assign @rmid. | |
561 | */ | |
adafa999 | 562 | static bool intel_cqm_sched_in_event(u32 rmid) |
bff671db MF |
563 | { |
564 | struct perf_event *leader, *event; | |
565 | ||
566 | lockdep_assert_held(&cache_mutex); | |
567 | ||
568 | leader = list_first_entry(&cache_groups, struct perf_event, | |
569 | hw.cqm_groups_entry); | |
570 | event = leader; | |
571 | ||
572 | list_for_each_entry_continue(event, &cache_groups, | |
573 | hw.cqm_groups_entry) { | |
574 | if (__rmid_valid(event->hw.cqm_rmid)) | |
575 | continue; | |
576 | ||
577 | if (__conflict_event(event, leader)) | |
578 | continue; | |
579 | ||
580 | intel_cqm_xchg_rmid(event, rmid); | |
581 | return true; | |
582 | } | |
583 | ||
584 | return false; | |
585 | } | |
586 | ||
587 | /* | |
588 | * Initially use this constant for both the limbo queue time and the | |
589 | * rotation timer interval, pmu::hrtimer_interval_ms. | |
590 | * | |
591 | * They don't need to be the same, but the two are related since if you | |
592 | * rotate faster than you recycle RMIDs, you may run out of available | |
593 | * RMIDs. | |
594 | */ | |
595 | #define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ | |
596 | ||
597 | static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; | |
598 | ||
599 | /* | |
600 | * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list | |
601 | * @nr_available: number of freeable RMIDs on the limbo list | |
602 | * | |
603 | * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no | |
604 | * cachelines are tagged with those RMIDs. After this we can reuse them | |
605 | * and know that the current set of active RMIDs is stable. | |
606 | * | |
607 | * Return %true or %false depending on whether stabilization needs to be | |
608 | * reattempted. | |
609 | * | |
610 | * If we return %true then @nr_available is updated to indicate the | |
611 | * number of RMIDs on the limbo list that have been queued for the | |
612 | * minimum queue time (RMID_AVAILABLE), but whose data occupancy values | |
613 | * are above __intel_cqm_threshold. | |
614 | */ | |
615 | static bool intel_cqm_rmid_stabilize(unsigned int *available) | |
616 | { | |
617 | struct cqm_rmid_entry *entry, *tmp; | |
bff671db MF |
618 | |
619 | lockdep_assert_held(&cache_mutex); | |
620 | ||
621 | *available = 0; | |
622 | list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { | |
623 | unsigned long min_queue_time; | |
624 | unsigned long now = jiffies; | |
625 | ||
626 | /* | |
627 | * We hold RMIDs placed into limbo for a minimum queue | |
628 | * time. Before the minimum queue time has elapsed we do | |
629 | * not recycle RMIDs. | |
630 | * | |
631 | * The reasoning is that until a sufficient time has | |
632 | * passed since we stopped using an RMID, any RMID | |
633 | * placed onto the limbo list will likely still have | |
634 | * data tagged in the cache, which means we'll probably | |
635 | * fail to recycle it anyway. | |
636 | * | |
637 | * We can save ourselves an expensive IPI by skipping | |
638 | * any RMIDs that have not been queued for the minimum | |
639 | * time. | |
640 | */ | |
641 | min_queue_time = entry->queue_time + | |
642 | msecs_to_jiffies(__rmid_queue_time_ms); | |
643 | ||
644 | if (time_after(min_queue_time, now)) | |
645 | break; | |
646 | ||
647 | entry->state = RMID_AVAILABLE; | |
648 | (*available)++; | |
649 | } | |
650 | ||
651 | /* | |
652 | * Fast return if none of the RMIDs on the limbo list have been | |
653 | * sitting on the queue for the minimum queue time. | |
654 | */ | |
655 | if (!*available) | |
656 | return false; | |
657 | ||
658 | /* | |
659 | * Test whether an RMID is free for each package. | |
660 | */ | |
661 | on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); | |
662 | ||
663 | list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { | |
664 | /* | |
665 | * Exhausted all RMIDs that have waited min queue time. | |
666 | */ | |
667 | if (entry->state == RMID_YOUNG) | |
668 | break; | |
669 | ||
670 | if (entry->state == RMID_DIRTY) | |
671 | continue; | |
672 | ||
673 | list_del(&entry->list); /* remove from limbo */ | |
674 | ||
675 | /* | |
676 | * The rotation RMID gets priority if it's | |
677 | * currently invalid. In which case, skip adding | |
678 | * the RMID to the free lru. | |
679 | */ | |
680 | if (!__rmid_valid(intel_cqm_rotation_rmid)) { | |
681 | intel_cqm_rotation_rmid = entry->rmid; | |
682 | continue; | |
683 | } | |
684 | ||
685 | /* | |
686 | * If we have groups waiting for RMIDs, hand | |
59bf7fd4 | 687 | * them one now provided they don't conflict. |
bff671db | 688 | */ |
59bf7fd4 | 689 | if (intel_cqm_sched_in_event(entry->rmid)) |
bff671db MF |
690 | continue; |
691 | ||
692 | /* | |
693 | * Otherwise place it onto the free list. | |
694 | */ | |
695 | list_add_tail(&entry->list, &cqm_rmid_free_lru); | |
696 | } | |
697 | ||
698 | ||
699 | return __rmid_valid(intel_cqm_rotation_rmid); | |
700 | } | |
701 | ||
702 | /* | |
703 | * Pick a victim group and move it to the tail of the group list. | |
59bf7fd4 | 704 | * @next: The first group without an RMID |
bff671db | 705 | */ |
59bf7fd4 | 706 | static void __intel_cqm_pick_and_rotate(struct perf_event *next) |
bff671db MF |
707 | { |
708 | struct perf_event *rotor; | |
adafa999 | 709 | u32 rmid; |
bff671db MF |
710 | |
711 | lockdep_assert_held(&cache_mutex); | |
bff671db MF |
712 | |
713 | rotor = list_first_entry(&cache_groups, struct perf_event, | |
714 | hw.cqm_groups_entry); | |
59bf7fd4 MF |
715 | |
716 | /* | |
717 | * The group at the front of the list should always have a valid | |
718 | * RMID. If it doesn't then no groups have RMIDs assigned and we | |
719 | * don't need to rotate the list. | |
720 | */ | |
721 | if (next == rotor) | |
722 | return; | |
723 | ||
724 | rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); | |
725 | __put_rmid(rmid); | |
726 | ||
bff671db | 727 | list_rotate_left(&cache_groups); |
59bf7fd4 MF |
728 | } |
729 | ||
730 | /* | |
731 | * Deallocate the RMIDs from any events that conflict with @event, and | |
732 | * place them on the back of the group list. | |
733 | */ | |
734 | static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) | |
735 | { | |
736 | struct perf_event *group, *g; | |
adafa999 | 737 | u32 rmid; |
59bf7fd4 MF |
738 | |
739 | lockdep_assert_held(&cache_mutex); | |
740 | ||
741 | list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { | |
742 | if (group == event) | |
743 | continue; | |
744 | ||
745 | rmid = group->hw.cqm_rmid; | |
746 | ||
747 | /* | |
748 | * Skip events that don't have a valid RMID. | |
749 | */ | |
750 | if (!__rmid_valid(rmid)) | |
751 | continue; | |
752 | ||
753 | /* | |
754 | * No conflict? No problem! Leave the event alone. | |
755 | */ | |
756 | if (!__conflict_event(group, event)) | |
757 | continue; | |
bff671db | 758 | |
59bf7fd4 MF |
759 | intel_cqm_xchg_rmid(group, INVALID_RMID); |
760 | __put_rmid(rmid); | |
761 | } | |
bff671db MF |
762 | } |
763 | ||
764 | /* | |
765 | * Attempt to rotate the groups and assign new RMIDs. | |
766 | * | |
59bf7fd4 MF |
767 | * We rotate for two reasons, |
768 | * 1. To handle the scheduling of conflicting events | |
769 | * 2. To recycle RMIDs | |
770 | * | |
bff671db MF |
771 | * Rotating RMIDs is complicated because the hardware doesn't give us |
772 | * any clues. | |
773 | * | |
774 | * There are problems with the hardware interface; when you change the | |
775 | * task:RMID map, cachelines retain their 'old' tags, giving a skewed | |
776 | * picture. In order to work around this, we must always keep one free | |
777 | * RMID - intel_cqm_rotation_rmid. | |
778 | * | |
779 | * Rotation works by taking away an RMID from a group (the old RMID), | |
780 | * and assigning the free RMID to another group (the new RMID). We must | |
781 | * then wait for the old RMID to not be used (no cachelines tagged). | |
782 | * This ensures that all cachelines are tagged with 'active' RMIDs. At | |
783 | * this point we can start reading values for the new RMID and treat the | |
784 | * old RMID as the free RMID for the next rotation. | |
785 | * | |
786 | * Return %true or %false depending on whether we did any rotating. | |
787 | */ | |
788 | static bool __intel_cqm_rmid_rotate(void) | |
789 | { | |
59bf7fd4 | 790 | struct perf_event *group, *start = NULL; |
bff671db MF |
791 | unsigned int threshold_limit; |
792 | unsigned int nr_needed = 0; | |
793 | unsigned int nr_available; | |
bff671db MF |
794 | bool rotated = false; |
795 | ||
796 | mutex_lock(&cache_mutex); | |
797 | ||
798 | again: | |
799 | /* | |
800 | * Fast path through this function if there are no groups and no | |
801 | * RMIDs that need cleaning. | |
802 | */ | |
803 | if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) | |
804 | goto out; | |
805 | ||
806 | list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { | |
807 | if (!__rmid_valid(group->hw.cqm_rmid)) { | |
808 | if (!start) | |
809 | start = group; | |
810 | nr_needed++; | |
811 | } | |
812 | } | |
813 | ||
814 | /* | |
815 | * We have some event groups, but they all have RMIDs assigned | |
816 | * and no RMIDs need cleaning. | |
817 | */ | |
818 | if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) | |
819 | goto out; | |
820 | ||
821 | if (!nr_needed) | |
822 | goto stabilize; | |
823 | ||
824 | /* | |
59bf7fd4 MF |
825 | * We have more event groups without RMIDs than available RMIDs, |
826 | * or we have event groups that conflict with the ones currently | |
827 | * scheduled. | |
bff671db MF |
828 | * |
829 | * We force deallocate the rmid of the group at the head of | |
830 | * cache_groups. The first event group without an RMID then gets | |
831 | * assigned intel_cqm_rotation_rmid. This ensures we always make | |
832 | * forward progress. | |
833 | * | |
834 | * Rotate the cache_groups list so the previous head is now the | |
835 | * tail. | |
836 | */ | |
59bf7fd4 | 837 | __intel_cqm_pick_and_rotate(start); |
bff671db MF |
838 | |
839 | /* | |
840 | * If the rotation is going to succeed, reduce the threshold so | |
841 | * that we don't needlessly reuse dirty RMIDs. | |
842 | */ | |
843 | if (__rmid_valid(intel_cqm_rotation_rmid)) { | |
844 | intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); | |
59bf7fd4 MF |
845 | intel_cqm_rotation_rmid = __get_rmid(); |
846 | ||
847 | intel_cqm_sched_out_conflicting_events(start); | |
bff671db MF |
848 | |
849 | if (__intel_cqm_threshold) | |
850 | __intel_cqm_threshold--; | |
851 | } | |
852 | ||
bff671db MF |
853 | rotated = true; |
854 | ||
855 | stabilize: | |
856 | /* | |
857 | * We now need to stabilize the RMID we freed above (if any) to | |
858 | * ensure that the next time we rotate we have an RMID with zero | |
859 | * occupancy value. | |
860 | * | |
861 | * Alternatively, if we didn't need to perform any rotation, | |
862 | * we'll have a bunch of RMIDs in limbo that need stabilizing. | |
863 | */ | |
864 | threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; | |
865 | ||
866 | while (intel_cqm_rmid_stabilize(&nr_available) && | |
867 | __intel_cqm_threshold < threshold_limit) { | |
868 | unsigned int steal_limit; | |
869 | ||
870 | /* | |
871 | * Don't spin if nobody is actively waiting for an RMID, | |
872 | * the rotation worker will be kicked as soon as an | |
873 | * event needs an RMID anyway. | |
874 | */ | |
875 | if (!nr_needed) | |
876 | break; | |
877 | ||
878 | /* Allow max 25% of RMIDs to be in limbo. */ | |
879 | steal_limit = (cqm_max_rmid + 1) / 4; | |
880 | ||
881 | /* | |
882 | * We failed to stabilize any RMIDs so our rotation | |
883 | * logic is now stuck. In order to make forward progress | |
884 | * we have a few options: | |
885 | * | |
886 | * 1. rotate ("steal") another RMID | |
887 | * 2. increase the threshold | |
888 | * 3. do nothing | |
889 | * | |
890 | * We do both of 1. and 2. until we hit the steal limit. | |
891 | * | |
892 | * The steal limit prevents all RMIDs ending up on the | |
893 | * limbo list. This can happen if every RMID has a | |
894 | * non-zero occupancy above threshold_limit, and the | |
895 | * occupancy values aren't dropping fast enough. | |
896 | * | |
897 | * Note that there is prioritisation at work here - we'd | |
898 | * rather increase the number of RMIDs on the limbo list | |
899 | * than increase the threshold, because increasing the | |
900 | * threshold skews the event data (because we reuse | |
901 | * dirty RMIDs) - threshold bumps are a last resort. | |
902 | */ | |
903 | if (nr_available < steal_limit) | |
904 | goto again; | |
905 | ||
906 | __intel_cqm_threshold++; | |
907 | } | |
908 | ||
909 | out: | |
910 | mutex_unlock(&cache_mutex); | |
911 | return rotated; | |
912 | } | |
913 | ||
914 | static void intel_cqm_rmid_rotate(struct work_struct *work); | |
915 | ||
916 | static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); | |
917 | ||
918 | static struct pmu intel_cqm_pmu; | |
919 | ||
920 | static void intel_cqm_rmid_rotate(struct work_struct *work) | |
921 | { | |
922 | unsigned long delay; | |
923 | ||
924 | __intel_cqm_rmid_rotate(); | |
925 | ||
926 | delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); | |
927 | schedule_delayed_work(&intel_cqm_rmid_work, delay); | |
928 | } | |
929 | ||
87f01cc2 TL |
930 | static u64 update_sample(unsigned int rmid, u32 evt_type, int first) |
931 | { | |
932 | struct sample *mbm_current; | |
933 | u32 vrmid = rmid_2_index(rmid); | |
934 | u64 val, bytes, shift; | |
935 | u32 eventid; | |
936 | ||
937 | if (evt_type == QOS_MBM_LOCAL_EVENT_ID) { | |
938 | mbm_current = &mbm_local[vrmid]; | |
939 | eventid = QOS_MBM_LOCAL_EVENT_ID; | |
940 | } else { | |
941 | mbm_current = &mbm_total[vrmid]; | |
942 | eventid = QOS_MBM_TOTAL_EVENT_ID; | |
943 | } | |
944 | ||
945 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | |
946 | rdmsrl(MSR_IA32_QM_CTR, val); | |
947 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | |
948 | return mbm_current->total_bytes; | |
949 | ||
950 | if (first) { | |
951 | mbm_current->prev_msr = val; | |
952 | mbm_current->total_bytes = 0; | |
953 | return mbm_current->total_bytes; | |
954 | } | |
955 | ||
e7ee3e8c VS |
956 | /* |
957 | * The h/w guarantees that counters will not overflow | |
958 | * so long as we poll them at least once per second. | |
959 | */ | |
87f01cc2 TL |
960 | shift = 64 - MBM_CNTR_WIDTH; |
961 | bytes = (val << shift) - (mbm_current->prev_msr << shift); | |
962 | bytes >>= shift; | |
963 | ||
964 | bytes *= cqm_l3_scale; | |
965 | ||
966 | mbm_current->total_bytes += bytes; | |
967 | mbm_current->prev_msr = val; | |
968 | ||
969 | return mbm_current->total_bytes; | |
970 | } | |
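/*
 * Worked example of the wraparound arithmetic above (illustrative
 * numbers, not part of the original source): with MBM_CNTR_WIDTH = 24,
 * shift = 40. If prev_msr was 0xfffffe and the 24-bit counter wrapped to
 * val = 0x000004, then ((val << 40) - (prev_msr << 40)) >> 40 = 6, i.e.
 * six counter units, which are then multiplied by cqm_l3_scale to get
 * bytes.
 */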
971 | ||
972 | static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type) | |
973 | { | |
974 | return update_sample(rmid, evt_type, 0); | |
975 | } | |
976 | ||
977 | static void __intel_mbm_event_init(void *info) | |
978 | { | |
979 | struct rmid_read *rr = info; | |
980 | ||
981 | update_sample(rr->rmid, rr->evt_type, 1); | |
982 | } | |
983 | ||
984 | static void init_mbm_sample(u32 rmid, u32 evt_type) | |
985 | { | |
986 | struct rmid_read rr = { | |
987 | .rmid = rmid, | |
988 | .evt_type = evt_type, | |
989 | .value = ATOMIC64_INIT(0), | |
990 | }; | |
991 | ||
992 | /* on each socket, init sample */ | |
993 | on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1); | |
994 | } | |
995 | ||
4afbb24c MF |
996 | /* |
997 | * Find a group and setup RMID. | |
998 | * | |
999 | * If we're part of a group, we use the group's RMID. | |
1000 | */ | |
59bf7fd4 MF |
1001 | static void intel_cqm_setup_event(struct perf_event *event, |
1002 | struct perf_event **group) | |
4afbb24c MF |
1003 | { |
1004 | struct perf_event *iter; | |
59bf7fd4 | 1005 | bool conflict = false; |
adafa999 | 1006 | u32 rmid; |
4afbb24c | 1007 | |
a223c1c7 | 1008 | event->hw.is_group_event = false; |
4afbb24c | 1009 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { |
59bf7fd4 MF |
1010 | rmid = iter->hw.cqm_rmid; |
1011 | ||
4afbb24c MF |
1012 | if (__match_event(iter, event)) { |
1013 | /* All tasks in a group share an RMID */ | |
59bf7fd4 | 1014 | event->hw.cqm_rmid = rmid; |
4afbb24c | 1015 | *group = iter; |
2d4de837 | 1016 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) |
87f01cc2 | 1017 | init_mbm_sample(rmid, event->attr.config); |
59bf7fd4 | 1018 | return; |
4afbb24c MF |
1019 | } |
1020 | ||
59bf7fd4 MF |
1021 | /* |
1022 | * We only care about conflicts for events that are | |
1023 | * actually scheduled in (and hence have a valid RMID). | |
1024 | */ | |
1025 | if (__conflict_event(iter, event) && __rmid_valid(rmid)) | |
1026 | conflict = true; | |
4afbb24c MF |
1027 | } |
1028 | ||
59bf7fd4 MF |
1029 | if (conflict) |
1030 | rmid = INVALID_RMID; | |
1031 | else | |
1032 | rmid = __get_rmid(); | |
1033 | ||
2d4de837 | 1034 | if (is_mbm_event(event->attr.config) && __rmid_valid(rmid)) |
87f01cc2 TL |
1035 | init_mbm_sample(rmid, event->attr.config); |
1036 | ||
59bf7fd4 | 1037 | event->hw.cqm_rmid = rmid; |
4afbb24c MF |
1038 | } |
1039 | ||
1040 | static void intel_cqm_event_read(struct perf_event *event) | |
1041 | { | |
bff671db | 1042 | unsigned long flags; |
adafa999 | 1043 | u32 rmid; |
4afbb24c MF |
1044 | u64 val; |
1045 | ||
bfe1fcd2 MF |
1046 | /* |
1047 | * Task events are handled by intel_cqm_event_count(). | |
1048 | */ | |
1049 | if (event->cpu == -1) | |
1050 | return; | |
1051 | ||
bff671db | 1052 | raw_spin_lock_irqsave(&cache_lock, flags); |
bfe1fcd2 | 1053 | rmid = event->hw.cqm_rmid; |
bff671db MF |
1054 | |
1055 | if (!__rmid_valid(rmid)) | |
1056 | goto out; | |
1057 | ||
87f01cc2 TL |
1058 | if (is_mbm_event(event->attr.config)) |
1059 | val = rmid_read_mbm(rmid, event->attr.config); | |
1060 | else | |
1061 | val = __rmid_read(rmid); | |
4afbb24c MF |
1062 | |
1063 | /* | |
1064 | * Ignore this reading on error states and do not update the value. | |
1065 | */ | |
1066 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | |
bff671db | 1067 | goto out; |
4afbb24c MF |
1068 | |
1069 | local64_set(&event->count, val); | |
bff671db MF |
1070 | out: |
1071 | raw_spin_unlock_irqrestore(&cache_lock, flags); | |
4afbb24c MF |
1072 | } |
1073 | ||
bfe1fcd2 MF |
1074 | static void __intel_cqm_event_count(void *info) |
1075 | { | |
1076 | struct rmid_read *rr = info; | |
1077 | u64 val; | |
1078 | ||
1079 | val = __rmid_read(rr->rmid); | |
1080 | ||
1081 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | |
1082 | return; | |
1083 | ||
1084 | atomic64_add(val, &rr->value); | |
1085 | } | |
1086 | ||
1087 | static inline bool cqm_group_leader(struct perf_event *event) | |
1088 | { | |
1089 | return !list_empty(&event->hw.cqm_groups_entry); | |
1090 | } | |
1091 | ||
87f01cc2 TL |
1092 | static void __intel_mbm_event_count(void *info) |
1093 | { | |
1094 | struct rmid_read *rr = info; | |
1095 | u64 val; | |
1096 | ||
1097 | val = rmid_read_mbm(rr->rmid, rr->evt_type); | |
1098 | if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) | |
1099 | return; | |
1100 | atomic64_add(val, &rr->value); | |
1101 | } | |
1102 | ||
e7ee3e8c VS |
1103 | static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer) |
1104 | { | |
1105 | struct perf_event *iter, *iter1; | |
1106 | int ret = HRTIMER_RESTART; | |
1107 | struct list_head *head; | |
1108 | unsigned long flags; | |
1109 | u32 grp_rmid; | |
1110 | ||
1111 | /* | |
1112 | * Need to hold the cache_lock as the timer Event Select MSR reads | |
1113 | * can race with the mbm/cqm count() and mbm_init() reads. | |
1114 | */ | |
1115 | raw_spin_lock_irqsave(&cache_lock, flags); | |
1116 | ||
1117 | if (list_empty(&cache_groups)) { | |
1118 | ret = HRTIMER_NORESTART; | |
1119 | goto out; | |
1120 | } | |
1121 | ||
1122 | list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { | |
1123 | grp_rmid = iter->hw.cqm_rmid; | |
1124 | if (!__rmid_valid(grp_rmid)) | |
1125 | continue; | |
1126 | if (is_mbm_event(iter->attr.config)) | |
1127 | update_sample(grp_rmid, iter->attr.config, 0); | |
1128 | ||
1129 | head = &iter->hw.cqm_group_entry; | |
1130 | if (list_empty(head)) | |
1131 | continue; | |
1132 | list_for_each_entry(iter1, head, hw.cqm_group_entry) { | |
1133 | if (!iter1->hw.is_group_event) | |
1134 | break; | |
1135 | if (is_mbm_event(iter1->attr.config)) | |
1136 | update_sample(iter1->hw.cqm_rmid, | |
1137 | iter1->attr.config, 0); | |
1138 | } | |
1139 | } | |
1140 | ||
1141 | hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME)); | |
1142 | out: | |
1143 | raw_spin_unlock_irqrestore(&cache_lock, flags); | |
1144 | ||
1145 | return ret; | |
1146 | } | |
1147 | ||
1148 | static void __mbm_start_timer(void *info) | |
1149 | { | |
1150 | hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME), | |
1151 | HRTIMER_MODE_REL_PINNED); | |
1152 | } | |
1153 | ||
1154 | static void __mbm_stop_timer(void *info) | |
1155 | { | |
1156 | hrtimer_cancel(&mbm_timers[pkg_id]); | |
1157 | } | |
1158 | ||
1159 | static void mbm_start_timers(void) | |
1160 | { | |
1161 | on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1); | |
1162 | } | |
1163 | ||
1164 | static void mbm_stop_timers(void) | |
1165 | { | |
1166 | on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1); | |
1167 | } | |
1168 | ||
1169 | static void mbm_hrtimer_init(void) | |
1170 | { | |
1171 | struct hrtimer *hr; | |
1172 | int i; | |
1173 | ||
1174 | for (i = 0; i < mbm_socket_max; i++) { | |
1175 | hr = &mbm_timers[i]; | |
1176 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
1177 | hr->function = mbm_hrtimer_handle; | |
1178 | } | |
1179 | } | |
1180 | ||
bfe1fcd2 MF |
1181 | static u64 intel_cqm_event_count(struct perf_event *event) |
1182 | { | |
bff671db | 1183 | unsigned long flags; |
bfe1fcd2 | 1184 | struct rmid_read rr = { |
27348f38 | 1185 | .evt_type = event->attr.config, |
bfe1fcd2 MF |
1186 | .value = ATOMIC64_INIT(0), |
1187 | }; | |
1188 | ||
1189 | /* | |
1190 | * We only need to worry about task events. System-wide events | |
1191 | * are handled like usual, i.e. entirely with | |
1192 | * intel_cqm_event_read(). | |
1193 | */ | |
1194 | if (event->cpu != -1) | |
1195 | return __perf_event_count(event); | |
1196 | ||
1197 | /* | |
a223c1c7 VS |
1198 | * Only the group leader gets to report values, except when there are | |
1199 | * multiple events in the same group, in which case we still need to | |
1200 | * read the other events. This stops us | |
bfe1fcd2 MF |
1201 | * reporting duplicate values to userspace, and gives us a clear |
1202 | * rule for which task gets to report the values. | |
1203 | * | |
1204 | * Note that it is impossible to attribute these values to | |
1205 | * specific packages - we forfeit that ability when we create | |
1206 | * task events. | |
1207 | */ | |
a223c1c7 | 1208 | if (!cqm_group_leader(event) && !event->hw.is_group_event) |
bfe1fcd2 MF |
1209 | return 0; |
1210 | ||
2c534c0d MF |
1211 | /* |
1212 | * Getting up-to-date values requires an SMP IPI which is not | |
1213 | * possible if we're being called in interrupt context. Return | |
1214 | * the cached values instead. | |
1215 | */ | |
1216 | if (unlikely(in_interrupt())) | |
1217 | goto out; | |
1218 | ||
bff671db MF |
1219 | /* |
1220 | * Notice that we don't perform the reading of an RMID | |
1221 | * atomically, because we can't hold a spin lock across the | |
1222 | * IPIs. | |
1223 | * | |
1224 | * Speculatively perform the read, since @event might be | |
1225 | * assigned a different (possibly invalid) RMID while we're | |
1226 | * busy performing the IPI calls. It's therefore necessary to | |
1227 | * check @event's RMID afterwards, and if it has changed, | |
1228 | * discard the result of the read. | |
1229 | */ | |
1230 | rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); | |
bfe1fcd2 | 1231 | |
bff671db MF |
1232 | if (!__rmid_valid(rr.rmid)) |
1233 | goto out; | |
1234 | ||
27348f38 | 1235 | cqm_mask_call(&rr); |
bfe1fcd2 | 1236 | |
bff671db MF |
1237 | raw_spin_lock_irqsave(&cache_lock, flags); |
1238 | if (event->hw.cqm_rmid == rr.rmid) | |
1239 | local64_set(&event->count, atomic64_read(&rr.value)); | |
1240 | raw_spin_unlock_irqrestore(&cache_lock, flags); | |
1241 | out: | |
bfe1fcd2 MF |
1242 | return __perf_event_count(event); |
1243 | } | |
1244 | ||
4afbb24c MF |
1245 | static void intel_cqm_event_start(struct perf_event *event, int mode) |
1246 | { | |
bf926731 | 1247 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); |
b3df4ec4 | 1248 | u32 rmid = event->hw.cqm_rmid; |
4afbb24c MF |
1249 | |
1250 | if (!(event->hw.cqm_state & PERF_HES_STOPPED)) | |
1251 | return; | |
1252 | ||
1253 | event->hw.cqm_state &= ~PERF_HES_STOPPED; | |
1254 | ||
bf926731 | 1255 | if (state->rmid_usecnt++) { |
0bac2378 TG |
1256 | if (!WARN_ON_ONCE(state->rmid != rmid)) |
1257 | return; | |
1258 | } else { | |
4afbb24c | 1259 | WARN_ON_ONCE(state->rmid); |
0bac2378 | 1260 | } |
4afbb24c MF |
1261 | |
1262 | state->rmid = rmid; | |
bf926731 | 1263 | wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); |
4afbb24c MF |
1264 | } |
1265 | ||
1266 | static void intel_cqm_event_stop(struct perf_event *event, int mode) | |
1267 | { | |
bf926731 | 1268 | struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); |
4afbb24c MF |
1269 | |
1270 | if (event->hw.cqm_state & PERF_HES_STOPPED) | |
1271 | return; | |
1272 | ||
1273 | event->hw.cqm_state |= PERF_HES_STOPPED; | |
1274 | ||
4afbb24c MF |
1275 | intel_cqm_event_read(event); |
1276 | ||
bf926731 | 1277 | if (!--state->rmid_usecnt) { |
4afbb24c | 1278 | state->rmid = 0; |
bf926731 | 1279 | wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); |
4afbb24c MF |
1280 | } else { |
1281 | WARN_ON_ONCE(!state->rmid); | |
1282 | } | |
4afbb24c MF |
1283 | } |
1284 | ||
1285 | static int intel_cqm_event_add(struct perf_event *event, int mode) | |
1286 | { | |
bff671db | 1287 | unsigned long flags; |
adafa999 | 1288 | u32 rmid; |
bff671db MF |
1289 | |
1290 | raw_spin_lock_irqsave(&cache_lock, flags); | |
4afbb24c MF |
1291 | |
1292 | event->hw.cqm_state = PERF_HES_STOPPED; | |
1293 | rmid = event->hw.cqm_rmid; | |
4afbb24c | 1294 | |
bff671db | 1295 | if (__rmid_valid(rmid) && (mode & PERF_EF_START)) |
4afbb24c MF |
1296 | intel_cqm_event_start(event, mode); |
1297 | ||
bff671db MF |
1298 | raw_spin_unlock_irqrestore(&cache_lock, flags); |
1299 | ||
4afbb24c MF |
1300 | return 0; |
1301 | } | |
1302 | ||
4afbb24c MF |
1303 | static void intel_cqm_event_destroy(struct perf_event *event) |
1304 | { | |
1305 | struct perf_event *group_other = NULL; | |
e7ee3e8c | 1306 | unsigned long flags; |
4afbb24c MF |
1307 | |
1308 | mutex_lock(&cache_mutex); | |
e7ee3e8c VS |
1309 | /* |
1310 | * Hold the cache_lock as mbm timer handlers could be | |
1311 | * scanning the list of events. | |
1312 | */ | |
1313 | raw_spin_lock_irqsave(&cache_lock, flags); | |
4afbb24c MF |
1314 | |
1315 | /* | |
1316 | * If there's another event in this group... | |
1317 | */ | |
1318 | if (!list_empty(&event->hw.cqm_group_entry)) { | |
1319 | group_other = list_first_entry(&event->hw.cqm_group_entry, | |
1320 | struct perf_event, | |
1321 | hw.cqm_group_entry); | |
1322 | list_del(&event->hw.cqm_group_entry); | |
1323 | } | |
1324 | ||
1325 | /* | |
1326 | * And we're the group leader.. | |
1327 | */ | |
bfe1fcd2 | 1328 | if (cqm_group_leader(event)) { |
4afbb24c MF |
1329 | /* |
1330 | * If there was a group_other, make that leader, otherwise | |
1331 | * destroy the group and return the RMID. | |
1332 | */ | |
1333 | if (group_other) { | |
1334 | list_replace(&event->hw.cqm_groups_entry, | |
1335 | &group_other->hw.cqm_groups_entry); | |
1336 | } else { | |
adafa999 | 1337 | u32 rmid = event->hw.cqm_rmid; |
4afbb24c | 1338 | |
bff671db MF |
1339 | if (__rmid_valid(rmid)) |
1340 | __put_rmid(rmid); | |
4afbb24c MF |
1341 | list_del(&event->hw.cqm_groups_entry); |
1342 | } | |
1343 | } | |
1344 | ||
e7ee3e8c VS |
1345 | raw_spin_unlock_irqrestore(&cache_lock, flags); |
1346 | ||
1347 | /* | |
1348 | * Stop the mbm overflow timers when the last event is destroyed. | |
1349 | */ | |
1350 | if (mbm_enabled && list_empty(&cache_groups)) | |
1351 | mbm_stop_timers(); | |
1352 | ||
4afbb24c MF |
1353 | mutex_unlock(&cache_mutex); |
1354 | } | |
1355 | ||
4afbb24c MF |
1356 | static int intel_cqm_event_init(struct perf_event *event) |
1357 | { | |
1358 | struct perf_event *group = NULL; | |
bff671db | 1359 | bool rotate = false; |
e7ee3e8c | 1360 | unsigned long flags; |
4afbb24c MF |
1361 | |
1362 | if (event->attr.type != intel_cqm_pmu.type) | |
1363 | return -ENOENT; | |
1364 | ||
87f01cc2 TL |
1365 | if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) || |
1366 | (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) | |
4afbb24c MF |
1367 | return -EINVAL; |
1368 | ||
4afbb24c MF |
1369 | /* unsupported modes and filters */ |
1370 | if (event->attr.exclude_user || | |
1371 | event->attr.exclude_kernel || | |
1372 | event->attr.exclude_hv || | |
1373 | event->attr.exclude_idle || | |
1374 | event->attr.exclude_host || | |
1375 | event->attr.exclude_guest || | |
1376 | event->attr.sample_period) /* no sampling */ | |
1377 | return -EINVAL; | |
1378 | ||
1379 | INIT_LIST_HEAD(&event->hw.cqm_group_entry); | |
1380 | INIT_LIST_HEAD(&event->hw.cqm_groups_entry); | |
1381 | ||
1382 | event->destroy = intel_cqm_event_destroy; | |
1383 | ||
1384 | mutex_lock(&cache_mutex); | |
1385 | ||
e7ee3e8c VS |
1386 | /* |
1387 | * Start the mbm overflow timers when the first event is created. | |
1388 | */ | |
1389 | if (mbm_enabled && list_empty(&cache_groups)) | |
1390 | mbm_start_timers(); | |
1391 | ||
bfe1fcd2 | 1392 | /* Will also set rmid */ |
59bf7fd4 | 1393 | intel_cqm_setup_event(event, &group); |
4afbb24c | 1394 | |
e7ee3e8c VS |
1395 | /* |
1396 | * Hold the cache_lock as mbm timer handlers could be | |
1397 | * scanning the list of events. | |
1398 | */ | |
1399 | raw_spin_lock_irqsave(&cache_lock, flags); | |
1400 | ||
4afbb24c MF |
1401 | if (group) { |
1402 | list_add_tail(&event->hw.cqm_group_entry, | |
1403 | &group->hw.cqm_group_entry); | |
1404 | } else { | |
1405 | list_add_tail(&event->hw.cqm_groups_entry, | |
1406 | &cache_groups); | |
bff671db MF |
1407 | |
1408 | /* | |
1409 | * All RMIDs are either in use or have recently been | |
1410 | * used. Kick the rotation worker to clean/free some. | |
1411 | * | |
1412 | * We only do this for the group leader, rather than for | |
1413 | * every event in a group to save on needless work. | |
1414 | */ | |
1415 | if (!__rmid_valid(event->hw.cqm_rmid)) | |
1416 | rotate = true; | |
4afbb24c MF |
1417 | } |
1418 | ||
e7ee3e8c | 1419 | raw_spin_unlock_irqrestore(&cache_lock, flags); |
4afbb24c | 1420 | mutex_unlock(&cache_mutex); |
bff671db MF |
1421 | |
1422 | if (rotate) | |
1423 | schedule_delayed_work(&intel_cqm_rmid_work, 0); | |
1424 | ||
59bf7fd4 | 1425 | return 0; |
4afbb24c MF |
1426 | } |
1427 | ||
1428 | EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); | |
1429 | EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); | |
1430 | EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); | |
1431 | EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); | |
1432 | EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); | |
1433 | ||
33c3cc7a VS |
1434 | EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02"); |
1435 | EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1"); | |
1436 | EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB"); | |
1437 | EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6"); | |
1438 | ||
1439 | EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03"); | |
1440 | EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1"); | |
1441 | EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB"); | |
1442 | EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6"); | |
1443 | ||
4afbb24c MF |
1444 | static struct attribute *intel_cqm_events_attr[] = { |
1445 | EVENT_PTR(intel_cqm_llc), | |
1446 | EVENT_PTR(intel_cqm_llc_pkg), | |
1447 | EVENT_PTR(intel_cqm_llc_unit), | |
1448 | EVENT_PTR(intel_cqm_llc_scale), | |
1449 | EVENT_PTR(intel_cqm_llc_snapshot), | |
1450 | NULL, | |
1451 | }; | |
1452 | ||
33c3cc7a VS |
1453 | static struct attribute *intel_mbm_events_attr[] = { |
1454 | EVENT_PTR(intel_cqm_total_bytes), | |
1455 | EVENT_PTR(intel_cqm_local_bytes), | |
1456 | EVENT_PTR(intel_cqm_total_bytes_pkg), | |
1457 | EVENT_PTR(intel_cqm_local_bytes_pkg), | |
1458 | EVENT_PTR(intel_cqm_total_bytes_unit), | |
1459 | EVENT_PTR(intel_cqm_local_bytes_unit), | |
1460 | EVENT_PTR(intel_cqm_total_bytes_scale), | |
1461 | EVENT_PTR(intel_cqm_local_bytes_scale), | |
1462 | NULL, | |
1463 | }; | |
1464 | ||
1465 | static struct attribute *intel_cmt_mbm_events_attr[] = { | |
1466 | EVENT_PTR(intel_cqm_llc), | |
1467 | EVENT_PTR(intel_cqm_total_bytes), | |
1468 | EVENT_PTR(intel_cqm_local_bytes), | |
1469 | EVENT_PTR(intel_cqm_llc_pkg), | |
1470 | EVENT_PTR(intel_cqm_total_bytes_pkg), | |
1471 | EVENT_PTR(intel_cqm_local_bytes_pkg), | |
1472 | EVENT_PTR(intel_cqm_llc_unit), | |
1473 | EVENT_PTR(intel_cqm_total_bytes_unit), | |
1474 | EVENT_PTR(intel_cqm_local_bytes_unit), | |
1475 | EVENT_PTR(intel_cqm_llc_scale), | |
1476 | EVENT_PTR(intel_cqm_total_bytes_scale), | |
1477 | EVENT_PTR(intel_cqm_local_bytes_scale), | |
1478 | EVENT_PTR(intel_cqm_llc_snapshot), | |
1479 | NULL, | |
1480 | }; | |
1481 | ||
4afbb24c MF |
1482 | static struct attribute_group intel_cqm_events_group = { |
1483 | .name = "events", | |
33c3cc7a | 1484 | .attrs = NULL, |
4afbb24c MF |
1485 | }; |
1486 | ||
1487 | PMU_FORMAT_ATTR(event, "config:0-7"); | |
1488 | static struct attribute *intel_cqm_formats_attr[] = { | |
1489 | &format_attr_event.attr, | |
1490 | NULL, | |
1491 | }; | |
1492 | ||
1493 | static struct attribute_group intel_cqm_format_group = { | |
1494 | .name = "format", | |
1495 | .attrs = intel_cqm_formats_attr, | |
1496 | }; | |
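/*
 * Usage sketch (assumptions, not part of the original source): these
 * attributes appear under the PMU's sysfs events/ and format/
 * directories, and the PMU is registered as "intel_cqm" later in this
 * file. A hypothetical invocation would then be:
 *
 *	perf stat -a -e intel_cqm/llc_occupancy/ -e intel_cqm/total_bytes/ sleep 1
 *
 * where the event= format field maps to config bits 0-7, i.e. 0x01, 0x02
 * and 0x03 as defined by the EVENT_ATTR_STR() entries above.
 */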
1497 | ||
bff671db MF |
1498 | static ssize_t |
1499 | max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, | |
1500 | char *page) | |
1501 | { | |
1502 | ssize_t rv; | |
1503 | ||
1504 | mutex_lock(&cache_mutex); | |
1505 | rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); | |
1506 | mutex_unlock(&cache_mutex); | |
1507 | ||
1508 | return rv; | |
1509 | } | |
1510 | ||
1511 | static ssize_t | |
1512 | max_recycle_threshold_store(struct device *dev, | |
1513 | struct device_attribute *attr, | |
1514 | const char *buf, size_t count) | |
1515 | { | |
1516 | unsigned int bytes, cachelines; | |
1517 | int ret; | |
1518 | ||
1519 | ret = kstrtouint(buf, 0, &bytes); | |
1520 | if (ret) | |
1521 | return ret; | |
1522 | ||
1523 | mutex_lock(&cache_mutex); | |
1524 | ||
1525 | __intel_cqm_max_threshold = bytes; | |
1526 | cachelines = bytes / cqm_l3_scale; | |
1527 | ||
1528 | /* | |
1529 | * The new maximum takes effect immediately. | |
1530 | */ | |
1531 | if (__intel_cqm_threshold > cachelines) | |
1532 | __intel_cqm_threshold = cachelines; | |
1533 | ||
1534 | mutex_unlock(&cache_mutex); | |
1535 | ||
1536 | return count; | |
1537 | } | |
1538 | ||
1539 | static DEVICE_ATTR_RW(max_recycle_threshold); | |
1540 | ||
1541 | static struct attribute *intel_cqm_attrs[] = { | |
1542 | &dev_attr_max_recycle_threshold.attr, | |
1543 | NULL, | |
1544 | }; | |
1545 | ||
1546 | static const struct attribute_group intel_cqm_group = { | |
1547 | .attrs = intel_cqm_attrs, | |
1548 | }; | |
1549 | ||
4afbb24c MF |
1550 | static const struct attribute_group *intel_cqm_attr_groups[] = { |
1551 | &intel_cqm_events_group, | |
1552 | &intel_cqm_format_group, | |
bff671db | 1553 | &intel_cqm_group, |
4afbb24c MF |
1554 | NULL, |
1555 | }; | |
1556 | ||
1557 | static struct pmu intel_cqm_pmu = { | |
bff671db MF |
1558 | .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, |
1559 | .attr_groups = intel_cqm_attr_groups, | |
1560 | .task_ctx_nr = perf_sw_context, | |
1561 | .event_init = intel_cqm_event_init, | |
1562 | .add = intel_cqm_event_add, | |
43d0c2f6 | 1563 | .del = intel_cqm_event_stop, |
bff671db MF |
1564 | .start = intel_cqm_event_start, |
1565 | .stop = intel_cqm_event_stop, | |
1566 | .read = intel_cqm_event_read, | |
1567 | .count = intel_cqm_event_count, | |
4afbb24c MF |
1568 | }; |
1569 | ||
1570 | static inline void cqm_pick_event_reader(int cpu) | |
1571 | { | |
827db839 | 1572 | int reader; |
4afbb24c | 1573 | |
827db839 TG |
1574 | /* First online cpu in package becomes the reader */ |
1575 | reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); | |
1576 | if (reader >= nr_cpu_ids) | |
1577 | cpumask_set_cpu(cpu, &cqm_cpumask); | |
4afbb24c MF |
1578 | } |
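/*
 * Note (added, not part of the original source): cqm_cpumask ends up with
 * exactly one online CPU per package; occupancy reads via MSR_IA32_QM_CTR
 * are routed to that "reader" CPU because the RMID counters are a
 * per-package resource.
 */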
1579 | ||
d7a702f0 | 1580 | static void intel_cqm_cpu_starting(unsigned int cpu) |
4afbb24c | 1581 | { |
bf926731 | 1582 | struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); |
4afbb24c MF |
1583 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
1584 | ||
4afbb24c | 1585 | state->rmid = 0; |
bf926731 TG |
1586 | state->closid = 0; |
1587 | state->rmid_usecnt = 0; | |
4afbb24c MF |
1588 | |
1589 | WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); | |
1590 | WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); | |
1591 | } | |
1592 | ||
1593 | static void intel_cqm_cpu_exit(unsigned int cpu) | |
1594 | { | |
827db839 | 1595 | int target; |
4afbb24c | 1596 | |
827db839 | 1597 | /* Is @cpu the current cqm reader for this package? */ | |
4afbb24c MF |
1598 | if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) |
1599 | return; | |
1600 | ||
827db839 TG |
1601 | /* Find another online reader in this package */ |
1602 | target = cpumask_any_but(topology_core_cpumask(cpu), cpu); | |
4afbb24c | 1603 | |
827db839 TG |
1604 | if (target < nr_cpu_ids) |
1605 | cpumask_set_cpu(target, &cqm_cpumask); | |
4afbb24c MF |
1606 | } |
1607 | ||
1608 | static int intel_cqm_cpu_notifier(struct notifier_block *nb, | |
1609 | unsigned long action, void *hcpu) | |
1610 | { | |
1611 | unsigned int cpu = (unsigned long)hcpu; | |
1612 | ||
1613 | switch (action & ~CPU_TASKS_FROZEN) { | |
4afbb24c MF |
1614 | case CPU_DOWN_PREPARE: |
1615 | intel_cqm_cpu_exit(cpu); | |
1616 | break; | |
1617 | case CPU_STARTING: | |
d7a702f0 | 1618 | intel_cqm_cpu_starting(cpu); |
4afbb24c MF |
1619 | cqm_pick_event_reader(cpu); |
1620 | break; | |
1621 | } | |
1622 | ||
1623 | return NOTIFY_OK; | |
1624 | } | |
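/*
 * Note (added, not part of the original source): CPU_DOWN_PREPARE fires
 * before a CPU is removed, so intel_cqm_cpu_exit() can hand the per-package
 * reader role to another online CPU; CPU_STARTING runs on the incoming CPU
 * with interrupts disabled, which is why resetting the per-cpu PQR state
 * there needs no extra locking.
 */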
1625 | ||
1626 | static const struct x86_cpu_id intel_cqm_match[] = { | |
1627 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, | |
1628 | {} | |
1629 | }; | |
1630 | ||
33c3cc7a VS |
1631 | static void mbm_cleanup(void) |
1632 | { | |
1633 | if (!mbm_enabled) | |
1634 | return; | |
1635 | ||
1636 | kfree(mbm_local); | |
1637 | kfree(mbm_total); | |
1638 | mbm_enabled = false; | |
1639 | } | |
1640 | ||
1641 | static const struct x86_cpu_id intel_mbm_local_match[] = { | |
1642 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL }, | |
1643 | {} | |
1644 | }; | |
1645 | ||
1646 | static const struct x86_cpu_id intel_mbm_total_match[] = { | |
1647 | { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL }, | |
1648 | {} | |
1649 | }; | |
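/*
 * Background note (added, not part of the original source): the
 * X86_FEATURE_CQM_* flags used in the match tables above are populated from
 * CPUID leaf 0xF during CPU feature detection, so the driver only binds
 * when LLC occupancy and/or MBM monitoring is actually enumerated.
 */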
1650 | ||
1651 | static int intel_mbm_init(void) | |
1652 | { | |
e7ee3e8c | 1653 | int ret = 0, array_size, maxid = cqm_max_rmid + 1; |
33c3cc7a | 1654 | |
e7ee3e8c VS |
1655 | mbm_socket_max = topology_max_packages(); |
1656 | array_size = sizeof(struct sample) * maxid * mbm_socket_max; | |
33c3cc7a VS |
1657 | mbm_local = kmalloc(array_size, GFP_KERNEL); |
1658 | if (!mbm_local) | |
1659 | return -ENOMEM; | |
1660 | ||
1661 | mbm_total = kmalloc(array_size, GFP_KERNEL); | |
1662 | if (!mbm_total) { | |
e7ee3e8c VS |
1663 | ret = -ENOMEM; |
1664 | goto out; | |
33c3cc7a VS |
1665 | } |
1666 | ||
e7ee3e8c VS |
1667 | array_size = sizeof(struct hrtimer) * mbm_socket_max; |
1668 | mbm_timers = kmalloc(array_size, GFP_KERNEL); | |
1669 | if (!mbm_timers) { | |
1670 | ret = -ENOMEM; | |
1671 | goto out; | |
1672 | } | |
1673 | mbm_hrtimer_init(); | |
1674 | ||
1675 | out: | |
1676 | if (ret) | |
1677 | mbm_cleanup(); | |
1678 | ||
1679 | return ret; | |
33c3cc7a VS |
1680 | } |
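/*
 * Sizing sketch (added for illustration, not in the original source):
 * mbm_local[] and mbm_total[] hold one struct sample per (socket, RMID)
 * pair and are indexed through rmid_2_index().  With, say, cqm_max_rmid ==
 * 255 and two sockets, each array gets 2 * 256 entries; the per-socket
 * hrtimers in mbm_timers[] exist to fold the 24-bit MBM counters into
 * total_bytes before they can overflow.
 */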
1681 | ||
4afbb24c MF |
1682 | static int __init intel_cqm_init(void) |
1683 | { | |
ada2f634 | 1684 | char *str = NULL, scale[20]; |
4afbb24c MF |
1685 | int i, cpu, ret; |
1686 | ||
33c3cc7a VS |
1687 | if (x86_match_cpu(intel_cqm_match)) |
1688 | cqm_enabled = true; | |
1689 | ||
1690 | if (x86_match_cpu(intel_mbm_local_match) && | |
1691 | x86_match_cpu(intel_mbm_total_match)) | |
1692 | mbm_enabled = true; | |
1693 | ||
1694 | if (!cqm_enabled && !mbm_enabled) | |
4afbb24c MF |
1695 | return -ENODEV; |
1696 | ||
1697 | cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; | |
1698 | ||
1699 | /* | |
1700 | * It's possible that not all resources support the same number | |
1701 | * of RMIDs. Instead of making scheduling much more complicated | |
1702 | * (where we have to match a task's RMID to a cpu that supports | |
1703 | * that many RMIDs) just find the minimum number of RMIDs supported | |
1704 | * across all cpus. | |
1705 | * | |
1706 | * Also, check that the scales match on all cpus. | |
1707 | */ | |
1708 | cpu_notifier_register_begin(); | |
1709 | ||
1710 | for_each_online_cpu(cpu) { | |
1711 | struct cpuinfo_x86 *c = &cpu_data(cpu); | |
1712 | ||
1713 | if (c->x86_cache_max_rmid < cqm_max_rmid) | |
1714 | cqm_max_rmid = c->x86_cache_max_rmid; | |
1715 | ||
1716 | if (c->x86_cache_occ_scale != cqm_l3_scale) { | |
1717 | pr_err("Multiple LLC scale values, disabling\n"); | |
1718 | ret = -EINVAL; | |
1719 | goto out; | |
1720 | } | |
1721 | } | |
1722 | ||
bff671db MF |
1723 | /* |
1724 | * A reasonable upper limit on the max threshold is the number | |
1725 | * of lines tagged per RMID if all RMIDs have the same number of | |
1726 | * lines tagged in the LLC. | |
1727 | * | |
1728 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | |
1729 | */ | |
1730 | __intel_cqm_max_threshold = | |
1731 | boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); | |
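	/*
	 * Worked example for the comment above (added, not in the original
	 * source): with a 35MB LLC and 56 RMIDs this evaluates to
	 * 35 * 1024 * 1024 / 56 = 655360 bytes (640KB) per RMID, which is
	 * roughly 1.8% of the 35MB cache.
	 */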
1732 | ||
4afbb24c MF |
1733 | snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); |
1734 | str = kstrdup(scale, GFP_KERNEL); | |
1735 | if (!str) { | |
1736 | ret = -ENOMEM; | |
1737 | goto out; | |
1738 | } | |
1739 | ||
1740 | event_attr_intel_cqm_llc_scale.event_str = str; | |
1741 | ||
1742 | ret = intel_cqm_setup_rmid_cache(); | |
1743 | if (ret) | |
1744 | goto out; | |
1745 | ||
1746 | for_each_online_cpu(i) { | |
d7a702f0 | 1747 | intel_cqm_cpu_starting(i); |
4afbb24c MF |
1748 | cqm_pick_event_reader(i); |
1749 | } | |
1750 | ||
33c3cc7a VS |
1751 | if (mbm_enabled) |
1752 | ret = intel_mbm_init(); | |
1753 | if (ret && !cqm_enabled) | |
1754 | goto out; | |
1755 | ||
1756 | if (cqm_enabled && mbm_enabled) | |
1757 | intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr; | |
1758 | else if (!cqm_enabled && mbm_enabled) | |
1759 | intel_cqm_events_group.attrs = intel_mbm_events_attr; | |
1760 | else if (cqm_enabled && !mbm_enabled) | |
1761 | intel_cqm_events_group.attrs = intel_cqm_events_attr; | |
1762 | ||
50f16a8b | 1763 | ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); |
ada2f634 | 1764 | if (ret) { |
4afbb24c | 1765 | pr_err("Intel CQM perf registration failed: %d\n", ret); |
ada2f634 VS |
1766 | goto out; |
1767 | } | |
4afbb24c | 1768 | |
33c3cc7a VS |
1769 | if (cqm_enabled) |
1770 | pr_info("Intel CQM monitoring enabled\n"); | |
1771 | if (mbm_enabled) | |
1772 | pr_info("Intel MBM enabled\n"); | |
ada2f634 VS |
1773 | |
1774 | /* | |
1775 | * Register the hotplug CPU notifier only once we know cqm init will | |
1776 | * succeed, to avoid leaking the notifier on the error path. | |
1777 | */ | |
1778 | __perf_cpu_notifier(intel_cqm_cpu_notifier); | |
4afbb24c MF |
1779 | out: |
1780 | cpu_notifier_register_done(); | |
ada2f634 VS |
1781 | if (ret) { |
1782 | kfree(str); | |
1783 | cqm_cleanup(); | |
33c3cc7a | 1784 | mbm_cleanup(); |
ada2f634 | 1785 | } |
4afbb24c MF |
1786 | |
1787 | return ret; | |
1788 | } | |
1789 | device_initcall(intel_cqm_init); |
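/*
 * Usage sketch (added, not part of the original source): on supported
 * hardware this initcall registers the "intel_cqm" PMU, which then appears
 * under /sys/bus/event_source/devices/intel_cqm/.  Its events can be
 * counted per task or system-wide, for example:
 *
 *	perf stat -e intel_cqm/llc_occupancy/ -p <pid>
 *	perf stat -e intel_cqm/total_bytes/ -a sleep 1
 *
 * The event names that actually exist depend on whether CQM, MBM or both
 * were detected above.
 */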