perf/x86/mbm: Implement RMID recycling
[deliverable/linux.git] arch/x86/events/intel/cqm.c
4afbb24c
MF
1/*
2 * Intel Cache Quality-of-Service Monitoring (CQM) support.
3 *
4 * Based very, very heavily on work by Peter Zijlstra.
5 */
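/*
 * This file exposes an "intel_cqm" perf PMU providing the llc_occupancy
 * (CQM) and total_bytes/local_bytes (MBM) events, and multiplexes the
 * limited set of hardware RMIDs across events via the recycling and
 * rotation logic below.
 */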
6
7#include <linux/perf_event.h>
8#include <linux/slab.h>
9#include <asm/cpu_device_id.h>
27f6d22b 10#include "../perf_event.h"
4afbb24c
MF
11
12#define MSR_IA32_PQR_ASSOC 0x0c8f
13#define MSR_IA32_QM_CTR 0x0c8e
14#define MSR_IA32_QM_EVTSEL 0x0c8d
15
87f01cc2
TL
16#define MBM_CNTR_WIDTH 24
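/* MBM counts read from IA32_QM_CTR are 24 bits wide and can wrap; see update_sample() */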
17
adafa999 18static u32 cqm_max_rmid = -1;
4afbb24c 19static unsigned int cqm_l3_scale; /* supposedly cacheline size */
33c3cc7a 20static bool cqm_enabled, mbm_enabled;
4afbb24c 21
bf926731
TG
22/**
23 * struct intel_pqr_state - State cache for the PQR MSR
24 * @rmid: The cached Resource Monitoring ID
25 * @closid: The cached Class Of Service ID
26 * @rmid_usecnt: The usage counter for rmid
27 *
28 * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
29 * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
30 * contains both parts, so we need to cache them.
31 *
32 * The cache also helps to avoid pointless updates if the value does
33 * not change.
34 */
35struct intel_pqr_state {
b3df4ec4 36 u32 rmid;
bf926731
TG
37 u32 closid;
38 int rmid_usecnt;
4afbb24c
MF
39};
40
9e7eaac9 41/*
bf926731 42 * The cached intel_pqr_state is strictly per CPU and can never be
9e7eaac9
TG
43 * updated from a remote CPU. Both functions which modify the state
44 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
45 * interrupts disabled, which is sufficient for the protection.
46 */
bf926731 47static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
33c3cc7a
VS
48/**
49 * struct sample - mbm event's (local or total) data
 50 * @total_bytes: #bytes since we began monitoring
 51 * @prev_msr:    previous value of MSR
52 */
53struct sample {
54 u64 total_bytes;
55 u64 prev_msr;
56};
57
58/*
59 * samples profiled for total memory bandwidth type events
60 */
61static struct sample *mbm_total;
62/*
63 * samples profiled for local memory bandwidth type events
64 */
65static struct sample *mbm_local;
4afbb24c 66
87f01cc2
TL
67#define pkg_id topology_physical_package_id(smp_processor_id())
68/*
69 * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
70 * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
 71 * rmids per socket; an example is given below:
72 * RMID1 of Socket0: vrmid = 1
73 * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
74 * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
75 */
76#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
4afbb24c 77/*
bff671db
MF
 78 * Protects cache_groups, cqm_rmid_free_lru and cqm_rmid_limbo_lru.
79 * Also protects event->hw.cqm_rmid
80 *
81 * Hold either for stability, both for modification of ->hw.cqm_rmid.
4afbb24c
MF
82 */
83static DEFINE_MUTEX(cache_mutex);
bff671db 84static DEFINE_RAW_SPINLOCK(cache_lock);
4afbb24c
MF
85
86/*
87 * Groups of events that have the same target(s), one RMID per group.
88 */
89static LIST_HEAD(cache_groups);
90
91/*
 92 * Mask of CPUs for reading CQM values. We only need one CPU per socket.
93 */
94static cpumask_t cqm_cpumask;
95
96#define RMID_VAL_ERROR (1ULL << 63)
97#define RMID_VAL_UNAVAIL (1ULL << 62)
98
87f01cc2
TL
99/*
 100 * Event IDs are used to program IA32_QM_EVTSEL before reading the
 101 * event counter from IA32_QM_CTR.
102 */
103#define QOS_L3_OCCUP_EVENT_ID 0x01
104#define QOS_MBM_TOTAL_EVENT_ID 0x02
105#define QOS_MBM_LOCAL_EVENT_ID 0x03
4afbb24c 106
bff671db
MF
107/*
108 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
109 *
110 * This rmid is always free and is guaranteed to have an associated
111 * near-zero occupancy value, i.e. no cachelines are tagged with this
112 * RMID, once __intel_cqm_rmid_rotate() returns.
113 */
adafa999 114static u32 intel_cqm_rotation_rmid;
bff671db
MF
115
116#define INVALID_RMID (-1)
117
118/*
119 * Is @rmid valid for programming the hardware?
120 *
121 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
122 * means that we should never come across an rmid with that value.
123 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
124 * assigned" and is used as part of the rotation code.
125 */
adafa999 126static inline bool __rmid_valid(u32 rmid)
bff671db
MF
127{
128 if (!rmid || rmid == INVALID_RMID)
129 return false;
130
131 return true;
132}
133
adafa999 134static u64 __rmid_read(u32 rmid)
4afbb24c
MF
135{
136 u64 val;
137
138 /*
139 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
140 * it just says that to increase confusion.
141 */
142 wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
143 rdmsrl(MSR_IA32_QM_CTR, val);
144
145 /*
146 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
147 * the number of cachelines tagged with @rmid.
148 */
149 return val;
150}
151
bff671db
MF
152enum rmid_recycle_state {
153 RMID_YOUNG = 0,
154 RMID_AVAILABLE,
155 RMID_DIRTY,
156};
157
35298e55 158struct cqm_rmid_entry {
adafa999 159 u32 rmid;
bff671db 160 enum rmid_recycle_state state;
35298e55 161 struct list_head list;
bff671db 162 unsigned long queue_time;
35298e55
MF
163};
164
165/*
bff671db 166 * cqm_rmid_free_lru - A least recently used list of RMIDs.
35298e55
MF
167 *
168 * Oldest entry at the head, newest (most recently used) entry at the
169 * tail. This list is never traversed, it's only used to keep track of
 170 * the lru order. That is, we only pick entries off the head or insert
171 * them on the tail.
172 *
173 * All entries on the list are 'free', and their RMIDs are not currently
174 * in use. To mark an RMID as in use, remove its entry from the lru
175 * list.
176 *
bff671db
MF
177 *
178 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
179 *
 180 * This list contains RMIDs that no one is currently using but that
181 * may have a non-zero occupancy value associated with them. The
182 * rotation worker moves RMIDs from the limbo list to the free list once
183 * the occupancy value drops below __intel_cqm_threshold.
184 *
185 * Both lists are protected by cache_mutex.
35298e55 186 */
bff671db
MF
187static LIST_HEAD(cqm_rmid_free_lru);
188static LIST_HEAD(cqm_rmid_limbo_lru);
35298e55
MF
189
190/*
191 * We use a simple array of pointers so that we can lookup a struct
192 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
193 * and __put_rmid() from having to worry about dealing with struct
194 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
195 *
196 * Once this array is initialized it is read-only. No locks are required
197 * to access it.
198 *
 199 * All entries for all RMIDs can be looked up in this array at all
200 * times.
201 */
202static struct cqm_rmid_entry **cqm_rmid_ptrs;
203
adafa999 204static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
35298e55
MF
205{
206 struct cqm_rmid_entry *entry;
207
208 entry = cqm_rmid_ptrs[rmid];
209 WARN_ON(entry->rmid != rmid);
210
211 return entry;
212}
4afbb24c
MF
213
214/*
 215 * Returns INVALID_RMID if no free RMID is available.
35298e55
MF
216 *
217 * We expect to be called with cache_mutex held.
4afbb24c 218 */
adafa999 219static u32 __get_rmid(void)
4afbb24c 220{
35298e55
MF
221 struct cqm_rmid_entry *entry;
222
223 lockdep_assert_held(&cache_mutex);
224
bff671db
MF
225 if (list_empty(&cqm_rmid_free_lru))
226 return INVALID_RMID;
35298e55 227
bff671db 228 entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
35298e55
MF
229 list_del(&entry->list);
230
231 return entry->rmid;
4afbb24c
MF
232}
233
adafa999 234static void __put_rmid(u32 rmid)
4afbb24c 235{
35298e55
MF
236 struct cqm_rmid_entry *entry;
237
238 lockdep_assert_held(&cache_mutex);
239
bff671db 240 WARN_ON(!__rmid_valid(rmid));
35298e55
MF
241 entry = __rmid_entry(rmid);
242
bff671db
MF
243 entry->queue_time = jiffies;
244 entry->state = RMID_YOUNG;
245
246 list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
4afbb24c
MF
247}
248
ada2f634
VS
249static void cqm_cleanup(void)
250{
251 int i;
252
253 if (!cqm_rmid_ptrs)
254 return;
255
 256	for (i = 0; i <= cqm_max_rmid; i++)
257 kfree(cqm_rmid_ptrs[i]);
258
259 kfree(cqm_rmid_ptrs);
260 cqm_rmid_ptrs = NULL;
33c3cc7a 261 cqm_enabled = false;
ada2f634
VS
262}
263
4afbb24c
MF
264static int intel_cqm_setup_rmid_cache(void)
265{
35298e55 266 struct cqm_rmid_entry *entry;
bff671db
MF
267 unsigned int nr_rmids;
268 int r = 0;
35298e55 269
bff671db 270 nr_rmids = cqm_max_rmid + 1;
ada2f634 271 cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
bff671db 272 nr_rmids, GFP_KERNEL);
35298e55 273 if (!cqm_rmid_ptrs)
4afbb24c
MF
274 return -ENOMEM;
275
bff671db 276 for (; r <= cqm_max_rmid; r++) {
35298e55
MF
277 struct cqm_rmid_entry *entry;
278
279 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
280 if (!entry)
281 goto fail;
282
283 INIT_LIST_HEAD(&entry->list);
284 entry->rmid = r;
285 cqm_rmid_ptrs[r] = entry;
286
bff671db 287 list_add_tail(&entry->list, &cqm_rmid_free_lru);
35298e55 288 }
4afbb24c
MF
289
290 /*
291 * RMID 0 is special and is always allocated. It's used for all
292 * tasks that are not monitored.
293 */
35298e55
MF
294 entry = __rmid_entry(0);
295 list_del(&entry->list);
4afbb24c 296
bff671db
MF
297 mutex_lock(&cache_mutex);
298 intel_cqm_rotation_rmid = __get_rmid();
299 mutex_unlock(&cache_mutex);
300
4afbb24c 301 return 0;
35298e55 302
ada2f634
VS
303fail:
304 cqm_cleanup();
35298e55 305 return -ENOMEM;
4afbb24c
MF
306}
307
308/*
309 * Determine if @a and @b measure the same set of tasks.
bfe1fcd2
MF
310 *
311 * If @a and @b measure the same set of tasks then we want to share a
312 * single RMID.
4afbb24c
MF
313 */
314static bool __match_event(struct perf_event *a, struct perf_event *b)
315{
bfe1fcd2 316 /* Per-cpu and task events don't mix */
4afbb24c
MF
317 if ((a->attach_state & PERF_ATTACH_TASK) !=
318 (b->attach_state & PERF_ATTACH_TASK))
319 return false;
320
bfe1fcd2
MF
321#ifdef CONFIG_CGROUP_PERF
322 if (a->cgrp != b->cgrp)
323 return false;
324#endif
325
326 /* If not task event, we're machine wide */
327 if (!(b->attach_state & PERF_ATTACH_TASK))
328 return true;
329
330 /*
 331 * Events that target the same task are placed into the same cache group.
a223c1c7
VS
332 * Mark it as a multi event group, so that we update ->count
333 * for every event rather than just the group leader later.
bfe1fcd2 334 */
a223c1c7
VS
335 if (a->hw.target == b->hw.target) {
336 b->hw.is_group_event = true;
bfe1fcd2 337 return true;
a223c1c7 338 }
bfe1fcd2
MF
339
340 /*
341 * Are we an inherited event?
342 */
343 if (b->parent == a)
344 return true;
345
346 return false;
347}
348
349#ifdef CONFIG_CGROUP_PERF
350static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
351{
352 if (event->attach_state & PERF_ATTACH_TASK)
614e4c4e 353 return perf_cgroup_from_task(event->hw.target, event->ctx);
4afbb24c 354
bfe1fcd2 355 return event->cgrp;
4afbb24c 356}
bfe1fcd2 357#endif
4afbb24c
MF
358
359/*
360 * Determine if @a's tasks intersect with @b's tasks
bfe1fcd2
MF
361 *
362 * There are combinations of events that we explicitly prohibit,
363 *
364 * PROHIBITS
365 * system-wide -> cgroup and task
366 * cgroup -> system-wide
367 * -> task in cgroup
368 * task -> system-wide
369 * -> task in cgroup
370 *
371 * Call this function before allocating an RMID.
4afbb24c
MF
372 */
373static bool __conflict_event(struct perf_event *a, struct perf_event *b)
374{
bfe1fcd2
MF
375#ifdef CONFIG_CGROUP_PERF
376 /*
377 * We can have any number of cgroups but only one system-wide
378 * event at a time.
379 */
380 if (a->cgrp && b->cgrp) {
381 struct perf_cgroup *ac = a->cgrp;
382 struct perf_cgroup *bc = b->cgrp;
383
384 /*
385 * This condition should have been caught in
386 * __match_event() and we should be sharing an RMID.
387 */
388 WARN_ON_ONCE(ac == bc);
389
390 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
391 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
392 return true;
393
394 return false;
395 }
396
397 if (a->cgrp || b->cgrp) {
398 struct perf_cgroup *ac, *bc;
399
400 /*
401 * cgroup and system-wide events are mutually exclusive
402 */
403 if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
404 (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
405 return true;
406
407 /*
408 * Ensure neither event is part of the other's cgroup
409 */
410 ac = event_to_cgroup(a);
411 bc = event_to_cgroup(b);
412 if (ac == bc)
413 return true;
414
415 /*
416 * Must have cgroup and non-intersecting task events.
417 */
418 if (!ac || !bc)
419 return false;
420
421 /*
422 * We have cgroup and task events, and the task belongs
 423 * to a cgroup. Check for overlap.
424 */
425 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
426 cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
427 return true;
428
429 return false;
430 }
431#endif
4afbb24c
MF
432 /*
433 * If one of them is not a task, same story as above with cgroups.
434 */
435 if (!(a->attach_state & PERF_ATTACH_TASK) ||
436 !(b->attach_state & PERF_ATTACH_TASK))
437 return true;
438
439 /*
440 * Must be non-overlapping.
441 */
442 return false;
443}
444
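/*
 * struct rmid_read - argument block passed to the per-package IPI readers
 * (__intel_cqm_event_count()/__intel_mbm_event_count()); @value accumulates
 * the counts read on each package.
 */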
bff671db 445struct rmid_read {
adafa999 446 u32 rmid;
87f01cc2 447 u32 evt_type;
bff671db
MF
448 atomic64_t value;
449};
450
451static void __intel_cqm_event_count(void *info);
87f01cc2 452static void init_mbm_sample(u32 rmid, u32 evt_type);
2d4de837 453static void __intel_mbm_event_count(void *info);
87f01cc2
TL
454
455static bool is_mbm_event(int e)
456{
457 return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
458}
bff671db
MF
459
460/*
461 * Exchange the RMID of a group of events.
462 */
adafa999 463static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
bff671db
MF
464{
465 struct perf_event *event;
bff671db 466 struct list_head *head = &group->hw.cqm_group_entry;
adafa999 467 u32 old_rmid = group->hw.cqm_rmid;
bff671db
MF
468
469 lockdep_assert_held(&cache_mutex);
470
471 /*
472 * If our RMID is being deallocated, perform a read now.
473 */
474 if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
475 struct rmid_read rr = {
476 .value = ATOMIC64_INIT(0),
477 .rmid = old_rmid,
478 };
479
2d4de837
VS
480 if (is_mbm_event(group->attr.config)) {
481 rr.evt_type = group->attr.config;
482 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count,
483 &rr, 1);
484 } else {
485 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
486 &rr, 1);
487 }
bff671db
MF
488 local64_set(&group->count, atomic64_read(&rr.value));
489 }
490
491 raw_spin_lock_irq(&cache_lock);
492
493 group->hw.cqm_rmid = rmid;
494 list_for_each_entry(event, head, hw.cqm_group_entry)
495 event->hw.cqm_rmid = rmid;
496
497 raw_spin_unlock_irq(&cache_lock);
498
2d4de837
VS
499 /*
500 * If the allocation is for mbm, init the mbm stats.
 501 * Need to check if each event in the group is an mbm event
 502 * because there could be multiple types of events in the same group.
503 */
504 if (__rmid_valid(rmid)) {
505 event = group;
506 if (is_mbm_event(event->attr.config))
507 init_mbm_sample(rmid, event->attr.config);
508
509 list_for_each_entry(event, head, hw.cqm_group_entry) {
510 if (is_mbm_event(event->attr.config))
511 init_mbm_sample(rmid, event->attr.config);
512 }
513 }
514
bff671db
MF
515 return old_rmid;
516}
517
518/*
519 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
520 * cachelines are still tagged with RMIDs in limbo, we progressively
521 * increment the threshold until we find an RMID in limbo with <=
522 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
523 * problem where cachelines tagged with an RMID are not steadily being
524 * evicted.
525 *
526 * On successful rotations we decrease the threshold back towards zero.
527 *
528 * __intel_cqm_max_threshold provides an upper bound on the threshold,
529 * and is measured in bytes because it's exposed to userland.
530 */
531static unsigned int __intel_cqm_threshold;
532static unsigned int __intel_cqm_max_threshold;
533
534/*
535 * Test whether an RMID has a zero occupancy value on this cpu.
536 */
537static void intel_cqm_stable(void *arg)
538{
539 struct cqm_rmid_entry *entry;
540
541 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
542 if (entry->state != RMID_AVAILABLE)
543 break;
544
545 if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
546 entry->state = RMID_DIRTY;
547 }
548}
549
550/*
551 * If we have group events waiting for an RMID that don't conflict with
552 * events already running, assign @rmid.
553 */
adafa999 554static bool intel_cqm_sched_in_event(u32 rmid)
bff671db
MF
555{
556 struct perf_event *leader, *event;
557
558 lockdep_assert_held(&cache_mutex);
559
560 leader = list_first_entry(&cache_groups, struct perf_event,
561 hw.cqm_groups_entry);
562 event = leader;
563
564 list_for_each_entry_continue(event, &cache_groups,
565 hw.cqm_groups_entry) {
566 if (__rmid_valid(event->hw.cqm_rmid))
567 continue;
568
569 if (__conflict_event(event, leader))
570 continue;
571
572 intel_cqm_xchg_rmid(event, rmid);
573 return true;
574 }
575
576 return false;
577}
578
579/*
580 * Initially use this constant for both the limbo queue time and the
581 * rotation timer interval, pmu::hrtimer_interval_ms.
582 *
583 * They don't need to be the same, but the two are related since if you
584 * rotate faster than you recycle RMIDs, you may run out of available
585 * RMIDs.
586 */
587#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
588
589static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
590
591/*
592 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 593 * @available: number of freeable RMIDs on the limbo list
594 *
595 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
596 * cachelines are tagged with those RMIDs. After this we can reuse them
597 * and know that the current set of active RMIDs is stable.
598 *
599 * Return %true or %false depending on whether stabilization needs to be
600 * reattempted.
601 *
 602 * If we return %true then @available is updated to indicate the
603 * number of RMIDs on the limbo list that have been queued for the
604 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
605 * are above __intel_cqm_threshold.
606 */
607static bool intel_cqm_rmid_stabilize(unsigned int *available)
608{
609 struct cqm_rmid_entry *entry, *tmp;
bff671db
MF
610
611 lockdep_assert_held(&cache_mutex);
612
613 *available = 0;
614 list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
615 unsigned long min_queue_time;
616 unsigned long now = jiffies;
617
618 /*
619 * We hold RMIDs placed into limbo for a minimum queue
620 * time. Before the minimum queue time has elapsed we do
621 * not recycle RMIDs.
622 *
623 * The reasoning is that until a sufficient time has
624 * passed since we stopped using an RMID, any RMID
625 * placed onto the limbo list will likely still have
626 * data tagged in the cache, which means we'll probably
627 * fail to recycle it anyway.
628 *
629 * We can save ourselves an expensive IPI by skipping
630 * any RMIDs that have not been queued for the minimum
631 * time.
632 */
633 min_queue_time = entry->queue_time +
634 msecs_to_jiffies(__rmid_queue_time_ms);
635
636 if (time_after(min_queue_time, now))
637 break;
638
639 entry->state = RMID_AVAILABLE;
640 (*available)++;
641 }
642
643 /*
644 * Fast return if none of the RMIDs on the limbo list have been
645 * sitting on the queue for the minimum queue time.
646 */
647 if (!*available)
648 return false;
649
650 /*
651 * Test whether an RMID is free for each package.
652 */
653 on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
654
655 list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
656 /*
657 * Exhausted all RMIDs that have waited min queue time.
658 */
659 if (entry->state == RMID_YOUNG)
660 break;
661
662 if (entry->state == RMID_DIRTY)
663 continue;
664
665 list_del(&entry->list); /* remove from limbo */
666
667 /*
668 * The rotation RMID gets priority if it's
 669 * currently invalid, in which case we skip adding
 670 * the RMID to the free lru.
671 */
672 if (!__rmid_valid(intel_cqm_rotation_rmid)) {
673 intel_cqm_rotation_rmid = entry->rmid;
674 continue;
675 }
676
677 /*
678 * If we have groups waiting for RMIDs, hand
59bf7fd4 679 * them one now provided they don't conflict.
bff671db 680 */
59bf7fd4 681 if (intel_cqm_sched_in_event(entry->rmid))
bff671db
MF
682 continue;
683
684 /*
685 * Otherwise place it onto the free list.
686 */
687 list_add_tail(&entry->list, &cqm_rmid_free_lru);
688 }
689
690
691 return __rmid_valid(intel_cqm_rotation_rmid);
692}
693
694/*
695 * Pick a victim group and move it to the tail of the group list.
59bf7fd4 696 * @next: The first group without an RMID
bff671db 697 */
59bf7fd4 698static void __intel_cqm_pick_and_rotate(struct perf_event *next)
bff671db
MF
699{
700 struct perf_event *rotor;
adafa999 701 u32 rmid;
bff671db
MF
702
703 lockdep_assert_held(&cache_mutex);
bff671db
MF
704
705 rotor = list_first_entry(&cache_groups, struct perf_event,
706 hw.cqm_groups_entry);
59bf7fd4
MF
707
708 /*
709 * The group at the front of the list should always have a valid
710 * RMID. If it doesn't then no groups have RMIDs assigned and we
711 * don't need to rotate the list.
712 */
713 if (next == rotor)
714 return;
715
716 rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
717 __put_rmid(rmid);
718
bff671db 719 list_rotate_left(&cache_groups);
59bf7fd4
MF
720}
721
722/*
723 * Deallocate the RMIDs from any events that conflict with @event, and
724 * place them on the back of the group list.
725 */
726static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
727{
728 struct perf_event *group, *g;
adafa999 729 u32 rmid;
59bf7fd4
MF
730
731 lockdep_assert_held(&cache_mutex);
732
733 list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
734 if (group == event)
735 continue;
736
737 rmid = group->hw.cqm_rmid;
738
739 /*
740 * Skip events that don't have a valid RMID.
741 */
742 if (!__rmid_valid(rmid))
743 continue;
744
745 /*
746 * No conflict? No problem! Leave the event alone.
747 */
748 if (!__conflict_event(group, event))
749 continue;
bff671db 750
59bf7fd4
MF
751 intel_cqm_xchg_rmid(group, INVALID_RMID);
752 __put_rmid(rmid);
753 }
bff671db
MF
754}
755
756/*
757 * Attempt to rotate the groups and assign new RMIDs.
758 *
59bf7fd4
MF
759 * We rotate for two reasons,
760 * 1. To handle the scheduling of conflicting events
761 * 2. To recycle RMIDs
762 *
bff671db
MF
763 * Rotating RMIDs is complicated because the hardware doesn't give us
764 * any clues.
765 *
 766 * There are problems with the hardware interface; when you change the
 767 * task:RMID map, cachelines retain their 'old' tags, giving a skewed
768 * picture. In order to work around this, we must always keep one free
769 * RMID - intel_cqm_rotation_rmid.
770 *
771 * Rotation works by taking away an RMID from a group (the old RMID),
772 * and assigning the free RMID to another group (the new RMID). We must
773 * then wait for the old RMID to not be used (no cachelines tagged).
 774 * This ensures that all cachelines are tagged with 'active' RMIDs. At
775 * this point we can start reading values for the new RMID and treat the
776 * old RMID as the free RMID for the next rotation.
777 *
778 * Return %true or %false depending on whether we did any rotating.
779 */
780static bool __intel_cqm_rmid_rotate(void)
781{
59bf7fd4 782 struct perf_event *group, *start = NULL;
bff671db
MF
783 unsigned int threshold_limit;
784 unsigned int nr_needed = 0;
785 unsigned int nr_available;
bff671db
MF
786 bool rotated = false;
787
788 mutex_lock(&cache_mutex);
789
790again:
791 /*
792 * Fast path through this function if there are no groups and no
793 * RMIDs that need cleaning.
794 */
795 if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
796 goto out;
797
798 list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
799 if (!__rmid_valid(group->hw.cqm_rmid)) {
800 if (!start)
801 start = group;
802 nr_needed++;
803 }
804 }
805
806 /*
807 * We have some event groups, but they all have RMIDs assigned
808 * and no RMIDs need cleaning.
809 */
810 if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
811 goto out;
812
813 if (!nr_needed)
814 goto stabilize;
815
816 /*
59bf7fd4
MF
817 * We have more event groups without RMIDs than available RMIDs,
818 * or we have event groups that conflict with the ones currently
819 * scheduled.
bff671db
MF
820 *
821 * We force deallocate the rmid of the group at the head of
822 * cache_groups. The first event group without an RMID then gets
823 * assigned intel_cqm_rotation_rmid. This ensures we always make
824 * forward progress.
825 *
826 * Rotate the cache_groups list so the previous head is now the
827 * tail.
828 */
59bf7fd4 829 __intel_cqm_pick_and_rotate(start);
bff671db
MF
830
831 /*
832 * If the rotation is going to succeed, reduce the threshold so
833 * that we don't needlessly reuse dirty RMIDs.
834 */
835 if (__rmid_valid(intel_cqm_rotation_rmid)) {
836 intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
59bf7fd4
MF
837 intel_cqm_rotation_rmid = __get_rmid();
838
839 intel_cqm_sched_out_conflicting_events(start);
bff671db
MF
840
841 if (__intel_cqm_threshold)
842 __intel_cqm_threshold--;
843 }
844
bff671db
MF
845 rotated = true;
846
847stabilize:
848 /*
 849 * We now need to stabilize the RMID we freed above (if any) to
850 * ensure that the next time we rotate we have an RMID with zero
851 * occupancy value.
852 *
853 * Alternatively, if we didn't need to perform any rotation,
854 * we'll have a bunch of RMIDs in limbo that need stabilizing.
855 */
856 threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
857
858 while (intel_cqm_rmid_stabilize(&nr_available) &&
859 __intel_cqm_threshold < threshold_limit) {
860 unsigned int steal_limit;
861
862 /*
863 * Don't spin if nobody is actively waiting for an RMID,
864 * the rotation worker will be kicked as soon as an
865 * event needs an RMID anyway.
866 */
867 if (!nr_needed)
868 break;
869
870 /* Allow max 25% of RMIDs to be in limbo. */
871 steal_limit = (cqm_max_rmid + 1) / 4;
872
873 /*
874 * We failed to stabilize any RMIDs so our rotation
875 * logic is now stuck. In order to make forward progress
876 * we have a few options:
877 *
878 * 1. rotate ("steal") another RMID
879 * 2. increase the threshold
880 * 3. do nothing
881 *
882 * We do both of 1. and 2. until we hit the steal limit.
883 *
884 * The steal limit prevents all RMIDs ending up on the
885 * limbo list. This can happen if every RMID has a
886 * non-zero occupancy above threshold_limit, and the
887 * occupancy values aren't dropping fast enough.
888 *
889 * Note that there is prioritisation at work here - we'd
890 * rather increase the number of RMIDs on the limbo list
891 * than increase the threshold, because increasing the
892 * threshold skews the event data (because we reuse
893 * dirty RMIDs) - threshold bumps are a last resort.
894 */
895 if (nr_available < steal_limit)
896 goto again;
897
898 __intel_cqm_threshold++;
899 }
900
901out:
902 mutex_unlock(&cache_mutex);
903 return rotated;
904}
905
906static void intel_cqm_rmid_rotate(struct work_struct *work);
907
908static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
909
910static struct pmu intel_cqm_pmu;
911
912static void intel_cqm_rmid_rotate(struct work_struct *work)
913{
914 unsigned long delay;
915
916 __intel_cqm_rmid_rotate();
917
918 delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
919 schedule_delayed_work(&intel_cqm_rmid_work, delay);
920}
921
87f01cc2
TL
922static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
923{
924 struct sample *mbm_current;
925 u32 vrmid = rmid_2_index(rmid);
926 u64 val, bytes, shift;
927 u32 eventid;
928
929 if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
930 mbm_current = &mbm_local[vrmid];
931 eventid = QOS_MBM_LOCAL_EVENT_ID;
932 } else {
933 mbm_current = &mbm_total[vrmid];
934 eventid = QOS_MBM_TOTAL_EVENT_ID;
935 }
936
937 wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
938 rdmsrl(MSR_IA32_QM_CTR, val);
939 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
940 return mbm_current->total_bytes;
941
942 if (first) {
943 mbm_current->prev_msr = val;
944 mbm_current->total_bytes = 0;
945 return mbm_current->total_bytes;
946 }
947
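	/*
	 * The hardware count is only MBM_CNTR_WIDTH (24) bits wide. Shift
	 * both the current and previous values up to the top of the 64-bit
	 * word so that the unsigned subtraction below handles a counter
	 * wraparound, then shift the delta back down.
	 */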
948 shift = 64 - MBM_CNTR_WIDTH;
949 bytes = (val << shift) - (mbm_current->prev_msr << shift);
950 bytes >>= shift;
951
952 bytes *= cqm_l3_scale;
953
954 mbm_current->total_bytes += bytes;
955 mbm_current->prev_msr = val;
956
957 return mbm_current->total_bytes;
958}
959
960static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
961{
962 return update_sample(rmid, evt_type, 0);
963}
964
965static void __intel_mbm_event_init(void *info)
966{
967 struct rmid_read *rr = info;
968
969 update_sample(rr->rmid, rr->evt_type, 1);
970}
971
972static void init_mbm_sample(u32 rmid, u32 evt_type)
973{
974 struct rmid_read rr = {
975 .rmid = rmid,
976 .evt_type = evt_type,
977 .value = ATOMIC64_INIT(0),
978 };
979
980 /* on each socket, init sample */
981 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
982}
983
4afbb24c
MF
984/*
985 * Find a group and setup RMID.
986 *
987 * If we're part of a group, we use the group's RMID.
988 */
59bf7fd4
MF
989static void intel_cqm_setup_event(struct perf_event *event,
990 struct perf_event **group)
4afbb24c
MF
991{
992 struct perf_event *iter;
59bf7fd4 993 bool conflict = false;
adafa999 994 u32 rmid;
4afbb24c 995
a223c1c7 996 event->hw.is_group_event = false;
4afbb24c 997 list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
59bf7fd4
MF
998 rmid = iter->hw.cqm_rmid;
999
4afbb24c
MF
1000 if (__match_event(iter, event)) {
1001 /* All tasks in a group share an RMID */
59bf7fd4 1002 event->hw.cqm_rmid = rmid;
4afbb24c 1003 *group = iter;
2d4de837 1004 if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
87f01cc2 1005 init_mbm_sample(rmid, event->attr.config);
59bf7fd4 1006 return;
4afbb24c
MF
1007 }
1008
59bf7fd4
MF
1009 /*
1010 * We only care about conflicts for events that are
1011 * actually scheduled in (and hence have a valid RMID).
1012 */
1013 if (__conflict_event(iter, event) && __rmid_valid(rmid))
1014 conflict = true;
4afbb24c
MF
1015 }
1016
59bf7fd4
MF
1017 if (conflict)
1018 rmid = INVALID_RMID;
1019 else
1020 rmid = __get_rmid();
1021
2d4de837 1022 if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
87f01cc2
TL
1023 init_mbm_sample(rmid, event->attr.config);
1024
59bf7fd4 1025 event->hw.cqm_rmid = rmid;
4afbb24c
MF
1026}
1027
1028static void intel_cqm_event_read(struct perf_event *event)
1029{
bff671db 1030 unsigned long flags;
adafa999 1031 u32 rmid;
4afbb24c
MF
1032 u64 val;
1033
bfe1fcd2
MF
1034 /*
1035 * Task events are handled by intel_cqm_event_count().
1036 */
1037 if (event->cpu == -1)
1038 return;
1039
bff671db 1040 raw_spin_lock_irqsave(&cache_lock, flags);
bfe1fcd2 1041 rmid = event->hw.cqm_rmid;
bff671db
MF
1042
1043 if (!__rmid_valid(rmid))
1044 goto out;
1045
87f01cc2
TL
1046 if (is_mbm_event(event->attr.config))
1047 val = rmid_read_mbm(rmid, event->attr.config);
1048 else
1049 val = __rmid_read(rmid);
4afbb24c
MF
1050
1051 /*
1052 * Ignore this reading on error states and do not update the value.
1053 */
1054 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
bff671db 1055 goto out;
4afbb24c
MF
1056
1057 local64_set(&event->count, val);
bff671db
MF
1058out:
1059 raw_spin_unlock_irqrestore(&cache_lock, flags);
4afbb24c
MF
1060}
1061
bfe1fcd2
MF
1062static void __intel_cqm_event_count(void *info)
1063{
1064 struct rmid_read *rr = info;
1065 u64 val;
1066
1067 val = __rmid_read(rr->rmid);
1068
1069 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1070 return;
1071
1072 atomic64_add(val, &rr->value);
1073}
1074
1075static inline bool cqm_group_leader(struct perf_event *event)
1076{
1077 return !list_empty(&event->hw.cqm_groups_entry);
1078}
1079
87f01cc2
TL
1080static void __intel_mbm_event_count(void *info)
1081{
1082 struct rmid_read *rr = info;
1083 u64 val;
1084
1085 val = rmid_read_mbm(rr->rmid, rr->evt_type);
1086 if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1087 return;
1088 atomic64_add(val, &rr->value);
1089}
1090
bfe1fcd2
MF
1091static u64 intel_cqm_event_count(struct perf_event *event)
1092{
bff671db 1093 unsigned long flags;
bfe1fcd2 1094 struct rmid_read rr = {
bfe1fcd2
MF
1095 .value = ATOMIC64_INIT(0),
1096 };
1097
1098 /*
1099 * We only need to worry about task events. System-wide events
1100 * are handled like usual, i.e. entirely with
1101 * intel_cqm_event_read().
1102 */
1103 if (event->cpu != -1)
1104 return __perf_event_count(event);
1105
1106 /*
a223c1c7
VS
 1107 * Only the group leader gets to report values, except when there are
 1108 * multiple events in the same group, in which case we still need to
 1109 * read the other events. This stops us
bfe1fcd2
MF
1110 * reporting duplicate values to userspace, and gives us a clear
1111 * rule for which task gets to report the values.
1112 *
1113 * Note that it is impossible to attribute these values to
1114 * specific packages - we forfeit that ability when we create
1115 * task events.
1116 */
a223c1c7 1117 if (!cqm_group_leader(event) && !event->hw.is_group_event)
bfe1fcd2
MF
1118 return 0;
1119
2c534c0d
MF
1120 /*
1121 * Getting up-to-date values requires an SMP IPI which is not
1122 * possible if we're being called in interrupt context. Return
1123 * the cached values instead.
1124 */
1125 if (unlikely(in_interrupt()))
1126 goto out;
1127
bff671db
MF
1128 /*
1129 * Notice that we don't perform the reading of an RMID
1130 * atomically, because we can't hold a spin lock across the
1131 * IPIs.
1132 *
1133 * Speculatively perform the read, since @event might be
1134 * assigned a different (possibly invalid) RMID while we're
 1135 * busy performing the IPI calls. It's therefore necessary to
1136 * check @event's RMID afterwards, and if it has changed,
1137 * discard the result of the read.
1138 */
1139 rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
bfe1fcd2 1140
bff671db
MF
1141 if (!__rmid_valid(rr.rmid))
1142 goto out;
1143
87f01cc2
TL
1144 if (is_mbm_event(event->attr.config)) {
1145 rr.evt_type = event->attr.config;
1146 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, &rr, 1);
1147 } else {
1148 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
1149 }
bfe1fcd2 1150
bff671db
MF
1151 raw_spin_lock_irqsave(&cache_lock, flags);
1152 if (event->hw.cqm_rmid == rr.rmid)
1153 local64_set(&event->count, atomic64_read(&rr.value));
1154 raw_spin_unlock_irqrestore(&cache_lock, flags);
1155out:
bfe1fcd2
MF
1156 return __perf_event_count(event);
1157}
1158
4afbb24c
MF
1159static void intel_cqm_event_start(struct perf_event *event, int mode)
1160{
bf926731 1161 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
b3df4ec4 1162 u32 rmid = event->hw.cqm_rmid;
4afbb24c
MF
1163
1164 if (!(event->hw.cqm_state & PERF_HES_STOPPED))
1165 return;
1166
1167 event->hw.cqm_state &= ~PERF_HES_STOPPED;
1168
bf926731 1169 if (state->rmid_usecnt++) {
0bac2378
TG
1170 if (!WARN_ON_ONCE(state->rmid != rmid))
1171 return;
1172 } else {
4afbb24c 1173 WARN_ON_ONCE(state->rmid);
0bac2378 1174 }
4afbb24c
MF
1175
1176 state->rmid = rmid;
bf926731 1177 wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
4afbb24c
MF
1178}
1179
1180static void intel_cqm_event_stop(struct perf_event *event, int mode)
1181{
bf926731 1182 struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
4afbb24c
MF
1183
1184 if (event->hw.cqm_state & PERF_HES_STOPPED)
1185 return;
1186
1187 event->hw.cqm_state |= PERF_HES_STOPPED;
1188
4afbb24c
MF
1189 intel_cqm_event_read(event);
1190
bf926731 1191 if (!--state->rmid_usecnt) {
4afbb24c 1192 state->rmid = 0;
bf926731 1193 wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
4afbb24c
MF
1194 } else {
1195 WARN_ON_ONCE(!state->rmid);
1196 }
4afbb24c
MF
1197}
1198
1199static int intel_cqm_event_add(struct perf_event *event, int mode)
1200{
bff671db 1201 unsigned long flags;
adafa999 1202 u32 rmid;
bff671db
MF
1203
1204 raw_spin_lock_irqsave(&cache_lock, flags);
4afbb24c
MF
1205
1206 event->hw.cqm_state = PERF_HES_STOPPED;
1207 rmid = event->hw.cqm_rmid;
4afbb24c 1208
bff671db 1209 if (__rmid_valid(rmid) && (mode & PERF_EF_START))
4afbb24c
MF
1210 intel_cqm_event_start(event, mode);
1211
bff671db
MF
1212 raw_spin_unlock_irqrestore(&cache_lock, flags);
1213
4afbb24c
MF
1214 return 0;
1215}
1216
4afbb24c
MF
1217static void intel_cqm_event_destroy(struct perf_event *event)
1218{
1219 struct perf_event *group_other = NULL;
1220
1221 mutex_lock(&cache_mutex);
1222
1223 /*
1224 * If there's another event in this group...
1225 */
1226 if (!list_empty(&event->hw.cqm_group_entry)) {
1227 group_other = list_first_entry(&event->hw.cqm_group_entry,
1228 struct perf_event,
1229 hw.cqm_group_entry);
1230 list_del(&event->hw.cqm_group_entry);
1231 }
1232
1233 /*
1234 * And we're the group leader..
1235 */
bfe1fcd2 1236 if (cqm_group_leader(event)) {
4afbb24c
MF
1237 /*
1238 * If there was a group_other, make that leader, otherwise
1239 * destroy the group and return the RMID.
1240 */
1241 if (group_other) {
1242 list_replace(&event->hw.cqm_groups_entry,
1243 &group_other->hw.cqm_groups_entry);
1244 } else {
adafa999 1245 u32 rmid = event->hw.cqm_rmid;
4afbb24c 1246
bff671db
MF
1247 if (__rmid_valid(rmid))
1248 __put_rmid(rmid);
4afbb24c
MF
1249 list_del(&event->hw.cqm_groups_entry);
1250 }
1251 }
1252
1253 mutex_unlock(&cache_mutex);
1254}
1255
4afbb24c
MF
1256static int intel_cqm_event_init(struct perf_event *event)
1257{
1258 struct perf_event *group = NULL;
bff671db 1259 bool rotate = false;
4afbb24c
MF
1260
1261 if (event->attr.type != intel_cqm_pmu.type)
1262 return -ENOENT;
1263
87f01cc2
TL
1264 if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
1265 (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
4afbb24c
MF
1266 return -EINVAL;
1267
4afbb24c
MF
1268 /* unsupported modes and filters */
1269 if (event->attr.exclude_user ||
1270 event->attr.exclude_kernel ||
1271 event->attr.exclude_hv ||
1272 event->attr.exclude_idle ||
1273 event->attr.exclude_host ||
1274 event->attr.exclude_guest ||
1275 event->attr.sample_period) /* no sampling */
1276 return -EINVAL;
1277
1278 INIT_LIST_HEAD(&event->hw.cqm_group_entry);
1279 INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
1280
1281 event->destroy = intel_cqm_event_destroy;
1282
1283 mutex_lock(&cache_mutex);
1284
bfe1fcd2 1285 /* Will also set rmid */
59bf7fd4 1286 intel_cqm_setup_event(event, &group);
4afbb24c
MF
1287
1288 if (group) {
1289 list_add_tail(&event->hw.cqm_group_entry,
1290 &group->hw.cqm_group_entry);
1291 } else {
1292 list_add_tail(&event->hw.cqm_groups_entry,
1293 &cache_groups);
bff671db
MF
1294
1295 /*
1296 * All RMIDs are either in use or have recently been
1297 * used. Kick the rotation worker to clean/free some.
1298 *
1299 * We only do this for the group leader, rather than for
1300 * every event in a group to save on needless work.
1301 */
1302 if (!__rmid_valid(event->hw.cqm_rmid))
1303 rotate = true;
4afbb24c
MF
1304 }
1305
4afbb24c 1306 mutex_unlock(&cache_mutex);
bff671db
MF
1307
1308 if (rotate)
1309 schedule_delayed_work(&intel_cqm_rmid_work, 0);
1310
59bf7fd4 1311 return 0;
4afbb24c
MF
1312}
1313
1314EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
1315EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
1316EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
1317EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
1318EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
1319
33c3cc7a
VS
1320EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
1321EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
1322EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
1323EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
1324
1325EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
1326EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
1327EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
1328EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
1329
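/*
 * These attributes appear under
 * /sys/bus/event_source/devices/intel_cqm/events/. An illustrative way to
 * use them from userspace (exact perf invocations may vary) is:
 *
 *   perf stat -e intel_cqm/llc_occupancy/ -p <pid>
 *   perf stat -e intel_cqm/total_bytes/ -a sleep 1
 */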
4afbb24c
MF
1330static struct attribute *intel_cqm_events_attr[] = {
1331 EVENT_PTR(intel_cqm_llc),
1332 EVENT_PTR(intel_cqm_llc_pkg),
1333 EVENT_PTR(intel_cqm_llc_unit),
1334 EVENT_PTR(intel_cqm_llc_scale),
1335 EVENT_PTR(intel_cqm_llc_snapshot),
1336 NULL,
1337};
1338
33c3cc7a
VS
1339static struct attribute *intel_mbm_events_attr[] = {
1340 EVENT_PTR(intel_cqm_total_bytes),
1341 EVENT_PTR(intel_cqm_local_bytes),
1342 EVENT_PTR(intel_cqm_total_bytes_pkg),
1343 EVENT_PTR(intel_cqm_local_bytes_pkg),
1344 EVENT_PTR(intel_cqm_total_bytes_unit),
1345 EVENT_PTR(intel_cqm_local_bytes_unit),
1346 EVENT_PTR(intel_cqm_total_bytes_scale),
1347 EVENT_PTR(intel_cqm_local_bytes_scale),
1348 NULL,
1349};
1350
1351static struct attribute *intel_cmt_mbm_events_attr[] = {
1352 EVENT_PTR(intel_cqm_llc),
1353 EVENT_PTR(intel_cqm_total_bytes),
1354 EVENT_PTR(intel_cqm_local_bytes),
1355 EVENT_PTR(intel_cqm_llc_pkg),
1356 EVENT_PTR(intel_cqm_total_bytes_pkg),
1357 EVENT_PTR(intel_cqm_local_bytes_pkg),
1358 EVENT_PTR(intel_cqm_llc_unit),
1359 EVENT_PTR(intel_cqm_total_bytes_unit),
1360 EVENT_PTR(intel_cqm_local_bytes_unit),
1361 EVENT_PTR(intel_cqm_llc_scale),
1362 EVENT_PTR(intel_cqm_total_bytes_scale),
1363 EVENT_PTR(intel_cqm_local_bytes_scale),
1364 EVENT_PTR(intel_cqm_llc_snapshot),
1365 NULL,
1366};
1367
4afbb24c
MF
1368static struct attribute_group intel_cqm_events_group = {
1369 .name = "events",
33c3cc7a 1370 .attrs = NULL,
4afbb24c
MF
1371};
1372
1373PMU_FORMAT_ATTR(event, "config:0-7");
1374static struct attribute *intel_cqm_formats_attr[] = {
1375 &format_attr_event.attr,
1376 NULL,
1377};
1378
1379static struct attribute_group intel_cqm_format_group = {
1380 .name = "format",
1381 .attrs = intel_cqm_formats_attr,
1382};
1383
bff671db
MF
1384static ssize_t
1385max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
1386 char *page)
1387{
1388 ssize_t rv;
1389
1390 mutex_lock(&cache_mutex);
1391 rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
1392 mutex_unlock(&cache_mutex);
1393
1394 return rv;
1395}
1396
1397static ssize_t
1398max_recycle_threshold_store(struct device *dev,
1399 struct device_attribute *attr,
1400 const char *buf, size_t count)
1401{
1402 unsigned int bytes, cachelines;
1403 int ret;
1404
1405 ret = kstrtouint(buf, 0, &bytes);
1406 if (ret)
1407 return ret;
1408
1409 mutex_lock(&cache_mutex);
1410
1411 __intel_cqm_max_threshold = bytes;
1412 cachelines = bytes / cqm_l3_scale;
1413
1414 /*
1415 * The new maximum takes effect immediately.
1416 */
1417 if (__intel_cqm_threshold > cachelines)
1418 __intel_cqm_threshold = cachelines;
1419
1420 mutex_unlock(&cache_mutex);
1421
1422 return count;
1423}
1424
1425static DEVICE_ATTR_RW(max_recycle_threshold);
1426
1427static struct attribute *intel_cqm_attrs[] = {
1428 &dev_attr_max_recycle_threshold.attr,
1429 NULL,
1430};
1431
1432static const struct attribute_group intel_cqm_group = {
1433 .attrs = intel_cqm_attrs,
1434};
1435
4afbb24c
MF
1436static const struct attribute_group *intel_cqm_attr_groups[] = {
1437 &intel_cqm_events_group,
1438 &intel_cqm_format_group,
bff671db 1439 &intel_cqm_group,
4afbb24c
MF
1440 NULL,
1441};
1442
1443static struct pmu intel_cqm_pmu = {
bff671db
MF
1444 .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
1445 .attr_groups = intel_cqm_attr_groups,
1446 .task_ctx_nr = perf_sw_context,
1447 .event_init = intel_cqm_event_init,
1448 .add = intel_cqm_event_add,
43d0c2f6 1449 .del = intel_cqm_event_stop,
bff671db
MF
1450 .start = intel_cqm_event_start,
1451 .stop = intel_cqm_event_stop,
1452 .read = intel_cqm_event_read,
1453 .count = intel_cqm_event_count,
4afbb24c
MF
1454};
1455
1456static inline void cqm_pick_event_reader(int cpu)
1457{
827db839 1458 int reader;
4afbb24c 1459
827db839
TG
1460 /* First online cpu in package becomes the reader */
1461 reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
1462 if (reader >= nr_cpu_ids)
1463 cpumask_set_cpu(cpu, &cqm_cpumask);
4afbb24c
MF
1464}
1465
d7a702f0 1466static void intel_cqm_cpu_starting(unsigned int cpu)
4afbb24c 1467{
bf926731 1468 struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
4afbb24c
MF
1469 struct cpuinfo_x86 *c = &cpu_data(cpu);
1470
4afbb24c 1471 state->rmid = 0;
bf926731
TG
1472 state->closid = 0;
1473 state->rmid_usecnt = 0;
4afbb24c
MF
1474
1475 WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
1476 WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
1477}
1478
1479static void intel_cqm_cpu_exit(unsigned int cpu)
1480{
827db839 1481 int target;
4afbb24c 1482
827db839 1483 /* Is @cpu the current cqm reader for this package ? */
4afbb24c
MF
1484 if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
1485 return;
1486
827db839
TG
1487 /* Find another online reader in this package */
1488 target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
4afbb24c 1489
827db839
TG
1490 if (target < nr_cpu_ids)
1491 cpumask_set_cpu(target, &cqm_cpumask);
4afbb24c
MF
1492}
1493
1494static int intel_cqm_cpu_notifier(struct notifier_block *nb,
1495 unsigned long action, void *hcpu)
1496{
1497 unsigned int cpu = (unsigned long)hcpu;
1498
1499 switch (action & ~CPU_TASKS_FROZEN) {
4afbb24c
MF
1500 case CPU_DOWN_PREPARE:
1501 intel_cqm_cpu_exit(cpu);
1502 break;
1503 case CPU_STARTING:
d7a702f0 1504 intel_cqm_cpu_starting(cpu);
4afbb24c
MF
1505 cqm_pick_event_reader(cpu);
1506 break;
1507 }
1508
1509 return NOTIFY_OK;
1510}
1511
1512static const struct x86_cpu_id intel_cqm_match[] = {
1513 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
1514 {}
1515};
1516
33c3cc7a
VS
1517static void mbm_cleanup(void)
1518{
1519 if (!mbm_enabled)
1520 return;
1521
1522 kfree(mbm_local);
1523 kfree(mbm_total);
1524 mbm_enabled = false;
1525}
1526
1527static const struct x86_cpu_id intel_mbm_local_match[] = {
1528 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
1529 {}
1530};
1531
1532static const struct x86_cpu_id intel_mbm_total_match[] = {
1533 { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
1534 {}
1535};
1536
1537static int intel_mbm_init(void)
1538{
1539 int array_size, maxid = cqm_max_rmid + 1;
1540
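	/* One struct sample per RMID per package, looked up via rmid_2_index() */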
1541 array_size = sizeof(struct sample) * maxid * topology_max_packages();
1542 mbm_local = kmalloc(array_size, GFP_KERNEL);
1543 if (!mbm_local)
1544 return -ENOMEM;
1545
1546 mbm_total = kmalloc(array_size, GFP_KERNEL);
1547 if (!mbm_total) {
1548 mbm_cleanup();
1549 return -ENOMEM;
1550 }
1551
1552 return 0;
1553}
1554
4afbb24c
MF
1555static int __init intel_cqm_init(void)
1556{
ada2f634 1557 char *str = NULL, scale[20];
4afbb24c
MF
1558 int i, cpu, ret;
1559
33c3cc7a
VS
1560 if (x86_match_cpu(intel_cqm_match))
1561 cqm_enabled = true;
1562
1563 if (x86_match_cpu(intel_mbm_local_match) &&
1564 x86_match_cpu(intel_mbm_total_match))
1565 mbm_enabled = true;
1566
1567 if (!cqm_enabled && !mbm_enabled)
4afbb24c
MF
1568 return -ENODEV;
1569
1570 cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
1571
1572 /*
1573 * It's possible that not all resources support the same number
1574 * of RMIDs. Instead of making scheduling much more complicated
1575 * (where we have to match a task's RMID to a cpu that supports
 1576 * that many RMIDs) just find the minimum number of RMIDs supported
 1577 * across all cpus.
1578 *
1579 * Also, check that the scales match on all cpus.
1580 */
1581 cpu_notifier_register_begin();
1582
1583 for_each_online_cpu(cpu) {
1584 struct cpuinfo_x86 *c = &cpu_data(cpu);
1585
1586 if (c->x86_cache_max_rmid < cqm_max_rmid)
1587 cqm_max_rmid = c->x86_cache_max_rmid;
1588
1589 if (c->x86_cache_occ_scale != cqm_l3_scale) {
1590 pr_err("Multiple LLC scale values, disabling\n");
1591 ret = -EINVAL;
1592 goto out;
1593 }
1594 }
1595
bff671db
MF
1596 /*
1597 * A reasonable upper limit on the max threshold is the number
1598 * of lines tagged per RMID if all RMIDs have the same number of
1599 * lines tagged in the LLC.
1600 *
1601 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
1602 */
1603 __intel_cqm_max_threshold =
1604 boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
1605
4afbb24c
MF
1606 snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
1607 str = kstrdup(scale, GFP_KERNEL);
1608 if (!str) {
1609 ret = -ENOMEM;
1610 goto out;
1611 }
1612
1613 event_attr_intel_cqm_llc_scale.event_str = str;
1614
1615 ret = intel_cqm_setup_rmid_cache();
1616 if (ret)
1617 goto out;
1618
1619 for_each_online_cpu(i) {
d7a702f0 1620 intel_cqm_cpu_starting(i);
4afbb24c
MF
1621 cqm_pick_event_reader(i);
1622 }
1623
33c3cc7a
VS
1624 if (mbm_enabled)
1625 ret = intel_mbm_init();
1626 if (ret && !cqm_enabled)
1627 goto out;
1628
1629 if (cqm_enabled && mbm_enabled)
1630 intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
1631 else if (!cqm_enabled && mbm_enabled)
1632 intel_cqm_events_group.attrs = intel_mbm_events_attr;
1633 else if (cqm_enabled && !mbm_enabled)
1634 intel_cqm_events_group.attrs = intel_cqm_events_attr;
1635
50f16a8b 1636 ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
ada2f634 1637 if (ret) {
4afbb24c 1638 pr_err("Intel CQM perf registration failed: %d\n", ret);
ada2f634
VS
1639 goto out;
1640 }
4afbb24c 1641
33c3cc7a
VS
1642 if (cqm_enabled)
1643 pr_info("Intel CQM monitoring enabled\n");
1644 if (mbm_enabled)
1645 pr_info("Intel MBM enabled\n");
ada2f634
VS
1646
1647 /*
1648 * Register the hot cpu notifier once we are sure cqm
1649 * is enabled to avoid notifier leak.
1650 */
1651 __perf_cpu_notifier(intel_cqm_cpu_notifier);
4afbb24c
MF
1652out:
1653 cpu_notifier_register_done();
ada2f634
VS
1654 if (ret) {
1655 kfree(str);
1656 cqm_cleanup();
33c3cc7a 1657 mbm_cleanup();
ada2f634 1658 }
4afbb24c
MF
1659
1660 return ret;
1661}
1662device_initcall(intel_cqm_init);