1 /*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/cgroup.h>
38 #include <linux/perf_event.h>
39 #include <linux/ftrace_event.h>
40 #include <linux/hw_breakpoint.h>
41 #include <linux/mm_types.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45 #include <linux/bpf.h>
46 #include <linux/filter.h>
47
48 #include "internal.h"
49
50 #include <asm/irq_regs.h>
51
52 static struct workqueue_struct *perf_wq;
53
54 struct remote_function_call {
55 struct task_struct *p;
56 int (*func)(void *info);
57 void *info;
58 int ret;
59 };
60
61 static void remote_function(void *data)
62 {
63 struct remote_function_call *tfc = data;
64 struct task_struct *p = tfc->p;
65
66 if (p) {
67 tfc->ret = -EAGAIN;
68 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
69 return;
70 }
71
72 tfc->ret = tfc->func(tfc->info);
73 }
74
75 /**
76 * task_function_call - call a function on the cpu on which a task runs
77 * @p: the task to evaluate
78 * @func: the function to be called
79 * @info: the function call argument
80 *
81 * Calls the function @func when the task is currently running. This might
82 * be on the current CPU, in which case the function is called directly.
83 *
84 * returns: @func return value, or
85 * -ESRCH - when the process isn't running
86 * -EAGAIN - when the process moved away
87 */
88 static int
89 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
90 {
91 struct remote_function_call data = {
92 .p = p,
93 .func = func,
94 .info = info,
95 .ret = -ESRCH, /* No such (running) process */
96 };
97
98 if (task_curr(p))
99 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
100
101 return data.ret;
102 }
103
104 /**
105 * cpu_function_call - call a function on a given cpu
106 * @func: the function to be called
107 * @info: the function call argument
108 *
109 * Calls the function @func on the remote cpu.
110 *
111 * returns: @func return value or -ENXIO when the cpu is offline
112 */
113 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
114 {
115 struct remote_function_call data = {
116 .p = NULL,
117 .func = func,
118 .info = info,
119 .ret = -ENXIO, /* No such CPU */
120 };
121
122 smp_call_function_single(cpu, remote_function, &data, 1);
123
124 return data.ret;
125 }
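/*
 * Example (illustrative sketch, not taken from this file): both helpers
 * above take an int (*func)(void *) plus an opaque argument and return
 * either the callback's return value or -ESRCH / -EAGAIN / -ENXIO.
 * Assuming a hypothetical callback probe_event(), a caller retries on
 * -EAGAIN because the task may migrate between task_curr() and the IPI:
 *
 *	static int probe_event(void *info)
 *	{
 *		struct perf_event *event = info;
 *
 *		return event->state == PERF_EVENT_STATE_ACTIVE ? 0 : -EINVAL;
 *	}
 *
 *	do {
 *		err = task_function_call(task, probe_event, event);
 *	} while (err == -EAGAIN);
 */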
126
127 #define EVENT_OWNER_KERNEL ((void *) -1)
128
129 static bool is_kernel_event(struct perf_event *event)
130 {
131 return event->owner == EVENT_OWNER_KERNEL;
132 }
133
134 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
135 PERF_FLAG_FD_OUTPUT |\
136 PERF_FLAG_PID_CGROUP |\
137 PERF_FLAG_FD_CLOEXEC)
138
139 /*
140 * branch priv levels that need permission checks
141 */
142 #define PERF_SAMPLE_BRANCH_PERM_PLM \
143 (PERF_SAMPLE_BRANCH_KERNEL |\
144 PERF_SAMPLE_BRANCH_HV)
145
146 enum event_type_t {
147 EVENT_FLEXIBLE = 0x1,
148 EVENT_PINNED = 0x2,
149 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
150 };
151
152 /*
153 * perf_sched_events : >0 events exist
154 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
155 */
156 struct static_key_deferred perf_sched_events __read_mostly;
157 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
158 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
159
160 static atomic_t nr_mmap_events __read_mostly;
161 static atomic_t nr_comm_events __read_mostly;
162 static atomic_t nr_task_events __read_mostly;
163 static atomic_t nr_freq_events __read_mostly;
164
165 static LIST_HEAD(pmus);
166 static DEFINE_MUTEX(pmus_lock);
167 static struct srcu_struct pmus_srcu;
168
169 /*
170 * perf event paranoia level:
171 * -1 - not paranoid at all
172 * 0 - disallow raw tracepoint access for unpriv
173 * 1 - disallow cpu events for unpriv
174 * 2 - disallow kernel profiling for unpriv
175 */
176 int sysctl_perf_event_paranoid __read_mostly = 1;
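/*
 * Illustrative note (assumes the standard sysctl wiring in kernel/sysctl.c):
 * the knob above is exposed to userspace as kernel.perf_event_paranoid, e.g.
 *
 *	# sysctl kernel.perf_event_paranoid=2
 *	# echo 2 > /proc/sys/kernel/perf_event_paranoid
 */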
177
178 /* Minimum for 512 kiB + 1 user control page */
179 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
180
181 /*
182 * max perf event sample rate
183 */
184 #define DEFAULT_MAX_SAMPLE_RATE 100000
185 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
186 #define DEFAULT_CPU_TIME_MAX_PERCENT 25
187
188 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
189
190 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
191 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
192
193 static int perf_sample_allowed_ns __read_mostly =
194 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
195
196 void update_perf_cpu_limits(void)
197 {
198 u64 tmp = perf_sample_period_ns;
199
200 tmp *= sysctl_perf_cpu_time_max_percent;
201 do_div(tmp, 100);
202 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
203 }
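/*
 * Worked example for update_perf_cpu_limits() with the defaults above:
 * perf_sample_period_ns = NSEC_PER_SEC / 100000 = 10,000ns and
 * sysctl_perf_cpu_time_max_percent = 25, so perf_sample_allowed_ns
 * becomes 10,000 * 25 / 100 = 2,500ns of sampling time per sample.
 */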
204
205 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
206
207 int perf_proc_update_handler(struct ctl_table *table, int write,
208 void __user *buffer, size_t *lenp,
209 loff_t *ppos)
210 {
211 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
212
213 if (ret || !write)
214 return ret;
215
216 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
217 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
218 update_perf_cpu_limits();
219
220 return 0;
221 }
222
223 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
224
225 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
226 void __user *buffer, size_t *lenp,
227 loff_t *ppos)
228 {
229 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
230
231 if (ret || !write)
232 return ret;
233
234 update_perf_cpu_limits();
235
236 return 0;
237 }
238
239 /*
240 * perf samples are done in some very critical code paths (NMIs).
241 * If they take too much CPU time, the system can lock up and not
242 * get any real work done. This will drop the sample rate when
243 * we detect that events are taking too long.
244 */
245 #define NR_ACCUMULATED_SAMPLES 128
246 static DEFINE_PER_CPU(u64, running_sample_length);
247
248 static void perf_duration_warn(struct irq_work *w)
249 {
250 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
251 u64 avg_local_sample_len;
252 u64 local_samples_len;
253
254 local_samples_len = __this_cpu_read(running_sample_length);
255 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
256
257 printk_ratelimited(KERN_WARNING
258 "perf interrupt took too long (%lld > %lld), lowering "
259 "kernel.perf_event_max_sample_rate to %d\n",
260 avg_local_sample_len, allowed_ns >> 1,
261 sysctl_perf_event_sample_rate);
262 }
263
264 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
265
266 void perf_sample_event_took(u64 sample_len_ns)
267 {
268 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
269 u64 avg_local_sample_len;
270 u64 local_samples_len;
271
272 if (allowed_ns == 0)
273 return;
274
275 /* decay the counter by 1 average sample */
276 local_samples_len = __this_cpu_read(running_sample_length);
277 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
278 local_samples_len += sample_len_ns;
279 __this_cpu_write(running_sample_length, local_samples_len);
280
281 /*
282 * note: this will be biased artificially low until we have
283 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
284 * from having to maintain a count.
285 */
286 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
287
288 if (avg_local_sample_len <= allowed_ns)
289 return;
290
291 if (max_samples_per_tick <= 1)
292 return;
293
294 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
295 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
296 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
297
298 update_perf_cpu_limits();
299
300 if (!irq_work_queue(&perf_duration_work)) {
301 early_printk("perf interrupt took too long (%lld > %lld), lowering "
302 "kernel.perf_event_max_sample_rate to %d\n",
303 avg_local_sample_len, allowed_ns >> 1,
304 sysctl_perf_event_sample_rate);
305 }
306 }
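/*
 * Worked example for the throttling above (illustrative, HZ=250 assumed):
 * starting from max_samples_per_tick = DIV_ROUND_UP(100000, 250) = 400,
 * a single violation of perf_sample_allowed_ns gives
 *
 *	max_samples_per_tick          = DIV_ROUND_UP(400, 2) = 200
 *	sysctl_perf_event_sample_rate = 200 * 250 = 50000 samples/sec
 *	perf_sample_period_ns         = NSEC_PER_SEC / 50000 = 20,000ns
 */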
307
308 static atomic64_t perf_event_id;
309
310 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
311 enum event_type_t event_type);
312
313 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
314 enum event_type_t event_type,
315 struct task_struct *task);
316
317 static void update_context_time(struct perf_event_context *ctx);
318 static u64 perf_event_time(struct perf_event *event);
319
320 void __weak perf_event_print_debug(void) { }
321
322 extern __weak const char *perf_pmu_name(void)
323 {
324 return "pmu";
325 }
326
327 static inline u64 perf_clock(void)
328 {
329 return local_clock();
330 }
331
332 static inline u64 perf_event_clock(struct perf_event *event)
333 {
334 return event->clock();
335 }
336
337 static inline struct perf_cpu_context *
338 __get_cpu_context(struct perf_event_context *ctx)
339 {
340 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
341 }
342
343 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
344 struct perf_event_context *ctx)
345 {
346 raw_spin_lock(&cpuctx->ctx.lock);
347 if (ctx)
348 raw_spin_lock(&ctx->lock);
349 }
350
351 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
352 struct perf_event_context *ctx)
353 {
354 if (ctx)
355 raw_spin_unlock(&ctx->lock);
356 raw_spin_unlock(&cpuctx->ctx.lock);
357 }
358
359 #ifdef CONFIG_CGROUP_PERF
360
361 static inline bool
362 perf_cgroup_match(struct perf_event *event)
363 {
364 struct perf_event_context *ctx = event->ctx;
365 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
366
367 /* @event doesn't care about cgroup */
368 if (!event->cgrp)
369 return true;
370
371 /* wants specific cgroup scope but @cpuctx isn't associated with any */
372 if (!cpuctx->cgrp)
373 return false;
374
375 /*
376 * Cgroup scoping is recursive. An event enabled for a cgroup is
377 * also enabled for all its descendant cgroups. If @cpuctx's
378 * cgroup is a descendant of @event's (the test covers identity
379 * case), it's a match.
380 */
381 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
382 event->cgrp->css.cgroup);
383 }
384
385 static inline void perf_detach_cgroup(struct perf_event *event)
386 {
387 css_put(&event->cgrp->css);
388 event->cgrp = NULL;
389 }
390
391 static inline int is_cgroup_event(struct perf_event *event)
392 {
393 return event->cgrp != NULL;
394 }
395
396 static inline u64 perf_cgroup_event_time(struct perf_event *event)
397 {
398 struct perf_cgroup_info *t;
399
400 t = per_cpu_ptr(event->cgrp->info, event->cpu);
401 return t->time;
402 }
403
404 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
405 {
406 struct perf_cgroup_info *info;
407 u64 now;
408
409 now = perf_clock();
410
411 info = this_cpu_ptr(cgrp->info);
412
413 info->time += now - info->timestamp;
414 info->timestamp = now;
415 }
416
417 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
418 {
419 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
420 if (cgrp_out)
421 __update_cgrp_time(cgrp_out);
422 }
423
424 static inline void update_cgrp_time_from_event(struct perf_event *event)
425 {
426 struct perf_cgroup *cgrp;
427
428 /*
429 * ensure we access cgroup data only when needed and
430 * when we know the cgroup is pinned (css_get)
431 */
432 if (!is_cgroup_event(event))
433 return;
434
435 cgrp = perf_cgroup_from_task(current);
436 /*
437 * Do not update time when cgroup is not active
438 */
439 if (cgrp == event->cgrp)
440 __update_cgrp_time(event->cgrp);
441 }
442
443 static inline void
444 perf_cgroup_set_timestamp(struct task_struct *task,
445 struct perf_event_context *ctx)
446 {
447 struct perf_cgroup *cgrp;
448 struct perf_cgroup_info *info;
449
450 /*
451 * ctx->lock held by caller
452 * ensure we do not access cgroup data
453 * unless we have the cgroup pinned (css_get)
454 */
455 if (!task || !ctx->nr_cgroups)
456 return;
457
458 cgrp = perf_cgroup_from_task(task);
459 info = this_cpu_ptr(cgrp->info);
460 info->timestamp = ctx->timestamp;
461 }
462
463 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
464 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
465
466 /*
467 * reschedule events based on the cgroup constraint of task.
468 *
469 * mode SWOUT : schedule out everything
470 * mode SWIN : schedule in based on cgroup for next
471 */
472 void perf_cgroup_switch(struct task_struct *task, int mode)
473 {
474 struct perf_cpu_context *cpuctx;
475 struct pmu *pmu;
476 unsigned long flags;
477
478 /*
479 * disable interrupts to avoid getting nr_cgroup
480 * changes via __perf_event_disable(). Also
481 * avoids preemption.
482 */
483 local_irq_save(flags);
484
485 /*
486 * we reschedule only in the presence of cgroup
487 * constrained events.
488 */
489 rcu_read_lock();
490
491 list_for_each_entry_rcu(pmu, &pmus, entry) {
492 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
493 if (cpuctx->unique_pmu != pmu)
494 continue; /* ensure we process each cpuctx once */
495
496 /*
497 * perf_cgroup_events says at least one
498 * context on this CPU has cgroup events.
499 *
500 * ctx->nr_cgroups reports the number of cgroup
501 * events for a context.
502 */
503 if (cpuctx->ctx.nr_cgroups > 0) {
504 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
505 perf_pmu_disable(cpuctx->ctx.pmu);
506
507 if (mode & PERF_CGROUP_SWOUT) {
508 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
509 /*
510 * must not be done before ctxswout due
511 * to event_filter_match() in event_sched_out()
512 */
513 cpuctx->cgrp = NULL;
514 }
515
516 if (mode & PERF_CGROUP_SWIN) {
517 WARN_ON_ONCE(cpuctx->cgrp);
518 /*
519 * set cgrp before ctxsw in to allow
520 * event_filter_match() to not have to pass
521 * task around
522 */
523 cpuctx->cgrp = perf_cgroup_from_task(task);
524 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
525 }
526 perf_pmu_enable(cpuctx->ctx.pmu);
527 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
528 }
529 }
530
531 rcu_read_unlock();
532
533 local_irq_restore(flags);
534 }
535
536 static inline void perf_cgroup_sched_out(struct task_struct *task,
537 struct task_struct *next)
538 {
539 struct perf_cgroup *cgrp1;
540 struct perf_cgroup *cgrp2 = NULL;
541
542 /*
543 * we come here when we know perf_cgroup_events > 0
544 */
545 cgrp1 = perf_cgroup_from_task(task);
546
547 /*
548 * next is NULL when called from perf_event_enable_on_exec()
549 * that will systematically cause a cgroup_switch()
550 */
551 if (next)
552 cgrp2 = perf_cgroup_from_task(next);
553
554 /*
555 * only schedule out current cgroup events if we know
556 * that we are switching to a different cgroup. Otherwise,
557 * do not touch the cgroup events.
558 */
559 if (cgrp1 != cgrp2)
560 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
561 }
562
563 static inline void perf_cgroup_sched_in(struct task_struct *prev,
564 struct task_struct *task)
565 {
566 struct perf_cgroup *cgrp1;
567 struct perf_cgroup *cgrp2 = NULL;
568
569 /*
570 * we come here when we know perf_cgroup_events > 0
571 */
572 cgrp1 = perf_cgroup_from_task(task);
573
574 /* prev can never be NULL */
575 cgrp2 = perf_cgroup_from_task(prev);
576
577 /*
578 * only need to schedule in cgroup events if we are changing
579 * cgroup during ctxsw. Cgroup events were not scheduled
580 * out at ctxsw-out time if that was not the case.
581 */
582 if (cgrp1 != cgrp2)
583 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
584 }
585
586 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
587 struct perf_event_attr *attr,
588 struct perf_event *group_leader)
589 {
590 struct perf_cgroup *cgrp;
591 struct cgroup_subsys_state *css;
592 struct fd f = fdget(fd);
593 int ret = 0;
594
595 if (!f.file)
596 return -EBADF;
597
598 css = css_tryget_online_from_dir(f.file->f_path.dentry,
599 &perf_event_cgrp_subsys);
600 if (IS_ERR(css)) {
601 ret = PTR_ERR(css);
602 goto out;
603 }
604
605 cgrp = container_of(css, struct perf_cgroup, css);
606 event->cgrp = cgrp;
607
608 /*
609 * all events in a group must monitor
610 * the same cgroup because a task belongs
611 * to only one perf cgroup at a time
612 */
613 if (group_leader && group_leader->cgrp != cgrp) {
614 perf_detach_cgroup(event);
615 ret = -EINVAL;
616 }
617 out:
618 fdput(f);
619 return ret;
620 }
621
622 static inline void
623 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
624 {
625 struct perf_cgroup_info *t;
626 t = per_cpu_ptr(event->cgrp->info, event->cpu);
627 event->shadow_ctx_time = now - t->timestamp;
628 }
629
630 static inline void
631 perf_cgroup_defer_enabled(struct perf_event *event)
632 {
633 /*
634 * when the current task's perf cgroup does not match
635 * the event's, we need to remember to call the
636 * perf_cgroup_mark_enabled() function the first time a task with
637 * a matching perf cgroup is scheduled in.
638 */
639 if (is_cgroup_event(event) && !perf_cgroup_match(event))
640 event->cgrp_defer_enabled = 1;
641 }
642
643 static inline void
644 perf_cgroup_mark_enabled(struct perf_event *event,
645 struct perf_event_context *ctx)
646 {
647 struct perf_event *sub;
648 u64 tstamp = perf_event_time(event);
649
650 if (!event->cgrp_defer_enabled)
651 return;
652
653 event->cgrp_defer_enabled = 0;
654
655 event->tstamp_enabled = tstamp - event->total_time_enabled;
656 list_for_each_entry(sub, &event->sibling_list, group_entry) {
657 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
658 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
659 sub->cgrp_defer_enabled = 0;
660 }
661 }
662 }
663 #else /* !CONFIG_CGROUP_PERF */
664
665 static inline bool
666 perf_cgroup_match(struct perf_event *event)
667 {
668 return true;
669 }
670
671 static inline void perf_detach_cgroup(struct perf_event *event)
672 {}
673
674 static inline int is_cgroup_event(struct perf_event *event)
675 {
676 return 0;
677 }
678
679 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
680 {
681 return 0;
682 }
683
684 static inline void update_cgrp_time_from_event(struct perf_event *event)
685 {
686 }
687
688 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
689 {
690 }
691
692 static inline void perf_cgroup_sched_out(struct task_struct *task,
693 struct task_struct *next)
694 {
695 }
696
697 static inline void perf_cgroup_sched_in(struct task_struct *prev,
698 struct task_struct *task)
699 {
700 }
701
702 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
703 struct perf_event_attr *attr,
704 struct perf_event *group_leader)
705 {
706 return -EINVAL;
707 }
708
709 static inline void
710 perf_cgroup_set_timestamp(struct task_struct *task,
711 struct perf_event_context *ctx)
712 {
713 }
714
715 void
716 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
717 {
718 }
719
720 static inline void
721 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
722 {
723 }
724
725 static inline u64 perf_cgroup_event_time(struct perf_event *event)
726 {
727 return 0;
728 }
729
730 static inline void
731 perf_cgroup_defer_enabled(struct perf_event *event)
732 {
733 }
734
735 static inline void
736 perf_cgroup_mark_enabled(struct perf_event *event,
737 struct perf_event_context *ctx)
738 {
739 }
740 #endif
741
742 /*
743 * set default to be dependent on timer tick just
744 * like original code
745 */
746 #define PERF_CPU_HRTIMER (1000 / HZ)
747 /*
748 * function must be called with interrupts disabled
749 */
750 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
751 {
752 struct perf_cpu_context *cpuctx;
753 enum hrtimer_restart ret = HRTIMER_NORESTART;
754 int rotations = 0;
755
756 WARN_ON(!irqs_disabled());
757
758 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
759
760 rotations = perf_rotate_context(cpuctx);
761
762 /*
763 * arm timer if needed
764 */
765 if (rotations) {
766 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
767 ret = HRTIMER_RESTART;
768 }
769
770 return ret;
771 }
772
773 /* CPU is going down */
774 void perf_cpu_hrtimer_cancel(int cpu)
775 {
776 struct perf_cpu_context *cpuctx;
777 struct pmu *pmu;
778 unsigned long flags;
779
780 if (WARN_ON(cpu != smp_processor_id()))
781 return;
782
783 local_irq_save(flags);
784
785 rcu_read_lock();
786
787 list_for_each_entry_rcu(pmu, &pmus, entry) {
788 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
789
790 if (pmu->task_ctx_nr == perf_sw_context)
791 continue;
792
793 hrtimer_cancel(&cpuctx->hrtimer);
794 }
795
796 rcu_read_unlock();
797
798 local_irq_restore(flags);
799 }
800
801 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
802 {
803 struct hrtimer *hr = &cpuctx->hrtimer;
804 struct pmu *pmu = cpuctx->ctx.pmu;
805 int timer;
806
807 /* no multiplexing needed for SW PMU */
808 if (pmu->task_ctx_nr == perf_sw_context)
809 return;
810
811 /*
812 * check default is sane, if not set then force to
813 * default interval (1/tick)
814 */
815 timer = pmu->hrtimer_interval_ms;
816 if (timer < 1)
817 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
818
819 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
820
821 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
822 hr->function = perf_cpu_hrtimer_handler;
823 }
824
825 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
826 {
827 struct hrtimer *hr = &cpuctx->hrtimer;
828 struct pmu *pmu = cpuctx->ctx.pmu;
829
830 /* not for SW PMU */
831 if (pmu->task_ctx_nr == perf_sw_context)
832 return;
833
834 if (hrtimer_active(hr))
835 return;
836
837 if (!hrtimer_callback_running(hr))
838 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
839 0, HRTIMER_MODE_REL_PINNED, 0);
840 }
841
842 void perf_pmu_disable(struct pmu *pmu)
843 {
844 int *count = this_cpu_ptr(pmu->pmu_disable_count);
845 if (!(*count)++)
846 pmu->pmu_disable(pmu);
847 }
848
849 void perf_pmu_enable(struct pmu *pmu)
850 {
851 int *count = this_cpu_ptr(pmu->pmu_disable_count);
852 if (!--(*count))
853 pmu->pmu_enable(pmu);
854 }
855
856 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
857
858 /*
859 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
860 * perf_event_task_tick() are fully serialized because they're strictly cpu
861 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
862 * disabled, while perf_event_task_tick is called from IRQ context.
863 */
864 static void perf_event_ctx_activate(struct perf_event_context *ctx)
865 {
866 struct list_head *head = this_cpu_ptr(&active_ctx_list);
867
868 WARN_ON(!irqs_disabled());
869
870 WARN_ON(!list_empty(&ctx->active_ctx_list));
871
872 list_add(&ctx->active_ctx_list, head);
873 }
874
875 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
876 {
877 WARN_ON(!irqs_disabled());
878
879 WARN_ON(list_empty(&ctx->active_ctx_list));
880
881 list_del_init(&ctx->active_ctx_list);
882 }
883
884 static void get_ctx(struct perf_event_context *ctx)
885 {
886 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
887 }
888
889 static void free_ctx(struct rcu_head *head)
890 {
891 struct perf_event_context *ctx;
892
893 ctx = container_of(head, struct perf_event_context, rcu_head);
894 kfree(ctx->task_ctx_data);
895 kfree(ctx);
896 }
897
898 static void put_ctx(struct perf_event_context *ctx)
899 {
900 if (atomic_dec_and_test(&ctx->refcount)) {
901 if (ctx->parent_ctx)
902 put_ctx(ctx->parent_ctx);
903 if (ctx->task)
904 put_task_struct(ctx->task);
905 call_rcu(&ctx->rcu_head, free_ctx);
906 }
907 }
908
909 /*
910 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
911 * perf_pmu_migrate_context() we need some magic.
912 *
913 * Those places that change perf_event::ctx will hold both
914 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
915 *
916 * Lock ordering is by mutex address. There is one other site where
917 * perf_event_context::mutex nests and that is put_event(). But remember that
918 * that is a parent<->child context relation, and migration does not affect
919 * children, therefore these two orderings should not interact.
920 *
921 * The change in perf_event::ctx does not affect children (as claimed above)
922 * because the sys_perf_event_open() case will install a new event and break
923 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
924 * concerned with cpuctx and that doesn't have children.
925 *
926 * The places that change perf_event::ctx will issue:
927 *
928 * perf_remove_from_context();
929 * synchronize_rcu();
930 * perf_install_in_context();
931 *
932 * to effect the change. The remove_from_context() + synchronize_rcu() should
933 * quiesce the event, after which we can install it in the new location. This
934 * means that only external vectors (perf_fops, prctl) can perturb the event
935 * while in transit. Therefore all such accessors should also acquire
936 * perf_event_context::mutex to serialize against this.
937 *
938 * However; because event->ctx can change while we're waiting to acquire
939 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
940 * function.
941 *
942 * Lock order:
943 * task_struct::perf_event_mutex
944 * perf_event_context::mutex
945 * perf_event_context::lock
946 * perf_event::child_mutex;
947 * perf_event::mmap_mutex
948 * mmap_sem
949 */
950 static struct perf_event_context *
951 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
952 {
953 struct perf_event_context *ctx;
954
955 again:
956 rcu_read_lock();
957 ctx = ACCESS_ONCE(event->ctx);
958 if (!atomic_inc_not_zero(&ctx->refcount)) {
959 rcu_read_unlock();
960 goto again;
961 }
962 rcu_read_unlock();
963
964 mutex_lock_nested(&ctx->mutex, nesting);
965 if (event->ctx != ctx) {
966 mutex_unlock(&ctx->mutex);
967 put_ctx(ctx);
968 goto again;
969 }
970
971 return ctx;
972 }
973
974 static inline struct perf_event_context *
975 perf_event_ctx_lock(struct perf_event *event)
976 {
977 return perf_event_ctx_lock_nested(event, 0);
978 }
979
980 static void perf_event_ctx_unlock(struct perf_event *event,
981 struct perf_event_context *ctx)
982 {
983 mutex_unlock(&ctx->mutex);
984 put_ctx(ctx);
985 }
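/*
 * Usage sketch for the two helpers above (illustrative; it mirrors what
 * perf_event_disable() and the other external entry points do):
 *
 *	struct perf_event_context *ctx;
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... event->ctx is now stable and ctx->mutex is held ...
 *	perf_event_ctx_unlock(event, ctx);
 */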
986
987 /*
988 * This must be done under the ctx->lock, so as to serialize against
989 * context_equiv(); therefore we cannot call put_ctx() since that might end up
990 * acquiring scheduler related locks while ctx->lock nests inside those.
991 */
992 static __must_check struct perf_event_context *
993 unclone_ctx(struct perf_event_context *ctx)
994 {
995 struct perf_event_context *parent_ctx = ctx->parent_ctx;
996
997 lockdep_assert_held(&ctx->lock);
998
999 if (parent_ctx)
1000 ctx->parent_ctx = NULL;
1001 ctx->generation++;
1002
1003 return parent_ctx;
1004 }
1005
1006 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1007 {
1008 /*
1009 * only top level events have the pid namespace they were created in
1010 */
1011 if (event->parent)
1012 event = event->parent;
1013
1014 return task_tgid_nr_ns(p, event->ns);
1015 }
1016
1017 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1018 {
1019 /*
1020 * only top level events have the pid namespace they were created in
1021 */
1022 if (event->parent)
1023 event = event->parent;
1024
1025 return task_pid_nr_ns(p, event->ns);
1026 }
1027
1028 /*
1029 * If we inherit events we want to return the parent event id
1030 * to userspace.
1031 */
1032 static u64 primary_event_id(struct perf_event *event)
1033 {
1034 u64 id = event->id;
1035
1036 if (event->parent)
1037 id = event->parent->id;
1038
1039 return id;
1040 }
1041
1042 /*
1043 * Get the perf_event_context for a task and lock it.
1044 * This has to cope with the fact that until it is locked,
1045 * the context could get moved to another task.
1046 */
1047 static struct perf_event_context *
1048 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1049 {
1050 struct perf_event_context *ctx;
1051
1052 retry:
1053 /*
1054 * One of the few rules of preemptible RCU is that one cannot do
1055 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1056 * part of the read side critical section was preemptible -- see
1057 * rcu_read_unlock_special().
1058 *
1059 * Since ctx->lock nests under rq->lock we must ensure the entire read
1060 * side critical section is non-preemptible.
1061 */
1062 preempt_disable();
1063 rcu_read_lock();
1064 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1065 if (ctx) {
1066 /*
1067 * If this context is a clone of another, it might
1068 * get swapped for another underneath us by
1069 * perf_event_task_sched_out, though the
1070 * rcu_read_lock() protects us from any context
1071 * getting freed. Lock the context and check if it
1072 * got swapped before we could get the lock, and retry
1073 * if so. If we locked the right context, then it
1074 * can't get swapped on us any more.
1075 */
1076 raw_spin_lock_irqsave(&ctx->lock, *flags);
1077 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1078 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1079 rcu_read_unlock();
1080 preempt_enable();
1081 goto retry;
1082 }
1083
1084 if (!atomic_inc_not_zero(&ctx->refcount)) {
1085 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1086 ctx = NULL;
1087 }
1088 }
1089 rcu_read_unlock();
1090 preempt_enable();
1091 return ctx;
1092 }
1093
1094 /*
1095 * Get the context for a task and increment its pin_count so it
1096 * can't get swapped to another task. This also increments its
1097 * reference count so that the context can't get freed.
1098 */
1099 static struct perf_event_context *
1100 perf_pin_task_context(struct task_struct *task, int ctxn)
1101 {
1102 struct perf_event_context *ctx;
1103 unsigned long flags;
1104
1105 ctx = perf_lock_task_context(task, ctxn, &flags);
1106 if (ctx) {
1107 ++ctx->pin_count;
1108 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1109 }
1110 return ctx;
1111 }
1112
1113 static void perf_unpin_context(struct perf_event_context *ctx)
1114 {
1115 unsigned long flags;
1116
1117 raw_spin_lock_irqsave(&ctx->lock, flags);
1118 --ctx->pin_count;
1119 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1120 }
1121
1122 /*
1123 * Update the record of the current time in a context.
1124 */
1125 static void update_context_time(struct perf_event_context *ctx)
1126 {
1127 u64 now = perf_clock();
1128
1129 ctx->time += now - ctx->timestamp;
1130 ctx->timestamp = now;
1131 }
1132
1133 static u64 perf_event_time(struct perf_event *event)
1134 {
1135 struct perf_event_context *ctx = event->ctx;
1136
1137 if (is_cgroup_event(event))
1138 return perf_cgroup_event_time(event);
1139
1140 return ctx ? ctx->time : 0;
1141 }
1142
1143 /*
1144 * Update the total_time_enabled and total_time_running fields for an event.
1145 * The caller of this function needs to hold the ctx->lock.
1146 */
1147 static void update_event_times(struct perf_event *event)
1148 {
1149 struct perf_event_context *ctx = event->ctx;
1150 u64 run_end;
1151
1152 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1153 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1154 return;
1155 /*
1156 * in cgroup mode, time_enabled represents
1157 * the time the event was enabled AND active
1158 * tasks were in the monitored cgroup. This is
1159 * independent of the activity of the context as
1160 * there may be a mix of cgroup and non-cgroup events.
1161 *
1162 * That is why we treat cgroup events differently
1163 * here.
1164 */
1165 if (is_cgroup_event(event))
1166 run_end = perf_cgroup_event_time(event);
1167 else if (ctx->is_active)
1168 run_end = ctx->time;
1169 else
1170 run_end = event->tstamp_stopped;
1171
1172 event->total_time_enabled = run_end - event->tstamp_enabled;
1173
1174 if (event->state == PERF_EVENT_STATE_INACTIVE)
1175 run_end = event->tstamp_stopped;
1176 else
1177 run_end = perf_event_time(event);
1178
1179 event->total_time_running = run_end - event->tstamp_running;
1180
1181 }
1182
1183 /*
1184 * Update total_time_enabled and total_time_running for all events in a group.
1185 */
1186 static void update_group_times(struct perf_event *leader)
1187 {
1188 struct perf_event *event;
1189
1190 update_event_times(leader);
1191 list_for_each_entry(event, &leader->sibling_list, group_entry)
1192 update_event_times(event);
1193 }
1194
1195 static struct list_head *
1196 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1197 {
1198 if (event->attr.pinned)
1199 return &ctx->pinned_groups;
1200 else
1201 return &ctx->flexible_groups;
1202 }
1203
1204 /*
1205 * Add an event to the lists for its context.
1206 * Must be called with ctx->mutex and ctx->lock held.
1207 */
1208 static void
1209 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1210 {
1211 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1212 event->attach_state |= PERF_ATTACH_CONTEXT;
1213
1214 /*
1215 * If we're a standalone event or group leader, we go to the context
1216 * list; group events are kept attached to the group so that
1217 * perf_group_detach can, at all times, locate all siblings.
1218 */
1219 if (event->group_leader == event) {
1220 struct list_head *list;
1221
1222 if (is_software_event(event))
1223 event->group_flags |= PERF_GROUP_SOFTWARE;
1224
1225 list = ctx_group_list(event, ctx);
1226 list_add_tail(&event->group_entry, list);
1227 }
1228
1229 if (is_cgroup_event(event))
1230 ctx->nr_cgroups++;
1231
1232 list_add_rcu(&event->event_entry, &ctx->event_list);
1233 ctx->nr_events++;
1234 if (event->attr.inherit_stat)
1235 ctx->nr_stat++;
1236
1237 ctx->generation++;
1238 }
1239
1240 /*
1241 * Initialize event state based on the perf_event_attr::disabled.
1242 */
1243 static inline void perf_event__state_init(struct perf_event *event)
1244 {
1245 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1246 PERF_EVENT_STATE_INACTIVE;
1247 }
1248
1249 /*
1250 * Called at perf_event creation and when events are attached/detached from a
1251 * group.
1252 */
1253 static void perf_event__read_size(struct perf_event *event)
1254 {
1255 int entry = sizeof(u64); /* value */
1256 int size = 0;
1257 int nr = 1;
1258
1259 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1260 size += sizeof(u64);
1261
1262 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1263 size += sizeof(u64);
1264
1265 if (event->attr.read_format & PERF_FORMAT_ID)
1266 entry += sizeof(u64);
1267
1268 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1269 nr += event->group_leader->nr_siblings;
1270 size += sizeof(u64);
1271 }
1272
1273 size += entry * nr;
1274 event->read_size = size;
1275 }
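/*
 * Worked example for perf_event__read_size(): a non-group event with
 * read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 *               PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID
 * yields entry = 16 (value + id), size = 16 (the two time fields) and
 * nr = 1, so read_size = 32 bytes, which is the size userspace read()s
 * from the event fd.
 */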
1276
1277 static void perf_event__header_size(struct perf_event *event)
1278 {
1279 struct perf_sample_data *data;
1280 u64 sample_type = event->attr.sample_type;
1281 u16 size = 0;
1282
1283 perf_event__read_size(event);
1284
1285 if (sample_type & PERF_SAMPLE_IP)
1286 size += sizeof(data->ip);
1287
1288 if (sample_type & PERF_SAMPLE_ADDR)
1289 size += sizeof(data->addr);
1290
1291 if (sample_type & PERF_SAMPLE_PERIOD)
1292 size += sizeof(data->period);
1293
1294 if (sample_type & PERF_SAMPLE_WEIGHT)
1295 size += sizeof(data->weight);
1296
1297 if (sample_type & PERF_SAMPLE_READ)
1298 size += event->read_size;
1299
1300 if (sample_type & PERF_SAMPLE_DATA_SRC)
1301 size += sizeof(data->data_src.val);
1302
1303 if (sample_type & PERF_SAMPLE_TRANSACTION)
1304 size += sizeof(data->txn);
1305
1306 event->header_size = size;
1307 }
1308
1309 static void perf_event__id_header_size(struct perf_event *event)
1310 {
1311 struct perf_sample_data *data;
1312 u64 sample_type = event->attr.sample_type;
1313 u16 size = 0;
1314
1315 if (sample_type & PERF_SAMPLE_TID)
1316 size += sizeof(data->tid_entry);
1317
1318 if (sample_type & PERF_SAMPLE_TIME)
1319 size += sizeof(data->time);
1320
1321 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1322 size += sizeof(data->id);
1323
1324 if (sample_type & PERF_SAMPLE_ID)
1325 size += sizeof(data->id);
1326
1327 if (sample_type & PERF_SAMPLE_STREAM_ID)
1328 size += sizeof(data->stream_id);
1329
1330 if (sample_type & PERF_SAMPLE_CPU)
1331 size += sizeof(data->cpu_entry);
1332
1333 event->id_header_size = size;
1334 }
1335
1336 static void perf_group_attach(struct perf_event *event)
1337 {
1338 struct perf_event *group_leader = event->group_leader, *pos;
1339
1340 /*
1341 * We can have double attach due to group movement in perf_event_open.
1342 */
1343 if (event->attach_state & PERF_ATTACH_GROUP)
1344 return;
1345
1346 event->attach_state |= PERF_ATTACH_GROUP;
1347
1348 if (group_leader == event)
1349 return;
1350
1351 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1352
1353 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1354 !is_software_event(event))
1355 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1356
1357 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1358 group_leader->nr_siblings++;
1359
1360 perf_event__header_size(group_leader);
1361
1362 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1363 perf_event__header_size(pos);
1364 }
1365
1366 /*
1367 * Remove an event from the lists for its context.
1368 * Must be called with ctx->mutex and ctx->lock held.
1369 */
1370 static void
1371 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1372 {
1373 struct perf_cpu_context *cpuctx;
1374
1375 WARN_ON_ONCE(event->ctx != ctx);
1376 lockdep_assert_held(&ctx->lock);
1377
1378 /*
1379 * We can have double detach due to exit/hot-unplug + close.
1380 */
1381 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1382 return;
1383
1384 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1385
1386 if (is_cgroup_event(event)) {
1387 ctx->nr_cgroups--;
1388 cpuctx = __get_cpu_context(ctx);
1389 /*
1390 * if there are no more cgroup events
1391 * then clear cgrp to avoid a stale pointer
1392 * in update_cgrp_time_from_cpuctx()
1393 */
1394 if (!ctx->nr_cgroups)
1395 cpuctx->cgrp = NULL;
1396 }
1397
1398 ctx->nr_events--;
1399 if (event->attr.inherit_stat)
1400 ctx->nr_stat--;
1401
1402 list_del_rcu(&event->event_entry);
1403
1404 if (event->group_leader == event)
1405 list_del_init(&event->group_entry);
1406
1407 update_group_times(event);
1408
1409 /*
1410 * If event was in error state, then keep it
1411 * that way, otherwise bogus counts will be
1412 * returned on read(). The only way to get out
1413 * of error state is by explicit re-enabling
1414 * of the event
1415 */
1416 if (event->state > PERF_EVENT_STATE_OFF)
1417 event->state = PERF_EVENT_STATE_OFF;
1418
1419 ctx->generation++;
1420 }
1421
1422 static void perf_group_detach(struct perf_event *event)
1423 {
1424 struct perf_event *sibling, *tmp;
1425 struct list_head *list = NULL;
1426
1427 /*
1428 * We can have double detach due to exit/hot-unplug + close.
1429 */
1430 if (!(event->attach_state & PERF_ATTACH_GROUP))
1431 return;
1432
1433 event->attach_state &= ~PERF_ATTACH_GROUP;
1434
1435 /*
1436 * If this is a sibling, remove it from its group.
1437 */
1438 if (event->group_leader != event) {
1439 list_del_init(&event->group_entry);
1440 event->group_leader->nr_siblings--;
1441 goto out;
1442 }
1443
1444 if (!list_empty(&event->group_entry))
1445 list = &event->group_entry;
1446
1447 /*
1448 * If this was a group event with sibling events then
1449 * upgrade the siblings to singleton events by adding them
1450 * to whatever list we are on.
1451 */
1452 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1453 if (list)
1454 list_move_tail(&sibling->group_entry, list);
1455 sibling->group_leader = sibling;
1456
1457 /* Inherit group flags from the previous leader */
1458 sibling->group_flags = event->group_flags;
1459
1460 WARN_ON_ONCE(sibling->ctx != event->ctx);
1461 }
1462
1463 out:
1464 perf_event__header_size(event->group_leader);
1465
1466 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1467 perf_event__header_size(tmp);
1468 }
1469
1470 /*
1471 * User event without the task.
1472 */
1473 static bool is_orphaned_event(struct perf_event *event)
1474 {
1475 return event && !is_kernel_event(event) && !event->owner;
1476 }
1477
1478 /*
1479 * Event has a parent but parent's task finished and it's
1480 * alive only because of children holding a reference.
1481 */
1482 static bool is_orphaned_child(struct perf_event *event)
1483 {
1484 return is_orphaned_event(event->parent);
1485 }
1486
1487 static void orphans_remove_work(struct work_struct *work);
1488
1489 static void schedule_orphans_remove(struct perf_event_context *ctx)
1490 {
1491 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1492 return;
1493
1494 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1495 get_ctx(ctx);
1496 ctx->orphans_remove_sched = true;
1497 }
1498 }
1499
1500 static int __init perf_workqueue_init(void)
1501 {
1502 perf_wq = create_singlethread_workqueue("perf");
1503 WARN(!perf_wq, "failed to create perf workqueue\n");
1504 return perf_wq ? 0 : -1;
1505 }
1506
1507 core_initcall(perf_workqueue_init);
1508
1509 static inline int
1510 event_filter_match(struct perf_event *event)
1511 {
1512 return (event->cpu == -1 || event->cpu == smp_processor_id())
1513 && perf_cgroup_match(event);
1514 }
1515
1516 static void
1517 event_sched_out(struct perf_event *event,
1518 struct perf_cpu_context *cpuctx,
1519 struct perf_event_context *ctx)
1520 {
1521 u64 tstamp = perf_event_time(event);
1522 u64 delta;
1523
1524 WARN_ON_ONCE(event->ctx != ctx);
1525 lockdep_assert_held(&ctx->lock);
1526
1527 /*
1528 * An event which could not be activated because of
1529 * filter mismatch still needs to have its timings
1530 * maintained, otherwise bogus information is returned
1531 * via read() for time_enabled, time_running:
1532 */
1533 if (event->state == PERF_EVENT_STATE_INACTIVE
1534 && !event_filter_match(event)) {
1535 delta = tstamp - event->tstamp_stopped;
1536 event->tstamp_running += delta;
1537 event->tstamp_stopped = tstamp;
1538 }
1539
1540 if (event->state != PERF_EVENT_STATE_ACTIVE)
1541 return;
1542
1543 perf_pmu_disable(event->pmu);
1544
1545 event->state = PERF_EVENT_STATE_INACTIVE;
1546 if (event->pending_disable) {
1547 event->pending_disable = 0;
1548 event->state = PERF_EVENT_STATE_OFF;
1549 }
1550 event->tstamp_stopped = tstamp;
1551 event->pmu->del(event, 0);
1552 event->oncpu = -1;
1553
1554 if (!is_software_event(event))
1555 cpuctx->active_oncpu--;
1556 if (!--ctx->nr_active)
1557 perf_event_ctx_deactivate(ctx);
1558 if (event->attr.freq && event->attr.sample_freq)
1559 ctx->nr_freq--;
1560 if (event->attr.exclusive || !cpuctx->active_oncpu)
1561 cpuctx->exclusive = 0;
1562
1563 if (is_orphaned_child(event))
1564 schedule_orphans_remove(ctx);
1565
1566 perf_pmu_enable(event->pmu);
1567 }
1568
1569 static void
1570 group_sched_out(struct perf_event *group_event,
1571 struct perf_cpu_context *cpuctx,
1572 struct perf_event_context *ctx)
1573 {
1574 struct perf_event *event;
1575 int state = group_event->state;
1576
1577 event_sched_out(group_event, cpuctx, ctx);
1578
1579 /*
1580 * Schedule out siblings (if any):
1581 */
1582 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1583 event_sched_out(event, cpuctx, ctx);
1584
1585 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1586 cpuctx->exclusive = 0;
1587 }
1588
1589 struct remove_event {
1590 struct perf_event *event;
1591 bool detach_group;
1592 };
1593
1594 /*
1595 * Cross CPU call to remove a performance event
1596 *
1597 * We disable the event on the hardware level first. After that we
1598 * remove it from the context list.
1599 */
1600 static int __perf_remove_from_context(void *info)
1601 {
1602 struct remove_event *re = info;
1603 struct perf_event *event = re->event;
1604 struct perf_event_context *ctx = event->ctx;
1605 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1606
1607 raw_spin_lock(&ctx->lock);
1608 event_sched_out(event, cpuctx, ctx);
1609 if (re->detach_group)
1610 perf_group_detach(event);
1611 list_del_event(event, ctx);
1612 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1613 ctx->is_active = 0;
1614 cpuctx->task_ctx = NULL;
1615 }
1616 raw_spin_unlock(&ctx->lock);
1617
1618 return 0;
1619 }
1620
1621
1622 /*
1623 * Remove the event from a task's (or a CPU's) list of events.
1624 *
1625 * CPU events are removed with a smp call. For task events we only
1626 * call when the task is on a CPU.
1627 *
1628 * If event->ctx is a cloned context, callers must make sure that
1629 * every task struct that event->ctx->task could possibly point to
1630 * remains valid. This is OK when called from perf_release since
1631 * that only calls us on the top-level context, which can't be a clone.
1632 * When called from perf_event_exit_task, it's OK because the
1633 * context has been detached from its task.
1634 */
1635 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1636 {
1637 struct perf_event_context *ctx = event->ctx;
1638 struct task_struct *task = ctx->task;
1639 struct remove_event re = {
1640 .event = event,
1641 .detach_group = detach_group,
1642 };
1643
1644 lockdep_assert_held(&ctx->mutex);
1645
1646 if (!task) {
1647 /*
1648 * Per cpu events are removed via an smp call. The removal can
1649 * fail if the CPU is currently offline, but in that case we
1650 * already called __perf_remove_from_context from
1651 * perf_event_exit_cpu.
1652 */
1653 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1654 return;
1655 }
1656
1657 retry:
1658 if (!task_function_call(task, __perf_remove_from_context, &re))
1659 return;
1660
1661 raw_spin_lock_irq(&ctx->lock);
1662 /*
1663 * If we failed to find a running task, but find the context active now
1664 * that we've acquired the ctx->lock, retry.
1665 */
1666 if (ctx->is_active) {
1667 raw_spin_unlock_irq(&ctx->lock);
1668 /*
1669 * Reload the task pointer, it might have been changed by
1670 * a concurrent perf_event_context_sched_out().
1671 */
1672 task = ctx->task;
1673 goto retry;
1674 }
1675
1676 /*
1677 * Since the task isn't running, it's safe to remove the event; our
1678 * holding the ctx->lock ensures the task won't get scheduled in.
1679 */
1680 if (detach_group)
1681 perf_group_detach(event);
1682 list_del_event(event, ctx);
1683 raw_spin_unlock_irq(&ctx->lock);
1684 }
1685
1686 /*
1687 * Cross CPU call to disable a performance event
1688 */
1689 int __perf_event_disable(void *info)
1690 {
1691 struct perf_event *event = info;
1692 struct perf_event_context *ctx = event->ctx;
1693 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1694
1695 /*
1696 * If this is a per-task event, we need to check whether this
1697 * event's task is the current task on this cpu.
1698 *
1699 * Can trigger due to concurrent perf_event_context_sched_out()
1700 * flipping contexts around.
1701 */
1702 if (ctx->task && cpuctx->task_ctx != ctx)
1703 return -EINVAL;
1704
1705 raw_spin_lock(&ctx->lock);
1706
1707 /*
1708 * If the event is on, turn it off.
1709 * If it is in error state, leave it in error state.
1710 */
1711 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1712 update_context_time(ctx);
1713 update_cgrp_time_from_event(event);
1714 update_group_times(event);
1715 if (event == event->group_leader)
1716 group_sched_out(event, cpuctx, ctx);
1717 else
1718 event_sched_out(event, cpuctx, ctx);
1719 event->state = PERF_EVENT_STATE_OFF;
1720 }
1721
1722 raw_spin_unlock(&ctx->lock);
1723
1724 return 0;
1725 }
1726
1727 /*
1728 * Disable an event.
1729 *
1730 * If event->ctx is a cloned context, callers must make sure that
1731 * every task struct that event->ctx->task could possibly point to
1732 * remains valid. This condition is satisfied when called through
1733 * perf_event_for_each_child or perf_event_for_each because they
1734 * hold the top-level event's child_mutex, so any descendant that
1735 * goes to exit will block in sync_child_event.
1736 * When called from perf_pending_event it's OK because event->ctx
1737 * is the current context on this CPU and preemption is disabled,
1738 * hence we can't get into perf_event_task_sched_out for this context.
1739 */
1740 static void _perf_event_disable(struct perf_event *event)
1741 {
1742 struct perf_event_context *ctx = event->ctx;
1743 struct task_struct *task = ctx->task;
1744
1745 if (!task) {
1746 /*
1747 * Disable the event on the cpu that it's on
1748 */
1749 cpu_function_call(event->cpu, __perf_event_disable, event);
1750 return;
1751 }
1752
1753 retry:
1754 if (!task_function_call(task, __perf_event_disable, event))
1755 return;
1756
1757 raw_spin_lock_irq(&ctx->lock);
1758 /*
1759 * If the event is still active, we need to retry the cross-call.
1760 */
1761 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1762 raw_spin_unlock_irq(&ctx->lock);
1763 /*
1764 * Reload the task pointer, it might have been changed by
1765 * a concurrent perf_event_context_sched_out().
1766 */
1767 task = ctx->task;
1768 goto retry;
1769 }
1770
1771 /*
1772 * Since we have the lock this context can't be scheduled
1773 * in, so we can change the state safely.
1774 */
1775 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1776 update_group_times(event);
1777 event->state = PERF_EVENT_STATE_OFF;
1778 }
1779 raw_spin_unlock_irq(&ctx->lock);
1780 }
1781
1782 /*
1783 * Strictly speaking kernel users cannot create groups and therefore this
1784 * interface does not need the perf_event_ctx_lock() magic.
1785 */
1786 void perf_event_disable(struct perf_event *event)
1787 {
1788 struct perf_event_context *ctx;
1789
1790 ctx = perf_event_ctx_lock(event);
1791 _perf_event_disable(event);
1792 perf_event_ctx_unlock(event, ctx);
1793 }
1794 EXPORT_SYMBOL_GPL(perf_event_disable);
1795
1796 static void perf_set_shadow_time(struct perf_event *event,
1797 struct perf_event_context *ctx,
1798 u64 tstamp)
1799 {
1800 /*
1801 * use the correct time source for the time snapshot
1802 *
1803 * We could get by without this by leveraging the
1804 * fact that to get to this function, the caller
1805 * has most likely already called update_context_time()
1806 * and update_cgrp_time_xx() and thus both timestamp
1807 * are identical (or very close). Given that tstamp is,
1808 * already adjusted for cgroup, we could say that:
1809 * tstamp - ctx->timestamp
1810 * is equivalent to
1811 * tstamp - cgrp->timestamp.
1812 *
1813 * Then, in perf_output_read(), the calculation would
1814 * work with no changes because:
1815 * - event is guaranteed scheduled in
1816 * - no scheduled out in between
1817 * - thus the timestamp would be the same
1818 *
1819 * But this is a bit hairy.
1820 *
1821 * So instead, we have an explicit cgroup call to remain
1822 * within the time source all along. We believe it
1823 * is cleaner and simpler to understand.
1824 */
1825 if (is_cgroup_event(event))
1826 perf_cgroup_set_shadow_time(event, tstamp);
1827 else
1828 event->shadow_ctx_time = tstamp - ctx->timestamp;
1829 }
1830
1831 #define MAX_INTERRUPTS (~0ULL)
1832
1833 static void perf_log_throttle(struct perf_event *event, int enable);
1834 static void perf_log_itrace_start(struct perf_event *event);
1835
1836 static int
1837 event_sched_in(struct perf_event *event,
1838 struct perf_cpu_context *cpuctx,
1839 struct perf_event_context *ctx)
1840 {
1841 u64 tstamp = perf_event_time(event);
1842 int ret = 0;
1843
1844 lockdep_assert_held(&ctx->lock);
1845
1846 if (event->state <= PERF_EVENT_STATE_OFF)
1847 return 0;
1848
1849 event->state = PERF_EVENT_STATE_ACTIVE;
1850 event->oncpu = smp_processor_id();
1851
1852 /*
1853 * Unthrottle events: since we were scheduled in we might have missed
1854 * several ticks already, and for a heavily scheduling task there is little
1855 * guarantee it'll get a tick in a timely manner.
1856 */
1857 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1858 perf_log_throttle(event, 1);
1859 event->hw.interrupts = 0;
1860 }
1861
1862 /*
1863 * The new state must be visible before we turn it on in the hardware:
1864 */
1865 smp_wmb();
1866
1867 perf_pmu_disable(event->pmu);
1868
1869 event->tstamp_running += tstamp - event->tstamp_stopped;
1870
1871 perf_set_shadow_time(event, ctx, tstamp);
1872
1873 perf_log_itrace_start(event);
1874
1875 if (event->pmu->add(event, PERF_EF_START)) {
1876 event->state = PERF_EVENT_STATE_INACTIVE;
1877 event->oncpu = -1;
1878 ret = -EAGAIN;
1879 goto out;
1880 }
1881
1882 if (!is_software_event(event))
1883 cpuctx->active_oncpu++;
1884 if (!ctx->nr_active++)
1885 perf_event_ctx_activate(ctx);
1886 if (event->attr.freq && event->attr.sample_freq)
1887 ctx->nr_freq++;
1888
1889 if (event->attr.exclusive)
1890 cpuctx->exclusive = 1;
1891
1892 if (is_orphaned_child(event))
1893 schedule_orphans_remove(ctx);
1894
1895 out:
1896 perf_pmu_enable(event->pmu);
1897
1898 return ret;
1899 }
1900
1901 static int
1902 group_sched_in(struct perf_event *group_event,
1903 struct perf_cpu_context *cpuctx,
1904 struct perf_event_context *ctx)
1905 {
1906 struct perf_event *event, *partial_group = NULL;
1907 struct pmu *pmu = ctx->pmu;
1908 u64 now = ctx->time;
1909 bool simulate = false;
1910
1911 if (group_event->state == PERF_EVENT_STATE_OFF)
1912 return 0;
1913
1914 pmu->start_txn(pmu);
1915
1916 if (event_sched_in(group_event, cpuctx, ctx)) {
1917 pmu->cancel_txn(pmu);
1918 perf_cpu_hrtimer_restart(cpuctx);
1919 return -EAGAIN;
1920 }
1921
1922 /*
1923 * Schedule in siblings as one group (if any):
1924 */
1925 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1926 if (event_sched_in(event, cpuctx, ctx)) {
1927 partial_group = event;
1928 goto group_error;
1929 }
1930 }
1931
1932 if (!pmu->commit_txn(pmu))
1933 return 0;
1934
1935 group_error:
1936 /*
1937 * Groups can be scheduled in as one unit only, so undo any
1938 * partial group before returning:
1939 * The events up to the failed event are scheduled out normally,
1940 * tstamp_stopped will be updated.
1941 *
1942 * The failed events and the remaining siblings need to have
1943 * their timings updated as if they had gone through event_sched_in()
1944 * and event_sched_out(). This is required to get consistent timings
1945 * across the group. This also takes care of the case where the group
1946 * could never be scheduled by ensuring tstamp_stopped is set to mark
1947 * the time the event was actually stopped, such that time delta
1948 * calculation in update_event_times() is correct.
1949 */
1950 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1951 if (event == partial_group)
1952 simulate = true;
1953
1954 if (simulate) {
1955 event->tstamp_running += now - event->tstamp_stopped;
1956 event->tstamp_stopped = now;
1957 } else {
1958 event_sched_out(event, cpuctx, ctx);
1959 }
1960 }
1961 event_sched_out(group_event, cpuctx, ctx);
1962
1963 pmu->cancel_txn(pmu);
1964
1965 perf_cpu_hrtimer_restart(cpuctx);
1966
1967 return -EAGAIN;
1968 }
1969
1970 /*
1971 * Work out whether we can put this event group on the CPU now.
1972 */
1973 static int group_can_go_on(struct perf_event *event,
1974 struct perf_cpu_context *cpuctx,
1975 int can_add_hw)
1976 {
1977 /*
1978 * Groups consisting entirely of software events can always go on.
1979 */
1980 if (event->group_flags & PERF_GROUP_SOFTWARE)
1981 return 1;
1982 /*
1983 * If an exclusive group is already on, no other hardware
1984 * events can go on.
1985 */
1986 if (cpuctx->exclusive)
1987 return 0;
1988 /*
1989 * If this group is exclusive and there are already
1990 * events on the CPU, it can't go on.
1991 */
1992 if (event->attr.exclusive && cpuctx->active_oncpu)
1993 return 0;
1994 /*
1995 * Otherwise, try to add it if all previous groups were able
1996 * to go on.
1997 */
1998 return can_add_hw;
1999 }
2000
2001 static void add_event_to_ctx(struct perf_event *event,
2002 struct perf_event_context *ctx)
2003 {
2004 u64 tstamp = perf_event_time(event);
2005
2006 list_add_event(event, ctx);
2007 perf_group_attach(event);
2008 event->tstamp_enabled = tstamp;
2009 event->tstamp_running = tstamp;
2010 event->tstamp_stopped = tstamp;
2011 }
2012
2013 static void task_ctx_sched_out(struct perf_event_context *ctx);
2014 static void
2015 ctx_sched_in(struct perf_event_context *ctx,
2016 struct perf_cpu_context *cpuctx,
2017 enum event_type_t event_type,
2018 struct task_struct *task);
2019
2020 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2021 struct perf_event_context *ctx,
2022 struct task_struct *task)
2023 {
2024 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2025 if (ctx)
2026 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2027 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2028 if (ctx)
2029 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2030 }
2031
2032 /*
2033 * Cross CPU call to install and enable a performance event
2034 *
2035 * Must be called with ctx->mutex held
2036 */
2037 static int __perf_install_in_context(void *info)
2038 {
2039 struct perf_event *event = info;
2040 struct perf_event_context *ctx = event->ctx;
2041 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2042 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2043 struct task_struct *task = current;
2044
2045 perf_ctx_lock(cpuctx, task_ctx);
2046 perf_pmu_disable(cpuctx->ctx.pmu);
2047
2048 /*
2049 * If there was an active task_ctx schedule it out.
2050 */
2051 if (task_ctx)
2052 task_ctx_sched_out(task_ctx);
2053
2054 /*
2055 * If the context we're installing events in is not the
2056 * active task_ctx, flip them.
2057 */
2058 if (ctx->task && task_ctx != ctx) {
2059 if (task_ctx)
2060 raw_spin_unlock(&task_ctx->lock);
2061 raw_spin_lock(&ctx->lock);
2062 task_ctx = ctx;
2063 }
2064
2065 if (task_ctx) {
2066 cpuctx->task_ctx = task_ctx;
2067 task = task_ctx->task;
2068 }
2069
2070 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2071
2072 update_context_time(ctx);
2073 /*
2074 * update cgrp time only if current cgrp
2075 * matches event->cgrp. Must be done before
2076 * calling add_event_to_ctx()
2077 */
2078 update_cgrp_time_from_event(event);
2079
2080 add_event_to_ctx(event, ctx);
2081
2082 /*
2083 * Schedule everything back in
2084 */
2085 perf_event_sched_in(cpuctx, task_ctx, task);
2086
2087 perf_pmu_enable(cpuctx->ctx.pmu);
2088 perf_ctx_unlock(cpuctx, task_ctx);
2089
2090 return 0;
2091 }
2092
2093 /*
2094 * Attach a performance event to a context
2095 *
2096 * First we add the event to the list with the hardware enable bit
2097 * in event->hw_config cleared.
2098 *
2099 * If the event is attached to a task which is on a CPU we use an smp
2100 * call to enable it in the task context. The task might have been
2101 * scheduled away, but we check this in the smp call again.
2102 */
2103 static void
2104 perf_install_in_context(struct perf_event_context *ctx,
2105 struct perf_event *event,
2106 int cpu)
2107 {
2108 struct task_struct *task = ctx->task;
2109
2110 lockdep_assert_held(&ctx->mutex);
2111
2112 event->ctx = ctx;
2113 if (event->cpu != -1)
2114 event->cpu = cpu;
2115
2116 if (!task) {
2117 /*
2118 * Per cpu events are installed via an smp call and
2119 * the install is always successful.
2120 */
2121 cpu_function_call(cpu, __perf_install_in_context, event);
2122 return;
2123 }
2124
2125 retry:
2126 if (!task_function_call(task, __perf_install_in_context, event))
2127 return;
2128
2129 raw_spin_lock_irq(&ctx->lock);
2130 /*
2131 * If we failed to find a running task, but find the context active now
2132 * that we've acquired the ctx->lock, retry.
2133 */
2134 if (ctx->is_active) {
2135 raw_spin_unlock_irq(&ctx->lock);
2136 /*
2137 * Reload the task pointer, it might have been changed by
2138 * a concurrent perf_event_context_sched_out().
2139 */
2140 task = ctx->task;
2141 goto retry;
2142 }
2143
2144 /*
2145 * Since the task isn't running, it's safe to add the event; holding
2146 * the ctx->lock ensures the task won't get scheduled in.
2147 */
2148 add_event_to_ctx(event, ctx);
2149 raw_spin_unlock_irq(&ctx->lock);
2150 }
2151
2152 /*
2153 * Put an event into inactive state and update time fields.
2154 * Enabling the leader of a group effectively enables all
2155 * the group members that aren't explicitly disabled, so we
2156 * have to update their ->tstamp_enabled also.
2157 * Note: this works for group members as well as group leaders
2158 * since the non-leader members' sibling_lists will be empty.
2159 */
2160 static void __perf_event_mark_enabled(struct perf_event *event)
2161 {
2162 struct perf_event *sub;
2163 u64 tstamp = perf_event_time(event);
2164
2165 event->state = PERF_EVENT_STATE_INACTIVE;
2166 event->tstamp_enabled = tstamp - event->total_time_enabled;
2167 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2168 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2169 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2170 }
2171 }
2172
2173 /*
2174 * Cross CPU call to enable a performance event
2175 */
2176 static int __perf_event_enable(void *info)
2177 {
2178 struct perf_event *event = info;
2179 struct perf_event_context *ctx = event->ctx;
2180 struct perf_event *leader = event->group_leader;
2181 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2182 int err;
2183
2184 /*
2185 * There's a time window between 'ctx->is_active' check
2186 * in the perf_event_enable() function and this place having:
2187 * - IRQs on
2188 * - ctx->lock unlocked
2189 *
2190 * where the task could be killed and 'ctx' deactivated
2191 * by perf_event_exit_task.
2192 */
2193 if (!ctx->is_active)
2194 return -EINVAL;
2195
2196 raw_spin_lock(&ctx->lock);
2197 update_context_time(ctx);
2198
2199 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2200 goto unlock;
2201
2202 /*
2203 * set current task's cgroup time reference point
2204 */
2205 perf_cgroup_set_timestamp(current, ctx);
2206
2207 __perf_event_mark_enabled(event);
2208
2209 if (!event_filter_match(event)) {
2210 if (is_cgroup_event(event))
2211 perf_cgroup_defer_enabled(event);
2212 goto unlock;
2213 }
2214
2215 /*
2216 * If the event is in a group and isn't the group leader,
2217 * then don't put it on unless the group is on.
2218 */
2219 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2220 goto unlock;
2221
2222 if (!group_can_go_on(event, cpuctx, 1)) {
2223 err = -EEXIST;
2224 } else {
2225 if (event == leader)
2226 err = group_sched_in(event, cpuctx, ctx);
2227 else
2228 err = event_sched_in(event, cpuctx, ctx);
2229 }
2230
2231 if (err) {
2232 /*
2233 * If this event can't go on and it's part of a
2234 * group, then the whole group has to come off.
2235 */
2236 if (leader != event) {
2237 group_sched_out(leader, cpuctx, ctx);
2238 perf_cpu_hrtimer_restart(cpuctx);
2239 }
2240 if (leader->attr.pinned) {
2241 update_group_times(leader);
2242 leader->state = PERF_EVENT_STATE_ERROR;
2243 }
2244 }
2245
2246 unlock:
2247 raw_spin_unlock(&ctx->lock);
2248
2249 return 0;
2250 }
2251
2252 /*
2253 * Enable an event.
2254 *
2255 * If event->ctx is a cloned context, callers must make sure that
2256 * every task struct that event->ctx->task could possibly point to
2257 * remains valid. This condition is satisfied when called through
2258 * perf_event_for_each_child or perf_event_for_each as described
2259 * for perf_event_disable.
2260 */
2261 static void _perf_event_enable(struct perf_event *event)
2262 {
2263 struct perf_event_context *ctx = event->ctx;
2264 struct task_struct *task = ctx->task;
2265
2266 if (!task) {
2267 /*
2268 * Enable the event on the cpu that it's on
2269 */
2270 cpu_function_call(event->cpu, __perf_event_enable, event);
2271 return;
2272 }
2273
2274 raw_spin_lock_irq(&ctx->lock);
2275 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2276 goto out;
2277
2278 /*
2279 * If the event is in error state, clear that first.
2280 * That way, if we see the event in error state below, we
2281 * know that it has gone back into error state, as distinct
2282 * from the task having been scheduled away before the
2283 * cross-call arrived.
2284 */
2285 if (event->state == PERF_EVENT_STATE_ERROR)
2286 event->state = PERF_EVENT_STATE_OFF;
2287
2288 retry:
2289 if (!ctx->is_active) {
2290 __perf_event_mark_enabled(event);
2291 goto out;
2292 }
2293
2294 raw_spin_unlock_irq(&ctx->lock);
2295
2296 if (!task_function_call(task, __perf_event_enable, event))
2297 return;
2298
2299 raw_spin_lock_irq(&ctx->lock);
2300
2301 /*
2302 * If the context is active and the event is still off,
2303 * we need to retry the cross-call.
2304 */
2305 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2306 /*
2307 * task could have been flipped by a concurrent
2308 * perf_event_context_sched_out()
2309 */
2310 task = ctx->task;
2311 goto retry;
2312 }
2313
2314 out:
2315 raw_spin_unlock_irq(&ctx->lock);
2316 }
2317
2318 /*
2319 * See perf_event_disable();
2320 */
2321 void perf_event_enable(struct perf_event *event)
2322 {
2323 struct perf_event_context *ctx;
2324
2325 ctx = perf_event_ctx_lock(event);
2326 _perf_event_enable(event);
2327 perf_event_ctx_unlock(event, ctx);
2328 }
2329 EXPORT_SYMBOL_GPL(perf_event_enable);
2330
2331 static int _perf_event_refresh(struct perf_event *event, int refresh)
2332 {
2333 /*
2334 * not supported on inherited events
2335 */
2336 if (event->attr.inherit || !is_sampling_event(event))
2337 return -EINVAL;
2338
2339 atomic_add(refresh, &event->event_limit);
2340 _perf_event_enable(event);
2341
2342 return 0;
2343 }
2344
2345 /*
2346 * See perf_event_disable()
2347 */
2348 int perf_event_refresh(struct perf_event *event, int refresh)
2349 {
2350 struct perf_event_context *ctx;
2351 int ret;
2352
2353 ctx = perf_event_ctx_lock(event);
2354 ret = _perf_event_refresh(event, refresh);
2355 perf_event_ctx_unlock(event, ctx);
2356
2357 return ret;
2358 }
2359 EXPORT_SYMBOL_GPL(perf_event_refresh);
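
From user space this path is reached through PERF_EVENT_IOC_REFRESH. A hedged sketch, assuming event_fd came from perf_event_open() for a sampling, non-inherited event (the helper name is invented): the count is passed directly as the ioctl argument, not by pointer, and once that many overflows have been delivered the kernel disables the event again.

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Let the event run for 'nr' more overflows, then it auto-disables. */
static int arm_for_overflows(int event_fd, int nr)
{
	return ioctl(event_fd, PERF_EVENT_IOC_REFRESH, nr);
}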
2360
2361 static void ctx_sched_out(struct perf_event_context *ctx,
2362 struct perf_cpu_context *cpuctx,
2363 enum event_type_t event_type)
2364 {
2365 struct perf_event *event;
2366 int is_active = ctx->is_active;
2367
2368 ctx->is_active &= ~event_type;
2369 if (likely(!ctx->nr_events))
2370 return;
2371
2372 update_context_time(ctx);
2373 update_cgrp_time_from_cpuctx(cpuctx);
2374 if (!ctx->nr_active)
2375 return;
2376
2377 perf_pmu_disable(ctx->pmu);
2378 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2379 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2380 group_sched_out(event, cpuctx, ctx);
2381 }
2382
2383 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2384 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2385 group_sched_out(event, cpuctx, ctx);
2386 }
2387 perf_pmu_enable(ctx->pmu);
2388 }
2389
2390 /*
2391 * Test whether two contexts are equivalent, i.e. whether they have both been
2392 * cloned from the same version of the same context.
2393 *
2394 * Equivalence is measured using a generation number in the context that is
2395 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2396 * and list_del_event().
2397 */
2398 static int context_equiv(struct perf_event_context *ctx1,
2399 struct perf_event_context *ctx2)
2400 {
2401 lockdep_assert_held(&ctx1->lock);
2402 lockdep_assert_held(&ctx2->lock);
2403
2404 /* Pinning disables the swap optimization */
2405 if (ctx1->pin_count || ctx2->pin_count)
2406 return 0;
2407
2408 /* If ctx1 is the parent of ctx2 */
2409 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2410 return 1;
2411
2412 /* If ctx2 is the parent of ctx1 */
2413 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2414 return 1;
2415
2416 /*
2417 * If ctx1 and ctx2 have the same parent, we flatten the parent
2418 * hierarchy, see perf_event_init_context().
2419 */
2420 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2421 ctx1->parent_gen == ctx2->parent_gen)
2422 return 1;
2423
2424 /* Unmatched */
2425 return 0;
2426 }
2427
2428 static void __perf_event_sync_stat(struct perf_event *event,
2429 struct perf_event *next_event)
2430 {
2431 u64 value;
2432
2433 if (!event->attr.inherit_stat)
2434 return;
2435
2436 /*
2437 * Update the event value. We cannot use perf_event_read()
2438 * because we're in the middle of a context switch and have IRQs
2439 * disabled, which upsets smp_call_function_single(); however,
2440 * we know the event must be on the current CPU, therefore we
2441 * don't need to use it.
2442 */
2443 switch (event->state) {
2444 case PERF_EVENT_STATE_ACTIVE:
2445 event->pmu->read(event);
2446 /* fall-through */
2447
2448 case PERF_EVENT_STATE_INACTIVE:
2449 update_event_times(event);
2450 break;
2451
2452 default:
2453 break;
2454 }
2455
2456 /*
2457 * In order to keep per-task stats reliable we need to flip the event
2458 * values when we flip the contexts.
2459 */
2460 value = local64_read(&next_event->count);
2461 value = local64_xchg(&event->count, value);
2462 local64_set(&next_event->count, value);
2463
2464 swap(event->total_time_enabled, next_event->total_time_enabled);
2465 swap(event->total_time_running, next_event->total_time_running);
2466
2467 /*
2468 * Since we swizzled the values, update the user visible data too.
2469 */
2470 perf_event_update_userpage(event);
2471 perf_event_update_userpage(next_event);
2472 }
2473
2474 static void perf_event_sync_stat(struct perf_event_context *ctx,
2475 struct perf_event_context *next_ctx)
2476 {
2477 struct perf_event *event, *next_event;
2478
2479 if (!ctx->nr_stat)
2480 return;
2481
2482 update_context_time(ctx);
2483
2484 event = list_first_entry(&ctx->event_list,
2485 struct perf_event, event_entry);
2486
2487 next_event = list_first_entry(&next_ctx->event_list,
2488 struct perf_event, event_entry);
2489
2490 while (&event->event_entry != &ctx->event_list &&
2491 &next_event->event_entry != &next_ctx->event_list) {
2492
2493 __perf_event_sync_stat(event, next_event);
2494
2495 event = list_next_entry(event, event_entry);
2496 next_event = list_next_entry(next_event, event_entry);
2497 }
2498 }
2499
2500 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2501 struct task_struct *next)
2502 {
2503 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2504 struct perf_event_context *next_ctx;
2505 struct perf_event_context *parent, *next_parent;
2506 struct perf_cpu_context *cpuctx;
2507 int do_switch = 1;
2508
2509 if (likely(!ctx))
2510 return;
2511
2512 cpuctx = __get_cpu_context(ctx);
2513 if (!cpuctx->task_ctx)
2514 return;
2515
2516 rcu_read_lock();
2517 next_ctx = next->perf_event_ctxp[ctxn];
2518 if (!next_ctx)
2519 goto unlock;
2520
2521 parent = rcu_dereference(ctx->parent_ctx);
2522 next_parent = rcu_dereference(next_ctx->parent_ctx);
2523
2524 /* If neither context has a parent context, they cannot be clones. */
2525 if (!parent && !next_parent)
2526 goto unlock;
2527
2528 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2529 /*
2530 * Looks like the two contexts are clones, so we might be
2531 * able to optimize the context switch. We lock both
2532 * contexts and check that they are clones under the
2533 * lock (including re-checking that neither has been
2534 * uncloned in the meantime). It doesn't matter which
2535 * order we take the locks because no other cpu could
2536 * be trying to lock both of these tasks.
2537 */
2538 raw_spin_lock(&ctx->lock);
2539 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2540 if (context_equiv(ctx, next_ctx)) {
2541 /*
2542 * XXX do we need a memory barrier of sorts
2543 * wrt rcu_dereference() of perf_event_ctxp?
2544 */
2545 task->perf_event_ctxp[ctxn] = next_ctx;
2546 next->perf_event_ctxp[ctxn] = ctx;
2547 ctx->task = next;
2548 next_ctx->task = task;
2549
2550 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2551
2552 do_switch = 0;
2553
2554 perf_event_sync_stat(ctx, next_ctx);
2555 }
2556 raw_spin_unlock(&next_ctx->lock);
2557 raw_spin_unlock(&ctx->lock);
2558 }
2559 unlock:
2560 rcu_read_unlock();
2561
2562 if (do_switch) {
2563 raw_spin_lock(&ctx->lock);
2564 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2565 cpuctx->task_ctx = NULL;
2566 raw_spin_unlock(&ctx->lock);
2567 }
2568 }
2569
2570 void perf_sched_cb_dec(struct pmu *pmu)
2571 {
2572 this_cpu_dec(perf_sched_cb_usages);
2573 }
2574
2575 void perf_sched_cb_inc(struct pmu *pmu)
2576 {
2577 this_cpu_inc(perf_sched_cb_usages);
2578 }
2579
2580 /*
2581 * This function provides the context switch callback to the lower code
2582 * layer. It is invoked ONLY when the context switch callback is enabled.
2583 */
2584 static void perf_pmu_sched_task(struct task_struct *prev,
2585 struct task_struct *next,
2586 bool sched_in)
2587 {
2588 struct perf_cpu_context *cpuctx;
2589 struct pmu *pmu;
2590 unsigned long flags;
2591
2592 if (prev == next)
2593 return;
2594
2595 local_irq_save(flags);
2596
2597 rcu_read_lock();
2598
2599 list_for_each_entry_rcu(pmu, &pmus, entry) {
2600 if (pmu->sched_task) {
2601 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2602
2603 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2604
2605 perf_pmu_disable(pmu);
2606
2607 pmu->sched_task(cpuctx->task_ctx, sched_in);
2608
2609 perf_pmu_enable(pmu);
2610
2611 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2612 }
2613 }
2614
2615 rcu_read_unlock();
2616
2617 local_irq_restore(flags);
2618 }
2619
2620 #define for_each_task_context_nr(ctxn) \
2621 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2622
2623 /*
2624 * Called from scheduler to remove the events of the current task,
2625 * with interrupts disabled.
2626 *
2627 * We stop each event and update the event value in event->count.
2628 *
2629 * This does not protect us against NMI, but disable()
2630 * sets the disabled bit in the control field of event _before_
2631 * accessing the event control register. If an NMI hits, then it will
2632 * not restart the event.
2633 */
2634 void __perf_event_task_sched_out(struct task_struct *task,
2635 struct task_struct *next)
2636 {
2637 int ctxn;
2638
2639 if (__this_cpu_read(perf_sched_cb_usages))
2640 perf_pmu_sched_task(task, next, false);
2641
2642 for_each_task_context_nr(ctxn)
2643 perf_event_context_sched_out(task, ctxn, next);
2644
2645 /*
2646 * if cgroup events exist on this CPU, then we need
2647 * to check if we have to switch out PMU state.
2648 * cgroup events are in system-wide mode only
2649 */
2650 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2651 perf_cgroup_sched_out(task, next);
2652 }
2653
2654 static void task_ctx_sched_out(struct perf_event_context *ctx)
2655 {
2656 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2657
2658 if (!cpuctx->task_ctx)
2659 return;
2660
2661 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2662 return;
2663
2664 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2665 cpuctx->task_ctx = NULL;
2666 }
2667
2668 /*
2669 * Called with IRQs disabled
2670 */
2671 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2672 enum event_type_t event_type)
2673 {
2674 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2675 }
2676
2677 static void
2678 ctx_pinned_sched_in(struct perf_event_context *ctx,
2679 struct perf_cpu_context *cpuctx)
2680 {
2681 struct perf_event *event;
2682
2683 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2684 if (event->state <= PERF_EVENT_STATE_OFF)
2685 continue;
2686 if (!event_filter_match(event))
2687 continue;
2688
2689 /* may need to reset tstamp_enabled */
2690 if (is_cgroup_event(event))
2691 perf_cgroup_mark_enabled(event, ctx);
2692
2693 if (group_can_go_on(event, cpuctx, 1))
2694 group_sched_in(event, cpuctx, ctx);
2695
2696 /*
2697 * If this pinned group hasn't been scheduled,
2698 * put it in error state.
2699 */
2700 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2701 update_group_times(event);
2702 event->state = PERF_EVENT_STATE_ERROR;
2703 }
2704 }
2705 }
2706
2707 static void
2708 ctx_flexible_sched_in(struct perf_event_context *ctx,
2709 struct perf_cpu_context *cpuctx)
2710 {
2711 struct perf_event *event;
2712 int can_add_hw = 1;
2713
2714 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2715 /* Ignore events in OFF or ERROR state */
2716 if (event->state <= PERF_EVENT_STATE_OFF)
2717 continue;
2718 /*
2719 * Listen to the 'cpu' scheduling filter constraint
2720 * of events:
2721 */
2722 if (!event_filter_match(event))
2723 continue;
2724
2725 /* may need to reset tstamp_enabled */
2726 if (is_cgroup_event(event))
2727 perf_cgroup_mark_enabled(event, ctx);
2728
2729 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2730 if (group_sched_in(event, cpuctx, ctx))
2731 can_add_hw = 0;
2732 }
2733 }
2734 }
2735
2736 static void
2737 ctx_sched_in(struct perf_event_context *ctx,
2738 struct perf_cpu_context *cpuctx,
2739 enum event_type_t event_type,
2740 struct task_struct *task)
2741 {
2742 u64 now;
2743 int is_active = ctx->is_active;
2744
2745 ctx->is_active |= event_type;
2746 if (likely(!ctx->nr_events))
2747 return;
2748
2749 now = perf_clock();
2750 ctx->timestamp = now;
2751 perf_cgroup_set_timestamp(task, ctx);
2752 /*
2753 * First go through the list and put on any pinned groups
2754 * in order to give them the best chance of going on.
2755 */
2756 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2757 ctx_pinned_sched_in(ctx, cpuctx);
2758
2759 /* Then walk through the lower prio flexible groups */
2760 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2761 ctx_flexible_sched_in(ctx, cpuctx);
2762 }
2763
2764 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2765 enum event_type_t event_type,
2766 struct task_struct *task)
2767 {
2768 struct perf_event_context *ctx = &cpuctx->ctx;
2769
2770 ctx_sched_in(ctx, cpuctx, event_type, task);
2771 }
2772
2773 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2774 struct task_struct *task)
2775 {
2776 struct perf_cpu_context *cpuctx;
2777
2778 cpuctx = __get_cpu_context(ctx);
2779 if (cpuctx->task_ctx == ctx)
2780 return;
2781
2782 perf_ctx_lock(cpuctx, ctx);
2783 perf_pmu_disable(ctx->pmu);
2784 /*
2785 * We want to keep the following priority order:
2786 * cpu pinned (that don't need to move), task pinned,
2787 * cpu flexible, task flexible.
2788 */
2789 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2790
2791 if (ctx->nr_events)
2792 cpuctx->task_ctx = ctx;
2793
2794 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2795
2796 perf_pmu_enable(ctx->pmu);
2797 perf_ctx_unlock(cpuctx, ctx);
2798 }
2799
2800 /*
2801 * Called from scheduler to add the events of the current task
2802 * with interrupts disabled.
2803 *
2804 * We restore the event value and then enable it.
2805 *
2806 * This does not protect us against NMI, but enable()
2807 * sets the enabled bit in the control field of event _before_
2808 * accessing the event control register. If an NMI hits, then it will
2809 * keep the event running.
2810 */
2811 void __perf_event_task_sched_in(struct task_struct *prev,
2812 struct task_struct *task)
2813 {
2814 struct perf_event_context *ctx;
2815 int ctxn;
2816
2817 for_each_task_context_nr(ctxn) {
2818 ctx = task->perf_event_ctxp[ctxn];
2819 if (likely(!ctx))
2820 continue;
2821
2822 perf_event_context_sched_in(ctx, task);
2823 }
2824 /*
2825 * if cgroup events exist on this CPU, then we need
2826 * to check if we have to switch in PMU state.
2827 * cgroup events are in system-wide mode only
2828 */
2829 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2830 perf_cgroup_sched_in(prev, task);
2831
2832 if (__this_cpu_read(perf_sched_cb_usages))
2833 perf_pmu_sched_task(prev, task, true);
2834 }
2835
2836 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2837 {
2838 u64 frequency = event->attr.sample_freq;
2839 u64 sec = NSEC_PER_SEC;
2840 u64 divisor, dividend;
2841
2842 int count_fls, nsec_fls, frequency_fls, sec_fls;
2843
2844 count_fls = fls64(count);
2845 nsec_fls = fls64(nsec);
2846 frequency_fls = fls64(frequency);
2847 sec_fls = 30;
2848
2849 /*
2850 * We got @count in @nsec, with a target of sample_freq HZ
2851 * the target period becomes:
2852 *
2853 * @count * 10^9
2854 * period = -------------------
2855 * @nsec * sample_freq
2856 *
2857 */
2858
2859 /*
2860 * Reduce accuracy by one bit such that @a and @b converge
2861 * to a similar magnitude.
2862 */
2863 #define REDUCE_FLS(a, b) \
2864 do { \
2865 if (a##_fls > b##_fls) { \
2866 a >>= 1; \
2867 a##_fls--; \
2868 } else { \
2869 b >>= 1; \
2870 b##_fls--; \
2871 } \
2872 } while (0)
2873
2874 /*
2875 * Reduce accuracy until either term fits in a u64, then proceed with
2876 * the other, so that finally we can do a u64/u64 division.
2877 */
2878 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2879 REDUCE_FLS(nsec, frequency);
2880 REDUCE_FLS(sec, count);
2881 }
2882
2883 if (count_fls + sec_fls > 64) {
2884 divisor = nsec * frequency;
2885
2886 while (count_fls + sec_fls > 64) {
2887 REDUCE_FLS(count, sec);
2888 divisor >>= 1;
2889 }
2890
2891 dividend = count * sec;
2892 } else {
2893 dividend = count * sec;
2894
2895 while (nsec_fls + frequency_fls > 64) {
2896 REDUCE_FLS(nsec, frequency);
2897 dividend >>= 1;
2898 }
2899
2900 divisor = nsec * frequency;
2901 }
2902
2903 if (!divisor)
2904 return dividend;
2905
2906 return div64_u64(dividend, divisor);
2907 }
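
As a quick sanity check of the formula in the comment above, here is a hedged standalone illustration in plain C (it skips the REDUCE_FLS bit-shaving, so it is only valid while the intermediate products fit in 64 bits; all numbers are made up): 400,000 events observed over 10 ms with sample_freq = 4000 Hz gives a target period of 10,000 events per sample.

#include <stdio.h>
#include <stdint.h>

/* period = (count * 10^9) / (nsec * sample_freq); no overflow handling */
static uint64_t target_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	uint64_t divisor = nsec * freq;

	return divisor ? (count * 1000000000ULL) / divisor : count;
}

int main(void)
{
	/* 400,000 events in 10 ms at sample_freq = 4000 Hz */
	printf("%llu\n",
	       (unsigned long long)target_period(400000, 10000000ULL, 4000));
	return 0;				/* prints 10000 */
}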
2908
2909 static DEFINE_PER_CPU(int, perf_throttled_count);
2910 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2911
2912 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2913 {
2914 struct hw_perf_event *hwc = &event->hw;
2915 s64 period, sample_period;
2916 s64 delta;
2917
2918 period = perf_calculate_period(event, nsec, count);
2919
2920 delta = (s64)(period - hwc->sample_period);
2921 delta = (delta + 7) / 8; /* low pass filter */
2922
2923 sample_period = hwc->sample_period + delta;
2924
2925 if (!sample_period)
2926 sample_period = 1;
2927
2928 hwc->sample_period = sample_period;
2929
2930 if (local64_read(&hwc->period_left) > 8*sample_period) {
2931 if (disable)
2932 event->pmu->stop(event, PERF_EF_UPDATE);
2933
2934 local64_set(&hwc->period_left, 0);
2935
2936 if (disable)
2937 event->pmu->start(event, PERF_EF_RELOAD);
2938 }
2939 }
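
To put numbers on the 1/8 low-pass filter above (values invented): if hwc->sample_period is 10000 and perf_calculate_period() comes back with 18000, then delta = (18000 - 10000 + 7) / 8 = 1000, so the period only moves to 11000 on this adjustment; successive ticks keep nudging it toward 18000 rather than jumping there, which damps oscillation when the event rate is noisy.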
2940
2941 /*
2942 * combine freq adjustment with unthrottling to avoid two passes over the
2943 * events. At the same time, make sure that having freq events does not
2944 * change the rate of unthrottling, as that would introduce bias.
2945 */
2946 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2947 int needs_unthr)
2948 {
2949 struct perf_event *event;
2950 struct hw_perf_event *hwc;
2951 u64 now, period = TICK_NSEC;
2952 s64 delta;
2953
2954 /*
2955 * only need to iterate over all events iff:
2956 * - the context has events in frequency mode (needs freq adjust)
2957 * - there are events to unthrottle on this cpu
2958 */
2959 if (!(ctx->nr_freq || needs_unthr))
2960 return;
2961
2962 raw_spin_lock(&ctx->lock);
2963 perf_pmu_disable(ctx->pmu);
2964
2965 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2966 if (event->state != PERF_EVENT_STATE_ACTIVE)
2967 continue;
2968
2969 if (!event_filter_match(event))
2970 continue;
2971
2972 perf_pmu_disable(event->pmu);
2973
2974 hwc = &event->hw;
2975
2976 if (hwc->interrupts == MAX_INTERRUPTS) {
2977 hwc->interrupts = 0;
2978 perf_log_throttle(event, 1);
2979 event->pmu->start(event, 0);
2980 }
2981
2982 if (!event->attr.freq || !event->attr.sample_freq)
2983 goto next;
2984
2985 /*
2986 * stop the event and update event->count
2987 */
2988 event->pmu->stop(event, PERF_EF_UPDATE);
2989
2990 now = local64_read(&event->count);
2991 delta = now - hwc->freq_count_stamp;
2992 hwc->freq_count_stamp = now;
2993
2994 /*
2995 * Restart the event, reloading only if the value
2996 * has changed.
2997 * We have stopped the event already, so tell that
2998 * to perf_adjust_period() to avoid it stopping the
2999 * event twice.
3000 */
3001 if (delta > 0)
3002 perf_adjust_period(event, period, delta, false);
3003
3004 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3005 next:
3006 perf_pmu_enable(event->pmu);
3007 }
3008
3009 perf_pmu_enable(ctx->pmu);
3010 raw_spin_unlock(&ctx->lock);
3011 }
3012
3013 /*
3014 * Round-robin a context's events:
3015 */
3016 static void rotate_ctx(struct perf_event_context *ctx)
3017 {
3018 /*
3019 * Rotate the first entry of the non-pinned groups to the end. Rotation might be
3020 * disabled by the inheritance code.
3021 */
3022 if (!ctx->rotate_disable)
3023 list_rotate_left(&ctx->flexible_groups);
3024 }
3025
3026 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3027 {
3028 struct perf_event_context *ctx = NULL;
3029 int rotate = 0;
3030
3031 if (cpuctx->ctx.nr_events) {
3032 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3033 rotate = 1;
3034 }
3035
3036 ctx = cpuctx->task_ctx;
3037 if (ctx && ctx->nr_events) {
3038 if (ctx->nr_events != ctx->nr_active)
3039 rotate = 1;
3040 }
3041
3042 if (!rotate)
3043 goto done;
3044
3045 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3046 perf_pmu_disable(cpuctx->ctx.pmu);
3047
3048 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3049 if (ctx)
3050 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3051
3052 rotate_ctx(&cpuctx->ctx);
3053 if (ctx)
3054 rotate_ctx(ctx);
3055
3056 perf_event_sched_in(cpuctx, ctx, current);
3057
3058 perf_pmu_enable(cpuctx->ctx.pmu);
3059 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3060 done:
3061
3062 return rotate;
3063 }
3064
3065 #ifdef CONFIG_NO_HZ_FULL
3066 bool perf_event_can_stop_tick(void)
3067 {
3068 if (atomic_read(&nr_freq_events) ||
3069 __this_cpu_read(perf_throttled_count))
3070 return false;
3071 else
3072 return true;
3073 }
3074 #endif
3075
3076 void perf_event_task_tick(void)
3077 {
3078 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3079 struct perf_event_context *ctx, *tmp;
3080 int throttled;
3081
3082 WARN_ON(!irqs_disabled());
3083
3084 __this_cpu_inc(perf_throttled_seq);
3085 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3086
3087 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3088 perf_adjust_freq_unthr_context(ctx, throttled);
3089 }
3090
3091 static int event_enable_on_exec(struct perf_event *event,
3092 struct perf_event_context *ctx)
3093 {
3094 if (!event->attr.enable_on_exec)
3095 return 0;
3096
3097 event->attr.enable_on_exec = 0;
3098 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3099 return 0;
3100
3101 __perf_event_mark_enabled(event);
3102
3103 return 1;
3104 }
3105
3106 /*
3107 * Enable all of a task's events that have been marked enable-on-exec.
3108 * This expects task == current.
3109 */
3110 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3111 {
3112 struct perf_event_context *clone_ctx = NULL;
3113 struct perf_event *event;
3114 unsigned long flags;
3115 int enabled = 0;
3116 int ret;
3117
3118 local_irq_save(flags);
3119 if (!ctx || !ctx->nr_events)
3120 goto out;
3121
3122 /*
3123 * We must context-switch out cgroup events to avoid conflict
3124 * when invoking perf_task_event_sched_in() later on
3125 * in this function. Otherwise we end up trying to
3126 * switch in cgroup events which are already scheduled
3127 * in.
3128 */
3129 perf_cgroup_sched_out(current, NULL);
3130
3131 raw_spin_lock(&ctx->lock);
3132 task_ctx_sched_out(ctx);
3133
3134 list_for_each_entry(event, &ctx->event_list, event_entry) {
3135 ret = event_enable_on_exec(event, ctx);
3136 if (ret)
3137 enabled = 1;
3138 }
3139
3140 /*
3141 * Unclone this context if we enabled any event.
3142 */
3143 if (enabled)
3144 clone_ctx = unclone_ctx(ctx);
3145
3146 raw_spin_unlock(&ctx->lock);
3147
3148 /*
3149 * Also switches cgroup events back in, if any:
3150 */
3151 perf_event_context_sched_in(ctx, ctx->task);
3152 out:
3153 local_irq_restore(flags);
3154
3155 if (clone_ctx)
3156 put_ctx(clone_ctx);
3157 }
3158
3159 void perf_event_exec(void)
3160 {
3161 struct perf_event_context *ctx;
3162 int ctxn;
3163
3164 rcu_read_lock();
3165 for_each_task_context_nr(ctxn) {
3166 ctx = current->perf_event_ctxp[ctxn];
3167 if (!ctx)
3168 continue;
3169
3170 perf_event_enable_on_exec(ctx);
3171 }
3172 rcu_read_unlock();
3173 }
3174
3175 /*
3176 * Cross CPU call to read the hardware event
3177 */
3178 static void __perf_event_read(void *info)
3179 {
3180 struct perf_event *event = info;
3181 struct perf_event_context *ctx = event->ctx;
3182 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3183
3184 /*
3185 * If this is a task context, we need to check whether it is
3186 * the current task context of this cpu. If not it has been
3187 * scheduled out before the smp call arrived. In that case
3188 * event->count would have been updated to a recent sample
3189 * when the event was scheduled out.
3190 */
3191 if (ctx->task && cpuctx->task_ctx != ctx)
3192 return;
3193
3194 raw_spin_lock(&ctx->lock);
3195 if (ctx->is_active) {
3196 update_context_time(ctx);
3197 update_cgrp_time_from_event(event);
3198 }
3199 update_event_times(event);
3200 if (event->state == PERF_EVENT_STATE_ACTIVE)
3201 event->pmu->read(event);
3202 raw_spin_unlock(&ctx->lock);
3203 }
3204
3205 static inline u64 perf_event_count(struct perf_event *event)
3206 {
3207 if (event->pmu->count)
3208 return event->pmu->count(event);
3209
3210 return __perf_event_count(event);
3211 }
3212
3213 static u64 perf_event_read(struct perf_event *event)
3214 {
3215 /*
3216 * If event is enabled and currently active on a CPU, update the
3217 * value in the event structure:
3218 */
3219 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3220 smp_call_function_single(event->oncpu,
3221 __perf_event_read, event, 1);
3222 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3223 struct perf_event_context *ctx = event->ctx;
3224 unsigned long flags;
3225
3226 raw_spin_lock_irqsave(&ctx->lock, flags);
3227 /*
3228 * may read while context is not active
3229 * (e.g., the thread is blocked); in that case
3230 * we cannot update the context time
3231 */
3232 if (ctx->is_active) {
3233 update_context_time(ctx);
3234 update_cgrp_time_from_event(event);
3235 }
3236 update_event_times(event);
3237 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3238 }
3239
3240 return perf_event_count(event);
3241 }
3242
3243 /*
3244 * Initialize the perf_event context in a task_struct:
3245 */
3246 static void __perf_event_init_context(struct perf_event_context *ctx)
3247 {
3248 raw_spin_lock_init(&ctx->lock);
3249 mutex_init(&ctx->mutex);
3250 INIT_LIST_HEAD(&ctx->active_ctx_list);
3251 INIT_LIST_HEAD(&ctx->pinned_groups);
3252 INIT_LIST_HEAD(&ctx->flexible_groups);
3253 INIT_LIST_HEAD(&ctx->event_list);
3254 atomic_set(&ctx->refcount, 1);
3255 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3256 }
3257
3258 static struct perf_event_context *
3259 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3260 {
3261 struct perf_event_context *ctx;
3262
3263 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3264 if (!ctx)
3265 return NULL;
3266
3267 __perf_event_init_context(ctx);
3268 if (task) {
3269 ctx->task = task;
3270 get_task_struct(task);
3271 }
3272 ctx->pmu = pmu;
3273
3274 return ctx;
3275 }
3276
3277 static struct task_struct *
3278 find_lively_task_by_vpid(pid_t vpid)
3279 {
3280 struct task_struct *task;
3281 int err;
3282
3283 rcu_read_lock();
3284 if (!vpid)
3285 task = current;
3286 else
3287 task = find_task_by_vpid(vpid);
3288 if (task)
3289 get_task_struct(task);
3290 rcu_read_unlock();
3291
3292 if (!task)
3293 return ERR_PTR(-ESRCH);
3294
3295 /* Reuse ptrace permission checks for now. */
3296 err = -EACCES;
3297 if (!ptrace_may_access(task, PTRACE_MODE_READ))
3298 goto errout;
3299
3300 return task;
3301 errout:
3302 put_task_struct(task);
3303 return ERR_PTR(err);
3304
3305 }
3306
3307 /*
3308 * Returns a matching context with refcount and pincount.
3309 */
3310 static struct perf_event_context *
3311 find_get_context(struct pmu *pmu, struct task_struct *task,
3312 struct perf_event *event)
3313 {
3314 struct perf_event_context *ctx, *clone_ctx = NULL;
3315 struct perf_cpu_context *cpuctx;
3316 void *task_ctx_data = NULL;
3317 unsigned long flags;
3318 int ctxn, err;
3319 int cpu = event->cpu;
3320
3321 if (!task) {
3322 /* Must be root to operate on a CPU event: */
3323 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3324 return ERR_PTR(-EACCES);
3325
3326 /*
3327 * We could be clever and allow attaching an event to an
3328 * offline CPU and activate it when the CPU comes up, but
3329 * that's for later.
3330 */
3331 if (!cpu_online(cpu))
3332 return ERR_PTR(-ENODEV);
3333
3334 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3335 ctx = &cpuctx->ctx;
3336 get_ctx(ctx);
3337 ++ctx->pin_count;
3338
3339 return ctx;
3340 }
3341
3342 err = -EINVAL;
3343 ctxn = pmu->task_ctx_nr;
3344 if (ctxn < 0)
3345 goto errout;
3346
3347 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3348 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3349 if (!task_ctx_data) {
3350 err = -ENOMEM;
3351 goto errout;
3352 }
3353 }
3354
3355 retry:
3356 ctx = perf_lock_task_context(task, ctxn, &flags);
3357 if (ctx) {
3358 clone_ctx = unclone_ctx(ctx);
3359 ++ctx->pin_count;
3360
3361 if (task_ctx_data && !ctx->task_ctx_data) {
3362 ctx->task_ctx_data = task_ctx_data;
3363 task_ctx_data = NULL;
3364 }
3365 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3366
3367 if (clone_ctx)
3368 put_ctx(clone_ctx);
3369 } else {
3370 ctx = alloc_perf_context(pmu, task);
3371 err = -ENOMEM;
3372 if (!ctx)
3373 goto errout;
3374
3375 if (task_ctx_data) {
3376 ctx->task_ctx_data = task_ctx_data;
3377 task_ctx_data = NULL;
3378 }
3379
3380 err = 0;
3381 mutex_lock(&task->perf_event_mutex);
3382 /*
3383 * If it has already passed perf_event_exit_task(),
3384 * we must see PF_EXITING; it takes this mutex too.
3385 */
3386 if (task->flags & PF_EXITING)
3387 err = -ESRCH;
3388 else if (task->perf_event_ctxp[ctxn])
3389 err = -EAGAIN;
3390 else {
3391 get_ctx(ctx);
3392 ++ctx->pin_count;
3393 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3394 }
3395 mutex_unlock(&task->perf_event_mutex);
3396
3397 if (unlikely(err)) {
3398 put_ctx(ctx);
3399
3400 if (err == -EAGAIN)
3401 goto retry;
3402 goto errout;
3403 }
3404 }
3405
3406 kfree(task_ctx_data);
3407 return ctx;
3408
3409 errout:
3410 kfree(task_ctx_data);
3411 return ERR_PTR(err);
3412 }
3413
3414 static void perf_event_free_filter(struct perf_event *event);
3415 static void perf_event_free_bpf_prog(struct perf_event *event);
3416
3417 static void free_event_rcu(struct rcu_head *head)
3418 {
3419 struct perf_event *event;
3420
3421 event = container_of(head, struct perf_event, rcu_head);
3422 if (event->ns)
3423 put_pid_ns(event->ns);
3424 perf_event_free_filter(event);
3425 perf_event_free_bpf_prog(event);
3426 kfree(event);
3427 }
3428
3429 static void ring_buffer_attach(struct perf_event *event,
3430 struct ring_buffer *rb);
3431
3432 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3433 {
3434 if (event->parent)
3435 return;
3436
3437 if (is_cgroup_event(event))
3438 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3439 }
3440
3441 static void unaccount_event(struct perf_event *event)
3442 {
3443 if (event->parent)
3444 return;
3445
3446 if (event->attach_state & PERF_ATTACH_TASK)
3447 static_key_slow_dec_deferred(&perf_sched_events);
3448 if (event->attr.mmap || event->attr.mmap_data)
3449 atomic_dec(&nr_mmap_events);
3450 if (event->attr.comm)
3451 atomic_dec(&nr_comm_events);
3452 if (event->attr.task)
3453 atomic_dec(&nr_task_events);
3454 if (event->attr.freq)
3455 atomic_dec(&nr_freq_events);
3456 if (is_cgroup_event(event))
3457 static_key_slow_dec_deferred(&perf_sched_events);
3458 if (has_branch_stack(event))
3459 static_key_slow_dec_deferred(&perf_sched_events);
3460
3461 unaccount_event_cpu(event, event->cpu);
3462 }
3463
3464 /*
3465 * The following implement mutual exclusion of events on "exclusive" pmus
3466 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3467 * at a time, so we disallow creating events that might conflict, namely:
3468 *
3469 * 1) cpu-wide events in the presence of per-task events,
3470 * 2) per-task events in the presence of cpu-wide events,
3471 * 3) two matching events on the same context.
3472 *
3473 * The former two cases are handled in the allocation path (perf_event_alloc(),
3474 * __free_event()), the latter -- before the first perf_install_in_context().
3475 */
3476 static int exclusive_event_init(struct perf_event *event)
3477 {
3478 struct pmu *pmu = event->pmu;
3479
3480 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3481 return 0;
3482
3483 /*
3484 * Prevent co-existence of per-task and cpu-wide events on the
3485 * same exclusive pmu.
3486 *
3487 * Negative pmu::exclusive_cnt means there are cpu-wide
3488 * events on this "exclusive" pmu, positive means there are
3489 * per-task events.
3490 *
3491 * Since this is called in perf_event_alloc() path, event::ctx
3492 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
3493 * to mean "per-task event", because unlike other attach states it
3494 * never gets cleared.
3495 */
3496 if (event->attach_state & PERF_ATTACH_TASK) {
3497 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
3498 return -EBUSY;
3499 } else {
3500 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
3501 return -EBUSY;
3502 }
3503
3504 return 0;
3505 }
3506
3507 static void exclusive_event_destroy(struct perf_event *event)
3508 {
3509 struct pmu *pmu = event->pmu;
3510
3511 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3512 return;
3513
3514 /* see comment in exclusive_event_init() */
3515 if (event->attach_state & PERF_ATTACH_TASK)
3516 atomic_dec(&pmu->exclusive_cnt);
3517 else
3518 atomic_inc(&pmu->exclusive_cnt);
3519 }
3520
3521 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
3522 {
3523 if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
3524 (e1->cpu == e2->cpu ||
3525 e1->cpu == -1 ||
3526 e2->cpu == -1))
3527 return true;
3528 return false;
3529 }
3530
3531 /* Called under the same ctx::mutex as perf_install_in_context() */
3532 static bool exclusive_event_installable(struct perf_event *event,
3533 struct perf_event_context *ctx)
3534 {
3535 struct perf_event *iter_event;
3536 struct pmu *pmu = event->pmu;
3537
3538 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3539 return true;
3540
3541 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
3542 if (exclusive_event_match(iter_event, event))
3543 return false;
3544 }
3545
3546 return true;
3547 }
3548
3549 static void __free_event(struct perf_event *event)
3550 {
3551 if (!event->parent) {
3552 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3553 put_callchain_buffers();
3554 }
3555
3556 if (event->destroy)
3557 event->destroy(event);
3558
3559 if (event->ctx)
3560 put_ctx(event->ctx);
3561
3562 if (event->pmu) {
3563 exclusive_event_destroy(event);
3564 module_put(event->pmu->module);
3565 }
3566
3567 call_rcu(&event->rcu_head, free_event_rcu);
3568 }
3569
3570 static void _free_event(struct perf_event *event)
3571 {
3572 irq_work_sync(&event->pending);
3573
3574 unaccount_event(event);
3575
3576 if (event->rb) {
3577 /*
3578 * Can happen when we close an event with re-directed output.
3579 *
3580 * Since we have a 0 refcount, perf_mmap_close() will skip
3581 * over us; possibly making our ring_buffer_put() the last.
3582 */
3583 mutex_lock(&event->mmap_mutex);
3584 ring_buffer_attach(event, NULL);
3585 mutex_unlock(&event->mmap_mutex);
3586 }
3587
3588 if (is_cgroup_event(event))
3589 perf_detach_cgroup(event);
3590
3591 __free_event(event);
3592 }
3593
3594 /*
3595 * Used to free events which have a known refcount of 1, such as in error paths
3596 * where the event isn't exposed yet, and for inherited events.
3597 */
3598 static void free_event(struct perf_event *event)
3599 {
3600 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3601 "unexpected event refcount: %ld; ptr=%p\n",
3602 atomic_long_read(&event->refcount), event)) {
3603 /* leak to avoid use-after-free */
3604 return;
3605 }
3606
3607 _free_event(event);
3608 }
3609
3610 /*
3611 * Remove user event from the owner task.
3612 */
3613 static void perf_remove_from_owner(struct perf_event *event)
3614 {
3615 struct task_struct *owner;
3616
3617 rcu_read_lock();
3618 owner = ACCESS_ONCE(event->owner);
3619 /*
3620 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3621 * !owner it means the list deletion is complete and we can indeed
3622 * free this event, otherwise we need to serialize on
3623 * owner->perf_event_mutex.
3624 */
3625 smp_read_barrier_depends();
3626 if (owner) {
3627 /*
3628 * Since delayed_put_task_struct() also drops the last
3629 * task reference we can safely take a new reference
3630 * while holding the rcu_read_lock().
3631 */
3632 get_task_struct(owner);
3633 }
3634 rcu_read_unlock();
3635
3636 if (owner) {
3637 /*
3638 * If we're here through perf_event_exit_task() we're already
3639 * holding ctx->mutex which would be an inversion wrt. the
3640 * normal lock order.
3641 *
3642 * However we can safely take this lock because it's the child
3643 * ctx->mutex.
3644 */
3645 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3646
3647 /*
3648 * We have to re-check the event->owner field, if it is cleared
3649 * we raced with perf_event_exit_task(), acquiring the mutex
3650 * ensured they're done, and we can proceed with freeing the
3651 * event.
3652 */
3653 if (event->owner)
3654 list_del_init(&event->owner_entry);
3655 mutex_unlock(&owner->perf_event_mutex);
3656 put_task_struct(owner);
3657 }
3658 }
3659
3660 /*
3661 * Called when the last reference to the file is gone.
3662 */
3663 static void put_event(struct perf_event *event)
3664 {
3665 struct perf_event_context *ctx;
3666
3667 if (!atomic_long_dec_and_test(&event->refcount))
3668 return;
3669
3670 if (!is_kernel_event(event))
3671 perf_remove_from_owner(event);
3672
3673 /*
3674 * There are two ways this annotation is useful:
3675 *
3676 * 1) there is a lock recursion from perf_event_exit_task;
3677 * see the comment there.
3678 *
3679 * 2) there is a lock-inversion with mmap_sem through
3680 * perf_event_read_group(), which takes faults while
3681 * holding ctx->mutex, however this is called after
3682 * the last filedesc died, so there is no possibility
3683 * to trigger the AB-BA case.
3684 */
3685 ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
3686 WARN_ON_ONCE(ctx->parent_ctx);
3687 perf_remove_from_context(event, true);
3688 perf_event_ctx_unlock(event, ctx);
3689
3690 _free_event(event);
3691 }
3692
3693 int perf_event_release_kernel(struct perf_event *event)
3694 {
3695 put_event(event);
3696 return 0;
3697 }
3698 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3699
3700 static int perf_release(struct inode *inode, struct file *file)
3701 {
3702 put_event(file->private_data);
3703 return 0;
3704 }
3705
3706 /*
3707 * Remove all orphaned events from the context.
3708 */
3709 static void orphans_remove_work(struct work_struct *work)
3710 {
3711 struct perf_event_context *ctx;
3712 struct perf_event *event, *tmp;
3713
3714 ctx = container_of(work, struct perf_event_context,
3715 orphans_remove.work);
3716
3717 mutex_lock(&ctx->mutex);
3718 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3719 struct perf_event *parent_event = event->parent;
3720
3721 if (!is_orphaned_child(event))
3722 continue;
3723
3724 perf_remove_from_context(event, true);
3725
3726 mutex_lock(&parent_event->child_mutex);
3727 list_del_init(&event->child_list);
3728 mutex_unlock(&parent_event->child_mutex);
3729
3730 free_event(event);
3731 put_event(parent_event);
3732 }
3733
3734 raw_spin_lock_irq(&ctx->lock);
3735 ctx->orphans_remove_sched = false;
3736 raw_spin_unlock_irq(&ctx->lock);
3737 mutex_unlock(&ctx->mutex);
3738
3739 put_ctx(ctx);
3740 }
3741
3742 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3743 {
3744 struct perf_event *child;
3745 u64 total = 0;
3746
3747 *enabled = 0;
3748 *running = 0;
3749
3750 mutex_lock(&event->child_mutex);
3751 total += perf_event_read(event);
3752 *enabled += event->total_time_enabled +
3753 atomic64_read(&event->child_total_time_enabled);
3754 *running += event->total_time_running +
3755 atomic64_read(&event->child_total_time_running);
3756
3757 list_for_each_entry(child, &event->child_list, child_list) {
3758 total += perf_event_read(child);
3759 *enabled += child->total_time_enabled;
3760 *running += child->total_time_running;
3761 }
3762 mutex_unlock(&event->child_mutex);
3763
3764 return total;
3765 }
3766 EXPORT_SYMBOL_GPL(perf_event_read_value);
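
A hedged sketch of how a kernel-side caller of this export typically combines the three values; 'event' is assumed to come from perf_event_create_kernel_counter() elsewhere, and read_scaled_count() is an invented helper name. When the event was multiplexed, 'count' only covers 'running' out of 'enabled' nanoseconds, so callers usually extrapolate:

static u64 read_scaled_count(struct perf_event *event)
{
	u64 enabled, running, count;

	count = perf_event_read_value(event, &enabled, &running);

	/* Extrapolate for the time the event was scheduled out. */
	if (running && running < enabled)
		count = div64_u64(count * enabled, running);

	return count;
}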
3767
3768 static int perf_event_read_group(struct perf_event *event,
3769 u64 read_format, char __user *buf)
3770 {
3771 struct perf_event *leader = event->group_leader, *sub;
3772 struct perf_event_context *ctx = leader->ctx;
3773 int n = 0, size = 0, ret;
3774 u64 count, enabled, running;
3775 u64 values[5];
3776
3777 lockdep_assert_held(&ctx->mutex);
3778
3779 count = perf_event_read_value(leader, &enabled, &running);
3780
3781 values[n++] = 1 + leader->nr_siblings;
3782 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3783 values[n++] = enabled;
3784 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3785 values[n++] = running;
3786 values[n++] = count;
3787 if (read_format & PERF_FORMAT_ID)
3788 values[n++] = primary_event_id(leader);
3789
3790 size = n * sizeof(u64);
3791
3792 if (copy_to_user(buf, values, size))
3793 return -EFAULT;
3794
3795 ret = size;
3796
3797 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3798 n = 0;
3799
3800 values[n++] = perf_event_read_value(sub, &enabled, &running);
3801 if (read_format & PERF_FORMAT_ID)
3802 values[n++] = primary_event_id(sub);
3803
3804 size = n * sizeof(u64);
3805
3806 if (copy_to_user(buf + ret, values, size)) {
3807 return -EFAULT;
3808 }
3809
3810 ret += size;
3811 }
3812
3813 return ret;
3814 }
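
For reference, a hedged user-space sketch of parsing the buffer that perf_event_read_group() lays out above, assuming the group leader was opened with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID; the struct and function names are invented. The leader's {value, id} pair comes first, followed by one pair per sibling.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

/* Mirrors the order of values[] filled in by the kernel above. */
struct group_read {
	uint64_t nr;			/* 1 + number of siblings	*/
	uint64_t time_enabled;		/* TOTAL_TIME_ENABLED		*/
	uint64_t time_running;		/* TOTAL_TIME_RUNNING		*/
	struct {
		uint64_t value;
		uint64_t id;		/* PERF_FORMAT_ID		*/
	} cnt[];			/* leader first, then siblings	*/
};

static void dump_group(int group_fd)
{
	char buf[4096];			/* enough for a small group	*/
	struct group_read *gr = (struct group_read *)buf;
	uint64_t i;

	if (read(group_fd, buf, sizeof(buf)) <= 0)
		return;

	for (i = 0; i < gr->nr; i++)
		printf("id %llu: %llu (enabled %llu ns, running %llu ns)\n",
		       (unsigned long long)gr->cnt[i].id,
		       (unsigned long long)gr->cnt[i].value,
		       (unsigned long long)gr->time_enabled,
		       (unsigned long long)gr->time_running);
}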
3815
3816 static int perf_event_read_one(struct perf_event *event,
3817 u64 read_format, char __user *buf)
3818 {
3819 u64 enabled, running;
3820 u64 values[4];
3821 int n = 0;
3822
3823 values[n++] = perf_event_read_value(event, &enabled, &running);
3824 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3825 values[n++] = enabled;
3826 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3827 values[n++] = running;
3828 if (read_format & PERF_FORMAT_ID)
3829 values[n++] = primary_event_id(event);
3830
3831 if (copy_to_user(buf, values, n * sizeof(u64)))
3832 return -EFAULT;
3833
3834 return n * sizeof(u64);
3835 }
3836
3837 static bool is_event_hup(struct perf_event *event)
3838 {
3839 bool no_children;
3840
3841 if (event->state != PERF_EVENT_STATE_EXIT)
3842 return false;
3843
3844 mutex_lock(&event->child_mutex);
3845 no_children = list_empty(&event->child_list);
3846 mutex_unlock(&event->child_mutex);
3847 return no_children;
3848 }
3849
3850 /*
3851 * Read the performance event - simple non-blocking version for now
3852 */
3853 static ssize_t
3854 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3855 {
3856 u64 read_format = event->attr.read_format;
3857 int ret;
3858
3859 /*
3860 * Return end-of-file for a read on an event that is in
3861 * error state (i.e. because it was pinned but it couldn't be
3862 * scheduled on to the CPU at some point).
3863 */
3864 if (event->state == PERF_EVENT_STATE_ERROR)
3865 return 0;
3866
3867 if (count < event->read_size)
3868 return -ENOSPC;
3869
3870 WARN_ON_ONCE(event->ctx->parent_ctx);
3871 if (read_format & PERF_FORMAT_GROUP)
3872 ret = perf_event_read_group(event, read_format, buf);
3873 else
3874 ret = perf_event_read_one(event, read_format, buf);
3875
3876 return ret;
3877 }
3878
3879 static ssize_t
3880 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3881 {
3882 struct perf_event *event = file->private_data;
3883 struct perf_event_context *ctx;
3884 int ret;
3885
3886 ctx = perf_event_ctx_lock(event);
3887 ret = perf_read_hw(event, buf, count);
3888 perf_event_ctx_unlock(event, ctx);
3889
3890 return ret;
3891 }
3892
3893 static unsigned int perf_poll(struct file *file, poll_table *wait)
3894 {
3895 struct perf_event *event = file->private_data;
3896 struct ring_buffer *rb;
3897 unsigned int events = POLLHUP;
3898
3899 poll_wait(file, &event->waitq, wait);
3900
3901 if (is_event_hup(event))
3902 return events;
3903
3904 /*
3905 * Pin the event->rb by taking event->mmap_mutex; otherwise
3906 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3907 */
3908 mutex_lock(&event->mmap_mutex);
3909 rb = event->rb;
3910 if (rb)
3911 events = atomic_xchg(&rb->poll, 0);
3912 mutex_unlock(&event->mmap_mutex);
3913 return events;
3914 }
3915
3916 static void _perf_event_reset(struct perf_event *event)
3917 {
3918 (void)perf_event_read(event);
3919 local64_set(&event->count, 0);
3920 perf_event_update_userpage(event);
3921 }
3922
3923 /*
3924 * Holding the top-level event's child_mutex means that any
3925 * descendant process that has inherited this event will block
3926 * in sync_child_event if it goes to exit, thus satisfying the
3927 * task existence requirements of perf_event_enable/disable.
3928 */
3929 static void perf_event_for_each_child(struct perf_event *event,
3930 void (*func)(struct perf_event *))
3931 {
3932 struct perf_event *child;
3933
3934 WARN_ON_ONCE(event->ctx->parent_ctx);
3935
3936 mutex_lock(&event->child_mutex);
3937 func(event);
3938 list_for_each_entry(child, &event->child_list, child_list)
3939 func(child);
3940 mutex_unlock(&event->child_mutex);
3941 }
3942
3943 static void perf_event_for_each(struct perf_event *event,
3944 void (*func)(struct perf_event *))
3945 {
3946 struct perf_event_context *ctx = event->ctx;
3947 struct perf_event *sibling;
3948
3949 lockdep_assert_held(&ctx->mutex);
3950
3951 event = event->group_leader;
3952
3953 perf_event_for_each_child(event, func);
3954 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3955 perf_event_for_each_child(sibling, func);
3956 }
3957
3958 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3959 {
3960 struct perf_event_context *ctx = event->ctx;
3961 int ret = 0, active;
3962 u64 value;
3963
3964 if (!is_sampling_event(event))
3965 return -EINVAL;
3966
3967 if (copy_from_user(&value, arg, sizeof(value)))
3968 return -EFAULT;
3969
3970 if (!value)
3971 return -EINVAL;
3972
3973 raw_spin_lock_irq(&ctx->lock);
3974 if (event->attr.freq) {
3975 if (value > sysctl_perf_event_sample_rate) {
3976 ret = -EINVAL;
3977 goto unlock;
3978 }
3979
3980 event->attr.sample_freq = value;
3981 } else {
3982 event->attr.sample_period = value;
3983 event->hw.sample_period = value;
3984 }
3985
3986 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3987 if (active) {
3988 perf_pmu_disable(ctx->pmu);
3989 event->pmu->stop(event, PERF_EF_UPDATE);
3990 }
3991
3992 local64_set(&event->hw.period_left, 0);
3993
3994 if (active) {
3995 event->pmu->start(event, PERF_EF_RELOAD);
3996 perf_pmu_enable(ctx->pmu);
3997 }
3998
3999 unlock:
4000 raw_spin_unlock_irq(&ctx->lock);
4001
4002 return ret;
4003 }
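
The matching user-space side of this ioctl, as a hedged sketch (helper name invented, event_fd assumed to come from perf_event_open() for a sampling event): PERF_EVENT_IOC_PERIOD takes a pointer to a u64, and per the checks above the kernel rejects a zero value, or a frequency above sysctl_perf_event_sample_rate for a freq-based event, with EINVAL.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Change the period (or frequency) of a live sampling event. */
static int set_sample_period(int event_fd, uint64_t new_period)
{
	return ioctl(event_fd, PERF_EVENT_IOC_PERIOD, &new_period);
}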
4004
4005 static const struct file_operations perf_fops;
4006
4007 static inline int perf_fget_light(int fd, struct fd *p)
4008 {
4009 struct fd f = fdget(fd);
4010 if (!f.file)
4011 return -EBADF;
4012
4013 if (f.file->f_op != &perf_fops) {
4014 fdput(f);
4015 return -EBADF;
4016 }
4017 *p = f;
4018 return 0;
4019 }
4020
4021 static int perf_event_set_output(struct perf_event *event,
4022 struct perf_event *output_event);
4023 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4024 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4025
4026 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4027 {
4028 void (*func)(struct perf_event *);
4029 u32 flags = arg;
4030
4031 switch (cmd) {
4032 case PERF_EVENT_IOC_ENABLE:
4033 func = _perf_event_enable;
4034 break;
4035 case PERF_EVENT_IOC_DISABLE:
4036 func = _perf_event_disable;
4037 break;
4038 case PERF_EVENT_IOC_RESET:
4039 func = _perf_event_reset;
4040 break;
4041
4042 case PERF_EVENT_IOC_REFRESH:
4043 return _perf_event_refresh(event, arg);
4044
4045 case PERF_EVENT_IOC_PERIOD:
4046 return perf_event_period(event, (u64 __user *)arg);
4047
4048 case PERF_EVENT_IOC_ID:
4049 {
4050 u64 id = primary_event_id(event);
4051
4052 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4053 return -EFAULT;
4054 return 0;
4055 }
4056
4057 case PERF_EVENT_IOC_SET_OUTPUT:
4058 {
4059 int ret;
4060 if (arg != -1) {
4061 struct perf_event *output_event;
4062 struct fd output;
4063 ret = perf_fget_light(arg, &output);
4064 if (ret)
4065 return ret;
4066 output_event = output.file->private_data;
4067 ret = perf_event_set_output(event, output_event);
4068 fdput(output);
4069 } else {
4070 ret = perf_event_set_output(event, NULL);
4071 }
4072 return ret;
4073 }
4074
4075 case PERF_EVENT_IOC_SET_FILTER:
4076 return perf_event_set_filter(event, (void __user *)arg);
4077
4078 case PERF_EVENT_IOC_SET_BPF:
4079 return perf_event_set_bpf_prog(event, arg);
4080
4081 default:
4082 return -ENOTTY;
4083 }
4084
4085 if (flags & PERF_IOC_FLAG_GROUP)
4086 perf_event_for_each(event, func);
4087 else
4088 perf_event_for_each_child(event, func);
4089
4090 return 0;
4091 }
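/*
 * Illustrative user-space sketch: the PERF_IOC_FLAG_GROUP argument
 * handled above applies enable/disable/reset to the group leader and
 * all of its siblings via perf_event_for_each(); "group_fd" is a
 * hypothetical group-leader fd:
 *
 *	ioctl(group_fd, PERF_EVENT_IOC_RESET,  PERF_IOC_FLAG_GROUP);
 *	ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 */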
4092
4093 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4094 {
4095 struct perf_event *event = file->private_data;
4096 struct perf_event_context *ctx;
4097 long ret;
4098
4099 ctx = perf_event_ctx_lock(event);
4100 ret = _perf_ioctl(event, cmd, arg);
4101 perf_event_ctx_unlock(event, ctx);
4102
4103 return ret;
4104 }
4105
4106 #ifdef CONFIG_COMPAT
4107 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4108 unsigned long arg)
4109 {
4110 switch (_IOC_NR(cmd)) {
4111 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4112 case _IOC_NR(PERF_EVENT_IOC_ID):
4113 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4114 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4115 cmd &= ~IOCSIZE_MASK;
4116 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4117 }
4118 break;
4119 }
4120 return perf_ioctl(file, cmd, arg);
4121 }
4122 #else
4123 # define perf_compat_ioctl NULL
4124 #endif
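/*
 * Illustrative example of the fixup above, assuming the standard ioctl
 * encoding: PERF_EVENT_IOC_SET_FILTER is _IOW('$', 6, char *), so a
 * 32-bit caller encodes _IOC_SIZE(cmd) == 4 while the native 64-bit
 * definition uses 8; clearing IOCSIZE_MASK and or-ing in
 * sizeof(void *) << IOCSIZE_SHIFT turns the compat command number into
 * the native one, so the switch in _perf_ioctl() matches it.
 */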
4125
4126 int perf_event_task_enable(void)
4127 {
4128 struct perf_event_context *ctx;
4129 struct perf_event *event;
4130
4131 mutex_lock(&current->perf_event_mutex);
4132 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4133 ctx = perf_event_ctx_lock(event);
4134 perf_event_for_each_child(event, _perf_event_enable);
4135 perf_event_ctx_unlock(event, ctx);
4136 }
4137 mutex_unlock(&current->perf_event_mutex);
4138
4139 return 0;
4140 }
4141
4142 int perf_event_task_disable(void)
4143 {
4144 struct perf_event_context *ctx;
4145 struct perf_event *event;
4146
4147 mutex_lock(&current->perf_event_mutex);
4148 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4149 ctx = perf_event_ctx_lock(event);
4150 perf_event_for_each_child(event, _perf_event_disable);
4151 perf_event_ctx_unlock(event, ctx);
4152 }
4153 mutex_unlock(&current->perf_event_mutex);
4154
4155 return 0;
4156 }
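/*
 * Illustrative user-space sketch: the two helpers above back the
 * PR_TASK_PERF_EVENTS_ENABLE/DISABLE prctl() commands (<sys/prctl.h>),
 * which toggle all counters owned by the calling task;
 * do_something_unmonitored() is a hypothetical placeholder:
 *
 *	prctl(PR_TASK_PERF_EVENTS_DISABLE);
 *	do_something_unmonitored();
 *	prctl(PR_TASK_PERF_EVENTS_ENABLE);
 */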
4157
4158 static int perf_event_index(struct perf_event *event)
4159 {
4160 if (event->hw.state & PERF_HES_STOPPED)
4161 return 0;
4162
4163 if (event->state != PERF_EVENT_STATE_ACTIVE)
4164 return 0;
4165
4166 return event->pmu->event_idx(event);
4167 }
4168
4169 static void calc_timer_values(struct perf_event *event,
4170 u64 *now,
4171 u64 *enabled,
4172 u64 *running)
4173 {
4174 u64 ctx_time;
4175
4176 *now = perf_clock();
4177 ctx_time = event->shadow_ctx_time + *now;
4178 *enabled = ctx_time - event->tstamp_enabled;
4179 *running = ctx_time - event->tstamp_running;
4180 }
4181
4182 static void perf_event_init_userpage(struct perf_event *event)
4183 {
4184 struct perf_event_mmap_page *userpg;
4185 struct ring_buffer *rb;
4186
4187 rcu_read_lock();
4188 rb = rcu_dereference(event->rb);
4189 if (!rb)
4190 goto unlock;
4191
4192 userpg = rb->user_page;
4193
4194 /* Allow new userspace to detect that bit 0 is deprecated */
4195 userpg->cap_bit0_is_deprecated = 1;
4196 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4197 userpg->data_offset = PAGE_SIZE;
4198 userpg->data_size = perf_data_size(rb);
4199
4200 unlock:
4201 rcu_read_unlock();
4202 }
4203
4204 void __weak arch_perf_update_userpage(
4205 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4206 {
4207 }
4208
4209 /*
4210 * Callers need to ensure there can be no nesting of this function, otherwise
4211 * the seqlock logic goes bad. We cannot serialize this because the arch
4212 * code calls this from NMI context.
4213 */
4214 void perf_event_update_userpage(struct perf_event *event)
4215 {
4216 struct perf_event_mmap_page *userpg;
4217 struct ring_buffer *rb;
4218 u64 enabled, running, now;
4219
4220 rcu_read_lock();
4221 rb = rcu_dereference(event->rb);
4222 if (!rb)
4223 goto unlock;
4224
4225 /*
4226 * compute total_time_enabled, total_time_running
4227 * based on snapshot values taken when the event
4228 * was last scheduled in.
4229 *
4230	 * we cannot simply call update_context_time()
4231	 * because of locking issues, as we can be called in
4232	 * NMI context
4233 */
4234 calc_timer_values(event, &now, &enabled, &running);
4235
4236 userpg = rb->user_page;
4237 /*
4238 * Disable preemption so as to not let the corresponding user-space
4239 * spin too long if we get preempted.
4240 */
4241 preempt_disable();
4242 ++userpg->lock;
4243 barrier();
4244 userpg->index = perf_event_index(event);
4245 userpg->offset = perf_event_count(event);
4246 if (userpg->index)
4247 userpg->offset -= local64_read(&event->hw.prev_count);
4248
4249 userpg->time_enabled = enabled +
4250 atomic64_read(&event->child_total_time_enabled);
4251
4252 userpg->time_running = running +
4253 atomic64_read(&event->child_total_time_running);
4254
4255 arch_perf_update_userpage(event, userpg, now);
4256
4257 barrier();
4258 ++userpg->lock;
4259 preempt_enable();
4260 unlock:
4261 rcu_read_unlock();
4262 }
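/*
 * Illustrative user-space sketch of the matching read side of the
 * ->lock/seq protocol used above, for a task reading its own mmap()ed
 * struct perf_event_mmap_page.  "pc" is a hypothetical pointer to the
 * mapped user page and rdpmc() stands in for the architecture's
 * counter-read instruction:
 *
 *	u32 seq;
 *	u64 count;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *
 *		count = pc->offset;
 *		if (pc->cap_user_rdpmc && pc->index)
 *			count += rdpmc(pc->index - 1);
 *
 *		barrier();
 *	} while (pc->lock != seq);
 */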
4263
4264 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4265 {
4266 struct perf_event *event = vma->vm_file->private_data;
4267 struct ring_buffer *rb;
4268 int ret = VM_FAULT_SIGBUS;
4269
4270 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4271 if (vmf->pgoff == 0)
4272 ret = 0;
4273 return ret;
4274 }
4275
4276 rcu_read_lock();
4277 rb = rcu_dereference(event->rb);
4278 if (!rb)
4279 goto unlock;
4280
4281 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4282 goto unlock;
4283
4284 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4285 if (!vmf->page)
4286 goto unlock;
4287
4288 get_page(vmf->page);
4289 vmf->page->mapping = vma->vm_file->f_mapping;
4290 vmf->page->index = vmf->pgoff;
4291
4292 ret = 0;
4293 unlock:
4294 rcu_read_unlock();
4295
4296 return ret;
4297 }
4298
4299 static void ring_buffer_attach(struct perf_event *event,
4300 struct ring_buffer *rb)
4301 {
4302 struct ring_buffer *old_rb = NULL;
4303 unsigned long flags;
4304
4305 if (event->rb) {
4306 /*
4307		 * Should be impossible: we set this when removing
4308 * event->rb_entry and wait/clear when adding event->rb_entry.
4309 */
4310 WARN_ON_ONCE(event->rcu_pending);
4311
4312 old_rb = event->rb;
4313 event->rcu_batches = get_state_synchronize_rcu();
4314 event->rcu_pending = 1;
4315
4316 spin_lock_irqsave(&old_rb->event_lock, flags);
4317 list_del_rcu(&event->rb_entry);
4318 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4319 }
4320
4321 if (event->rcu_pending && rb) {
4322 cond_synchronize_rcu(event->rcu_batches);
4323 event->rcu_pending = 0;
4324 }
4325
4326 if (rb) {
4327 spin_lock_irqsave(&rb->event_lock, flags);
4328 list_add_rcu(&event->rb_entry, &rb->event_list);
4329 spin_unlock_irqrestore(&rb->event_lock, flags);
4330 }
4331
4332 rcu_assign_pointer(event->rb, rb);
4333
4334 if (old_rb) {
4335 ring_buffer_put(old_rb);
4336 /*
4337		 * Since we detached the old rb before attaching the new one,
4338		 * we could have missed a wakeup.
4339		 * Provide it now.
4340 */
4341 wake_up_all(&event->waitq);
4342 }
4343 }
4344
4345 static void ring_buffer_wakeup(struct perf_event *event)
4346 {
4347 struct ring_buffer *rb;
4348
4349 rcu_read_lock();
4350 rb = rcu_dereference(event->rb);
4351 if (rb) {
4352 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4353 wake_up_all(&event->waitq);
4354 }
4355 rcu_read_unlock();
4356 }
4357
4358 static void rb_free_rcu(struct rcu_head *rcu_head)
4359 {
4360 struct ring_buffer *rb;
4361
4362 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4363 rb_free(rb);
4364 }
4365
4366 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4367 {
4368 struct ring_buffer *rb;
4369
4370 rcu_read_lock();
4371 rb = rcu_dereference(event->rb);
4372 if (rb) {
4373 if (!atomic_inc_not_zero(&rb->refcount))
4374 rb = NULL;
4375 }
4376 rcu_read_unlock();
4377
4378 return rb;
4379 }
4380
4381 void ring_buffer_put(struct ring_buffer *rb)
4382 {
4383 if (!atomic_dec_and_test(&rb->refcount))
4384 return;
4385
4386 WARN_ON_ONCE(!list_empty(&rb->event_list));
4387
4388 call_rcu(&rb->rcu_head, rb_free_rcu);
4389 }
4390
4391 static void perf_mmap_open(struct vm_area_struct *vma)
4392 {
4393 struct perf_event *event = vma->vm_file->private_data;
4394
4395 atomic_inc(&event->mmap_count);
4396 atomic_inc(&event->rb->mmap_count);
4397
4398 if (vma->vm_pgoff)
4399 atomic_inc(&event->rb->aux_mmap_count);
4400
4401 if (event->pmu->event_mapped)
4402 event->pmu->event_mapped(event);
4403 }
4404
4405 /*
4406 * A buffer can be mmap()ed multiple times; either directly through the same
4407 * event, or through other events by use of perf_event_set_output().
4408 *
4409 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4410 * the buffer here, where we still have a VM context. This means we need
4411 * to detach all events redirecting to us.
4412 */
4413 static void perf_mmap_close(struct vm_area_struct *vma)
4414 {
4415 struct perf_event *event = vma->vm_file->private_data;
4416
4417 struct ring_buffer *rb = ring_buffer_get(event);
4418 struct user_struct *mmap_user = rb->mmap_user;
4419 int mmap_locked = rb->mmap_locked;
4420 unsigned long size = perf_data_size(rb);
4421
4422 if (event->pmu->event_unmapped)
4423 event->pmu->event_unmapped(event);
4424
4425 /*
4426 * rb->aux_mmap_count will always drop before rb->mmap_count and
4427 * event->mmap_count, so it is ok to use event->mmap_mutex to
4428 * serialize with perf_mmap here.
4429 */
4430 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
4431 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
4432 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
4433 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
4434
4435 rb_free_aux(rb);
4436 mutex_unlock(&event->mmap_mutex);
4437 }
4438
4439 atomic_dec(&rb->mmap_count);
4440
4441 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4442 goto out_put;
4443
4444 ring_buffer_attach(event, NULL);
4445 mutex_unlock(&event->mmap_mutex);
4446
4447 /* If there's still other mmap()s of this buffer, we're done. */
4448 if (atomic_read(&rb->mmap_count))
4449 goto out_put;
4450
4451 /*
4452 * No other mmap()s, detach from all other events that might redirect
4453 * into the now unreachable buffer. Somewhat complicated by the
4454 * fact that rb::event_lock otherwise nests inside mmap_mutex.
4455 */
4456 again:
4457 rcu_read_lock();
4458 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4459 if (!atomic_long_inc_not_zero(&event->refcount)) {
4460 /*
4461 * This event is en-route to free_event() which will
4462 * detach it and remove it from the list.
4463 */
4464 continue;
4465 }
4466 rcu_read_unlock();
4467
4468 mutex_lock(&event->mmap_mutex);
4469 /*
4470 * Check we didn't race with perf_event_set_output() which can
4471 * swizzle the rb from under us while we were waiting to
4472 * acquire mmap_mutex.
4473 *
4474		 * If we find a different rb, ignore this event; the next
4475		 * iteration will no longer find it on the list. We still
4476		 * have to restart the iteration to make sure we're not now
4477		 * iterating the wrong list.
4478 */
4479 if (event->rb == rb)
4480 ring_buffer_attach(event, NULL);
4481
4482 mutex_unlock(&event->mmap_mutex);
4483 put_event(event);
4484
4485 /*
4486 * Restart the iteration; either we're on the wrong list or
4487		 * we destroyed its integrity by doing a deletion.
4488 */
4489 goto again;
4490 }
4491 rcu_read_unlock();
4492
4493 /*
4494 * It could be there's still a few 0-ref events on the list; they'll
4495 * get cleaned up by free_event() -- they'll also still have their
4496 * ref on the rb and will free it whenever they are done with it.
4497 *
4498 * Aside from that, this buffer is 'fully' detached and unmapped,
4499 * undo the VM accounting.
4500 */
4501
4502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4503 vma->vm_mm->pinned_vm -= mmap_locked;
4504 free_uid(mmap_user);
4505
4506 out_put:
4507 ring_buffer_put(rb); /* could be last */
4508 }
4509
4510 static const struct vm_operations_struct perf_mmap_vmops = {
4511 .open = perf_mmap_open,
4512	.close		= perf_mmap_close, /* non mergeable */
4513 .fault = perf_mmap_fault,
4514 .page_mkwrite = perf_mmap_fault,
4515 };
4516
4517 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4518 {
4519 struct perf_event *event = file->private_data;
4520 unsigned long user_locked, user_lock_limit;
4521 struct user_struct *user = current_user();
4522 unsigned long locked, lock_limit;
4523 struct ring_buffer *rb = NULL;
4524 unsigned long vma_size;
4525 unsigned long nr_pages;
4526 long user_extra = 0, extra = 0;
4527 int ret = 0, flags = 0;
4528
4529 /*
4530 * Don't allow mmap() of inherited per-task counters. This would
4531 * create a performance issue due to all children writing to the
4532 * same rb.
4533 */
4534 if (event->cpu == -1 && event->attr.inherit)
4535 return -EINVAL;
4536
4537 if (!(vma->vm_flags & VM_SHARED))
4538 return -EINVAL;
4539
4540 vma_size = vma->vm_end - vma->vm_start;
4541
4542 if (vma->vm_pgoff == 0) {
4543 nr_pages = (vma_size / PAGE_SIZE) - 1;
4544 } else {
4545 /*
4546		 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
4547		 * mapped; all subsequent mappings must have the same size
4548		 * and offset, and must be above the normal perf buffer.
4549 */
4550 u64 aux_offset, aux_size;
4551
4552 if (!event->rb)
4553 return -EINVAL;
4554
4555 nr_pages = vma_size / PAGE_SIZE;
4556
4557 mutex_lock(&event->mmap_mutex);
4558 ret = -EINVAL;
4559
4560 rb = event->rb;
4561 if (!rb)
4562 goto aux_unlock;
4563
4564 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
4565 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
4566
4567 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
4568 goto aux_unlock;
4569
4570 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
4571 goto aux_unlock;
4572
4573 /* already mapped with a different offset */
4574 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
4575 goto aux_unlock;
4576
4577 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
4578 goto aux_unlock;
4579
4580 /* already mapped with a different size */
4581 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
4582 goto aux_unlock;
4583
4584 if (!is_power_of_2(nr_pages))
4585 goto aux_unlock;
4586
4587 if (!atomic_inc_not_zero(&rb->mmap_count))
4588 goto aux_unlock;
4589
4590 if (rb_has_aux(rb)) {
4591 atomic_inc(&rb->aux_mmap_count);
4592 ret = 0;
4593 goto unlock;
4594 }
4595
4596 atomic_set(&rb->aux_mmap_count, 1);
4597 user_extra = nr_pages;
4598
4599 goto accounting;
4600 }
4601
4602 /*
4603	 * If we have rb pages, ensure their number is a power of two, so we
4604 * can do bitmasks instead of modulo.
4605 */
4606 if (nr_pages != 0 && !is_power_of_2(nr_pages))
4607 return -EINVAL;
4608
4609 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4610 return -EINVAL;
4611
4612 WARN_ON_ONCE(event->ctx->parent_ctx);
4613 again:
4614 mutex_lock(&event->mmap_mutex);
4615 if (event->rb) {
4616 if (event->rb->nr_pages != nr_pages) {
4617 ret = -EINVAL;
4618 goto unlock;
4619 }
4620
4621 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4622 /*
4623 * Raced against perf_mmap_close() through
4624 * perf_event_set_output(). Try again, hope for better
4625 * luck.
4626 */
4627 mutex_unlock(&event->mmap_mutex);
4628 goto again;
4629 }
4630
4631 goto unlock;
4632 }
4633
4634 user_extra = nr_pages + 1;
4635
4636 accounting:
4637 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4638
4639 /*
4640 * Increase the limit linearly with more CPUs:
4641 */
4642 user_lock_limit *= num_online_cpus();
4643
4644 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4645
4646 if (user_locked > user_lock_limit)
4647 extra = user_locked - user_lock_limit;
4648
4649 lock_limit = rlimit(RLIMIT_MEMLOCK);
4650 lock_limit >>= PAGE_SHIFT;
4651 locked = vma->vm_mm->pinned_vm + extra;
4652
4653 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4654 !capable(CAP_IPC_LOCK)) {
4655 ret = -EPERM;
4656 goto unlock;
4657 }
4658
4659 WARN_ON(!rb && event->rb);
4660
4661 if (vma->vm_flags & VM_WRITE)
4662 flags |= RING_BUFFER_WRITABLE;
4663
4664 if (!rb) {
4665 rb = rb_alloc(nr_pages,
4666 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4667 event->cpu, flags);
4668
4669 if (!rb) {
4670 ret = -ENOMEM;
4671 goto unlock;
4672 }
4673
4674 atomic_set(&rb->mmap_count, 1);
4675 rb->mmap_user = get_current_user();
4676 rb->mmap_locked = extra;
4677
4678 ring_buffer_attach(event, rb);
4679
4680 perf_event_init_userpage(event);
4681 perf_event_update_userpage(event);
4682 } else {
4683 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
4684 event->attr.aux_watermark, flags);
4685 if (!ret)
4686 rb->aux_mmap_locked = extra;
4687 }
4688
4689 unlock:
4690 if (!ret) {
4691 atomic_long_add(user_extra, &user->locked_vm);
4692 vma->vm_mm->pinned_vm += extra;
4693
4694 atomic_inc(&event->mmap_count);
4695 } else if (rb) {
4696 atomic_dec(&rb->mmap_count);
4697 }
4698 aux_unlock:
4699 mutex_unlock(&event->mmap_mutex);
4700
4701 /*
4702 * Since pinned accounting is per vm we cannot allow fork() to copy our
4703 * vma.
4704 */
4705 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4706 vma->vm_ops = &perf_mmap_vmops;
4707
4708 if (event->pmu->event_mapped)
4709 event->pmu->event_mapped(event);
4710
4711 return ret;
4712 }
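/*
 * Illustrative user-space sketch of the layout enforced above: the
 * first page is the struct perf_event_mmap_page meta-data page and the
 * following 2^n pages are the data area, hence the "1 + nr_pages" size
 * check; an AUX area (vma->vm_pgoff != 0) is mapped separately at the
 * offset/size previously stored in the user page.  "perf_fd" is a
 * hypothetical event fd:
 *
 *	size_t page = sysconf(_SC_PAGESIZE);
 *	void *base = mmap(NULL, (1 + 8) * page, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, perf_fd, 0);
 */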
4713
4714 static int perf_fasync(int fd, struct file *filp, int on)
4715 {
4716 struct inode *inode = file_inode(filp);
4717 struct perf_event *event = filp->private_data;
4718 int retval;
4719
4720 mutex_lock(&inode->i_mutex);
4721 retval = fasync_helper(fd, filp, on, &event->fasync);
4722 mutex_unlock(&inode->i_mutex);
4723
4724 if (retval < 0)
4725 return retval;
4726
4727 return 0;
4728 }
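/*
 * Illustrative user-space sketch: arming SIGIO delivery, which goes
 * through the fasync helper above and is raised by kill_fasync() in
 * perf_event_wakeup() below; "perf_fd" is a hypothetical event fd:
 *
 *	fcntl(perf_fd, F_SETOWN, getpid());
 *	fcntl(perf_fd, F_SETFL, fcntl(perf_fd, F_GETFL) | O_ASYNC);
 */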
4729
4730 static const struct file_operations perf_fops = {
4731 .llseek = no_llseek,
4732 .release = perf_release,
4733 .read = perf_read,
4734 .poll = perf_poll,
4735 .unlocked_ioctl = perf_ioctl,
4736 .compat_ioctl = perf_compat_ioctl,
4737 .mmap = perf_mmap,
4738 .fasync = perf_fasync,
4739 };
4740
4741 /*
4742 * Perf event wakeup
4743 *
4744 * If there's data, ensure we set the poll() state and publish everything
4745 * to user-space before waking everybody up.
4746 */
4747
4748 void perf_event_wakeup(struct perf_event *event)
4749 {
4750 ring_buffer_wakeup(event);
4751
4752 if (event->pending_kill) {
4753 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
4754 event->pending_kill = 0;
4755 }
4756 }
4757
4758 static void perf_pending_event(struct irq_work *entry)
4759 {
4760 struct perf_event *event = container_of(entry,
4761 struct perf_event, pending);
4762 int rctx;
4763
4764 rctx = perf_swevent_get_recursion_context();
4765 /*
4766 * If we 'fail' here, that's OK, it means recursion is already disabled
4767 * and we won't recurse 'further'.
4768 */
4769
4770 if (event->pending_disable) {
4771 event->pending_disable = 0;
4772 __perf_event_disable(event);
4773 }
4774
4775 if (event->pending_wakeup) {
4776 event->pending_wakeup = 0;
4777 perf_event_wakeup(event);
4778 }
4779
4780 if (rctx >= 0)
4781 perf_swevent_put_recursion_context(rctx);
4782 }
4783
4784 /*
4785 * For now we assume KVM is the only user of these callbacks.
4786 * Later on, we might change this to a list if another
4787 * virtualization implementation needs them.
4788 */
4789 struct perf_guest_info_callbacks *perf_guest_cbs;
4790
4791 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4792 {
4793 perf_guest_cbs = cbs;
4794 return 0;
4795 }
4796 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4797
4798 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4799 {
4800 perf_guest_cbs = NULL;
4801 return 0;
4802 }
4803 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4804
4805 static void
4806 perf_output_sample_regs(struct perf_output_handle *handle,
4807 struct pt_regs *regs, u64 mask)
4808 {
4809 int bit;
4810
4811 for_each_set_bit(bit, (const unsigned long *) &mask,
4812 sizeof(mask) * BITS_PER_BYTE) {
4813 u64 val;
4814
4815 val = perf_reg_value(regs, bit);
4816 perf_output_put(handle, val);
4817 }
4818 }
4819
4820 static void perf_sample_regs_user(struct perf_regs *regs_user,
4821 struct pt_regs *regs,
4822 struct pt_regs *regs_user_copy)
4823 {
4824 if (user_mode(regs)) {
4825 regs_user->abi = perf_reg_abi(current);
4826 regs_user->regs = regs;
4827 } else if (current->mm) {
4828 perf_get_regs_user(regs_user, regs, regs_user_copy);
4829 } else {
4830 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
4831 regs_user->regs = NULL;
4832 }
4833 }
4834
4835 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
4836 struct pt_regs *regs)
4837 {
4838 regs_intr->regs = regs;
4839 regs_intr->abi = perf_reg_abi(current);
4840 }
4841
4842
4843 /*
4844 * Get remaining task size from user stack pointer.
4845 *
4846 * It'd be better to take the stack vma map and limit this more
4847 * precisely, but there's no way to get it safely under interrupt,
4848 * so we use TASK_SIZE as the limit.
4849 */
4850 static u64 perf_ustack_task_size(struct pt_regs *regs)
4851 {
4852 unsigned long addr = perf_user_stack_pointer(regs);
4853
4854 if (!addr || addr >= TASK_SIZE)
4855 return 0;
4856
4857 return TASK_SIZE - addr;
4858 }
4859
4860 static u16
4861 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4862 struct pt_regs *regs)
4863 {
4864 u64 task_size;
4865
4866 /* No regs, no stack pointer, no dump. */
4867 if (!regs)
4868 return 0;
4869
4870 /*
4871	 * Check that the requested stack size fits within:
4872	 * - TASK_SIZE
4873	 *   If it doesn't, we limit the size to TASK_SIZE.
4874	 *
4875	 * - the remaining sample size
4876	 *   If it doesn't, we shrink the stack size to fit
4877	 *   into the remaining sample size.
4878 */
4879
4880 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4881 stack_size = min(stack_size, (u16) task_size);
4882
4883 /* Current header size plus static size and dynamic size. */
4884 header_size += 2 * sizeof(u64);
4885
4886 /* Do we fit in with the current stack dump size? */
4887 if ((u16) (header_size + stack_size) < header_size) {
4888 /*
4889 * If we overflow the maximum size for the sample,
4890 * we customize the stack dump size to fit in.
4891 */
4892 stack_size = USHRT_MAX - header_size - sizeof(u64);
4893 stack_size = round_up(stack_size, sizeof(u64));
4894 }
4895
4896 return stack_size;
4897 }
4898
4899 static void
4900 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4901 struct pt_regs *regs)
4902 {
4903 /* Case of a kernel thread, nothing to dump */
4904 if (!regs) {
4905 u64 size = 0;
4906 perf_output_put(handle, size);
4907 } else {
4908 unsigned long sp;
4909 unsigned int rem;
4910 u64 dyn_size;
4911
4912 /*
4913 * We dump:
4914 * static size
4915		 *   - the size requested by the user, or the best one we can
4916		 *     fit into the maximum sample size
4917 * data
4918 * - user stack dump data
4919 * dynamic size
4920 * - the actual dumped size
4921 */
4922
4923 /* Static size. */
4924 perf_output_put(handle, dump_size);
4925
4926 /* Data. */
4927 sp = perf_user_stack_pointer(regs);
4928 rem = __output_copy_user(handle, (void *) sp, dump_size);
4929 dyn_size = dump_size - rem;
4930
4931 perf_output_skip(handle, rem);
4932
4933 /* Dynamic size. */
4934 perf_output_put(handle, dyn_size);
4935 }
4936 }
4937
4938 static void __perf_event_header__init_id(struct perf_event_header *header,
4939 struct perf_sample_data *data,
4940 struct perf_event *event)
4941 {
4942 u64 sample_type = event->attr.sample_type;
4943
4944 data->type = sample_type;
4945 header->size += event->id_header_size;
4946
4947 if (sample_type & PERF_SAMPLE_TID) {
4948 /* namespace issues */
4949 data->tid_entry.pid = perf_event_pid(event, current);
4950 data->tid_entry.tid = perf_event_tid(event, current);
4951 }
4952
4953 if (sample_type & PERF_SAMPLE_TIME)
4954 data->time = perf_event_clock(event);
4955
4956 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4957 data->id = primary_event_id(event);
4958
4959 if (sample_type & PERF_SAMPLE_STREAM_ID)
4960 data->stream_id = event->id;
4961
4962 if (sample_type & PERF_SAMPLE_CPU) {
4963 data->cpu_entry.cpu = raw_smp_processor_id();
4964 data->cpu_entry.reserved = 0;
4965 }
4966 }
4967
4968 void perf_event_header__init_id(struct perf_event_header *header,
4969 struct perf_sample_data *data,
4970 struct perf_event *event)
4971 {
4972 if (event->attr.sample_id_all)
4973 __perf_event_header__init_id(header, data, event);
4974 }
4975
4976 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4977 struct perf_sample_data *data)
4978 {
4979 u64 sample_type = data->type;
4980
4981 if (sample_type & PERF_SAMPLE_TID)
4982 perf_output_put(handle, data->tid_entry);
4983
4984 if (sample_type & PERF_SAMPLE_TIME)
4985 perf_output_put(handle, data->time);
4986
4987 if (sample_type & PERF_SAMPLE_ID)
4988 perf_output_put(handle, data->id);
4989
4990 if (sample_type & PERF_SAMPLE_STREAM_ID)
4991 perf_output_put(handle, data->stream_id);
4992
4993 if (sample_type & PERF_SAMPLE_CPU)
4994 perf_output_put(handle, data->cpu_entry);
4995
4996 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4997 perf_output_put(handle, data->id);
4998 }
4999
5000 void perf_event__output_id_sample(struct perf_event *event,
5001 struct perf_output_handle *handle,
5002 struct perf_sample_data *sample)
5003 {
5004 if (event->attr.sample_id_all)
5005 __perf_event__output_id_sample(handle, sample);
5006 }
5007
5008 static void perf_output_read_one(struct perf_output_handle *handle,
5009 struct perf_event *event,
5010 u64 enabled, u64 running)
5011 {
5012 u64 read_format = event->attr.read_format;
5013 u64 values[4];
5014 int n = 0;
5015
5016 values[n++] = perf_event_count(event);
5017 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5018 values[n++] = enabled +
5019 atomic64_read(&event->child_total_time_enabled);
5020 }
5021 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5022 values[n++] = running +
5023 atomic64_read(&event->child_total_time_running);
5024 }
5025 if (read_format & PERF_FORMAT_ID)
5026 values[n++] = primary_event_id(event);
5027
5028 __output_copy(handle, values, n * sizeof(u64));
5029 }
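/*
 * Illustrative sketch of the read_format layout emitted above for an
 * event without PERF_FORMAT_GROUP; each optional field is present only
 * when its read_format bit is set (struct read_format_one is a
 * hypothetical name used just for this sketch):
 *
 *	struct read_format_one {
 *		__u64 value;
 *		__u64 time_enabled;	(PERF_FORMAT_TOTAL_TIME_ENABLED)
 *		__u64 time_running;	(PERF_FORMAT_TOTAL_TIME_RUNNING)
 *		__u64 id;		(PERF_FORMAT_ID)
 *	};
 */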
5030
5031 /*
5032 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
5033 */
5034 static void perf_output_read_group(struct perf_output_handle *handle,
5035 struct perf_event *event,
5036 u64 enabled, u64 running)
5037 {
5038 struct perf_event *leader = event->group_leader, *sub;
5039 u64 read_format = event->attr.read_format;
5040 u64 values[5];
5041 int n = 0;
5042
5043 values[n++] = 1 + leader->nr_siblings;
5044
5045 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5046 values[n++] = enabled;
5047
5048 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5049 values[n++] = running;
5050
5051 if (leader != event)
5052 leader->pmu->read(leader);
5053
5054 values[n++] = perf_event_count(leader);
5055 if (read_format & PERF_FORMAT_ID)
5056 values[n++] = primary_event_id(leader);
5057
5058 __output_copy(handle, values, n * sizeof(u64));
5059
5060 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5061 n = 0;
5062
5063 if ((sub != event) &&
5064 (sub->state == PERF_EVENT_STATE_ACTIVE))
5065 sub->pmu->read(sub);
5066
5067 values[n++] = perf_event_count(sub);
5068 if (read_format & PERF_FORMAT_ID)
5069 values[n++] = primary_event_id(sub);
5070
5071 __output_copy(handle, values, n * sizeof(u64));
5072 }
5073 }
5074
5075 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5076 PERF_FORMAT_TOTAL_TIME_RUNNING)
5077
5078 static void perf_output_read(struct perf_output_handle *handle,
5079 struct perf_event *event)
5080 {
5081 u64 enabled = 0, running = 0, now;
5082 u64 read_format = event->attr.read_format;
5083
5084 /*
5085 * compute total_time_enabled, total_time_running
5086 * based on snapshot values taken when the event
5087 * was last scheduled in.
5088 *
5089	 * we cannot simply call update_context_time()
5090	 * because of locking issues, as we are called in
5091	 * NMI context
5092 */
5093 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5094 calc_timer_values(event, &now, &enabled, &running);
5095
5096 if (event->attr.read_format & PERF_FORMAT_GROUP)
5097 perf_output_read_group(handle, event, enabled, running);
5098 else
5099 perf_output_read_one(handle, event, enabled, running);
5100 }
5101
5102 void perf_output_sample(struct perf_output_handle *handle,
5103 struct perf_event_header *header,
5104 struct perf_sample_data *data,
5105 struct perf_event *event)
5106 {
5107 u64 sample_type = data->type;
5108
5109 perf_output_put(handle, *header);
5110
5111 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5112 perf_output_put(handle, data->id);
5113
5114 if (sample_type & PERF_SAMPLE_IP)
5115 perf_output_put(handle, data->ip);
5116
5117 if (sample_type & PERF_SAMPLE_TID)
5118 perf_output_put(handle, data->tid_entry);
5119
5120 if (sample_type & PERF_SAMPLE_TIME)
5121 perf_output_put(handle, data->time);
5122
5123 if (sample_type & PERF_SAMPLE_ADDR)
5124 perf_output_put(handle, data->addr);
5125
5126 if (sample_type & PERF_SAMPLE_ID)
5127 perf_output_put(handle, data->id);
5128
5129 if (sample_type & PERF_SAMPLE_STREAM_ID)
5130 perf_output_put(handle, data->stream_id);
5131
5132 if (sample_type & PERF_SAMPLE_CPU)
5133 perf_output_put(handle, data->cpu_entry);
5134
5135 if (sample_type & PERF_SAMPLE_PERIOD)
5136 perf_output_put(handle, data->period);
5137
5138 if (sample_type & PERF_SAMPLE_READ)
5139 perf_output_read(handle, event);
5140
5141 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5142 if (data->callchain) {
5143 int size = 1;
5144
5145 if (data->callchain)
5146 size += data->callchain->nr;
5147
5148 size *= sizeof(u64);
5149
5150 __output_copy(handle, data->callchain, size);
5151 } else {
5152 u64 nr = 0;
5153 perf_output_put(handle, nr);
5154 }
5155 }
5156
5157 if (sample_type & PERF_SAMPLE_RAW) {
5158 if (data->raw) {
5159 perf_output_put(handle, data->raw->size);
5160 __output_copy(handle, data->raw->data,
5161 data->raw->size);
5162 } else {
5163 struct {
5164 u32 size;
5165 u32 data;
5166 } raw = {
5167 .size = sizeof(u32),
5168 .data = 0,
5169 };
5170 perf_output_put(handle, raw);
5171 }
5172 }
5173
5174 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5175 if (data->br_stack) {
5176 size_t size;
5177
5178 size = data->br_stack->nr
5179 * sizeof(struct perf_branch_entry);
5180
5181 perf_output_put(handle, data->br_stack->nr);
5182 perf_output_copy(handle, data->br_stack->entries, size);
5183 } else {
5184 /*
5185 * we always store at least the value of nr
5186 */
5187 u64 nr = 0;
5188 perf_output_put(handle, nr);
5189 }
5190 }
5191
5192 if (sample_type & PERF_SAMPLE_REGS_USER) {
5193 u64 abi = data->regs_user.abi;
5194
5195 /*
5196 * If there are no regs to dump, notice it through
5197		 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5198 */
5199 perf_output_put(handle, abi);
5200
5201 if (abi) {
5202 u64 mask = event->attr.sample_regs_user;
5203 perf_output_sample_regs(handle,
5204 data->regs_user.regs,
5205 mask);
5206 }
5207 }
5208
5209 if (sample_type & PERF_SAMPLE_STACK_USER) {
5210 perf_output_sample_ustack(handle,
5211 data->stack_user_size,
5212 data->regs_user.regs);
5213 }
5214
5215 if (sample_type & PERF_SAMPLE_WEIGHT)
5216 perf_output_put(handle, data->weight);
5217
5218 if (sample_type & PERF_SAMPLE_DATA_SRC)
5219 perf_output_put(handle, data->data_src.val);
5220
5221 if (sample_type & PERF_SAMPLE_TRANSACTION)
5222 perf_output_put(handle, data->txn);
5223
5224 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5225 u64 abi = data->regs_intr.abi;
5226 /*
5227 * If there are no regs to dump, notice it through
5228		 * the first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5229 */
5230 perf_output_put(handle, abi);
5231
5232 if (abi) {
5233 u64 mask = event->attr.sample_regs_intr;
5234
5235 perf_output_sample_regs(handle,
5236 data->regs_intr.regs,
5237 mask);
5238 }
5239 }
5240
5241 if (!event->attr.watermark) {
5242 int wakeup_events = event->attr.wakeup_events;
5243
5244 if (wakeup_events) {
5245 struct ring_buffer *rb = handle->rb;
5246 int events = local_inc_return(&rb->events);
5247
5248 if (events >= wakeup_events) {
5249 local_sub(wakeup_events, &rb->events);
5250 local_inc(&rb->wakeup);
5251 }
5252 }
5253 }
5254 }
5255
5256 void perf_prepare_sample(struct perf_event_header *header,
5257 struct perf_sample_data *data,
5258 struct perf_event *event,
5259 struct pt_regs *regs)
5260 {
5261 u64 sample_type = event->attr.sample_type;
5262
5263 header->type = PERF_RECORD_SAMPLE;
5264 header->size = sizeof(*header) + event->header_size;
5265
5266 header->misc = 0;
5267 header->misc |= perf_misc_flags(regs);
5268
5269 __perf_event_header__init_id(header, data, event);
5270
5271 if (sample_type & PERF_SAMPLE_IP)
5272 data->ip = perf_instruction_pointer(regs);
5273
5274 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5275 int size = 1;
5276
5277 data->callchain = perf_callchain(event, regs);
5278
5279 if (data->callchain)
5280 size += data->callchain->nr;
5281
5282 header->size += size * sizeof(u64);
5283 }
5284
5285 if (sample_type & PERF_SAMPLE_RAW) {
5286 int size = sizeof(u32);
5287
5288 if (data->raw)
5289 size += data->raw->size;
5290 else
5291 size += sizeof(u32);
5292
5293 WARN_ON_ONCE(size & (sizeof(u64)-1));
5294 header->size += size;
5295 }
5296
5297 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5298 int size = sizeof(u64); /* nr */
5299 if (data->br_stack) {
5300 size += data->br_stack->nr
5301 * sizeof(struct perf_branch_entry);
5302 }
5303 header->size += size;
5304 }
5305
5306 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5307 perf_sample_regs_user(&data->regs_user, regs,
5308 &data->regs_user_copy);
5309
5310 if (sample_type & PERF_SAMPLE_REGS_USER) {
5311 /* regs dump ABI info */
5312 int size = sizeof(u64);
5313
5314 if (data->regs_user.regs) {
5315 u64 mask = event->attr.sample_regs_user;
5316 size += hweight64(mask) * sizeof(u64);
5317 }
5318
5319 header->size += size;
5320 }
5321
5322 if (sample_type & PERF_SAMPLE_STACK_USER) {
5323 /*
5324		 * The PERF_SAMPLE_STACK_USER bit either needs to always be
5325		 * processed as the last one, or an additional check must be
5326		 * added whenever a new sample type is added, because we could
5327		 * eat up the rest of the sample size.
5328 */
5329 u16 stack_size = event->attr.sample_stack_user;
5330 u16 size = sizeof(u64);
5331
5332 stack_size = perf_sample_ustack_size(stack_size, header->size,
5333 data->regs_user.regs);
5334
5335 /*
5336 * If there is something to dump, add space for the dump
5337 * itself and for the field that tells the dynamic size,
5338		 * which is how many bytes were actually dumped.
5339 */
5340 if (stack_size)
5341 size += sizeof(u64) + stack_size;
5342
5343 data->stack_user_size = stack_size;
5344 header->size += size;
5345 }
5346
5347 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5348 /* regs dump ABI info */
5349 int size = sizeof(u64);
5350
5351 perf_sample_regs_intr(&data->regs_intr, regs);
5352
5353 if (data->regs_intr.regs) {
5354 u64 mask = event->attr.sample_regs_intr;
5355
5356 size += hweight64(mask) * sizeof(u64);
5357 }
5358
5359 header->size += size;
5360 }
5361 }
5362
5363 static void perf_event_output(struct perf_event *event,
5364 struct perf_sample_data *data,
5365 struct pt_regs *regs)
5366 {
5367 struct perf_output_handle handle;
5368 struct perf_event_header header;
5369
5370 /* protect the callchain buffers */
5371 rcu_read_lock();
5372
5373 perf_prepare_sample(&header, data, event, regs);
5374
5375 if (perf_output_begin(&handle, event, header.size))
5376 goto exit;
5377
5378 perf_output_sample(&handle, &header, data, event);
5379
5380 perf_output_end(&handle);
5381
5382 exit:
5383 rcu_read_unlock();
5384 }
5385
5386 /*
5387 * read event_id
5388 */
5389
5390 struct perf_read_event {
5391 struct perf_event_header header;
5392
5393 u32 pid;
5394 u32 tid;
5395 };
5396
5397 static void
5398 perf_event_read_event(struct perf_event *event,
5399 struct task_struct *task)
5400 {
5401 struct perf_output_handle handle;
5402 struct perf_sample_data sample;
5403 struct perf_read_event read_event = {
5404 .header = {
5405 .type = PERF_RECORD_READ,
5406 .misc = 0,
5407 .size = sizeof(read_event) + event->read_size,
5408 },
5409 .pid = perf_event_pid(event, task),
5410 .tid = perf_event_tid(event, task),
5411 };
5412 int ret;
5413
5414 perf_event_header__init_id(&read_event.header, &sample, event);
5415 ret = perf_output_begin(&handle, event, read_event.header.size);
5416 if (ret)
5417 return;
5418
5419 perf_output_put(&handle, read_event);
5420 perf_output_read(&handle, event);
5421 perf_event__output_id_sample(event, &handle, &sample);
5422
5423 perf_output_end(&handle);
5424 }
5425
5426 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5427
5428 static void
5429 perf_event_aux_ctx(struct perf_event_context *ctx,
5430 perf_event_aux_output_cb output,
5431 void *data)
5432 {
5433 struct perf_event *event;
5434
5435 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5436 if (event->state < PERF_EVENT_STATE_INACTIVE)
5437 continue;
5438 if (!event_filter_match(event))
5439 continue;
5440 output(event, data);
5441 }
5442 }
5443
5444 static void
5445 perf_event_aux(perf_event_aux_output_cb output, void *data,
5446 struct perf_event_context *task_ctx)
5447 {
5448 struct perf_cpu_context *cpuctx;
5449 struct perf_event_context *ctx;
5450 struct pmu *pmu;
5451 int ctxn;
5452
5453 rcu_read_lock();
5454 list_for_each_entry_rcu(pmu, &pmus, entry) {
5455 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5456 if (cpuctx->unique_pmu != pmu)
5457 goto next;
5458 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5459 if (task_ctx)
5460 goto next;
5461 ctxn = pmu->task_ctx_nr;
5462 if (ctxn < 0)
5463 goto next;
5464 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5465 if (ctx)
5466 perf_event_aux_ctx(ctx, output, data);
5467 next:
5468 put_cpu_ptr(pmu->pmu_cpu_context);
5469 }
5470
5471 if (task_ctx) {
5472 preempt_disable();
5473 perf_event_aux_ctx(task_ctx, output, data);
5474 preempt_enable();
5475 }
5476 rcu_read_unlock();
5477 }
5478
5479 /*
5480 * task tracking -- fork/exit
5481 *
5482 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5483 */
5484
5485 struct perf_task_event {
5486 struct task_struct *task;
5487 struct perf_event_context *task_ctx;
5488
5489 struct {
5490 struct perf_event_header header;
5491
5492 u32 pid;
5493 u32 ppid;
5494 u32 tid;
5495 u32 ptid;
5496 u64 time;
5497 } event_id;
5498 };
5499
5500 static int perf_event_task_match(struct perf_event *event)
5501 {
5502 return event->attr.comm || event->attr.mmap ||
5503 event->attr.mmap2 || event->attr.mmap_data ||
5504 event->attr.task;
5505 }
5506
5507 static void perf_event_task_output(struct perf_event *event,
5508 void *data)
5509 {
5510 struct perf_task_event *task_event = data;
5511 struct perf_output_handle handle;
5512 struct perf_sample_data sample;
5513 struct task_struct *task = task_event->task;
5514 int ret, size = task_event->event_id.header.size;
5515
5516 if (!perf_event_task_match(event))
5517 return;
5518
5519 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5520
5521 ret = perf_output_begin(&handle, event,
5522 task_event->event_id.header.size);
5523 if (ret)
5524 goto out;
5525
5526 task_event->event_id.pid = perf_event_pid(event, task);
5527 task_event->event_id.ppid = perf_event_pid(event, current);
5528
5529 task_event->event_id.tid = perf_event_tid(event, task);
5530 task_event->event_id.ptid = perf_event_tid(event, current);
5531
5532 task_event->event_id.time = perf_event_clock(event);
5533
5534 perf_output_put(&handle, task_event->event_id);
5535
5536 perf_event__output_id_sample(event, &handle, &sample);
5537
5538 perf_output_end(&handle);
5539 out:
5540 task_event->event_id.header.size = size;
5541 }
5542
5543 static void perf_event_task(struct task_struct *task,
5544 struct perf_event_context *task_ctx,
5545 int new)
5546 {
5547 struct perf_task_event task_event;
5548
5549 if (!atomic_read(&nr_comm_events) &&
5550 !atomic_read(&nr_mmap_events) &&
5551 !atomic_read(&nr_task_events))
5552 return;
5553
5554 task_event = (struct perf_task_event){
5555 .task = task,
5556 .task_ctx = task_ctx,
5557 .event_id = {
5558 .header = {
5559 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5560 .misc = 0,
5561 .size = sizeof(task_event.event_id),
5562 },
5563 /* .pid */
5564 /* .ppid */
5565 /* .tid */
5566 /* .ptid */
5567 /* .time */
5568 },
5569 };
5570
5571 perf_event_aux(perf_event_task_output,
5572 &task_event,
5573 task_ctx);
5574 }
5575
5576 void perf_event_fork(struct task_struct *task)
5577 {
5578 perf_event_task(task, NULL, 1);
5579 }
5580
5581 /*
5582 * comm tracking
5583 */
5584
5585 struct perf_comm_event {
5586 struct task_struct *task;
5587 char *comm;
5588 int comm_size;
5589
5590 struct {
5591 struct perf_event_header header;
5592
5593 u32 pid;
5594 u32 tid;
5595 } event_id;
5596 };
5597
5598 static int perf_event_comm_match(struct perf_event *event)
5599 {
5600 return event->attr.comm;
5601 }
5602
5603 static void perf_event_comm_output(struct perf_event *event,
5604 void *data)
5605 {
5606 struct perf_comm_event *comm_event = data;
5607 struct perf_output_handle handle;
5608 struct perf_sample_data sample;
5609 int size = comm_event->event_id.header.size;
5610 int ret;
5611
5612 if (!perf_event_comm_match(event))
5613 return;
5614
5615 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5616 ret = perf_output_begin(&handle, event,
5617 comm_event->event_id.header.size);
5618
5619 if (ret)
5620 goto out;
5621
5622 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5623 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5624
5625 perf_output_put(&handle, comm_event->event_id);
5626 __output_copy(&handle, comm_event->comm,
5627 comm_event->comm_size);
5628
5629 perf_event__output_id_sample(event, &handle, &sample);
5630
5631 perf_output_end(&handle);
5632 out:
5633 comm_event->event_id.header.size = size;
5634 }
5635
5636 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5637 {
5638 char comm[TASK_COMM_LEN];
5639 unsigned int size;
5640
5641 memset(comm, 0, sizeof(comm));
5642 strlcpy(comm, comm_event->task->comm, sizeof(comm));
5643 size = ALIGN(strlen(comm)+1, sizeof(u64));
5644
5645 comm_event->comm = comm;
5646 comm_event->comm_size = size;
5647
5648 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5649
5650 perf_event_aux(perf_event_comm_output,
5651 comm_event,
5652 NULL);
5653 }
5654
5655 void perf_event_comm(struct task_struct *task, bool exec)
5656 {
5657 struct perf_comm_event comm_event;
5658
5659 if (!atomic_read(&nr_comm_events))
5660 return;
5661
5662 comm_event = (struct perf_comm_event){
5663 .task = task,
5664 /* .comm */
5665 /* .comm_size */
5666 .event_id = {
5667 .header = {
5668 .type = PERF_RECORD_COMM,
5669 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5670 /* .size */
5671 },
5672 /* .pid */
5673 /* .tid */
5674 },
5675 };
5676
5677 perf_event_comm_event(&comm_event);
5678 }
5679
5680 /*
5681 * mmap tracking
5682 */
5683
5684 struct perf_mmap_event {
5685 struct vm_area_struct *vma;
5686
5687 const char *file_name;
5688 int file_size;
5689 int maj, min;
5690 u64 ino;
5691 u64 ino_generation;
5692 u32 prot, flags;
5693
5694 struct {
5695 struct perf_event_header header;
5696
5697 u32 pid;
5698 u32 tid;
5699 u64 start;
5700 u64 len;
5701 u64 pgoff;
5702 } event_id;
5703 };
5704
5705 static int perf_event_mmap_match(struct perf_event *event,
5706 void *data)
5707 {
5708 struct perf_mmap_event *mmap_event = data;
5709 struct vm_area_struct *vma = mmap_event->vma;
5710 int executable = vma->vm_flags & VM_EXEC;
5711
5712 return (!executable && event->attr.mmap_data) ||
5713 (executable && (event->attr.mmap || event->attr.mmap2));
5714 }
5715
5716 static void perf_event_mmap_output(struct perf_event *event,
5717 void *data)
5718 {
5719 struct perf_mmap_event *mmap_event = data;
5720 struct perf_output_handle handle;
5721 struct perf_sample_data sample;
5722 int size = mmap_event->event_id.header.size;
5723 int ret;
5724
5725 if (!perf_event_mmap_match(event, data))
5726 return;
5727
5728 if (event->attr.mmap2) {
5729 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5730 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5731 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5732 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5733 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5734 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5735 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5736 }
5737
5738 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5739 ret = perf_output_begin(&handle, event,
5740 mmap_event->event_id.header.size);
5741 if (ret)
5742 goto out;
5743
5744 mmap_event->event_id.pid = perf_event_pid(event, current);
5745 mmap_event->event_id.tid = perf_event_tid(event, current);
5746
5747 perf_output_put(&handle, mmap_event->event_id);
5748
5749 if (event->attr.mmap2) {
5750 perf_output_put(&handle, mmap_event->maj);
5751 perf_output_put(&handle, mmap_event->min);
5752 perf_output_put(&handle, mmap_event->ino);
5753 perf_output_put(&handle, mmap_event->ino_generation);
5754 perf_output_put(&handle, mmap_event->prot);
5755 perf_output_put(&handle, mmap_event->flags);
5756 }
5757
5758 __output_copy(&handle, mmap_event->file_name,
5759 mmap_event->file_size);
5760
5761 perf_event__output_id_sample(event, &handle, &sample);
5762
5763 perf_output_end(&handle);
5764 out:
5765 mmap_event->event_id.header.size = size;
5766 }
5767
5768 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5769 {
5770 struct vm_area_struct *vma = mmap_event->vma;
5771 struct file *file = vma->vm_file;
5772 int maj = 0, min = 0;
5773 u64 ino = 0, gen = 0;
5774 u32 prot = 0, flags = 0;
5775 unsigned int size;
5776 char tmp[16];
5777 char *buf = NULL;
5778 char *name;
5779
5780 if (file) {
5781 struct inode *inode;
5782 dev_t dev;
5783
5784 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5785 if (!buf) {
5786 name = "//enomem";
5787 goto cpy_name;
5788 }
5789 /*
5790		 * d_path() works from the end of the buffer backwards, so we
5791		 * need to add enough zero bytes after the string to handle
5792		 * the 64-bit alignment we do later.
5793 */
5794 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5795 if (IS_ERR(name)) {
5796 name = "//toolong";
5797 goto cpy_name;
5798 }
5799 inode = file_inode(vma->vm_file);
5800 dev = inode->i_sb->s_dev;
5801 ino = inode->i_ino;
5802 gen = inode->i_generation;
5803 maj = MAJOR(dev);
5804 min = MINOR(dev);
5805
5806 if (vma->vm_flags & VM_READ)
5807 prot |= PROT_READ;
5808 if (vma->vm_flags & VM_WRITE)
5809 prot |= PROT_WRITE;
5810 if (vma->vm_flags & VM_EXEC)
5811 prot |= PROT_EXEC;
5812
5813 if (vma->vm_flags & VM_MAYSHARE)
5814 flags = MAP_SHARED;
5815 else
5816 flags = MAP_PRIVATE;
5817
5818 if (vma->vm_flags & VM_DENYWRITE)
5819 flags |= MAP_DENYWRITE;
5820 if (vma->vm_flags & VM_MAYEXEC)
5821 flags |= MAP_EXECUTABLE;
5822 if (vma->vm_flags & VM_LOCKED)
5823 flags |= MAP_LOCKED;
5824 if (vma->vm_flags & VM_HUGETLB)
5825 flags |= MAP_HUGETLB;
5826
5827 goto got_name;
5828 } else {
5829 if (vma->vm_ops && vma->vm_ops->name) {
5830 name = (char *) vma->vm_ops->name(vma);
5831 if (name)
5832 goto cpy_name;
5833 }
5834
5835 name = (char *)arch_vma_name(vma);
5836 if (name)
5837 goto cpy_name;
5838
5839 if (vma->vm_start <= vma->vm_mm->start_brk &&
5840 vma->vm_end >= vma->vm_mm->brk) {
5841 name = "[heap]";
5842 goto cpy_name;
5843 }
5844 if (vma->vm_start <= vma->vm_mm->start_stack &&
5845 vma->vm_end >= vma->vm_mm->start_stack) {
5846 name = "[stack]";
5847 goto cpy_name;
5848 }
5849
5850 name = "//anon";
5851 goto cpy_name;
5852 }
5853
5854 cpy_name:
5855 strlcpy(tmp, name, sizeof(tmp));
5856 name = tmp;
5857 got_name:
5858 /*
5859 * Since our buffer works in 8 byte units we need to align our string
5860 * size to a multiple of 8. However, we must guarantee the tail end is
5861 * zero'd out to avoid leaking random bits to userspace.
5862 */
5863 size = strlen(name)+1;
5864 while (!IS_ALIGNED(size, sizeof(u64)))
5865 name[size++] = '\0';
5866
5867 mmap_event->file_name = name;
5868 mmap_event->file_size = size;
5869 mmap_event->maj = maj;
5870 mmap_event->min = min;
5871 mmap_event->ino = ino;
5872 mmap_event->ino_generation = gen;
5873 mmap_event->prot = prot;
5874 mmap_event->flags = flags;
5875
5876 if (!(vma->vm_flags & VM_EXEC))
5877 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5878
5879 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5880
5881 perf_event_aux(perf_event_mmap_output,
5882 mmap_event,
5883 NULL);
5884
5885 kfree(buf);
5886 }
5887
5888 void perf_event_mmap(struct vm_area_struct *vma)
5889 {
5890 struct perf_mmap_event mmap_event;
5891
5892 if (!atomic_read(&nr_mmap_events))
5893 return;
5894
5895 mmap_event = (struct perf_mmap_event){
5896 .vma = vma,
5897 /* .file_name */
5898 /* .file_size */
5899 .event_id = {
5900 .header = {
5901 .type = PERF_RECORD_MMAP,
5902 .misc = PERF_RECORD_MISC_USER,
5903 /* .size */
5904 },
5905 /* .pid */
5906 /* .tid */
5907 .start = vma->vm_start,
5908 .len = vma->vm_end - vma->vm_start,
5909 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5910 },
5911 /* .maj (attr_mmap2 only) */
5912 /* .min (attr_mmap2 only) */
5913 /* .ino (attr_mmap2 only) */
5914 /* .ino_generation (attr_mmap2 only) */
5915 /* .prot (attr_mmap2 only) */
5916 /* .flags (attr_mmap2 only) */
5917 };
5918
5919 perf_event_mmap_event(&mmap_event);
5920 }
5921
5922 void perf_event_aux_event(struct perf_event *event, unsigned long head,
5923 unsigned long size, u64 flags)
5924 {
5925 struct perf_output_handle handle;
5926 struct perf_sample_data sample;
5927 struct perf_aux_event {
5928 struct perf_event_header header;
5929 u64 offset;
5930 u64 size;
5931 u64 flags;
5932 } rec = {
5933 .header = {
5934 .type = PERF_RECORD_AUX,
5935 .misc = 0,
5936 .size = sizeof(rec),
5937 },
5938 .offset = head,
5939 .size = size,
5940 .flags = flags,
5941 };
5942 int ret;
5943
5944 perf_event_header__init_id(&rec.header, &sample, event);
5945 ret = perf_output_begin(&handle, event, rec.header.size);
5946
5947 if (ret)
5948 return;
5949
5950 perf_output_put(&handle, rec);
5951 perf_event__output_id_sample(event, &handle, &sample);
5952
5953 perf_output_end(&handle);
5954 }
5955
5956 /*
5957 * IRQ throttle logging
5958 */
5959
5960 static void perf_log_throttle(struct perf_event *event, int enable)
5961 {
5962 struct perf_output_handle handle;
5963 struct perf_sample_data sample;
5964 int ret;
5965
5966 struct {
5967 struct perf_event_header header;
5968 u64 time;
5969 u64 id;
5970 u64 stream_id;
5971 } throttle_event = {
5972 .header = {
5973 .type = PERF_RECORD_THROTTLE,
5974 .misc = 0,
5975 .size = sizeof(throttle_event),
5976 },
5977 .time = perf_event_clock(event),
5978 .id = primary_event_id(event),
5979 .stream_id = event->id,
5980 };
5981
5982 if (enable)
5983 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5984
5985 perf_event_header__init_id(&throttle_event.header, &sample, event);
5986
5987 ret = perf_output_begin(&handle, event,
5988 throttle_event.header.size);
5989 if (ret)
5990 return;
5991
5992 perf_output_put(&handle, throttle_event);
5993 perf_event__output_id_sample(event, &handle, &sample);
5994 perf_output_end(&handle);
5995 }
5996
5997 static void perf_log_itrace_start(struct perf_event *event)
5998 {
5999 struct perf_output_handle handle;
6000 struct perf_sample_data sample;
6001 struct perf_aux_event {
6002 struct perf_event_header header;
6003 u32 pid;
6004 u32 tid;
6005 } rec;
6006 int ret;
6007
6008 if (event->parent)
6009 event = event->parent;
6010
6011 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
6012 event->hw.itrace_started)
6013 return;
6014
6015 event->hw.itrace_started = 1;
6016
6017 rec.header.type = PERF_RECORD_ITRACE_START;
6018 rec.header.misc = 0;
6019 rec.header.size = sizeof(rec);
6020 rec.pid = perf_event_pid(event, current);
6021 rec.tid = perf_event_tid(event, current);
6022
6023 perf_event_header__init_id(&rec.header, &sample, event);
6024 ret = perf_output_begin(&handle, event, rec.header.size);
6025
6026 if (ret)
6027 return;
6028
6029 perf_output_put(&handle, rec);
6030 perf_event__output_id_sample(event, &handle, &sample);
6031
6032 perf_output_end(&handle);
6033 }
6034
6035 /*
6036 * Generic event overflow handling, sampling.
6037 */
6038
6039 static int __perf_event_overflow(struct perf_event *event,
6040 int throttle, struct perf_sample_data *data,
6041 struct pt_regs *regs)
6042 {
6043 int events = atomic_read(&event->event_limit);
6044 struct hw_perf_event *hwc = &event->hw;
6045 u64 seq;
6046 int ret = 0;
6047
6048 /*
6049 * Non-sampling counters might still use the PMI to fold short
6050	 * hardware counters; ignore those.
6051 */
6052 if (unlikely(!is_sampling_event(event)))
6053 return 0;
6054
6055 seq = __this_cpu_read(perf_throttled_seq);
6056 if (seq != hwc->interrupts_seq) {
6057 hwc->interrupts_seq = seq;
6058 hwc->interrupts = 1;
6059 } else {
6060 hwc->interrupts++;
6061 if (unlikely(throttle
6062 && hwc->interrupts >= max_samples_per_tick)) {
6063 __this_cpu_inc(perf_throttled_count);
6064 hwc->interrupts = MAX_INTERRUPTS;
6065 perf_log_throttle(event, 0);
6066 tick_nohz_full_kick();
6067 ret = 1;
6068 }
6069 }
6070
6071 if (event->attr.freq) {
6072 u64 now = perf_clock();
6073 s64 delta = now - hwc->freq_time_stamp;
6074
6075 hwc->freq_time_stamp = now;
6076
6077 if (delta > 0 && delta < 2*TICK_NSEC)
6078 perf_adjust_period(event, delta, hwc->last_period, true);
6079 }
6080
6081 /*
6082 * XXX event_limit might not quite work as expected on inherited
6083 * events
6084 */
6085
6086 event->pending_kill = POLL_IN;
6087 if (events && atomic_dec_and_test(&event->event_limit)) {
6088 ret = 1;
6089 event->pending_kill = POLL_HUP;
6090 event->pending_disable = 1;
6091 irq_work_queue(&event->pending);
6092 }
6093
6094 if (event->overflow_handler)
6095 event->overflow_handler(event, data, regs);
6096 else
6097 perf_event_output(event, data, regs);
6098
6099 if (event->fasync && event->pending_kill) {
6100 event->pending_wakeup = 1;
6101 irq_work_queue(&event->pending);
6102 }
6103
6104 return ret;
6105 }
6106
6107 int perf_event_overflow(struct perf_event *event,
6108 struct perf_sample_data *data,
6109 struct pt_regs *regs)
6110 {
6111 return __perf_event_overflow(event, 1, data, regs);
6112 }
6113
6114 /*
6115 * Generic software event infrastructure
6116 */
6117
6118 struct swevent_htable {
6119 struct swevent_hlist *swevent_hlist;
6120 struct mutex hlist_mutex;
6121 int hlist_refcount;
6122
6123	/* Recursion avoidance in each context */
6124 int recursion[PERF_NR_CONTEXTS];
6125
6126 /* Keeps track of cpu being initialized/exited */
6127 bool online;
6128 };
6129
6130 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
6131
6132 /*
6133 * We directly increment event->count and keep a second value in
6134 * event->hw.period_left to count intervals. The latter is
6135 * kept in the range [-sample_period, 0] so that we can use the
6136 * sign as the trigger.
6137 */
6138
6139 u64 perf_swevent_set_period(struct perf_event *event)
6140 {
6141 struct hw_perf_event *hwc = &event->hw;
6142 u64 period = hwc->last_period;
6143 u64 nr, offset;
6144 s64 old, val;
6145
6146 hwc->last_period = hwc->sample_period;
6147
6148 again:
6149 old = val = local64_read(&hwc->period_left);
6150 if (val < 0)
6151 return 0;
6152
6153 nr = div64_u64(period + val, period);
6154 offset = nr * period;
6155 val -= offset;
6156 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
6157 goto again;
6158
6159 return nr;
6160 }
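/*
 * Illustrative worked example of the arithmetic above: with
 * sample_period == 100 and period_left == 30 (the counter overshot the
 * trigger point by 30), nr = (100 + 30) / 100 = 1 overflow is reported
 * and period_left becomes 30 - 1 * 100 = -70, i.e. 70 more events are
 * needed before the next overflow.
 */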
6161
6162 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
6163 struct perf_sample_data *data,
6164 struct pt_regs *regs)
6165 {
6166 struct hw_perf_event *hwc = &event->hw;
6167 int throttle = 0;
6168
6169 if (!overflow)
6170 overflow = perf_swevent_set_period(event);
6171
6172 if (hwc->interrupts == MAX_INTERRUPTS)
6173 return;
6174
6175 for (; overflow; overflow--) {
6176 if (__perf_event_overflow(event, throttle,
6177 data, regs)) {
6178 /*
6179 * We inhibit the overflow from happening when
6180 * hwc->interrupts == MAX_INTERRUPTS.
6181 */
6182 break;
6183 }
6184 throttle = 1;
6185 }
6186 }
6187
6188 static void perf_swevent_event(struct perf_event *event, u64 nr,
6189 struct perf_sample_data *data,
6190 struct pt_regs *regs)
6191 {
6192 struct hw_perf_event *hwc = &event->hw;
6193
6194 local64_add(nr, &event->count);
6195
6196 if (!regs)
6197 return;
6198
6199 if (!is_sampling_event(event))
6200 return;
6201
6202 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
6203 data->period = nr;
6204 return perf_swevent_overflow(event, 1, data, regs);
6205 } else
6206 data->period = event->hw.last_period;
6207
6208 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
6209 return perf_swevent_overflow(event, 1, data, regs);
6210
6211 if (local64_add_negative(nr, &hwc->period_left))
6212 return;
6213
6214 perf_swevent_overflow(event, 0, data, regs);
6215 }
6216
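/*
 * Returns 1 when the sample must be dropped: the event is stopped, or
 * @regs shows a privilege level (user vs. kernel) that the event
 * attributes exclude.
 */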
6217 static int perf_exclude_event(struct perf_event *event,
6218 struct pt_regs *regs)
6219 {
6220 if (event->hw.state & PERF_HES_STOPPED)
6221 return 1;
6222
6223 if (regs) {
6224 if (event->attr.exclude_user && user_mode(regs))
6225 return 1;
6226
6227 if (event->attr.exclude_kernel && !user_mode(regs))
6228 return 1;
6229 }
6230
6231 return 0;
6232 }
6233
6234 static int perf_swevent_match(struct perf_event *event,
6235 enum perf_type_id type,
6236 u32 event_id,
6237 struct perf_sample_data *data,
6238 struct pt_regs *regs)
6239 {
6240 if (event->attr.type != type)
6241 return 0;
6242
6243 if (event->attr.config != event_id)
6244 return 0;
6245
6246 if (perf_exclude_event(event, regs))
6247 return 0;
6248
6249 return 1;
6250 }
6251
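/*
 * Hash the (type, event_id) pair into one of the 2^SWEVENT_HLIST_BITS
 * buckets of the per-cpu software event hash list.
 */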
6252 static inline u64 swevent_hash(u64 type, u32 event_id)
6253 {
6254 u64 val = event_id | (type << 32);
6255
6256 return hash_64(val, SWEVENT_HLIST_BITS);
6257 }
6258
6259 static inline struct hlist_head *
6260 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
6261 {
6262 u64 hash = swevent_hash(type, event_id);
6263
6264 return &hlist->heads[hash];
6265 }
6266
6267 /* For the read side: events when they trigger */
6268 static inline struct hlist_head *
6269 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6270 {
6271 struct swevent_hlist *hlist;
6272
6273 hlist = rcu_dereference(swhash->swevent_hlist);
6274 if (!hlist)
6275 return NULL;
6276
6277 return __find_swevent_head(hlist, type, event_id);
6278 }
6279
6280 /* For the event head insertion and removal in the hlist */
6281 static inline struct hlist_head *
6282 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6283 {
6284 struct swevent_hlist *hlist;
6285 u32 event_id = event->attr.config;
6286 u64 type = event->attr.type;
6287
6288 /*
6289 * Event scheduling is always serialized against hlist allocation
6290 * and release, which makes the protected version suitable here.
6291 * The context lock guarantees that.
6292 */
6293 hlist = rcu_dereference_protected(swhash->swevent_hlist,
6294 lockdep_is_held(&event->ctx->lock));
6295 if (!hlist)
6296 return NULL;
6297
6298 return __find_swevent_head(hlist, type, event_id);
6299 }
6300
6301 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6302 u64 nr,
6303 struct perf_sample_data *data,
6304 struct pt_regs *regs)
6305 {
6306 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6307 struct perf_event *event;
6308 struct hlist_head *head;
6309
6310 rcu_read_lock();
6311 head = find_swevent_head_rcu(swhash, type, event_id);
6312 if (!head)
6313 goto end;
6314
6315 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6316 if (perf_swevent_match(event, type, event_id, data, regs))
6317 perf_swevent_event(event, nr, data, regs);
6318 }
6319 end:
6320 rcu_read_unlock();
6321 }
6322
6323 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6324
6325 int perf_swevent_get_recursion_context(void)
6326 {
6327 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6328
6329 return get_recursion_context(swhash->recursion);
6330 }
6331 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6332
6333 inline void perf_swevent_put_recursion_context(int rctx)
6334 {
6335 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6336
6337 put_recursion_context(swhash->recursion, rctx);
6338 }
6339
6340 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6341 {
6342 struct perf_sample_data data;
6343
6344 if (WARN_ON_ONCE(!regs))
6345 return;
6346
6347 perf_sample_data_init(&data, addr, 0);
6348 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6349 }
6350
6351 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6352 {
6353 int rctx;
6354
6355 preempt_disable_notrace();
6356 rctx = perf_swevent_get_recursion_context();
6357 if (unlikely(rctx < 0))
6358 goto fail;
6359
6360 ___perf_sw_event(event_id, nr, regs, addr);
6361
6362 perf_swevent_put_recursion_context(rctx);
6363 fail:
6364 preempt_enable_notrace();
6365 }
6366
6367 static void perf_swevent_read(struct perf_event *event)
6368 {
6369 }
6370
6371 static int perf_swevent_add(struct perf_event *event, int flags)
6372 {
6373 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6374 struct hw_perf_event *hwc = &event->hw;
6375 struct hlist_head *head;
6376
6377 if (is_sampling_event(event)) {
6378 hwc->last_period = hwc->sample_period;
6379 perf_swevent_set_period(event);
6380 }
6381
6382 hwc->state = !(flags & PERF_EF_START);
6383
6384 head = find_swevent_head(swhash, event);
6385 if (!head) {
6386 /*
6387 * We can race with cpu hotplug code. Do not
6388 * WARN if the cpu just got unplugged.
6389 */
6390 WARN_ON_ONCE(swhash->online);
6391 return -EINVAL;
6392 }
6393
6394 hlist_add_head_rcu(&event->hlist_entry, head);
6395 perf_event_update_userpage(event);
6396
6397 return 0;
6398 }
6399
6400 static void perf_swevent_del(struct perf_event *event, int flags)
6401 {
6402 hlist_del_rcu(&event->hlist_entry);
6403 }
6404
6405 static void perf_swevent_start(struct perf_event *event, int flags)
6406 {
6407 event->hw.state = 0;
6408 }
6409
6410 static void perf_swevent_stop(struct perf_event *event, int flags)
6411 {
6412 event->hw.state = PERF_HES_STOPPED;
6413 }
6414
6415 /* Deref the hlist from the update side */
6416 static inline struct swevent_hlist *
6417 swevent_hlist_deref(struct swevent_htable *swhash)
6418 {
6419 return rcu_dereference_protected(swhash->swevent_hlist,
6420 lockdep_is_held(&swhash->hlist_mutex));
6421 }
6422
6423 static void swevent_hlist_release(struct swevent_htable *swhash)
6424 {
6425 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6426
6427 if (!hlist)
6428 return;
6429
6430 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6431 kfree_rcu(hlist, rcu_head);
6432 }
6433
6434 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6435 {
6436 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6437
6438 mutex_lock(&swhash->hlist_mutex);
6439
6440 if (!--swhash->hlist_refcount)
6441 swevent_hlist_release(swhash);
6442
6443 mutex_unlock(&swhash->hlist_mutex);
6444 }
6445
6446 static void swevent_hlist_put(struct perf_event *event)
6447 {
6448 int cpu;
6449
6450 for_each_possible_cpu(cpu)
6451 swevent_hlist_put_cpu(event, cpu);
6452 }
6453
6454 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6455 {
6456 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6457 int err = 0;
6458
6459 mutex_lock(&swhash->hlist_mutex);
6460
6461 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6462 struct swevent_hlist *hlist;
6463
6464 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6465 if (!hlist) {
6466 err = -ENOMEM;
6467 goto exit;
6468 }
6469 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6470 }
6471 swhash->hlist_refcount++;
6472 exit:
6473 mutex_unlock(&swhash->hlist_mutex);
6474
6475 return err;
6476 }
6477
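/*
 * Take a hlist reference on every possible CPU, allocating the hash list
 * for CPUs that do not have one yet.  If an allocation fails, the
 * references obtained so far are dropped again.
 */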
6478 static int swevent_hlist_get(struct perf_event *event)
6479 {
6480 int err;
6481 int cpu, failed_cpu;
6482
6483 get_online_cpus();
6484 for_each_possible_cpu(cpu) {
6485 err = swevent_hlist_get_cpu(event, cpu);
6486 if (err) {
6487 failed_cpu = cpu;
6488 goto fail;
6489 }
6490 }
6491 put_online_cpus();
6492
6493 return 0;
6494 fail:
6495 for_each_possible_cpu(cpu) {
6496 if (cpu == failed_cpu)
6497 break;
6498 swevent_hlist_put_cpu(event, cpu);
6499 }
6500
6501 put_online_cpus();
6502 return err;
6503 }
6504
6505 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6506
6507 static void sw_perf_event_destroy(struct perf_event *event)
6508 {
6509 u64 event_id = event->attr.config;
6510
6511 WARN_ON(event->parent);
6512
6513 static_key_slow_dec(&perf_swevent_enabled[event_id]);
6514 swevent_hlist_put(event);
6515 }
6516
6517 static int perf_swevent_init(struct perf_event *event)
6518 {
6519 u64 event_id = event->attr.config;
6520
6521 if (event->attr.type != PERF_TYPE_SOFTWARE)
6522 return -ENOENT;
6523
6524 /*
6525 * no branch sampling for software events
6526 */
6527 if (has_branch_stack(event))
6528 return -EOPNOTSUPP;
6529
6530 switch (event_id) {
6531 case PERF_COUNT_SW_CPU_CLOCK:
6532 case PERF_COUNT_SW_TASK_CLOCK:
6533 return -ENOENT;
6534
6535 default:
6536 break;
6537 }
6538
6539 if (event_id >= PERF_COUNT_SW_MAX)
6540 return -ENOENT;
6541
6542 if (!event->parent) {
6543 int err;
6544
6545 err = swevent_hlist_get(event);
6546 if (err)
6547 return err;
6548
6549 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6550 event->destroy = sw_perf_event_destroy;
6551 }
6552
6553 return 0;
6554 }
6555
6556 static struct pmu perf_swevent = {
6557 .task_ctx_nr = perf_sw_context,
6558
6559 .capabilities = PERF_PMU_CAP_NO_NMI,
6560
6561 .event_init = perf_swevent_init,
6562 .add = perf_swevent_add,
6563 .del = perf_swevent_del,
6564 .start = perf_swevent_start,
6565 .stop = perf_swevent_stop,
6566 .read = perf_swevent_read,
6567 };
6568
6569 #ifdef CONFIG_EVENT_TRACING
6570
6571 static int perf_tp_filter_match(struct perf_event *event,
6572 struct perf_sample_data *data)
6573 {
6574 void *record = data->raw->data;
6575
6576 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6577 return 1;
6578 return 0;
6579 }
6580
6581 static int perf_tp_event_match(struct perf_event *event,
6582 struct perf_sample_data *data,
6583 struct pt_regs *regs)
6584 {
6585 if (event->hw.state & PERF_HES_STOPPED)
6586 return 0;
6587 /*
6588 * All tracepoints are from kernel-space.
6589 */
6590 if (event->attr.exclude_kernel)
6591 return 0;
6592
6593 if (!perf_tp_filter_match(event, data))
6594 return 0;
6595
6596 return 1;
6597 }
6598
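/*
 * Called from the tracepoint glue: wrap @record in a raw sample and
 * deliver it to every event hashed on @head, and optionally to matching
 * tracepoint events in @task's software context.  Also drops the
 * recursion context (@rctx) the caller acquired.
 */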
6599 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6600 struct pt_regs *regs, struct hlist_head *head, int rctx,
6601 struct task_struct *task)
6602 {
6603 struct perf_sample_data data;
6604 struct perf_event *event;
6605
6606 struct perf_raw_record raw = {
6607 .size = entry_size,
6608 .data = record,
6609 };
6610
6611 perf_sample_data_init(&data, addr, 0);
6612 data.raw = &raw;
6613
6614 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6615 if (perf_tp_event_match(event, &data, regs))
6616 perf_swevent_event(event, count, &data, regs);
6617 }
6618
6619 /*
6620 * If a target task was specified, also iterate its context and
6621 * deliver this event there too.
6622 */
6623 if (task && task != current) {
6624 struct perf_event_context *ctx;
6625 struct trace_entry *entry = record;
6626
6627 rcu_read_lock();
6628 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6629 if (!ctx)
6630 goto unlock;
6631
6632 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6633 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6634 continue;
6635 if (event->attr.config != entry->type)
6636 continue;
6637 if (perf_tp_event_match(event, &data, regs))
6638 perf_swevent_event(event, count, &data, regs);
6639 }
6640 unlock:
6641 rcu_read_unlock();
6642 }
6643
6644 perf_swevent_put_recursion_context(rctx);
6645 }
6646 EXPORT_SYMBOL_GPL(perf_tp_event);
6647
6648 static void tp_perf_event_destroy(struct perf_event *event)
6649 {
6650 perf_trace_destroy(event);
6651 }
6652
6653 static int perf_tp_event_init(struct perf_event *event)
6654 {
6655 int err;
6656
6657 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6658 return -ENOENT;
6659
6660 /*
6661 * no branch sampling for tracepoint events
6662 */
6663 if (has_branch_stack(event))
6664 return -EOPNOTSUPP;
6665
6666 err = perf_trace_init(event);
6667 if (err)
6668 return err;
6669
6670 event->destroy = tp_perf_event_destroy;
6671
6672 return 0;
6673 }
6674
6675 static struct pmu perf_tracepoint = {
6676 .task_ctx_nr = perf_sw_context,
6677
6678 .event_init = perf_tp_event_init,
6679 .add = perf_trace_add,
6680 .del = perf_trace_del,
6681 .start = perf_swevent_start,
6682 .stop = perf_swevent_stop,
6683 .read = perf_swevent_read,
6684 };
6685
6686 static inline void perf_tp_register(void)
6687 {
6688 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
6689 }
6690
6691 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6692 {
6693 char *filter_str;
6694 int ret;
6695
6696 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6697 return -EINVAL;
6698
6699 filter_str = strndup_user(arg, PAGE_SIZE);
6700 if (IS_ERR(filter_str))
6701 return PTR_ERR(filter_str);
6702
6703 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6704
6705 kfree(filter_str);
6706 return ret;
6707 }
6708
6709 static void perf_event_free_filter(struct perf_event *event)
6710 {
6711 ftrace_profile_free_filter(event);
6712 }
6713
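/*
 * Attach a BPF program to the trace event behind this perf event.  Only
 * one program may be attached per trace event, only kprobe-based events
 * are accepted, and the program must be of type BPF_PROG_TYPE_KPROBE.
 */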
6714 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6715 {
6716 struct bpf_prog *prog;
6717
6718 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6719 return -EINVAL;
6720
6721 if (event->tp_event->prog)
6722 return -EEXIST;
6723
6724 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
6725 /* bpf programs can only be attached to kprobes */
6726 return -EINVAL;
6727
6728 prog = bpf_prog_get(prog_fd);
6729 if (IS_ERR(prog))
6730 return PTR_ERR(prog);
6731
6732 if (prog->type != BPF_PROG_TYPE_KPROBE) {
6733 /* valid fd, but invalid bpf program type */
6734 bpf_prog_put(prog);
6735 return -EINVAL;
6736 }
6737
6738 event->tp_event->prog = prog;
6739
6740 return 0;
6741 }
6742
6743 static void perf_event_free_bpf_prog(struct perf_event *event)
6744 {
6745 struct bpf_prog *prog;
6746
6747 if (!event->tp_event)
6748 return;
6749
6750 prog = event->tp_event->prog;
6751 if (prog) {
6752 event->tp_event->prog = NULL;
6753 bpf_prog_put(prog);
6754 }
6755 }
6756
6757 #else
6758
6759 static inline void perf_tp_register(void)
6760 {
6761 }
6762
6763 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6764 {
6765 return -ENOENT;
6766 }
6767
6768 static void perf_event_free_filter(struct perf_event *event)
6769 {
6770 }
6771
6772 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6773 {
6774 return -ENOENT;
6775 }
6776
6777 static void perf_event_free_bpf_prog(struct perf_event *event)
6778 {
6779 }
6780 #endif /* CONFIG_EVENT_TRACING */
6781
6782 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6783 void perf_bp_event(struct perf_event *bp, void *data)
6784 {
6785 struct perf_sample_data sample;
6786 struct pt_regs *regs = data;
6787
6788 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6789
6790 if (!bp->hw.state && !perf_exclude_event(bp, regs))
6791 perf_swevent_event(bp, 1, &sample, regs);
6792 }
6793 #endif
6794
6795 /*
6796 * hrtimer based swevent callback
6797 */
6798
6799 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6800 {
6801 enum hrtimer_restart ret = HRTIMER_RESTART;
6802 struct perf_sample_data data;
6803 struct pt_regs *regs;
6804 struct perf_event *event;
6805 u64 period;
6806
6807 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6808
6809 if (event->state != PERF_EVENT_STATE_ACTIVE)
6810 return HRTIMER_NORESTART;
6811
6812 event->pmu->read(event);
6813
6814 perf_sample_data_init(&data, 0, event->hw.last_period);
6815 regs = get_irq_regs();
6816
6817 if (regs && !perf_exclude_event(event, regs)) {
6818 if (!(event->attr.exclude_idle && is_idle_task(current)))
6819 if (__perf_event_overflow(event, 1, &data, regs))
6820 ret = HRTIMER_NORESTART;
6821 }
6822
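	/*
	 * Re-arm the timer; the 10000ns (10us) floor keeps a tiny
	 * sample_period from re-arming the timer back-to-back.
	 */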
6823 period = max_t(u64, 10000, event->hw.sample_period);
6824 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6825
6826 return ret;
6827 }
6828
6829 static void perf_swevent_start_hrtimer(struct perf_event *event)
6830 {
6831 struct hw_perf_event *hwc = &event->hw;
6832 s64 period;
6833
6834 if (!is_sampling_event(event))
6835 return;
6836
6837 period = local64_read(&hwc->period_left);
6838 if (period) {
6839 if (period < 0)
6840 period = 10000;
6841
6842 local64_set(&hwc->period_left, 0);
6843 } else {
6844 period = max_t(u64, 10000, hwc->sample_period);
6845 }
6846 __hrtimer_start_range_ns(&hwc->hrtimer,
6847 ns_to_ktime(period), 0,
6848 HRTIMER_MODE_REL_PINNED, 0);
6849 }
6850
6851 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6852 {
6853 struct hw_perf_event *hwc = &event->hw;
6854
6855 if (is_sampling_event(event)) {
6856 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6857 local64_set(&hwc->period_left, ktime_to_ns(remaining));
6858
6859 hrtimer_cancel(&hwc->hrtimer);
6860 }
6861 }
6862
6863 static void perf_swevent_init_hrtimer(struct perf_event *event)
6864 {
6865 struct hw_perf_event *hwc = &event->hw;
6866
6867 if (!is_sampling_event(event))
6868 return;
6869
6870 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6871 hwc->hrtimer.function = perf_swevent_hrtimer;
6872
6873 /*
6874 * Since hrtimers have a fixed rate, we can do a static freq->period
6875 * mapping and avoid the whole period adjust feedback stuff.
6876 */
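	/* e.g. attr.sample_freq == 1000 yields a fixed 1ms sample period */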
6877 if (event->attr.freq) {
6878 long freq = event->attr.sample_freq;
6879
6880 event->attr.sample_period = NSEC_PER_SEC / freq;
6881 hwc->sample_period = event->attr.sample_period;
6882 local64_set(&hwc->period_left, hwc->sample_period);
6883 hwc->last_period = hwc->sample_period;
6884 event->attr.freq = 0;
6885 }
6886 }
6887
6888 /*
6889 * Software event: cpu wall time clock
6890 */
6891
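/*
 * Fold the local_clock() time elapsed since the previous update into the
 * event count; prev_count is advanced with an atomic exchange.
 */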
6892 static void cpu_clock_event_update(struct perf_event *event)
6893 {
6894 s64 prev;
6895 u64 now;
6896
6897 now = local_clock();
6898 prev = local64_xchg(&event->hw.prev_count, now);
6899 local64_add(now - prev, &event->count);
6900 }
6901
6902 static void cpu_clock_event_start(struct perf_event *event, int flags)
6903 {
6904 local64_set(&event->hw.prev_count, local_clock());
6905 perf_swevent_start_hrtimer(event);
6906 }
6907
6908 static void cpu_clock_event_stop(struct perf_event *event, int flags)
6909 {
6910 perf_swevent_cancel_hrtimer(event);
6911 cpu_clock_event_update(event);
6912 }
6913
6914 static int cpu_clock_event_add(struct perf_event *event, int flags)
6915 {
6916 if (flags & PERF_EF_START)
6917 cpu_clock_event_start(event, flags);
6918 perf_event_update_userpage(event);
6919
6920 return 0;
6921 }
6922
6923 static void cpu_clock_event_del(struct perf_event *event, int flags)
6924 {
6925 cpu_clock_event_stop(event, flags);
6926 }
6927
6928 static void cpu_clock_event_read(struct perf_event *event)
6929 {
6930 cpu_clock_event_update(event);
6931 }
6932
6933 static int cpu_clock_event_init(struct perf_event *event)
6934 {
6935 if (event->attr.type != PERF_TYPE_SOFTWARE)
6936 return -ENOENT;
6937
6938 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6939 return -ENOENT;
6940
6941 /*
6942 * no branch sampling for software events
6943 */
6944 if (has_branch_stack(event))
6945 return -EOPNOTSUPP;
6946
6947 perf_swevent_init_hrtimer(event);
6948
6949 return 0;
6950 }
6951
6952 static struct pmu perf_cpu_clock = {
6953 .task_ctx_nr = perf_sw_context,
6954
6955 .capabilities = PERF_PMU_CAP_NO_NMI,
6956
6957 .event_init = cpu_clock_event_init,
6958 .add = cpu_clock_event_add,
6959 .del = cpu_clock_event_del,
6960 .start = cpu_clock_event_start,
6961 .stop = cpu_clock_event_stop,
6962 .read = cpu_clock_event_read,
6963 };
6964
6965 /*
6966 * Software event: task time clock
6967 */
6968
6969 static void task_clock_event_update(struct perf_event *event, u64 now)
6970 {
6971 u64 prev;
6972 s64 delta;
6973
6974 prev = local64_xchg(&event->hw.prev_count, now);
6975 delta = now - prev;
6976 local64_add(delta, &event->count);
6977 }
6978
6979 static void task_clock_event_start(struct perf_event *event, int flags)
6980 {
6981 local64_set(&event->hw.prev_count, event->ctx->time);
6982 perf_swevent_start_hrtimer(event);
6983 }
6984
6985 static void task_clock_event_stop(struct perf_event *event, int flags)
6986 {
6987 perf_swevent_cancel_hrtimer(event);
6988 task_clock_event_update(event, event->ctx->time);
6989 }
6990
6991 static int task_clock_event_add(struct perf_event *event, int flags)
6992 {
6993 if (flags & PERF_EF_START)
6994 task_clock_event_start(event, flags);
6995 perf_event_update_userpage(event);
6996
6997 return 0;
6998 }
6999
7000 static void task_clock_event_del(struct perf_event *event, int flags)
7001 {
7002 task_clock_event_stop(event, PERF_EF_UPDATE);
7003 }
7004
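/*
 * Extrapolate ctx->time by the time elapsed since the context timestamp
 * was last updated, so a read sees an up-to-date task clock without
 * having to update the context itself.
 */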
7005 static void task_clock_event_read(struct perf_event *event)
7006 {
7007 u64 now = perf_clock();
7008 u64 delta = now - event->ctx->timestamp;
7009 u64 time = event->ctx->time + delta;
7010
7011 task_clock_event_update(event, time);
7012 }
7013
7014 static int task_clock_event_init(struct perf_event *event)
7015 {
7016 if (event->attr.type != PERF_TYPE_SOFTWARE)
7017 return -ENOENT;
7018
7019 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
7020 return -ENOENT;
7021
7022 /*
7023 * no branch sampling for software events
7024 */
7025 if (has_branch_stack(event))
7026 return -EOPNOTSUPP;
7027
7028 perf_swevent_init_hrtimer(event);
7029
7030 return 0;
7031 }
7032
7033 static struct pmu perf_task_clock = {
7034 .task_ctx_nr = perf_sw_context,
7035
7036 .capabilities = PERF_PMU_CAP_NO_NMI,
7037
7038 .event_init = task_clock_event_init,
7039 .add = task_clock_event_add,
7040 .del = task_clock_event_del,
7041 .start = task_clock_event_start,
7042 .stop = task_clock_event_stop,
7043 .read = task_clock_event_read,
7044 };
7045
7046 static void perf_pmu_nop_void(struct pmu *pmu)
7047 {
7048 }
7049
7050 static int perf_pmu_nop_int(struct pmu *pmu)
7051 {
7052 return 0;
7053 }
7054
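/*
 * Default transaction helpers installed by perf_pmu_register() for PMUs
 * that provide pmu_enable()/pmu_disable() but no transaction support: a
 * "transaction" simply keeps the PMU disabled until commit/cancel.
 */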
7055 static void perf_pmu_start_txn(struct pmu *pmu)
7056 {
7057 perf_pmu_disable(pmu);
7058 }
7059
7060 static int perf_pmu_commit_txn(struct pmu *pmu)
7061 {
7062 perf_pmu_enable(pmu);
7063 return 0;
7064 }
7065
7066 static void perf_pmu_cancel_txn(struct pmu *pmu)
7067 {
7068 perf_pmu_enable(pmu);
7069 }
7070
7071 static int perf_event_idx_default(struct perf_event *event)
7072 {
7073 return 0;
7074 }
7075
7076 /*
7077 * Ensures all contexts with the same task_ctx_nr have the same
7078 * pmu_cpu_context too.
7079 */
7080 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
7081 {
7082 struct pmu *pmu;
7083
7084 if (ctxn < 0)
7085 return NULL;
7086
7087 list_for_each_entry(pmu, &pmus, entry) {
7088 if (pmu->task_ctx_nr == ctxn)
7089 return pmu->pmu_cpu_context;
7090 }
7091
7092 return NULL;
7093 }
7094
7095 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
7096 {
7097 int cpu;
7098
7099 for_each_possible_cpu(cpu) {
7100 struct perf_cpu_context *cpuctx;
7101
7102 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7103
7104 if (cpuctx->unique_pmu == old_pmu)
7105 cpuctx->unique_pmu = pmu;
7106 }
7107 }
7108
7109 static void free_pmu_context(struct pmu *pmu)
7110 {
7111 struct pmu *i;
7112
7113 mutex_lock(&pmus_lock);
7114 /*
7115 * A crude refcount: only free the cpu context once no other pmu shares it.
7116 */
7117 list_for_each_entry(i, &pmus, entry) {
7118 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
7119 update_pmu_context(i, pmu);
7120 goto out;
7121 }
7122 }
7123
7124 free_percpu(pmu->pmu_cpu_context);
7125 out:
7126 mutex_unlock(&pmus_lock);
7127 }
7128 static struct idr pmu_idr;
7129
7130 static ssize_t
7131 type_show(struct device *dev, struct device_attribute *attr, char *page)
7132 {
7133 struct pmu *pmu = dev_get_drvdata(dev);
7134
7135 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
7136 }
7137 static DEVICE_ATTR_RO(type);
7138
7139 static ssize_t
7140 perf_event_mux_interval_ms_show(struct device *dev,
7141 struct device_attribute *attr,
7142 char *page)
7143 {
7144 struct pmu *pmu = dev_get_drvdata(dev);
7145
7146 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
7147 }
7148
7149 static ssize_t
7150 perf_event_mux_interval_ms_store(struct device *dev,
7151 struct device_attribute *attr,
7152 const char *buf, size_t count)
7153 {
7154 struct pmu *pmu = dev_get_drvdata(dev);
7155 int timer, cpu, ret;
7156
7157 ret = kstrtoint(buf, 0, &timer);
7158 if (ret)
7159 return ret;
7160
7161 if (timer < 1)
7162 return -EINVAL;
7163
7164 /* same value, nothing to do */
7165 if (timer == pmu->hrtimer_interval_ms)
7166 return count;
7167
7168 pmu->hrtimer_interval_ms = timer;
7169
7170 /* update all cpuctx for this PMU */
7171 for_each_possible_cpu(cpu) {
7172 struct perf_cpu_context *cpuctx;
7173 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7174 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
7175
7176 if (hrtimer_active(&cpuctx->hrtimer))
7177 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
7178 }
7179
7180 return count;
7181 }
7182 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
7183
7184 static struct attribute *pmu_dev_attrs[] = {
7185 &dev_attr_type.attr,
7186 &dev_attr_perf_event_mux_interval_ms.attr,
7187 NULL,
7188 };
7189 ATTRIBUTE_GROUPS(pmu_dev);
7190
7191 static int pmu_bus_running;
7192 static struct bus_type pmu_bus = {
7193 .name = "event_source",
7194 .dev_groups = pmu_dev_groups,
7195 };
7196
7197 static void pmu_dev_release(struct device *dev)
7198 {
7199 kfree(dev);
7200 }
7201
7202 static int pmu_dev_alloc(struct pmu *pmu)
7203 {
7204 int ret = -ENOMEM;
7205
7206 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
7207 if (!pmu->dev)
7208 goto out;
7209
7210 pmu->dev->groups = pmu->attr_groups;
7211 device_initialize(pmu->dev);
7212 ret = dev_set_name(pmu->dev, "%s", pmu->name);
7213 if (ret)
7214 goto free_dev;
7215
7216 dev_set_drvdata(pmu->dev, pmu);
7217 pmu->dev->bus = &pmu_bus;
7218 pmu->dev->release = pmu_dev_release;
7219 ret = device_add(pmu->dev);
7220 if (ret)
7221 goto free_dev;
7222
7223 out:
7224 return ret;
7225
7226 free_dev:
7227 put_device(pmu->dev);
7228 goto out;
7229 }
7230
7231 static struct lock_class_key cpuctx_mutex;
7232 static struct lock_class_key cpuctx_lock;
7233
7234 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
7235 {
7236 int cpu, ret;
7237
7238 mutex_lock(&pmus_lock);
7239 ret = -ENOMEM;
7240 pmu->pmu_disable_count = alloc_percpu(int);
7241 if (!pmu->pmu_disable_count)
7242 goto unlock;
7243
7244 pmu->type = -1;
7245 if (!name)
7246 goto skip_type;
7247 pmu->name = name;
7248
7249 if (type < 0) {
7250 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
7251 if (type < 0) {
7252 ret = type;
7253 goto free_pdc;
7254 }
7255 }
7256 pmu->type = type;
7257
7258 if (pmu_bus_running) {
7259 ret = pmu_dev_alloc(pmu);
7260 if (ret)
7261 goto free_idr;
7262 }
7263
7264 skip_type:
7265 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
7266 if (pmu->pmu_cpu_context)
7267 goto got_cpu_context;
7268
7269 ret = -ENOMEM;
7270 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
7271 if (!pmu->pmu_cpu_context)
7272 goto free_dev;
7273
7274 for_each_possible_cpu(cpu) {
7275 struct perf_cpu_context *cpuctx;
7276
7277 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
7278 __perf_event_init_context(&cpuctx->ctx);
7279 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
7280 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
7281 cpuctx->ctx.pmu = pmu;
7282
7283 __perf_cpu_hrtimer_init(cpuctx, cpu);
7284
7285 cpuctx->unique_pmu = pmu;
7286 }
7287
7288 got_cpu_context:
7289 if (!pmu->start_txn) {
7290 if (pmu->pmu_enable) {
7291 /*
7292 * If we have pmu_enable/pmu_disable calls, install
7293 * transaction stubs that use them to try and batch
7294 * hardware accesses.
7295 */
7296 pmu->start_txn = perf_pmu_start_txn;
7297 pmu->commit_txn = perf_pmu_commit_txn;
7298 pmu->cancel_txn = perf_pmu_cancel_txn;
7299 } else {
7300 pmu->start_txn = perf_pmu_nop_void;
7301 pmu->commit_txn = perf_pmu_nop_int;
7302 pmu->cancel_txn = perf_pmu_nop_void;
7303 }
7304 }
7305
7306 if (!pmu->pmu_enable) {
7307 pmu->pmu_enable = perf_pmu_nop_void;
7308 pmu->pmu_disable = perf_pmu_nop_void;
7309 }
7310
7311 if (!pmu->event_idx)
7312 pmu->event_idx = perf_event_idx_default;
7313
7314 list_add_rcu(&pmu->entry, &pmus);
7315 atomic_set(&pmu->exclusive_cnt, 0);
7316 ret = 0;
7317 unlock:
7318 mutex_unlock(&pmus_lock);
7319
7320 return ret;
7321
7322 free_dev:
7323 device_del(pmu->dev);
7324 put_device(pmu->dev);
7325
7326 free_idr:
7327 if (pmu->type >= PERF_TYPE_MAX)
7328 idr_remove(&pmu_idr, pmu->type);
7329
7330 free_pdc:
7331 free_percpu(pmu->pmu_disable_count);
7332 goto unlock;
7333 }
7334 EXPORT_SYMBOL_GPL(perf_pmu_register);
7335
7336 void perf_pmu_unregister(struct pmu *pmu)
7337 {
7338 mutex_lock(&pmus_lock);
7339 list_del_rcu(&pmu->entry);
7340 mutex_unlock(&pmus_lock);
7341
7342 /*
7343 * We dereference the pmu list under both SRCU and regular RCU, so
7344 * synchronize against both of those.
7345 */
7346 synchronize_srcu(&pmus_srcu);
7347 synchronize_rcu();
7348
7349 free_percpu(pmu->pmu_disable_count);
7350 if (pmu->type >= PERF_TYPE_MAX)
7351 idr_remove(&pmu_idr, pmu->type);
7352 device_del(pmu->dev);
7353 put_device(pmu->dev);
7354 free_pmu_context(pmu);
7355 }
7356 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7357
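/*
 * Hand the event to the pmu's event_init() while holding a reference on
 * the pmu's module and, for group siblings, the group leader's context
 * lock.  The module reference is dropped again on failure.
 */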
7358 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7359 {
7360 struct perf_event_context *ctx = NULL;
7361 int ret;
7362
7363 if (!try_module_get(pmu->module))
7364 return -ENODEV;
7365
7366 if (event->group_leader != event) {
7367 ctx = perf_event_ctx_lock(event->group_leader);
7368 BUG_ON(!ctx);
7369 }
7370
7371 event->pmu = pmu;
7372 ret = pmu->event_init(event);
7373
7374 if (ctx)
7375 perf_event_ctx_unlock(event->group_leader, ctx);
7376
7377 if (ret)
7378 module_put(pmu->module);
7379
7380 return ret;
7381 }
7382
7383 struct pmu *perf_init_event(struct perf_event *event)
7384 {
7385 struct pmu *pmu = NULL;
7386 int idx;
7387 int ret;
7388
7389 idx = srcu_read_lock(&pmus_srcu);
7390
7391 rcu_read_lock();
7392 pmu = idr_find(&pmu_idr, event->attr.type);
7393 rcu_read_unlock();
7394 if (pmu) {
7395 ret = perf_try_init_event(pmu, event);
7396 if (ret)
7397 pmu = ERR_PTR(ret);
7398 goto unlock;
7399 }
7400
7401 list_for_each_entry_rcu(pmu, &pmus, entry) {
7402 ret = perf_try_init_event(pmu, event);
7403 if (!ret)
7404 goto unlock;
7405
7406 if (ret != -ENOENT) {
7407 pmu = ERR_PTR(ret);
7408 goto unlock;
7409 }
7410 }
7411 pmu = ERR_PTR(-ENOENT);
7412 unlock:
7413 srcu_read_unlock(&pmus_srcu, idx);
7414
7415 return pmu;
7416 }
7417
7418 static void account_event_cpu(struct perf_event *event, int cpu)
7419 {
7420 if (event->parent)
7421 return;
7422
7423 if (is_cgroup_event(event))
7424 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7425 }
7426
7427 static void account_event(struct perf_event *event)
7428 {
7429 if (event->parent)
7430 return;
7431
7432 if (event->attach_state & PERF_ATTACH_TASK)
7433 static_key_slow_inc(&perf_sched_events.key);
7434 if (event->attr.mmap || event->attr.mmap_data)
7435 atomic_inc(&nr_mmap_events);
7436 if (event->attr.comm)
7437 atomic_inc(&nr_comm_events);
7438 if (event->attr.task)
7439 atomic_inc(&nr_task_events);
7440 if (event->attr.freq) {
7441 if (atomic_inc_return(&nr_freq_events) == 1)
7442 tick_nohz_full_kick_all();
7443 }
7444 if (has_branch_stack(event))
7445 static_key_slow_inc(&perf_sched_events.key);
7446 if (is_cgroup_event(event))
7447 static_key_slow_inc(&perf_sched_events.key);
7448
7449 account_event_cpu(event, event->cpu);
7450 }
7451
7452 /*
7453 * Allocate and initialize an event structure
7454 */
7455 static struct perf_event *
7456 perf_event_alloc(struct perf_event_attr *attr, int cpu,
7457 struct task_struct *task,
7458 struct perf_event *group_leader,
7459 struct perf_event *parent_event,
7460 perf_overflow_handler_t overflow_handler,
7461 void *context, int cgroup_fd)
7462 {
7463 struct pmu *pmu;
7464 struct perf_event *event;
7465 struct hw_perf_event *hwc;
7466 long err = -EINVAL;
7467
7468 if ((unsigned)cpu >= nr_cpu_ids) {
7469 if (!task || cpu != -1)
7470 return ERR_PTR(-EINVAL);
7471 }
7472
7473 event = kzalloc(sizeof(*event), GFP_KERNEL);
7474 if (!event)
7475 return ERR_PTR(-ENOMEM);
7476
7477 /*
7478 * Single events are their own group leaders, with an
7479 * empty sibling list:
7480 */
7481 if (!group_leader)
7482 group_leader = event;
7483
7484 mutex_init(&event->child_mutex);
7485 INIT_LIST_HEAD(&event->child_list);
7486
7487 INIT_LIST_HEAD(&event->group_entry);
7488 INIT_LIST_HEAD(&event->event_entry);
7489 INIT_LIST_HEAD(&event->sibling_list);
7490 INIT_LIST_HEAD(&event->rb_entry);
7491 INIT_LIST_HEAD(&event->active_entry);
7492 INIT_HLIST_NODE(&event->hlist_entry);
7493
7494
7495 init_waitqueue_head(&event->waitq);
7496 init_irq_work(&event->pending, perf_pending_event);
7497
7498 mutex_init(&event->mmap_mutex);
7499
7500 atomic_long_set(&event->refcount, 1);
7501 event->cpu = cpu;
7502 event->attr = *attr;
7503 event->group_leader = group_leader;
7504 event->pmu = NULL;
7505 event->oncpu = -1;
7506
7507 event->parent = parent_event;
7508
7509 event->ns = get_pid_ns(task_active_pid_ns(current));
7510 event->id = atomic64_inc_return(&perf_event_id);
7511
7512 event->state = PERF_EVENT_STATE_INACTIVE;
7513
7514 if (task) {
7515 event->attach_state = PERF_ATTACH_TASK;
7516 /*
7517 * XXX pmu::event_init needs to know what task to account to
7518 * and we cannot use the ctx information because we need the
7519 * pmu before we get a ctx.
7520 */
7521 event->hw.target = task;
7522 }
7523
7524 event->clock = &local_clock;
7525 if (parent_event)
7526 event->clock = parent_event->clock;
7527
7528 if (!overflow_handler && parent_event) {
7529 overflow_handler = parent_event->overflow_handler;
7530 context = parent_event->overflow_handler_context;
7531 }
7532
7533 event->overflow_handler = overflow_handler;
7534 event->overflow_handler_context = context;
7535
7536 perf_event__state_init(event);
7537
7538 pmu = NULL;
7539
7540 hwc = &event->hw;
7541 hwc->sample_period = attr->sample_period;
7542 if (attr->freq && attr->sample_freq)
7543 hwc->sample_period = 1;
7544 hwc->last_period = hwc->sample_period;
7545
7546 local64_set(&hwc->period_left, hwc->sample_period);
7547
7548 /*
7549 * we currently do not support PERF_FORMAT_GROUP on inherited events
7550 */
7551 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7552 goto err_ns;
7553
7554 if (!has_branch_stack(event))
7555 event->attr.branch_sample_type = 0;
7556
7557 if (cgroup_fd != -1) {
7558 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
7559 if (err)
7560 goto err_ns;
7561 }
7562
7563 pmu = perf_init_event(event);
7564 if (!pmu)
7565 goto err_ns;
7566 else if (IS_ERR(pmu)) {
7567 err = PTR_ERR(pmu);
7568 goto err_ns;
7569 }
7570
7571 err = exclusive_event_init(event);
7572 if (err)
7573 goto err_pmu;
7574
7575 if (!event->parent) {
7576 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7577 err = get_callchain_buffers();
7578 if (err)
7579 goto err_per_task;
7580 }
7581 }
7582
7583 return event;
7584
7585 err_per_task:
7586 exclusive_event_destroy(event);
7587
7588 err_pmu:
7589 if (event->destroy)
7590 event->destroy(event);
7591 module_put(pmu->module);
7592 err_ns:
7593 if (is_cgroup_event(event))
7594 perf_detach_cgroup(event);
7595 if (event->ns)
7596 put_pid_ns(event->ns);
7597 kfree(event);
7598
7599 return ERR_PTR(err);
7600 }
7601
7602 static int perf_copy_attr(struct perf_event_attr __user *uattr,
7603 struct perf_event_attr *attr)
7604 {
7605 u32 size;
7606 int ret;
7607
7608 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7609 return -EFAULT;
7610
7611 /*
7612 * zero the full structure, so that a short copy leaves the rest zeroed.
7613 */
7614 memset(attr, 0, sizeof(*attr));
7615
7616 ret = get_user(size, &uattr->size);
7617 if (ret)
7618 return ret;
7619
7620 if (size > PAGE_SIZE) /* silly large */
7621 goto err_size;
7622
7623 if (!size) /* abi compat */
7624 size = PERF_ATTR_SIZE_VER0;
7625
7626 if (size < PERF_ATTR_SIZE_VER0)
7627 goto err_size;
7628
7629 /*
7630 * If we're handed a bigger struct than we know of,
7631 * ensure all the unknown bits are 0 - i.e. new
7632 * user-space does not rely on any kernel feature
7633 * extensions we don't know about yet.
7634 */
7635 if (size > sizeof(*attr)) {
7636 unsigned char __user *addr;
7637 unsigned char __user *end;
7638 unsigned char val;
7639
7640 addr = (void __user *)uattr + sizeof(*attr);
7641 end = (void __user *)uattr + size;
7642
7643 for (; addr < end; addr++) {
7644 ret = get_user(val, addr);
7645 if (ret)
7646 return ret;
7647 if (val)
7648 goto err_size;
7649 }
7650 size = sizeof(*attr);
7651 }
7652
7653 ret = copy_from_user(attr, uattr, size);
7654 if (ret)
7655 return -EFAULT;
7656
7657 if (attr->__reserved_1)
7658 return -EINVAL;
7659
7660 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7661 return -EINVAL;
7662
7663 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7664 return -EINVAL;
7665
7666 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7667 u64 mask = attr->branch_sample_type;
7668
7669 /* only using defined bits */
7670 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7671 return -EINVAL;
7672
7673 /* at least one branch bit must be set */
7674 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7675 return -EINVAL;
7676
7677 /* propagate priv level, when not set for branch */
7678 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7679
7680 /* exclude_kernel checked on syscall entry */
7681 if (!attr->exclude_kernel)
7682 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7683
7684 if (!attr->exclude_user)
7685 mask |= PERF_SAMPLE_BRANCH_USER;
7686
7687 if (!attr->exclude_hv)
7688 mask |= PERF_SAMPLE_BRANCH_HV;
7689 /*
7690 * adjust user setting (for HW filter setup)
7691 */
7692 attr->branch_sample_type = mask;
7693 }
7694 /* privileged levels capture (kernel, hv): check permissions */
7695 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
7696 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7697 return -EACCES;
7698 }
7699
7700 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
7701 ret = perf_reg_validate(attr->sample_regs_user);
7702 if (ret)
7703 return ret;
7704 }
7705
7706 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7707 if (!arch_perf_have_user_stack_dump())
7708 return -ENOSYS;
7709
7710 /*
7711 * We have __u32 type for the size, but so far
7712 * we can only use __u16 as maximum due to the
7713 * __u16 sample size limit.
7714 */
7715 if (attr->sample_stack_user >= USHRT_MAX)
7716 ret = -EINVAL;
7717 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7718 ret = -EINVAL;
7719 }
7720
7721 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
7722 ret = perf_reg_validate(attr->sample_regs_intr);
7723 out:
7724 return ret;
7725
7726 err_size:
7727 put_user(sizeof(*attr), &uattr->size);
7728 ret = -E2BIG;
7729 goto out;
7730 }
7731
7732 static int
7733 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7734 {
7735 struct ring_buffer *rb = NULL;
7736 int ret = -EINVAL;
7737
7738 if (!output_event)
7739 goto set;
7740
7741 /* don't allow circular references */
7742 if (event == output_event)
7743 goto out;
7744
7745 /*
7746 * Don't allow cross-cpu buffers
7747 */
7748 if (output_event->cpu != event->cpu)
7749 goto out;
7750
7751 /*
7752 * If it's not a per-cpu rb, it must be the same task.
7753 */
7754 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7755 goto out;
7756
7757 /*
7758 * Mixing clocks in the same buffer is trouble you don't need.
7759 */
7760 if (output_event->clock != event->clock)
7761 goto out;
7762
7763 /*
7764 * If both events generate aux data, they must be on the same PMU
7765 */
7766 if (has_aux(event) && has_aux(output_event) &&
7767 event->pmu != output_event->pmu)
7768 goto out;
7769
7770 set:
7771 mutex_lock(&event->mmap_mutex);
7772 /* Can't redirect output if we've got an active mmap() */
7773 if (atomic_read(&event->mmap_count))
7774 goto unlock;
7775
7776 if (output_event) {
7777 /* get the rb we want to redirect to */
7778 rb = ring_buffer_get(output_event);
7779 if (!rb)
7780 goto unlock;
7781 }
7782
7783 ring_buffer_attach(event, rb);
7784
7785 ret = 0;
7786 unlock:
7787 mutex_unlock(&event->mmap_mutex);
7788
7789 out:
7790 return ret;
7791 }
7792
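/*
 * Lock two context mutexes in a fixed (address) order so that two
 * concurrent callers cannot deadlock on each other; the second lock uses
 * SINGLE_DEPTH_NESTING to keep lockdep happy.
 */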
7793 static void mutex_lock_double(struct mutex *a, struct mutex *b)
7794 {
7795 if (b < a)
7796 swap(a, b);
7797
7798 mutex_lock(a);
7799 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7800 }
7801
7802 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
7803 {
7804 bool nmi_safe = false;
7805
7806 switch (clk_id) {
7807 case CLOCK_MONOTONIC:
7808 event->clock = &ktime_get_mono_fast_ns;
7809 nmi_safe = true;
7810 break;
7811
7812 case CLOCK_MONOTONIC_RAW:
7813 event->clock = &ktime_get_raw_fast_ns;
7814 nmi_safe = true;
7815 break;
7816
7817 case CLOCK_REALTIME:
7818 event->clock = &ktime_get_real_ns;
7819 break;
7820
7821 case CLOCK_BOOTTIME:
7822 event->clock = &ktime_get_boot_ns;
7823 break;
7824
7825 case CLOCK_TAI:
7826 event->clock = &ktime_get_tai_ns;
7827 break;
7828
7829 default:
7830 return -EINVAL;
7831 }
7832
7833 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
7834 return -EINVAL;
7835
7836 return 0;
7837 }
7838
7839 /**
7840 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7841 *
7842 * @attr_uptr: event_id type attributes for monitoring/sampling
7843 * @pid: target pid
7844 * @cpu: target cpu
7845 * @group_fd: group leader event fd
7846 */
7847 SYSCALL_DEFINE5(perf_event_open,
7848 struct perf_event_attr __user *, attr_uptr,
7849 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
7850 {
7851 struct perf_event *group_leader = NULL, *output_event = NULL;
7852 struct perf_event *event, *sibling;
7853 struct perf_event_attr attr;
7854 struct perf_event_context *ctx, *uninitialized_var(gctx);
7855 struct file *event_file = NULL;
7856 struct fd group = {NULL, 0};
7857 struct task_struct *task = NULL;
7858 struct pmu *pmu;
7859 int event_fd;
7860 int move_group = 0;
7861 int err;
7862 int f_flags = O_RDWR;
7863 int cgroup_fd = -1;
7864
7865 /* for future expandability... */
7866 if (flags & ~PERF_FLAG_ALL)
7867 return -EINVAL;
7868
7869 err = perf_copy_attr(attr_uptr, &attr);
7870 if (err)
7871 return err;
7872
7873 if (!attr.exclude_kernel) {
7874 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7875 return -EACCES;
7876 }
7877
7878 if (attr.freq) {
7879 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7880 return -EINVAL;
7881 } else {
7882 if (attr.sample_period & (1ULL << 63))
7883 return -EINVAL;
7884 }
7885
7886 /*
7887 * In cgroup mode, the pid argument is used to pass the fd
7888 * opened to the cgroup directory in cgroupfs. The cpu argument
7889 * designates the cpu on which to monitor threads from that
7890 * cgroup.
7891 */
7892 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7893 return -EINVAL;
7894
7895 if (flags & PERF_FLAG_FD_CLOEXEC)
7896 f_flags |= O_CLOEXEC;
7897
7898 event_fd = get_unused_fd_flags(f_flags);
7899 if (event_fd < 0)
7900 return event_fd;
7901
7902 if (group_fd != -1) {
7903 err = perf_fget_light(group_fd, &group);
7904 if (err)
7905 goto err_fd;
7906 group_leader = group.file->private_data;
7907 if (flags & PERF_FLAG_FD_OUTPUT)
7908 output_event = group_leader;
7909 if (flags & PERF_FLAG_FD_NO_GROUP)
7910 group_leader = NULL;
7911 }
7912
7913 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
7914 task = find_lively_task_by_vpid(pid);
7915 if (IS_ERR(task)) {
7916 err = PTR_ERR(task);
7917 goto err_group_fd;
7918 }
7919 }
7920
7921 if (task && group_leader &&
7922 group_leader->attr.inherit != attr.inherit) {
7923 err = -EINVAL;
7924 goto err_task;
7925 }
7926
7927 get_online_cpus();
7928
7929 if (flags & PERF_FLAG_PID_CGROUP)
7930 cgroup_fd = pid;
7931
7932 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7933 NULL, NULL, cgroup_fd);
7934 if (IS_ERR(event)) {
7935 err = PTR_ERR(event);
7936 goto err_cpus;
7937 }
7938
7939 if (is_sampling_event(event)) {
7940 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7941 err = -ENOTSUPP;
7942 goto err_alloc;
7943 }
7944 }
7945
7946 account_event(event);
7947
7948 /*
7949 * Special case software events and allow them to be part of
7950 * any hardware group.
7951 */
7952 pmu = event->pmu;
7953
7954 if (attr.use_clockid) {
7955 err = perf_event_set_clock(event, attr.clockid);
7956 if (err)
7957 goto err_alloc;
7958 }
7959
7960 if (group_leader &&
7961 (is_software_event(event) != is_software_event(group_leader))) {
7962 if (is_software_event(event)) {
7963 /*
7964 * If event and group_leader are not both a software
7965 * event, and event is, then group leader is not.
7966 *
7967 * Allow the addition of software events to !software
7968 * groups, this is safe because software events never
7969 * fail to schedule.
7970 */
7971 pmu = group_leader->pmu;
7972 } else if (is_software_event(group_leader) &&
7973 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7974 /*
7975 * In case the group is a pure software group, and we
7976 * try to add a hardware event, move the whole group to
7977 * the hardware context.
7978 */
7979 move_group = 1;
7980 }
7981 }
7982
7983 /*
7984 * Get the target context (task or percpu):
7985 */
7986 ctx = find_get_context(pmu, task, event);
7987 if (IS_ERR(ctx)) {
7988 err = PTR_ERR(ctx);
7989 goto err_alloc;
7990 }
7991
7992 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
7993 err = -EBUSY;
7994 goto err_context;
7995 }
7996
7997 if (task) {
7998 put_task_struct(task);
7999 task = NULL;
8000 }
8001
8002 /*
8003 * Look up the group leader (we will attach this event to it):
8004 */
8005 if (group_leader) {
8006 err = -EINVAL;
8007
8008 /*
8009 * Do not allow a recursive hierarchy (this new sibling
8010 * becoming part of another group-sibling):
8011 */
8012 if (group_leader->group_leader != group_leader)
8013 goto err_context;
8014
8015 /* All events in a group should have the same clock */
8016 if (group_leader->clock != event->clock)
8017 goto err_context;
8018
8019 /*
8020 * Do not allow attaching to a group in a different
8021 * task or CPU context:
8022 */
8023 if (move_group) {
8024 /*
8025 * Make sure we're both on the same task, or both
8026 * per-cpu events.
8027 */
8028 if (group_leader->ctx->task != ctx->task)
8029 goto err_context;
8030
8031 /*
8032 * Make sure we're both events for the same CPU;
8033 * grouping events for different CPUs is broken, since
8034 * you can never concurrently schedule them anyhow.
8035 */
8036 if (group_leader->cpu != event->cpu)
8037 goto err_context;
8038 } else {
8039 if (group_leader->ctx != ctx)
8040 goto err_context;
8041 }
8042
8043 /*
8044 * Only a group leader can be exclusive or pinned
8045 */
8046 if (attr.exclusive || attr.pinned)
8047 goto err_context;
8048 }
8049
8050 if (output_event) {
8051 err = perf_event_set_output(event, output_event);
8052 if (err)
8053 goto err_context;
8054 }
8055
8056 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
8057 f_flags);
8058 if (IS_ERR(event_file)) {
8059 err = PTR_ERR(event_file);
8060 goto err_context;
8061 }
8062
8063 if (move_group) {
8064 gctx = group_leader->ctx;
8065
8066 /*
8067 * See perf_event_ctx_lock() for comments on the details
8068 * of swizzling perf_event::ctx.
8069 */
8070 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8071
8072 perf_remove_from_context(group_leader, false);
8073
8074 list_for_each_entry(sibling, &group_leader->sibling_list,
8075 group_entry) {
8076 perf_remove_from_context(sibling, false);
8077 put_ctx(gctx);
8078 }
8079 } else {
8080 mutex_lock(&ctx->mutex);
8081 }
8082
8083 WARN_ON_ONCE(ctx->parent_ctx);
8084
8085 if (move_group) {
8086 /*
8087 * Wait for everybody to stop referencing the events through
8088 * the old lists, before installing them on the new lists.
8089 */
8090 synchronize_rcu();
8091
8092 /*
8093 * Install the group siblings before the group leader.
8094 *
8095 * Because a group leader will try and install the entire group
8096 * (through the sibling list, which is still intact), we can
8097 * end up with siblings installed in the wrong context.
8098 *
8099 * By installing siblings first we NO-OP because they're not
8100 * reachable through the group lists.
8101 */
8102 list_for_each_entry(sibling, &group_leader->sibling_list,
8103 group_entry) {
8104 perf_event__state_init(sibling);
8105 perf_install_in_context(ctx, sibling, sibling->cpu);
8106 get_ctx(ctx);
8107 }
8108
8109 /*
8110 * Removing from the context ends up with a disabled
8111 * event. What we want here is an event in the initial
8112 * startup state, ready to be added into a new context.
8113 */
8114 perf_event__state_init(group_leader);
8115 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8116 get_ctx(ctx);
8117 }
8118
8119 if (!exclusive_event_installable(event, ctx)) {
8120 err = -EBUSY;
8121 mutex_unlock(&ctx->mutex);
8122 fput(event_file);
8123 goto err_context;
8124 }
8125
8126 perf_install_in_context(ctx, event, event->cpu);
8127 perf_unpin_context(ctx);
8128
8129 if (move_group) {
8130 mutex_unlock(&gctx->mutex);
8131 put_ctx(gctx);
8132 }
8133 mutex_unlock(&ctx->mutex);
8134
8135 put_online_cpus();
8136
8137 event->owner = current;
8138
8139 mutex_lock(&current->perf_event_mutex);
8140 list_add_tail(&event->owner_entry, &current->perf_event_list);
8141 mutex_unlock(&current->perf_event_mutex);
8142
8143 /*
8144 * Precalculate sample_data sizes
8145 */
8146 perf_event__header_size(event);
8147 perf_event__id_header_size(event);
8148
8149 /*
8150 * Drop the reference on the group_event after placing the
8151 * new event on the sibling_list. This ensures destruction
8152 * of the group leader will find the pointer to itself in
8153 * perf_group_detach().
8154 */
8155 fdput(group);
8156 fd_install(event_fd, event_file);
8157 return event_fd;
8158
8159 err_context:
8160 perf_unpin_context(ctx);
8161 put_ctx(ctx);
8162 err_alloc:
8163 free_event(event);
8164 err_cpus:
8165 put_online_cpus();
8166 err_task:
8167 if (task)
8168 put_task_struct(task);
8169 err_group_fd:
8170 fdput(group);
8171 err_fd:
8172 put_unused_fd(event_fd);
8173 return err;
8174 }
8175
8176 /**
8177 * perf_event_create_kernel_counter
8178 *
8179 * @attr: attributes of the counter to create
8180 * @cpu: cpu in which the counter is bound
8181 * @task: task to profile (NULL for percpu)
8182 */
8183 struct perf_event *
8184 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
8185 struct task_struct *task,
8186 perf_overflow_handler_t overflow_handler,
8187 void *context)
8188 {
8189 struct perf_event_context *ctx;
8190 struct perf_event *event;
8191 int err;
8192
8193 /*
8194 * Get the target context (task or percpu):
8195 */
8196
8197 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
8198 overflow_handler, context, -1);
8199 if (IS_ERR(event)) {
8200 err = PTR_ERR(event);
8201 goto err;
8202 }
8203
8204 /* Mark owner so we could distinguish it from user events. */
8205 event->owner = EVENT_OWNER_KERNEL;
8206
8207 account_event(event);
8208
8209 ctx = find_get_context(event->pmu, task, event);
8210 if (IS_ERR(ctx)) {
8211 err = PTR_ERR(ctx);
8212 goto err_free;
8213 }
8214
8215 WARN_ON_ONCE(ctx->parent_ctx);
8216 mutex_lock(&ctx->mutex);
8217 if (!exclusive_event_installable(event, ctx)) {
8218 mutex_unlock(&ctx->mutex);
8219 perf_unpin_context(ctx);
8220 put_ctx(ctx);
8221 err = -EBUSY;
8222 goto err_free;
8223 }
8224
8225 perf_install_in_context(ctx, event, cpu);
8226 perf_unpin_context(ctx);
8227 mutex_unlock(&ctx->mutex);
8228
8229 return event;
8230
8231 err_free:
8232 free_event(event);
8233 err:
8234 return ERR_PTR(err);
8235 }
8236 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
8237
8238 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
8239 {
8240 struct perf_event_context *src_ctx;
8241 struct perf_event_context *dst_ctx;
8242 struct perf_event *event, *tmp;
8243 LIST_HEAD(events);
8244
8245 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
8246 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
8247
8248 /*
8249 * See perf_event_ctx_lock() for comments on the details
8250 * of swizzling perf_event::ctx.
8251 */
8252 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
8253 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
8254 event_entry) {
8255 perf_remove_from_context(event, false);
8256 unaccount_event_cpu(event, src_cpu);
8257 put_ctx(src_ctx);
8258 list_add(&event->migrate_entry, &events);
8259 }
8260
8261 /*
8262 * Wait for the events to quiesce before re-instating them.
8263 */
8264 synchronize_rcu();
8265
8266 /*
8267 * Re-instate events in 2 passes.
8268 *
8269 * Skip over group leaders and only install siblings on this first
8270 * pass, siblings will not get enabled without a leader, however a
8271 * leader will enable its siblings, even if those are still on the old
8272 * context.
8273 */
8274 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8275 if (event->group_leader == event)
8276 continue;
8277
8278 list_del(&event->migrate_entry);
8279 if (event->state >= PERF_EVENT_STATE_OFF)
8280 event->state = PERF_EVENT_STATE_INACTIVE;
8281 account_event_cpu(event, dst_cpu);
8282 perf_install_in_context(dst_ctx, event, dst_cpu);
8283 get_ctx(dst_ctx);
8284 }
8285
8286 /*
8287 * Once all the siblings are setup properly, install the group leaders
8288 * to make it go.
8289 */
8290 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
8291 list_del(&event->migrate_entry);
8292 if (event->state >= PERF_EVENT_STATE_OFF)
8293 event->state = PERF_EVENT_STATE_INACTIVE;
8294 account_event_cpu(event, dst_cpu);
8295 perf_install_in_context(dst_ctx, event, dst_cpu);
8296 get_ctx(dst_ctx);
8297 }
8298 mutex_unlock(&dst_ctx->mutex);
8299 mutex_unlock(&src_ctx->mutex);
8300 }
8301 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
8302
8303 static void sync_child_event(struct perf_event *child_event,
8304 struct task_struct *child)
8305 {
8306 struct perf_event *parent_event = child_event->parent;
8307 u64 child_val;
8308
8309 if (child_event->attr.inherit_stat)
8310 perf_event_read_event(child_event, child);
8311
8312 child_val = perf_event_count(child_event);
8313
8314 /*
8315 * Add back the child's count to the parent's count:
8316 */
8317 atomic64_add(child_val, &parent_event->child_count);
8318 atomic64_add(child_event->total_time_enabled,
8319 &parent_event->child_total_time_enabled);
8320 atomic64_add(child_event->total_time_running,
8321 &parent_event->child_total_time_running);
8322
8323 /*
8324 * Remove this event from the parent's list
8325 */
8326 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8327 mutex_lock(&parent_event->child_mutex);
8328 list_del_init(&child_event->child_list);
8329 mutex_unlock(&parent_event->child_mutex);
8330
8331 /*
8332 * Make sure user/parent get notified, that we just
8333 * lost one event.
8334 */
8335 perf_event_wakeup(parent_event);
8336
8337 /*
8338 * Release the parent event, if this was the last
8339 * reference to it.
8340 */
8341 put_event(parent_event);
8342 }
8343
8344 static void
8345 __perf_event_exit_task(struct perf_event *child_event,
8346 struct perf_event_context *child_ctx,
8347 struct task_struct *child)
8348 {
8349 /*
8350 * Do not destroy the 'original' grouping; because of the context
8351 * switch optimization the original events could've ended up in a
8352 * random child task.
8353 *
8354 * If we were to destroy the original group, all group related
8355 * operations would cease to function properly after this random
8356 * child dies.
8357 *
8358 * Do destroy all inherited groups, we don't care about those
8359 * and being thorough is better.
8360 */
8361 perf_remove_from_context(child_event, !!child_event->parent);
8362
8363 /*
8364 * It can happen that the parent exits first, and has events
8365 * that are still around due to the child reference. These
8366 * events need to be zapped.
8367 */
8368 if (child_event->parent) {
8369 sync_child_event(child_event, child);
8370 free_event(child_event);
8371 } else {
8372 child_event->state = PERF_EVENT_STATE_EXIT;
8373 perf_event_wakeup(child_event);
8374 }
8375 }
8376
8377 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
8378 {
8379 struct perf_event *child_event, *next;
8380 struct perf_event_context *child_ctx, *clone_ctx = NULL;
8381 unsigned long flags;
8382
8383 if (likely(!child->perf_event_ctxp[ctxn])) {
8384 perf_event_task(child, NULL, 0);
8385 return;
8386 }
8387
8388 local_irq_save(flags);
8389 /*
8390 * We can't reschedule here because interrupts are disabled,
8391 * and either child is current or it is a task that can't be
8392 * scheduled, so we are now safe from rescheduling changing
8393 * our context.
8394 */
8395 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
8396
8397 /*
8398 * Take the context lock here so that if find_get_context is
8399 * reading child->perf_event_ctxp, we wait until it has
8400 * incremented the context's refcount before we do put_ctx below.
8401 */
8402 raw_spin_lock(&child_ctx->lock);
8403 task_ctx_sched_out(child_ctx);
8404 child->perf_event_ctxp[ctxn] = NULL;
8405
8406 /*
8407 * If this context is a clone; unclone it so it can't get
8408 * swapped to another process while we're removing all
8409 * the events from it.
8410 */
8411 clone_ctx = unclone_ctx(child_ctx);
8412 update_context_time(child_ctx);
8413 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8414
8415 if (clone_ctx)
8416 put_ctx(clone_ctx);
8417
8418 /*
8419 * Report the task dead after unscheduling the events so that we
8420 * won't get any samples after PERF_RECORD_EXIT. We can however still
8421 * get a few PERF_RECORD_READ events.
8422 */
8423 perf_event_task(child, child_ctx, 0);
8424
8425 /*
8426 * We can recurse on the same lock type through:
8427 *
8428 * __perf_event_exit_task()
8429 * sync_child_event()
8430 * put_event()
8431 * mutex_lock(&ctx->mutex)
8432 *
8433 * But since it's the parent context, it won't be the same instance.
8434 */
8435 mutex_lock(&child_ctx->mutex);
8436
8437 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8438 __perf_event_exit_task(child_event, child_ctx, child);
8439
8440 mutex_unlock(&child_ctx->mutex);
8441
8442 put_ctx(child_ctx);
8443 }
8444
8445 /*
8446 * When a child task exits, feed back event values to parent events.
8447 */
8448 void perf_event_exit_task(struct task_struct *child)
8449 {
8450 struct perf_event *event, *tmp;
8451 int ctxn;
8452
8453 mutex_lock(&child->perf_event_mutex);
8454 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8455 owner_entry) {
8456 list_del_init(&event->owner_entry);
8457
8458 /*
8459 * Ensure the list deletion is visible before we clear
8460 * the owner; this closes a race against perf_release() where
8461 * we need to serialize on the owner->perf_event_mutex.
8462 */
8463 smp_wmb();
8464 event->owner = NULL;
8465 }
8466 mutex_unlock(&child->perf_event_mutex);
8467
8468 for_each_task_context_nr(ctxn)
8469 perf_event_exit_task_context(child, ctxn);
8470 }
8471
8472 static void perf_free_event(struct perf_event *event,
8473 struct perf_event_context *ctx)
8474 {
8475 struct perf_event *parent = event->parent;
8476
8477 if (WARN_ON_ONCE(!parent))
8478 return;
8479
8480 mutex_lock(&parent->child_mutex);
8481 list_del_init(&event->child_list);
8482 mutex_unlock(&parent->child_mutex);
8483
8484 put_event(parent);
8485
8486 raw_spin_lock_irq(&ctx->lock);
8487 perf_group_detach(event);
8488 list_del_event(event, ctx);
8489 raw_spin_unlock_irq(&ctx->lock);
8490 free_event(event);
8491 }
8492
8493 /*
8494 * Free an unexposed, unused context as created by inheritance by
8495 * perf_event_init_task below, used by fork() in case of failure.
8496 *
8497 * Not all locks are strictly required, but take them anyway to be nice and
8498 * help out with the lockdep assertions.
8499 */
8500 void perf_event_free_task(struct task_struct *task)
8501 {
8502 struct perf_event_context *ctx;
8503 struct perf_event *event, *tmp;
8504 int ctxn;
8505
8506 for_each_task_context_nr(ctxn) {
8507 ctx = task->perf_event_ctxp[ctxn];
8508 if (!ctx)
8509 continue;
8510
8511 mutex_lock(&ctx->mutex);
8512 again:
8513 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8514 group_entry)
8515 perf_free_event(event, ctx);
8516
8517 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8518 group_entry)
8519 perf_free_event(event, ctx);
8520
8521 if (!list_empty(&ctx->pinned_groups) ||
8522 !list_empty(&ctx->flexible_groups))
8523 goto again;
8524
8525 mutex_unlock(&ctx->mutex);
8526
8527 put_ctx(ctx);
8528 }
8529 }
8530
8531 void perf_event_delayed_put(struct task_struct *task)
8532 {
8533 int ctxn;
8534
8535 for_each_task_context_nr(ctxn)
8536 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8537 }
8538
8539 /*
8540 * inherit an event from the parent task to the child task:
8541 */
8542 static struct perf_event *
8543 inherit_event(struct perf_event *parent_event,
8544 struct task_struct *parent,
8545 struct perf_event_context *parent_ctx,
8546 struct task_struct *child,
8547 struct perf_event *group_leader,
8548 struct perf_event_context *child_ctx)
8549 {
8550 enum perf_event_active_state parent_state = parent_event->state;
8551 struct perf_event *child_event;
8552 unsigned long flags;
8553
8554 /*
8555 * Instead of creating recursive hierarchies of events,
8556 * we link inherited events back to the original parent,
8557 * which is guaranteed to have a filp that we use as the reference
8558 * count:
8559 */
8560 if (parent_event->parent)
8561 parent_event = parent_event->parent;
8562
8563 child_event = perf_event_alloc(&parent_event->attr,
8564 parent_event->cpu,
8565 child,
8566 group_leader, parent_event,
8567 NULL, NULL, -1);
8568 if (IS_ERR(child_event))
8569 return child_event;
8570
8571 if (is_orphaned_event(parent_event) ||
8572 !atomic_long_inc_not_zero(&parent_event->refcount)) {
8573 free_event(child_event);
8574 return NULL;
8575 }
8576
8577 get_ctx(child_ctx);
8578
8579 /*
8580 * Make the child state follow the state of the parent event,
8581 * not its attr.disabled bit. We hold the parent's mutex,
8582 * so we won't race with perf_event_{en, dis}able_family.
8583 */
8584 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
8585 child_event->state = PERF_EVENT_STATE_INACTIVE;
8586 else
8587 child_event->state = PERF_EVENT_STATE_OFF;
8588
8589 if (parent_event->attr.freq) {
8590 u64 sample_period = parent_event->hw.sample_period;
8591 struct hw_perf_event *hwc = &child_event->hw;
8592
8593 hwc->sample_period = sample_period;
8594 hwc->last_period = sample_period;
8595
8596 local64_set(&hwc->period_left, sample_period);
8597 }
8598
8599 child_event->ctx = child_ctx;
8600 child_event->overflow_handler = parent_event->overflow_handler;
8601 child_event->overflow_handler_context
8602 = parent_event->overflow_handler_context;
8603
8604 /*
8605 * Precalculate sample_data sizes
8606 */
8607 perf_event__header_size(child_event);
8608 perf_event__id_header_size(child_event);
8609
8610 /*
8611 * Link it up in the child's context:
8612 */
8613 raw_spin_lock_irqsave(&child_ctx->lock, flags);
8614 add_event_to_ctx(child_event, child_ctx);
8615 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8616
8617 /*
8618 * Link this into the parent event's child list
8619 */
8620 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8621 mutex_lock(&parent_event->child_mutex);
8622 list_add_tail(&child_event->child_list, &parent_event->child_list);
8623 mutex_unlock(&parent_event->child_mutex);
8624
8625 return child_event;
8626 }
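As the code above shows, an inherited child event copies the parent's attr and, when attr.freq is set, starts from the parent's currently adapted hw.sample_period rather than from scratch. A minimal sketch (not part of this file) of the userspace attribute settings this path propagates across fork():

	#include <linux/perf_event.h>
	#include <string.h>

	/* Illustrative only: a frequency-based, inheritable sampling event. */
	static void fill_sampling_attr(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->type = PERF_TYPE_HARDWARE;
		attr->size = sizeof(*attr);
		attr->config = PERF_COUNT_HW_CPU_CYCLES;
		attr->freq = 1;			/* sample_freq, not a fixed period */
		attr->sample_freq = 4000;	/* target samples per second */
		attr->inherit = 1;		/* inherit_event() clones this on fork() */
		attr->disabled = 1;		/* the child's state follows the parent's,
						   not this bit */
	}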
8627
8628 static int inherit_group(struct perf_event *parent_event,
8629 struct task_struct *parent,
8630 struct perf_event_context *parent_ctx,
8631 struct task_struct *child,
8632 struct perf_event_context *child_ctx)
8633 {
8634 struct perf_event *leader;
8635 struct perf_event *sub;
8636 struct perf_event *child_ctr;
8637
8638 leader = inherit_event(parent_event, parent, parent_ctx,
8639 child, NULL, child_ctx);
8640 if (IS_ERR(leader))
8641 return PTR_ERR(leader);
8642 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8643 child_ctr = inherit_event(sub, parent, parent_ctx,
8644 child, leader, child_ctx);
8645 if (IS_ERR(child_ctr))
8646 return PTR_ERR(child_ctr);
8647 }
8648 return 0;
8649 }
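inherit_group() recreates the leader first and then each sibling against that new leader, mirroring how a group is built from userspace with the group_fd argument. A minimal sketch (not part of this file; open_group() is a hypothetical helper):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>

	/* Build a two-event group; with attr.inherit set the whole group is
	 * re-created leader-first in every forked child. */
	static int open_group(int *leader_fd, int *sibling_fd)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.inherit = 1;
		attr.disabled = 1;

		*leader_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (*leader_fd < 0)
			return -1;

		attr.config = PERF_COUNT_HW_INSTRUCTIONS;
		*sibling_fd = syscall(__NR_perf_event_open, &attr, 0, -1,
				      *leader_fd, 0);
		if (*sibling_fd < 0) {
			close(*leader_fd);
			return -1;
		}
		return 0;
	}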
8650
8651 static int
8652 inherit_task_group(struct perf_event *event, struct task_struct *parent,
8653 struct perf_event_context *parent_ctx,
8654 struct task_struct *child, int ctxn,
8655 int *inherited_all)
8656 {
8657 int ret;
8658 struct perf_event_context *child_ctx;
8659
8660 if (!event->attr.inherit) {
8661 *inherited_all = 0;
8662 return 0;
8663 }
8664
8665 child_ctx = child->perf_event_ctxp[ctxn];
8666 if (!child_ctx) {
8667 /*
8668 * This is executed from the parent task context, so
8669 * inherit events that have been marked for cloning.
8670 * First allocate and initialize a context for the
8671 * child.
8672 */
8673
8674 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
8675 if (!child_ctx)
8676 return -ENOMEM;
8677
8678 child->perf_event_ctxp[ctxn] = child_ctx;
8679 }
8680
8681 ret = inherit_group(event, parent, parent_ctx,
8682 child, child_ctx);
8683
8684 if (ret)
8685 *inherited_all = 0;
8686
8687 return ret;
8688 }
8689
8690 /*
8691 * Initialize the perf_event context in task_struct
8692 */
8693 static int perf_event_init_context(struct task_struct *child, int ctxn)
8694 {
8695 struct perf_event_context *child_ctx, *parent_ctx;
8696 struct perf_event_context *cloned_ctx;
8697 struct perf_event *event;
8698 struct task_struct *parent = current;
8699 int inherited_all = 1;
8700 unsigned long flags;
8701 int ret = 0;
8702
8703 if (likely(!parent->perf_event_ctxp[ctxn]))
8704 return 0;
8705
8706 /*
8707 * If the parent's context is a clone, pin it so it won't get
8708 * swapped under us.
8709 */
8710 parent_ctx = perf_pin_task_context(parent, ctxn);
8711 if (!parent_ctx)
8712 return 0;
8713
8714 /*
8715 * No need to check if parent_ctx != NULL here; since we saw
8716 * it non-NULL earlier, the only reason for it to become NULL
8717 * is if we exit, and since we're currently in the middle of
8718 * a fork we can't be exiting at the same time.
8719 */
8720
8721 /*
8722 * Lock the parent list. No need to lock the child - not PID
8723 * hashed yet and not running, so nobody can access it.
8724 */
8725 mutex_lock(&parent_ctx->mutex);
8726
8727 /*
8728 * We don't have to disable NMIs - we are only looking at
8729 * the list, not manipulating it:
8730 */
8731 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8732 ret = inherit_task_group(event, parent, parent_ctx,
8733 child, ctxn, &inherited_all);
8734 if (ret)
8735 break;
8736 }
8737
8738 /*
8739 * We can't hold ctx->lock when iterating the ->flexible_groups list due
8740 * to allocations, but we need to prevent rotation because
8741 * rotate_ctx() will change the list from interrupt context.
8742 */
8743 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8744 parent_ctx->rotate_disable = 1;
8745 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8746
8747 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8748 ret = inherit_task_group(event, parent, parent_ctx,
8749 child, ctxn, &inherited_all);
8750 if (ret)
8751 break;
8752 }
8753
8754 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8755 parent_ctx->rotate_disable = 0;
8756
8757 child_ctx = child->perf_event_ctxp[ctxn];
8758
8759 if (child_ctx && inherited_all) {
8760 /*
8761 * Mark the child context as a clone of the parent
8762 * context, or of whatever the parent is a clone of.
8763 *
8764 * Note that if the parent is a clone, holding parent_ctx->lock
8765 * prevents it from being uncloned.
8766 */
8767 cloned_ctx = parent_ctx->parent_ctx;
8768 if (cloned_ctx) {
8769 child_ctx->parent_ctx = cloned_ctx;
8770 child_ctx->parent_gen = parent_ctx->parent_gen;
8771 } else {
8772 child_ctx->parent_ctx = parent_ctx;
8773 child_ctx->parent_gen = parent_ctx->generation;
8774 }
8775 get_ctx(child_ctx->parent_ctx);
8776 }
8777
8778 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8779 mutex_unlock(&parent_ctx->mutex);
8780
8781 perf_unpin_context(parent_ctx);
8782 put_ctx(parent_ctx);
8783
8784 return ret;
8785 }
8786
8787 /*
8788 * Initialize the perf_event context in task_struct
8789 */
8790 int perf_event_init_task(struct task_struct *child)
8791 {
8792 int ctxn, ret;
8793
8794 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8795 mutex_init(&child->perf_event_mutex);
8796 INIT_LIST_HEAD(&child->perf_event_list);
8797
8798 for_each_task_context_nr(ctxn) {
8799 ret = perf_event_init_context(child, ctxn);
8800 if (ret) {
8801 perf_event_free_task(child);
8802 return ret;
8803 }
8804 }
8805
8806 return 0;
8807 }
8808
8809 static void __init perf_event_init_all_cpus(void)
8810 {
8811 struct swevent_htable *swhash;
8812 int cpu;
8813
8814 for_each_possible_cpu(cpu) {
8815 swhash = &per_cpu(swevent_htable, cpu);
8816 mutex_init(&swhash->hlist_mutex);
8817 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
8818 }
8819 }
8820
8821 static void perf_event_init_cpu(int cpu)
8822 {
8823 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8824
8825 mutex_lock(&swhash->hlist_mutex);
8826 swhash->online = true;
8827 if (swhash->hlist_refcount > 0) {
8828 struct swevent_hlist *hlist;
8829
8830 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8831 WARN_ON(!hlist);
8832 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8833 }
8834 mutex_unlock(&swhash->hlist_mutex);
8835 }
8836
8837 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8838 static void __perf_event_exit_context(void *__info)
8839 {
8840 struct remove_event re = { .detach_group = true };
8841 struct perf_event_context *ctx = __info;
8842
8843 rcu_read_lock();
8844 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8845 __perf_remove_from_context(&re);
8846 rcu_read_unlock();
8847 }
8848
8849 static void perf_event_exit_cpu_context(int cpu)
8850 {
8851 struct perf_event_context *ctx;
8852 struct pmu *pmu;
8853 int idx;
8854
8855 idx = srcu_read_lock(&pmus_srcu);
8856 list_for_each_entry_rcu(pmu, &pmus, entry) {
8857 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
8858
8859 mutex_lock(&ctx->mutex);
8860 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8861 mutex_unlock(&ctx->mutex);
8862 }
8863 srcu_read_unlock(&pmus_srcu, idx);
8864 }
8865
8866 static void perf_event_exit_cpu(int cpu)
8867 {
8868 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8869
8870 perf_event_exit_cpu_context(cpu);
8871
8872 mutex_lock(&swhash->hlist_mutex);
8873 swhash->online = false;
8874 swevent_hlist_release(swhash);
8875 mutex_unlock(&swhash->hlist_mutex);
8876 }
8877 #else
8878 static inline void perf_event_exit_cpu(int cpu) { }
8879 #endif
8880
8881 static int
8882 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8883 {
8884 int cpu;
8885
8886 for_each_online_cpu(cpu)
8887 perf_event_exit_cpu(cpu);
8888
8889 return NOTIFY_OK;
8890 }
8891
8892 /*
8893 * Run the perf reboot notifier at the very last possible moment so that
8894 * the generic watchdog code runs as long as possible.
8895 */
8896 static struct notifier_block perf_reboot_notifier = {
8897 .notifier_call = perf_reboot,
8898 .priority = INT_MIN,
8899 };
8900
8901 static int
8902 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8903 {
8904 unsigned int cpu = (long)hcpu;
8905
8906 switch (action & ~CPU_TASKS_FROZEN) {
8907
8908 case CPU_UP_PREPARE:
8909 case CPU_DOWN_FAILED:
8910 perf_event_init_cpu(cpu);
8911 break;
8912
8913 case CPU_UP_CANCELED:
8914 case CPU_DOWN_PREPARE:
8915 perf_event_exit_cpu(cpu);
8916 break;
8917 default:
8918 break;
8919 }
8920
8921 return NOTIFY_OK;
8922 }
8923
8924 void __init perf_event_init(void)
8925 {
8926 int ret;
8927
8928 idr_init(&pmu_idr);
8929
8930 perf_event_init_all_cpus();
8931 init_srcu_struct(&pmus_srcu);
8932 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8933 perf_pmu_register(&perf_cpu_clock, NULL, -1);
8934 perf_pmu_register(&perf_task_clock, NULL, -1);
8935 perf_tp_register();
8936 perf_cpu_notifier(perf_cpu_notify);
8937 register_reboot_notifier(&perf_reboot_notifier);
8938
8939 ret = init_hw_breakpoint();
8940 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
8941
8942 /* do not patch jump label more than once per second */
8943 jump_label_rate_limit(&perf_sched_events, HZ);
8944
8945 /*
8946 * Build-time assertion that we keep the data_head at the intended
8947 * location. IOW, validation that we got the __reserved[] size right.
8948 */
8949 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8950 != 1024);
8951 }
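The BUILD_BUG_ON above pins data_head at byte offset 1024 of struct perf_event_mmap_page, which is part of the user-visible mmap ABI. A minimal sketch (not part of this file) of how userspace reaches that field; n_data_pages is assumed to be a power of two and fd an already opened perf event:

	#include <linux/perf_event.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <stdint.h>
	#include <stddef.h>

	/* Illustrative only: map the metadata page and read the ring-buffer head. */
	static uint64_t read_data_head(int fd, size_t n_data_pages)
	{
		size_t page = (size_t)sysconf(_SC_PAGESIZE);
		struct perf_event_mmap_page *meta;
		uint64_t head;

		meta = mmap(NULL, (n_data_pages + 1) * page,
			    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (meta == MAP_FAILED)
			return 0;

		head = meta->data_head;		/* updated by the kernel */
		__sync_synchronize();		/* order against reading data pages */
		munmap(meta, (n_data_pages + 1) * page);
		return head;
	}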
8952
8953 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
8954 char *page)
8955 {
8956 struct perf_pmu_events_attr *pmu_attr =
8957 container_of(attr, struct perf_pmu_events_attr, attr);
8958
8959 if (pmu_attr->event_str)
8960 return sprintf(page, "%s\n", pmu_attr->event_str);
8961
8962 return 0;
8963 }
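perf_event_sysfs_show() backs the per-PMU event files under /sys/bus/event_source/devices/<pmu>/events/, printing the event_str registered by the PMU driver. A minimal sketch (not part of this file; the PMU and event names are examples and may not exist on every system):

	#include <stdio.h>

	/* Illustrative only: print one event description string from sysfs. */
	static int show_event_string(const char *pmu, const char *event)
	{
		char path[256], buf[128];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/bus/event_source/devices/%s/events/%s", pmu, event);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fgets(buf, sizeof(buf), f))
			printf("%s/%s: %s", pmu, event, buf);	/* e.g. "event=0xc0" */
		fclose(f);
		return 0;
	}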
8964
8965 static int __init perf_event_sysfs_init(void)
8966 {
8967 struct pmu *pmu;
8968 int ret;
8969
8970 mutex_lock(&pmus_lock);
8971
8972 ret = bus_register(&pmu_bus);
8973 if (ret)
8974 goto unlock;
8975
8976 list_for_each_entry(pmu, &pmus, entry) {
8977 if (!pmu->name || pmu->type < 0)
8978 continue;
8979
8980 ret = pmu_dev_alloc(pmu);
8981 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8982 }
8983 pmu_bus_running = 1;
8984 ret = 0;
8985
8986 unlock:
8987 mutex_unlock(&pmus_lock);
8988
8989 return ret;
8990 }
8991 device_initcall(perf_event_sysfs_init);
8992
8993 #ifdef CONFIG_CGROUP_PERF
8994 static struct cgroup_subsys_state *
8995 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8996 {
8997 struct perf_cgroup *jc;
8998
8999 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
9000 if (!jc)
9001 return ERR_PTR(-ENOMEM);
9002
9003 jc->info = alloc_percpu(struct perf_cgroup_info);
9004 if (!jc->info) {
9005 kfree(jc);
9006 return ERR_PTR(-ENOMEM);
9007 }
9008
9009 return &jc->css;
9010 }
9011
9012 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
9013 {
9014 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
9015
9016 free_percpu(jc->info);
9017 kfree(jc);
9018 }
9019
9020 static int __perf_cgroup_move(void *info)
9021 {
9022 struct task_struct *task = info;
9023 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
9024 return 0;
9025 }
9026
9027 static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9028 struct cgroup_taskset *tset)
9029 {
9030 struct task_struct *task;
9031
9032 cgroup_taskset_for_each(task, tset)
9033 task_function_call(task, __perf_cgroup_move, task);
9034 }
9035
9036 static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9037 struct cgroup_subsys_state *old_css,
9038 struct task_struct *task)
9039 {
9040 /*
9041 * cgroup_exit() is called in the copy_process() failure path.
9042 * Ignore this case since the task hasn't run yet; this avoids
9043 * trying to poke half-freed task state from generic code.
9044 */
9045 if (!(task->flags & PF_EXITING))
9046 return;
9047
9048 task_function_call(task, __perf_cgroup_move, task);
9049 }
9050
9051 struct cgroup_subsys perf_event_cgrp_subsys = {
9052 .css_alloc = perf_cgroup_css_alloc,
9053 .css_free = perf_cgroup_css_free,
9054 .exit = perf_cgroup_exit,
9055 .attach = perf_cgroup_attach,
9056 };
9057 #endif /* CONFIG_CGROUP_PERF */
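The perf_event cgroup controller above is what allows cgroup-scoped monitoring from userspace: pass an open file descriptor of a perf_event cgroup directory as the pid argument together with PERF_FLAG_PID_CGROUP, plus a specific CPU. A minimal sketch (not part of this file; the cgroup path is only an example):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	/* Illustrative only: count cycles for one cgroup on one CPU, e.g.
	 * cgrp_path = "/sys/fs/cgroup/perf_event/mygroup". */
	static int open_cgroup_event(const char *cgrp_path, int cpu)
	{
		struct perf_event_attr attr;
		int cgrp_fd, fd;

		cgrp_fd = open(cgrp_path, O_RDONLY | O_DIRECTORY);
		if (cgrp_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu, -1,
			     PERF_FLAG_PID_CGROUP);
		close(cgrp_fd);
		return fd;
	}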