kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/sysfs.h>
  19 #include <linux/dcache.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/vmstat.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/hardirq.h>
  25 #include <linux/rculist.h>
  26 #include <linux/uaccess.h>
  27 #include <linux/syscalls.h>
  28 #include <linux/anon_inodes.h>
  29 #include <linux/kernel_stat.h>
  30 #include <linux/perf_event.h>
  31 #include <linux/ftrace_event.h>
  32
  33 #include <asm/irq_regs.h>
  34
  35 /*
  36  * Each CPU has a list of per CPU events:
  37  */
  38 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  39
  40 int perf_max_events __read_mostly = 1;
  41 static int perf_reserved_percpu __read_mostly;
  42 static int perf_overcommit __read_mostly = 1;
  43
  44 static atomic_t nr_events __read_mostly;
  45 static atomic_t nr_mmap_events __read_mostly;
  46 static atomic_t nr_comm_events __read_mostly;
  47 static atomic_t nr_task_events __read_mostly;
  48
  49 /*
  50  * perf event paranoia level:
  51  *  -1 - not paranoid at all
  52  *   0 - disallow raw tracepoint access for unpriv
  53  *   1 - disallow cpu events for unpriv
  54  *   2 - disallow kernel profiling for unpriv
  55  */
  56 int sysctl_perf_event_paranoid __read_mostly = 1;
  57
  58 static inline bool perf_paranoid_tracepoint_raw(void)
  59 {
  60         return sysctl_perf_event_paranoid > -1;
  61 }
  62
  63 static inline bool perf_paranoid_cpu(void)
  64 {
  65         return sysctl_perf_event_paranoid > 0;
  66 }
  67
  68 static inline bool perf_paranoid_kernel(void)
  69 {
  70         return sysctl_perf_event_paranoid > 1;
  71 }
  72
  73 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
  74
  75 /*
  76  * max perf event sample rate
  77  */
  78 int sysctl_perf_event_sample_rate __read_mostly = 100000;
  79
  80 static atomic64_t perf_event_id;
  81
  /*
   * Spinlock guarding the event reservations, which the sysadmin can
   * configure.
   */
  static DEFINE_SPINLOCK(perf_resource_lock);
  86
  87 /*
  88  * Architecture provided APIs - weak aliases:
  89  */
  90 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  91 {
  92         return NULL;
  93 }
  94
  95 void __weak hw_perf_disable(void)               { barrier(); }
  96 void __weak hw_perf_enable(void)                { barrier(); }
  97
  98 void __weak hw_perf_event_setup(int cpu)        { barrier(); }
  99 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
 100
 101 int __weak
 102 hw_perf_group_sched_in(struct perf_event *group_leader,
 103                struct perf_cpu_context *cpuctx,
 104                struct perf_event_context *ctx, int cpu)
 105 {
 106         return 0;
 107 }
 108
 109 void __weak perf_event_print_debug(void)        { }
 110
 111 static DEFINE_PER_CPU(int, perf_disable_count);
 112
 113 void __perf_disable(void)
 114 {
 115         __get_cpu_var(perf_disable_count)++;
 116 }
 117
 118 bool __perf_enable(void)
 119 {
 120         return !--__get_cpu_var(perf_disable_count);
 121 }
 122
 /*
  * Disable perf on this CPU: bump the per-CPU disable count first, then
  * invoke the hardware disable hook.  Calls nest; see perf_enable().
  */
 void perf_disable(void)
 {
         __perf_disable();       /* count the request */
         hw_perf_disable();      /* then quiesce the hardware */
 }
 128
 /*
  * Undo one perf_disable().  The hardware enable hook runs only when the
  * per-CPU disable count drops back to zero.
  */
 void perf_enable(void)
 {
         if (!__perf_enable())
                 return;

         hw_perf_enable();
 }
 134
 135 static void get_ctx(struct perf_event_context *ctx)
 136 {
 137         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 138 }
 139
 140 static void free_ctx(struct rcu_head *head)
 141 {
 142         struct perf_event_context *ctx;
 143
 144         ctx = container_of(head, struct perf_event_context, rcu_head);
 145         kfree(ctx);
 146 }
 147
 148 static void put_ctx(struct perf_event_context *ctx)
 149 {
 150         if (atomic_dec_and_test(&ctx->refcount)) {
 151                 if (ctx->parent_ctx)
 152                         put_ctx(ctx->parent_ctx);
 153                 if (ctx->task)
 154                         put_task_struct(ctx->task);
 155                 call_rcu(&ctx->rcu_head, free_ctx);
 156         }
 157 }
 158
 159 static void unclone_ctx(struct perf_event_context *ctx)
 160 {
 161         if (ctx->parent_ctx) {
 162                 put_ctx(ctx->parent_ctx);
 163                 ctx->parent_ctx = NULL;
 164         }
 165 }
 166
 167 /*
 168  * If we inherit events we want to return the parent event id
 169  * to userspace.
 170  */
 171 static u64 primary_event_id(struct perf_event *event)
 172 {
 173         u64 id = event->id;
 174
 175         if (event->parent)
 176                 id = event->parent->id;
 177
 178         return id;
 179 }
 180
 181 /*
 182  * Get the perf_event_context for a task and lock it.
 183  * This has to cope with with the fact that until it is locked,
 184  * the context could get moved to another task.
 185  */
 186 static struct perf_event_context *
 187 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 188 {
 189         struct perf_event_context *ctx;
 190
 191         rcu_read_lock();
 192  retry:
 193         ctx = rcu_dereference(task->perf_event_ctxp);
 194         if (ctx) {
 195                 /*
 196                  * If this context is a clone of another, it might
 197                  * get swapped for another underneath us by
 198                  * perf_event_task_sched_out, though the
 199                  * rcu_read_lock() protects us from any context
 200                  * getting freed.  Lock the context and check if it
 201                  * got swapped before we could get the lock, and retry
 202                  * if so.  If we locked the right context, then it
 203                  * can't get swapped on us any more.
 204                  */
 205                 spin_lock_irqsave(&ctx->lock, *flags);
 206                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 207                         spin_unlock_irqrestore(&ctx->lock, *flags);
 208                         goto retry;
 209                 }
 210
 211                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 212                         spin_unlock_irqrestore(&ctx->lock, *flags);
 213                         ctx = NULL;
 214                 }
 215         }
 216         rcu_read_unlock();
 217         return ctx;
 218 }
 219
 220 /*
 221  * Get the context for a task and increment its pin_count so it
 222  * can't get swapped to another task.  This also increments its
 223  * reference count so that the context can't get freed.
 224  */
 225 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 226 {
 227         struct perf_event_context *ctx;
 228         unsigned long flags;
 229
 230         ctx = perf_lock_task_context(task, &flags);
 231         if (ctx) {
 232                 ++ctx->pin_count;
 233                 spin_unlock_irqrestore(&ctx->lock, flags);
 234         }
 235         return ctx;
 236 }
 237
 238 static void perf_unpin_context(struct perf_event_context *ctx)
 239 {
 240         unsigned long flags;
 241
 242         spin_lock_irqsave(&ctx->lock, flags);
 243         --ctx->pin_count;
 244         spin_unlock_irqrestore(&ctx->lock, flags);
 245         put_ctx(ctx);
 246 }
 247
 248 /*
 249  * Add a event from the lists for its context.
 250  * Must be called with ctx->mutex and ctx->lock held.
 251  */
 252 static void
 253 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 254 {
 255         struct perf_event *group_leader = event->group_leader;
 256
 257         /*
 258          * Depending on whether it is a standalone or sibling event,
 259          * add it straight to the context's event list, or to the group
 260          * leader's sibling list:
 261          */
 262         if (group_leader == event)
 263                 list_add_tail(&event->group_entry, &ctx->group_list);
 264         else {
 265                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 266                 group_leader->nr_siblings++;
 267         }
 268
 269         list_add_rcu(&event->event_entry, &ctx->event_list);
 270         ctx->nr_events++;
 271         if (event->attr.inherit_stat)
 272                 ctx->nr_stat++;
 273 }
 274
 275 /*
 276  * Remove a event from the lists for its context.
 277  * Must be called with ctx->mutex and ctx->lock held.
 278  */
 279 static void
 280 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 281 {
 282         struct perf_event *sibling, *tmp;
 283
 284         if (list_empty(&event->group_entry))
 285                 return;
 286         ctx->nr_events--;
 287         if (event->attr.inherit_stat)
 288                 ctx->nr_stat--;
 289
 290         list_del_init(&event->group_entry);
 291         list_del_rcu(&event->event_entry);
 292
 293         if (event->group_leader != event)
 294                 event->group_leader->nr_siblings--;
 295
 296         /*
 297          * If this was a group event with sibling events then
 298          * upgrade the siblings to singleton events by adding them
 299          * to the context list directly:
 300          */
 301         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 302
 303                 list_move_tail(&sibling->group_entry, &ctx->group_list);
 304                 sibling->group_leader = sibling;
 305         }
 306 }
 307
 308 static void
 309 event_sched_out(struct perf_event *event,
 310                   struct perf_cpu_context *cpuctx,
 311                   struct perf_event_context *ctx)
 312 {
 313         if (event->state != PERF_EVENT_STATE_ACTIVE)
 314                 return;
 315
 316         event->state = PERF_EVENT_STATE_INACTIVE;
 317         if (event->pending_disable) {
 318                 event->pending_disable = 0;
 319                 event->state = PERF_EVENT_STATE_OFF;
 320         }
 321         event->tstamp_stopped = ctx->time;
 322         event->pmu->disable(event);
 323         event->oncpu = -1;
 324
 325         if (!is_software_event(event))
 326                 cpuctx->active_oncpu--;
 327         ctx->nr_active--;
 328         if (event->attr.exclusive || !cpuctx->active_oncpu)
 329                 cpuctx->exclusive = 0;
 330 }
 331
 332 static void
 333 group_sched_out(struct perf_event *group_event,
 334                 struct perf_cpu_context *cpuctx,
 335                 struct perf_event_context *ctx)
 336 {
 337         struct perf_event *event;
 338
 339         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 340                 return;
 341
 342         event_sched_out(group_event, cpuctx, ctx);
 343
 344         /*
 345          * Schedule out siblings (if any):
 346          */
 347         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 348                 event_sched_out(event, cpuctx, ctx);
 349
 350         if (group_event->attr.exclusive)
 351                 cpuctx->exclusive = 0;
 352 }
 353
 354 /*
 355  * Cross CPU call to remove a performance event
 356  *
 357  * We disable the event on the hardware level first. After that we
 358  * remove it from the context list.
 359  */
 360 static void __perf_event_remove_from_context(void *info)
 361 {
 362         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 363         struct perf_event *event = info;
 364         struct perf_event_context *ctx = event->ctx;
 365
 366         /*
 367          * If this is a task context, we need to check whether it is
 368          * the current task context of this cpu. If not it has been
 369          * scheduled out before the smp call arrived.
 370          */
 371         if (ctx->task && cpuctx->task_ctx != ctx)
 372                 return;
 373
 374         spin_lock(&ctx->lock);
 375         /*
 376          * Protect the list operation against NMI by disabling the
 377          * events on a global level.
 378          */
 379         perf_disable();
 380
 381         event_sched_out(event, cpuctx, ctx);
 382
 383         list_del_event(event, ctx);
 384
 385         if (!ctx->task) {
 386                 /*
 387                  * Allow more per task events with respect to the
 388                  * reservation:
 389                  */
 390                 cpuctx->max_pertask =
 391                         min(perf_max_events - ctx->nr_events,
 392                             perf_max_events - perf_reserved_percpu);
 393         }
 394
 395         perf_enable();
 396         spin_unlock(&ctx->lock);
 397 }
 398
 399
 400 /*
 401  * Remove the event from a task's (or a CPU's) list of events.
 402  *
 403  * Must be called with ctx->mutex held.
 404  *
 405  * CPU events are removed with a smp call. For task events we only
 406  * call when the task is on a CPU.
 407  *
 408  * If event->ctx is a cloned context, callers must make sure that
 409  * every task struct that event->ctx->task could possibly point to
 410  * remains valid.  This is OK when called from perf_release since
 411  * that only calls us on the top-level context, which can't be a clone.
 412  * When called from perf_event_exit_task, it's OK because the
 413  * context has been detached from its task.
 414  */
 415 static void perf_event_remove_from_context(struct perf_event *event)
 416 {
 417         struct perf_event_context *ctx = event->ctx;
 418         struct task_struct *task = ctx->task;
 419
 420         if (!task) {
 421                 /*
 422                  * Per cpu events are removed via an smp call and
 423                  * the removal is always sucessful.
 424                  */
 425                 smp_call_function_single(event->cpu,
 426                                          __perf_event_remove_from_context,
 427                                          event, 1);
 428                 return;
 429         }
 430
 431 retry:
 432         task_oncpu_function_call(task, __perf_event_remove_from_context,
 433                                  event);
 434
 435         spin_lock_irq(&ctx->lock);
 436         /*
 437          * If the context is active we need to retry the smp call.
 438          */
 439         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 440                 spin_unlock_irq(&ctx->lock);
 441                 goto retry;
 442         }
 443
 444         /*
 445          * The lock prevents that this context is scheduled in so we
 446          * can remove the event safely, if the call above did not
 447          * succeed.
 448          */
 449         if (!list_empty(&event->group_entry)) {
 450                 list_del_event(event, ctx);
 451         }
 452         spin_unlock_irq(&ctx->lock);
 453 }
 454
 455 static inline u64 perf_clock(void)
 456 {
 457         return cpu_clock(smp_processor_id());
 458 }
 459
 460 /*
 461  * Update the record of the current time in a context.
 462  */
 463 static void update_context_time(struct perf_event_context *ctx)
 464 {
 465         u64 now = perf_clock();
 466
 467         ctx->time += now - ctx->timestamp;
 468         ctx->timestamp = now;
 469 }
 470
 471 /*
 472  * Update the total_time_enabled and total_time_running fields for a event.
 473  */
 474 static void update_event_times(struct perf_event *event)
 475 {
 476         struct perf_event_context *ctx = event->ctx;
 477         u64 run_end;
 478
 479         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 480             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 481                 return;
 482
 483         event->total_time_enabled = ctx->time - event->tstamp_enabled;
 484
 485         if (event->state == PERF_EVENT_STATE_INACTIVE)
 486                 run_end = event->tstamp_stopped;
 487         else
 488                 run_end = ctx->time;
 489
 490         event->total_time_running = run_end - event->tstamp_running;
 491 }
 492
 493 /*
 494  * Update total_time_enabled and total_time_running for all events in a group.
 495  */
 496 static void update_group_times(struct perf_event *leader)
 497 {
 498         struct perf_event *event;
 499
 500         update_event_times(leader);
 501         list_for_each_entry(event, &leader->sibling_list, group_entry)
 502                 update_event_times(event);
 503 }
 504
 505 /*
 506  * Cross CPU call to disable a performance event
 507  */
 508 static void __perf_event_disable(void *info)
 509 {
 510         struct perf_event *event = info;
 511         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 512         struct perf_event_context *ctx = event->ctx;
 513
 514         /*
 515          * If this is a per-task event, need to check whether this
 516          * event's task is the current task on this cpu.
 517          */
 518         if (ctx->task && cpuctx->task_ctx != ctx)
 519                 return;
 520
 521         spin_lock(&ctx->lock);
 522
 523         /*
 524          * If the event is on, turn it off.
 525          * If it is in error state, leave it in error state.
 526          */
 527         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 528                 update_context_time(ctx);
 529                 update_group_times(event);
 530                 if (event == event->group_leader)
 531                         group_sched_out(event, cpuctx, ctx);
 532                 else
 533                         event_sched_out(event, cpuctx, ctx);
 534                 event->state = PERF_EVENT_STATE_OFF;
 535         }
 536
 537         spin_unlock(&ctx->lock);
 538 }
 539
 540 /*
 541  * Disable a event.
 542  *
 543  * If event->ctx is a cloned context, callers must make sure that
 544  * every task struct that event->ctx->task could possibly point to
 545  * remains valid.  This condition is satisifed when called through
 546  * perf_event_for_each_child or perf_event_for_each because they
 547  * hold the top-level event's child_mutex, so any descendant that
 548  * goes to exit will block in sync_child_event.
 549  * When called from perf_pending_event it's OK because event->ctx
 550  * is the current context on this CPU and preemption is disabled,
 551  * hence we can't get into perf_event_task_sched_out for this context.
 552  */
 553 static void perf_event_disable(struct perf_event *event)
 554 {
 555         struct perf_event_context *ctx = event->ctx;
 556         struct task_struct *task = ctx->task;
 557
 558         if (!task) {
 559                 /*
 560                  * Disable the event on the cpu that it's on
 561                  */
 562                 smp_call_function_single(event->cpu, __perf_event_disable,
 563                                          event, 1);
 564                 return;
 565         }
 566
 567  retry:
 568         task_oncpu_function_call(task, __perf_event_disable, event);
 569
 570         spin_lock_irq(&ctx->lock);
 571         /*
 572          * If the event is still active, we need to retry the cross-call.
 573          */
 574         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 575                 spin_unlock_irq(&ctx->lock);
 576                 goto retry;
 577         }
 578
 579         /*
 580          * Since we have the lock this context can't be scheduled
 581          * in, so we can change the state safely.
 582          */
 583         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 584                 update_group_times(event);
 585                 event->state = PERF_EVENT_STATE_OFF;
 586         }
 587
 588         spin_unlock_irq(&ctx->lock);
 589 }
 590
 591 static int
 592 event_sched_in(struct perf_event *event,
 593                  struct perf_cpu_context *cpuctx,
 594                  struct perf_event_context *ctx,
 595                  int cpu)
 596 {
 597         if (event->state <= PERF_EVENT_STATE_OFF)
 598                 return 0;
 599
 600         event->state = PERF_EVENT_STATE_ACTIVE;
 601         event->oncpu = cpu;     /* TODO: put 'cpu' into cpuctx->cpu */
 602         /*
 603          * The new state must be visible before we turn it on in the hardware:
 604          */
 605         smp_wmb();
 606
 607         if (event->pmu->enable(event)) {
 608                 event->state = PERF_EVENT_STATE_INACTIVE;
 609                 event->oncpu = -1;
 610                 return -EAGAIN;
 611         }
 612
 613         event->tstamp_running += ctx->time - event->tstamp_stopped;
 614
 615         if (!is_software_event(event))
 616                 cpuctx->active_oncpu++;
 617         ctx->nr_active++;
 618
 619         if (event->attr.exclusive)
 620                 cpuctx->exclusive = 1;
 621
 622         return 0;
 623 }
 624
 625 static int
 626 group_sched_in(struct perf_event *group_event,
 627                struct perf_cpu_context *cpuctx,
 628                struct perf_event_context *ctx,
 629                int cpu)
 630 {
 631         struct perf_event *event, *partial_group;
 632         int ret;
 633
 634         if (group_event->state == PERF_EVENT_STATE_OFF)
 635                 return 0;
 636
 637         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
 638         if (ret)
 639                 return ret < 0 ? ret : 0;
 640
 641         if (event_sched_in(group_event, cpuctx, ctx, cpu))
 642                 return -EAGAIN;
 643
 644         /*
 645          * Schedule in siblings as one group (if any):
 646          */
 647         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 648                 if (event_sched_in(event, cpuctx, ctx, cpu)) {
 649                         partial_group = event;
 650                         goto group_error;
 651                 }
 652         }
 653
 654         return 0;
 655
 656 group_error:
 657         /*
 658          * Groups can be scheduled in as one unit only, so undo any
 659          * partial group before returning:
 660          */
 661         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 662                 if (event == partial_group)
 663                         break;
 664                 event_sched_out(event, cpuctx, ctx);
 665         }
 666         event_sched_out(group_event, cpuctx, ctx);
 667
 668         return -EAGAIN;
 669 }
 670
 671 /*
 672  * Return 1 for a group consisting entirely of software events,
 673  * 0 if the group contains any hardware events.
 674  */
 675 static int is_software_only_group(struct perf_event *leader)
 676 {
 677         struct perf_event *event;
 678
 679         if (!is_software_event(leader))
 680                 return 0;
 681
 682         list_for_each_entry(event, &leader->sibling_list, group_entry)
 683                 if (!is_software_event(event))
 684                         return 0;
 685
 686         return 1;
 687 }
 688
 689 /*
 690  * Work out whether we can put this event group on the CPU now.
 691  */
 692 static int group_can_go_on(struct perf_event *event,
 693                            struct perf_cpu_context *cpuctx,
 694                            int can_add_hw)
 695 {
 696         /*
 697          * Groups consisting entirely of software events can always go on.
 698          */
 699         if (is_software_only_group(event))
 700                 return 1;
 701         /*
 702          * If an exclusive group is already on, no other hardware
 703          * events can go on.
 704          */
 705         if (cpuctx->exclusive)
 706                 return 0;
 707         /*
 708          * If this group is exclusive and there are already
 709          * events on the CPU, it can't go on.
 710          */
 711         if (event->attr.exclusive && cpuctx->active_oncpu)
 712                 return 0;
 713         /*
 714          * Otherwise, try to add it if all previous groups were able
 715          * to go on.
 716          */
 717         return can_add_hw;
 718 }
 719
 720 static void add_event_to_ctx(struct perf_event *event,
 721                                struct perf_event_context *ctx)
 722 {
 723         list_add_event(event, ctx);
 724         event->tstamp_enabled = ctx->time;
 725         event->tstamp_running = ctx->time;
 726         event->tstamp_stopped = ctx->time;
 727 }
 728
 729 /*
 730  * Cross CPU call to install and enable a performance event
 731  *
 732  * Must be called with ctx->mutex held
 733  */
 734 static void __perf_install_in_context(void *info)
 735 {
 736         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 737         struct perf_event *event = info;
 738         struct perf_event_context *ctx = event->ctx;
 739         struct perf_event *leader = event->group_leader;
 740         int cpu = smp_processor_id();
 741         int err;
 742
 743         /*
 744          * If this is a task context, we need to check whether it is
 745          * the current task context of this cpu. If not it has been
 746          * scheduled out before the smp call arrived.
 747          * Or possibly this is the right context but it isn't
 748          * on this cpu because it had no events.
 749          */
 750         if (ctx->task && cpuctx->task_ctx != ctx) {
 751                 if (cpuctx->task_ctx || ctx->task != current)
 752                         return;
 753                 cpuctx->task_ctx = ctx;
 754         }
 755
 756         spin_lock(&ctx->lock);
 757         ctx->is_active = 1;
 758         update_context_time(ctx);
 759
 760         /*
 761          * Protect the list operation against NMI by disabling the
 762          * events on a global level. NOP for non NMI based events.
 763          */
 764         perf_disable();
 765
 766         add_event_to_ctx(event, ctx);
 767
 768         /*
 769          * Don't put the event on if it is disabled or if
 770          * it is in a group and the group isn't on.
 771          */
 772         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 773             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 774                 goto unlock;
 775
 776         /*
 777          * An exclusive event can't go on if there are already active
 778          * hardware events, and no hardware event can go on if there
 779          * is already an exclusive event on.
 780          */
 781         if (!group_can_go_on(event, cpuctx, 1))
 782                 err = -EEXIST;
 783         else
 784                 err = event_sched_in(event, cpuctx, ctx, cpu);
 785
 786         if (err) {
 787                 /*
 788                  * This event couldn't go on.  If it is in a group
 789                  * then we have to pull the whole group off.
 790                  * If the event group is pinned then put it in error state.
 791                  */
 792                 if (leader != event)
 793                         group_sched_out(leader, cpuctx, ctx);
 794                 if (leader->attr.pinned) {
 795                         update_group_times(leader);
 796                         leader->state = PERF_EVENT_STATE_ERROR;
 797                 }
 798         }
 799
 800         if (!err && !ctx->task && cpuctx->max_pertask)
 801                 cpuctx->max_pertask--;
 802
 803  unlock:
 804         perf_enable();
 805
 806         spin_unlock(&ctx->lock);
 807 }
 808
 809 /*
 810  * Attach a performance event to a context
 811  *
 812  * First we add the event to the list with the hardware enable bit
 813  * in event->hw_config cleared.
 814  *
 815  * If the event is attached to a task which is on a CPU we use a smp
 816  * call to enable it in the task context. The task might have been
 817  * scheduled away, but we check this in the smp call again.
 818  *
 819  * Must be called with ctx->mutex held.
 820  */
 821 static void
 822 perf_install_in_context(struct perf_event_context *ctx,
 823                         struct perf_event *event,
 824                         int cpu)
 825 {
 826         struct task_struct *task = ctx->task;
 827
 828         if (!task) {
 829                 /*
 830                  * Per cpu events are installed via an smp call and
 831                  * the install is always sucessful.
 832                  */
 833                 smp_call_function_single(cpu, __perf_install_in_context,
 834                                          event, 1);
 835                 return;
 836         }
 837
 838 retry:
 839         task_oncpu_function_call(task, __perf_install_in_context,
 840                                  event);
 841
 842         spin_lock_irq(&ctx->lock);
 843         /*
 844          * we need to retry the smp call.
 845          */
 846         if (ctx->is_active && list_empty(&event->group_entry)) {
 847                 spin_unlock_irq(&ctx->lock);
 848                 goto retry;
 849         }
 850
 851         /*
 852          * The lock prevents that this context is scheduled in so we
 853          * can add the event safely, if it the call above did not
 854          * succeed.
 855          */
 856         if (list_empty(&event->group_entry))
 857                 add_event_to_ctx(event, ctx);
 858         spin_unlock_irq(&ctx->lock);
 859 }
 860
 861 /*
 862  * Put a event into inactive state and update time fields.
 863  * Enabling the leader of a group effectively enables all
 864  * the group members that aren't explicitly disabled, so we
 865  * have to update their ->tstamp_enabled also.
 866  * Note: this works for group members as well as group leaders
 867  * since the non-leader members' sibling_lists will be empty.
 868  */
 869 static void __perf_event_mark_enabled(struct perf_event *event,
 870                                         struct perf_event_context *ctx)
 871 {
 872         struct perf_event *sub;
 873
 874         event->state = PERF_EVENT_STATE_INACTIVE;
 875         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 876         list_for_each_entry(sub, &event->sibling_list, group_entry)
 877                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 878                         sub->tstamp_enabled =
 879                                 ctx->time - sub->total_time_enabled;
 880 }
 881
 882 /*
 883  * Cross CPU call to enable a performance event
 884  */
 885 static void __perf_event_enable(void *info)
 886 {
 887         struct perf_event *event = info;
 888         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 889         struct perf_event_context *ctx = event->ctx;
 890         struct perf_event *leader = event->group_leader;
 891         int err;
 892
 893         /*
 894          * If this is a per-task event, need to check whether this
 895          * event's task is the current task on this cpu.
 896          */
 897         if (ctx->task && cpuctx->task_ctx != ctx) {
 898                 if (cpuctx->task_ctx || ctx->task != current)
 899                         return;
 900                 cpuctx->task_ctx = ctx;
 901         }
 902
 903         spin_lock(&ctx->lock);
 904         ctx->is_active = 1;
 905         update_context_time(ctx);
 906
 907         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 908                 goto unlock;
 909         __perf_event_mark_enabled(event, ctx);
 910
 911         /*
 912          * If the event is in a group and isn't the group leader,
 913          * then don't put it on unless the group is on.
 914          */
 915         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 916                 goto unlock;
 917
 918         if (!group_can_go_on(event, cpuctx, 1)) {
 919                 err = -EEXIST;
 920         } else {
 921                 perf_disable();
 922                 if (event == leader)
 923                         err = group_sched_in(event, cpuctx, ctx,
 924                                              smp_processor_id());
 925                 else
 926                         err = event_sched_in(event, cpuctx, ctx,
 927                                                smp_processor_id());
 928                 perf_enable();
 929         }
 930
 931         if (err) {
 932                 /*
 933                  * If this event can't go on and it's part of a
 934                  * group, then the whole group has to come off.
 935                  */
 936                 if (leader != event)
 937                         group_sched_out(leader, cpuctx, ctx);
 938                 if (leader->attr.pinned) {
 939                         update_group_times(leader);
 940                         leader->state = PERF_EVENT_STATE_ERROR;
 941                 }
 942         }
 943
 944  unlock:
 945         spin_unlock(&ctx->lock);
 946 }
 947
 948 /*
 949  * Enable a event.
 950  *
 951  * If event->ctx is a cloned context, callers must make sure that
 952  * every task struct that event->ctx->task could possibly point to
 953  * remains valid.  This condition is satisfied when called through
 954  * perf_event_for_each_child or perf_event_for_each as described
 955  * for perf_event_disable.
 956  */
 957 static void perf_event_enable(struct perf_event *event)
 958 {
 959         struct perf_event_context *ctx = event->ctx;
 960         struct task_struct *task = ctx->task;
 961
 962         if (!task) {
 963                 /*
 964                  * Enable the event on the cpu that it's on
 965                  */
 966                 smp_call_function_single(event->cpu, __perf_event_enable,
 967                                          event, 1);
 968                 return;
 969         }
 970
 971         spin_lock_irq(&ctx->lock);
 972         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 973                 goto out;
 974
 975         /*
 976          * If the event is in error state, clear that first.
 977          * That way, if we see the event in error state below, we
 978          * know that it has gone back into error state, as distinct
 979          * from the task having been scheduled away before the
 980          * cross-call arrived.
 981          */
 982         if (event->state == PERF_EVENT_STATE_ERROR)
 983                 event->state = PERF_EVENT_STATE_OFF;
 984
 985  retry:
 986         spin_unlock_irq(&ctx->lock);
 987         task_oncpu_function_call(task, __perf_event_enable, event);
 988
 989         spin_lock_irq(&ctx->lock);
 990
 991         /*
 992          * If the context is active and the event is still off,
 993          * we need to retry the cross-call.
 994          */
 995         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
 996                 goto retry;
 997
 998         /*
 999          * Since we have the lock this context can't be scheduled
1000          * in, so we can change the state safely.
1001          */
1002         if (event->state == PERF_EVENT_STATE_OFF)
1003                 __perf_event_mark_enabled(event, ctx);
1004
1005  out:
1006         spin_unlock_irq(&ctx->lock);
1007 }
1008
1009 static int perf_event_refresh(struct perf_event *event, int refresh)
1010 {
1011         /*
1012          * not supported on inherited events
1013          */
1014         if (event->attr.inherit)
1015                 return -EINVAL;
1016
1017         atomic_add(refresh, &event->event_limit);
1018         perf_event_enable(event);
1019
1020         return 0;
1021 }
1022
1023 void __perf_event_sched_out(struct perf_event_context *ctx,
1024                               struct perf_cpu_context *cpuctx)
1025 {
1026         struct perf_event *event;
1027
1028         spin_lock(&ctx->lock);
1029         ctx->is_active = 0;
1030         if (likely(!ctx->nr_events))
1031                 goto out;
1032         update_context_time(ctx);
1033
1034         perf_disable();
1035         if (ctx->nr_active)
1036                 list_for_each_entry(event, &ctx->group_list, group_entry)
1037                         group_sched_out(event, cpuctx, ctx);
1038
1039         perf_enable();
1040  out:
1041         spin_unlock(&ctx->lock);
1042 }
1043
1044 /*
1045  * Test whether two contexts are equivalent, i.e. whether they
1046  * have both been cloned from the same version of the same context
1047  * and they both have the same number of enabled events.
1048  * If the number of enabled events is the same, then the set
1049  * of enabled events should be the same, because these are both
1050  * inherited contexts, therefore we can't access individual events
1051  * in them directly with an fd; we can only enable/disable all
1052  * events via prctl, or enable/disable all events in a family
1053  * via ioctl, which will have the same effect on both contexts.
1054  */
1055 static int context_equiv(struct perf_event_context *ctx1,
1056                          struct perf_event_context *ctx2)
1057 {
1058         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1059                 && ctx1->parent_gen == ctx2->parent_gen
1060                 && !ctx1->pin_count && !ctx2->pin_count;
1061 }
1062
1063 static void __perf_event_read(void *event);
1064
1065 static void __perf_event_sync_stat(struct perf_event *event,
1066                                      struct perf_event *next_event)
1067 {
1068         u64 value;
1069
1070         if (!event->attr.inherit_stat)
1071                 return;
1072
1073         /*
1074          * Update the event value, we cannot use perf_event_read()
1075          * because we're in the middle of a context switch and have IRQs
1076          * disabled, which upsets smp_call_function_single(), however
1077          * we know the event must be on the current CPU, therefore we
1078          * don't need to use it.
1079          */
1080         switch (event->state) {
1081         case PERF_EVENT_STATE_ACTIVE:
1082                 __perf_event_read(event);
1083                 break;
1084
1085         case PERF_EVENT_STATE_INACTIVE:
1086                 update_event_times(event);
1087                 break;
1088
1089         default:
1090                 break;
1091         }
1092
1093         /*
1094          * In order to keep per-task stats reliable we need to flip the event
1095          * values when we flip the contexts.
1096          */
1097         value = atomic64_read(&next_event->count);
1098         value = atomic64_xchg(&event->count, value);
1099         atomic64_set(&next_event->count, value);
1100
1101         swap(event->total_time_enabled, next_event->total_time_enabled);
1102         swap(event->total_time_running, next_event->total_time_running);
1103
1104         /*
1105          * Since we swizzled the values, update the user visible data too.
1106          */
1107         perf_event_update_userpage(event);
1108         perf_event_update_userpage(next_event);
1109 }
1110
/*
 * list_next_entry - get the element following @pos in its list
 * @pos:	pointer to the current element
 * @member:	name of the struct list_head inside the element
 *
 * @pos is parenthesized so that any pointer expression (e.g. &obj or
 * arr[i]) can be passed, not only a plain identifier.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1113
1114 static void perf_event_sync_stat(struct perf_event_context *ctx,
1115                                    struct perf_event_context *next_ctx)
1116 {
1117         struct perf_event *event, *next_event;
1118
1119         if (!ctx->nr_stat)
1120                 return;
1121
1122         event = list_first_entry(&ctx->event_list,
1123                                    struct perf_event, event_entry);
1124
1125         next_event = list_first_entry(&next_ctx->event_list,
1126                                         struct perf_event, event_entry);
1127
1128         while (&event->event_entry != &ctx->event_list &&
1129                &next_event->event_entry != &next_ctx->event_list) {
1130
1131                 __perf_event_sync_stat(event, next_event);
1132
1133                 event = list_next_entry(event, event_entry);
1134                 next_event = list_next_entry(next_event, event_entry);
1135         }
1136 }
1137
1138 /*
1139  * Called from scheduler to remove the events of the current task,
1140  * with interrupts disabled.
1141  *
1142  * We stop each event and update the event value in event->count.
1143  *
1144  * This does not protect us against NMI, but disable()
1145  * sets the disabled bit in the control field of event _before_
1146  * accessing the event control register. If a NMI hits, then it will
1147  * not restart the event.
1148  */
1149 void perf_event_task_sched_out(struct task_struct *task,
1150                                  struct task_struct *next, int cpu)
1151 {
1152         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1153         struct perf_event_context *ctx = task->perf_event_ctxp;
1154         struct perf_event_context *next_ctx;
1155         struct perf_event_context *parent;
1156         struct pt_regs *regs;
1157         int do_switch = 1;
1158
1159         regs = task_pt_regs(task);
1160         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1161
1162         if (likely(!ctx || !cpuctx->task_ctx))
1163                 return;
1164
1165         update_context_time(ctx);
1166
1167         rcu_read_lock();
1168         parent = rcu_dereference(ctx->parent_ctx);
1169         next_ctx = next->perf_event_ctxp;
1170         if (parent && next_ctx &&
1171             rcu_dereference(next_ctx->parent_ctx) == parent) {
1172                 /*
1173                  * Looks like the two contexts are clones, so we might be
1174                  * able to optimize the context switch.  We lock both
1175                  * contexts and check that they are clones under the
1176                  * lock (including re-checking that neither has been
1177                  * uncloned in the meantime).  It doesn't matter which
1178                  * order we take the locks because no other cpu could
1179                  * be trying to lock both of these tasks.
1180                  */
1181                 spin_lock(&ctx->lock);
1182                 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1183                 if (context_equiv(ctx, next_ctx)) {
1184                         /*
1185                          * XXX do we need a memory barrier of sorts
1186                          * wrt to rcu_dereference() of perf_event_ctxp
1187                          */
1188                         task->perf_event_ctxp = next_ctx;
1189                         next->perf_event_ctxp = ctx;
1190                         ctx->task = next;
1191                         next_ctx->task = task;
1192                         do_switch = 0;
1193
1194                         perf_event_sync_stat(ctx, next_ctx);
1195                 }
1196                 spin_unlock(&next_ctx->lock);
1197                 spin_unlock(&ctx->lock);
1198         }
1199         rcu_read_unlock();
1200
1201         if (do_switch) {
1202                 __perf_event_sched_out(ctx, cpuctx);
1203                 cpuctx->task_ctx = NULL;
1204         }
1205 }
1206
1207 /*
1208  * Called with IRQs disabled
1209  */
1210 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1211 {
1212         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1213
1214         if (!cpuctx->task_ctx)
1215                 return;
1216
1217         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1218                 return;
1219
1220         __perf_event_sched_out(ctx, cpuctx);
1221         cpuctx->task_ctx = NULL;
1222 }
1223
1224 /*
1225  * Called with IRQs disabled
1226  */
1227 static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1228 {
1229         __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1230 }
1231
1232 static void
1233 __perf_event_sched_in(struct perf_event_context *ctx,
1234                         struct perf_cpu_context *cpuctx, int cpu)
1235 {
1236         struct perf_event *event;
1237         int can_add_hw = 1;
1238
1239         spin_lock(&ctx->lock);
1240         ctx->is_active = 1;
1241         if (likely(!ctx->nr_events))
1242                 goto out;
1243
1244         ctx->timestamp = perf_clock();
1245
1246         perf_disable();
1247
1248         /*
1249          * First go through the list and put on any pinned groups
1250          * in order to give them the best chance of going on.
1251          */
1252         list_for_each_entry(event, &ctx->group_list, group_entry) {
1253                 if (event->state <= PERF_EVENT_STATE_OFF ||
1254                     !event->attr.pinned)
1255                         continue;
1256                 if (event->cpu != -1 && event->cpu != cpu)
1257                         continue;
1258
1259                 if (group_can_go_on(event, cpuctx, 1))
1260                         group_sched_in(event, cpuctx, ctx, cpu);
1261
1262                 /*
1263                  * If this pinned group hasn't been scheduled,
1264                  * put it in error state.
1265                  */
1266                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1267                         update_group_times(event);
1268                         event->state = PERF_EVENT_STATE_ERROR;
1269                 }
1270         }
1271
1272         list_for_each_entry(event, &ctx->group_list, group_entry) {
1273                 /*
1274                  * Ignore events in OFF or ERROR state, and
1275                  * ignore pinned events since we did them already.
1276                  */
1277                 if (event->state <= PERF_EVENT_STATE_OFF ||
1278                     event->attr.pinned)
1279                         continue;
1280
1281                 /*
1282                  * Listen to the 'cpu' scheduling filter constraint
1283                  * of events:
1284                  */
1285                 if (event->cpu != -1 && event->cpu != cpu)
1286                         continue;
1287
1288                 if (group_can_go_on(event, cpuctx, can_add_hw))
1289                         if (group_sched_in(event, cpuctx, ctx, cpu))
1290                                 can_add_hw = 0;
1291         }
1292         perf_enable();
1293  out:
1294         spin_unlock(&ctx->lock);
1295 }
1296
1297 /*
1298  * Called from scheduler to add the events of the current task
1299  * with interrupts disabled.
1300  *
1301  * We restore the event value and then enable it.
1302  *
1303  * This does not protect us against NMI, but enable()
1304  * sets the enabled bit in the control field of event _before_
1305  * accessing the event control register. If a NMI hits, then it will
1306  * keep the event running.
1307  */
1308 void perf_event_task_sched_in(struct task_struct *task, int cpu)
1309 {
1310         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1311         struct perf_event_context *ctx = task->perf_event_ctxp;
1312
1313         if (likely(!ctx))
1314                 return;
1315         if (cpuctx->task_ctx == ctx)
1316                 return;
1317         __perf_event_sched_in(ctx, cpuctx, cpu);
1318         cpuctx->task_ctx = ctx;
1319 }
1320
1321 static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1322 {
1323         struct perf_event_context *ctx = &cpuctx->ctx;
1324
1325         __perf_event_sched_in(ctx, cpuctx, cpu);
1326 }
1327
1328 #define MAX_INTERRUPTS (~0ULL)
1329
1330 static void perf_log_throttle(struct perf_event *event, int enable);
1331
1332 static void perf_adjust_period(struct perf_event *event, u64 events)
1333 {
1334         struct hw_perf_event *hwc = &event->hw;
1335         u64 period, sample_period;
1336         s64 delta;
1337
1338         events *= hwc->sample_period;
1339         period = div64_u64(events, event->attr.sample_freq);
1340
1341         delta = (s64)(period - hwc->sample_period);
1342         delta = (delta + 7) / 8; /* low pass filter */
1343
1344         sample_period = hwc->sample_period + delta;
1345
1346         if (!sample_period)
1347                 sample_period = 1;
1348
1349         hwc->sample_period = sample_period;
1350 }
1351
1352 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1353 {
1354         struct perf_event *event;
1355         struct hw_perf_event *hwc;
1356         u64 interrupts, freq;
1357
1358         spin_lock(&ctx->lock);
1359         list_for_each_entry(event, &ctx->group_list, group_entry) {
1360                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1361                         continue;
1362
1363                 hwc = &event->hw;
1364
1365                 interrupts = hwc->interrupts;
1366                 hwc->interrupts = 0;
1367
1368                 /*
1369                  * unthrottle events on the tick
1370                  */
1371                 if (interrupts == MAX_INTERRUPTS) {
1372                         perf_log_throttle(event, 1);
1373                         event->pmu->unthrottle(event);
1374                         interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1375                 }
1376
1377                 if (!event->attr.freq || !event->attr.sample_freq)
1378                         continue;
1379
1380                 /*
1381                  * if the specified freq < HZ then we need to skip ticks
1382                  */
1383                 if (event->attr.sample_freq < HZ) {
1384                         freq = event->attr.sample_freq;
1385
1386                         hwc->freq_count += freq;
1387                         hwc->freq_interrupts += interrupts;
1388
1389                         if (hwc->freq_count < HZ)
1390                                 continue;
1391
1392                         interrupts = hwc->freq_interrupts;
1393                         hwc->freq_interrupts = 0;
1394                         hwc->freq_count -= HZ;
1395                 } else
1396                         freq = HZ;
1397
1398                 perf_adjust_period(event, freq * interrupts);
1399
1400                 /*
1401                  * In order to avoid being stalled by an (accidental) huge
1402                  * sample period, force reset the sample period if we didn't
1403                  * get any events in this freq period.
1404                  */
1405                 if (!interrupts) {
1406                         perf_disable();
1407                         event->pmu->disable(event);
1408                         atomic64_set(&hwc->period_left, 0);
1409                         event->pmu->enable(event);
1410                         perf_enable();
1411                 }
1412         }
1413         spin_unlock(&ctx->lock);
1414 }
1415
1416 /*
1417  * Round-robin a context's events:
1418  */
1419 static void rotate_ctx(struct perf_event_context *ctx)
1420 {
1421         struct perf_event *event;
1422
1423         if (!ctx->nr_events)
1424                 return;
1425
1426         spin_lock(&ctx->lock);
1427         /*
1428          * Rotate the first entry last (works just fine for group events too):
1429          */
1430         perf_disable();
1431         list_for_each_entry(event, &ctx->group_list, group_entry) {
1432                 list_move_tail(&event->group_entry, &ctx->group_list);
1433                 break;
1434         }
1435         perf_enable();
1436
1437         spin_unlock(&ctx->lock);
1438 }
1439
1440 void perf_event_task_tick(struct task_struct *curr, int cpu)
1441 {
1442         struct perf_cpu_context *cpuctx;
1443         struct perf_event_context *ctx;
1444
1445         if (!atomic_read(&nr_events))
1446                 return;
1447
1448         cpuctx = &per_cpu(perf_cpu_context, cpu);
1449         ctx = curr->perf_event_ctxp;
1450
1451         perf_ctx_adjust_freq(&cpuctx->ctx);
1452         if (ctx)
1453                 perf_ctx_adjust_freq(ctx);
1454
1455         perf_event_cpu_sched_out(cpuctx);
1456         if (ctx)
1457                 __perf_event_task_sched_out(ctx);
1458
1459         rotate_ctx(&cpuctx->ctx);
1460         if (ctx)
1461                 rotate_ctx(ctx);
1462
1463         perf_event_cpu_sched_in(cpuctx, cpu);
1464         if (ctx)
1465                 perf_event_task_sched_in(curr, cpu);
1466 }
1467
1468 /*
1469  * Enable all of a task's events that have been marked enable-on-exec.
1470  * This expects task == current.
1471  */
1472 static void perf_event_enable_on_exec(struct task_struct *task)
1473 {
1474         struct perf_event_context *ctx;
1475         struct perf_event *event;
1476         unsigned long flags;
1477         int enabled = 0;
1478
1479         local_irq_save(flags);
1480         ctx = task->perf_event_ctxp;
1481         if (!ctx || !ctx->nr_events)
1482                 goto out;
1483
1484         __perf_event_task_sched_out(ctx);
1485
1486         spin_lock(&ctx->lock);
1487
1488         list_for_each_entry(event, &ctx->group_list, group_entry) {
1489                 if (!event->attr.enable_on_exec)
1490                         continue;
1491                 event->attr.enable_on_exec = 0;
1492                 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1493                         continue;
1494                 __perf_event_mark_enabled(event, ctx);
1495                 enabled = 1;
1496         }
1497
1498         /*
1499          * Unclone this context if we enabled any event.
1500          */
1501         if (enabled)
1502                 unclone_ctx(ctx);
1503
1504         spin_unlock(&ctx->lock);
1505
1506         perf_event_task_sched_in(task, smp_processor_id());
1507  out:
1508         local_irq_restore(flags);
1509 }
1510
1511 /*
1512  * Cross CPU call to read the hardware event
1513  */
1514 static void __perf_event_read(void *info)
1515 {
1516         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517         struct perf_event *event = info;
1518         struct perf_event_context *ctx = event->ctx;
1519         unsigned long flags;
1520
1521         /*
1522          * If this is a task context, we need to check whether it is
1523          * the current task context of this cpu.  If not it has been
1524          * scheduled out before the smp call arrived.  In that case
1525          * event->count would have been updated to a recent sample
1526          * when the event was scheduled out.
1527          */
1528         if (ctx->task && cpuctx->task_ctx != ctx)
1529                 return;
1530
1531         local_irq_save(flags);
1532         if (ctx->is_active)
1533                 update_context_time(ctx);
1534         event->pmu->read(event);
1535         update_event_times(event);
1536         local_irq_restore(flags);
1537 }
1538
1539 static u64 perf_event_read(struct perf_event *event)
1540 {
1541         /*
1542          * If event is enabled and currently active on a CPU, update the
1543          * value in the event structure:
1544          */
1545         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1546                 smp_call_function_single(event->oncpu,
1547                                          __perf_event_read, event, 1);
1548         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1549                 update_event_times(event);
1550         }
1551
1552         return atomic64_read(&event->count);
1553 }
1554
1555 /*
1556  * Initialize the perf_event context in a task_struct:
1557  */
1558 static void
1559 __perf_event_init_context(struct perf_event_context *ctx,
1560                             struct task_struct *task)
1561 {
1562         memset(ctx, 0, sizeof(*ctx));
1563         spin_lock_init(&ctx->lock);
1564         mutex_init(&ctx->mutex);
1565         INIT_LIST_HEAD(&ctx->group_list);
1566         INIT_LIST_HEAD(&ctx->event_list);
1567         atomic_set(&ctx->refcount, 1);
1568         ctx->task = task;
1569 }
1570
1571 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1572 {
1573         struct perf_event_context *ctx;
1574         struct perf_cpu_context *cpuctx;
1575         struct task_struct *task;
1576         unsigned long flags;
1577         int err;
1578
1579         /*
1580          * If cpu is not a wildcard then this is a percpu event:
1581          */
1582         if (cpu != -1) {
1583                 /* Must be root to operate on a CPU event: */
1584                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1585                         return ERR_PTR(-EACCES);
1586
1587                 if (cpu < 0 || cpu > num_possible_cpus())
1588                         return ERR_PTR(-EINVAL);
1589
1590                 /*
1591                  * We could be clever and allow to attach a event to an
1592                  * offline CPU and activate it when the CPU comes up, but
1593                  * that's for later.
1594                  */
1595                 if (!cpu_isset(cpu, cpu_online_map))
1596                         return ERR_PTR(-ENODEV);
1597
1598                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1599                 ctx = &cpuctx->ctx;
1600                 get_ctx(ctx);
1601
1602                 return ctx;
1603         }
1604
1605         rcu_read_lock();
1606         if (!pid)
1607                 task = current;
1608         else
1609                 task = find_task_by_vpid(pid);
1610         if (task)
1611                 get_task_struct(task);
1612         rcu_read_unlock();
1613
1614         if (!task)
1615                 return ERR_PTR(-ESRCH);
1616
1617         /*
1618          * Can't attach events to a dying task.
1619          */
1620         err = -ESRCH;
1621         if (task->flags & PF_EXITING)
1622                 goto errout;
1623
1624         /* Reuse ptrace permission checks for now. */
1625         err = -EACCES;
1626         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1627                 goto errout;
1628
1629  retry:
1630         ctx = perf_lock_task_context(task, &flags);
1631         if (ctx) {
1632                 unclone_ctx(ctx);
1633                 spin_unlock_irqrestore(&ctx->lock, flags);
1634         }
1635
1636         if (!ctx) {
1637                 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1638                 err = -ENOMEM;
1639                 if (!ctx)
1640                         goto errout;
1641                 __perf_event_init_context(ctx, task);
1642                 get_ctx(ctx);
1643                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1644                         /*
1645                          * We raced with some other task; use
1646                          * the context they set.
1647                          */
1648                         kfree(ctx);
1649                         goto retry;
1650                 }
1651                 get_task_struct(task);
1652         }
1653
1654         put_task_struct(task);
1655         return ctx;
1656
1657  errout:
1658         put_task_struct(task);
1659         return ERR_PTR(err);
1660 }
1661
1662 static void perf_event_free_filter(struct perf_event *event);
1663
1664 static void free_event_rcu(struct rcu_head *head)
1665 {
1666         struct perf_event *event;
1667
1668         event = container_of(head, struct perf_event, rcu_head);
1669         if (event->ns)
1670                 put_pid_ns(event->ns);
1671         perf_event_free_filter(event);
1672         kfree(event);
1673 }
1674
1675 static void perf_pending_sync(struct perf_event *event);
1676
1677 static void free_event(struct perf_event *event)
1678 {
1679         perf_pending_sync(event);
1680
1681         if (!event->parent) {
1682                 atomic_dec(&nr_events);
1683                 if (event->attr.mmap)
1684                         atomic_dec(&nr_mmap_events);
1685                 if (event->attr.comm)
1686                         atomic_dec(&nr_comm_events);
1687                 if (event->attr.task)
1688                         atomic_dec(&nr_task_events);
1689         }
1690
1691         if (event->output) {
1692                 fput(event->output->filp);
1693                 event->output = NULL;
1694         }
1695
1696         if (event->destroy)
1697                 event->destroy(event);
1698
1699         put_ctx(event->ctx);
1700         call_rcu(&event->rcu_head, free_event_rcu);
1701 }
1702
1703 /*
1704  * Called when the last reference to the file is gone.
1705  */
1706 static int perf_release(struct inode *inode, struct file *file)
1707 {
1708         struct perf_event *event = file->private_data;
1709         struct perf_event_context *ctx = event->ctx;
1710
1711         file->private_data = NULL;
1712
1713         WARN_ON_ONCE(ctx->parent_ctx);
1714         mutex_lock(&ctx->mutex);
1715         perf_event_remove_from_context(event);
1716         mutex_unlock(&ctx->mutex);
1717
1718         mutex_lock(&event->owner->perf_event_mutex);
1719         list_del_init(&event->owner_entry);
1720         mutex_unlock(&event->owner->perf_event_mutex);
1721         put_task_struct(event->owner);
1722
1723         free_event(event);
1724
1725         return 0;
1726 }
1727
1728 static int perf_event_read_size(struct perf_event *event)
1729 {
1730         int entry = sizeof(u64); /* value */
1731         int size = 0;
1732         int nr = 1;
1733
1734         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1735                 size += sizeof(u64);
1736
1737         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1738                 size += sizeof(u64);
1739
1740         if (event->attr.read_format & PERF_FORMAT_ID)
1741                 entry += sizeof(u64);
1742
1743         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1744                 nr += event->group_leader->nr_siblings;
1745                 size += sizeof(u64);
1746         }
1747
1748         size += entry * nr;
1749
1750         return size;
1751 }
1752
1753 static u64 perf_event_read_value(struct perf_event *event)
1754 {
1755         struct perf_event *child;
1756         u64 total = 0;
1757
1758         total += perf_event_read(event);
1759         list_for_each_entry(child, &event->child_list, child_list)
1760                 total += perf_event_read(child);
1761
1762         return total;
1763 }
1764
1765 static int perf_event_read_entry(struct perf_event *event,
1766                                    u64 read_format, char __user *buf)
1767 {
1768         int n = 0, count = 0;
1769         u64 values[2];
1770
1771         values[n++] = perf_event_read_value(event);
1772         if (read_format & PERF_FORMAT_ID)
1773                 values[n++] = primary_event_id(event);
1774
1775         count = n * sizeof(u64);
1776
1777         if (copy_to_user(buf, values, count))
1778                 return -EFAULT;
1779
1780         return count;
1781 }
1782
1783 static int perf_event_read_group(struct perf_event *event,
1784                                    u64 read_format, char __user *buf)
1785 {
1786         struct perf_event *leader = event->group_leader, *sub;
1787         int n = 0, size = 0, err = -EFAULT;
1788         u64 values[3];
1789
1790         values[n++] = 1 + leader->nr_siblings;
1791         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1792                 values[n++] = leader->total_time_enabled +
1793                         atomic64_read(&leader->child_total_time_enabled);
1794         }
1795         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1796                 values[n++] = leader->total_time_running +
1797                         atomic64_read(&leader->child_total_time_running);
1798         }
1799
1800         size = n * sizeof(u64);
1801
1802         if (copy_to_user(buf, values, size))
1803                 return -EFAULT;
1804
1805         err = perf_event_read_entry(leader, read_format, buf + size);
1806         if (err < 0)
1807                 return err;
1808
1809         size += err;
1810
1811         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1812                 err = perf_event_read_entry(sub, read_format,
1813                                 buf + size);
1814                 if (err < 0)
1815                         return err;
1816
1817                 size += err;
1818         }
1819
1820         return size;
1821 }
1822
1823 static int perf_event_read_one(struct perf_event *event,
1824                                  u64 read_format, char __user *buf)
1825 {
1826         u64 values[4];
1827         int n = 0;
1828
1829         values[n++] = perf_event_read_value(event);
1830         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1831                 values[n++] = event->total_time_enabled +
1832                         atomic64_read(&event->child_total_time_enabled);
1833         }
1834         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1835                 values[n++] = event->total_time_running +
1836                         atomic64_read(&event->child_total_time_running);
1837         }
1838         if (read_format & PERF_FORMAT_ID)
1839                 values[n++] = primary_event_id(event);
1840
1841         if (copy_to_user(buf, values, n * sizeof(u64)))
1842                 return -EFAULT;
1843
1844         return n * sizeof(u64);
1845 }
1846
1847 /*
1848  * Read the performance event - simple non blocking version for now
1849  */
1850 static ssize_t
1851 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1852 {
1853         u64 read_format = event->attr.read_format;
1854         int ret;
1855
1856         /*
1857          * Return end-of-file for a read on a event that is in
1858          * error state (i.e. because it was pinned but it couldn't be
1859          * scheduled on to the CPU at some point).
1860          */
1861         if (event->state == PERF_EVENT_STATE_ERROR)
1862                 return 0;
1863
1864         if (count < perf_event_read_size(event))
1865                 return -ENOSPC;
1866
1867         WARN_ON_ONCE(event->ctx->parent_ctx);
1868         mutex_lock(&event->child_mutex);
1869         if (read_format & PERF_FORMAT_GROUP)
1870                 ret = perf_event_read_group(event, read_format, buf);
1871         else
1872                 ret = perf_event_read_one(event, read_format, buf);
1873         mutex_unlock(&event->child_mutex);
1874
1875         return ret;
1876 }
1877
1878 static ssize_t
1879 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1880 {
1881         struct perf_event *event = file->private_data;
1882
1883         return perf_read_hw(event, buf, count);
1884 }
1885
1886 static unsigned int perf_poll(struct file *file, poll_table *wait)
1887 {
1888         struct perf_event *event = file->private_data;
1889         struct perf_mmap_data *data;
1890         unsigned int events = POLL_HUP;
1891
1892         rcu_read_lock();
1893         data = rcu_dereference(event->data);
1894         if (data)
1895                 events = atomic_xchg(&data->poll, 0);
1896         rcu_read_unlock();
1897
1898         poll_wait(file, &event->waitq, wait);
1899
1900         return events;
1901 }
1902
1903 static void perf_event_reset(struct perf_event *event)
1904 {
1905         (void)perf_event_read(event);
1906         atomic64_set(&event->count, 0);
1907         perf_event_update_userpage(event);
1908 }
1909
1910 /*
1911  * Holding the top-level event's child_mutex means that any
1912  * descendant process that has inherited this event will block
1913  * in sync_child_event if it goes to exit, thus satisfying the
1914  * task existence requirements of perf_event_enable/disable.
1915  */
1916 static void perf_event_for_each_child(struct perf_event *event,
1917                                         void (*func)(struct perf_event *))
1918 {
1919         struct perf_event *child;
1920
1921         WARN_ON_ONCE(event->ctx->parent_ctx);
1922         mutex_lock(&event->child_mutex);
1923         func(event);
1924         list_for_each_entry(child, &event->child_list, child_list)
1925                 func(child);
1926         mutex_unlock(&event->child_mutex);
1927 }
1928
1929 static void perf_event_for_each(struct perf_event *event,
1930                                   void (*func)(struct perf_event *))
1931 {
1932         struct perf_event_context *ctx = event->ctx;
1933         struct perf_event *sibling;
1934
1935         WARN_ON_ONCE(ctx->parent_ctx);
1936         mutex_lock(&ctx->mutex);
1937         event = event->group_leader;
1938
1939         perf_event_for_each_child(event, func);
1940         func(event);
1941         list_for_each_entry(sibling, &event->sibling_list, group_entry)
1942                 perf_event_for_each_child(event, func);
1943         mutex_unlock(&ctx->mutex);
1944 }
1945
1946 static int perf_event_period(struct perf_event *event, u64 __user *arg)
1947 {
1948         struct perf_event_context *ctx = event->ctx;
1949         unsigned long size;
1950         int ret = 0;
1951         u64 value;
1952
1953         if (!event->attr.sample_period)
1954                 return -EINVAL;
1955
1956         size = copy_from_user(&value, arg, sizeof(value));
1957         if (size != sizeof(value))
1958                 return -EFAULT;
1959
1960         if (!value)
1961                 return -EINVAL;
1962
1963         spin_lock_irq(&ctx->lock);
1964         if (event->attr.freq) {
1965                 if (value > sysctl_perf_event_sample_rate) {
1966                         ret = -EINVAL;
1967                         goto unlock;
1968                 }
1969
1970                 event->attr.sample_freq = value;
1971         } else {
1972                 event->attr.sample_period = value;
1973                 event->hw.sample_period = value;
1974         }
1975 unlock:
1976         spin_unlock_irq(&ctx->lock);
1977
1978         return ret;
1979 }
1980
1981 static int perf_event_set_output(struct perf_event *event, int output_fd);
1982 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1983
1984 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1985 {
1986         struct perf_event *event = file->private_data;
1987         void (*func)(struct perf_event *);
1988         u32 flags = arg;
1989
1990         switch (cmd) {
1991         case PERF_EVENT_IOC_ENABLE:
1992                 func = perf_event_enable;
1993                 break;
1994         case PERF_EVENT_IOC_DISABLE:
1995                 func = perf_event_disable;
1996                 break;
1997         case PERF_EVENT_IOC_RESET:
1998                 func = perf_event_reset;
1999                 break;
2000
2001         case PERF_EVENT_IOC_REFRESH:
2002                 return perf_event_refresh(event, arg);
2003
2004         case PERF_EVENT_IOC_PERIOD:
2005                 return perf_event_period(event, (u64 __user *)arg);
2006
2007         case PERF_EVENT_IOC_SET_OUTPUT:
2008                 return perf_event_set_output(event, arg);
2009
2010         case PERF_EVENT_IOC_SET_FILTER:
2011                 return perf_event_set_filter(event, (void __user *)arg);
2012
2013         default:
2014                 return -ENOTTY;
2015         }
2016
2017         if (flags & PERF_IOC_FLAG_GROUP)
2018                 perf_event_for_each(event, func);
2019         else
2020                 perf_event_for_each_child(event, func);
2021
2022         return 0;
2023 }
2024
2025 int perf_event_task_enable(void)
2026 {
2027         struct perf_event *event;
2028
2029         mutex_lock(&current->perf_event_mutex);
2030         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2031                 perf_event_for_each_child(event, perf_event_enable);
2032         mutex_unlock(&current->perf_event_mutex);
2033
2034         return 0;
2035 }
2036
2037 int perf_event_task_disable(void)
2038 {
2039         struct perf_event *event;
2040
2041         mutex_lock(&current->perf_event_mutex);
2042         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2043                 perf_event_for_each_child(event, perf_event_disable);
2044         mutex_unlock(&current->perf_event_mutex);
2045
2046         return 0;
2047 }
2048
2049 #ifndef PERF_EVENT_INDEX_OFFSET
2050 # define PERF_EVENT_INDEX_OFFSET 0
2051 #endif
2052
2053 static int perf_event_index(struct perf_event *event)
2054 {
2055         if (event->state != PERF_EVENT_STATE_ACTIVE)
2056                 return 0;
2057
2058         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2059 }
2060
2061 /*
2062  * Callers need to ensure there can be no nesting of this function, otherwise
2063  * the seqlock logic goes bad. We can not serialize this because the arch
2064  * code calls this from NMI context.
2065  */
2066 void perf_event_update_userpage(struct perf_event *event)
2067 {
2068         struct perf_event_mmap_page *userpg;
2069         struct perf_mmap_data *data;
2070
2071         rcu_read_lock();
2072         data = rcu_dereference(event->data);
2073         if (!data)
2074                 goto unlock;
2075
2076         userpg = data->user_page;
2077
2078         /*
2079          * Disable preemption so as to not let the corresponding user-space
2080          * spin too long if we get preempted.
2081          */
2082         preempt_disable();
2083         ++userpg->lock;
2084         barrier();
2085         userpg->index = perf_event_index(event);
2086         userpg->offset = atomic64_read(&event->count);
2087         if (event->state == PERF_EVENT_STATE_ACTIVE)
2088                 userpg->offset -= atomic64_read(&event->hw.prev_count);
2089
2090         userpg->time_enabled = event->total_time_enabled +
2091                         atomic64_read(&event->child_total_time_enabled);
2092
2093         userpg->time_running = event->total_time_running +
2094                         atomic64_read(&event->child_total_time_running);
2095
2096         barrier();
2097         ++userpg->lock;
2098         preempt_enable();
2099 unlock:
2100         rcu_read_unlock();
2101 }
2102
2103 static unsigned long perf_data_size(struct perf_mmap_data *data)
2104 {
2105         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2106 }
2107
2108 #ifndef CONFIG_PERF_USE_VMALLOC
2109
2110 /*
2111  * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2112  */
2113
2114 static struct page *
2115 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2116 {
2117         if (pgoff > data->nr_pages)
2118                 return NULL;
2119
2120         if (pgoff == 0)
2121                 return virt_to_page(data->user_page);
2122
2123         return virt_to_page(data->data_pages[pgoff - 1]);
2124 }
2125
2126 static struct perf_mmap_data *
2127 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2128 {
2129         struct perf_mmap_data *data;
2130         unsigned long size;
2131         int i;
2132
2133         WARN_ON(atomic_read(&event->mmap_count));
2134
2135         size = sizeof(struct perf_mmap_data);
2136         size += nr_pages * sizeof(void *);
2137
2138         data = kzalloc(size, GFP_KERNEL);
2139         if (!data)
2140                 goto fail;
2141
2142         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2143         if (!data->user_page)
2144                 goto fail_user_page;
2145
2146         for (i = 0; i < nr_pages; i++) {
2147                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2148                 if (!data->data_pages[i])
2149                         goto fail_data_pages;
2150         }
2151
2152         data->data_order = 0;
2153         data->nr_pages = nr_pages;
2154
2155         return data;
2156
2157 fail_data_pages:
2158         for (i--; i >= 0; i--)
2159                 free_page((unsigned long)data->data_pages[i]);
2160
2161         free_page((unsigned long)data->user_page);
2162
2163 fail_user_page:
2164         kfree(data);
2165
2166 fail:
2167         return NULL;
2168 }
2169
2170 static void perf_mmap_free_page(unsigned long addr)
2171 {
2172         struct page *page = virt_to_page((void *)addr);
2173
2174         page->mapping = NULL;
2175         __free_page(page);
2176 }
2177
2178 static void perf_mmap_data_free(struct perf_mmap_data *data)
2179 {
2180         int i;
2181
2182         perf_mmap_free_page((unsigned long)data->user_page);
2183         for (i = 0; i < data->nr_pages; i++)
2184                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2185 }
2186
2187 #else
2188
2189 /*
2190  * Back perf_mmap() with vmalloc memory.
2191  *
2192  * Required for architectures that have d-cache aliasing issues.
2193  */
2194
2195 static struct page *
2196 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2197 {
2198         if (pgoff > (1UL << data->data_order))
2199                 return NULL;
2200
2201         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2202 }
2203
2204 static void perf_mmap_unmark_page(void *addr)
2205 {
2206         struct page *page = vmalloc_to_page(addr);
2207
2208         page->mapping = NULL;
2209 }
2210
2211 static void perf_mmap_data_free_work(struct work_struct *work)
2212 {
2213         struct perf_mmap_data *data;
2214         void *base;
2215         int i, nr;
2216
2217         data = container_of(work, struct perf_mmap_data, work);
2218         nr = 1 << data->data_order;
2219
2220         base = data->user_page;
2221         for (i = 0; i < nr + 1; i++)
2222                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2223
2224         vfree(base);
2225 }
2226
2227 static void perf_mmap_data_free(struct perf_mmap_data *data)
2228 {
2229         schedule_work(&data->work);
2230 }
2231
2232 static struct perf_mmap_data *
2233 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2234 {
2235         struct perf_mmap_data *data;
2236         unsigned long size;
2237         void *all_buf;
2238
2239         WARN_ON(atomic_read(&event->mmap_count));
2240
2241         size = sizeof(struct perf_mmap_data);
2242         size += sizeof(void *);
2243
2244         data = kzalloc(size, GFP_KERNEL);
2245         if (!data)
2246                 goto fail;
2247
2248         INIT_WORK(&data->work, perf_mmap_data_free_work);
2249
2250         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2251         if (!all_buf)
2252                 goto fail_all_buf;
2253
2254         data->user_page = all_buf;
2255         data->data_pages[0] = all_buf + PAGE_SIZE;
2256         data->data_order = ilog2(nr_pages);
2257         data->nr_pages = 1;
2258
2259         return data;
2260
2261 fail_all_buf:
2262         kfree(data);
2263
2264 fail:
2265         return NULL;
2266 }
2267
2268 #endif
2269
2270 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2271 {
2272         struct perf_event *event = vma->vm_file->private_data;
2273         struct perf_mmap_data *data;
2274         int ret = VM_FAULT_SIGBUS;
2275
2276         if (vmf->flags & FAULT_FLAG_MKWRITE) {
2277                 if (vmf->pgoff == 0)
2278                         ret = 0;
2279                 return ret;
2280         }
2281
2282         rcu_read_lock();
2283         data = rcu_dereference(event->data);
2284         if (!data)
2285                 goto unlock;
2286
2287         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2288                 goto unlock;
2289
2290         vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2291         if (!vmf->page)
2292                 goto unlock;
2293
2294         get_page(vmf->page);
2295         vmf->page->mapping = vma->vm_file->f_mapping;
2296         vmf->page->index   = vmf->pgoff;
2297
2298         ret = 0;
2299 unlock:
2300         rcu_read_unlock();
2301
2302         return ret;
2303 }
2304
2305 static void
2306 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 {
2308         long max_size = perf_data_size(data);
2309
2310         atomic_set(&data->lock, -1);
2311
2312         if (event->attr.watermark) {
2313                 data->watermark = min_t(long, max_size,
2314                                         event->attr.wakeup_watermark);
2315         }
2316
2317         if (!data->watermark)
2318                 data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2319
2320
2321         rcu_assign_pointer(event->data, data);
2322 }
2323
2324 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2325 {
2326         struct perf_mmap_data *data;
2327
2328         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2329         perf_mmap_data_free(data);
2330         kfree(data);
2331 }
2332
2333 static void perf_mmap_data_release(struct perf_event *event)
2334 {
2335         struct perf_mmap_data *data = event->data;
2336
2337         WARN_ON(atomic_read(&event->mmap_count));
2338
2339         rcu_assign_pointer(event->data, NULL);
2340         call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2341 }
2342
2343 static void perf_mmap_open(struct vm_area_struct *vma)
2344 {
2345         struct perf_event *event = vma->vm_file->private_data;
2346
2347         atomic_inc(&event->mmap_count);
2348 }
2349
2350 static void perf_mmap_close(struct vm_area_struct *vma)
2351 {
2352         struct perf_event *event = vma->vm_file->private_data;
2353
2354         WARN_ON_ONCE(event->ctx->parent_ctx);
2355         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2356                 unsigned long size = perf_data_size(event->data);
2357                 struct user_struct *user = current_user();
2358
2359                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2360                 vma->vm_mm->locked_vm -= event->data->nr_locked;
2361                 perf_mmap_data_release(event);
2362                 mutex_unlock(&event->mmap_mutex);
2363         }
2364 }
2365
2366 static const struct vm_operations_struct perf_mmap_vmops = {
2367         .open           = perf_mmap_open,
2368         .close          = perf_mmap_close,
2369         .fault          = perf_mmap_fault,
2370         .page_mkwrite   = perf_mmap_fault,
2371 };
2372
2373 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2374 {
2375         struct perf_event *event = file->private_data;
2376         unsigned long user_locked, user_lock_limit;
2377         struct user_struct *user = current_user();
2378         unsigned long locked, lock_limit;
2379         struct perf_mmap_data *data;
2380         unsigned long vma_size;
2381         unsigned long nr_pages;
2382         long user_extra, extra;
2383         int ret = 0;
2384
2385         if (!(vma->vm_flags & VM_SHARED))
2386                 return -EINVAL;
2387
2388         vma_size = vma->vm_end - vma->vm_start;
2389         nr_pages = (vma_size / PAGE_SIZE) - 1;
2390
2391         /*
2392          * If we have data pages ensure they're a power-of-two number, so we
2393          * can do bitmasks instead of modulo.
2394          */
2395         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2396                 return -EINVAL;
2397
2398         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2399                 return -EINVAL;
2400
2401         if (vma->vm_pgoff != 0)
2402                 return -EINVAL;
2403
2404         WARN_ON_ONCE(event->ctx->parent_ctx);
2405         mutex_lock(&event->mmap_mutex);
2406         if (event->output) {
2407                 ret = -EINVAL;
2408                 goto unlock;
2409         }
2410
2411         if (atomic_inc_not_zero(&event->mmap_count)) {
2412                 if (nr_pages != event->data->nr_pages)
2413                         ret = -EINVAL;
2414                 goto unlock;
2415         }
2416
2417         user_extra = nr_pages + 1;
2418         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2419
2420         /*
2421          * Increase the limit linearly with more CPUs:
2422          */
2423         user_lock_limit *= num_online_cpus();
2424
2425         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2426
2427         extra = 0;
2428         if (user_locked > user_lock_limit)
2429                 extra = user_locked - user_lock_limit;
2430
2431         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2432         lock_limit >>= PAGE_SHIFT;
2433         locked = vma->vm_mm->locked_vm + extra;
2434
2435         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2436                 !capable(CAP_IPC_LOCK)) {
2437                 ret = -EPERM;
2438                 goto unlock;
2439         }
2440
2441         WARN_ON(event->data);
2442
2443         data = perf_mmap_data_alloc(event, nr_pages);
2444         ret = -ENOMEM;
2445         if (!data)
2446                 goto unlock;
2447
2448         ret = 0;
2449         perf_mmap_data_init(event, data);
2450
2451         atomic_set(&event->mmap_count, 1);
2452         atomic_long_add(user_extra, &user->locked_vm);
2453         vma->vm_mm->locked_vm += extra;
2454         event->data->nr_locked = extra;
2455         if (vma->vm_flags & VM_WRITE)
2456                 event->data->writable = 1;
2457
2458 unlock:
2459         mutex_unlock(&event->mmap_mutex);
2460
2461         vma->vm_flags |= VM_RESERVED;
2462         vma->vm_ops = &perf_mmap_vmops;
2463
2464         return ret;
2465 }
2466
2467 static int perf_fasync(int fd, struct file *filp, int on)
2468 {
2469         struct inode *inode = filp->f_path.dentry->d_inode;
2470         struct perf_event *event = filp->private_data;
2471         int retval;
2472
2473         mutex_lock(&inode->i_mutex);
2474         retval = fasync_helper(fd, filp, on, &event->fasync);
2475         mutex_unlock(&inode->i_mutex);
2476
2477         if (retval < 0)
2478                 return retval;
2479
2480         return 0;
2481 }
2482
2483 static const struct file_operations perf_fops = {
2484         .release                = perf_release,
2485         .read                   = perf_read,
2486         .poll                   = perf_poll,
2487         .unlocked_ioctl         = perf_ioctl,
2488         .compat_ioctl           = perf_ioctl,
2489         .mmap                   = perf_mmap,
2490         .fasync                 = perf_fasync,
2491 };
2492
2493 /*
2494  * Perf event wakeup
2495  *
2496  * If there's data, ensure we set the poll() state and publish everything
2497  * to user-space before waking everybody up.
2498  */
2499
2500 void perf_event_wakeup(struct perf_event *event)
2501 {
2502         wake_up_all(&event->waitq);
2503
2504         if (event->pending_kill) {
2505                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2506                 event->pending_kill = 0;
2507         }
2508 }
2509
2510 /*
2511  * Pending wakeups
2512  *
2513  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2514  *
2515  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2516  * single linked list and use cmpxchg() to add entries lockless.
2517  */
2518
2519 static void perf_pending_event(struct perf_pending_entry *entry)
2520 {
2521         struct perf_event *event = container_of(entry,
2522                         struct perf_event, pending);
2523
2524         if (event->pending_disable) {
2525                 event->pending_disable = 0;
2526                 __perf_event_disable(event);
2527         }
2528
2529         if (event->pending_wakeup) {
2530                 event->pending_wakeup = 0;
2531                 perf_event_wakeup(event);
2532         }
2533 }
2534
2535 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2536
2537 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2538         PENDING_TAIL,
2539 };
2540
2541 static void perf_pending_queue(struct perf_pending_entry *entry,
2542                                void (*func)(struct perf_pending_entry *))
2543 {
2544         struct perf_pending_entry **head;
2545
2546         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2547                 return;
2548
2549         entry->func = func;
2550
2551         head = &get_cpu_var(perf_pending_head);
2552
2553         do {
2554                 entry->next = *head;
2555         } while (cmpxchg(head, entry->next, entry) != entry->next);
2556
2557         set_perf_event_pending();
2558
2559         put_cpu_var(perf_pending_head);
2560 }
2561
2562 static int __perf_pending_run(void)
2563 {
2564         struct perf_pending_entry *list;
2565         int nr = 0;
2566
2567         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2568         while (list != PENDING_TAIL) {
2569                 void (*func)(struct perf_pending_entry *);
2570                 struct perf_pending_entry *entry = list;
2571
2572                 list = list->next;
2573
2574                 func = entry->func;
2575                 entry->next = NULL;
2576                 /*
2577                  * Ensure we observe the unqueue before we issue the wakeup,
2578                  * so that we won't be waiting forever.
2579                  * -- see perf_not_pending().
2580                  */
2581                 smp_wmb();
2582
2583                 func(entry);
2584                 nr++;
2585         }
2586
2587         return nr;
2588 }
2589
2590 static inline int perf_not_pending(struct perf_event *event)
2591 {
2592         /*
2593          * If we flush on whatever cpu we run, there is a chance we don't
2594          * need to wait.
2595          */
2596         get_cpu();
2597         __perf_pending_run();
2598         put_cpu();
2599
2600         /*
2601          * Ensure we see the proper queue state before going to sleep
2602          * so that we do not miss the wakeup. -- see perf_pending_handle()
2603          */
2604         smp_rmb();
2605         return event->pending.next == NULL;
2606 }
2607
2608 static void perf_pending_sync(struct perf_event *event)
2609 {
2610         wait_event(event->waitq, perf_not_pending(event));
2611 }
2612
/*
 * Run all pending entries queued on the current CPU; the number of
 * entries handled is not reported to the caller.
 */
void perf_event_do_pending(void)
{
	(void)__perf_pending_run();
}
2617
2618 /*
2619  * Callchain support -- arch specific
2620  */
2621
2622 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2623 {
2624         return NULL;
2625 }
2626
2627 /*
2628  * Output
2629  */
2630 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2631                               unsigned long offset, unsigned long head)
2632 {
2633         unsigned long mask;
2634
2635         if (!data->writable)
2636                 return true;
2637
2638         mask = perf_data_size(data) - 1;
2639
2640         offset = (offset - tail) & mask;
2641         head   = (head   - tail) & mask;
2642
2643         if ((int)(head - offset) < 0)
2644                 return false;
2645
2646         return true;
2647 }
2648
2649 static void perf_output_wakeup(struct perf_output_handle *handle)
2650 {
2651         atomic_set(&handle->data->poll, POLL_IN);
2652
2653         if (handle->nmi) {
2654                 handle->event->pending_wakeup = 1;
2655                 perf_pending_queue(&handle->event->pending,
2656                                    perf_pending_event);
2657         } else
2658                 perf_event_wakeup(handle->event);
2659 }
2660
2661 /*
2662  * Curious locking construct.
2663  *
2664  * We need to ensure a later event_id doesn't publish a head when a former
2665  * event_id isn't done writing. However since we need to deal with NMIs we
2666  * cannot fully serialize things.
2667  *
2668  * What we do is serialize between CPUs so we only have to deal with NMI
2669  * nesting on a single CPU.
2670  *
2671  * We only publish the head (and generate a wakeup) when the outer-most
2672  * event_id completes.
2673  */
2674 static void perf_output_lock(struct perf_output_handle *handle)
2675 {
2676         struct perf_mmap_data *data = handle->data;
2677         int cpu;
2678
2679         handle->locked = 0;
2680
2681         local_irq_save(handle->flags);
2682         cpu = smp_processor_id();
2683
2684         if (in_nmi() && atomic_read(&data->lock) == cpu)
2685                 return;
2686
2687         while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2688                 cpu_relax();
2689
2690         handle->locked = 1;
2691 }
2692
2693 static void perf_output_unlock(struct perf_output_handle *handle)
2694 {
2695         struct perf_mmap_data *data = handle->data;
2696         unsigned long head;
2697         int cpu;
2698
2699         data->done_head = data->head;
2700
2701         if (!handle->locked)
2702                 goto out;
2703
2704 again:
2705         /*
2706          * The xchg implies a full barrier that ensures all writes are done
2707          * before we publish the new head, matched by a rmb() in userspace when
2708          * reading this position.
2709          */
2710         while ((head = atomic_long_xchg(&data->done_head, 0)))
2711                 data->user_page->data_head = head;
2712
2713         /*
2714          * NMI can happen here, which means we can miss a done_head update.
2715          */
2716
2717         cpu = atomic_xchg(&data->lock, -1);
2718         WARN_ON_ONCE(cpu != smp_processor_id());
2719
2720         /*
2721          * Therefore we have to validate we did not indeed do so.
2722          */
2723         if (unlikely(atomic_long_read(&data->done_head))) {
2724                 /*
2725                  * Since we had it locked, we can lock it again.
2726                  */
2727                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2728                         cpu_relax();
2729
2730                 goto again;
2731         }
2732
2733         if (atomic_xchg(&data->wakeup, 0))
2734                 perf_output_wakeup(handle);
2735 out:
2736         local_irq_restore(handle->flags);
2737 }
2738
2739 void perf_output_copy(struct perf_output_handle *handle,
2740                       const void *buf, unsigned int len)
2741 {
2742         unsigned int pages_mask;
2743         unsigned long offset;
2744         unsigned int size;
2745         void **pages;
2746
2747         offset          = handle->offset;
2748         pages_mask      = handle->data->nr_pages - 1;
2749         pages           = handle->data->data_pages;
2750
2751         do {
2752                 unsigned long page_offset;
2753                 unsigned long page_size;
2754                 int nr;
2755
2756                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2757                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2758                 page_offset = offset & (page_size - 1);
2759                 size        = min_t(unsigned int, page_size - page_offset, len);
2760
2761                 memcpy(pages[nr] + page_offset, buf, size);
2762
2763                 len         -= size;
2764                 buf         += size;
2765                 offset      += size;
2766         } while (len);
2767
2768         handle->offset = offset;
2769
2770         /*
2771          * Check we didn't copy past our reservation window, taking the
2772          * possible unsigned int wrap into account.
2773          */
2774         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2775 }
2776
2777 int perf_output_begin(struct perf_output_handle *handle,
2778                       struct perf_event *event, unsigned int size,
2779                       int nmi, int sample)
2780 {
2781         struct perf_event *output_event;
2782         struct perf_mmap_data *data;
2783         unsigned long tail, offset, head;
2784         int have_lost;
2785         struct {
2786                 struct perf_event_header header;
2787                 u64                      id;
2788                 u64                      lost;
2789         } lost_event;
2790
2791         rcu_read_lock();
2792         /*
2793          * For inherited events we send all the output towards the parent.
2794          */
2795         if (event->parent)
2796                 event = event->parent;
2797
2798         output_event = rcu_dereference(event->output);
2799         if (output_event)
2800                 event = output_event;
2801
2802         data = rcu_dereference(event->data);
2803         if (!data)
2804                 goto out;
2805
2806         handle->data    = data;
2807         handle->event   = event;
2808         handle->nmi     = nmi;
2809         handle->sample  = sample;
2810
2811         if (!data->nr_pages)
2812                 goto fail;
2813
2814         have_lost = atomic_read(&data->lost);
2815         if (have_lost)
2816                 size += sizeof(lost_event);
2817
2818         perf_output_lock(handle);
2819
2820         do {
2821                 /*
2822                  * Userspace could choose to issue a mb() before updating the
2823                  * tail pointer. So that all reads will be completed before the
2824                  * write is issued.
2825                  */
2826                 tail = ACCESS_ONCE(data->user_page->data_tail);
2827                 smp_rmb();
2828                 offset = head = atomic_long_read(&data->head);
2829                 head += size;
2830                 if (unlikely(!perf_output_space(data, tail, offset, head)))
2831                         goto fail;
2832         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2833
2834         handle->offset  = offset;
2835         handle->head    = head;
2836
2837         if (head - tail > data->watermark)
2838                 atomic_set(&data->wakeup, 1);
2839
2840         if (have_lost) {
2841                 lost_event.header.type = PERF_RECORD_LOST;
2842                 lost_event.header.misc = 0;
2843                 lost_event.header.size = sizeof(lost_event);
2844                 lost_event.id          = event->id;
2845                 lost_event.lost        = atomic_xchg(&data->lost, 0);
2846
2847                 perf_output_put(handle, lost_event);
2848         }
2849
2850         return 0;
2851
2852 fail:
2853         atomic_inc(&data->lost);
2854         perf_output_unlock(handle);
2855 out:
2856         rcu_read_unlock();
2857
2858         return -ENOSPC;
2859 }
2860
2861 void perf_output_end(struct perf_output_handle *handle)
2862 {
2863         struct perf_event *event = handle->event;
2864         struct perf_mmap_data *data = handle->data;
2865
2866         int wakeup_events = event->attr.wakeup_events;
2867
2868         if (handle->sample && wakeup_events) {
2869                 int events = atomic_inc_return(&data->events);
2870                 if (events >= wakeup_events) {
2871                         atomic_sub(wakeup_events, &data->events);
2872                         atomic_set(&data->wakeup, 1);
2873                 }
2874         }
2875
2876         perf_output_unlock(handle);
2877         rcu_read_unlock();
2878 }
2879
2880 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2881 {
2882         /*
2883          * only top level events have the pid namespace they were created in
2884          */
2885         if (event->parent)
2886                 event = event->parent;
2887
2888         return task_tgid_nr_ns(p, event->ns);
2889 }
2890
2891 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2892 {
2893         /*
2894          * only top level events have the pid namespace they were created in
2895          */
2896         if (event->parent)
2897                 event = event->parent;
2898
2899         return task_pid_nr_ns(p, event->ns);
2900 }
2901
2902 static void perf_output_read_one(struct perf_output_handle *handle,
2903                                  struct perf_event *event)
2904 {
2905         u64 read_format = event->attr.read_format;
2906         u64 values[4];
2907         int n = 0;
2908
2909         values[n++] = atomic64_read(&event->count);
2910         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2911                 values[n++] = event->total_time_enabled +
2912                         atomic64_read(&event->child_total_time_enabled);
2913         }
2914         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2915                 values[n++] = event->total_time_running +
2916                         atomic64_read(&event->child_total_time_running);
2917         }
2918         if (read_format & PERF_FORMAT_ID)
2919                 values[n++] = primary_event_id(event);
2920
2921         perf_output_copy(handle, values, n * sizeof(u64));
2922 }
2923
2924 /*
2925  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2926  */
2927 static void perf_output_read_group(struct perf_output_handle *handle,
2928                             struct perf_event *event)
2929 {
2930         struct perf_event *leader = event->group_leader, *sub;
2931         u64 read_format = event->attr.read_format;
2932         u64 values[5];
2933         int n = 0;
2934
2935         values[n++] = 1 + leader->nr_siblings;
2936
2937         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2938                 values[n++] = leader->total_time_enabled;
2939
2940         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2941                 values[n++] = leader->total_time_running;
2942
2943         if (leader != event)
2944                 leader->pmu->read(leader);
2945
2946         values[n++] = atomic64_read(&leader->count);
2947         if (read_format & PERF_FORMAT_ID)
2948                 values[n++] = primary_event_id(leader);
2949
2950         perf_output_copy(handle, values, n * sizeof(u64));
2951
2952         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2953                 n = 0;
2954
2955                 if (sub != event)
2956                         sub->pmu->read(sub);
2957
2958                 values[n++] = atomic64_read(&sub->count);
2959                 if (read_format & PERF_FORMAT_ID)
2960                         values[n++] = primary_event_id(sub);
2961
2962                 perf_output_copy(handle, values, n * sizeof(u64));
2963         }
2964 }
2965
2966 static void perf_output_read(struct perf_output_handle *handle,
2967                              struct perf_event *event)
2968 {
2969         if (event->attr.read_format & PERF_FORMAT_GROUP)
2970                 perf_output_read_group(handle, event);
2971         else
2972                 perf_output_read_one(handle, event);
2973 }
2974
2975 void perf_output_sample(struct perf_output_handle *handle,
2976                         struct perf_event_header *header,
2977                         struct perf_sample_data *data,
2978                         struct perf_event *event)
2979 {
2980         u64 sample_type = data->type;
2981
2982         perf_output_put(handle, *header);
2983
2984         if (sample_type & PERF_SAMPLE_IP)
2985                 perf_output_put(handle, data->ip);
2986
2987         if (sample_type & PERF_SAMPLE_TID)
2988                 perf_output_put(handle, data->tid_entry);
2989
2990         if (sample_type & PERF_SAMPLE_TIME)
2991                 perf_output_put(handle, data->time);
2992
2993         if (sample_type & PERF_SAMPLE_ADDR)
2994                 perf_output_put(handle, data->addr);
2995
2996         if (sample_type & PERF_SAMPLE_ID)
2997                 perf_output_put(handle, data->id);
2998
2999         if (sample_type & PERF_SAMPLE_STREAM_ID)
3000                 perf_output_put(handle, data->stream_id);
3001
3002         if (sample_type & PERF_SAMPLE_CPU)
3003                 perf_output_put(handle, data->cpu_entry);
3004
3005         if (sample_type & PERF_SAMPLE_PERIOD)
3006                 perf_output_put(handle, data->period);
3007
3008         if (sample_type & PERF_SAMPLE_READ)
3009                 perf_output_read(handle, event);
3010
3011         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3012                 if (data->callchain) {
3013                         int size = 1;
3014
3015                         if (data->callchain)
3016                                 size += data->callchain->nr;
3017
3018                         size *= sizeof(u64);
3019
3020                         perf_output_copy(handle, data->callchain, size);
3021                 } else {
3022                         u64 nr = 0;
3023                         perf_output_put(handle, nr);
3024                 }
3025         }
3026
3027         if (sample_type & PERF_SAMPLE_RAW) {
3028                 if (data->raw) {
3029                         perf_output_put(handle, data->raw->size);
3030                         perf_output_copy(handle, data->raw->data,
3031                                          data->raw->size);
3032                 } else {
3033                         struct {
3034                                 u32     size;
3035                                 u32     data;
3036                         } raw = {
3037                                 .size = sizeof(u32),
3038                                 .data = 0,
3039                         };
3040                         perf_output_put(handle, raw);
3041                 }
3042         }
3043 }
3044
3045 void perf_prepare_sample(struct perf_event_header *header,
3046                          struct perf_sample_data *data,
3047                          struct perf_event *event,
3048                          struct pt_regs *regs)
3049 {
3050         u64 sample_type = event->attr.sample_type;
3051
3052         data->type = sample_type;
3053
3054         header->type = PERF_RECORD_SAMPLE;
3055         header->size = sizeof(*header);
3056
3057         header->misc = 0;
3058         header->misc |= perf_misc_flags(regs);
3059
3060         if (sample_type & PERF_SAMPLE_IP) {
3061                 data->ip = perf_instruction_pointer(regs);
3062
3063                 header->size += sizeof(data->ip);
3064         }
3065
3066         if (sample_type & PERF_SAMPLE_TID) {
3067                 /* namespace issues */
3068                 data->tid_entry.pid = perf_event_pid(event, current);
3069                 data->tid_entry.tid = perf_event_tid(event, current);
3070
3071                 header->size += sizeof(data->tid_entry);
3072         }
3073
3074         if (sample_type & PERF_SAMPLE_TIME) {
3075                 data->time = perf_clock();
3076
3077                 header->size += sizeof(data->time);
3078         }
3079
3080         if (sample_type & PERF_SAMPLE_ADDR)
3081                 header->size += sizeof(data->addr);
3082
3083         if (sample_type & PERF_SAMPLE_ID) {
3084                 data->id = primary_event_id(event);
3085
3086                 header->size += sizeof(data->id);
3087         }
3088
3089         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3090                 data->stream_id = event->id;
3091
3092                 header->size += sizeof(data->stream_id);
3093         }
3094
3095         if (sample_type & PERF_SAMPLE_CPU) {
3096                 data->cpu_entry.cpu             = raw_smp_processor_id();
3097                 data->cpu_entry.reserved        = 0;
3098
3099                 header->size += sizeof(data->cpu_entry);
3100         }
3101
3102         if (sample_type & PERF_SAMPLE_PERIOD)
3103                 header->size += sizeof(data->period);
3104
3105         if (sample_type & PERF_SAMPLE_READ)
3106                 header->size += perf_event_read_size(event);
3107
3108         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3109                 int size = 1;
3110
3111                 data->callchain = perf_callchain(regs);
3112
3113                 if (data->callchain)
3114                         size += data->callchain->nr;
3115
3116                 header->size += size * sizeof(u64);
3117         }
3118
3119         if (sample_type & PERF_SAMPLE_RAW) {
3120                 int size = sizeof(u32);
3121
3122                 if (data->raw)
3123                         size += data->raw->size;
3124                 else
3125                         size += sizeof(u32);
3126
3127                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3128                 header->size += size;
3129         }
3130 }
3131
3132 static void perf_event_output(struct perf_event *event, int nmi,
3133                                 struct perf_sample_data *data,
3134                                 struct pt_regs *regs)
3135 {
3136         struct perf_output_handle handle;
3137         struct perf_event_header header;
3138
3139         perf_prepare_sample(&header, data, event, regs);
3140
3141         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3142                 return;
3143
3144         perf_output_sample(&handle, &header, data, event);
3145
3146         perf_output_end(&handle);
3147 }
3148
3149 /*
3150  * read event_id
3151  */
3152
3153 struct perf_read_event {
3154         struct perf_event_header        header;
3155
3156         u32                             pid;
3157         u32                             tid;
3158 };
3159
3160 static void
3161 perf_event_read_event(struct perf_event *event,
3162                         struct task_struct *task)
3163 {
3164         struct perf_output_handle handle;
3165         struct perf_read_event read_event = {
3166                 .header = {
3167                         .type = PERF_RECORD_READ,
3168                         .misc = 0,
3169                         .size = sizeof(read_event) + perf_event_read_size(event),
3170                 },
3171                 .pid = perf_event_pid(event, task),
3172                 .tid = perf_event_tid(event, task),
3173         };
3174         int ret;
3175
3176         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3177         if (ret)
3178                 return;
3179
3180         perf_output_put(&handle, read_event);
3181         perf_output_read(&handle, event);
3182
3183         perf_output_end(&handle);
3184 }
3185
3186 /*
3187  * task tracking -- fork/exit
3188  *
3189  * enabled by: attr.comm | attr.mmap | attr.task
3190  */
3191
3192 struct perf_task_event {
3193         struct task_struct              *task;
3194         struct perf_event_context       *task_ctx;
3195
3196         struct {
3197                 struct perf_event_header        header;
3198
3199                 u32                             pid;
3200                 u32                             ppid;
3201                 u32                             tid;
3202                 u32                             ptid;
3203                 u64                             time;
3204         } event_id;
3205 };
3206
3207 static void perf_event_task_output(struct perf_event *event,
3208                                      struct perf_task_event *task_event)
3209 {
3210         struct perf_output_handle handle;
3211         int size;
3212         struct task_struct *task = task_event->task;
3213         int ret;
3214
3215         size  = task_event->event_id.header.size;
3216         ret = perf_output_begin(&handle, event, size, 0, 0);
3217
3218         if (ret)
3219                 return;
3220
3221         task_event->event_id.pid = perf_event_pid(event, task);
3222         task_event->event_id.ppid = perf_event_pid(event, current);
3223
3224         task_event->event_id.tid = perf_event_tid(event, task);
3225         task_event->event_id.ptid = perf_event_tid(event, current);
3226
3227         task_event->event_id.time = perf_clock();
3228
3229         perf_output_put(&handle, task_event->event_id);
3230
3231         perf_output_end(&handle);
3232 }
3233
3234 static int perf_event_task_match(struct perf_event *event)
3235 {
3236         if (event->attr.comm || event->attr.mmap || event->attr.task)
3237                 return 1;
3238
3239         return 0;
3240 }
3241
3242 static void perf_event_task_ctx(struct perf_event_context *ctx,
3243                                   struct perf_task_event *task_event)
3244 {
3245         struct perf_event *event;
3246
3247         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3248                 return;
3249
3250         rcu_read_lock();
3251         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3252                 if (perf_event_task_match(event))
3253                         perf_event_task_output(event, task_event);
3254         }
3255         rcu_read_unlock();
3256 }
3257
3258 static void perf_event_task_event(struct perf_task_event *task_event)
3259 {
3260         struct perf_cpu_context *cpuctx;
3261         struct perf_event_context *ctx = task_event->task_ctx;
3262
3263         cpuctx = &get_cpu_var(perf_cpu_context);
3264         perf_event_task_ctx(&cpuctx->ctx, task_event);
3265         put_cpu_var(perf_cpu_context);
3266
3267         rcu_read_lock();
3268         if (!ctx)
3269                 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3270         if (ctx)
3271                 perf_event_task_ctx(ctx, task_event);
3272         rcu_read_unlock();
3273 }
3274
3275 static void perf_event_task(struct task_struct *task,
3276                               struct perf_event_context *task_ctx,
3277                               int new)
3278 {
3279         struct perf_task_event task_event;
3280
3281         if (!atomic_read(&nr_comm_events) &&
3282             !atomic_read(&nr_mmap_events) &&
3283             !atomic_read(&nr_task_events))
3284                 return;
3285
3286         task_event = (struct perf_task_event){
3287                 .task     = task,
3288                 .task_ctx = task_ctx,
3289                 .event_id    = {
3290                         .header = {
3291                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3292                                 .misc = 0,
3293                                 .size = sizeof(task_event.event_id),
3294                         },
3295                         /* .pid  */
3296                         /* .ppid */
3297                         /* .tid  */
3298                         /* .ptid */
3299                 },
3300         };
3301
3302         perf_event_task_event(&task_event);
3303 }
3304
3305 void perf_event_fork(struct task_struct *task)
3306 {
3307         perf_event_task(task, NULL, 1);
3308 }
3309
3310 /*
3311  * comm tracking
3312  */
3313
3314 struct perf_comm_event {
3315         struct task_struct      *task;
3316         char                    *comm;
3317         int                     comm_size;
3318
3319         struct {
3320                 struct perf_event_header        header;
3321
3322                 u32                             pid;
3323                 u32                             tid;
3324         } event_id;
3325 };
3326
3327 static void perf_event_comm_output(struct perf_event *event,
3328                                      struct perf_comm_event *comm_event)
3329 {
3330         struct perf_output_handle handle;
3331         int size = comm_event->event_id.header.size;
3332         int ret = perf_output_begin(&handle, event, size, 0, 0);
3333
3334         if (ret)
3335                 return;
3336
3337         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3338         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3339
3340         perf_output_put(&handle, comm_event->event_id);
3341         perf_output_copy(&handle, comm_event->comm,
3342                                    comm_event->comm_size);
3343         perf_output_end(&handle);
3344 }
3345
3346 static int perf_event_comm_match(struct perf_event *event)
3347 {
3348         if (event->attr.comm)
3349                 return 1;
3350
3351         return 0;
3352 }
3353
3354 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3355                                   struct perf_comm_event *comm_event)
3356 {
3357         struct perf_event *event;
3358
3359         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3360                 return;
3361
3362         rcu_read_lock();
3363         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3364                 if (perf_event_comm_match(event))
3365                         perf_event_comm_output(event, comm_event);
3366         }
3367         rcu_read_unlock();
3368 }
3369
3370 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3371 {
3372         struct perf_cpu_context *cpuctx;
3373         struct perf_event_context *ctx;
3374         unsigned int size;
3375         char comm[TASK_COMM_LEN];
3376
3377         memset(comm, 0, sizeof(comm));
3378         strncpy(comm, comm_event->task->comm, sizeof(comm));
3379         size = ALIGN(strlen(comm)+1, sizeof(u64));
3380
3381         comm_event->comm = comm;
3382         comm_event->comm_size = size;
3383
3384         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3385
3386         cpuctx = &get_cpu_var(perf_cpu_context);
3387         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3388         put_cpu_var(perf_cpu_context);
3389
3390         rcu_read_lock();
3391         /*
3392          * doesn't really matter which of the child contexts the
3393          * events ends up in.
3394          */
3395         ctx = rcu_dereference(current->perf_event_ctxp);
3396         if (ctx)
3397                 perf_event_comm_ctx(ctx, comm_event);
3398         rcu_read_unlock();
3399 }
3400
3401 void perf_event_comm(struct task_struct *task)
3402 {
3403         struct perf_comm_event comm_event;
3404
3405         if (task->perf_event_ctxp)
3406                 perf_event_enable_on_exec(task);
3407
3408         if (!atomic_read(&nr_comm_events))
3409                 return;
3410
3411         comm_event = (struct perf_comm_event){
3412                 .task   = task,
3413                 /* .comm      */
3414                 /* .comm_size */
3415                 .event_id  = {
3416                         .header = {
3417                                 .type = PERF_RECORD_COMM,
3418                                 .misc = 0,
3419                                 /* .size */
3420                         },
3421                         /* .pid */
3422                         /* .tid */
3423                 },
3424         };
3425
3426         perf_event_comm_event(&comm_event);
3427 }
3428
3429 /*
3430  * mmap tracking
3431  */
3432
3433 struct perf_mmap_event {
3434         struct vm_area_struct   *vma;
3435
3436         const char              *file_name;
3437         int                     file_size;
3438
3439         struct {
3440                 struct perf_event_header        header;
3441
3442                 u32                             pid;
3443                 u32                             tid;
3444                 u64                             start;
3445                 u64                             len;
3446                 u64                             pgoff;
3447         } event_id;
3448 };
3449
3450 static void perf_event_mmap_output(struct perf_event *event,
3451                                      struct perf_mmap_event *mmap_event)
3452 {
3453         struct perf_output_handle handle;
3454         int size = mmap_event->event_id.header.size;
3455         int ret = perf_output_begin(&handle, event, size, 0, 0);
3456
3457         if (ret)
3458                 return;
3459
3460         mmap_event->event_id.pid = perf_event_pid(event, current);
3461         mmap_event->event_id.tid = perf_event_tid(event, current);
3462
3463         perf_output_put(&handle, mmap_event->event_id);
3464         perf_output_copy(&handle, mmap_event->file_name,
3465                                    mmap_event->file_size);
3466         perf_output_end(&handle);
3467 }
3468
3469 static int perf_event_mmap_match(struct perf_event *event,
3470                                    struct perf_mmap_event *mmap_event)
3471 {
3472         if (event->attr.mmap)
3473                 return 1;
3474
3475         return 0;
3476 }
3477
3478 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3479                                   struct perf_mmap_event *mmap_event)
3480 {
3481         struct perf_event *event;
3482
3483         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3484                 return;
3485
3486         rcu_read_lock();
3487         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3488                 if (perf_event_mmap_match(event, mmap_event))
3489                         perf_event_mmap_output(event, mmap_event);
3490         }
3491         rcu_read_unlock();
3492 }
3493
3494 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3495 {
3496         struct perf_cpu_context *cpuctx;
3497         struct perf_event_context *ctx;
3498         struct vm_area_struct *vma = mmap_event->vma;
3499         struct file *file = vma->vm_file;
3500         unsigned int size;
3501         char tmp[16];
3502         char *buf = NULL;
3503         const char *name;
3504
3505         memset(tmp, 0, sizeof(tmp));
3506
3507         if (file) {
3508                 /*
3509                  * d_path works from the end of the buffer backwards, so we
3510                  * need to add enough zero bytes after the string to handle
3511                  * the 64bit alignment we do later.
3512                  */
3513                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3514                 if (!buf) {
3515                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3516                         goto got_name;
3517                 }
3518                 name = d_path(&file->f_path, buf, PATH_MAX);
3519                 if (IS_ERR(name)) {
3520                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3521                         goto got_name;
3522                 }
3523         } else {
3524                 if (arch_vma_name(mmap_event->vma)) {
3525                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3526                                        sizeof(tmp));
3527                         goto got_name;
3528                 }
3529
3530                 if (!vma->vm_mm) {
3531                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3532                         goto got_name;
3533                 }
3534
3535                 name = strncpy(tmp, "//anon", sizeof(tmp));
3536                 goto got_name;
3537         }
3538
3539 got_name:
3540         size = ALIGN(strlen(name)+1, sizeof(u64));
3541
3542         mmap_event->file_name = name;
3543         mmap_event->file_size = size;
3544
3545         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3546
3547         cpuctx = &get_cpu_var(perf_cpu_context);
3548         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3549         put_cpu_var(perf_cpu_context);
3550
3551         rcu_read_lock();
3552         /*
3553          * doesn't really matter which of the child contexts the
3554          * events ends up in.
3555          */
3556         ctx = rcu_dereference(current->perf_event_ctxp);
3557         if (ctx)
3558                 perf_event_mmap_ctx(ctx, mmap_event);
3559         rcu_read_unlock();
3560
3561         kfree(buf);
3562 }
3563
3564 void __perf_event_mmap(struct vm_area_struct *vma)
3565 {
3566         struct perf_mmap_event mmap_event;
3567
3568         if (!atomic_read(&nr_mmap_events))
3569                 return;
3570
3571         mmap_event = (struct perf_mmap_event){
3572                 .vma    = vma,
3573                 /* .file_name */
3574                 /* .file_size */
3575                 .event_id  = {
3576                         .header = {
3577                                 .type = PERF_RECORD_MMAP,
3578                                 .misc = 0,
3579                                 /* .size */
3580                         },
3581                         /* .pid */
3582                         /* .tid */
3583                         .start  = vma->vm_start,
3584                         .len    = vma->vm_end - vma->vm_start,
3585                         .pgoff  = vma->vm_pgoff,
3586                 },
3587         };
3588
3589         perf_event_mmap_event(&mmap_event);
3590 }
3591
3592 /*
3593  * IRQ throttle logging
3594  */
3595
3596 static void perf_log_throttle(struct perf_event *event, int enable)
3597 {
3598         struct perf_output_handle handle;
3599         int ret;
3600
3601         struct {
3602                 struct perf_event_header        header;
3603                 u64                             time;
3604                 u64                             id;
3605                 u64                             stream_id;
3606         } throttle_event = {
3607                 .header = {
3608                         .type = PERF_RECORD_THROTTLE,
3609                         .misc = 0,
3610                         .size = sizeof(throttle_event),
3611                 },
3612                 .time           = perf_clock(),
3613                 .id             = primary_event_id(event),
3614                 .stream_id      = event->id,
3615         };
3616
3617         if (enable)
3618                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3619
3620         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3621         if (ret)
3622                 return;
3623
3624         perf_output_put(&handle, throttle_event);
3625         perf_output_end(&handle);
3626 }
3627
3628 /*
3629  * Generic event overflow handling, sampling.
3630  */
3631
3632 static int __perf_event_overflow(struct perf_event *event, int nmi,
3633                                    int throttle, struct perf_sample_data *data,
3634                                    struct pt_regs *regs)
3635 {
3636         int events = atomic_read(&event->event_limit);
3637         struct hw_perf_event *hwc = &event->hw;
3638         int ret = 0;
3639
3640         throttle = (throttle && event->pmu->unthrottle != NULL);
3641
3642         if (!throttle) {
3643                 hwc->interrupts++;
3644         } else {
3645                 if (hwc->interrupts != MAX_INTERRUPTS) {
3646                         hwc->interrupts++;
3647                         if (HZ * hwc->interrupts >
3648                                         (u64)sysctl_perf_event_sample_rate) {
3649                                 hwc->interrupts = MAX_INTERRUPTS;
3650                                 perf_log_throttle(event, 0);
3651                                 ret = 1;
3652                         }
3653                 } else {
3654                         /*
3655                          * Keep re-disabling events even though on the previous
3656                          * pass we disabled it - just in case we raced with a
3657                          * sched-in and the event got enabled again:
3658                          */
3659                         ret = 1;
3660                 }
3661         }
3662
3663         if (event->attr.freq) {
3664                 u64 now = perf_clock();
3665                 s64 delta = now - hwc->freq_stamp;
3666
3667                 hwc->freq_stamp = now;
3668
3669                 if (delta > 0 && delta < TICK_NSEC)
3670                         perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3671         }
3672
3673         /*
3674          * XXX event_limit might not quite work as expected on inherited
3675          * events
3676          */
3677
3678         event->pending_kill = POLL_IN;
3679         if (events && atomic_dec_and_test(&event->event_limit)) {
3680                 ret = 1;
3681                 event->pending_kill = POLL_HUP;
3682                 if (nmi) {
3683                         event->pending_disable = 1;
3684                         perf_pending_queue(&event->pending,
3685                                            perf_pending_event);
3686                 } else
3687                         perf_event_disable(event);
3688         }
3689
3690         perf_event_output(event, nmi, data, regs);
3691         return ret;
3692 }
3693
/*
 * Overflow entry point with throttling requested; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3700
3701 /*
3702  * Generic software event infrastructure
3703  */
3704
3705 /*
3706  * We directly increment event->count and keep a second value in
3707  * event->hw.period_left to count intervals. This period event
3708  * is kept in the range [-sample_period, 0] so that we can use the
3709  * sign as trigger.
3710  */
3711
3712 static u64 perf_swevent_set_period(struct perf_event *event)
3713 {
3714         struct hw_perf_event *hwc = &event->hw;
3715         u64 period = hwc->last_period;
3716         u64 nr, offset;
3717         s64 old, val;
3718
3719         hwc->last_period = hwc->sample_period;
3720
3721 again:
3722         old = val = atomic64_read(&hwc->period_left);
3723         if (val < 0)
3724                 return 0;
3725
3726         nr = div64_u64(period + val, period);
3727         offset = nr * period;
3728         val -= offset;
3729         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3730                 goto again;
3731
3732         return nr;
3733 }
3734
3735 static void perf_swevent_overflow(struct perf_event *event,
3736                                     int nmi, struct perf_sample_data *data,
3737                                     struct pt_regs *regs)
3738 {
3739         struct hw_perf_event *hwc = &event->hw;
3740         int throttle = 0;
3741         u64 overflow;
3742
3743         data->period = event->hw.last_period;
3744         overflow = perf_swevent_set_period(event);
3745
3746         if (hwc->interrupts == MAX_INTERRUPTS)
3747                 return;
3748
3749         for (; overflow; overflow--) {
3750                 if (__perf_event_overflow(event, nmi, throttle,
3751                                             data, regs)) {
3752                         /*
3753                          * We inhibit the overflow from happening when
3754                          * hwc->interrupts == MAX_INTERRUPTS.
3755                          */
3756                         break;
3757                 }
3758                 throttle = 1;
3759         }
3760 }
3761
/*
 * ->unthrottle() callback of perf_ops_generic.
 *
 * Intentionally empty: hwc->interrupts has already been reset by the
 * time this is called, and software events have nothing else to do.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
}
3768
3769 static void perf_swevent_add(struct perf_event *event, u64 nr,
3770                                int nmi, struct perf_sample_data *data,
3771                                struct pt_regs *regs)
3772 {
3773         struct hw_perf_event *hwc = &event->hw;
3774
3775         atomic64_add(nr, &event->count);
3776
3777         if (!hwc->sample_period)
3778                 return;
3779
3780         if (!regs)
3781                 return;
3782
3783         if (!atomic64_add_negative(nr, &hwc->period_left))
3784                 perf_swevent_overflow(event, nmi, data, regs);
3785 }
3786
3787 static int perf_swevent_is_counting(struct perf_event *event)
3788 {
3789         /*
3790          * The event is active, we're good!
3791          */
3792         if (event->state == PERF_EVENT_STATE_ACTIVE)
3793                 return 1;
3794
3795         /*
3796          * The event is off/error, not counting.
3797          */
3798         if (event->state != PERF_EVENT_STATE_INACTIVE)
3799                 return 0;
3800
3801         /*
3802          * The event is inactive, if the context is active
3803          * we're part of a group that didn't make it on the 'pmu',
3804          * not counting.
3805          */
3806         if (event->ctx->is_active)
3807                 return 0;
3808
3809         /*
3810          * We're inactive and the context is too, this means the
3811          * task is scheduled out, we're counting events that happen
3812          * to us, like migration events.
3813          */
3814         return 1;
3815 }
3816
3817 static int perf_tp_event_match(struct perf_event *event,
3818                                 struct perf_sample_data *data);
3819
3820 static int perf_swevent_match(struct perf_event *event,
3821                                 enum perf_type_id type,
3822                                 u32 event_id,
3823                                 struct perf_sample_data *data,
3824                                 struct pt_regs *regs)
3825 {
3826         if (!perf_swevent_is_counting(event))
3827                 return 0;
3828
3829         if (event->attr.type != type)
3830                 return 0;
3831         if (event->attr.config != event_id)
3832                 return 0;
3833
3834         if (regs) {
3835                 if (event->attr.exclude_user && user_mode(regs))
3836                         return 0;
3837
3838                 if (event->attr.exclude_kernel && !user_mode(regs))
3839                         return 0;
3840         }
3841
3842         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3843             !perf_tp_event_match(event, data))
3844                 return 0;
3845
3846         return 1;
3847 }
3848
3849 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3850                                      enum perf_type_id type,
3851                                      u32 event_id, u64 nr, int nmi,
3852                                      struct perf_sample_data *data,
3853                                      struct pt_regs *regs)
3854 {
3855         struct perf_event *event;
3856
3857         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3858                 return;
3859
3860         rcu_read_lock();
3861         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3862                 if (perf_swevent_match(event, type, event_id, data, regs))
3863                         perf_swevent_add(event, nr, nmi, data, regs);
3864         }
3865         rcu_read_unlock();
3866 }
3867
3868 static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3869 {
3870         if (in_nmi())
3871                 return &cpuctx->recursion[3];
3872
3873         if (in_irq())
3874                 return &cpuctx->recursion[2];
3875
3876         if (in_softirq())
3877                 return &cpuctx->recursion[1];
3878
3879         return &cpuctx->recursion[0];
3880 }
3881
3882 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3883                                     u64 nr, int nmi,
3884                                     struct perf_sample_data *data,
3885                                     struct pt_regs *regs)
3886 {
3887         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3888         int *recursion = perf_swevent_recursion_context(cpuctx);
3889         struct perf_event_context *ctx;
3890
3891         if (*recursion)
3892                 goto out;
3893
3894         (*recursion)++;
3895         barrier();
3896
3897         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3898                                  nr, nmi, data, regs);
3899         rcu_read_lock();
3900         /*
3901          * doesn't really matter which of the child contexts the
3902          * events ends up in.
3903          */
3904         ctx = rcu_dereference(current->perf_event_ctxp);
3905         if (ctx)
3906                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3907         rcu_read_unlock();
3908
3909         barrier();
3910         (*recursion)--;
3911
3912 out:
3913         put_cpu_var(perf_cpu_context);
3914 }
3915
3916 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3917                             struct pt_regs *regs, u64 addr)
3918 {
3919         struct perf_sample_data data = {
3920                 .addr = addr,
3921         };
3922
3923         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3924                                 &data, regs);
3925 }
3926
/*
 * ->read() callback of perf_ops_generic.
 *
 * Empty on purpose: perf_swevent_add() adds straight into
 * event->count, so there is nothing to fold in here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
3930
3931 static int perf_swevent_enable(struct perf_event *event)
3932 {
3933         struct hw_perf_event *hwc = &event->hw;
3934
3935         if (hwc->sample_period) {
3936                 hwc->last_period = hwc->sample_period;
3937                 perf_swevent_set_period(event);
3938         }
3939         return 0;
3940 }
3941
/*
 * ->disable() callback of perf_ops_generic; generic software events
 * have nothing to tear down.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
3945
3946 static const struct pmu perf_ops_generic = {
3947         .enable         = perf_swevent_enable,
3948         .disable        = perf_swevent_disable,
3949         .read           = perf_swevent_read,
3950         .unthrottle     = perf_swevent_unthrottle,
3951 };
3952
3953 /*
3954  * hrtimer based swevent callback
3955  */
3956
3957 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3958 {
3959         enum hrtimer_restart ret = HRTIMER_RESTART;
3960         struct perf_sample_data data;
3961         struct pt_regs *regs;
3962         struct perf_event *event;
3963         u64 period;
3964
3965         event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
3966         event->pmu->read(event);
3967
3968         data.addr = 0;
3969         regs = get_irq_regs();
3970         /*
3971          * In case we exclude kernel IPs or are somehow not in interrupt
3972          * context, provide the next best thing, the user IP.
3973          */
3974         if ((event->attr.exclude_kernel || !regs) &&
3975                         !event->attr.exclude_user)
3976                 regs = task_pt_regs(current);
3977
3978         if (regs) {
3979                 if (perf_event_overflow(event, 0, &data, regs))
3980                         ret = HRTIMER_NORESTART;
3981         }
3982
3983         period = max_t(u64, 10000, event->hw.sample_period);
3984         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3985
3986         return ret;
3987 }
3988
3989 /*
3990  * Software event: cpu wall time clock
3991  */
3992
3993 static void cpu_clock_perf_event_update(struct perf_event *event)
3994 {
3995         int cpu = raw_smp_processor_id();
3996         s64 prev;
3997         u64 now;
3998
3999         now = cpu_clock(cpu);
4000         prev = atomic64_read(&event->hw.prev_count);
4001         atomic64_set(&event->hw.prev_count, now);
4002         atomic64_add(now - prev, &event->count);
4003 }
4004
4005 static int cpu_clock_perf_event_enable(struct perf_event *event)
4006 {
4007         struct hw_perf_event *hwc = &event->hw;
4008         int cpu = raw_smp_processor_id();
4009
4010         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4011         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4012         hwc->hrtimer.function = perf_swevent_hrtimer;
4013         if (hwc->sample_period) {
4014                 u64 period = max_t(u64, 10000, hwc->sample_period);
4015                 __hrtimer_start_range_ns(&hwc->hrtimer,
4016                                 ns_to_ktime(period), 0,
4017                                 HRTIMER_MODE_REL, 0);
4018         }
4019
4020         return 0;
4021 }
4022
4023 static void cpu_clock_perf_event_disable(struct perf_event *event)
4024 {
4025         if (event->hw.sample_period)
4026                 hrtimer_cancel(&event->hw.hrtimer);
4027         cpu_clock_perf_event_update(event);
4028 }
4029
/*
 * ->read() callback of the cpu-clock event: bring event->count up to
 * date with the current cpu_clock() value.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
        cpu_clock_perf_event_update(event);
}
4034
4035 static const struct pmu perf_ops_cpu_clock = {
4036         .enable         = cpu_clock_perf_event_enable,
4037         .disable        = cpu_clock_perf_event_disable,
4038         .read           = cpu_clock_perf_event_read,
4039 };
4040
4041 /*
4042  * Software event: task time clock
4043  */
4044
4045 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4046 {
4047         u64 prev;
4048         s64 delta;
4049
4050         prev = atomic64_xchg(&event->hw.prev_count, now);
4051         delta = now - prev;
4052         atomic64_add(delta, &event->count);
4053 }
4054
4055 static int task_clock_perf_event_enable(struct perf_event *event)
4056 {
4057         struct hw_perf_event *hwc = &event->hw;
4058         u64 now;
4059
4060         now = event->ctx->time;
4061
4062         atomic64_set(&hwc->prev_count, now);
4063         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4064         hwc->hrtimer.function = perf_swevent_hrtimer;
4065         if (hwc->sample_period) {
4066                 u64 period = max_t(u64, 10000, hwc->sample_period);
4067                 __hrtimer_start_range_ns(&hwc->hrtimer,
4068                                 ns_to_ktime(period), 0,
4069                                 HRTIMER_MODE_REL, 0);
4070         }
4071
4072         return 0;
4073 }
4074
4075 static void task_clock_perf_event_disable(struct perf_event *event)
4076 {
4077         if (event->hw.sample_period)
4078                 hrtimer_cancel(&event->hw.hrtimer);
4079         task_clock_perf_event_update(event, event->ctx->time);
4080
4081 }
4082
4083 static void task_clock_perf_event_read(struct perf_event *event)
4084 {
4085         u64 time;
4086
4087         if (!in_nmi()) {
4088                 update_context_time(event->ctx);
4089                 time = event->ctx->time;
4090         } else {
4091                 u64 now = perf_clock();
4092                 u64 delta = now - event->ctx->timestamp;
4093                 time = event->ctx->time + delta;
4094         }
4095
4096         task_clock_perf_event_update(event, time);
4097 }
4098
4099 static const struct pmu perf_ops_task_clock = {
4100         .enable         = task_clock_perf_event_enable,
4101         .disable        = task_clock_perf_event_disable,
4102         .read           = task_clock_perf_event_read,
4103 };
4104
4105 #ifdef CONFIG_EVENT_PROFILE
4106
4107 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4108                           int entry_size)
4109 {
4110         struct perf_raw_record raw = {
4111                 .size = entry_size,
4112                 .data = record,
4113         };
4114
4115         struct perf_sample_data data = {
4116                 .addr = addr,
4117                 .raw = &raw,
4118         };
4119
4120         struct pt_regs *regs = get_irq_regs();
4121
4122         if (!regs)
4123                 regs = task_pt_regs(current);
4124
4125         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4126                                 &data, regs);
4127 }
4128 EXPORT_SYMBOL_GPL(perf_tp_event);
4129
4130 static int perf_tp_event_match(struct perf_event *event,
4131                                 struct perf_sample_data *data)
4132 {
4133         void *record = data->raw->data;
4134
4135         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4136                 return 1;
4137         return 0;
4138 }
4139
4140 static void tp_perf_event_destroy(struct perf_event *event)
4141 {
4142         ftrace_profile_disable(event->attr.config);
4143 }
4144
4145 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4146 {
4147         /*
4148          * Raw tracepoint data is a severe data leak, only allow root to
4149          * have these.
4150          */
4151         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4152                         perf_paranoid_tracepoint_raw() &&
4153                         !capable(CAP_SYS_ADMIN))
4154                 return ERR_PTR(-EPERM);
4155
4156         if (ftrace_profile_enable(event->attr.config))
4157                 return NULL;
4158
4159         event->destroy = tp_perf_event_destroy;
4160
4161         return &perf_ops_generic;
4162 }
4163
4164 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4165 {
4166         char *filter_str;
4167         int ret;
4168
4169         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4170                 return -EINVAL;
4171
4172         filter_str = strndup_user(arg, PAGE_SIZE);
4173         if (IS_ERR(filter_str))
4174                 return PTR_ERR(filter_str);
4175
4176         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4177
4178         kfree(filter_str);
4179         return ret;
4180 }
4181
/* Drop the tracepoint filter attached to @event. */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}
4186
4187 #else
4188
/*
 * !CONFIG_EVENT_PROFILE stub: there are no tracepoint filters, so
 * every event matches.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        return 1;
}
4194
4195 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4196 {
4197         return NULL;
4198 }
4199
4200 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4201 {
4202         return -ENOENT;
4203 }
4204
/* !CONFIG_EVENT_PROFILE stub: there is never a filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}
4208
4209 #endif /* CONFIG_EVENT_PROFILE */
4210
/*
 * One counter per software event id (indexed by attr.config).
 * sw_perf_event_init() increments the slot for page-fault,
 * context-switch and cpu-migration events that have no parent;
 * sw_perf_event_destroy() decrements it again.
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4212
4213 static void sw_perf_event_destroy(struct perf_event *event)
4214 {
4215         u64 event_id = event->attr.config;
4216
4217         WARN_ON(event->parent);
4218
4219         atomic_dec(&perf_swevent_enabled[event_id]);
4220 }
4221
4222 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4223 {
4224         const struct pmu *pmu = NULL;
4225         u64 event_id = event->attr.config;
4226
4227         /*
4228          * Software events (currently) can't in general distinguish
4229          * between user, kernel and hypervisor events.
4230          * However, context switches and cpu migrations are considered
4231          * to be kernel events, and page faults are never hypervisor
4232          * events.
4233          */
4234         switch (event_id) {
4235         case PERF_COUNT_SW_CPU_CLOCK:
4236                 pmu = &perf_ops_cpu_clock;
4237
4238                 break;
4239         case PERF_COUNT_SW_TASK_CLOCK:
4240                 /*
4241                  * If the user instantiates this as a per-cpu event,
4242                  * use the cpu_clock event instead.
4243                  */
4244                 if (event->ctx->task)
4245                         pmu = &perf_ops_task_clock;
4246                 else
4247                         pmu = &perf_ops_cpu_clock;
4248
4249                 break;
4250         case PERF_COUNT_SW_PAGE_FAULTS:
4251         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4252         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4253         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4254         case PERF_COUNT_SW_CPU_MIGRATIONS:
4255                 if (!event->parent) {
4256                         atomic_inc(&perf_swevent_enabled[event_id]);
4257                         event->destroy = sw_perf_event_destroy;
4258                 }
4259                 pmu = &perf_ops_generic;
4260                 break;
4261         }
4262
4263         return pmu;
4264 }
4265
4266 /*
4267  * Allocate and initialize a event structure
4268  */
4269 static struct perf_event *
4270 perf_event_alloc(struct perf_event_attr *attr,
4271                    int cpu,
4272                    struct perf_event_context *ctx,
4273                    struct perf_event *group_leader,
4274                    struct perf_event *parent_event,
4275                    gfp_t gfpflags)
4276 {
4277         const struct pmu *pmu;
4278         struct perf_event *event;
4279         struct hw_perf_event *hwc;
4280         long err;
4281
4282         event = kzalloc(sizeof(*event), gfpflags);
4283         if (!event)
4284                 return ERR_PTR(-ENOMEM);
4285
4286         /*
4287          * Single events are their own group leaders, with an
4288          * empty sibling list:
4289          */
4290         if (!group_leader)
4291                 group_leader = event;
4292
4293         mutex_init(&event->child_mutex);
4294         INIT_LIST_HEAD(&event->child_list);
4295
4296         INIT_LIST_HEAD(&event->group_entry);
4297         INIT_LIST_HEAD(&event->event_entry);
4298         INIT_LIST_HEAD(&event->sibling_list);
4299         init_waitqueue_head(&event->waitq);
4300
4301         mutex_init(&event->mmap_mutex);
4302
4303         event->cpu              = cpu;
4304         event->attr             = *attr;
4305         event->group_leader     = group_leader;
4306         event->pmu              = NULL;
4307         event->ctx              = ctx;
4308         event->oncpu            = -1;
4309
4310         event->parent           = parent_event;
4311
4312         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
4313         event->id               = atomic64_inc_return(&perf_event_id);
4314
4315         event->state            = PERF_EVENT_STATE_INACTIVE;
4316
4317         if (attr->disabled)
4318                 event->state = PERF_EVENT_STATE_OFF;
4319
4320         pmu = NULL;
4321
4322         hwc = &event->hw;
4323         hwc->sample_period = attr->sample_period;
4324         if (attr->freq && attr->sample_freq)
4325                 hwc->sample_period = 1;
4326         hwc->last_period = hwc->sample_period;
4327
4328         atomic64_set(&hwc->period_left, hwc->sample_period);
4329
4330         /*
4331          * we currently do not support PERF_FORMAT_GROUP on inherited events
4332          */
4333         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4334                 goto done;
4335
4336         switch (attr->type) {
4337         case PERF_TYPE_RAW:
4338         case PERF_TYPE_HARDWARE:
4339         case PERF_TYPE_HW_CACHE:
4340                 pmu = hw_perf_event_init(event);
4341                 break;
4342
4343         case PERF_TYPE_SOFTWARE:
4344                 pmu = sw_perf_event_init(event);
4345                 break;
4346
4347         case PERF_TYPE_TRACEPOINT:
4348                 pmu = tp_perf_event_init(event);
4349                 break;
4350
4351         default:
4352                 break;
4353         }
4354 done:
4355         err = 0;
4356         if (!pmu)
4357                 err = -EINVAL;
4358         else if (IS_ERR(pmu))
4359                 err = PTR_ERR(pmu);
4360
4361         if (err) {
4362                 if (event->ns)
4363                         put_pid_ns(event->ns);
4364                 kfree(event);
4365                 return ERR_PTR(err);
4366         }
4367
4368         event->pmu = pmu;
4369
4370         if (!event->parent) {
4371                 atomic_inc(&nr_events);
4372                 if (event->attr.mmap)
4373                         atomic_inc(&nr_mmap_events);
4374                 if (event->attr.comm)
4375                         atomic_inc(&nr_comm_events);
4376                 if (event->attr.task)
4377                         atomic_inc(&nr_task_events);
4378         }
4379
4380         return event;
4381 }
4382
4383 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4384                           struct perf_event_attr *attr)
4385 {
4386         u32 size;
4387         int ret;
4388
4389         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4390                 return -EFAULT;
4391
4392         /*
4393          * zero the full structure, so that a short copy will be nice.
4394          */
4395         memset(attr, 0, sizeof(*attr));
4396
4397         ret = get_user(size, &uattr->size);
4398         if (ret)
4399                 return ret;
4400
4401         if (size > PAGE_SIZE)   /* silly large */
4402                 goto err_size;
4403
4404         if (!size)              /* abi compat */
4405                 size = PERF_ATTR_SIZE_VER0;
4406
4407         if (size < PERF_ATTR_SIZE_VER0)
4408                 goto err_size;
4409
4410         /*
4411          * If we're handed a bigger struct than we know of,
4412          * ensure all the unknown bits are 0 - i.e. new
4413          * user-space does not rely on any kernel feature
4414          * extensions we dont know about yet.
4415          */
4416         if (size > sizeof(*attr)) {
4417                 unsigned char __user *addr;
4418                 unsigned char __user *end;
4419                 unsigned char val;
4420
4421                 addr = (void __user *)uattr + sizeof(*attr);
4422                 end  = (void __user *)uattr + size;
4423
4424                 for (; addr < end; addr++) {
4425                         ret = get_user(val, addr);
4426                         if (ret)
4427                                 return ret;
4428                         if (val)
4429                                 goto err_size;
4430                 }
4431                 size = sizeof(*attr);
4432         }
4433
4434         ret = copy_from_user(attr, uattr, size);
4435         if (ret)
4436                 return -EFAULT;
4437
4438         /*
4439          * If the type exists, the corresponding creation will verify
4440          * the attr->config.
4441          */
4442         if (attr->type >= PERF_TYPE_MAX)
4443                 return -EINVAL;
4444
4445         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4446                 return -EINVAL;
4447
4448         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4449                 return -EINVAL;
4450
4451         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4452                 return -EINVAL;
4453
4454 out:
4455         return ret;
4456
4457 err_size:
4458         put_user(sizeof(*attr), &uattr->size);
4459         ret = -E2BIG;
4460         goto out;
4461 }
4462
4463 static int perf_event_set_output(struct perf_event *event, int output_fd)
4464 {
4465         struct perf_event *output_event = NULL;
4466         struct file *output_file = NULL;
4467         struct perf_event *old_output;
4468         int fput_needed = 0;
4469         int ret = -EINVAL;
4470
4471         if (!output_fd)
4472                 goto set;
4473
4474         output_file = fget_light(output_fd, &fput_needed);
4475         if (!output_file)
4476                 return -EBADF;
4477
4478         if (output_file->f_op != &perf_fops)
4479                 goto out;
4480
4481         output_event = output_file->private_data;
4482
4483         /* Don't chain output fds */
4484         if (output_event->output)
4485                 goto out;
4486
4487         /* Don't set an output fd when we already have an output channel */
4488         if (event->data)
4489                 goto out;
4490
4491         atomic_long_inc(&output_file->f_count);
4492
4493 set:
4494         mutex_lock(&event->mmap_mutex);
4495         old_output = event->output;
4496         rcu_assign_pointer(event->output, output_event);
4497         mutex_unlock(&event->mmap_mutex);
4498
4499         if (old_output) {
4500                 /*
4501                  * we need to make sure no existing perf_output_*()
4502                  * is still referencing this event.
4503                  */
4504                 synchronize_rcu();
4505                 fput(old_output->filp);
4506         }
4507
4508         ret = 0;
4509 out:
4510         fput_light(output_file, fput_needed);
4511         return ret;
4512 }
4513
4514 /**
4515  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4516  *
4517  * @attr_uptr:  event_id type attributes for monitoring/sampling
4518  * @pid:                target pid
4519  * @cpu:                target cpu
4520  * @group_fd:           group leader event fd
4521  */
4522 SYSCALL_DEFINE5(perf_event_open,
4523                 struct perf_event_attr __user *, attr_uptr,
4524                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4525 {
4526         struct perf_event *event, *group_leader;
4527         struct perf_event_attr attr;
4528         struct perf_event_context *ctx;
4529         struct file *event_file = NULL;
4530         struct file *group_file = NULL;
4531         int fput_needed = 0;
4532         int fput_needed2 = 0;
4533         int err;
4534
4535         /* for future expandability... */
4536         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4537                 return -EINVAL;
4538
4539         err = perf_copy_attr(attr_uptr, &attr);
4540         if (err)
4541                 return err;
4542
4543         if (!attr.exclude_kernel) {
4544                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4545                         return -EACCES;
4546         }
4547
4548         if (attr.freq) {
4549                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4550                         return -EINVAL;
4551         }
4552
4553         /*
4554          * Get the target context (task or percpu):
4555          */
4556         ctx = find_get_context(pid, cpu);
4557         if (IS_ERR(ctx))
4558                 return PTR_ERR(ctx);
4559
4560         /*
4561          * Look up the group leader (we will attach this event to it):
4562          */
4563         group_leader = NULL;
4564         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4565                 err = -EINVAL;
4566                 group_file = fget_light(group_fd, &fput_needed);
4567                 if (!group_file)
4568                         goto err_put_context;
4569                 if (group_file->f_op != &perf_fops)
4570                         goto err_put_context;
4571
4572                 group_leader = group_file->private_data;
4573                 /*
4574                  * Do not allow a recursive hierarchy (this new sibling
4575                  * becoming part of another group-sibling):
4576                  */
4577                 if (group_leader->group_leader != group_leader)
4578                         goto err_put_context;
4579                 /*
4580                  * Do not allow to attach to a group in a different
4581                  * task or CPU context:
4582                  */
4583                 if (group_leader->ctx != ctx)
4584                         goto err_put_context;
4585                 /*
4586                  * Only a group leader can be exclusive or pinned
4587                  */
4588                 if (attr.exclusive || attr.pinned)
4589                         goto err_put_context;
4590         }
4591
4592         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4593                                      NULL, GFP_KERNEL);
4594         err = PTR_ERR(event);
4595         if (IS_ERR(event))
4596                 goto err_put_context;
4597
4598         err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4599         if (err < 0)
4600                 goto err_free_put_context;
4601
4602         event_file = fget_light(err, &fput_needed2);
4603         if (!event_file)
4604                 goto err_free_put_context;
4605
4606         if (flags & PERF_FLAG_FD_OUTPUT) {
4607                 err = perf_event_set_output(event, group_fd);
4608                 if (err)
4609                         goto err_fput_free_put_context;
4610         }
4611
4612         event->filp = event_file;
4613         WARN_ON_ONCE(ctx->parent_ctx);
4614         mutex_lock(&ctx->mutex);
4615         perf_install_in_context(ctx, event, cpu);
4616         ++ctx->generation;
4617         mutex_unlock(&ctx->mutex);
4618
4619         event->owner = current;
4620         get_task_struct(current);
4621         mutex_lock(&current->perf_event_mutex);
4622         list_add_tail(&event->owner_entry, &current->perf_event_list);
4623         mutex_unlock(&current->perf_event_mutex);
4624
4625 err_fput_free_put_context:
4626         fput_light(event_file, fput_needed2);
4627
4628 err_free_put_context:
4629         if (err < 0)
4630                 kfree(event);
4631
4632 err_put_context:
4633         if (err < 0)
4634                 put_ctx(ctx);
4635
4636         fput_light(group_file, fput_needed);
4637
4638         return err;
4639 }
4640
4641 /*
4642  * inherit a event from parent task to child task:
4643  */
4644 static struct perf_event *
4645 inherit_event(struct perf_event *parent_event,
4646               struct task_struct *parent,
4647               struct perf_event_context *parent_ctx,
4648               struct task_struct *child,
4649               struct perf_event *group_leader,
4650               struct perf_event_context *child_ctx)
4651 {
4652         struct perf_event *child_event;
4653
4654         /*
4655          * Instead of creating recursive hierarchies of events,
4656          * we link inherited events back to the original parent,
4657          * which has a filp for sure, which we use as the reference
4658          * count:
4659          */
4660         if (parent_event->parent)
4661                 parent_event = parent_event->parent;
4662
4663         child_event = perf_event_alloc(&parent_event->attr,
4664                                            parent_event->cpu, child_ctx,
4665                                            group_leader, parent_event,
4666                                            GFP_KERNEL);
4667         if (IS_ERR(child_event))
4668                 return child_event;
4669         get_ctx(child_ctx);
4670
4671         /*
4672          * Make the child state follow the state of the parent event,
4673          * not its attr.disabled bit.  We hold the parent's mutex,
4674          * so we won't race with perf_event_{en, dis}able_family.
4675          */
4676         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4677                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4678         else
4679                 child_event->state = PERF_EVENT_STATE_OFF;
4680
4681         if (parent_event->attr.freq)
4682                 child_event->hw.sample_period = parent_event->hw.sample_period;
4683
4684         /*
4685          * Link it up in the child's context:
4686          */
4687         add_event_to_ctx(child_event, child_ctx);
4688
4689         /*
4690          * Get a reference to the parent filp - we will fput it
4691          * when the child event exits. This is safe to do because
4692          * we are in the parent and we know that the filp still
4693          * exists and has a nonzero count:
4694          */
4695         atomic_long_inc(&parent_event->filp->f_count);
4696
4697         /*
4698          * Link this into the parent event's child list
4699          */
4700         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4701         mutex_lock(&parent_event->child_mutex);
4702         list_add_tail(&child_event->child_list, &parent_event->child_list);
4703         mutex_unlock(&parent_event->child_mutex);
4704
4705         return child_event;
4706 }
4707
4708 static int inherit_group(struct perf_event *parent_event,
4709               struct task_struct *parent,
4710               struct perf_event_context *parent_ctx,
4711               struct task_struct *child,
4712               struct perf_event_context *child_ctx)
4713 {
4714         struct perf_event *leader;
4715         struct perf_event *sub;
4716         struct perf_event *child_ctr;
4717
4718         leader = inherit_event(parent_event, parent, parent_ctx,
4719                                  child, NULL, child_ctx);
4720         if (IS_ERR(leader))
4721                 return PTR_ERR(leader);
4722         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4723                 child_ctr = inherit_event(sub, parent, parent_ctx,
4724                                             child, leader, child_ctx);
4725                 if (IS_ERR(child_ctr))
4726                         return PTR_ERR(child_ctr);
4727         }
4728         return 0;
4729 }
4730
4731 static void sync_child_event(struct perf_event *child_event,
4732                                struct task_struct *child)
4733 {
4734         struct perf_event *parent_event = child_event->parent;
4735         u64 child_val;
4736
4737         if (child_event->attr.inherit_stat)
4738                 perf_event_read_event(child_event, child);
4739
4740         child_val = atomic64_read(&child_event->count);
4741
4742         /*
4743          * Add back the child's count to the parent's count:
4744          */
4745         atomic64_add(child_val, &parent_event->count);
4746         atomic64_add(child_event->total_time_enabled,
4747                      &parent_event->child_total_time_enabled);
4748         atomic64_add(child_event->total_time_running,
4749                      &parent_event->child_total_time_running);
4750
4751         /*
4752          * Remove this event from the parent's list
4753          */
4754         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4755         mutex_lock(&parent_event->child_mutex);
4756         list_del_init(&child_event->child_list);
4757         mutex_unlock(&parent_event->child_mutex);
4758
4759         /*
4760          * Release the parent event, if this was the last
4761          * reference to it.
4762          */
4763         fput(parent_event->filp);
4764 }
4765
4766 static void
4767 __perf_event_exit_task(struct perf_event *child_event,
4768                          struct perf_event_context *child_ctx,
4769                          struct task_struct *child)
4770 {
4771         struct perf_event *parent_event;
4772
4773         update_event_times(child_event);
4774         perf_event_remove_from_context(child_event);
4775
4776         parent_event = child_event->parent;
4777         /*
4778          * It can happen that parent exits first, and has events
4779          * that are still around due to the child reference. These
4780          * events need to be zapped - but otherwise linger.
4781          */
4782         if (parent_event) {
4783                 sync_child_event(child_event, child);
4784                 free_event(child_event);
4785         }
4786 }
4787
4788 /*
4789  * When a child task exits, feed back event values to parent events.
4790  */
4791 void perf_event_exit_task(struct task_struct *child)
4792 {
4793         struct perf_event *child_event, *tmp;
4794         struct perf_event_context *child_ctx;
4795         unsigned long flags;
4796
4797         if (likely(!child->perf_event_ctxp)) {
4798                 perf_event_task(child, NULL, 0);
4799                 return;
4800         }
4801
4802         local_irq_save(flags);
4803         /*
4804          * We can't reschedule here because interrupts are disabled,
4805          * and either child is current or it is a task that can't be
4806          * scheduled, so we are now safe from rescheduling changing
4807          * our context.
4808          */
4809         child_ctx = child->perf_event_ctxp;
4810         __perf_event_task_sched_out(child_ctx);
4811
4812         /*
4813          * Take the context lock here so that if find_get_context is
4814          * reading child->perf_event_ctxp, we wait until it has
4815          * incremented the context's refcount before we do put_ctx below.
4816          */
4817         spin_lock(&child_ctx->lock);
4818         child->perf_event_ctxp = NULL;
4819         /*
4820          * If this context is a clone; unclone it so it can't get
4821          * swapped to another process while we're removing all
4822          * the events from it.
4823          */
4824         unclone_ctx(child_ctx);
4825         spin_unlock_irqrestore(&child_ctx->lock, flags);
4826
4827         /*
4828          * Report the task dead after unscheduling the events so that we
4829          * won't get any samples after PERF_RECORD_EXIT. We can however still
4830          * get a few PERF_RECORD_READ events.
4831          */
4832         perf_event_task(child, child_ctx, 0);
4833
4834         /*
4835          * We can recurse on the same lock type through:
4836          *
4837          *   __perf_event_exit_task()
4838          *     sync_child_event()
4839          *       fput(parent_event->filp)
4840          *         perf_release()
4841          *           mutex_lock(&ctx->mutex)
4842          *
4843          * But since its the parent context it won't be the same instance.
4844          */
4845         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4846
4847 again:
4848         list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4849                                  group_entry)
4850                 __perf_event_exit_task(child_event, child_ctx, child);
4851
4852         /*
4853          * If the last event was a group event, it will have appended all
4854          * its siblings to the list, but we obtained 'tmp' before that which
4855          * will still point to the list head terminating the iteration.
4856          */
4857         if (!list_empty(&child_ctx->group_list))
4858                 goto again;
4859
4860         mutex_unlock(&child_ctx->mutex);
4861
4862         put_ctx(child_ctx);
4863 }
4864
4865 /*
4866  * free an unexposed, unused context as created by inheritance by
4867  * init_task below, used by fork() in case of fail.
4868  */
4869 void perf_event_free_task(struct task_struct *task)
4870 {
4871         struct perf_event_context *ctx = task->perf_event_ctxp;
4872         struct perf_event *event, *tmp;
4873
4874         if (!ctx)
4875                 return;
4876
4877         mutex_lock(&ctx->mutex);
4878 again:
4879         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4880                 struct perf_event *parent = event->parent;
4881
4882                 if (WARN_ON_ONCE(!parent))
4883                         continue;
4884
4885                 mutex_lock(&parent->child_mutex);
4886                 list_del_init(&event->child_list);
4887                 mutex_unlock(&parent->child_mutex);
4888
4889                 fput(parent->filp);
4890
4891                 list_del_event(event, ctx);
4892                 free_event(event);
4893         }
4894
4895         if (!list_empty(&ctx->group_list))
4896                 goto again;
4897
4898         mutex_unlock(&ctx->mutex);
4899
4900         put_ctx(ctx);
4901 }
4902
4903 /*
4904  * Initialize the perf_event context in task_struct
4905  */
4906 int perf_event_init_task(struct task_struct *child)
4907 {
4908         struct perf_event_context *child_ctx, *parent_ctx;
4909         struct perf_event_context *cloned_ctx;
4910         struct perf_event *event;
4911         struct task_struct *parent = current;
4912         int inherited_all = 1;
4913         int ret = 0;
4914
4915         child->perf_event_ctxp = NULL;
4916
4917         mutex_init(&child->perf_event_mutex);
4918         INIT_LIST_HEAD(&child->perf_event_list);
4919
4920         if (likely(!parent->perf_event_ctxp))
4921                 return 0;
4922
4923         /*
4924          * This is executed from the parent task context, so inherit
4925          * events that have been marked for cloning.
4926          * First allocate and initialize a context for the child.
4927          */
4928
4929         child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4930         if (!child_ctx)
4931                 return -ENOMEM;
4932
4933         __perf_event_init_context(child_ctx, child);
4934         child->perf_event_ctxp = child_ctx;
4935         get_task_struct(child);
4936
4937         /*
4938          * If the parent's context is a clone, pin it so it won't get
4939          * swapped under us.
4940          */
4941         parent_ctx = perf_pin_task_context(parent);
4942
4943         /*
4944          * No need to check if parent_ctx != NULL here; since we saw
4945          * it non-NULL earlier, the only reason for it to become NULL
4946          * is if we exit, and since we're currently in the middle of
4947          * a fork we can't be exiting at the same time.
4948          */
4949
4950         /*
4951          * Lock the parent list. No need to lock the child - not PID
4952          * hashed yet and not running, so nobody can access it.
4953          */
4954         mutex_lock(&parent_ctx->mutex);
4955
4956         /*
4957          * We dont have to disable NMIs - we are only looking at
4958          * the list, not manipulating it:
4959          */
4960         list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4961
4962                 if (!event->attr.inherit) {
4963                         inherited_all = 0;
4964                         continue;
4965                 }
4966
4967                 ret = inherit_group(event, parent, parent_ctx,
4968                                              child, child_ctx);
4969                 if (ret) {
4970                         inherited_all = 0;
4971                         break;
4972                 }
4973         }
4974
4975         if (inherited_all) {
4976                 /*
4977                  * Mark the child context as a clone of the parent
4978                  * context, or of whatever the parent is a clone of.
4979                  * Note that if the parent is a clone, it could get
4980                  * uncloned at any point, but that doesn't matter
4981                  * because the list of events and the generation
4982                  * count can't have changed since we took the mutex.
4983                  */
4984                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4985                 if (cloned_ctx) {
4986                         child_ctx->parent_ctx = cloned_ctx;
4987                         child_ctx->parent_gen = parent_ctx->parent_gen;
4988                 } else {
4989                         child_ctx->parent_ctx = parent_ctx;
4990                         child_ctx->parent_gen = parent_ctx->generation;
4991                 }
4992                 get_ctx(child_ctx->parent_ctx);
4993         }
4994
4995         mutex_unlock(&parent_ctx->mutex);
4996
4997         perf_unpin_context(parent_ctx);
4998
4999         return ret;
5000 }
5001
5002 static void __cpuinit perf_event_init_cpu(int cpu)
5003 {
5004         struct perf_cpu_context *cpuctx;
5005
5006         cpuctx = &per_cpu(perf_cpu_context, cpu);
5007         __perf_event_init_context(&cpuctx->ctx, NULL);
5008
5009         spin_lock(&perf_resource_lock);
5010         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5011         spin_unlock(&perf_resource_lock);
5012
5013         hw_perf_event_setup(cpu);
5014 }
5015
5016 #ifdef CONFIG_HOTPLUG_CPU
5017 static void __perf_event_exit_cpu(void *info)
5018 {
5019         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5020         struct perf_event_context *ctx = &cpuctx->ctx;
5021         struct perf_event *event, *tmp;
5022
5023         list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5024                 __perf_event_remove_from_context(event);
5025 }
5026 static void perf_event_exit_cpu(int cpu)
5027 {
5028         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5029         struct perf_event_context *ctx = &cpuctx->ctx;
5030
5031         mutex_lock(&ctx->mutex);
5032         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5033         mutex_unlock(&ctx->mutex);
5034 }
5035 #else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
5037 #endif
5038
5039 static int __cpuinit
5040 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5041 {
5042         unsigned int cpu = (long)hcpu;
5043
5044         switch (action) {
5045
5046         case CPU_UP_PREPARE:
5047         case CPU_UP_PREPARE_FROZEN:
5048                 perf_event_init_cpu(cpu);
5049                 break;
5050
5051         case CPU_ONLINE:
5052         case CPU_ONLINE_FROZEN:
5053                 hw_perf_event_setup_online(cpu);
5054                 break;
5055
5056         case CPU_DOWN_PREPARE:
5057         case CPU_DOWN_PREPARE_FROZEN:
5058                 perf_event_exit_cpu(cpu);
5059                 break;
5060
5061         default:
5062                 break;
5063         }
5064
5065         return NOTIFY_OK;
5066 }
5067
5068 /*
5069  * This has to have a higher priority than migration_notifier in sched.c.
5070  */
5071 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5072         .notifier_call          = perf_cpu_notify,
5073         .priority               = 20,
5074 };
5075
5076 void __init perf_event_init(void)
5077 {
5078         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5079                         (void *)(long)smp_processor_id());
5080         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5081                         (void *)(long)smp_processor_id());
5082         register_cpu_notifier(&perf_cpu_nb);
5083 }
5084
5085 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5086 {
5087         return sprintf(buf, "%d\n", perf_reserved_percpu);
5088 }
5089
5090 static ssize_t
5091 perf_set_reserve_percpu(struct sysdev_class *class,
5092                         const char *buf,
5093                         size_t count)
5094 {
5095         struct perf_cpu_context *cpuctx;
5096         unsigned long val;
5097         int err, cpu, mpt;
5098
5099         err = strict_strtoul(buf, 10, &val);
5100         if (err)
5101                 return err;
5102         if (val > perf_max_events)
5103                 return -EINVAL;
5104
5105         spin_lock(&perf_resource_lock);
5106         perf_reserved_percpu = val;
5107         for_each_online_cpu(cpu) {
5108                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5109                 spin_lock_irq(&cpuctx->ctx.lock);
5110                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5111                           perf_max_events - perf_reserved_percpu);
5112                 cpuctx->max_pertask = mpt;
5113                 spin_unlock_irq(&cpuctx->ctx.lock);
5114         }
5115         spin_unlock(&perf_resource_lock);
5116
5117         return count;
5118 }
5119
5120 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5121 {
5122         return sprintf(buf, "%d\n", perf_overcommit);
5123 }
5124
5125 static ssize_t
5126 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5127 {
5128         unsigned long val;
5129         int err;
5130
5131         err = strict_strtoul(buf, 10, &val);
5132         if (err)
5133                 return err;
5134         if (val > 1)
5135                 return -EINVAL;
5136
5137         spin_lock(&perf_resource_lock);
5138         perf_overcommit = val;
5139         spin_unlock(&perf_resource_lock);
5140
5141         return count;
5142 }
5143
5144 static SYSDEV_CLASS_ATTR(
5145                                 reserve_percpu,
5146                                 0644,
5147                                 perf_show_reserve_percpu,
5148                                 perf_set_reserve_percpu
5149                         );
5150
5151 static SYSDEV_CLASS_ATTR(
5152                                 overcommit,
5153                                 0644,
5154                                 perf_show_overcommit,
5155                                 perf_set_overcommit
5156                         );
5157
5158 static struct attribute *perfclass_attrs[] = {
5159         &attr_reserve_percpu.attr,
5160         &attr_overcommit.attr,
5161         NULL
5162 };
5163
5164 static struct attribute_group perfclass_attr_group = {
5165         .attrs                  = perfclass_attrs,
5166         .name                   = "perf_events",
5167 };
5168
5169 static int __init perf_event_sysfs_init(void)
5170 {
5171         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5172                                   &perfclass_attr_group);
5173 }
5174 device_initcall(perf_event_sysfs_init);