kernel/perf_event.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/file.h>
  17 #include <linux/poll.h>
  18 #include <linux/sysfs.h>
  19 #include <linux/dcache.h>
  20 #include <linux/percpu.h>
  21 #include <linux/ptrace.h>
  22 #include <linux/vmstat.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/hardirq.h>
  25 #include <linux/rculist.h>
  26 #include <linux/uaccess.h>
  27 #include <linux/syscalls.h>
  28 #include <linux/anon_inodes.h>
  29 #include <linux/kernel_stat.h>
  30 #include <linux/perf_event.h>
  31 #include <linux/ftrace_event.h>
  32 #include <linux/hw_breakpoint.h>
  33
  34 #include <asm/irq_regs.h>
  35
/*
 * Each CPU has a list of per CPU events:
 */
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

/*
 * Event-count limits.  perf_max_events and perf_reserved_percpu are
 * both used to compute cpuctx->max_pertask when a per-CPU event is
 * removed (see __perf_event_remove_from_context()).
 * NOTE(review): perf_overcommit is not referenced in the part of the
 * file visible here; confirm its users before relying on it.
 */
int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Atomic counters.  Presumably the number of events of each kind that
 * currently exist; the code updating them is outside this section.
 */
static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

/*
 * NOTE(review): presumably the source of the ids returned by
 * primary_event_id(); the allocation site is not visible here.
 */
static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);
  72
  73 /*
  74  * Architecture provided APIs - weak aliases:
  75  */
  76 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
  77 {
  78         return NULL;
  79 }
  80
  81 void __weak hw_perf_disable(void)               { barrier(); }
  82 void __weak hw_perf_enable(void)                { barrier(); }
  83
  84 int __weak
  85 hw_perf_group_sched_in(struct perf_event *group_leader,
  86                struct perf_cpu_context *cpuctx,
  87                struct perf_event_context *ctx)
  88 {
  89         return 0;
  90 }
  91
  92 void __weak perf_event_print_debug(void)        { }
  93
  94 static DEFINE_PER_CPU(int, perf_disable_count);
  95
  96 void __perf_disable(void)
  97 {
  98         __get_cpu_var(perf_disable_count)++;
  99 }
 100
 101 bool __perf_enable(void)
 102 {
 103         return !--__get_cpu_var(perf_disable_count);
 104 }
 105
 106 void perf_disable(void)
 107 {
 108         __perf_disable();
 109         hw_perf_disable();
 110 }
 111
 112 void perf_enable(void)
 113 {
 114         if (__perf_enable())
 115                 hw_perf_enable();
 116 }
 117
 118 static void get_ctx(struct perf_event_context *ctx)
 119 {
 120         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
 121 }
 122
 123 static void free_ctx(struct rcu_head *head)
 124 {
 125         struct perf_event_context *ctx;
 126
 127         ctx = container_of(head, struct perf_event_context, rcu_head);
 128         kfree(ctx);
 129 }
 130
 131 static void put_ctx(struct perf_event_context *ctx)
 132 {
 133         if (atomic_dec_and_test(&ctx->refcount)) {
 134                 if (ctx->parent_ctx)
 135                         put_ctx(ctx->parent_ctx);
 136                 if (ctx->task)
 137                         put_task_struct(ctx->task);
 138                 call_rcu(&ctx->rcu_head, free_ctx);
 139         }
 140 }
 141
 142 static void unclone_ctx(struct perf_event_context *ctx)
 143 {
 144         if (ctx->parent_ctx) {
 145                 put_ctx(ctx->parent_ctx);
 146                 ctx->parent_ctx = NULL;
 147         }
 148 }
 149
 150 /*
 151  * If we inherit events we want to return the parent event id
 152  * to userspace.
 153  */
 154 static u64 primary_event_id(struct perf_event *event)
 155 {
 156         u64 id = event->id;
 157
 158         if (event->parent)
 159                 id = event->parent->id;
 160
 161         return id;
 162 }
 163
 164 /*
 165  * Get the perf_event_context for a task and lock it.
 166  * This has to cope with with the fact that until it is locked,
 167  * the context could get moved to another task.
 168  */
 169 static struct perf_event_context *
 170 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 171 {
 172         struct perf_event_context *ctx;
 173
 174         rcu_read_lock();
 175  retry:
 176         ctx = rcu_dereference(task->perf_event_ctxp);
 177         if (ctx) {
 178                 /*
 179                  * If this context is a clone of another, it might
 180                  * get swapped for another underneath us by
 181                  * perf_event_task_sched_out, though the
 182                  * rcu_read_lock() protects us from any context
 183                  * getting freed.  Lock the context and check if it
 184                  * got swapped before we could get the lock, and retry
 185                  * if so.  If we locked the right context, then it
 186                  * can't get swapped on us any more.
 187                  */
 188                 raw_spin_lock_irqsave(&ctx->lock, *flags);
 189                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
 190                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 191                         goto retry;
 192                 }
 193
 194                 if (!atomic_inc_not_zero(&ctx->refcount)) {
 195                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 196                         ctx = NULL;
 197                 }
 198         }
 199         rcu_read_unlock();
 200         return ctx;
 201 }
 202
 203 /*
 204  * Get the context for a task and increment its pin_count so it
 205  * can't get swapped to another task.  This also increments its
 206  * reference count so that the context can't get freed.
 207  */
 208 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
 209 {
 210         struct perf_event_context *ctx;
 211         unsigned long flags;
 212
 213         ctx = perf_lock_task_context(task, &flags);
 214         if (ctx) {
 215                 ++ctx->pin_count;
 216                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 217         }
 218         return ctx;
 219 }
 220
 221 static void perf_unpin_context(struct perf_event_context *ctx)
 222 {
 223         unsigned long flags;
 224
 225         raw_spin_lock_irqsave(&ctx->lock, flags);
 226         --ctx->pin_count;
 227         raw_spin_unlock_irqrestore(&ctx->lock, flags);
 228         put_ctx(ctx);
 229 }
 230
 231 static inline u64 perf_clock(void)
 232 {
 233         return cpu_clock(raw_smp_processor_id());
 234 }
 235
 236 /*
 237  * Update the record of the current time in a context.
 238  */
 239 static void update_context_time(struct perf_event_context *ctx)
 240 {
 241         u64 now = perf_clock();
 242
 243         ctx->time += now - ctx->timestamp;
 244         ctx->timestamp = now;
 245 }
 246
 247 /*
 248  * Update the total_time_enabled and total_time_running fields for a event.
 249  */
 250 static void update_event_times(struct perf_event *event)
 251 {
 252         struct perf_event_context *ctx = event->ctx;
 253         u64 run_end;
 254
 255         if (event->state < PERF_EVENT_STATE_INACTIVE ||
 256             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 257                 return;
 258
 259         if (ctx->is_active)
 260                 run_end = ctx->time;
 261         else
 262                 run_end = event->tstamp_stopped;
 263
 264         event->total_time_enabled = run_end - event->tstamp_enabled;
 265
 266         if (event->state == PERF_EVENT_STATE_INACTIVE)
 267                 run_end = event->tstamp_stopped;
 268         else
 269                 run_end = ctx->time;
 270
 271         event->total_time_running = run_end - event->tstamp_running;
 272 }
 273
 274 static struct list_head *
 275 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 276 {
 277         if (event->attr.pinned)
 278                 return &ctx->pinned_groups;
 279         else
 280                 return &ctx->flexible_groups;
 281 }
 282
 283 /*
 284  * Add a event from the lists for its context.
 285  * Must be called with ctx->mutex and ctx->lock held.
 286  */
 287 static void
 288 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 289 {
 290         struct perf_event *group_leader = event->group_leader;
 291
 292         /*
 293          * Depending on whether it is a standalone or sibling event,
 294          * add it straight to the context's event list, or to the group
 295          * leader's sibling list:
 296          */
 297         if (group_leader == event) {
 298                 struct list_head *list;
 299
 300                 if (is_software_event(event))
 301                         event->group_flags |= PERF_GROUP_SOFTWARE;
 302
 303                 list = ctx_group_list(event, ctx);
 304                 list_add_tail(&event->group_entry, list);
 305         } else {
 306                 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
 307                     !is_software_event(event))
 308                         group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
 309
 310                 list_add_tail(&event->group_entry, &group_leader->sibling_list);
 311                 group_leader->nr_siblings++;
 312         }
 313
 314         list_add_rcu(&event->event_entry, &ctx->event_list);
 315         ctx->nr_events++;
 316         if (event->attr.inherit_stat)
 317                 ctx->nr_stat++;
 318 }
 319
 320 /*
 321  * Remove a event from the lists for its context.
 322  * Must be called with ctx->mutex and ctx->lock held.
 323  */
 324 static void
 325 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 326 {
 327         struct perf_event *sibling, *tmp;
 328
 329         if (list_empty(&event->group_entry))
 330                 return;
 331         ctx->nr_events--;
 332         if (event->attr.inherit_stat)
 333                 ctx->nr_stat--;
 334
 335         list_del_init(&event->group_entry);
 336         list_del_rcu(&event->event_entry);
 337
 338         if (event->group_leader != event)
 339                 event->group_leader->nr_siblings--;
 340
 341         update_event_times(event);
 342
 343         /*
 344          * If event was in error state, then keep it
 345          * that way, otherwise bogus counts will be
 346          * returned on read(). The only way to get out
 347          * of error state is by explicit re-enabling
 348          * of the event
 349          */
 350         if (event->state > PERF_EVENT_STATE_OFF)
 351                 event->state = PERF_EVENT_STATE_OFF;
 352
 353         /*
 354          * If this was a group event with sibling events then
 355          * upgrade the siblings to singleton events by adding them
 356          * to the context list directly:
 357          */
 358         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 359                 struct list_head *list;
 360
 361                 list = ctx_group_list(event, ctx);
 362                 list_move_tail(&sibling->group_entry, list);
 363                 sibling->group_leader = sibling;
 364
 365                 /* Inherit group flags from the previous leader */
 366                 sibling->group_flags = event->group_flags;
 367         }
 368 }
 369
 370 static void
 371 event_sched_out(struct perf_event *event,
 372                   struct perf_cpu_context *cpuctx,
 373                   struct perf_event_context *ctx)
 374 {
 375         if (event->state != PERF_EVENT_STATE_ACTIVE)
 376                 return;
 377
 378         event->state = PERF_EVENT_STATE_INACTIVE;
 379         if (event->pending_disable) {
 380                 event->pending_disable = 0;
 381                 event->state = PERF_EVENT_STATE_OFF;
 382         }
 383         event->tstamp_stopped = ctx->time;
 384         event->pmu->disable(event);
 385         event->oncpu = -1;
 386
 387         if (!is_software_event(event))
 388                 cpuctx->active_oncpu--;
 389         ctx->nr_active--;
 390         if (event->attr.exclusive || !cpuctx->active_oncpu)
 391                 cpuctx->exclusive = 0;
 392 }
 393
 394 static void
 395 group_sched_out(struct perf_event *group_event,
 396                 struct perf_cpu_context *cpuctx,
 397                 struct perf_event_context *ctx)
 398 {
 399         struct perf_event *event;
 400
 401         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
 402                 return;
 403
 404         event_sched_out(group_event, cpuctx, ctx);
 405
 406         /*
 407          * Schedule out siblings (if any):
 408          */
 409         list_for_each_entry(event, &group_event->sibling_list, group_entry)
 410                 event_sched_out(event, cpuctx, ctx);
 411
 412         if (group_event->attr.exclusive)
 413                 cpuctx->exclusive = 0;
 414 }
 415
 416 /*
 417  * Cross CPU call to remove a performance event
 418  *
 419  * We disable the event on the hardware level first. After that we
 420  * remove it from the context list.
 421  */
 422 static void __perf_event_remove_from_context(void *info)
 423 {
 424         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 425         struct perf_event *event = info;
 426         struct perf_event_context *ctx = event->ctx;
 427
 428         /*
 429          * If this is a task context, we need to check whether it is
 430          * the current task context of this cpu. If not it has been
 431          * scheduled out before the smp call arrived.
 432          */
 433         if (ctx->task && cpuctx->task_ctx != ctx)
 434                 return;
 435
 436         raw_spin_lock(&ctx->lock);
 437         /*
 438          * Protect the list operation against NMI by disabling the
 439          * events on a global level.
 440          */
 441         perf_disable();
 442
 443         event_sched_out(event, cpuctx, ctx);
 444
 445         list_del_event(event, ctx);
 446
 447         if (!ctx->task) {
 448                 /*
 449                  * Allow more per task events with respect to the
 450                  * reservation:
 451                  */
 452                 cpuctx->max_pertask =
 453                         min(perf_max_events - ctx->nr_events,
 454                             perf_max_events - perf_reserved_percpu);
 455         }
 456
 457         perf_enable();
 458         raw_spin_unlock(&ctx->lock);
 459 }
 460
 461
 462 /*
 463  * Remove the event from a task's (or a CPU's) list of events.
 464  *
 465  * Must be called with ctx->mutex held.
 466  *
 467  * CPU events are removed with a smp call. For task events we only
 468  * call when the task is on a CPU.
 469  *
 470  * If event->ctx is a cloned context, callers must make sure that
 471  * every task struct that event->ctx->task could possibly point to
 472  * remains valid.  This is OK when called from perf_release since
 473  * that only calls us on the top-level context, which can't be a clone.
 474  * When called from perf_event_exit_task, it's OK because the
 475  * context has been detached from its task.
 476  */
 477 static void perf_event_remove_from_context(struct perf_event *event)
 478 {
 479         struct perf_event_context *ctx = event->ctx;
 480         struct task_struct *task = ctx->task;
 481
 482         if (!task) {
 483                 /*
 484                  * Per cpu events are removed via an smp call and
 485                  * the removal is always successful.
 486                  */
 487                 smp_call_function_single(event->cpu,
 488                                          __perf_event_remove_from_context,
 489                                          event, 1);
 490                 return;
 491         }
 492
 493 retry:
 494         task_oncpu_function_call(task, __perf_event_remove_from_context,
 495                                  event);
 496
 497         raw_spin_lock_irq(&ctx->lock);
 498         /*
 499          * If the context is active we need to retry the smp call.
 500          */
 501         if (ctx->nr_active && !list_empty(&event->group_entry)) {
 502                 raw_spin_unlock_irq(&ctx->lock);
 503                 goto retry;
 504         }
 505
 506         /*
 507          * The lock prevents that this context is scheduled in so we
 508          * can remove the event safely, if the call above did not
 509          * succeed.
 510          */
 511         if (!list_empty(&event->group_entry))
 512                 list_del_event(event, ctx);
 513         raw_spin_unlock_irq(&ctx->lock);
 514 }
 515
 516 /*
 517  * Update total_time_enabled and total_time_running for all events in a group.
 518  */
 519 static void update_group_times(struct perf_event *leader)
 520 {
 521         struct perf_event *event;
 522
 523         update_event_times(leader);
 524         list_for_each_entry(event, &leader->sibling_list, group_entry)
 525                 update_event_times(event);
 526 }
 527
 528 /*
 529  * Cross CPU call to disable a performance event
 530  */
 531 static void __perf_event_disable(void *info)
 532 {
 533         struct perf_event *event = info;
 534         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 535         struct perf_event_context *ctx = event->ctx;
 536
 537         /*
 538          * If this is a per-task event, need to check whether this
 539          * event's task is the current task on this cpu.
 540          */
 541         if (ctx->task && cpuctx->task_ctx != ctx)
 542                 return;
 543
 544         raw_spin_lock(&ctx->lock);
 545
 546         /*
 547          * If the event is on, turn it off.
 548          * If it is in error state, leave it in error state.
 549          */
 550         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 551                 update_context_time(ctx);
 552                 update_group_times(event);
 553                 if (event == event->group_leader)
 554                         group_sched_out(event, cpuctx, ctx);
 555                 else
 556                         event_sched_out(event, cpuctx, ctx);
 557                 event->state = PERF_EVENT_STATE_OFF;
 558         }
 559
 560         raw_spin_unlock(&ctx->lock);
 561 }
 562
 563 /*
 564  * Disable a event.
 565  *
 566  * If event->ctx is a cloned context, callers must make sure that
 567  * every task struct that event->ctx->task could possibly point to
 568  * remains valid.  This condition is satisifed when called through
 569  * perf_event_for_each_child or perf_event_for_each because they
 570  * hold the top-level event's child_mutex, so any descendant that
 571  * goes to exit will block in sync_child_event.
 572  * When called from perf_pending_event it's OK because event->ctx
 573  * is the current context on this CPU and preemption is disabled,
 574  * hence we can't get into perf_event_task_sched_out for this context.
 575  */
 576 void perf_event_disable(struct perf_event *event)
 577 {
 578         struct perf_event_context *ctx = event->ctx;
 579         struct task_struct *task = ctx->task;
 580
 581         if (!task) {
 582                 /*
 583                  * Disable the event on the cpu that it's on
 584                  */
 585                 smp_call_function_single(event->cpu, __perf_event_disable,
 586                                          event, 1);
 587                 return;
 588         }
 589
 590  retry:
 591         task_oncpu_function_call(task, __perf_event_disable, event);
 592
 593         raw_spin_lock_irq(&ctx->lock);
 594         /*
 595          * If the event is still active, we need to retry the cross-call.
 596          */
 597         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 598                 raw_spin_unlock_irq(&ctx->lock);
 599                 goto retry;
 600         }
 601
 602         /*
 603          * Since we have the lock this context can't be scheduled
 604          * in, so we can change the state safely.
 605          */
 606         if (event->state == PERF_EVENT_STATE_INACTIVE) {
 607                 update_group_times(event);
 608                 event->state = PERF_EVENT_STATE_OFF;
 609         }
 610
 611         raw_spin_unlock_irq(&ctx->lock);
 612 }
 613
 614 static int
 615 event_sched_in(struct perf_event *event,
 616                  struct perf_cpu_context *cpuctx,
 617                  struct perf_event_context *ctx)
 618 {
 619         if (event->state <= PERF_EVENT_STATE_OFF)
 620                 return 0;
 621
 622         event->state = PERF_EVENT_STATE_ACTIVE;
 623         event->oncpu = smp_processor_id();
 624         /*
 625          * The new state must be visible before we turn it on in the hardware:
 626          */
 627         smp_wmb();
 628
 629         if (event->pmu->enable(event)) {
 630                 event->state = PERF_EVENT_STATE_INACTIVE;
 631                 event->oncpu = -1;
 632                 return -EAGAIN;
 633         }
 634
 635         event->tstamp_running += ctx->time - event->tstamp_stopped;
 636
 637         if (!is_software_event(event))
 638                 cpuctx->active_oncpu++;
 639         ctx->nr_active++;
 640
 641         if (event->attr.exclusive)
 642                 cpuctx->exclusive = 1;
 643
 644         return 0;
 645 }
 646
 647 static int
 648 group_sched_in(struct perf_event *group_event,
 649                struct perf_cpu_context *cpuctx,
 650                struct perf_event_context *ctx)
 651 {
 652         struct perf_event *event, *partial_group;
 653         int ret;
 654
 655         if (group_event->state == PERF_EVENT_STATE_OFF)
 656                 return 0;
 657
 658         ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 659         if (ret)
 660                 return ret < 0 ? ret : 0;
 661
 662         if (event_sched_in(group_event, cpuctx, ctx))
 663                 return -EAGAIN;
 664
 665         /*
 666          * Schedule in siblings as one group (if any):
 667          */
 668         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 669                 if (event_sched_in(event, cpuctx, ctx)) {
 670                         partial_group = event;
 671                         goto group_error;
 672                 }
 673         }
 674
 675         return 0;
 676
 677 group_error:
 678         /*
 679          * Groups can be scheduled in as one unit only, so undo any
 680          * partial group before returning:
 681          */
 682         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 683                 if (event == partial_group)
 684                         break;
 685                 event_sched_out(event, cpuctx, ctx);
 686         }
 687         event_sched_out(group_event, cpuctx, ctx);
 688
 689         return -EAGAIN;
 690 }
 691
 692 /*
 693  * Work out whether we can put this event group on the CPU now.
 694  */
 695 static int group_can_go_on(struct perf_event *event,
 696                            struct perf_cpu_context *cpuctx,
 697                            int can_add_hw)
 698 {
 699         /*
 700          * Groups consisting entirely of software events can always go on.
 701          */
 702         if (event->group_flags & PERF_GROUP_SOFTWARE)
 703                 return 1;
 704         /*
 705          * If an exclusive group is already on, no other hardware
 706          * events can go on.
 707          */
 708         if (cpuctx->exclusive)
 709                 return 0;
 710         /*
 711          * If this group is exclusive and there are already
 712          * events on the CPU, it can't go on.
 713          */
 714         if (event->attr.exclusive && cpuctx->active_oncpu)
 715                 return 0;
 716         /*
 717          * Otherwise, try to add it if all previous groups were able
 718          * to go on.
 719          */
 720         return can_add_hw;
 721 }
 722
 723 static void add_event_to_ctx(struct perf_event *event,
 724                                struct perf_event_context *ctx)
 725 {
 726         list_add_event(event, ctx);
 727         event->tstamp_enabled = ctx->time;
 728         event->tstamp_running = ctx->time;
 729         event->tstamp_stopped = ctx->time;
 730 }
 731
 732 /*
 733  * Cross CPU call to install and enable a performance event
 734  *
 735  * Must be called with ctx->mutex held
 736  */
 737 static void __perf_install_in_context(void *info)
 738 {
 739         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 740         struct perf_event *event = info;
 741         struct perf_event_context *ctx = event->ctx;
 742         struct perf_event *leader = event->group_leader;
 743         int err;
 744
 745         /*
 746          * If this is a task context, we need to check whether it is
 747          * the current task context of this cpu. If not it has been
 748          * scheduled out before the smp call arrived.
 749          * Or possibly this is the right context but it isn't
 750          * on this cpu because it had no events.
 751          */
 752         if (ctx->task && cpuctx->task_ctx != ctx) {
 753                 if (cpuctx->task_ctx || ctx->task != current)
 754                         return;
 755                 cpuctx->task_ctx = ctx;
 756         }
 757
 758         raw_spin_lock(&ctx->lock);
 759         ctx->is_active = 1;
 760         update_context_time(ctx);
 761
 762         /*
 763          * Protect the list operation against NMI by disabling the
 764          * events on a global level. NOP for non NMI based events.
 765          */
 766         perf_disable();
 767
 768         add_event_to_ctx(event, ctx);
 769
 770         if (event->cpu != -1 && event->cpu != smp_processor_id())
 771                 goto unlock;
 772
 773         /*
 774          * Don't put the event on if it is disabled or if
 775          * it is in a group and the group isn't on.
 776          */
 777         if (event->state != PERF_EVENT_STATE_INACTIVE ||
 778             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
 779                 goto unlock;
 780
 781         /*
 782          * An exclusive event can't go on if there are already active
 783          * hardware events, and no hardware event can go on if there
 784          * is already an exclusive event on.
 785          */
 786         if (!group_can_go_on(event, cpuctx, 1))
 787                 err = -EEXIST;
 788         else
 789                 err = event_sched_in(event, cpuctx, ctx);
 790
 791         if (err) {
 792                 /*
 793                  * This event couldn't go on.  If it is in a group
 794                  * then we have to pull the whole group off.
 795                  * If the event group is pinned then put it in error state.
 796                  */
 797                 if (leader != event)
 798                         group_sched_out(leader, cpuctx, ctx);
 799                 if (leader->attr.pinned) {
 800                         update_group_times(leader);
 801                         leader->state = PERF_EVENT_STATE_ERROR;
 802                 }
 803         }
 804
 805         if (!err && !ctx->task && cpuctx->max_pertask)
 806                 cpuctx->max_pertask--;
 807
 808  unlock:
 809         perf_enable();
 810
 811         raw_spin_unlock(&ctx->lock);
 812 }
 813
 814 /*
 815  * Attach a performance event to a context
 816  *
 817  * First we add the event to the list with the hardware enable bit
 818  * in event->hw_config cleared.
 819  *
 820  * If the event is attached to a task which is on a CPU we use a smp
 821  * call to enable it in the task context. The task might have been
 822  * scheduled away, but we check this in the smp call again.
 823  *
 824  * Must be called with ctx->mutex held.
 825  */
 826 static void
 827 perf_install_in_context(struct perf_event_context *ctx,
 828                         struct perf_event *event,
 829                         int cpu)
 830 {
 831         struct task_struct *task = ctx->task;
 832
 833         if (!task) {
 834                 /*
 835                  * Per cpu events are installed via an smp call and
 836                  * the install is always successful.
 837                  */
 838                 smp_call_function_single(cpu, __perf_install_in_context,
 839                                          event, 1);
 840                 return;
 841         }
 842
 843 retry:
 844         task_oncpu_function_call(task, __perf_install_in_context,
 845                                  event);
 846
 847         raw_spin_lock_irq(&ctx->lock);
 848         /*
 849          * we need to retry the smp call.
 850          */
 851         if (ctx->is_active && list_empty(&event->group_entry)) {
 852                 raw_spin_unlock_irq(&ctx->lock);
 853                 goto retry;
 854         }
 855
 856         /*
 857          * The lock prevents that this context is scheduled in so we
 858          * can add the event safely, if it the call above did not
 859          * succeed.
 860          */
 861         if (list_empty(&event->group_entry))
 862                 add_event_to_ctx(event, ctx);
 863         raw_spin_unlock_irq(&ctx->lock);
 864 }
 865
 866 /*
 867  * Put a event into inactive state and update time fields.
 868  * Enabling the leader of a group effectively enables all
 869  * the group members that aren't explicitly disabled, so we
 870  * have to update their ->tstamp_enabled also.
 871  * Note: this works for group members as well as group leaders
 872  * since the non-leader members' sibling_lists will be empty.
 873  */
 874 static void __perf_event_mark_enabled(struct perf_event *event,
 875                                         struct perf_event_context *ctx)
 876 {
 877         struct perf_event *sub;
 878
 879         event->state = PERF_EVENT_STATE_INACTIVE;
 880         event->tstamp_enabled = ctx->time - event->total_time_enabled;
 881         list_for_each_entry(sub, &event->sibling_list, group_entry)
 882                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
 883                         sub->tstamp_enabled =
 884                                 ctx->time - sub->total_time_enabled;
 885 }
 886
 887 /*
 888  * Cross CPU call to enable a performance event
 889  */
 890 static void __perf_event_enable(void *info)
 891 {
 892         struct perf_event *event = info;
 893         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 894         struct perf_event_context *ctx = event->ctx;
 895         struct perf_event *leader = event->group_leader;
 896         int err;
 897
 898         /*
 899          * If this is a per-task event, need to check whether this
 900          * event's task is the current task on this cpu.
 901          */
 902         if (ctx->task && cpuctx->task_ctx != ctx) {
 903                 if (cpuctx->task_ctx || ctx->task != current)
 904                         return;
 905                 cpuctx->task_ctx = ctx;
 906         }
 907
 908         raw_spin_lock(&ctx->lock);
 909         ctx->is_active = 1;
 910         update_context_time(ctx);
 911
 912         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 913                 goto unlock;
 914         __perf_event_mark_enabled(event, ctx);
 915
 916         if (event->cpu != -1 && event->cpu != smp_processor_id())
 917                 goto unlock;
 918
 919         /*
 920          * If the event is in a group and isn't the group leader,
 921          * then don't put it on unless the group is on.
 922          */
 923         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
 924                 goto unlock;
 925
 926         if (!group_can_go_on(event, cpuctx, 1)) {
 927                 err = -EEXIST;
 928         } else {
 929                 perf_disable();
 930                 if (event == leader)
 931                         err = group_sched_in(event, cpuctx, ctx);
 932                 else
 933                         err = event_sched_in(event, cpuctx, ctx);
 934                 perf_enable();
 935         }
 936
 937         if (err) {
 938                 /*
 939                  * If this event can't go on and it's part of a
 940                  * group, then the whole group has to come off.
 941                  */
 942                 if (leader != event)
 943                         group_sched_out(leader, cpuctx, ctx);
 944                 if (leader->attr.pinned) {
 945                         update_group_times(leader);
 946                         leader->state = PERF_EVENT_STATE_ERROR;
 947                 }
 948         }
 949
 950  unlock:
 951         raw_spin_unlock(&ctx->lock);
 952 }
 953
 954 /*
 955  * Enable a event.
 956  *
 957  * If event->ctx is a cloned context, callers must make sure that
 958  * every task struct that event->ctx->task could possibly point to
 959  * remains valid.  This condition is satisfied when called through
 960  * perf_event_for_each_child or perf_event_for_each as described
 961  * for perf_event_disable.
 962  */
 963 void perf_event_enable(struct perf_event *event)
 964 {
 965         struct perf_event_context *ctx = event->ctx;
 966         struct task_struct *task = ctx->task;
 967
 968         if (!task) {
 969                 /*
 970                  * Enable the event on the cpu that it's on
 971                  */
 972                 smp_call_function_single(event->cpu, __perf_event_enable,
 973                                          event, 1);
 974                 return;
 975         }
 976
 977         raw_spin_lock_irq(&ctx->lock);
 978         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 979                 goto out;
 980
 981         /*
 982          * If the event is in error state, clear that first.
 983          * That way, if we see the event in error state below, we
 984          * know that it has gone back into error state, as distinct
 985          * from the task having been scheduled away before the
 986          * cross-call arrived.
 987          */
 988         if (event->state == PERF_EVENT_STATE_ERROR)
 989                 event->state = PERF_EVENT_STATE_OFF;
 990
 991  retry:
 992         raw_spin_unlock_irq(&ctx->lock);
 993         task_oncpu_function_call(task, __perf_event_enable, event);
 994
 995         raw_spin_lock_irq(&ctx->lock);
 996
 997         /*
 998          * If the context is active and the event is still off,
 999          * we need to retry the cross-call.
1000          */
1001         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
1002                 goto retry;
1003
1004         /*
1005          * Since we have the lock this context can't be scheduled
1006          * in, so we can change the state safely.
1007          */
1008         if (event->state == PERF_EVENT_STATE_OFF)
1009                 __perf_event_mark_enabled(event, ctx);
1010
1011  out:
1012         raw_spin_unlock_irq(&ctx->lock);
1013 }
1014
1015 static int perf_event_refresh(struct perf_event *event, int refresh)
1016 {
1017         /*
1018          * not supported on inherited events
1019          */
1020         if (event->attr.inherit)
1021                 return -EINVAL;
1022
1023         atomic_add(refresh, &event->event_limit);
1024         perf_event_enable(event);
1025
1026         return 0;
1027 }
1028
/*
 * Bit mask selecting which of a context's group lists a scheduling
 * operation walks: flexible_groups, pinned_groups, or both.
 */
enum event_type_t {
	EVENT_FLEXIBLE	= 1 << 0,
	EVENT_PINNED	= 1 << 1,
	EVENT_ALL	= EVENT_FLEXIBLE | EVENT_PINNED,
};
1034
1035 static void ctx_sched_out(struct perf_event_context *ctx,
1036                           struct perf_cpu_context *cpuctx,
1037                           enum event_type_t event_type)
1038 {
1039         struct perf_event *event;
1040
1041         raw_spin_lock(&ctx->lock);
1042         ctx->is_active = 0;
1043         if (likely(!ctx->nr_events))
1044                 goto out;
1045         update_context_time(ctx);
1046
1047         perf_disable();
1048         if (!ctx->nr_active)
1049                 goto out_enable;
1050
1051         if (event_type & EVENT_PINNED)
1052                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1053                         group_sched_out(event, cpuctx, ctx);
1054
1055         if (event_type & EVENT_FLEXIBLE)
1056                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1057                         group_sched_out(event, cpuctx, ctx);
1058
1059  out_enable:
1060         perf_enable();
1061  out:
1062         raw_spin_unlock(&ctx->lock);
1063 }
1064
1065 /*
1066  * Test whether two contexts are equivalent, i.e. whether they
1067  * have both been cloned from the same version of the same context
1068  * and they both have the same number of enabled events.
1069  * If the number of enabled events is the same, then the set
1070  * of enabled events should be the same, because these are both
1071  * inherited contexts, therefore we can't access individual events
1072  * in them directly with an fd; we can only enable/disable all
1073  * events via prctl, or enable/disable all events in a family
1074  * via ioctl, which will have the same effect on both contexts.
1075  */
1076 static int context_equiv(struct perf_event_context *ctx1,
1077                          struct perf_event_context *ctx2)
1078 {
1079         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1080                 && ctx1->parent_gen == ctx2->parent_gen
1081                 && !ctx1->pin_count && !ctx2->pin_count;
1082 }
1083
1084 static void __perf_event_sync_stat(struct perf_event *event,
1085                                      struct perf_event *next_event)
1086 {
1087         u64 value;
1088
1089         if (!event->attr.inherit_stat)
1090                 return;
1091
1092         /*
1093          * Update the event value, we cannot use perf_event_read()
1094          * because we're in the middle of a context switch and have IRQs
1095          * disabled, which upsets smp_call_function_single(), however
1096          * we know the event must be on the current CPU, therefore we
1097          * don't need to use it.
1098          */
1099         switch (event->state) {
1100         case PERF_EVENT_STATE_ACTIVE:
1101                 event->pmu->read(event);
1102                 /* fall-through */
1103
1104         case PERF_EVENT_STATE_INACTIVE:
1105                 update_event_times(event);
1106                 break;
1107
1108         default:
1109                 break;
1110         }
1111
1112         /*
1113          * In order to keep per-task stats reliable we need to flip the event
1114          * values when we flip the contexts.
1115          */
1116         value = atomic64_read(&next_event->count);
1117         value = atomic64_xchg(&event->count, value);
1118         atomic64_set(&next_event->count, value);
1119
1120         swap(event->total_time_enabled, next_event->total_time_enabled);
1121         swap(event->total_time_running, next_event->total_time_running);
1122
1123         /*
1124          * Since we swizzled the values, update the user visible data too.
1125          */
1126         perf_event_update_userpage(event);
1127         perf_event_update_userpage(next_event);
1128 }
1129
/*
 * list_next_entry - get the element that follows @pos in its list
 * @pos:	pointer to the current element
 * @member:	name of the struct list_head field inside the element
 *
 * No end-of-list check is made here; callers compare the resulting
 * entry's list_head against the list head themselves.
 *
 * @pos is parenthesised so that any pointer expression may be passed,
 * not just a plain identifier.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1132
1133 static void perf_event_sync_stat(struct perf_event_context *ctx,
1134                                    struct perf_event_context *next_ctx)
1135 {
1136         struct perf_event *event, *next_event;
1137
1138         if (!ctx->nr_stat)
1139                 return;
1140
1141         update_context_time(ctx);
1142
1143         event = list_first_entry(&ctx->event_list,
1144                                    struct perf_event, event_entry);
1145
1146         next_event = list_first_entry(&next_ctx->event_list,
1147                                         struct perf_event, event_entry);
1148
1149         while (&event->event_entry != &ctx->event_list &&
1150                &next_event->event_entry != &next_ctx->event_list) {
1151
1152                 __perf_event_sync_stat(event, next_event);
1153
1154                 event = list_next_entry(event, event_entry);
1155                 next_event = list_next_entry(next_event, event_entry);
1156         }
1157 }
1158
1159 /*
1160  * Called from scheduler to remove the events of the current task,
1161  * with interrupts disabled.
1162  *
1163  * We stop each event and update the event value in event->count.
1164  *
1165  * This does not protect us against NMI, but disable()
1166  * sets the disabled bit in the control field of event _before_
1167  * accessing the event control register. If a NMI hits, then it will
1168  * not restart the event.
1169  */
1170 void perf_event_task_sched_out(struct task_struct *task,
1171                                  struct task_struct *next)
1172 {
1173         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1174         struct perf_event_context *ctx = task->perf_event_ctxp;
1175         struct perf_event_context *next_ctx;
1176         struct perf_event_context *parent;
1177         struct pt_regs *regs;
1178         int do_switch = 1;
1179
1180         regs = task_pt_regs(task);
1181         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1182
1183         if (likely(!ctx || !cpuctx->task_ctx))
1184                 return;
1185
1186         rcu_read_lock();
1187         parent = rcu_dereference(ctx->parent_ctx);
1188         next_ctx = next->perf_event_ctxp;
1189         if (parent && next_ctx &&
1190             rcu_dereference(next_ctx->parent_ctx) == parent) {
1191                 /*
1192                  * Looks like the two contexts are clones, so we might be
1193                  * able to optimize the context switch.  We lock both
1194                  * contexts and check that they are clones under the
1195                  * lock (including re-checking that neither has been
1196                  * uncloned in the meantime).  It doesn't matter which
1197                  * order we take the locks because no other cpu could
1198                  * be trying to lock both of these tasks.
1199                  */
1200                 raw_spin_lock(&ctx->lock);
1201                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1202                 if (context_equiv(ctx, next_ctx)) {
1203                         /*
1204                          * XXX do we need a memory barrier of sorts
1205                          * wrt to rcu_dereference() of perf_event_ctxp
1206                          */
1207                         task->perf_event_ctxp = next_ctx;
1208                         next->perf_event_ctxp = ctx;
1209                         ctx->task = next;
1210                         next_ctx->task = task;
1211                         do_switch = 0;
1212
1213                         perf_event_sync_stat(ctx, next_ctx);
1214                 }
1215                 raw_spin_unlock(&next_ctx->lock);
1216                 raw_spin_unlock(&ctx->lock);
1217         }
1218         rcu_read_unlock();
1219
1220         if (do_switch) {
1221                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1222                 cpuctx->task_ctx = NULL;
1223         }
1224 }
1225
1226 static void task_ctx_sched_out(struct perf_event_context *ctx,
1227                                enum event_type_t event_type)
1228 {
1229         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1230
1231         if (!cpuctx->task_ctx)
1232                 return;
1233
1234         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1235                 return;
1236
1237         ctx_sched_out(ctx, cpuctx, event_type);
1238         cpuctx->task_ctx = NULL;
1239 }
1240
1241 /*
1242  * Called with IRQs disabled
1243  */
1244 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1245 {
1246         task_ctx_sched_out(ctx, EVENT_ALL);
1247 }
1248
1249 /*
1250  * Called with IRQs disabled
1251  */
1252 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1253                               enum event_type_t event_type)
1254 {
1255         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1256 }
1257
1258 static void
1259 ctx_pinned_sched_in(struct perf_event_context *ctx,
1260                     struct perf_cpu_context *cpuctx)
1261 {
1262         struct perf_event *event;
1263
1264         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1265                 if (event->state <= PERF_EVENT_STATE_OFF)
1266                         continue;
1267                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1268                         continue;
1269
1270                 if (group_can_go_on(event, cpuctx, 1))
1271                         group_sched_in(event, cpuctx, ctx);
1272
1273                 /*
1274                  * If this pinned group hasn't been scheduled,
1275                  * put it in error state.
1276                  */
1277                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1278                         update_group_times(event);
1279                         event->state = PERF_EVENT_STATE_ERROR;
1280                 }
1281         }
1282 }
1283
1284 static void
1285 ctx_flexible_sched_in(struct perf_event_context *ctx,
1286                       struct perf_cpu_context *cpuctx)
1287 {
1288         struct perf_event *event;
1289         int can_add_hw = 1;
1290
1291         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1292                 /* Ignore events in OFF or ERROR state */
1293                 if (event->state <= PERF_EVENT_STATE_OFF)
1294                         continue;
1295                 /*
1296                  * Listen to the 'cpu' scheduling filter constraint
1297                  * of events:
1298                  */
1299                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1300                         continue;
1301
1302                 if (group_can_go_on(event, cpuctx, can_add_hw))
1303                         if (group_sched_in(event, cpuctx, ctx))
1304                                 can_add_hw = 0;
1305         }
1306 }
1307
1308 static void
1309 ctx_sched_in(struct perf_event_context *ctx,
1310              struct perf_cpu_context *cpuctx,
1311              enum event_type_t event_type)
1312 {
1313         raw_spin_lock(&ctx->lock);
1314         ctx->is_active = 1;
1315         if (likely(!ctx->nr_events))
1316                 goto out;
1317
1318         ctx->timestamp = perf_clock();
1319
1320         perf_disable();
1321
1322         /*
1323          * First go through the list and put on any pinned groups
1324          * in order to give them the best chance of going on.
1325          */
1326         if (event_type & EVENT_PINNED)
1327                 ctx_pinned_sched_in(ctx, cpuctx);
1328
1329         /* Then walk through the lower prio flexible groups */
1330         if (event_type & EVENT_FLEXIBLE)
1331                 ctx_flexible_sched_in(ctx, cpuctx);
1332
1333         perf_enable();
1334  out:
1335         raw_spin_unlock(&ctx->lock);
1336 }
1337
1338 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1339                              enum event_type_t event_type)
1340 {
1341         struct perf_event_context *ctx = &cpuctx->ctx;
1342
1343         ctx_sched_in(ctx, cpuctx, event_type);
1344 }
1345
1346 static void task_ctx_sched_in(struct task_struct *task,
1347                               enum event_type_t event_type)
1348 {
1349         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1350         struct perf_event_context *ctx = task->perf_event_ctxp;
1351
1352         if (likely(!ctx))
1353                 return;
1354         if (cpuctx->task_ctx == ctx)
1355                 return;
1356         ctx_sched_in(ctx, cpuctx, event_type);
1357         cpuctx->task_ctx = ctx;
1358 }
1359 /*
1360  * Called from scheduler to add the events of the current task
1361  * with interrupts disabled.
1362  *
1363  * We restore the event value and then enable it.
1364  *
1365  * This does not protect us against NMI, but enable()
1366  * sets the enabled bit in the control field of event _before_
1367  * accessing the event control register. If a NMI hits, then it will
1368  * keep the event running.
1369  */
1370 void perf_event_task_sched_in(struct task_struct *task)
1371 {
1372         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1373         struct perf_event_context *ctx = task->perf_event_ctxp;
1374
1375         if (likely(!ctx))
1376                 return;
1377
1378         if (cpuctx->task_ctx == ctx)
1379                 return;
1380
1381         /*
1382          * We want to keep the following priority order:
1383          * cpu pinned (that don't need to move), task pinned,
1384          * cpu flexible, task flexible.
1385          */
1386         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1387
1388         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1389         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1390         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1391
1392         cpuctx->task_ctx = ctx;
1393 }
1394
/*
 * Marker value for hw_perf_event::interrupts: perf_ctx_adjust_freq()
 * logs and unthrottles an event on the tick when the event's interrupt
 * count equals it.
 * NOTE(review): presumably stored by the overflow path when an event
 * gets throttled -- confirm against the code that sets it.
 */
#define MAX_INTERRUPTS (~0ULL)
1396
1397 static void perf_log_throttle(struct perf_event *event, int enable);
1398
1399 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1400 {
1401         u64 frequency = event->attr.sample_freq;
1402         u64 sec = NSEC_PER_SEC;
1403         u64 divisor, dividend;
1404
1405         int count_fls, nsec_fls, frequency_fls, sec_fls;
1406
1407         count_fls = fls64(count);
1408         nsec_fls = fls64(nsec);
1409         frequency_fls = fls64(frequency);
1410         sec_fls = 30;
1411
1412         /*
1413          * We got @count in @nsec, with a target of sample_freq HZ
1414          * the target period becomes:
1415          *
1416          *             @count * 10^9
1417          * period = -------------------
1418          *          @nsec * sample_freq
1419          *
1420          */
1421
1422         /*
1423          * Reduce accuracy by one bit such that @a and @b converge
1424          * to a similar magnitude.
1425          */
1426 #define REDUCE_FLS(a, b)                \
1427 do {                                    \
1428         if (a##_fls > b##_fls) {        \
1429                 a >>= 1;                \
1430                 a##_fls--;              \
1431         } else {                        \
1432                 b >>= 1;                \
1433                 b##_fls--;              \
1434         }                               \
1435 } while (0)
1436
1437         /*
1438          * Reduce accuracy until either term fits in a u64, then proceed with
1439          * the other, so that finally we can do a u64/u64 division.
1440          */
1441         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1442                 REDUCE_FLS(nsec, frequency);
1443                 REDUCE_FLS(sec, count);
1444         }
1445
1446         if (count_fls + sec_fls > 64) {
1447                 divisor = nsec * frequency;
1448
1449                 while (count_fls + sec_fls > 64) {
1450                         REDUCE_FLS(count, sec);
1451                         divisor >>= 1;
1452                 }
1453
1454                 dividend = count * sec;
1455         } else {
1456                 dividend = count * sec;
1457
1458                 while (nsec_fls + frequency_fls > 64) {
1459                         REDUCE_FLS(nsec, frequency);
1460                         dividend >>= 1;
1461                 }
1462
1463                 divisor = nsec * frequency;
1464         }
1465
1466         return div64_u64(dividend, divisor);
1467 }
1468
1469 static void perf_event_stop(struct perf_event *event)
1470 {
1471         if (!event->pmu->stop)
1472                 return event->pmu->disable(event);
1473
1474         return event->pmu->stop(event);
1475 }
1476
1477 static int perf_event_start(struct perf_event *event)
1478 {
1479         if (!event->pmu->start)
1480                 return event->pmu->enable(event);
1481
1482         return event->pmu->start(event);
1483 }
1484
1485 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1486 {
1487         struct hw_perf_event *hwc = &event->hw;
1488         u64 period, sample_period;
1489         s64 delta;
1490
1491         period = perf_calculate_period(event, nsec, count);
1492
1493         delta = (s64)(period - hwc->sample_period);
1494         delta = (delta + 7) / 8; /* low pass filter */
1495
1496         sample_period = hwc->sample_period + delta;
1497
1498         if (!sample_period)
1499                 sample_period = 1;
1500
1501         hwc->sample_period = sample_period;
1502
1503         if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1504                 perf_disable();
1505                 perf_event_stop(event);
1506                 atomic64_set(&hwc->period_left, 0);
1507                 perf_event_start(event);
1508                 perf_enable();
1509         }
1510 }
1511
1512 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1513 {
1514         struct perf_event *event;
1515         struct hw_perf_event *hwc;
1516         u64 interrupts, now;
1517         s64 delta;
1518
1519         raw_spin_lock(&ctx->lock);
1520         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1521                 if (event->state != PERF_EVENT_STATE_ACTIVE)
1522                         continue;
1523
1524                 if (event->cpu != -1 && event->cpu != smp_processor_id())
1525                         continue;
1526
1527                 hwc = &event->hw;
1528
1529                 interrupts = hwc->interrupts;
1530                 hwc->interrupts = 0;
1531
1532                 /*
1533                  * unthrottle events on the tick
1534                  */
1535                 if (interrupts == MAX_INTERRUPTS) {
1536                         perf_log_throttle(event, 1);
1537                         event->pmu->unthrottle(event);
1538                 }
1539
1540                 if (!event->attr.freq || !event->attr.sample_freq)
1541                         continue;
1542
1543                 event->pmu->read(event);
1544                 now = atomic64_read(&event->count);
1545                 delta = now - hwc->freq_count_stamp;
1546                 hwc->freq_count_stamp = now;
1547
1548                 if (delta > 0)
1549                         perf_adjust_period(event, TICK_NSEC, delta);
1550         }
1551         raw_spin_unlock(&ctx->lock);
1552 }
1553
1554 /*
1555  * Round-robin a context's events:
1556  */
1557 static void rotate_ctx(struct perf_event_context *ctx)
1558 {
1559         if (!ctx->nr_events)
1560                 return;
1561
1562         raw_spin_lock(&ctx->lock);
1563
1564         /* Rotate the first entry last of non-pinned groups */
1565         list_rotate_left(&ctx->flexible_groups);
1566
1567         raw_spin_unlock(&ctx->lock);
1568 }
1569
1570 void perf_event_task_tick(struct task_struct *curr)
1571 {
1572         struct perf_cpu_context *cpuctx;
1573         struct perf_event_context *ctx;
1574
1575         if (!atomic_read(&nr_events))
1576                 return;
1577
1578         cpuctx = &__get_cpu_var(perf_cpu_context);
1579         ctx = curr->perf_event_ctxp;
1580
1581         perf_disable();
1582
1583         perf_ctx_adjust_freq(&cpuctx->ctx);
1584         if (ctx)
1585                 perf_ctx_adjust_freq(ctx);
1586
1587         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1588         if (ctx)
1589                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1590
1591         rotate_ctx(&cpuctx->ctx);
1592         if (ctx)
1593                 rotate_ctx(ctx);
1594
1595         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1596         if (ctx)
1597                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1598
1599         perf_enable();
1600 }
1601
1602 static int event_enable_on_exec(struct perf_event *event,
1603                                 struct perf_event_context *ctx)
1604 {
1605         if (!event->attr.enable_on_exec)
1606                 return 0;
1607
1608         event->attr.enable_on_exec = 0;
1609         if (event->state >= PERF_EVENT_STATE_INACTIVE)
1610                 return 0;
1611
1612         __perf_event_mark_enabled(event, ctx);
1613
1614         return 1;
1615 }
1616
1617 /*
1618  * Enable all of a task's events that have been marked enable-on-exec.
1619  * This expects task == current.
1620  */
1621 static void perf_event_enable_on_exec(struct task_struct *task)
1622 {
1623         struct perf_event_context *ctx;
1624         struct perf_event *event;
1625         unsigned long flags;
1626         int enabled = 0;
1627         int ret;
1628
1629         local_irq_save(flags);
1630         ctx = task->perf_event_ctxp;
1631         if (!ctx || !ctx->nr_events)
1632                 goto out;
1633
1634         __perf_event_task_sched_out(ctx);
1635
1636         raw_spin_lock(&ctx->lock);
1637
1638         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1639                 ret = event_enable_on_exec(event, ctx);
1640                 if (ret)
1641                         enabled = 1;
1642         }
1643
1644         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1645                 ret = event_enable_on_exec(event, ctx);
1646                 if (ret)
1647                         enabled = 1;
1648         }
1649
1650         /*
1651          * Unclone this context if we enabled any event.
1652          */
1653         if (enabled)
1654                 unclone_ctx(ctx);
1655
1656         raw_spin_unlock(&ctx->lock);
1657
1658         perf_event_task_sched_in(task);
1659  out:
1660         local_irq_restore(flags);
1661 }
1662
1663 /*
1664  * Cross CPU call to read the hardware event
1665  */
1666 static void __perf_event_read(void *info)
1667 {
1668         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1669         struct perf_event *event = info;
1670         struct perf_event_context *ctx = event->ctx;
1671
1672         /*
1673          * If this is a task context, we need to check whether it is
1674          * the current task context of this cpu.  If not it has been
1675          * scheduled out before the smp call arrived.  In that case
1676          * event->count would have been updated to a recent sample
1677          * when the event was scheduled out.
1678          */
1679         if (ctx->task && cpuctx->task_ctx != ctx)
1680                 return;
1681
1682         raw_spin_lock(&ctx->lock);
1683         update_context_time(ctx);
1684         update_event_times(event);
1685         raw_spin_unlock(&ctx->lock);
1686
1687         event->pmu->read(event);
1688 }
1689
1690 static u64 perf_event_read(struct perf_event *event)
1691 {
1692         /*
1693          * If event is enabled and currently active on a CPU, update the
1694          * value in the event structure:
1695          */
1696         if (event->state == PERF_EVENT_STATE_ACTIVE) {
1697                 smp_call_function_single(event->oncpu,
1698                                          __perf_event_read, event, 1);
1699         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1700                 struct perf_event_context *ctx = event->ctx;
1701                 unsigned long flags;
1702
1703                 raw_spin_lock_irqsave(&ctx->lock, flags);
1704                 update_context_time(ctx);
1705                 update_event_times(event);
1706                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1707         }
1708
1709         return atomic64_read(&event->count);
1710 }
1711
1712 /*
1713  * Initialize the perf_event context in a task_struct:
1714  */
1715 static void
1716 __perf_event_init_context(struct perf_event_context *ctx,
1717                             struct task_struct *task)
1718 {
1719         raw_spin_lock_init(&ctx->lock);
1720         mutex_init(&ctx->mutex);
1721         INIT_LIST_HEAD(&ctx->pinned_groups);
1722         INIT_LIST_HEAD(&ctx->flexible_groups);
1723         INIT_LIST_HEAD(&ctx->event_list);
1724         atomic_set(&ctx->refcount, 1);
1725         ctx->task = task;
1726 }
1727
1728 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1729 {
1730         struct perf_event_context *ctx;
1731         struct perf_cpu_context *cpuctx;
1732         struct task_struct *task;
1733         unsigned long flags;
1734         int err;
1735
1736         if (pid == -1 && cpu != -1) {
1737                 /* Must be root to operate on a CPU event: */
1738                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1739                         return ERR_PTR(-EACCES);
1740
1741                 if (cpu < 0 || cpu >= nr_cpumask_bits)
1742                         return ERR_PTR(-EINVAL);
1743
1744                 /*
1745                  * We could be clever and allow to attach a event to an
1746                  * offline CPU and activate it when the CPU comes up, but
1747                  * that's for later.
1748                  */
1749                 if (!cpu_online(cpu))
1750                         return ERR_PTR(-ENODEV);
1751
1752                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1753                 ctx = &cpuctx->ctx;
1754                 get_ctx(ctx);
1755
1756                 return ctx;
1757         }
1758
1759         rcu_read_lock();
1760         if (!pid)
1761                 task = current;
1762         else
1763                 task = find_task_by_vpid(pid);
1764         if (task)
1765                 get_task_struct(task);
1766         rcu_read_unlock();
1767
1768         if (!task)
1769                 return ERR_PTR(-ESRCH);
1770
1771         /*
1772          * Can't attach events to a dying task.
1773          */
1774         err = -ESRCH;
1775         if (task->flags & PF_EXITING)
1776                 goto errout;
1777
1778         /* Reuse ptrace permission checks for now. */
1779         err = -EACCES;
1780         if (!ptrace_may_access(task, PTRACE_MODE_READ))
1781                 goto errout;
1782
1783  retry:
1784         ctx = perf_lock_task_context(task, &flags);
1785         if (ctx) {
1786                 unclone_ctx(ctx);
1787                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1788         }
1789
1790         if (!ctx) {
1791                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1792                 err = -ENOMEM;
1793                 if (!ctx)
1794                         goto errout;
1795                 __perf_event_init_context(ctx, task);
1796                 get_ctx(ctx);
1797                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1798                         /*
1799                          * We raced with some other task; use
1800                          * the context they set.
1801                          */
1802                         kfree(ctx);
1803                         goto retry;
1804                 }
1805                 get_task_struct(task);
1806         }
1807
1808         put_task_struct(task);
1809         return ctx;
1810
1811  errout:
1812         put_task_struct(task);
1813         return ERR_PTR(err);
1814 }
1815
1816 static void perf_event_free_filter(struct perf_event *event);
1817
1818 static void free_event_rcu(struct rcu_head *head)
1819 {
1820         struct perf_event *event;
1821
1822         event = container_of(head, struct perf_event, rcu_head);
1823         if (event->ns)
1824                 put_pid_ns(event->ns);
1825         perf_event_free_filter(event);
1826         kfree(event);
1827 }
1828
1829 static void perf_pending_sync(struct perf_event *event);
1830
/*
 * Final teardown of an event.
 *
 * Waits for work still queued through event->pending, drops the global
 * counters (kept only for events that have no parent), releases the file
 * of a redirected output (if any), runs the event's destroy callback,
 * drops the context reference and hands the memory to free_event_rcu().
 */
static void free_event(struct perf_event *event)
{
        /* Sleeps until event->pending is no longer on a pending list. */
        perf_pending_sync(event);

        /* The nr_* counters are only maintained for parent-less events. */
        if (!event->parent) {
                atomic_dec(&nr_events);
                if (event->attr.mmap)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_dec(&nr_comm_events);
                if (event->attr.task)
                        atomic_dec(&nr_task_events);
        }

        /* Drop the file reference held on the output target. */
        if (event->output) {
                fput(event->output->filp);
                event->output = NULL;
        }

        if (event->destroy)
                event->destroy(event);

        put_ctx(event->ctx);
        /* The structure itself is freed from the RCU callback. */
        call_rcu(&event->rcu_head, free_event_rcu);
}
1856
/*
 * Release an event: remove it from its context, unlink it from its
 * owner's event list, drop the task reference held on the owner and
 * free the event.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        WARN_ON_ONCE(ctx->parent_ctx);
        /* Removal from the context is done under ctx->mutex. */
        mutex_lock(&ctx->mutex);
        perf_event_remove_from_context(event);
        mutex_unlock(&ctx->mutex);

        /* The owner's list is manipulated under the owner's perf_event_mutex. */
        mutex_lock(&event->owner->perf_event_mutex);
        list_del_init(&event->owner_entry);
        mutex_unlock(&event->owner->perf_event_mutex);
        put_task_struct(event->owner);

        free_event(event);

        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1876
1877 /*
1878  * Called when the last reference to the file is gone.
1879  */
1880 static int perf_release(struct inode *inode, struct file *file)
1881 {
1882         struct perf_event *event = file->private_data;
1883
1884         file->private_data = NULL;
1885
1886         return perf_event_release_kernel(event);
1887 }
1888
1889 static int perf_event_read_size(struct perf_event *event)
1890 {
1891         int entry = sizeof(u64); /* value */
1892         int size = 0;
1893         int nr = 1;
1894
1895         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1896                 size += sizeof(u64);
1897
1898         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1899                 size += sizeof(u64);
1900
1901         if (event->attr.read_format & PERF_FORMAT_ID)
1902                 entry += sizeof(u64);
1903
1904         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1905                 nr += event->group_leader->nr_siblings;
1906                 size += sizeof(u64);
1907         }
1908
1909         size += entry * nr;
1910
1911         return size;
1912 }
1913
1914 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1915 {
1916         struct perf_event *child;
1917         u64 total = 0;
1918
1919         *enabled = 0;
1920         *running = 0;
1921
1922         mutex_lock(&event->child_mutex);
1923         total += perf_event_read(event);
1924         *enabled += event->total_time_enabled +
1925                         atomic64_read(&event->child_total_time_enabled);
1926         *running += event->total_time_running +
1927                         atomic64_read(&event->child_total_time_running);
1928
1929         list_for_each_entry(child, &event->child_list, child_list) {
1930                 total += perf_event_read(child);
1931                 *enabled += child->total_time_enabled;
1932                 *running += child->total_time_running;
1933         }
1934         mutex_unlock(&event->child_mutex);
1935
1936         return total;
1937 }
1938 EXPORT_SYMBOL_GPL(perf_event_read_value);
1939
/*
 * read() back-end for events using PERF_FORMAT_GROUP.
 *
 * Writes the following u64 words to @buf:
 *
 *   nr                 1 + number of siblings of the group leader
 *   time_enabled       only with PERF_FORMAT_TOTAL_TIME_ENABLED
 *   time_running       only with PERF_FORMAT_TOTAL_TIME_RUNNING
 *   value [, id]       for the leader (id only with PERF_FORMAT_ID)
 *   value [, id]       for each sibling, in sibling_list order
 *
 * Returns the number of bytes copied, or -EFAULT when a copy to user
 * space fails. The leader's ctx->mutex is held across the whole read.
 */
static int perf_event_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader, *sub;
        int n = 0, size = 0, ret = -EFAULT;
        struct perf_event_context *ctx = leader->ctx;
        /* Largest record is the leader's: nr, enabled, running, value, id. */
        u64 values[5];
        u64 count, enabled, running;

        mutex_lock(&ctx->mutex);
        count = perf_event_read_value(leader, &enabled, &running);

        values[n++] = 1 + leader->nr_siblings;
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                values[n++] = enabled;
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;
        values[n++] = count;
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);

        size = n * sizeof(u64);

        if (copy_to_user(buf, values, size))
                goto unlock;

        /* From here on ret tracks the number of bytes written so far. */
        ret = size;

        list_for_each_entry(sub, &leader->sibling_list, group_entry) {
                n = 0;

                /* Sibling times are read but not reported. */
                values[n++] = perf_event_read_value(sub, &enabled, &running);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);

                size = n * sizeof(u64);

                if (copy_to_user(buf + ret, values, size)) {
                        ret = -EFAULT;
                        goto unlock;
                }

                ret += size;
        }
unlock:
        mutex_unlock(&ctx->mutex);

        return ret;
}
1989
1990 static int perf_event_read_one(struct perf_event *event,
1991                                  u64 read_format, char __user *buf)
1992 {
1993         u64 enabled, running;
1994         u64 values[4];
1995         int n = 0;
1996
1997         values[n++] = perf_event_read_value(event, &enabled, &running);
1998         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1999                 values[n++] = enabled;
2000         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2001                 values[n++] = running;
2002         if (read_format & PERF_FORMAT_ID)
2003                 values[n++] = primary_event_id(event);
2004
2005         if (copy_to_user(buf, values, n * sizeof(u64)))
2006                 return -EFAULT;
2007
2008         return n * sizeof(u64);
2009 }
2010
2011 /*
2012  * Read the performance event - simple non blocking version for now
2013  */
2014 static ssize_t
2015 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2016 {
2017         u64 read_format = event->attr.read_format;
2018         int ret;
2019
2020         /*
2021          * Return end-of-file for a read on a event that is in
2022          * error state (i.e. because it was pinned but it couldn't be
2023          * scheduled on to the CPU at some point).
2024          */
2025         if (event->state == PERF_EVENT_STATE_ERROR)
2026                 return 0;
2027
2028         if (count < perf_event_read_size(event))
2029                 return -ENOSPC;
2030
2031         WARN_ON_ONCE(event->ctx->parent_ctx);
2032         if (read_format & PERF_FORMAT_GROUP)
2033                 ret = perf_event_read_group(event, read_format, buf);
2034         else
2035                 ret = perf_event_read_one(event, read_format, buf);
2036
2037         return ret;
2038 }
2039
2040 static ssize_t
2041 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
2042 {
2043         struct perf_event *event = file->private_data;
2044
2045         return perf_read_hw(event, buf, count);
2046 }
2047
2048 static unsigned int perf_poll(struct file *file, poll_table *wait)
2049 {
2050         struct perf_event *event = file->private_data;
2051         struct perf_mmap_data *data;
2052         unsigned int events = POLL_HUP;
2053
2054         rcu_read_lock();
2055         data = rcu_dereference(event->data);
2056         if (data)
2057                 events = atomic_xchg(&data->poll, 0);
2058         rcu_read_unlock();
2059
2060         poll_wait(file, &event->waitq, wait);
2061
2062         return events;
2063 }
2064
/*
 * Reset the count of @event to zero and refresh the user-visible page.
 */
static void perf_event_reset(struct perf_event *event)
{
        /*
         * The value returned by the read is deliberately discarded.
         * NOTE(review): perf_event_read() is defined outside this view;
         * presumably it syncs the count before it is cleared -- confirm.
         */
        (void)perf_event_read(event);
        atomic64_set(&event->count, 0);
        perf_event_update_userpage(event);
}
2071
2072 /*
2073  * Holding the top-level event's child_mutex means that any
2074  * descendant process that has inherited this event will block
2075  * in sync_child_event if it goes to exit, thus satisfying the
2076  * task existence requirements of perf_event_enable/disable.
2077  */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);
        mutex_lock(&event->child_mutex);
        /* Apply func to the event itself first, then to each child. */
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}
2090
2091 static void perf_event_for_each(struct perf_event *event,
2092                                   void (*func)(struct perf_event *))
2093 {
2094         struct perf_event_context *ctx = event->ctx;
2095         struct perf_event *sibling;
2096
2097         WARN_ON_ONCE(ctx->parent_ctx);
2098         mutex_lock(&ctx->mutex);
2099         event = event->group_leader;
2100
2101         perf_event_for_each_child(event, func);
2102         func(event);
2103         list_for_each_entry(sibling, &event->sibling_list, group_entry)
2104                 perf_event_for_each_child(event, func);
2105         mutex_unlock(&ctx->mutex);
2106 }
2107
2108 static int perf_event_period(struct perf_event *event, u64 __user *arg)
2109 {
2110         struct perf_event_context *ctx = event->ctx;
2111         unsigned long size;
2112         int ret = 0;
2113         u64 value;
2114
2115         if (!event->attr.sample_period)
2116                 return -EINVAL;
2117
2118         size = copy_from_user(&value, arg, sizeof(value));
2119         if (size != sizeof(value))
2120                 return -EFAULT;
2121
2122         if (!value)
2123                 return -EINVAL;
2124
2125         raw_spin_lock_irq(&ctx->lock);
2126         if (event->attr.freq) {
2127                 if (value > sysctl_perf_event_sample_rate) {
2128                         ret = -EINVAL;
2129                         goto unlock;
2130                 }
2131
2132                 event->attr.sample_freq = value;
2133         } else {
2134                 event->attr.sample_period = value;
2135                 event->hw.sample_period = value;
2136         }
2137 unlock:
2138         raw_spin_unlock_irq(&ctx->lock);
2139
2140         return ret;
2141 }
2142
2143 static int perf_event_set_output(struct perf_event *event, int output_fd);
2144 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2145
2146 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2147 {
2148         struct perf_event *event = file->private_data;
2149         void (*func)(struct perf_event *);
2150         u32 flags = arg;
2151
2152         switch (cmd) {
2153         case PERF_EVENT_IOC_ENABLE:
2154                 func = perf_event_enable;
2155                 break;
2156         case PERF_EVENT_IOC_DISABLE:
2157                 func = perf_event_disable;
2158                 break;
2159         case PERF_EVENT_IOC_RESET:
2160                 func = perf_event_reset;
2161                 break;
2162
2163         case PERF_EVENT_IOC_REFRESH:
2164                 return perf_event_refresh(event, arg);
2165
2166         case PERF_EVENT_IOC_PERIOD:
2167                 return perf_event_period(event, (u64 __user *)arg);
2168
2169         case PERF_EVENT_IOC_SET_OUTPUT:
2170                 return perf_event_set_output(event, arg);
2171
2172         case PERF_EVENT_IOC_SET_FILTER:
2173                 return perf_event_set_filter(event, (void __user *)arg);
2174
2175         default:
2176                 return -ENOTTY;
2177         }
2178
2179         if (flags & PERF_IOC_FLAG_GROUP)
2180                 perf_event_for_each(event, func);
2181         else
2182                 perf_event_for_each_child(event, func);
2183
2184         return 0;
2185 }
2186
2187 int perf_event_task_enable(void)
2188 {
2189         struct perf_event *event;
2190
2191         mutex_lock(&current->perf_event_mutex);
2192         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2193                 perf_event_for_each_child(event, perf_event_enable);
2194         mutex_unlock(&current->perf_event_mutex);
2195
2196         return 0;
2197 }
2198
2199 int perf_event_task_disable(void)
2200 {
2201         struct perf_event *event;
2202
2203         mutex_lock(&current->perf_event_mutex);
2204         list_for_each_entry(event, &current->perf_event_list, owner_entry)
2205                 perf_event_for_each_child(event, perf_event_disable);
2206         mutex_unlock(&current->perf_event_mutex);
2207
2208         return 0;
2209 }
2210
2211 #ifndef PERF_EVENT_INDEX_OFFSET
2212 # define PERF_EVENT_INDEX_OFFSET 0
2213 #endif
2214
2215 static int perf_event_index(struct perf_event *event)
2216 {
2217         if (event->state != PERF_EVENT_STATE_ACTIVE)
2218                 return 0;
2219
2220         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2221 }
2222
2223 /*
2224  * Callers need to ensure there can be no nesting of this function, otherwise
2225  * the seqlock logic goes bad. We can not serialize this because the arch
2226  * code calls this from NMI context.
2227  */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_mmap_data *data;

        /* event->data is RCU protected; nothing to update if it is NULL. */
        rcu_read_lock();
        data = rcu_dereference(event->data);
        if (!data)
                goto unlock;

        userpg = data->user_page;

        /*
         * Disable preemption so as to not let the corresponding user-space
         * spin too long if we get preempted.
         */
        preempt_disable();
        /*
         * userpg->lock is incremented once before and once after the
         * fields are written, with compiler barriers in between, so the
         * field updates stay bracketed by the two increments.
         */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = atomic64_read(&event->count);
        /* For an active event, subtract the last recorded hw.prev_count. */
        if (event->state == PERF_EVENT_STATE_ACTIVE)
                userpg->offset -= atomic64_read(&event->hw.prev_count);

        userpg->time_enabled = event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = event->total_time_running +
                        atomic64_read(&event->child_total_time_running);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
2264
2265 static unsigned long perf_data_size(struct perf_mmap_data *data)
2266 {
2267         return data->nr_pages << (PAGE_SHIFT + data->data_order);
2268 }
2269
2270 #ifndef CONFIG_PERF_USE_VMALLOC
2271
2272 /*
 * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
2274  */
2275
2276 static struct page *
2277 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2278 {
2279         if (pgoff > data->nr_pages)
2280                 return NULL;
2281
2282         if (pgoff == 0)
2283                 return virt_to_page(data->user_page);
2284
2285         return virt_to_page(data->data_pages[pgoff - 1]);
2286 }
2287
2288 static struct perf_mmap_data *
2289 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2290 {
2291         struct perf_mmap_data *data;
2292         unsigned long size;
2293         int i;
2294
2295         WARN_ON(atomic_read(&event->mmap_count));
2296
2297         size = sizeof(struct perf_mmap_data);
2298         size += nr_pages * sizeof(void *);
2299
2300         data = kzalloc(size, GFP_KERNEL);
2301         if (!data)
2302                 goto fail;
2303
2304         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2305         if (!data->user_page)
2306                 goto fail_user_page;
2307
2308         for (i = 0; i < nr_pages; i++) {
2309                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2310                 if (!data->data_pages[i])
2311                         goto fail_data_pages;
2312         }
2313
2314         data->data_order = 0;
2315         data->nr_pages = nr_pages;
2316
2317         return data;
2318
2319 fail_data_pages:
2320         for (i--; i >= 0; i--)
2321                 free_page((unsigned long)data->data_pages[i]);
2322
2323         free_page((unsigned long)data->user_page);
2324
2325 fail_user_page:
2326         kfree(data);
2327
2328 fail:
2329         return NULL;
2330 }
2331
2332 static void perf_mmap_free_page(unsigned long addr)
2333 {
2334         struct page *page = virt_to_page((void *)addr);
2335
2336         page->mapping = NULL;
2337         __free_page(page);
2338 }
2339
2340 static void perf_mmap_data_free(struct perf_mmap_data *data)
2341 {
2342         int i;
2343
2344         perf_mmap_free_page((unsigned long)data->user_page);
2345         for (i = 0; i < data->nr_pages; i++)
2346                 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2347         kfree(data);
2348 }
2349
2350 #else
2351
2352 /*
2353  * Back perf_mmap() with vmalloc memory.
2354  *
2355  * Required for architectures that have d-cache aliasing issues.
2356  */
2357
2358 static struct page *
2359 perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2360 {
2361         if (pgoff > (1UL << data->data_order))
2362                 return NULL;
2363
2364         return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2365 }
2366
2367 static void perf_mmap_unmark_page(void *addr)
2368 {
2369         struct page *page = vmalloc_to_page(addr);
2370
2371         page->mapping = NULL;
2372 }
2373
2374 static void perf_mmap_data_free_work(struct work_struct *work)
2375 {
2376         struct perf_mmap_data *data;
2377         void *base;
2378         int i, nr;
2379
2380         data = container_of(work, struct perf_mmap_data, work);
2381         nr = 1 << data->data_order;
2382
2383         base = data->user_page;
2384         for (i = 0; i < nr + 1; i++)
2385                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2386
2387         vfree(base);
2388         kfree(data);
2389 }
2390
/*
 * Free a vmalloc-backed buffer. The actual work is not done here but
 * deferred to perf_mmap_data_free_work() through the work item embedded
 * in @data (initialised in perf_mmap_data_alloc()).
 */
static void perf_mmap_data_free(struct perf_mmap_data *data)
{
        schedule_work(&data->work);
}
2395
2396 static struct perf_mmap_data *
2397 perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2398 {
2399         struct perf_mmap_data *data;
2400         unsigned long size;
2401         void *all_buf;
2402
2403         WARN_ON(atomic_read(&event->mmap_count));
2404
2405         size = sizeof(struct perf_mmap_data);
2406         size += sizeof(void *);
2407
2408         data = kzalloc(size, GFP_KERNEL);
2409         if (!data)
2410                 goto fail;
2411
2412         INIT_WORK(&data->work, perf_mmap_data_free_work);
2413
2414         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2415         if (!all_buf)
2416                 goto fail_all_buf;
2417
2418         data->user_page = all_buf;
2419         data->data_pages[0] = all_buf + PAGE_SIZE;
2420         data->data_order = ilog2(nr_pages);
2421         data->nr_pages = 1;
2422
2423         return data;
2424
2425 fail_all_buf:
2426         kfree(data);
2427
2428 fail:
2429         return NULL;
2430 }
2431
2432 #endif
2433
/*
 * ->fault() and ->page_mkwrite() handler for perf mmap()s.
 *
 * Returns 0 when the fault was handled and VM_FAULT_SIGBUS otherwise
 * (no buffer, offset outside the buffer, or a write to any page other
 * than page 0).
 */
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct perf_event *event = vma->vm_file->private_data;
        struct perf_mmap_data *data;
        int ret = VM_FAULT_SIGBUS;

        /* Making a page writable is only permitted for page 0. */
        if (vmf->flags & FAULT_FLAG_MKWRITE) {
                if (vmf->pgoff == 0)
                        ret = 0;
                return ret;
        }

        rcu_read_lock();
        data = rcu_dereference(event->data);
        if (!data)
                goto unlock;

        /* Write faults on anything but page 0 are refused. */
        if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
                goto unlock;

        vmf->page = perf_mmap_to_page(data, vmf->pgoff);
        if (!vmf->page)
                goto unlock;

        /* Take a page reference and tie the page to the file's mapping. */
        get_page(vmf->page);
        vmf->page->mapping = vma->vm_file->f_mapping;
        vmf->page->index   = vmf->pgoff;

        ret = 0;
unlock:
        rcu_read_unlock();

        return ret;
}
2468
2469 static void
2470 perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2471 {
2472         long max_size = perf_data_size(data);
2473
2474         atomic_set(&data->lock, -1);
2475
2476         if (event->attr.watermark) {
2477                 data->watermark = min_t(long, max_size,
2478                                         event->attr.wakeup_watermark);
2479         }
2480
2481         if (!data->watermark)
2482                 data->watermark = max_size / 2;
2483
2484
2485         rcu_assign_pointer(event->data, data);
2486 }
2487
2488 static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2489 {
2490         struct perf_mmap_data *data;
2491
2492         data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2493         perf_mmap_data_free(data);
2494 }
2495
/*
 * Detach the buffer from @event and free it via an RCU callback, so
 * that readers that obtained it through rcu_dereference(event->data)
 * can finish first. Expected to run once no mapping is left
 * (mmap_count == 0).
 */
static void perf_mmap_data_release(struct perf_event *event)
{
        struct perf_mmap_data *data = event->data;

        WARN_ON(atomic_read(&event->mmap_count));

        /* Unpublish first, then defer the free past a grace period. */
        rcu_assign_pointer(event->data, NULL);
        call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
}
2505
/*
 * vm_operations ->open(): account one more mapping of the event's
 * buffer. The matching decrement is in perf_mmap_close().
 */
static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
}
2512
/*
 * vm_operations ->close(): drop one mapping of the event's buffer.
 *
 * When the last mapping goes away (mmap_count drops to zero, which is
 * done with event->mmap_mutex held) the locked-memory accounting done in
 * perf_mmap() is undone and the buffer is released.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
                unsigned long size = perf_data_size(event->data);
                struct user_struct *user = current_user();

                /* Data pages plus the user page, as charged in perf_mmap(). */
                atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
                /* The part that was charged to the mm instead of the user. */
                vma->vm_mm->locked_vm -= event->data->nr_locked;
                perf_mmap_data_release(event);
                mutex_unlock(&event->mmap_mutex);
        }
}
2528
/*
 * VM operations installed on perf mmap()s by perf_mmap().
 *
 * perf_mmap_fault() serves both as ->fault and as ->page_mkwrite; it
 * tells the two apart through FAULT_FLAG_MKWRITE.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .open           = perf_mmap_open,
        .close          = perf_mmap_close,
        .fault          = perf_mmap_fault,
        .page_mkwrite   = perf_mmap_fault,
};
2535
2536 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2537 {
2538         struct perf_event *event = file->private_data;
2539         unsigned long user_locked, user_lock_limit;
2540         struct user_struct *user = current_user();
2541         unsigned long locked, lock_limit;
2542         struct perf_mmap_data *data;
2543         unsigned long vma_size;
2544         unsigned long nr_pages;
2545         long user_extra, extra;
2546         int ret = 0;
2547
2548         if (!(vma->vm_flags & VM_SHARED))
2549                 return -EINVAL;
2550
2551         vma_size = vma->vm_end - vma->vm_start;
2552         nr_pages = (vma_size / PAGE_SIZE) - 1;
2553
2554         /*
2555          * If we have data pages ensure they're a power-of-two number, so we
2556          * can do bitmasks instead of modulo.
2557          */
2558         if (nr_pages != 0 && !is_power_of_2(nr_pages))
2559                 return -EINVAL;
2560
2561         if (vma_size != PAGE_SIZE * (1 + nr_pages))
2562                 return -EINVAL;
2563
2564         if (vma->vm_pgoff != 0)
2565                 return -EINVAL;
2566
2567         WARN_ON_ONCE(event->ctx->parent_ctx);
2568         mutex_lock(&event->mmap_mutex);
2569         if (event->output) {
2570                 ret = -EINVAL;
2571                 goto unlock;
2572         }
2573
2574         if (atomic_inc_not_zero(&event->mmap_count)) {
2575                 if (nr_pages != event->data->nr_pages)
2576                         ret = -EINVAL;
2577                 goto unlock;
2578         }
2579
2580         user_extra = nr_pages + 1;
2581         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2582
2583         /*
2584          * Increase the limit linearly with more CPUs:
2585          */
2586         user_lock_limit *= num_online_cpus();
2587
2588         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2589
2590         extra = 0;
2591         if (user_locked > user_lock_limit)
2592                 extra = user_locked - user_lock_limit;
2593
2594         lock_limit = rlimit(RLIMIT_MEMLOCK);
2595         lock_limit >>= PAGE_SHIFT;
2596         locked = vma->vm_mm->locked_vm + extra;
2597
2598         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2599                 !capable(CAP_IPC_LOCK)) {
2600                 ret = -EPERM;
2601                 goto unlock;
2602         }
2603
2604         WARN_ON(event->data);
2605
2606         data = perf_mmap_data_alloc(event, nr_pages);
2607         ret = -ENOMEM;
2608         if (!data)
2609                 goto unlock;
2610
2611         ret = 0;
2612         perf_mmap_data_init(event, data);
2613
2614         atomic_set(&event->mmap_count, 1);
2615         atomic_long_add(user_extra, &user->locked_vm);
2616         vma->vm_mm->locked_vm += extra;
2617         event->data->nr_locked = extra;
2618         if (vma->vm_flags & VM_WRITE)
2619                 event->data->writable = 1;
2620
2621 unlock:
2622         mutex_unlock(&event->mmap_mutex);
2623
2624         vma->vm_flags |= VM_RESERVED;
2625         vma->vm_ops = &perf_mmap_vmops;
2626
2627         return ret;
2628 }
2629
2630 static int perf_fasync(int fd, struct file *filp, int on)
2631 {
2632         struct inode *inode = filp->f_path.dentry->d_inode;
2633         struct perf_event *event = filp->private_data;
2634         int retval;
2635
2636         mutex_lock(&inode->i_mutex);
2637         retval = fasync_helper(fd, filp, on, &event->fasync);
2638         mutex_unlock(&inode->i_mutex);
2639
2640         if (retval < 0)
2641                 return retval;
2642
2643         return 0;
2644 }
2645
/*
 * File operations of a perf event file descriptor. The same handler is
 * used for native and compat ioctls.
 */
static const struct file_operations perf_fops = {
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
        .unlocked_ioctl         = perf_ioctl,
        .compat_ioctl           = perf_ioctl,
        .mmap                   = perf_mmap,
        .fasync                 = perf_fasync,
};
2655
2656 /*
2657  * Perf event wakeup
2658  *
2659  * If there's data, ensure we set the poll() state and publish everything
2660  * to user-space before waking everybody up.
2661  */
2662
2663 void perf_event_wakeup(struct perf_event *event)
2664 {
2665         wake_up_all(&event->waitq);
2666
2667         if (event->pending_kill) {
2668                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2669                 event->pending_kill = 0;
2670         }
2671 }
2672
2673 /*
2674  * Pending wakeups
2675  *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2677  *
2678  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2679  * single linked list and use cmpxchg() to add entries lockless.
2680  */
2681
/*
 * Pending-entry callback for events (queued via perf_pending_queue()):
 * performs the deferred disable and/or wakeup that was requested by
 * setting event->pending_disable / event->pending_wakeup.
 */
static void perf_pending_event(struct perf_pending_entry *entry)
{
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);

        /* Each flag is cleared before the deferred action is carried out. */
        if (event->pending_disable) {
                event->pending_disable = 0;
                __perf_event_disable(event);
        }

        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
}
2697
/*
 * Sentinel terminating a pending list. A NULL ->next is reserved to mean
 * "entry is not queued" (see perf_pending_queue()), so the end of the
 * list needs a distinct non-NULL marker.
 */
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)

/* Per-CPU head of the singly linked list of pending entries. */
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
        PENDING_TAIL,
};
2703
/*
 * Queue @entry on the current CPU's pending list, to have @func called
 * on it by __perf_pending_run(). Uses only cmpxchg(), no locks.
 *
 * An entry that is already queued (->next != NULL) is left untouched,
 * including its previously registered callback.
 */
static void perf_pending_queue(struct perf_pending_entry *entry,
                               void (*func)(struct perf_pending_entry *))
{
        struct perf_pending_entry **head;

        /* Claim the entry: NULL -> PENDING_TAIL. Failure means it is queued. */
        if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
                return;

        entry->func = func;

        head = &get_cpu_var(perf_pending_head);

        /* Push onto the front of the list, retrying if the head changed. */
        do {
                entry->next = *head;
        } while (cmpxchg(head, entry->next, entry) != entry->next);

        /*
         * NOTE(review): set_perf_event_pending() is defined outside this
         * file view; presumably it arranges for perf_event_do_pending()
         * to be run on this CPU -- confirm.
         */
        set_perf_event_pending();

        put_cpu_var(perf_pending_head);
}
2724
/*
 * Run all entries queued on the current CPU's pending list.
 *
 * The whole list is detached atomically with xchg(), then each entry is
 * marked as unqueued (->next = NULL) before its callback is invoked.
 *
 * Returns the number of callbacks that were run.
 */
static int __perf_pending_run(void)
{
        struct perf_pending_entry *list;
        int nr = 0;

        list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
        while (list != PENDING_TAIL) {
                void (*func)(struct perf_pending_entry *);
                struct perf_pending_entry *entry = list;

                /* Advance first: ->next is about to be reset below. */
                list = list->next;

                func = entry->func;
                entry->next = NULL;
                /*
                 * Ensure we observe the unqueue before we issue the wakeup,
                 * so that we won't be waiting forever.
                 * -- see perf_not_pending().
                 */
                smp_wmb();

                func(entry);
                nr++;
        }

        return nr;
}
2752
/*
 * Wait condition for perf_pending_sync(): returns non-zero once
 * @event's pending entry is no longer queued (pending.next == NULL).
 */
static inline int perf_not_pending(struct perf_event *event)
{
        /*
         * If we flush on whatever cpu we run, there is a chance we don't
         * need to wait.
         */
        get_cpu();
        __perf_pending_run();
        put_cpu();

        /*
         * Ensure we see the proper queue state before going to sleep
         * so that we do not miss the wakeup. -- pairs with the smp_wmb()
         * in __perf_pending_run().
         */
        smp_rmb();
        return event->pending.next == NULL;
}
2770
/*
 * Sleep on the event's wait queue until its pending entry is no longer
 * queued, as reported by perf_not_pending().
 */
static void perf_pending_sync(struct perf_event *event)
{
        wait_event(event->waitq, perf_not_pending(event));
}
2775
/*
 * Non-static entry point that runs the pending entries queued on the
 * current CPU. The count returned by __perf_pending_run() is ignored.
 */
void perf_event_do_pending(void)
{
        __perf_pending_run();
}
2780
2781 /*
2782  * Callchain support -- arch specific
2783  */
2784
/*
 * Default (weak) implementation for architectures that do not provide
 * callchain support of their own: no callchain is available.
 */
__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
        return NULL;
}
2789
2790 /*
2791  * Output
2792  */
2793 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2794                               unsigned long offset, unsigned long head)
2795 {
2796         unsigned long mask;
2797
2798         if (!data->writable)
2799                 return true;
2800
2801         mask = perf_data_size(data) - 1;
2802
2803         offset = (offset - tail) & mask;
2804         head   = (head   - tail) & mask;
2805
2806         if ((int)(head - offset) < 0)
2807                 return false;
2808
2809         return true;
2810 }
2811
2812 static void perf_output_wakeup(struct perf_output_handle *handle)
2813 {
2814         atomic_set(&handle->data->poll, POLL_IN);
2815
2816         if (handle->nmi) {
2817                 handle->event->pending_wakeup = 1;
2818                 perf_pending_queue(&handle->event->pending,
2819                                    perf_pending_event);
2820         } else
2821                 perf_event_wakeup(handle->event);
2822 }
2823
/*
 * Curious locking construct.
 *
 * We need to ensure a later event doesn't publish a head when a former
 * event isn't done writing. However, since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event completes.
 */
2837 static void perf_output_lock(struct perf_output_handle *handle)
2838 {
2839         struct perf_mmap_data *data = handle->data;
2840         int cur, cpu = get_cpu();
2841
2842         handle->locked = 0;
2843
2844         for (;;) {
2845                 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2846                 if (cur == -1) {
2847                         handle->locked = 1;
2848                         break;
2849                 }
2850                 if (cur == cpu)
2851                         break;
2852
2853                 cpu_relax();
2854         }
2855 }
2856
2857 static void perf_output_unlock(struct perf_output_handle *handle)
2858 {
2859         struct perf_mmap_data *data = handle->data;
2860         unsigned long head;
2861         int cpu;
2862
2863         data->done_head = data->head;
2864
2865         if (!handle->locked)
2866                 goto out;
2867
2868 again:
2869         /*
2870          * The xchg implies a full barrier that ensures all writes are done
2871          * before we publish the new head, matched by a rmb() in userspace when
2872          * reading this position.
2873          */
2874         while ((head = atomic_long_xchg(&data->done_head, 0)))
2875                 data->user_page->data_head = head;
2876
2877         /*
2878          * NMI can happen here, which means we can miss a done_head update.
2879          */
2880
2881         cpu = atomic_xchg(&data->lock, -1);
2882         WARN_ON_ONCE(cpu != smp_processor_id());
2883
2884         /*
2885          * Therefore we have to validate we did not indeed do so.
2886          */
2887         if (unlikely(atomic_long_read(&data->done_head))) {
2888                 /*
2889                  * Since we had it locked, we can lock it again.
2890                  */
2891                 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2892                         cpu_relax();
2893
2894                 goto again;
2895         }
2896
2897         if (atomic_xchg(&data->wakeup, 0))
2898                 perf_output_wakeup(handle);
2899 out:
2900         put_cpu();
2901 }
2902
2903 void perf_output_copy(struct perf_output_handle *handle,
2904                       const void *buf, unsigned int len)
2905 {
2906         unsigned int pages_mask;
2907         unsigned long offset;
2908         unsigned int size;
2909         void **pages;
2910
2911         offset          = handle->offset;
2912         pages_mask      = handle->data->nr_pages - 1;
2913         pages           = handle->data->data_pages;
2914
2915         do {
2916                 unsigned long page_offset;
2917                 unsigned long page_size;
2918                 int nr;
2919
2920                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
2921                 page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
2922                 page_offset = offset & (page_size - 1);
2923                 size        = min_t(unsigned int, page_size - page_offset, len);
2924
2925                 memcpy(pages[nr] + page_offset, buf, size);
2926
2927                 len         -= size;
2928                 buf         += size;
2929                 offset      += size;
2930         } while (len);
2931
2932         handle->offset = offset;
2933
2934         /*
2935          * Check we didn't copy past our reservation window, taking the
2936          * possible unsigned int wrap into account.
2937          */
2938         WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2939 }
2940
2941 int perf_output_begin(struct perf_output_handle *handle,
2942                       struct perf_event *event, unsigned int size,
2943                       int nmi, int sample)
2944 {
2945         struct perf_event *output_event;
2946         struct perf_mmap_data *data;
2947         unsigned long tail, offset, head;
2948         int have_lost;
2949         struct {
2950                 struct perf_event_header header;
2951                 u64                      id;
2952                 u64                      lost;
2953         } lost_event;
2954
2955         rcu_read_lock();
2956         /*
2957          * For inherited events we send all the output towards the parent.
2958          */
2959         if (event->parent)
2960                 event = event->parent;
2961
2962         output_event = rcu_dereference(event->output);
2963         if (output_event)
2964                 event = output_event;
2965
2966         data = rcu_dereference(event->data);
2967         if (!data)
2968                 goto out;
2969
2970         handle->data    = data;
2971         handle->event   = event;
2972         handle->nmi     = nmi;
2973         handle->sample  = sample;
2974
2975         if (!data->nr_pages)
2976                 goto fail;
2977
2978         have_lost = atomic_read(&data->lost);
2979         if (have_lost)
2980                 size += sizeof(lost_event);
2981
2982         perf_output_lock(handle);
2983
2984         do {
2985                 /*
2986                  * Userspace could choose to issue a mb() before updating the
2987                  * tail pointer. So that all reads will be completed before the
2988                  * write is issued.
2989                  */
2990                 tail = ACCESS_ONCE(data->user_page->data_tail);
2991                 smp_rmb();
2992                 offset = head = atomic_long_read(&data->head);
2993                 head += size;
2994                 if (unlikely(!perf_output_space(data, tail, offset, head)))
2995                         goto fail;
2996         } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2997
2998         handle->offset  = offset;
2999         handle->head    = head;
3000
3001         if (head - tail > data->watermark)
3002                 atomic_set(&data->wakeup, 1);
3003
3004         if (have_lost) {
3005                 lost_event.header.type = PERF_RECORD_LOST;
3006                 lost_event.header.misc = 0;
3007                 lost_event.header.size = sizeof(lost_event);
3008                 lost_event.id          = event->id;
3009                 lost_event.lost        = atomic_xchg(&data->lost, 0);
3010
3011                 perf_output_put(handle, lost_event);
3012         }
3013
3014         return 0;
3015
3016 fail:
3017         atomic_inc(&data->lost);
3018         perf_output_unlock(handle);
3019 out:
3020         rcu_read_unlock();
3021
3022         return -ENOSPC;
3023 }
3024
3025 void perf_output_end(struct perf_output_handle *handle)
3026 {
3027         struct perf_event *event = handle->event;
3028         struct perf_mmap_data *data = handle->data;
3029
3030         int wakeup_events = event->attr.wakeup_events;
3031
3032         if (handle->sample && wakeup_events) {
3033                 int events = atomic_inc_return(&data->events);
3034                 if (events >= wakeup_events) {
3035                         atomic_sub(wakeup_events, &data->events);
3036                         atomic_set(&data->wakeup, 1);
3037                 }
3038         }
3039
3040         perf_output_unlock(handle);
3041         rcu_read_unlock();
3042 }
3043
3044 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3045 {
3046         /*
3047          * only top level events have the pid namespace they were created in
3048          */
3049         if (event->parent)
3050                 event = event->parent;
3051
3052         return task_tgid_nr_ns(p, event->ns);
3053 }
3054
3055 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3056 {
3057         /*
3058          * only top level events have the pid namespace they were created in
3059          */
3060         if (event->parent)
3061                 event = event->parent;
3062
3063         return task_pid_nr_ns(p, event->ns);
3064 }
3065
3066 static void perf_output_read_one(struct perf_output_handle *handle,
3067                                  struct perf_event *event)
3068 {
3069         u64 read_format = event->attr.read_format;
3070         u64 values[4];
3071         int n = 0;
3072
3073         values[n++] = atomic64_read(&event->count);
3074         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3075                 values[n++] = event->total_time_enabled +
3076                         atomic64_read(&event->child_total_time_enabled);
3077         }
3078         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3079                 values[n++] = event->total_time_running +
3080                         atomic64_read(&event->child_total_time_running);
3081         }
3082         if (read_format & PERF_FORMAT_ID)
3083                 values[n++] = primary_event_id(event);
3084
3085         perf_output_copy(handle, values, n * sizeof(u64));
3086 }
3087
3088 /*
3089  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3090  */
3091 static void perf_output_read_group(struct perf_output_handle *handle,
3092                             struct perf_event *event)
3093 {
3094         struct perf_event *leader = event->group_leader, *sub;
3095         u64 read_format = event->attr.read_format;
3096         u64 values[5];
3097         int n = 0;
3098
3099         values[n++] = 1 + leader->nr_siblings;
3100
3101         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3102                 values[n++] = leader->total_time_enabled;
3103
3104         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3105                 values[n++] = leader->total_time_running;
3106
3107         if (leader != event)
3108                 leader->pmu->read(leader);
3109
3110         values[n++] = atomic64_read(&leader->count);
3111         if (read_format & PERF_FORMAT_ID)
3112                 values[n++] = primary_event_id(leader);
3113
3114         perf_output_copy(handle, values, n * sizeof(u64));
3115
3116         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3117                 n = 0;
3118
3119                 if (sub != event)
3120                         sub->pmu->read(sub);
3121
3122                 values[n++] = atomic64_read(&sub->count);
3123                 if (read_format & PERF_FORMAT_ID)
3124                         values[n++] = primary_event_id(sub);
3125
3126                 perf_output_copy(handle, values, n * sizeof(u64));
3127         }
3128 }
3129
3130 static void perf_output_read(struct perf_output_handle *handle,
3131                              struct perf_event *event)
3132 {
3133         if (event->attr.read_format & PERF_FORMAT_GROUP)
3134                 perf_output_read_group(handle, event);
3135         else
3136                 perf_output_read_one(handle, event);
3137 }
3138
3139 void perf_output_sample(struct perf_output_handle *handle,
3140                         struct perf_event_header *header,
3141                         struct perf_sample_data *data,
3142                         struct perf_event *event)
3143 {
3144         u64 sample_type = data->type;
3145
3146         perf_output_put(handle, *header);
3147
3148         if (sample_type & PERF_SAMPLE_IP)
3149                 perf_output_put(handle, data->ip);
3150
3151         if (sample_type & PERF_SAMPLE_TID)
3152                 perf_output_put(handle, data->tid_entry);
3153
3154         if (sample_type & PERF_SAMPLE_TIME)
3155                 perf_output_put(handle, data->time);
3156
3157         if (sample_type & PERF_SAMPLE_ADDR)
3158                 perf_output_put(handle, data->addr);
3159
3160         if (sample_type & PERF_SAMPLE_ID)
3161                 perf_output_put(handle, data->id);
3162
3163         if (sample_type & PERF_SAMPLE_STREAM_ID)
3164                 perf_output_put(handle, data->stream_id);
3165
3166         if (sample_type & PERF_SAMPLE_CPU)
3167                 perf_output_put(handle, data->cpu_entry);
3168
3169         if (sample_type & PERF_SAMPLE_PERIOD)
3170                 perf_output_put(handle, data->period);
3171
3172         if (sample_type & PERF_SAMPLE_READ)
3173                 perf_output_read(handle, event);
3174
3175         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3176                 if (data->callchain) {
3177                         int size = 1;
3178
3179                         if (data->callchain)
3180                                 size += data->callchain->nr;
3181
3182                         size *= sizeof(u64);
3183
3184                         perf_output_copy(handle, data->callchain, size);
3185                 } else {
3186                         u64 nr = 0;
3187                         perf_output_put(handle, nr);
3188                 }
3189         }
3190
3191         if (sample_type & PERF_SAMPLE_RAW) {
3192                 if (data->raw) {
3193                         perf_output_put(handle, data->raw->size);
3194                         perf_output_copy(handle, data->raw->data,
3195                                          data->raw->size);
3196                 } else {
3197                         struct {
3198                                 u32     size;
3199                                 u32     data;
3200                         } raw = {
3201                                 .size = sizeof(u32),
3202                                 .data = 0,
3203                         };
3204                         perf_output_put(handle, raw);
3205                 }
3206         }
3207 }
3208
3209 void perf_prepare_sample(struct perf_event_header *header,
3210                          struct perf_sample_data *data,
3211                          struct perf_event *event,
3212                          struct pt_regs *regs)
3213 {
3214         u64 sample_type = event->attr.sample_type;
3215
3216         data->type = sample_type;
3217
3218         header->type = PERF_RECORD_SAMPLE;
3219         header->size = sizeof(*header);
3220
3221         header->misc = 0;
3222         header->misc |= perf_misc_flags(regs);
3223
3224         if (sample_type & PERF_SAMPLE_IP) {
3225                 data->ip = perf_instruction_pointer(regs);
3226
3227                 header->size += sizeof(data->ip);
3228         }
3229
3230         if (sample_type & PERF_SAMPLE_TID) {
3231                 /* namespace issues */
3232                 data->tid_entry.pid = perf_event_pid(event, current);
3233                 data->tid_entry.tid = perf_event_tid(event, current);
3234
3235                 header->size += sizeof(data->tid_entry);
3236         }
3237
3238         if (sample_type & PERF_SAMPLE_TIME) {
3239                 data->time = perf_clock();
3240
3241                 header->size += sizeof(data->time);
3242         }
3243
3244         if (sample_type & PERF_SAMPLE_ADDR)
3245                 header->size += sizeof(data->addr);
3246
3247         if (sample_type & PERF_SAMPLE_ID) {
3248                 data->id = primary_event_id(event);
3249
3250                 header->size += sizeof(data->id);
3251         }
3252
3253         if (sample_type & PERF_SAMPLE_STREAM_ID) {
3254                 data->stream_id = event->id;
3255
3256                 header->size += sizeof(data->stream_id);
3257         }
3258
3259         if (sample_type & PERF_SAMPLE_CPU) {
3260                 data->cpu_entry.cpu             = raw_smp_processor_id();
3261                 data->cpu_entry.reserved        = 0;
3262
3263                 header->size += sizeof(data->cpu_entry);
3264         }
3265
3266         if (sample_type & PERF_SAMPLE_PERIOD)
3267                 header->size += sizeof(data->period);
3268
3269         if (sample_type & PERF_SAMPLE_READ)
3270                 header->size += perf_event_read_size(event);
3271
3272         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3273                 int size = 1;
3274
3275                 data->callchain = perf_callchain(regs);
3276
3277                 if (data->callchain)
3278                         size += data->callchain->nr;
3279
3280                 header->size += size * sizeof(u64);
3281         }
3282
3283         if (sample_type & PERF_SAMPLE_RAW) {
3284                 int size = sizeof(u32);
3285
3286                 if (data->raw)
3287                         size += data->raw->size;
3288                 else
3289                         size += sizeof(u32);
3290
3291                 WARN_ON_ONCE(size & (sizeof(u64)-1));
3292                 header->size += size;
3293         }
3294 }
3295
3296 static void perf_event_output(struct perf_event *event, int nmi,
3297                                 struct perf_sample_data *data,
3298                                 struct pt_regs *regs)
3299 {
3300         struct perf_output_handle handle;
3301         struct perf_event_header header;
3302
3303         perf_prepare_sample(&header, data, event, regs);
3304
3305         if (perf_output_begin(&handle, event, header.size, nmi, 1))
3306                 return;
3307
3308         perf_output_sample(&handle, &header, data, event);
3309
3310         perf_output_end(&handle);
3311 }
3312
3313 /*
3314  * read event_id
3315  */
3316
/*
 * Fixed leading part of the record written by perf_event_read_event()
 * (header.type PERF_RECORD_READ); the event's read values follow it in
 * the buffer.
 */
struct perf_read_event {
	struct perf_event_header	header;

	u32				pid;	/* from perf_event_pid() */
	u32				tid;	/* from perf_event_tid() */
};
3323
3324 static void
3325 perf_event_read_event(struct perf_event *event,
3326                         struct task_struct *task)
3327 {
3328         struct perf_output_handle handle;
3329         struct perf_read_event read_event = {
3330                 .header = {
3331                         .type = PERF_RECORD_READ,
3332                         .misc = 0,
3333                         .size = sizeof(read_event) + perf_event_read_size(event),
3334                 },
3335                 .pid = perf_event_pid(event, task),
3336                 .tid = perf_event_tid(event, task),
3337         };
3338         int ret;
3339
3340         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3341         if (ret)
3342                 return;
3343
3344         perf_output_put(&handle, read_event);
3345         perf_output_read(&handle, event);
3346
3347         perf_output_end(&handle);
3348 }
3349
3350 /*
3351  * task tracking -- fork/exit
3352  *
3353  * enabled by: attr.comm | attr.mmap | attr.task
3354  */
3355
/*
 * Working state for one fork/exit notification. Only the embedded
 * event_id is copied to the output buffer (see perf_event_task_output());
 * task and task_ctx are bookkeeping for the delivery code.
 */
struct perf_task_event {
	struct task_struct		*task;		/* task the record is about */
	struct perf_event_context	*task_ctx;	/* context to deliver to; NULL selects current's */

	struct {
		struct perf_event_header	header;

		/* pid/tid are derived from task, ppid/ptid from current */
		u32				pid;
		u32				ppid;
		u32				tid;
		u32				ptid;
		u64				time;	/* perf_clock() at creation */
	} event_id;
};
3370
3371 static void perf_event_task_output(struct perf_event *event,
3372                                      struct perf_task_event *task_event)
3373 {
3374         struct perf_output_handle handle;
3375         int size;
3376         struct task_struct *task = task_event->task;
3377         int ret;
3378
3379         size  = task_event->event_id.header.size;
3380         ret = perf_output_begin(&handle, event, size, 0, 0);
3381
3382         if (ret)
3383                 return;
3384
3385         task_event->event_id.pid = perf_event_pid(event, task);
3386         task_event->event_id.ppid = perf_event_pid(event, current);
3387
3388         task_event->event_id.tid = perf_event_tid(event, task);
3389         task_event->event_id.ptid = perf_event_tid(event, current);
3390
3391         perf_output_put(&handle, task_event->event_id);
3392
3393         perf_output_end(&handle);
3394 }
3395
3396 static int perf_event_task_match(struct perf_event *event)
3397 {
3398         if (event->state < PERF_EVENT_STATE_INACTIVE)
3399                 return 0;
3400
3401         if (event->cpu != -1 && event->cpu != smp_processor_id())
3402                 return 0;
3403
3404         if (event->attr.comm || event->attr.mmap || event->attr.task)
3405                 return 1;
3406
3407         return 0;
3408 }
3409
3410 static void perf_event_task_ctx(struct perf_event_context *ctx,
3411                                   struct perf_task_event *task_event)
3412 {
3413         struct perf_event *event;
3414
3415         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3416                 if (perf_event_task_match(event))
3417                         perf_event_task_output(event, task_event);
3418         }
3419 }
3420
3421 static void perf_event_task_event(struct perf_task_event *task_event)
3422 {
3423         struct perf_cpu_context *cpuctx;
3424         struct perf_event_context *ctx = task_event->task_ctx;
3425
3426         rcu_read_lock();
3427         cpuctx = &get_cpu_var(perf_cpu_context);
3428         perf_event_task_ctx(&cpuctx->ctx, task_event);
3429         if (!ctx)
3430                 ctx = rcu_dereference(current->perf_event_ctxp);
3431         if (ctx)
3432                 perf_event_task_ctx(ctx, task_event);
3433         put_cpu_var(perf_cpu_context);
3434         rcu_read_unlock();
3435 }
3436
3437 static void perf_event_task(struct task_struct *task,
3438                               struct perf_event_context *task_ctx,
3439                               int new)
3440 {
3441         struct perf_task_event task_event;
3442
3443         if (!atomic_read(&nr_comm_events) &&
3444             !atomic_read(&nr_mmap_events) &&
3445             !atomic_read(&nr_task_events))
3446                 return;
3447
3448         task_event = (struct perf_task_event){
3449                 .task     = task,
3450                 .task_ctx = task_ctx,
3451                 .event_id    = {
3452                         .header = {
3453                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3454                                 .misc = 0,
3455                                 .size = sizeof(task_event.event_id),
3456                         },
3457                         /* .pid  */
3458                         /* .ppid */
3459                         /* .tid  */
3460                         /* .ptid */
3461                         .time = perf_clock(),
3462                 },
3463         };
3464
3465         perf_event_task_event(&task_event);
3466 }
3467
3468 void perf_event_fork(struct task_struct *task)
3469 {
3470         perf_event_task(task, NULL, 1);
3471 }
3472
3473 /*
3474  * comm tracking
3475  */
3476
/*
 * Working state for one comm notification. The output side writes
 * event_id followed by comm_size bytes taken from comm (see
 * perf_event_comm_output()).
 */
struct perf_comm_event {
	struct task_struct	*task;		/* task whose comm is reported */
	char			*comm;		/* zero-padded copy of the name */
	int			comm_size;	/* bytes of comm to emit, u64 aligned */

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
	} event_id;
};
3489
3490 static void perf_event_comm_output(struct perf_event *event,
3491                                      struct perf_comm_event *comm_event)
3492 {
3493         struct perf_output_handle handle;
3494         int size = comm_event->event_id.header.size;
3495         int ret = perf_output_begin(&handle, event, size, 0, 0);
3496
3497         if (ret)
3498                 return;
3499
3500         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3501         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3502
3503         perf_output_put(&handle, comm_event->event_id);
3504         perf_output_copy(&handle, comm_event->comm,
3505                                    comm_event->comm_size);
3506         perf_output_end(&handle);
3507 }
3508
3509 static int perf_event_comm_match(struct perf_event *event)
3510 {
3511         if (event->state < PERF_EVENT_STATE_INACTIVE)
3512                 return 0;
3513
3514         if (event->cpu != -1 && event->cpu != smp_processor_id())
3515                 return 0;
3516
3517         if (event->attr.comm)
3518                 return 1;
3519
3520         return 0;
3521 }
3522
3523 static void perf_event_comm_ctx(struct perf_event_context *ctx,
3524                                   struct perf_comm_event *comm_event)
3525 {
3526         struct perf_event *event;
3527
3528         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3529                 if (perf_event_comm_match(event))
3530                         perf_event_comm_output(event, comm_event);
3531         }
3532 }
3533
3534 static void perf_event_comm_event(struct perf_comm_event *comm_event)
3535 {
3536         struct perf_cpu_context *cpuctx;
3537         struct perf_event_context *ctx;
3538         unsigned int size;
3539         char comm[TASK_COMM_LEN];
3540
3541         memset(comm, 0, sizeof(comm));
3542         strlcpy(comm, comm_event->task->comm, sizeof(comm));
3543         size = ALIGN(strlen(comm)+1, sizeof(u64));
3544
3545         comm_event->comm = comm;
3546         comm_event->comm_size = size;
3547
3548         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3549
3550         rcu_read_lock();
3551         cpuctx = &get_cpu_var(perf_cpu_context);
3552         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3553         ctx = rcu_dereference(current->perf_event_ctxp);
3554         if (ctx)
3555                 perf_event_comm_ctx(ctx, comm_event);
3556         put_cpu_var(perf_cpu_context);
3557         rcu_read_unlock();
3558 }
3559
3560 void perf_event_comm(struct task_struct *task)
3561 {
3562         struct perf_comm_event comm_event;
3563
3564         if (task->perf_event_ctxp)
3565                 perf_event_enable_on_exec(task);
3566
3567         if (!atomic_read(&nr_comm_events))
3568                 return;
3569
3570         comm_event = (struct perf_comm_event){
3571                 .task   = task,
3572                 /* .comm      */
3573                 /* .comm_size */
3574                 .event_id  = {
3575                         .header = {
3576                                 .type = PERF_RECORD_COMM,
3577                                 .misc = 0,
3578                                 /* .size */
3579                         },
3580                         /* .pid */
3581                         /* .tid */
3582                 },
3583         };
3584
3585         perf_event_comm_event(&comm_event);
3586 }
3587
3588 /*
3589  * mmap tracking
3590  */
3591
/*
 * Working state for one mmap notification. The output side writes
 * event_id followed by file_size bytes taken from file_name (see
 * perf_event_mmap_output()).
 */
struct perf_mmap_event {
	struct vm_area_struct	*vma;		/* mapping being reported */

	const char		*file_name;	/* resolved name, zero padded */
	int			file_size;	/* bytes of file_name to emit, u64 aligned */

	struct {
		struct perf_event_header	header;

		u32				pid;
		u32				tid;
		u64				start;	/* vma->vm_start */
		u64				len;	/* vm_end - vm_start */
		u64				pgoff;	/* vm_pgoff << PAGE_SHIFT */
	} event_id;
};
3608
3609 static void perf_event_mmap_output(struct perf_event *event,
3610                                      struct perf_mmap_event *mmap_event)
3611 {
3612         struct perf_output_handle handle;
3613         int size = mmap_event->event_id.header.size;
3614         int ret = perf_output_begin(&handle, event, size, 0, 0);
3615
3616         if (ret)
3617                 return;
3618
3619         mmap_event->event_id.pid = perf_event_pid(event, current);
3620         mmap_event->event_id.tid = perf_event_tid(event, current);
3621
3622         perf_output_put(&handle, mmap_event->event_id);
3623         perf_output_copy(&handle, mmap_event->file_name,
3624                                    mmap_event->file_size);
3625         perf_output_end(&handle);
3626 }
3627
3628 static int perf_event_mmap_match(struct perf_event *event,
3629                                    struct perf_mmap_event *mmap_event)
3630 {
3631         if (event->state < PERF_EVENT_STATE_INACTIVE)
3632                 return 0;
3633
3634         if (event->cpu != -1 && event->cpu != smp_processor_id())
3635                 return 0;
3636
3637         if (event->attr.mmap)
3638                 return 1;
3639
3640         return 0;
3641 }
3642
3643 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3644                                   struct perf_mmap_event *mmap_event)
3645 {
3646         struct perf_event *event;
3647
3648         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3649                 if (perf_event_mmap_match(event, mmap_event))
3650                         perf_event_mmap_output(event, mmap_event);
3651         }
3652 }
3653
3654 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3655 {
3656         struct perf_cpu_context *cpuctx;
3657         struct perf_event_context *ctx;
3658         struct vm_area_struct *vma = mmap_event->vma;
3659         struct file *file = vma->vm_file;
3660         unsigned int size;
3661         char tmp[16];
3662         char *buf = NULL;
3663         const char *name;
3664
3665         memset(tmp, 0, sizeof(tmp));
3666
3667         if (file) {
3668                 /*
3669                  * d_path works from the end of the buffer backwards, so we
3670                  * need to add enough zero bytes after the string to handle
3671                  * the 64bit alignment we do later.
3672                  */
3673                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3674                 if (!buf) {
3675                         name = strncpy(tmp, "//enomem", sizeof(tmp));
3676                         goto got_name;
3677                 }
3678                 name = d_path(&file->f_path, buf, PATH_MAX);
3679                 if (IS_ERR(name)) {
3680                         name = strncpy(tmp, "//toolong", sizeof(tmp));
3681                         goto got_name;
3682                 }
3683         } else {
3684                 if (arch_vma_name(mmap_event->vma)) {
3685                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3686                                        sizeof(tmp));
3687                         goto got_name;
3688                 }
3689
3690                 if (!vma->vm_mm) {
3691                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
3692                         goto got_name;
3693                 }
3694
3695                 name = strncpy(tmp, "//anon", sizeof(tmp));
3696                 goto got_name;
3697         }
3698
3699 got_name:
3700         size = ALIGN(strlen(name)+1, sizeof(u64));
3701
3702         mmap_event->file_name = name;
3703         mmap_event->file_size = size;
3704
3705         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3706
3707         rcu_read_lock();
3708         cpuctx = &get_cpu_var(perf_cpu_context);
3709         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3710         ctx = rcu_dereference(current->perf_event_ctxp);
3711         if (ctx)
3712                 perf_event_mmap_ctx(ctx, mmap_event);
3713         put_cpu_var(perf_cpu_context);
3714         rcu_read_unlock();
3715
3716         kfree(buf);
3717 }
3718
3719 void __perf_event_mmap(struct vm_area_struct *vma)
3720 {
3721         struct perf_mmap_event mmap_event;
3722
3723         if (!atomic_read(&nr_mmap_events))
3724                 return;
3725
3726         mmap_event = (struct perf_mmap_event){
3727                 .vma    = vma,
3728                 /* .file_name */
3729                 /* .file_size */
3730                 .event_id  = {
3731                         .header = {
3732                                 .type = PERF_RECORD_MMAP,
3733                                 .misc = 0,
3734                                 /* .size */
3735                         },
3736                         /* .pid */
3737                         /* .tid */
3738                         .start  = vma->vm_start,
3739                         .len    = vma->vm_end - vma->vm_start,
3740                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
3741                 },
3742         };
3743
3744         perf_event_mmap_event(&mmap_event);
3745 }
3746
3747 /*
3748  * IRQ throttle logging
3749  */
3750
3751 static void perf_log_throttle(struct perf_event *event, int enable)
3752 {
3753         struct perf_output_handle handle;
3754         int ret;
3755
3756         struct {
3757                 struct perf_event_header        header;
3758                 u64                             time;
3759                 u64                             id;
3760                 u64                             stream_id;
3761         } throttle_event = {
3762                 .header = {
3763                         .type = PERF_RECORD_THROTTLE,
3764                         .misc = 0,
3765                         .size = sizeof(throttle_event),
3766                 },
3767                 .time           = perf_clock(),
3768                 .id             = primary_event_id(event),
3769                 .stream_id      = event->id,
3770         };
3771
3772         if (enable)
3773                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3774
3775         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3776         if (ret)
3777                 return;
3778
3779         perf_output_put(&handle, throttle_event);
3780         perf_output_end(&handle);
3781 }
3782
3783 /*
3784  * Generic event overflow handling, sampling.
3785  */
3786
/*
 * Common overflow handling for a single event.
 *
 * @event:    the event that overflowed
 * @nmi:      when non-zero, an event that hits its limit is disabled
 *            through the pending queue instead of directly; also passed
 *            on to the handler / output routine
 * @throttle: when non-zero (and the pmu has an unthrottle callback),
 *            apply interrupt-rate throttling
 * @data:     sample data, passed through
 * @regs:     register state, passed through
 *
 * Returns 1 if the event was throttled or reached its event_limit,
 * 0 otherwise.
 */
static int __perf_event_overflow(struct perf_event *event, int nmi,
                                   int throttle, struct perf_sample_data *data,
                                   struct pt_regs *regs)
{
        /* Snapshot of the limit; 0 means no limit is armed. */
        int events = atomic_read(&event->event_limit);
        struct hw_perf_event *hwc = &event->hw;
        int ret = 0;

        /* Throttling only applies to pmus that provide ->unthrottle. */
        throttle = (throttle && event->pmu->unthrottle != NULL);

        if (!throttle) {
                hwc->interrupts++;
        } else {
                if (hwc->interrupts != MAX_INTERRUPTS) {
                        hwc->interrupts++;
                        /*
                         * Too many interrupts relative to the sample-rate
                         * sysctl: mark the event throttled and log it.
                         */
                        if (HZ * hwc->interrupts >
                                        (u64)sysctl_perf_event_sample_rate) {
                                hwc->interrupts = MAX_INTERRUPTS;
                                perf_log_throttle(event, 0);
                                ret = 1;
                        }
                } else {
                        /*
                         * Keep re-disabling events even though on the previous
                         * pass we disabled it - just in case we raced with a
                         * sched-in and the event got enabled again:
                         */
                        ret = 1;
                }
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 delta = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                /* Only re-tune the period for intervals under two ticks. */
                if (delta > 0 && delta < 2*TICK_NSEC)
                        perf_adjust_period(event, delta, hwc->last_period);
        }

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        if (events && atomic_dec_and_test(&event->event_limit)) {
                /* Limit reached: report POLL_HUP and disable the event. */
                ret = 1;
                event->pending_kill = POLL_HUP;
                if (nmi) {
                        event->pending_disable = 1;
                        perf_pending_queue(&event->pending,
                                           perf_pending_event);
                } else
                        perf_event_disable(event);
        }

        /* A custom handler, when set, replaces the default output. */
        if (event->overflow_handler)
                event->overflow_handler(event, nmi, data, regs);
        else
                perf_event_output(event, nmi, data, regs);

        return ret;
}
3852
/*
 * Overflow entry point that always requests throttling; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        const int throttle = 1;

        return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3859
3860 /*
3861  * Generic software event infrastructure
3862  */
3863
3864 /*
3865  * We directly increment event->count and keep a second value in
3866  * event->hw.period_left to count intervals. This period event
3867  * is kept in the range [-sample_period, 0] so that we can use the
3868  * sign as trigger.
3869  */
3870
3871 static u64 perf_swevent_set_period(struct perf_event *event)
3872 {
3873         struct hw_perf_event *hwc = &event->hw;
3874         u64 period = hwc->last_period;
3875         u64 nr, offset;
3876         s64 old, val;
3877
3878         hwc->last_period = hwc->sample_period;
3879
3880 again:
3881         old = val = atomic64_read(&hwc->period_left);
3882         if (val < 0)
3883                 return 0;
3884
3885         nr = div64_u64(period + val, period);
3886         offset = nr * period;
3887         val -= offset;
3888         if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3889                 goto again;
3890
3891         return nr;
3892 }
3893
3894 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3895                                     int nmi, struct perf_sample_data *data,
3896                                     struct pt_regs *regs)
3897 {
3898         struct hw_perf_event *hwc = &event->hw;
3899         int throttle = 0;
3900
3901         data->period = event->hw.last_period;
3902         if (!overflow)
3903                 overflow = perf_swevent_set_period(event);
3904
3905         if (hwc->interrupts == MAX_INTERRUPTS)
3906                 return;
3907
3908         for (; overflow; overflow--) {
3909                 if (__perf_event_overflow(event, nmi, throttle,
3910                                             data, regs)) {
3911                         /*
3912                          * We inhibit the overflow from happening when
3913                          * hwc->interrupts == MAX_INTERRUPTS.
3914                          */
3915                         break;
3916                 }
3917                 throttle = 1;
3918         }
3919 }
3920
/*
 * ->unthrottle callback for generic software events (installed in
 * perf_ops_generic). Intentionally empty.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
        /*
         * Nothing to do, we already reset hwc->interrupts.
         */
}
3927
3928 static void perf_swevent_add(struct perf_event *event, u64 nr,
3929                                int nmi, struct perf_sample_data *data,
3930                                struct pt_regs *regs)
3931 {
3932         struct hw_perf_event *hwc = &event->hw;
3933
3934         atomic64_add(nr, &event->count);
3935
3936         if (!regs)
3937                 return;
3938
3939         if (!hwc->sample_period)
3940                 return;
3941
3942         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3943                 return perf_swevent_overflow(event, 1, nmi, data, regs);
3944
3945         if (atomic64_add_negative(nr, &hwc->period_left))
3946                 return;
3947
3948         perf_swevent_overflow(event, 0, nmi, data, regs);
3949 }
3950
3951 static int perf_swevent_is_counting(struct perf_event *event)
3952 {
3953         /*
3954          * The event is active, we're good!
3955          */
3956         if (event->state == PERF_EVENT_STATE_ACTIVE)
3957                 return 1;
3958
3959         /*
3960          * The event is off/error, not counting.
3961          */
3962         if (event->state != PERF_EVENT_STATE_INACTIVE)
3963                 return 0;
3964
3965         /*
3966          * The event is inactive, if the context is active
3967          * we're part of a group that didn't make it on the 'pmu',
3968          * not counting.
3969          */
3970         if (event->ctx->is_active)
3971                 return 0;
3972
3973         /*
3974          * We're inactive and the context is too, this means the
3975          * task is scheduled out, we're counting events that happen
3976          * to us, like migration events.
3977          */
3978         return 1;
3979 }
3980
3981 static int perf_tp_event_match(struct perf_event *event,
3982                                 struct perf_sample_data *data);
3983
3984 static int perf_exclude_event(struct perf_event *event,
3985                               struct pt_regs *regs)
3986 {
3987         if (regs) {
3988                 if (event->attr.exclude_user && user_mode(regs))
3989                         return 1;
3990
3991                 if (event->attr.exclude_kernel && !user_mode(regs))
3992                         return 1;
3993         }
3994
3995         return 0;
3996 }
3997
3998 static int perf_swevent_match(struct perf_event *event,
3999                                 enum perf_type_id type,
4000                                 u32 event_id,
4001                                 struct perf_sample_data *data,
4002                                 struct pt_regs *regs)
4003 {
4004         if (event->cpu != -1 && event->cpu != smp_processor_id())
4005                 return 0;
4006
4007         if (!perf_swevent_is_counting(event))
4008                 return 0;
4009
4010         if (event->attr.type != type)
4011                 return 0;
4012
4013         if (event->attr.config != event_id)
4014                 return 0;
4015
4016         if (perf_exclude_event(event, regs))
4017                 return 0;
4018
4019         if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4020             !perf_tp_event_match(event, data))
4021                 return 0;
4022
4023         return 1;
4024 }
4025
4026 static void perf_swevent_ctx_event(struct perf_event_context *ctx,
4027                                      enum perf_type_id type,
4028                                      u32 event_id, u64 nr, int nmi,
4029                                      struct perf_sample_data *data,
4030                                      struct pt_regs *regs)
4031 {
4032         struct perf_event *event;
4033
4034         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4035                 if (perf_swevent_match(event, type, event_id, data, regs))
4036                         perf_swevent_add(event, nr, nmi, data, regs);
4037         }
4038 }
4039
4040 int perf_swevent_get_recursion_context(void)
4041 {
4042         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4043         int rctx;
4044
4045         if (in_nmi())
4046                 rctx = 3;
4047         else if (in_irq())
4048                 rctx = 2;
4049         else if (in_softirq())
4050                 rctx = 1;
4051         else
4052                 rctx = 0;
4053
4054         if (cpuctx->recursion[rctx]) {
4055                 put_cpu_var(perf_cpu_context);
4056                 return -1;
4057         }
4058
4059         cpuctx->recursion[rctx]++;
4060         barrier();
4061
4062         return rctx;
4063 }
4064 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4065
/*
 * Release recursion slot @rctx, as returned by a successful
 * perf_swevent_get_recursion_context(), and drop the per-cpu variable
 * reference that call left held.
 */
void perf_swevent_put_recursion_context(int rctx)
{
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        /* Compiler barrier before the slot is marked free again. */
        barrier();
        cpuctx->recursion[rctx]--;
        put_cpu_var(perf_cpu_context);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4074
4075 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4076                                     u64 nr, int nmi,
4077                                     struct perf_sample_data *data,
4078                                     struct pt_regs *regs)
4079 {
4080         struct perf_cpu_context *cpuctx;
4081         struct perf_event_context *ctx;
4082
4083         cpuctx = &__get_cpu_var(perf_cpu_context);
4084         rcu_read_lock();
4085         perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4086                                  nr, nmi, data, regs);
4087         /*
4088          * doesn't really matter which of the child contexts the
4089          * events ends up in.
4090          */
4091         ctx = rcu_dereference(current->perf_event_ctxp);
4092         if (ctx)
4093                 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4094         rcu_read_unlock();
4095 }
4096
4097 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4098                             struct pt_regs *regs, u64 addr)
4099 {
4100         struct perf_sample_data data;
4101         int rctx;
4102
4103         rctx = perf_swevent_get_recursion_context();
4104         if (rctx < 0)
4105                 return;
4106
4107         perf_sample_data_init(&data, addr);
4108
4109         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4110
4111         perf_swevent_put_recursion_context(rctx);
4112 }
4113
/*
 * ->read callback for generic software events. Empty: perf_swevent_add()
 * already adds to event->count directly.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
4117
4118 static int perf_swevent_enable(struct perf_event *event)
4119 {
4120         struct hw_perf_event *hwc = &event->hw;
4121
4122         if (hwc->sample_period) {
4123                 hwc->last_period = hwc->sample_period;
4124                 perf_swevent_set_period(event);
4125         }
4126         return 0;
4127 }
4128
/*
 * ->disable callback for generic software events. Intentionally empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
4132
/*
 * pmu callbacks for generic software events; handed out by
 * sw_perf_event_init() and tp_perf_event_init(). Providing ->unthrottle
 * is what lets __perf_event_overflow() throttle these events.
 */
static const struct pmu perf_ops_generic = {
        .enable         = perf_swevent_enable,
        .disable        = perf_swevent_disable,
        .read           = perf_swevent_read,
        .unthrottle     = perf_swevent_unthrottle,
};
4139
4140 /*
4141  * hrtimer based swevent callback
4142  */
4143
4144 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4145 {
4146         enum hrtimer_restart ret = HRTIMER_RESTART;
4147         struct perf_sample_data data;
4148         struct pt_regs *regs;
4149         struct perf_event *event;
4150         u64 period;
4151
4152         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
4153         event->pmu->read(event);
4154
4155         perf_sample_data_init(&data, 0);
4156         data.period = event->hw.last_period;
4157         regs = get_irq_regs();
4158         /*
4159          * In case we exclude kernel IPs or are somehow not in interrupt
4160          * context, provide the next best thing, the user IP.
4161          */
4162         if ((event->attr.exclude_kernel || !regs) &&
4163                         !event->attr.exclude_user)
4164                 regs = task_pt_regs(current);
4165
4166         if (regs) {
4167                 if (!(event->attr.exclude_idle && current->pid == 0))
4168                         if (perf_event_overflow(event, 0, &data, regs))
4169                                 ret = HRTIMER_NORESTART;
4170         }
4171
4172         period = max_t(u64, 10000, event->hw.sample_period);
4173         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
4174
4175         return ret;
4176 }
4177
4178 static void perf_swevent_start_hrtimer(struct perf_event *event)
4179 {
4180         struct hw_perf_event *hwc = &event->hw;
4181
4182         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4183         hwc->hrtimer.function = perf_swevent_hrtimer;
4184         if (hwc->sample_period) {
4185                 u64 period;
4186
4187                 if (hwc->remaining) {
4188                         if (hwc->remaining < 0)
4189                                 period = 10000;
4190                         else
4191                                 period = hwc->remaining;
4192                         hwc->remaining = 0;
4193                 } else {
4194                         period = max_t(u64, 10000, hwc->sample_period);
4195                 }
4196                 __hrtimer_start_range_ns(&hwc->hrtimer,
4197                                 ns_to_ktime(period), 0,
4198                                 HRTIMER_MODE_REL, 0);
4199         }
4200 }
4201
4202 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4203 {
4204         struct hw_perf_event *hwc = &event->hw;
4205
4206         if (hwc->sample_period) {
4207                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4208                 hwc->remaining = ktime_to_ns(remaining);
4209
4210                 hrtimer_cancel(&hwc->hrtimer);
4211         }
4212 }
4213
4214 /*
4215  * Software event: cpu wall time clock
4216  */
4217
4218 static void cpu_clock_perf_event_update(struct perf_event *event)
4219 {
4220         int cpu = raw_smp_processor_id();
4221         s64 prev;
4222         u64 now;
4223
4224         now = cpu_clock(cpu);
4225         prev = atomic64_xchg(&event->hw.prev_count, now);
4226         atomic64_add(now - prev, &event->count);
4227 }
4228
4229 static int cpu_clock_perf_event_enable(struct perf_event *event)
4230 {
4231         struct hw_perf_event *hwc = &event->hw;
4232         int cpu = raw_smp_processor_id();
4233
4234         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4235         perf_swevent_start_hrtimer(event);
4236
4237         return 0;
4238 }
4239
/*
 * ->disable for the cpu-clock event: stop the hrtimer, then fold the
 * time elapsed so far into the count.
 */
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_perf_event_update(event);
}
4245
/*
 * ->read for the cpu-clock event: bring event->count up to date.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
        cpu_clock_perf_event_update(event);
}
4250
/*
 * pmu callbacks for the cpu-clock software event. sw_perf_event_init()
 * also uses these for a task-clock event whose context has no task.
 */
static const struct pmu perf_ops_cpu_clock = {
        .enable         = cpu_clock_perf_event_enable,
        .disable        = cpu_clock_perf_event_disable,
        .read           = cpu_clock_perf_event_read,
};
4256
4257 /*
4258  * Software event: task time clock
4259  */
4260
4261 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4262 {
4263         u64 prev;
4264         s64 delta;
4265
4266         prev = atomic64_xchg(&event->hw.prev_count, now);
4267         delta = now - prev;
4268         atomic64_add(delta, &event->count);
4269 }
4270
4271 static int task_clock_perf_event_enable(struct perf_event *event)
4272 {
4273         struct hw_perf_event *hwc = &event->hw;
4274         u64 now;
4275
4276         now = event->ctx->time;
4277
4278         atomic64_set(&hwc->prev_count, now);
4279
4280         perf_swevent_start_hrtimer(event);
4281
4282         return 0;
4283 }
4284
/*
 * ->disable for the task-clock event: stop the hrtimer, then update the
 * count using the context time.
 */
static void task_clock_perf_event_disable(struct perf_event *event)
{
        perf_swevent_cancel_hrtimer(event);
        task_clock_perf_event_update(event, event->ctx->time);

}
4291
4292 static void task_clock_perf_event_read(struct perf_event *event)
4293 {
4294         u64 time;
4295
4296         if (!in_nmi()) {
4297                 update_context_time(event->ctx);
4298                 time = event->ctx->time;
4299         } else {
4300                 u64 now = perf_clock();
4301                 u64 delta = now - event->ctx->timestamp;
4302                 time = event->ctx->time + delta;
4303         }
4304
4305         task_clock_perf_event_update(event, time);
4306 }
4307
/*
 * pmu callbacks for the task-clock software event, used by
 * sw_perf_event_init() when the event's context has a task.
 */
static const struct pmu perf_ops_task_clock = {
        .enable         = task_clock_perf_event_enable,
        .disable        = task_clock_perf_event_disable,
        .read           = task_clock_perf_event_read,
};
4313
4314 #ifdef CONFIG_EVENT_TRACING
4315
4316 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4317                           int entry_size)
4318 {
4319         struct pt_regs *regs = get_irq_regs();
4320         struct perf_sample_data data;
4321         struct perf_raw_record raw = {
4322                 .size = entry_size,
4323                 .data = record,
4324         };
4325
4326         perf_sample_data_init(&data, addr);
4327         data.raw = &raw;
4328
4329         if (!regs)
4330                 regs = task_pt_regs(current);
4331
4332         /* Trace events already protected against recursion */
4333         do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4334                                 &data, regs);
4335 }
4336 EXPORT_SYMBOL_GPL(perf_tp_event);
4337
4338 static int perf_tp_event_match(struct perf_event *event,
4339                                 struct perf_sample_data *data)
4340 {
4341         void *record = data->raw->data;
4342
4343         if (likely(!event->filter) || filter_match_preds(event->filter, record))
4344                 return 1;
4345         return 0;
4346 }
4347
/*
 * ->destroy hook for tracepoint events: undo the
 * ftrace_profile_enable() done in tp_perf_event_init().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        ftrace_profile_disable(event->attr.config);
}
4352
4353 static const struct pmu *tp_perf_event_init(struct perf_event *event)
4354 {
4355         /*
4356          * Raw tracepoint data is a severe data leak, only allow root to
4357          * have these.
4358          */
4359         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4360                         perf_paranoid_tracepoint_raw() &&
4361                         !capable(CAP_SYS_ADMIN))
4362                 return ERR_PTR(-EPERM);
4363
4364         if (ftrace_profile_enable(event->attr.config))
4365                 return NULL;
4366
4367         event->destroy = tp_perf_event_destroy;
4368
4369         return &perf_ops_generic;
4370 }
4371
4372 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4373 {
4374         char *filter_str;
4375         int ret;
4376
4377         if (event->attr.type != PERF_TYPE_TRACEPOINT)
4378                 return -EINVAL;
4379
4380         filter_str = strndup_user(arg, PAGE_SIZE);
4381         if (IS_ERR(filter_str))
4382                 return PTR_ERR(filter_str);
4383
4384         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4385
4386         kfree(filter_str);
4387         return ret;
4388 }
4389
/*
 * Release @event's tracepoint filter via ftrace_profile_free_filter().
 */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}
4394
4395 #else
4396
/*
 * Stub used when CONFIG_EVENT_TRACING is not set: always matches.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_sample_data *data)
{
        return 1;
}
4402
/*
 * Stub used when CONFIG_EVENT_TRACING is not set: no pmu is available,
 * which perf_event_alloc() turns into -EINVAL.
 */
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
        return NULL;
}
4407
/*
 * Stub used when CONFIG_EVENT_TRACING is not set: always -ENOENT.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        return -ENOENT;
}
4412
/*
 * Stub used when CONFIG_EVENT_TRACING is not set: nothing to free.
 */
static void perf_event_free_filter(struct perf_event *event)
{
}
4416
4417 #endif /* CONFIG_EVENT_TRACING */
4418
4419 #ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * ->destroy hook for breakpoint events, installed by
 * bp_perf_event_init(): release the breakpoint slot.
 */
static void bp_perf_event_destroy(struct perf_event *event)
{
        release_bp_slot(event);
}
4424
4425 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4426 {
4427         int err;
4428
4429         err = register_perf_hw_breakpoint(bp);
4430         if (err)
4431                 return ERR_PTR(err);
4432
4433         bp->destroy = bp_perf_event_destroy;
4434
4435         return &perf_ops_bp;
4436 }
4437
4438 void perf_bp_event(struct perf_event *bp, void *data)
4439 {
4440         struct perf_sample_data sample;
4441         struct pt_regs *regs = data;
4442
4443         perf_sample_data_init(&sample, bp->attr.bp_addr);
4444
4445         if (!perf_exclude_event(bp, regs))
4446                 perf_swevent_add(bp, 1, 1, &sample, regs);
4447 }
4448 #else
/*
 * Stub used when CONFIG_HAVE_HW_BREAKPOINT is not set: no pmu is
 * available, which perf_event_alloc() turns into -EINVAL.
 */
static const struct pmu *bp_perf_event_init(struct perf_event *bp)
{
        return NULL;
}
4453
/*
 * Stub used when CONFIG_HAVE_HW_BREAKPOINT is not set: does nothing.
 */
void perf_bp_event(struct perf_event *bp, void *regs)
{
}
4457 #endif
4458
/*
 * Per software-event-id counters, indexed by attr.config. Incremented
 * by sw_perf_event_init() for non-inherited generic software events and
 * decremented by sw_perf_event_destroy().
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4460
4461 static void sw_perf_event_destroy(struct perf_event *event)
4462 {
4463         u64 event_id = event->attr.config;
4464
4465         WARN_ON(event->parent);
4466
4467         atomic_dec(&perf_swevent_enabled[event_id]);
4468 }
4469
4470 static const struct pmu *sw_perf_event_init(struct perf_event *event)
4471 {
4472         const struct pmu *pmu = NULL;
4473         u64 event_id = event->attr.config;
4474
4475         /*
4476          * Software events (currently) can't in general distinguish
4477          * between user, kernel and hypervisor events.
4478          * However, context switches and cpu migrations are considered
4479          * to be kernel events, and page faults are never hypervisor
4480          * events.
4481          */
4482         switch (event_id) {
4483         case PERF_COUNT_SW_CPU_CLOCK:
4484                 pmu = &perf_ops_cpu_clock;
4485
4486                 break;
4487         case PERF_COUNT_SW_TASK_CLOCK:
4488                 /*
4489                  * If the user instantiates this as a per-cpu event,
4490                  * use the cpu_clock event instead.
4491                  */
4492                 if (event->ctx->task)
4493                         pmu = &perf_ops_task_clock;
4494                 else
4495                         pmu = &perf_ops_cpu_clock;
4496
4497                 break;
4498         case PERF_COUNT_SW_PAGE_FAULTS:
4499         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4500         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4501         case PERF_COUNT_SW_CONTEXT_SWITCHES:
4502         case PERF_COUNT_SW_CPU_MIGRATIONS:
4503         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4504         case PERF_COUNT_SW_EMULATION_FAULTS:
4505                 if (!event->parent) {
4506                         atomic_inc(&perf_swevent_enabled[event_id]);
4507                         event->destroy = sw_perf_event_destroy;
4508                 }
4509                 pmu = &perf_ops_generic;
4510                 break;
4511         }
4512
4513         return pmu;
4514 }
4515
/*
 * Allocate and initialize an event structure.
 *
 * @attr:             attributes, copied by value into the event
 * @cpu:              stored in event->cpu
 * @ctx:              stored in event->ctx
 * @group_leader:     group leader; NULL makes the event its own leader
 * @parent_event:     stored in event->parent; also supplies the overflow
 *                    handler when @overflow_handler is NULL
 * @overflow_handler: stored in event->overflow_handler
 * @gfpflags:         allocation flags for kzalloc()
 *
 * Returns the new event, or an ERR_PTR(): -ENOMEM if the allocation
 * fails, -EINVAL if no pmu could be selected, or the error reported by
 * the type-specific init routine.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
                   int cpu,
                   struct perf_event_context *ctx,
                   struct perf_event *group_leader,
                   struct perf_event *parent_event,
                   perf_overflow_handler_t overflow_handler,
                   gfp_t gfpflags)
{
        const struct pmu *pmu;
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err;

        event = kzalloc(sizeof(*event), gfpflags);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        init_waitqueue_head(&event->waitq);

        mutex_init(&event->mmap_mutex);

        event->cpu              = cpu;
        event->attr             = *attr;
        event->group_leader     = group_leader;
        event->pmu              = NULL;
        event->ctx              = ctx;
        event->oncpu            = -1;

        event->parent           = parent_event;

        /* Reference on the pid namespace; dropped on the error path below. */
        event->ns               = get_pid_ns(current->nsproxy->pid_ns);
        event->id               = atomic64_inc_return(&perf_event_id);

        event->state            = PERF_EVENT_STATE_INACTIVE;

        /* Without an explicit handler, use the parent's (if any). */
        if (!overflow_handler && parent_event)
                overflow_handler = parent_event->overflow_handler;

        event->overflow_handler = overflow_handler;

        if (attr->disabled)
                event->state = PERF_EVENT_STATE_OFF;

        pmu = NULL;

        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        /* Frequency-based events start out with a period of 1. */
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        atomic64_set(&hwc->period_left, hwc->sample_period);

        /*
         * we currently do not support PERF_FORMAT_GROUP on inherited events
         */
        if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
                goto done;

        /* Type-specific init; the result is NULL, an ERR_PTR() or a pmu. */
        switch (attr->type) {
        case PERF_TYPE_RAW:
        case PERF_TYPE_HARDWARE:
        case PERF_TYPE_HW_CACHE:
                pmu = hw_perf_event_init(event);
                break;

        case PERF_TYPE_SOFTWARE:
                pmu = sw_perf_event_init(event);
                break;

        case PERF_TYPE_TRACEPOINT:
                pmu = tp_perf_event_init(event);
                break;

        case PERF_TYPE_BREAKPOINT:
                pmu = bp_perf_event_init(event);
                break;


        default:
                break;
        }
done:
        err = 0;
        if (!pmu)
                err = -EINVAL;
        else if (IS_ERR(pmu))
                err = PTR_ERR(pmu);

        if (err) {
                if (event->ns)
                        put_pid_ns(event->ns);
                kfree(event);
                return ERR_PTR(err);
        }

        event->pmu = pmu;

        /* Global counters only track events that have no parent. */
        if (!event->parent) {
                atomic_inc(&nr_events);
                if (event->attr.mmap)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.task)
                        atomic_inc(&nr_task_events);
        }

        return event;
}
4643
4644 static int perf_copy_attr(struct perf_event_attr __user *uattr,
4645                           struct perf_event_attr *attr)
4646 {
4647         u32 size;
4648         int ret;
4649
4650         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4651                 return -EFAULT;
4652
4653         /*
4654          * zero the full structure, so that a short copy will be nice.
4655          */
4656         memset(attr, 0, sizeof(*attr));
4657
4658         ret = get_user(size, &uattr->size);
4659         if (ret)
4660                 return ret;
4661
4662         if (size > PAGE_SIZE)   /* silly large */
4663                 goto err_size;
4664
4665         if (!size)              /* abi compat */
4666                 size = PERF_ATTR_SIZE_VER0;
4667
4668         if (size < PERF_ATTR_SIZE_VER0)
4669                 goto err_size;
4670
4671         /*
4672          * If we're handed a bigger struct than we know of,
4673          * ensure all the unknown bits are 0 - i.e. new
4674          * user-space does not rely on any kernel feature
4675          * extensions we dont know about yet.
4676          */
4677         if (size > sizeof(*attr)) {
4678                 unsigned char __user *addr;
4679                 unsigned char __user *end;
4680                 unsigned char val;
4681
4682                 addr = (void __user *)uattr + sizeof(*attr);
4683                 end  = (void __user *)uattr + size;
4684
4685                 for (; addr < end; addr++) {
4686                         ret = get_user(val, addr);
4687                         if (ret)
4688                                 return ret;
4689                         if (val)
4690                                 goto err_size;
4691                 }
4692                 size = sizeof(*attr);
4693         }
4694
4695         ret = copy_from_user(attr, uattr, size);
4696         if (ret)
4697                 return -EFAULT;
4698
4699         /*
4700          * If the type exists, the corresponding creation will verify
4701          * the attr->config.
4702          */
4703         if (attr->type >= PERF_TYPE_MAX)
4704                 return -EINVAL;
4705
4706         if (attr->__reserved_1)
4707                 return -EINVAL;
4708
4709         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4710                 return -EINVAL;
4711
4712         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4713                 return -EINVAL;
4714
4715 out:
4716         return ret;
4717
4718 err_size:
4719         put_user(sizeof(*attr), &uattr->size);
4720         ret = -E2BIG;
4721         goto out;
4722 }
4723
4724 static int perf_event_set_output(struct perf_event *event, int output_fd)
4725 {
4726         struct perf_event *output_event = NULL;
4727         struct file *output_file = NULL;
4728         struct perf_event *old_output;
4729         int fput_needed = 0;
4730         int ret = -EINVAL;
4731
4732         if (!output_fd)
4733                 goto set;
4734
4735         output_file = fget_light(output_fd, &fput_needed);
4736         if (!output_file)
4737                 return -EBADF;
4738
4739         if (output_file->f_op != &perf_fops)
4740                 goto out;
4741
4742         output_event = output_file->private_data;
4743
4744         /* Don't chain output fds */
4745         if (output_event->output)
4746                 goto out;
4747
4748         /* Don't set an output fd when we already have an output channel */
4749         if (event->data)
4750                 goto out;
4751
4752         atomic_long_inc(&output_file->f_count);
4753
4754 set:
4755         mutex_lock(&event->mmap_mutex);
4756         old_output = event->output;
4757         rcu_assign_pointer(event->output, output_event);
4758         mutex_unlock(&event->mmap_mutex);
4759
4760         if (old_output) {
4761                 /*
4762                  * we need to make sure no existing perf_output_*()
4763                  * is still referencing this event.
4764                  */
4765                 synchronize_rcu();
4766                 fput(old_output->filp);
4767         }
4768
4769         ret = 0;
4770 out:
4771         fput_light(output_file, fput_needed);
4772         return ret;
4773 }
4774
4775 /**
4776  * sys_perf_event_open - open a performance event, associate it to a task/cpu
4777  *
4778  * @attr_uptr:  event_id type attributes for monitoring/sampling
4779  * @pid:                target pid
4780  * @cpu:                target cpu
4781  * @group_fd:           group leader event fd
4782  */
4783 SYSCALL_DEFINE5(perf_event_open,
4784                 struct perf_event_attr __user *, attr_uptr,
4785                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4786 {
4787         struct perf_event *event, *group_leader;
4788         struct perf_event_attr attr;
4789         struct perf_event_context *ctx;
4790         struct file *event_file = NULL;
4791         struct file *group_file = NULL;
4792         int fput_needed = 0;
4793         int fput_needed2 = 0;
4794         int err;
4795
4796         /* for future expandability... */
4797         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4798                 return -EINVAL;
4799
4800         err = perf_copy_attr(attr_uptr, &attr);
4801         if (err)
4802                 return err;
4803
4804         if (!attr.exclude_kernel) {
4805                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4806                         return -EACCES;
4807         }
4808
4809         if (attr.freq) {
4810                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4811                         return -EINVAL;
4812         }
4813
4814         /*
4815          * Get the target context (task or percpu):
4816          */
4817         ctx = find_get_context(pid, cpu);
4818         if (IS_ERR(ctx))
4819                 return PTR_ERR(ctx);
4820
4821         /*
4822          * Look up the group leader (we will attach this event to it):
4823          */
4824         group_leader = NULL;
4825         if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4826                 err = -EINVAL;
4827                 group_file = fget_light(group_fd, &fput_needed);
4828                 if (!group_file)
4829                         goto err_put_context;
4830                 if (group_file->f_op != &perf_fops)
4831                         goto err_put_context;
4832
4833                 group_leader = group_file->private_data;
4834                 /*
4835                  * Do not allow a recursive hierarchy (this new sibling
4836                  * becoming part of another group-sibling):
4837                  */
4838                 if (group_leader->group_leader != group_leader)
4839                         goto err_put_context;
4840                 /*
4841                  * Do not allow to attach to a group in a different
4842                  * task or CPU context:
4843                  */
4844                 if (group_leader->ctx != ctx)
4845                         goto err_put_context;
4846                 /*
4847                  * Only a group leader can be exclusive or pinned
4848                  */
4849                 if (attr.exclusive || attr.pinned)
4850                         goto err_put_context;
4851         }
4852
4853         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4854                                      NULL, NULL, GFP_KERNEL);
4855         err = PTR_ERR(event);
4856         if (IS_ERR(event))
4857                 goto err_put_context;
4858
4859         err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4860         if (err < 0)
4861                 goto err_free_put_context;
4862
4863         event_file = fget_light(err, &fput_needed2);
4864         if (!event_file)
4865                 goto err_free_put_context;
4866
4867         if (flags & PERF_FLAG_FD_OUTPUT) {
4868                 err = perf_event_set_output(event, group_fd);
4869                 if (err)
4870                         goto err_fput_free_put_context;
4871         }
4872
4873         event->filp = event_file;
4874         WARN_ON_ONCE(ctx->parent_ctx);
4875         mutex_lock(&ctx->mutex);
4876         perf_install_in_context(ctx, event, cpu);
4877         ++ctx->generation;
4878         mutex_unlock(&ctx->mutex);
4879
4880         event->owner = current;
4881         get_task_struct(current);
4882         mutex_lock(&current->perf_event_mutex);
4883         list_add_tail(&event->owner_entry, &current->perf_event_list);
4884         mutex_unlock(&current->perf_event_mutex);
4885
4886 err_fput_free_put_context:
4887         fput_light(event_file, fput_needed2);
4888
4889 err_free_put_context:
4890         if (err < 0)
4891                 kfree(event);
4892
4893 err_put_context:
4894         if (err < 0)
4895                 put_ctx(ctx);
4896
4897         fput_light(group_file, fput_needed);
4898
4899         return err;
4900 }
4901
4902 /**
4903  * perf_event_create_kernel_counter
4904  *
4905  * @attr: attributes of the counter to create
4906  * @cpu: cpu in which the counter is bound
4907  * @pid: task to profile
4908  */
4909 struct perf_event *
4910 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4911                                  pid_t pid,
4912                                  perf_overflow_handler_t overflow_handler)
4913 {
4914         struct perf_event *event;
4915         struct perf_event_context *ctx;
4916         int err;
4917
4918         /*
4919          * Get the target context (task or percpu):
4920          */
4921
4922         ctx = find_get_context(pid, cpu);
4923         if (IS_ERR(ctx)) {
4924                 err = PTR_ERR(ctx);
4925                 goto err_exit;
4926         }
4927
4928         event = perf_event_alloc(attr, cpu, ctx, NULL,
4929                                  NULL, overflow_handler, GFP_KERNEL);
4930         if (IS_ERR(event)) {
4931                 err = PTR_ERR(event);
4932                 goto err_put_context;
4933         }
4934
4935         event->filp = NULL;
4936         WARN_ON_ONCE(ctx->parent_ctx);
4937         mutex_lock(&ctx->mutex);
4938         perf_install_in_context(ctx, event, cpu);
4939         ++ctx->generation;
4940         mutex_unlock(&ctx->mutex);
4941
4942         event->owner = current;
4943         get_task_struct(current);
4944         mutex_lock(&current->perf_event_mutex);
4945         list_add_tail(&event->owner_entry, &current->perf_event_list);
4946         mutex_unlock(&current->perf_event_mutex);
4947
4948         return event;
4949
4950  err_put_context:
4951         put_ctx(ctx);
4952  err_exit:
4953         return ERR_PTR(err);
4954 }
4955 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4956
4957 /*
4958  * inherit a event from parent task to child task:
4959  */
4960 static struct perf_event *
4961 inherit_event(struct perf_event *parent_event,
4962               struct task_struct *parent,
4963               struct perf_event_context *parent_ctx,
4964               struct task_struct *child,
4965               struct perf_event *group_leader,
4966               struct perf_event_context *child_ctx)
4967 {
4968         struct perf_event *child_event;
4969
4970         /*
4971          * Instead of creating recursive hierarchies of events,
4972          * we link inherited events back to the original parent,
4973          * which has a filp for sure, which we use as the reference
4974          * count:
4975          */
4976         if (parent_event->parent)
4977                 parent_event = parent_event->parent;
4978
4979         child_event = perf_event_alloc(&parent_event->attr,
4980                                            parent_event->cpu, child_ctx,
4981                                            group_leader, parent_event,
4982                                            NULL, GFP_KERNEL);
4983         if (IS_ERR(child_event))
4984                 return child_event;
4985         get_ctx(child_ctx);
4986
4987         /*
4988          * Make the child state follow the state of the parent event,
4989          * not its attr.disabled bit.  We hold the parent's mutex,
4990          * so we won't race with perf_event_{en, dis}able_family.
4991          */
4992         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4993                 child_event->state = PERF_EVENT_STATE_INACTIVE;
4994         else
4995                 child_event->state = PERF_EVENT_STATE_OFF;
4996
4997         if (parent_event->attr.freq) {
4998                 u64 sample_period = parent_event->hw.sample_period;
4999                 struct hw_perf_event *hwc = &child_event->hw;
5000
5001                 hwc->sample_period = sample_period;
5002                 hwc->last_period   = sample_period;
5003
5004                 atomic64_set(&hwc->period_left, sample_period);
5005         }
5006
5007         child_event->overflow_handler = parent_event->overflow_handler;
5008
5009         /*
5010          * Link it up in the child's context:
5011          */
5012         add_event_to_ctx(child_event, child_ctx);
5013
5014         /*
5015          * Get a reference to the parent filp - we will fput it
5016          * when the child event exits. This is safe to do because
5017          * we are in the parent and we know that the filp still
5018          * exists and has a nonzero count:
5019          */
5020         atomic_long_inc(&parent_event->filp->f_count);
5021
5022         /*
5023          * Link this into the parent event's child list
5024          */
5025         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5026         mutex_lock(&parent_event->child_mutex);
5027         list_add_tail(&child_event->child_list, &parent_event->child_list);
5028         mutex_unlock(&parent_event->child_mutex);
5029
5030         return child_event;
5031 }
5032
5033 static int inherit_group(struct perf_event *parent_event,
5034               struct task_struct *parent,
5035               struct perf_event_context *parent_ctx,
5036               struct task_struct *child,
5037               struct perf_event_context *child_ctx)
5038 {
5039         struct perf_event *leader;
5040         struct perf_event *sub;
5041         struct perf_event *child_ctr;
5042
5043         leader = inherit_event(parent_event, parent, parent_ctx,
5044                                  child, NULL, child_ctx);
5045         if (IS_ERR(leader))
5046                 return PTR_ERR(leader);
5047         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
5048                 child_ctr = inherit_event(sub, parent, parent_ctx,
5049                                             child, leader, child_ctx);
5050                 if (IS_ERR(child_ctr))
5051                         return PTR_ERR(child_ctr);
5052         }
5053         return 0;
5054 }
5055
5056 static void sync_child_event(struct perf_event *child_event,
5057                                struct task_struct *child)
5058 {
5059         struct perf_event *parent_event = child_event->parent;
5060         u64 child_val;
5061
5062         if (child_event->attr.inherit_stat)
5063                 perf_event_read_event(child_event, child);
5064
5065         child_val = atomic64_read(&child_event->count);
5066
5067         /*
5068          * Add back the child's count to the parent's count:
5069          */
5070         atomic64_add(child_val, &parent_event->count);
5071         atomic64_add(child_event->total_time_enabled,
5072                      &parent_event->child_total_time_enabled);
5073         atomic64_add(child_event->total_time_running,
5074                      &parent_event->child_total_time_running);
5075
5076         /*
5077          * Remove this event from the parent's list
5078          */
5079         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
5080         mutex_lock(&parent_event->child_mutex);
5081         list_del_init(&child_event->child_list);
5082         mutex_unlock(&parent_event->child_mutex);
5083
5084         /*
5085          * Release the parent event, if this was the last
5086          * reference to it.
5087          */
5088         fput(parent_event->filp);
5089 }
5090
5091 static void
5092 __perf_event_exit_task(struct perf_event *child_event,
5093                          struct perf_event_context *child_ctx,
5094                          struct task_struct *child)
5095 {
5096         struct perf_event *parent_event;
5097
5098         perf_event_remove_from_context(child_event);
5099
5100         parent_event = child_event->parent;
5101         /*
5102          * It can happen that parent exits first, and has events
5103          * that are still around due to the child reference. These
5104          * events need to be zapped - but otherwise linger.
5105          */
5106         if (parent_event) {
5107                 sync_child_event(child_event, child);
5108                 free_event(child_event);
5109         }
5110 }
5111
5112 /*
5113  * When a child task exits, feed back event values to parent events.
5114  */
5115 void perf_event_exit_task(struct task_struct *child)
5116 {
5117         struct perf_event *child_event, *tmp;
5118         struct perf_event_context *child_ctx;
5119         unsigned long flags;
5120
5121         if (likely(!child->perf_event_ctxp)) {
5122                 perf_event_task(child, NULL, 0);
5123                 return;
5124         }
5125
5126         local_irq_save(flags);
5127         /*
5128          * We can't reschedule here because interrupts are disabled,
5129          * and either child is current or it is a task that can't be
5130          * scheduled, so we are now safe from rescheduling changing
5131          * our context.
5132          */
5133         child_ctx = child->perf_event_ctxp;
5134         __perf_event_task_sched_out(child_ctx);
5135
5136         /*
5137          * Take the context lock here so that if find_get_context is
5138          * reading child->perf_event_ctxp, we wait until it has
5139          * incremented the context's refcount before we do put_ctx below.
5140          */
5141         raw_spin_lock(&child_ctx->lock);
5142         child->perf_event_ctxp = NULL;
5143         /*
5144          * If this context is a clone; unclone it so it can't get
5145          * swapped to another process while we're removing all
5146          * the events from it.
5147          */
5148         unclone_ctx(child_ctx);
5149         update_context_time(child_ctx);
5150         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5151
5152         /*
5153          * Report the task dead after unscheduling the events so that we
5154          * won't get any samples after PERF_RECORD_EXIT. We can however still
5155          * get a few PERF_RECORD_READ events.
5156          */
5157         perf_event_task(child, child_ctx, 0);
5158
5159         /*
5160          * We can recurse on the same lock type through:
5161          *
5162          *   __perf_event_exit_task()
5163          *     sync_child_event()
5164          *       fput(parent_event->filp)
5165          *         perf_release()
5166          *           mutex_lock(&ctx->mutex)
5167          *
5168          * But since its the parent context it won't be the same instance.
5169          */
5170         mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5171
5172 again:
5173         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5174                                  group_entry)
5175                 __perf_event_exit_task(child_event, child_ctx, child);
5176
5177         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5178                                  group_entry)
5179                 __perf_event_exit_task(child_event, child_ctx, child);
5180
5181         /*
5182          * If the last event was a group event, it will have appended all
5183          * its siblings to the list, but we obtained 'tmp' before that which
5184          * will still point to the list head terminating the iteration.
5185          */
5186         if (!list_empty(&child_ctx->pinned_groups) ||
5187             !list_empty(&child_ctx->flexible_groups))
5188                 goto again;
5189
5190         mutex_unlock(&child_ctx->mutex);
5191
5192         put_ctx(child_ctx);
5193 }
5194
5195 static void perf_free_event(struct perf_event *event,
5196                             struct perf_event_context *ctx)
5197 {
5198         struct perf_event *parent = event->parent;
5199
5200         if (WARN_ON_ONCE(!parent))
5201                 return;
5202
5203         mutex_lock(&parent->child_mutex);
5204         list_del_init(&event->child_list);
5205         mutex_unlock(&parent->child_mutex);
5206
5207         fput(parent->filp);
5208
5209         list_del_event(event, ctx);
5210         free_event(event);
5211 }
5212
5213 /*
5214  * free an unexposed, unused context as created by inheritance by
5215  * init_task below, used by fork() in case of fail.
5216  */
5217 void perf_event_free_task(struct task_struct *task)
5218 {
5219         struct perf_event_context *ctx = task->perf_event_ctxp;
5220         struct perf_event *event, *tmp;
5221
5222         if (!ctx)
5223                 return;
5224
5225         mutex_lock(&ctx->mutex);
5226 again:
5227         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5228                 perf_free_event(event, ctx);
5229
5230         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5231                                  group_entry)
5232                 perf_free_event(event, ctx);
5233
5234         if (!list_empty(&ctx->pinned_groups) ||
5235             !list_empty(&ctx->flexible_groups))
5236                 goto again;
5237
5238         mutex_unlock(&ctx->mutex);
5239
5240         put_ctx(ctx);
5241 }
5242
5243 static int
5244 inherit_task_group(struct perf_event *event, struct task_struct *parent,
5245                    struct perf_event_context *parent_ctx,
5246                    struct task_struct *child,
5247                    int *inherited_all)
5248 {
5249         int ret;
5250         struct perf_event_context *child_ctx = child->perf_event_ctxp;
5251
5252         if (!event->attr.inherit) {
5253                 *inherited_all = 0;
5254                 return 0;
5255         }
5256
5257         if (!child_ctx) {
5258                 /*
5259                  * This is executed from the parent task context, so
5260                  * inherit events that have been marked for cloning.
5261                  * First allocate and initialize a context for the
5262                  * child.
5263                  */
5264
5265                 child_ctx = kzalloc(sizeof(struct perf_event_context),
5266                                     GFP_KERNEL);
5267                 if (!child_ctx)
5268                         return -ENOMEM;
5269
5270                 __perf_event_init_context(child_ctx, child);
5271                 child->perf_event_ctxp = child_ctx;
5272                 get_task_struct(child);
5273         }
5274
5275         ret = inherit_group(event, parent, parent_ctx,
5276                             child, child_ctx);
5277
5278         if (ret)
5279                 *inherited_all = 0;
5280
5281         return ret;
5282 }
5283
5284
5285 /*
5286  * Initialize the perf_event context in task_struct
5287  */
5288 int perf_event_init_task(struct task_struct *child)
5289 {
5290         struct perf_event_context *child_ctx, *parent_ctx;
5291         struct perf_event_context *cloned_ctx;
5292         struct perf_event *event;
5293         struct task_struct *parent = current;
5294         int inherited_all = 1;
5295         int ret = 0;
5296
5297         child->perf_event_ctxp = NULL;
5298
5299         mutex_init(&child->perf_event_mutex);
5300         INIT_LIST_HEAD(&child->perf_event_list);
5301
5302         if (likely(!parent->perf_event_ctxp))
5303                 return 0;
5304
5305         /*
5306          * If the parent's context is a clone, pin it so it won't get
5307          * swapped under us.
5308          */
5309         parent_ctx = perf_pin_task_context(parent);
5310
5311         /*
5312          * No need to check if parent_ctx != NULL here; since we saw
5313          * it non-NULL earlier, the only reason for it to become NULL
5314          * is if we exit, and since we're currently in the middle of
5315          * a fork we can't be exiting at the same time.
5316          */
5317
5318         /*
5319          * Lock the parent list. No need to lock the child - not PID
5320          * hashed yet and not running, so nobody can access it.
5321          */
5322         mutex_lock(&parent_ctx->mutex);
5323
5324         /*
5325          * We dont have to disable NMIs - we are only looking at
5326          * the list, not manipulating it:
5327          */
5328         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5329                 ret = inherit_task_group(event, parent, parent_ctx, child,
5330                                          &inherited_all);
5331                 if (ret)
5332                         break;
5333         }
5334
5335         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5336                 ret = inherit_task_group(event, parent, parent_ctx, child,
5337                                          &inherited_all);
5338                 if (ret)
5339                         break;
5340         }
5341
5342         child_ctx = child->perf_event_ctxp;
5343
5344         if (child_ctx && inherited_all) {
5345                 /*
5346                  * Mark the child context as a clone of the parent
5347                  * context, or of whatever the parent is a clone of.
5348                  * Note that if the parent is a clone, it could get
5349                  * uncloned at any point, but that doesn't matter
5350                  * because the list of events and the generation
5351                  * count can't have changed since we took the mutex.
5352                  */
5353                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
5354                 if (cloned_ctx) {
5355                         child_ctx->parent_ctx = cloned_ctx;
5356                         child_ctx->parent_gen = parent_ctx->parent_gen;
5357                 } else {
5358                         child_ctx->parent_ctx = parent_ctx;
5359                         child_ctx->parent_gen = parent_ctx->generation;
5360                 }
5361                 get_ctx(child_ctx->parent_ctx);
5362         }
5363
5364         mutex_unlock(&parent_ctx->mutex);
5365
5366         perf_unpin_context(parent_ctx);
5367
5368         return ret;
5369 }
5370
5371 static void __cpuinit perf_event_init_cpu(int cpu)
5372 {
5373         struct perf_cpu_context *cpuctx;
5374
5375         cpuctx = &per_cpu(perf_cpu_context, cpu);
5376         __perf_event_init_context(&cpuctx->ctx, NULL);
5377
5378         spin_lock(&perf_resource_lock);
5379         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5380         spin_unlock(&perf_resource_lock);
5381 }
5382
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Strip every event group from the perf context of the CPU this runs
 * on.  Invoked through smp_call_function_single() below; @info is
 * unused.
 */
static void __perf_event_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = &cpuctx->ctx;
	struct perf_event *event, *next;

	list_for_each_entry_safe(event, next, &ctx->pinned_groups,
				 group_entry)
		__perf_event_remove_from_context(event);

	list_for_each_entry_safe(event, next, &ctx->flexible_groups,
				 group_entry)
		__perf_event_remove_from_context(event);
}

/*
 * Remove all events from @cpu's context by running the helper above on
 * that CPU, holding the context mutex while waiting for it to finish.
 */
static void perf_event_exit_cpu(int cpu)
{
	struct perf_event_context *ctx = &per_cpu(perf_cpu_context, cpu).ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
/* Without CPU hotplug there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu) { }
#endif
5407
5408 static int __cpuinit
5409 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5410 {
5411         unsigned int cpu = (long)hcpu;
5412
5413         switch (action) {
5414
5415         case CPU_UP_PREPARE:
5416         case CPU_UP_PREPARE_FROZEN:
5417                 perf_event_init_cpu(cpu);
5418                 break;
5419
5420         case CPU_DOWN_PREPARE:
5421         case CPU_DOWN_PREPARE_FROZEN:
5422                 perf_event_exit_cpu(cpu);
5423                 break;
5424
5425         default:
5426                 break;
5427         }
5428
5429         return NOTIFY_OK;
5430 }
5431
5432 /*
5433  * This has to have a higher priority than migration_notifier in sched.c.
5434  */
5435 static struct notifier_block __cpuinitdata perf_cpu_nb = {
5436         .notifier_call          = perf_cpu_notify,
5437         .priority               = 20,
5438 };
5439
5440 void __init perf_event_init(void)
5441 {
5442         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5443                         (void *)(long)smp_processor_id());
5444         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5445                         (void *)(long)smp_processor_id());
5446         register_cpu_notifier(&perf_cpu_nb);
5447 }
5448
5449 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5450                                         struct sysdev_class_attribute *attr,
5451                                         char *buf)
5452 {
5453         return sprintf(buf, "%d\n", perf_reserved_percpu);
5454 }
5455
5456 static ssize_t
5457 perf_set_reserve_percpu(struct sysdev_class *class,
5458                         struct sysdev_class_attribute *attr,
5459                         const char *buf,
5460                         size_t count)
5461 {
5462         struct perf_cpu_context *cpuctx;
5463         unsigned long val;
5464         int err, cpu, mpt;
5465
5466         err = strict_strtoul(buf, 10, &val);
5467         if (err)
5468                 return err;
5469         if (val > perf_max_events)
5470                 return -EINVAL;
5471
5472         spin_lock(&perf_resource_lock);
5473         perf_reserved_percpu = val;
5474         for_each_online_cpu(cpu) {
5475                 cpuctx = &per_cpu(perf_cpu_context, cpu);
5476                 raw_spin_lock_irq(&cpuctx->ctx.lock);
5477                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5478                           perf_max_events - perf_reserved_percpu);
5479                 cpuctx->max_pertask = mpt;
5480                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5481         }
5482         spin_unlock(&perf_resource_lock);
5483
5484         return count;
5485 }
5486
5487 static ssize_t perf_show_overcommit(struct sysdev_class *class,
5488                                     struct sysdev_class_attribute *attr,
5489                                     char *buf)
5490 {
5491         return sprintf(buf, "%d\n", perf_overcommit);
5492 }
5493
5494 static ssize_t
5495 perf_set_overcommit(struct sysdev_class *class,
5496                     struct sysdev_class_attribute *attr,
5497                     const char *buf, size_t count)
5498 {
5499         unsigned long val;
5500         int err;
5501
5502         err = strict_strtoul(buf, 10, &val);
5503         if (err)
5504                 return err;
5505         if (val > 1)
5506                 return -EINVAL;
5507
5508         spin_lock(&perf_resource_lock);
5509         perf_overcommit = val;
5510         spin_unlock(&perf_resource_lock);
5511
5512         return count;
5513 }
5514
5515 static SYSDEV_CLASS_ATTR(
5516                                 reserve_percpu,
5517                                 0644,
5518                                 perf_show_reserve_percpu,
5519                                 perf_set_reserve_percpu
5520                         );
5521
5522 static SYSDEV_CLASS_ATTR(
5523                                 overcommit,
5524                                 0644,
5525                                 perf_show_overcommit,
5526                                 perf_set_overcommit
5527                         );
5528
5529 static struct attribute *perfclass_attrs[] = {
5530         &attr_reserve_percpu.attr,
5531         &attr_overcommit.attr,
5532         NULL
5533 };
5534
5535 static struct attribute_group perfclass_attr_group = {
5536         .attrs                  = perfclass_attrs,
5537         .name                   = "perf_events",
5538 };
5539
5540 static int __init perf_event_sysfs_init(void)
5541 {
5542         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5543                                   &perfclass_attr_group);
5544 }
5545 device_initcall(perf_event_sysfs_init);