2 * Performance counter core code
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
8 * For licensing details see kernel-base/COPYING
13 #include <linux/cpu.h>
14 #include <linux/smp.h>
15 #include <linux/file.h>
16 #include <linux/poll.h>
17 #include <linux/sysfs.h>
18 #include <linux/ptrace.h>
19 #include <linux/percpu.h>
20 #include <linux/vmstat.h>
21 #include <linux/hardirq.h>
22 #include <linux/rculist.h>
23 #include <linux/uaccess.h>
24 #include <linux/syscalls.h>
25 #include <linux/anon_inodes.h>
26 #include <linux/kernel_stat.h>
27 #include <linux/perf_counter.h>
28 #include <linux/dcache.h>
30 #include <asm/irq_regs.h>
33 * Each CPU has a list of per CPU counters:
35 DEFINE_PER_CPU(struct perf_cpu_context
, perf_cpu_context
);
37 int perf_max_counters __read_mostly
= 1;
38 static int perf_reserved_percpu __read_mostly
;
39 static int perf_overcommit __read_mostly
= 1;
41 static atomic_t nr_mmap_tracking __read_mostly
;
42 static atomic_t nr_munmap_tracking __read_mostly
;
43 static atomic_t nr_comm_tracking __read_mostly
;
46 * Mutex for (sysadmin-configurable) counter reservations:
48 static DEFINE_MUTEX(perf_resource_mutex
);
51 * Architecture provided APIs - weak aliases:
53 extern __weak
const struct hw_perf_counter_ops
*
54 hw_perf_counter_init(struct perf_counter
*counter
)
59 u64 __weak
hw_perf_save_disable(void) { return 0; }
60 void __weak
hw_perf_restore(u64 ctrl
) { barrier(); }
61 void __weak
hw_perf_counter_setup(int cpu
) { barrier(); }
62 int __weak
hw_perf_group_sched_in(struct perf_counter
*group_leader
,
63 struct perf_cpu_context
*cpuctx
,
64 struct perf_counter_context
*ctx
, int cpu
)
69 void __weak
perf_counter_print_debug(void) { }
72 list_add_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
74 struct perf_counter
*group_leader
= counter
->group_leader
;
77 * Depending on whether it is a standalone or sibling counter,
78 * add it straight to the context's counter list, or to the group
79 * leader's sibling list:
81 if (counter
->group_leader
== counter
)
82 list_add_tail(&counter
->list_entry
, &ctx
->counter_list
);
84 list_add_tail(&counter
->list_entry
, &group_leader
->sibling_list
);
85 group_leader
->nr_siblings
++;
88 list_add_rcu(&counter
->event_entry
, &ctx
->event_list
);
92 list_del_counter(struct perf_counter
*counter
, struct perf_counter_context
*ctx
)
94 struct perf_counter
*sibling
, *tmp
;
96 list_del_init(&counter
->list_entry
);
97 list_del_rcu(&counter
->event_entry
);
99 if (counter
->group_leader
!= counter
)
100 counter
->group_leader
->nr_siblings
--;
103 * If this was a group counter with sibling counters then
104 * upgrade the siblings to singleton counters by adding them
105 * to the context list directly:
107 list_for_each_entry_safe(sibling
, tmp
,
108 &counter
->sibling_list
, list_entry
) {
110 list_move_tail(&sibling
->list_entry
, &ctx
->counter_list
);
111 sibling
->group_leader
= sibling
;
116 counter_sched_out(struct perf_counter
*counter
,
117 struct perf_cpu_context
*cpuctx
,
118 struct perf_counter_context
*ctx
)
120 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
123 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
124 counter
->tstamp_stopped
= ctx
->time
;
125 counter
->hw_ops
->disable(counter
);
128 if (!is_software_counter(counter
))
129 cpuctx
->active_oncpu
--;
131 if (counter
->hw_event
.exclusive
|| !cpuctx
->active_oncpu
)
132 cpuctx
->exclusive
= 0;
136 group_sched_out(struct perf_counter
*group_counter
,
137 struct perf_cpu_context
*cpuctx
,
138 struct perf_counter_context
*ctx
)
140 struct perf_counter
*counter
;
142 if (group_counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
145 counter_sched_out(group_counter
, cpuctx
, ctx
);
148 * Schedule out siblings (if any):
150 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
)
151 counter_sched_out(counter
, cpuctx
, ctx
);
153 if (group_counter
->hw_event
.exclusive
)
154 cpuctx
->exclusive
= 0;
158 * Cross CPU call to remove a performance counter
160 * We disable the counter on the hardware level first. After that we
161 * remove it from the context list.
163 static void __perf_counter_remove_from_context(void *info
)
165 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
166 struct perf_counter
*counter
= info
;
167 struct perf_counter_context
*ctx
= counter
->ctx
;
172 * If this is a task context, we need to check whether it is
173 * the current task context of this cpu. If not it has been
174 * scheduled out before the smp call arrived.
176 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
179 spin_lock_irqsave(&ctx
->lock
, flags
);
181 counter_sched_out(counter
, cpuctx
, ctx
);
183 counter
->task
= NULL
;
187 * Protect the list operation against NMI by disabling the
188 * counters on a global level. NOP for non NMI based counters.
190 perf_flags
= hw_perf_save_disable();
191 list_del_counter(counter
, ctx
);
192 hw_perf_restore(perf_flags
);
196 * Allow more per task counters with respect to the
199 cpuctx
->max_pertask
=
200 min(perf_max_counters
- ctx
->nr_counters
,
201 perf_max_counters
- perf_reserved_percpu
);
204 spin_unlock_irqrestore(&ctx
->lock
, flags
);
209 * Remove the counter from a task's (or a CPU's) list of counters.
211 * Must be called with counter->mutex and ctx->mutex held.
213 * CPU counters are removed with a smp call. For task counters we only
214 * call when the task is on a CPU.
216 static void perf_counter_remove_from_context(struct perf_counter
*counter
)
218 struct perf_counter_context
*ctx
= counter
->ctx
;
219 struct task_struct
*task
= ctx
->task
;
223 * Per cpu counters are removed via an smp call and
224 * the removal is always sucessful.
226 smp_call_function_single(counter
->cpu
,
227 __perf_counter_remove_from_context
,
233 task_oncpu_function_call(task
, __perf_counter_remove_from_context
,
236 spin_lock_irq(&ctx
->lock
);
238 * If the context is active we need to retry the smp call.
240 if (ctx
->nr_active
&& !list_empty(&counter
->list_entry
)) {
241 spin_unlock_irq(&ctx
->lock
);
246 * The lock prevents that this context is scheduled in so we
247 * can remove the counter safely, if the call above did not
250 if (!list_empty(&counter
->list_entry
)) {
252 list_del_counter(counter
, ctx
);
253 counter
->task
= NULL
;
255 spin_unlock_irq(&ctx
->lock
);
258 static inline u64
perf_clock(void)
260 return cpu_clock(smp_processor_id());
264 * Update the record of the current time in a context.
266 static void update_context_time(struct perf_counter_context
*ctx
)
268 u64 now
= perf_clock();
270 ctx
->time
+= now
- ctx
->timestamp
;
271 ctx
->timestamp
= now
;
275 * Update the total_time_enabled and total_time_running fields for a counter.
277 static void update_counter_times(struct perf_counter
*counter
)
279 struct perf_counter_context
*ctx
= counter
->ctx
;
282 if (counter
->state
< PERF_COUNTER_STATE_INACTIVE
)
285 counter
->total_time_enabled
= ctx
->time
- counter
->tstamp_enabled
;
287 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
)
288 run_end
= counter
->tstamp_stopped
;
292 counter
->total_time_running
= run_end
- counter
->tstamp_running
;
296 * Update total_time_enabled and total_time_running for all counters in a group.
298 static void update_group_times(struct perf_counter
*leader
)
300 struct perf_counter
*counter
;
302 update_counter_times(leader
);
303 list_for_each_entry(counter
, &leader
->sibling_list
, list_entry
)
304 update_counter_times(counter
);
308 * Cross CPU call to disable a performance counter
310 static void __perf_counter_disable(void *info
)
312 struct perf_counter
*counter
= info
;
313 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
314 struct perf_counter_context
*ctx
= counter
->ctx
;
318 * If this is a per-task counter, need to check whether this
319 * counter's task is the current task on this cpu.
321 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
324 spin_lock_irqsave(&ctx
->lock
, flags
);
327 * If the counter is on, turn it off.
328 * If it is in error state, leave it in error state.
330 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
) {
331 update_context_time(ctx
);
332 update_counter_times(counter
);
333 if (counter
== counter
->group_leader
)
334 group_sched_out(counter
, cpuctx
, ctx
);
336 counter_sched_out(counter
, cpuctx
, ctx
);
337 counter
->state
= PERF_COUNTER_STATE_OFF
;
340 spin_unlock_irqrestore(&ctx
->lock
, flags
);
346 static void perf_counter_disable(struct perf_counter
*counter
)
348 struct perf_counter_context
*ctx
= counter
->ctx
;
349 struct task_struct
*task
= ctx
->task
;
353 * Disable the counter on the cpu that it's on
355 smp_call_function_single(counter
->cpu
, __perf_counter_disable
,
361 task_oncpu_function_call(task
, __perf_counter_disable
, counter
);
363 spin_lock_irq(&ctx
->lock
);
365 * If the counter is still active, we need to retry the cross-call.
367 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
368 spin_unlock_irq(&ctx
->lock
);
373 * Since we have the lock this context can't be scheduled
374 * in, so we can change the state safely.
376 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
377 update_counter_times(counter
);
378 counter
->state
= PERF_COUNTER_STATE_OFF
;
381 spin_unlock_irq(&ctx
->lock
);
385 * Disable a counter and all its children.
387 static void perf_counter_disable_family(struct perf_counter
*counter
)
389 struct perf_counter
*child
;
391 perf_counter_disable(counter
);
394 * Lock the mutex to protect the list of children
396 mutex_lock(&counter
->mutex
);
397 list_for_each_entry(child
, &counter
->child_list
, child_list
)
398 perf_counter_disable(child
);
399 mutex_unlock(&counter
->mutex
);
403 counter_sched_in(struct perf_counter
*counter
,
404 struct perf_cpu_context
*cpuctx
,
405 struct perf_counter_context
*ctx
,
408 if (counter
->state
<= PERF_COUNTER_STATE_OFF
)
411 counter
->state
= PERF_COUNTER_STATE_ACTIVE
;
412 counter
->oncpu
= cpu
; /* TODO: put 'cpu' into cpuctx->cpu */
414 * The new state must be visible before we turn it on in the hardware:
418 if (counter
->hw_ops
->enable(counter
)) {
419 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
424 counter
->tstamp_running
+= ctx
->time
- counter
->tstamp_stopped
;
426 if (!is_software_counter(counter
))
427 cpuctx
->active_oncpu
++;
430 if (counter
->hw_event
.exclusive
)
431 cpuctx
->exclusive
= 1;
437 * Return 1 for a group consisting entirely of software counters,
438 * 0 if the group contains any hardware counters.
440 static int is_software_only_group(struct perf_counter
*leader
)
442 struct perf_counter
*counter
;
444 if (!is_software_counter(leader
))
447 list_for_each_entry(counter
, &leader
->sibling_list
, list_entry
)
448 if (!is_software_counter(counter
))
455 * Work out whether we can put this counter group on the CPU now.
457 static int group_can_go_on(struct perf_counter
*counter
,
458 struct perf_cpu_context
*cpuctx
,
462 * Groups consisting entirely of software counters can always go on.
464 if (is_software_only_group(counter
))
467 * If an exclusive group is already on, no other hardware
468 * counters can go on.
470 if (cpuctx
->exclusive
)
473 * If this group is exclusive and there are already
474 * counters on the CPU, it can't go on.
476 if (counter
->hw_event
.exclusive
&& cpuctx
->active_oncpu
)
479 * Otherwise, try to add it if all previous groups were able
485 static void add_counter_to_ctx(struct perf_counter
*counter
,
486 struct perf_counter_context
*ctx
)
488 list_add_counter(counter
, ctx
);
490 counter
->prev_state
= PERF_COUNTER_STATE_OFF
;
491 counter
->tstamp_enabled
= ctx
->time
;
492 counter
->tstamp_running
= ctx
->time
;
493 counter
->tstamp_stopped
= ctx
->time
;
497 * Cross CPU call to install and enable a performance counter
499 static void __perf_install_in_context(void *info
)
501 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
502 struct perf_counter
*counter
= info
;
503 struct perf_counter_context
*ctx
= counter
->ctx
;
504 struct perf_counter
*leader
= counter
->group_leader
;
505 int cpu
= smp_processor_id();
511 * If this is a task context, we need to check whether it is
512 * the current task context of this cpu. If not it has been
513 * scheduled out before the smp call arrived.
515 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
518 spin_lock_irqsave(&ctx
->lock
, flags
);
519 update_context_time(ctx
);
522 * Protect the list operation against NMI by disabling the
523 * counters on a global level. NOP for non NMI based counters.
525 perf_flags
= hw_perf_save_disable();
527 add_counter_to_ctx(counter
, ctx
);
530 * Don't put the counter on if it is disabled or if
531 * it is in a group and the group isn't on.
533 if (counter
->state
!= PERF_COUNTER_STATE_INACTIVE
||
534 (leader
!= counter
&& leader
->state
!= PERF_COUNTER_STATE_ACTIVE
))
538 * An exclusive counter can't go on if there are already active
539 * hardware counters, and no hardware counter can go on if there
540 * is already an exclusive counter on.
542 if (!group_can_go_on(counter
, cpuctx
, 1))
545 err
= counter_sched_in(counter
, cpuctx
, ctx
, cpu
);
549 * This counter couldn't go on. If it is in a group
550 * then we have to pull the whole group off.
551 * If the counter group is pinned then put it in error state.
553 if (leader
!= counter
)
554 group_sched_out(leader
, cpuctx
, ctx
);
555 if (leader
->hw_event
.pinned
) {
556 update_group_times(leader
);
557 leader
->state
= PERF_COUNTER_STATE_ERROR
;
561 if (!err
&& !ctx
->task
&& cpuctx
->max_pertask
)
562 cpuctx
->max_pertask
--;
565 hw_perf_restore(perf_flags
);
567 spin_unlock_irqrestore(&ctx
->lock
, flags
);
571 * Attach a performance counter to a context
573 * First we add the counter to the list with the hardware enable bit
574 * in counter->hw_config cleared.
576 * If the counter is attached to a task which is on a CPU we use a smp
577 * call to enable it in the task context. The task might have been
578 * scheduled away, but we check this in the smp call again.
580 * Must be called with ctx->mutex held.
583 perf_install_in_context(struct perf_counter_context
*ctx
,
584 struct perf_counter
*counter
,
587 struct task_struct
*task
= ctx
->task
;
591 * Per cpu counters are installed via an smp call and
592 * the install is always sucessful.
594 smp_call_function_single(cpu
, __perf_install_in_context
,
599 counter
->task
= task
;
601 task_oncpu_function_call(task
, __perf_install_in_context
,
604 spin_lock_irq(&ctx
->lock
);
606 * we need to retry the smp call.
608 if (ctx
->is_active
&& list_empty(&counter
->list_entry
)) {
609 spin_unlock_irq(&ctx
->lock
);
614 * The lock prevents that this context is scheduled in so we
615 * can add the counter safely, if it the call above did not
618 if (list_empty(&counter
->list_entry
))
619 add_counter_to_ctx(counter
, ctx
);
620 spin_unlock_irq(&ctx
->lock
);
624 * Cross CPU call to enable a performance counter
626 static void __perf_counter_enable(void *info
)
628 struct perf_counter
*counter
= info
;
629 struct perf_cpu_context
*cpuctx
= &__get_cpu_var(perf_cpu_context
);
630 struct perf_counter_context
*ctx
= counter
->ctx
;
631 struct perf_counter
*leader
= counter
->group_leader
;
636 * If this is a per-task counter, need to check whether this
637 * counter's task is the current task on this cpu.
639 if (ctx
->task
&& cpuctx
->task_ctx
!= ctx
)
642 spin_lock_irqsave(&ctx
->lock
, flags
);
643 update_context_time(ctx
);
645 counter
->prev_state
= counter
->state
;
646 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
)
648 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
649 counter
->tstamp_enabled
= ctx
->time
- counter
->total_time_enabled
;
652 * If the counter is in a group and isn't the group leader,
653 * then don't put it on unless the group is on.
655 if (leader
!= counter
&& leader
->state
!= PERF_COUNTER_STATE_ACTIVE
)
658 if (!group_can_go_on(counter
, cpuctx
, 1))
661 err
= counter_sched_in(counter
, cpuctx
, ctx
,
666 * If this counter can't go on and it's part of a
667 * group, then the whole group has to come off.
669 if (leader
!= counter
)
670 group_sched_out(leader
, cpuctx
, ctx
);
671 if (leader
->hw_event
.pinned
) {
672 update_group_times(leader
);
673 leader
->state
= PERF_COUNTER_STATE_ERROR
;
678 spin_unlock_irqrestore(&ctx
->lock
, flags
);
684 static void perf_counter_enable(struct perf_counter
*counter
)
686 struct perf_counter_context
*ctx
= counter
->ctx
;
687 struct task_struct
*task
= ctx
->task
;
691 * Enable the counter on the cpu that it's on
693 smp_call_function_single(counter
->cpu
, __perf_counter_enable
,
698 spin_lock_irq(&ctx
->lock
);
699 if (counter
->state
>= PERF_COUNTER_STATE_INACTIVE
)
703 * If the counter is in error state, clear that first.
704 * That way, if we see the counter in error state below, we
705 * know that it has gone back into error state, as distinct
706 * from the task having been scheduled away before the
707 * cross-call arrived.
709 if (counter
->state
== PERF_COUNTER_STATE_ERROR
)
710 counter
->state
= PERF_COUNTER_STATE_OFF
;
713 spin_unlock_irq(&ctx
->lock
);
714 task_oncpu_function_call(task
, __perf_counter_enable
, counter
);
716 spin_lock_irq(&ctx
->lock
);
719 * If the context is active and the counter is still off,
720 * we need to retry the cross-call.
722 if (ctx
->is_active
&& counter
->state
== PERF_COUNTER_STATE_OFF
)
726 * Since we have the lock this context can't be scheduled
727 * in, so we can change the state safely.
729 if (counter
->state
== PERF_COUNTER_STATE_OFF
) {
730 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
731 counter
->tstamp_enabled
=
732 ctx
->time
- counter
->total_time_enabled
;
735 spin_unlock_irq(&ctx
->lock
);
738 static void perf_counter_refresh(struct perf_counter
*counter
, int refresh
)
740 atomic_add(refresh
, &counter
->event_limit
);
741 perf_counter_enable(counter
);
745 * Enable a counter and all its children.
747 static void perf_counter_enable_family(struct perf_counter
*counter
)
749 struct perf_counter
*child
;
751 perf_counter_enable(counter
);
754 * Lock the mutex to protect the list of children
756 mutex_lock(&counter
->mutex
);
757 list_for_each_entry(child
, &counter
->child_list
, child_list
)
758 perf_counter_enable(child
);
759 mutex_unlock(&counter
->mutex
);
762 void __perf_counter_sched_out(struct perf_counter_context
*ctx
,
763 struct perf_cpu_context
*cpuctx
)
765 struct perf_counter
*counter
;
768 spin_lock(&ctx
->lock
);
770 if (likely(!ctx
->nr_counters
))
772 update_context_time(ctx
);
774 flags
= hw_perf_save_disable();
775 if (ctx
->nr_active
) {
776 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
)
777 group_sched_out(counter
, cpuctx
, ctx
);
779 hw_perf_restore(flags
);
781 spin_unlock(&ctx
->lock
);
785 * Called from scheduler to remove the counters of the current task,
786 * with interrupts disabled.
788 * We stop each counter and update the counter value in counter->count.
790 * This does not protect us against NMI, but disable()
791 * sets the disabled bit in the control field of counter _before_
792 * accessing the counter control register. If a NMI hits, then it will
793 * not restart the counter.
795 void perf_counter_task_sched_out(struct task_struct
*task
, int cpu
)
797 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
798 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
799 struct pt_regs
*regs
;
801 if (likely(!cpuctx
->task_ctx
))
804 update_context_time(ctx
);
806 regs
= task_pt_regs(task
);
807 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES
, 1, 1, regs
, 0);
808 __perf_counter_sched_out(ctx
, cpuctx
);
810 cpuctx
->task_ctx
= NULL
;
813 static void perf_counter_cpu_sched_out(struct perf_cpu_context
*cpuctx
)
815 __perf_counter_sched_out(&cpuctx
->ctx
, cpuctx
);
819 group_sched_in(struct perf_counter
*group_counter
,
820 struct perf_cpu_context
*cpuctx
,
821 struct perf_counter_context
*ctx
,
824 struct perf_counter
*counter
, *partial_group
;
827 if (group_counter
->state
== PERF_COUNTER_STATE_OFF
)
830 ret
= hw_perf_group_sched_in(group_counter
, cpuctx
, ctx
, cpu
);
832 return ret
< 0 ? ret
: 0;
834 group_counter
->prev_state
= group_counter
->state
;
835 if (counter_sched_in(group_counter
, cpuctx
, ctx
, cpu
))
839 * Schedule in siblings as one group (if any):
841 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
) {
842 counter
->prev_state
= counter
->state
;
843 if (counter_sched_in(counter
, cpuctx
, ctx
, cpu
)) {
844 partial_group
= counter
;
853 * Groups can be scheduled in as one unit only, so undo any
854 * partial group before returning:
856 list_for_each_entry(counter
, &group_counter
->sibling_list
, list_entry
) {
857 if (counter
== partial_group
)
859 counter_sched_out(counter
, cpuctx
, ctx
);
861 counter_sched_out(group_counter
, cpuctx
, ctx
);
867 __perf_counter_sched_in(struct perf_counter_context
*ctx
,
868 struct perf_cpu_context
*cpuctx
, int cpu
)
870 struct perf_counter
*counter
;
874 spin_lock(&ctx
->lock
);
876 if (likely(!ctx
->nr_counters
))
879 ctx
->timestamp
= perf_clock();
881 flags
= hw_perf_save_disable();
884 * First go through the list and put on any pinned groups
885 * in order to give them the best chance of going on.
887 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
888 if (counter
->state
<= PERF_COUNTER_STATE_OFF
||
889 !counter
->hw_event
.pinned
)
891 if (counter
->cpu
!= -1 && counter
->cpu
!= cpu
)
894 if (group_can_go_on(counter
, cpuctx
, 1))
895 group_sched_in(counter
, cpuctx
, ctx
, cpu
);
898 * If this pinned group hasn't been scheduled,
899 * put it in error state.
901 if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
902 update_group_times(counter
);
903 counter
->state
= PERF_COUNTER_STATE_ERROR
;
907 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
909 * Ignore counters in OFF or ERROR state, and
910 * ignore pinned counters since we did them already.
912 if (counter
->state
<= PERF_COUNTER_STATE_OFF
||
913 counter
->hw_event
.pinned
)
917 * Listen to the 'cpu' scheduling filter constraint
920 if (counter
->cpu
!= -1 && counter
->cpu
!= cpu
)
923 if (group_can_go_on(counter
, cpuctx
, can_add_hw
)) {
924 if (group_sched_in(counter
, cpuctx
, ctx
, cpu
))
928 hw_perf_restore(flags
);
930 spin_unlock(&ctx
->lock
);
934 * Called from scheduler to add the counters of the current task
935 * with interrupts disabled.
937 * We restore the counter value and then enable it.
939 * This does not protect us against NMI, but enable()
940 * sets the enabled bit in the control field of counter _before_
941 * accessing the counter control register. If a NMI hits, then it will
942 * keep the counter running.
944 void perf_counter_task_sched_in(struct task_struct
*task
, int cpu
)
946 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
947 struct perf_counter_context
*ctx
= &task
->perf_counter_ctx
;
949 __perf_counter_sched_in(ctx
, cpuctx
, cpu
);
950 cpuctx
->task_ctx
= ctx
;
953 static void perf_counter_cpu_sched_in(struct perf_cpu_context
*cpuctx
, int cpu
)
955 struct perf_counter_context
*ctx
= &cpuctx
->ctx
;
957 __perf_counter_sched_in(ctx
, cpuctx
, cpu
);
960 int perf_counter_task_disable(void)
962 struct task_struct
*curr
= current
;
963 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
964 struct perf_counter
*counter
;
969 if (likely(!ctx
->nr_counters
))
972 local_irq_save(flags
);
973 cpu
= smp_processor_id();
975 perf_counter_task_sched_out(curr
, cpu
);
977 spin_lock(&ctx
->lock
);
980 * Disable all the counters:
982 perf_flags
= hw_perf_save_disable();
984 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
985 if (counter
->state
!= PERF_COUNTER_STATE_ERROR
) {
986 update_group_times(counter
);
987 counter
->state
= PERF_COUNTER_STATE_OFF
;
991 hw_perf_restore(perf_flags
);
993 spin_unlock_irqrestore(&ctx
->lock
, flags
);
998 int perf_counter_task_enable(void)
1000 struct task_struct
*curr
= current
;
1001 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
1002 struct perf_counter
*counter
;
1003 unsigned long flags
;
1007 if (likely(!ctx
->nr_counters
))
1010 local_irq_save(flags
);
1011 cpu
= smp_processor_id();
1013 perf_counter_task_sched_out(curr
, cpu
);
1015 spin_lock(&ctx
->lock
);
1018 * Disable all the counters:
1020 perf_flags
= hw_perf_save_disable();
1022 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
1023 if (counter
->state
> PERF_COUNTER_STATE_OFF
)
1025 counter
->state
= PERF_COUNTER_STATE_INACTIVE
;
1026 counter
->tstamp_enabled
=
1027 ctx
->time
- counter
->total_time_enabled
;
1028 counter
->hw_event
.disabled
= 0;
1030 hw_perf_restore(perf_flags
);
1032 spin_unlock(&ctx
->lock
);
1034 perf_counter_task_sched_in(curr
, cpu
);
1036 local_irq_restore(flags
);
1042 * Round-robin a context's counters:
1044 static void rotate_ctx(struct perf_counter_context
*ctx
)
1046 struct perf_counter
*counter
;
1049 if (!ctx
->nr_counters
)
1052 spin_lock(&ctx
->lock
);
1054 * Rotate the first entry last (works just fine for group counters too):
1056 perf_flags
= hw_perf_save_disable();
1057 list_for_each_entry(counter
, &ctx
->counter_list
, list_entry
) {
1058 list_move_tail(&counter
->list_entry
, &ctx
->counter_list
);
1061 hw_perf_restore(perf_flags
);
1063 spin_unlock(&ctx
->lock
);
1066 void perf_counter_task_tick(struct task_struct
*curr
, int cpu
)
1068 struct perf_cpu_context
*cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1069 struct perf_counter_context
*ctx
= &curr
->perf_counter_ctx
;
1070 const int rotate_percpu
= 0;
1073 perf_counter_cpu_sched_out(cpuctx
);
1074 perf_counter_task_sched_out(curr
, cpu
);
1077 rotate_ctx(&cpuctx
->ctx
);
1081 perf_counter_cpu_sched_in(cpuctx
, cpu
);
1082 perf_counter_task_sched_in(curr
, cpu
);
1086 * Cross CPU call to read the hardware counter
1088 static void __read(void *info
)
1090 struct perf_counter
*counter
= info
;
1091 struct perf_counter_context
*ctx
= counter
->ctx
;
1092 unsigned long flags
;
1094 local_irq_save(flags
);
1096 update_context_time(ctx
);
1097 counter
->hw_ops
->read(counter
);
1098 update_counter_times(counter
);
1099 local_irq_restore(flags
);
1102 static u64
perf_counter_read(struct perf_counter
*counter
)
1105 * If counter is enabled and currently active on a CPU, update the
1106 * value in the counter structure:
1108 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
) {
1109 smp_call_function_single(counter
->oncpu
,
1110 __read
, counter
, 1);
1111 } else if (counter
->state
== PERF_COUNTER_STATE_INACTIVE
) {
1112 update_counter_times(counter
);
1115 return atomic64_read(&counter
->count
);
1118 static void put_context(struct perf_counter_context
*ctx
)
1121 put_task_struct(ctx
->task
);
1124 static struct perf_counter_context
*find_get_context(pid_t pid
, int cpu
)
1126 struct perf_cpu_context
*cpuctx
;
1127 struct perf_counter_context
*ctx
;
1128 struct task_struct
*task
;
1131 * If cpu is not a wildcard then this is a percpu counter:
1134 /* Must be root to operate on a CPU counter: */
1135 if (!capable(CAP_SYS_ADMIN
))
1136 return ERR_PTR(-EACCES
);
1138 if (cpu
< 0 || cpu
> num_possible_cpus())
1139 return ERR_PTR(-EINVAL
);
1142 * We could be clever and allow to attach a counter to an
1143 * offline CPU and activate it when the CPU comes up, but
1146 if (!cpu_isset(cpu
, cpu_online_map
))
1147 return ERR_PTR(-ENODEV
);
1149 cpuctx
= &per_cpu(perf_cpu_context
, cpu
);
1159 task
= find_task_by_vpid(pid
);
1161 get_task_struct(task
);
1165 return ERR_PTR(-ESRCH
);
1167 ctx
= &task
->perf_counter_ctx
;
1170 /* Reuse ptrace permission checks for now. */
1171 if (!ptrace_may_access(task
, PTRACE_MODE_READ
)) {
1173 return ERR_PTR(-EACCES
);
1179 static void free_counter_rcu(struct rcu_head
*head
)
1181 struct perf_counter
*counter
;
1183 counter
= container_of(head
, struct perf_counter
, rcu_head
);
1187 static void perf_pending_sync(struct perf_counter
*counter
);
1189 static void free_counter(struct perf_counter
*counter
)
1191 perf_pending_sync(counter
);
1193 if (counter
->hw_event
.mmap
)
1194 atomic_dec(&nr_mmap_tracking
);
1195 if (counter
->hw_event
.munmap
)
1196 atomic_dec(&nr_munmap_tracking
);
1197 if (counter
->hw_event
.comm
)
1198 atomic_dec(&nr_comm_tracking
);
1200 if (counter
->destroy
)
1201 counter
->destroy(counter
);
1203 call_rcu(&counter
->rcu_head
, free_counter_rcu
);
1207 * Called when the last reference to the file is gone.
1209 static int perf_release(struct inode
*inode
, struct file
*file
)
1211 struct perf_counter
*counter
= file
->private_data
;
1212 struct perf_counter_context
*ctx
= counter
->ctx
;
1214 file
->private_data
= NULL
;
1216 mutex_lock(&ctx
->mutex
);
1217 mutex_lock(&counter
->mutex
);
1219 perf_counter_remove_from_context(counter
);
1221 mutex_unlock(&counter
->mutex
);
1222 mutex_unlock(&ctx
->mutex
);
1224 free_counter(counter
);
1231 * Read the performance counter - simple non blocking version for now
1234 perf_read_hw(struct perf_counter
*counter
, char __user
*buf
, size_t count
)
1240 * Return end-of-file for a read on a counter that is in
1241 * error state (i.e. because it was pinned but it couldn't be
1242 * scheduled on to the CPU at some point).
1244 if (counter
->state
== PERF_COUNTER_STATE_ERROR
)
1247 mutex_lock(&counter
->mutex
);
1248 values
[0] = perf_counter_read(counter
);
1250 if (counter
->hw_event
.read_format
& PERF_FORMAT_TOTAL_TIME_ENABLED
)
1251 values
[n
++] = counter
->total_time_enabled
+
1252 atomic64_read(&counter
->child_total_time_enabled
);
1253 if (counter
->hw_event
.read_format
& PERF_FORMAT_TOTAL_TIME_RUNNING
)
1254 values
[n
++] = counter
->total_time_running
+
1255 atomic64_read(&counter
->child_total_time_running
);
1256 mutex_unlock(&counter
->mutex
);
1258 if (count
< n
* sizeof(u64
))
1260 count
= n
* sizeof(u64
);
1262 if (copy_to_user(buf
, values
, count
))
1269 perf_read(struct file
*file
, char __user
*buf
, size_t count
, loff_t
*ppos
)
1271 struct perf_counter
*counter
= file
->private_data
;
1273 return perf_read_hw(counter
, buf
, count
);
1276 static unsigned int perf_poll(struct file
*file
, poll_table
*wait
)
1278 struct perf_counter
*counter
= file
->private_data
;
1279 struct perf_mmap_data
*data
;
1280 unsigned int events
;
1283 data
= rcu_dereference(counter
->data
);
1285 events
= atomic_xchg(&data
->wakeup
, 0);
1290 poll_wait(file
, &counter
->waitq
, wait
);
1295 static long perf_ioctl(struct file
*file
, unsigned int cmd
, unsigned long arg
)
1297 struct perf_counter
*counter
= file
->private_data
;
1301 case PERF_COUNTER_IOC_ENABLE
:
1302 perf_counter_enable_family(counter
);
1304 case PERF_COUNTER_IOC_DISABLE
:
1305 perf_counter_disable_family(counter
);
1307 case PERF_COUNTER_IOC_REFRESH
:
1308 perf_counter_refresh(counter
, arg
);
1317 * Callers need to ensure there can be no nesting of this function, otherwise
1318 * the seqlock logic goes bad. We can not serialize this because the arch
1319 * code calls this from NMI context.
1321 void perf_counter_update_userpage(struct perf_counter
*counter
)
1323 struct perf_mmap_data
*data
;
1324 struct perf_counter_mmap_page
*userpg
;
1327 data
= rcu_dereference(counter
->data
);
1331 userpg
= data
->user_page
;
1334 * Disable preemption so as to not let the corresponding user-space
1335 * spin too long if we get preempted.
1340 userpg
->index
= counter
->hw
.idx
;
1341 userpg
->offset
= atomic64_read(&counter
->count
);
1342 if (counter
->state
== PERF_COUNTER_STATE_ACTIVE
)
1343 userpg
->offset
-= atomic64_read(&counter
->hw
.prev_count
);
1352 static int perf_mmap_fault(struct vm_area_struct
*vma
, struct vm_fault
*vmf
)
1354 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1355 struct perf_mmap_data
*data
;
1356 int ret
= VM_FAULT_SIGBUS
;
1359 data
= rcu_dereference(counter
->data
);
1363 if (vmf
->pgoff
== 0) {
1364 vmf
->page
= virt_to_page(data
->user_page
);
1366 int nr
= vmf
->pgoff
- 1;
1368 if ((unsigned)nr
> data
->nr_pages
)
1371 vmf
->page
= virt_to_page(data
->data_pages
[nr
]);
1373 get_page(vmf
->page
);
1381 static int perf_mmap_data_alloc(struct perf_counter
*counter
, int nr_pages
)
1383 struct perf_mmap_data
*data
;
1387 WARN_ON(atomic_read(&counter
->mmap_count
));
1389 size
= sizeof(struct perf_mmap_data
);
1390 size
+= nr_pages
* sizeof(void *);
1392 data
= kzalloc(size
, GFP_KERNEL
);
1396 data
->user_page
= (void *)get_zeroed_page(GFP_KERNEL
);
1397 if (!data
->user_page
)
1398 goto fail_user_page
;
1400 for (i
= 0; i
< nr_pages
; i
++) {
1401 data
->data_pages
[i
] = (void *)get_zeroed_page(GFP_KERNEL
);
1402 if (!data
->data_pages
[i
])
1403 goto fail_data_pages
;
1406 data
->nr_pages
= nr_pages
;
1408 rcu_assign_pointer(counter
->data
, data
);
1413 for (i
--; i
>= 0; i
--)
1414 free_page((unsigned long)data
->data_pages
[i
]);
1416 free_page((unsigned long)data
->user_page
);
1425 static void __perf_mmap_data_free(struct rcu_head
*rcu_head
)
1427 struct perf_mmap_data
*data
= container_of(rcu_head
,
1428 struct perf_mmap_data
, rcu_head
);
1431 free_page((unsigned long)data
->user_page
);
1432 for (i
= 0; i
< data
->nr_pages
; i
++)
1433 free_page((unsigned long)data
->data_pages
[i
]);
1437 static void perf_mmap_data_free(struct perf_counter
*counter
)
1439 struct perf_mmap_data
*data
= counter
->data
;
1441 WARN_ON(atomic_read(&counter
->mmap_count
));
1443 rcu_assign_pointer(counter
->data
, NULL
);
1444 call_rcu(&data
->rcu_head
, __perf_mmap_data_free
);
1447 static void perf_mmap_open(struct vm_area_struct
*vma
)
1449 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1451 atomic_inc(&counter
->mmap_count
);
1454 static void perf_mmap_close(struct vm_area_struct
*vma
)
1456 struct perf_counter
*counter
= vma
->vm_file
->private_data
;
1458 if (atomic_dec_and_mutex_lock(&counter
->mmap_count
,
1459 &counter
->mmap_mutex
)) {
1460 vma
->vm_mm
->locked_vm
-= counter
->data
->nr_pages
+ 1;
1461 perf_mmap_data_free(counter
);
1462 mutex_unlock(&counter
->mmap_mutex
);
1466 static struct vm_operations_struct perf_mmap_vmops
= {
1467 .open
= perf_mmap_open
,
1468 .close
= perf_mmap_close
,
1469 .fault
= perf_mmap_fault
,
1472 static int perf_mmap(struct file
*file
, struct vm_area_struct
*vma
)
1474 struct perf_counter
*counter
= file
->private_data
;
1475 unsigned long vma_size
;
1476 unsigned long nr_pages
;
1477 unsigned long locked
, lock_limit
;
1480 if (!(vma
->vm_flags
& VM_SHARED
) || (vma
->vm_flags
& VM_WRITE
))
1483 vma_size
= vma
->vm_end
- vma
->vm_start
;
1484 nr_pages
= (vma_size
/ PAGE_SIZE
) - 1;
1487 * If we have data pages ensure they're a power-of-two number, so we
1488 * can do bitmasks instead of modulo.
1490 if (nr_pages
!= 0 && !is_power_of_2(nr_pages
))
1493 if (vma_size
!= PAGE_SIZE
* (1 + nr_pages
))
1496 if (vma
->vm_pgoff
!= 0)
1499 mutex_lock(&counter
->mmap_mutex
);
1500 if (atomic_inc_not_zero(&counter
->mmap_count
)) {
1501 if (nr_pages
!= counter
->data
->nr_pages
)
1506 locked
= vma
->vm_mm
->locked_vm
;
1507 locked
+= nr_pages
+ 1;
1509 lock_limit
= current
->signal
->rlim
[RLIMIT_MEMLOCK
].rlim_cur
;
1510 lock_limit
>>= PAGE_SHIFT
;
1512 if ((locked
> lock_limit
) && !capable(CAP_IPC_LOCK
)) {
1517 WARN_ON(counter
->data
);
1518 ret
= perf_mmap_data_alloc(counter
, nr_pages
);
1522 atomic_set(&counter
->mmap_count
, 1);
1523 vma
->vm_mm
->locked_vm
+= nr_pages
+ 1;
1525 mutex_unlock(&counter
->mmap_mutex
);
1527 vma
->vm_flags
&= ~VM_MAYWRITE
;
1528 vma
->vm_flags
|= VM_RESERVED
;
1529 vma
->vm_ops
= &perf_mmap_vmops
;
1534 static int perf_fasync(int fd
, struct file
*filp
, int on
)
1536 struct perf_counter
*counter
= filp
->private_data
;
1537 struct inode
*inode
= filp
->f_path
.dentry
->d_inode
;
1540 mutex_lock(&inode
->i_mutex
);
1541 retval
= fasync_helper(fd
, filp
, on
, &counter
->fasync
);
1542 mutex_unlock(&inode
->i_mutex
);
1550 static const struct file_operations perf_fops
= {
1551 .release
= perf_release
,
1554 .unlocked_ioctl
= perf_ioctl
,
1555 .compat_ioctl
= perf_ioctl
,
1557 .fasync
= perf_fasync
,
1561 * Perf counter wakeup
1563 * If there's data, ensure we set the poll() state and publish everything
1564 * to user-space before waking everybody up.
1567 void perf_counter_wakeup(struct perf_counter
*counter
)
1569 struct perf_mmap_data
*data
;
1572 data
= rcu_dereference(counter
->data
);
1574 atomic_set(&data
->wakeup
, POLL_IN
);
1576 * Ensure all data writes are issued before updating the
1577 * user-space data head information. The matching rmb()
1578 * will be in userspace after reading this value.
1581 data
->user_page
->data_head
= atomic_read(&data
->head
);
1585 wake_up_all(&counter
->waitq
);
1587 if (counter
->pending_kill
) {
1588 kill_fasync(&counter
->fasync
, SIGIO
, counter
->pending_kill
);
1589 counter
->pending_kill
= 0;
1596 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
1598 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1599 * single linked list and use cmpxchg() to add entries lockless.
1602 static void perf_pending_counter(struct perf_pending_entry
*entry
)
1604 struct perf_counter
*counter
= container_of(entry
,
1605 struct perf_counter
, pending
);
1607 if (counter
->pending_disable
) {
1608 counter
->pending_disable
= 0;
1609 perf_counter_disable(counter
);
1612 if (counter
->pending_wakeup
) {
1613 counter
->pending_wakeup
= 0;
1614 perf_counter_wakeup(counter
);
1618 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1620 static DEFINE_PER_CPU(struct perf_pending_entry
*, perf_pending_head
) = {
1624 static void perf_pending_queue(struct perf_pending_entry
*entry
,
1625 void (*func
)(struct perf_pending_entry
*))
1627 struct perf_pending_entry
**head
;
1629 if (cmpxchg(&entry
->next
, NULL
, PENDING_TAIL
) != NULL
)
1634 head
= &get_cpu_var(perf_pending_head
);
1637 entry
->next
= *head
;
1638 } while (cmpxchg(head
, entry
->next
, entry
) != entry
->next
);
1640 set_perf_counter_pending();
1642 put_cpu_var(perf_pending_head
);
1645 static int __perf_pending_run(void)
1647 struct perf_pending_entry
*list
;
1650 list
= xchg(&__get_cpu_var(perf_pending_head
), PENDING_TAIL
);
1651 while (list
!= PENDING_TAIL
) {
1652 void (*func
)(struct perf_pending_entry
*);
1653 struct perf_pending_entry
*entry
= list
;
1660 * Ensure we observe the unqueue before we issue the wakeup,
1661 * so that we won't be waiting forever.
1662 * -- see perf_not_pending().
1673 static inline int perf_not_pending(struct perf_counter
*counter
)
1676 * If we flush on whatever cpu we run, there is a chance we don't
1680 __perf_pending_run();
1684 * Ensure we see the proper queue state before going to sleep
1685 * so that we do not miss the wakeup. -- see perf_pending_handle()
1688 return counter
->pending
.next
== NULL
;
1691 static void perf_pending_sync(struct perf_counter
*counter
)
1693 wait_event(counter
->waitq
, perf_not_pending(counter
));
1696 void perf_counter_do_pending(void)
1698 __perf_pending_run();
1702 * Callchain support -- arch specific
1705 __weak
struct perf_callchain_entry
*perf_callchain(struct pt_regs
*regs
)
1714 struct perf_output_handle
{
1715 struct perf_counter
*counter
;
1716 struct perf_mmap_data
*data
;
1717 unsigned int offset
;
1724 static inline void __perf_output_wakeup(struct perf_output_handle
*handle
)
1727 handle
->counter
->pending_wakeup
= 1;
1728 perf_pending_queue(&handle
->counter
->pending
,
1729 perf_pending_counter
);
1731 perf_counter_wakeup(handle
->counter
);
1734 static int perf_output_begin(struct perf_output_handle
*handle
,
1735 struct perf_counter
*counter
, unsigned int size
,
1736 int nmi
, int overflow
)
1738 struct perf_mmap_data
*data
;
1739 unsigned int offset
, head
;
1742 data
= rcu_dereference(counter
->data
);
1746 handle
->counter
= counter
;
1748 handle
->overflow
= overflow
;
1750 if (!data
->nr_pages
)
1754 offset
= head
= atomic_read(&data
->head
);
1756 } while (atomic_cmpxchg(&data
->head
, offset
, head
) != offset
);
1758 handle
->data
= data
;
1759 handle
->offset
= offset
;
1760 handle
->head
= head
;
1761 handle
->wakeup
= (offset
>> PAGE_SHIFT
) != (head
>> PAGE_SHIFT
);
1766 __perf_output_wakeup(handle
);
1773 static void perf_output_copy(struct perf_output_handle
*handle
,
1774 void *buf
, unsigned int len
)
1776 unsigned int pages_mask
;
1777 unsigned int offset
;
1781 offset
= handle
->offset
;
1782 pages_mask
= handle
->data
->nr_pages
- 1;
1783 pages
= handle
->data
->data_pages
;
1786 unsigned int page_offset
;
1789 nr
= (offset
>> PAGE_SHIFT
) & pages_mask
;
1790 page_offset
= offset
& (PAGE_SIZE
- 1);
1791 size
= min_t(unsigned int, PAGE_SIZE
- page_offset
, len
);
1793 memcpy(pages
[nr
] + page_offset
, buf
, size
);
1800 handle
->offset
= offset
;
1802 WARN_ON_ONCE(handle
->offset
> handle
->head
);
1805 #define perf_output_put(handle, x) \
1806 perf_output_copy((handle), &(x), sizeof(x))
1808 static void perf_output_end(struct perf_output_handle
*handle
)
1810 int wakeup_events
= handle
->counter
->hw_event
.wakeup_events
;
1812 if (handle
->overflow
&& wakeup_events
) {
1813 int events
= atomic_inc_return(&handle
->data
->events
);
1814 if (events
>= wakeup_events
) {
1815 atomic_sub(wakeup_events
, &handle
->data
->events
);
1816 __perf_output_wakeup(handle
);
1818 } else if (handle
->wakeup
)
1819 __perf_output_wakeup(handle
);
1823 static void perf_counter_output(struct perf_counter
*counter
,
1824 int nmi
, struct pt_regs
*regs
, u64 addr
)
1827 u64 record_type
= counter
->hw_event
.record_type
;
1828 struct perf_output_handle handle
;
1829 struct perf_event_header header
;
1838 struct perf_callchain_entry
*callchain
= NULL
;
1839 int callchain_size
= 0;
1843 header
.size
= sizeof(header
);
1845 header
.misc
= PERF_EVENT_MISC_OVERFLOW
;
1846 header
.misc
|= user_mode(regs
) ?
1847 PERF_EVENT_MISC_USER
: PERF_EVENT_MISC_KERNEL
;
1849 if (record_type
& PERF_RECORD_IP
) {
1850 ip
= instruction_pointer(regs
);
1851 header
.type
|= PERF_RECORD_IP
;
1852 header
.size
+= sizeof(ip
);
1855 if (record_type
& PERF_RECORD_TID
) {
1856 /* namespace issues */
1857 tid_entry
.pid
= current
->group_leader
->pid
;
1858 tid_entry
.tid
= current
->pid
;
1860 header
.type
|= PERF_RECORD_TID
;
1861 header
.size
+= sizeof(tid_entry
);
1864 if (record_type
& PERF_RECORD_TIME
) {
1866 * Maybe do better on x86 and provide cpu_clock_nmi()
1868 time
= sched_clock();
1870 header
.type
|= PERF_RECORD_TIME
;
1871 header
.size
+= sizeof(u64
);
1874 if (record_type
& PERF_RECORD_ADDR
) {
1875 header
.type
|= PERF_RECORD_ADDR
;
1876 header
.size
+= sizeof(u64
);
1879 if (record_type
& PERF_RECORD_GROUP
) {
1880 header
.type
|= PERF_RECORD_GROUP
;
1881 header
.size
+= sizeof(u64
) +
1882 counter
->nr_siblings
* sizeof(group_entry
);
1885 if (record_type
& PERF_RECORD_CALLCHAIN
) {
1886 callchain
= perf_callchain(regs
);
1889 callchain_size
= (1 + callchain
->nr
) * sizeof(u64
);
1891 header
.type
|= PERF_RECORD_CALLCHAIN
;
1892 header
.size
+= callchain_size
;
1896 ret
= perf_output_begin(&handle
, counter
, header
.size
, nmi
, 1);
1900 perf_output_put(&handle
, header
);
1902 if (record_type
& PERF_RECORD_IP
)
1903 perf_output_put(&handle
, ip
);
1905 if (record_type
& PERF_RECORD_TID
)
1906 perf_output_put(&handle
, tid_entry
);
1908 if (record_type
& PERF_RECORD_TIME
)
1909 perf_output_put(&handle
, time
);
1911 if (record_type
& PERF_RECORD_ADDR
)
1912 perf_output_put(&handle
, addr
);
1914 if (record_type
& PERF_RECORD_GROUP
) {
1915 struct perf_counter
*leader
, *sub
;
1916 u64 nr
= counter
->nr_siblings
;
1918 perf_output_put(&handle
, nr
);
1920 leader
= counter
->group_leader
;
1921 list_for_each_entry(sub
, &leader
->sibling_list
, list_entry
) {
1923 sub
->hw_ops
->read(sub
);
1925 group_entry
.event
= sub
->hw_event
.config
;
1926 group_entry
.counter
= atomic64_read(&sub
->count
);
1928 perf_output_put(&handle
, group_entry
);
1933 perf_output_copy(&handle
, callchain
, callchain_size
);
1935 perf_output_end(&handle
);
1942 struct perf_comm_event
{
1943 struct task_struct
*task
;
1948 struct perf_event_header header
;
1955 static void perf_counter_comm_output(struct perf_counter
*counter
,
1956 struct perf_comm_event
*comm_event
)
1958 struct perf_output_handle handle
;
1959 int size
= comm_event
->event
.header
.size
;
1960 int ret
= perf_output_begin(&handle
, counter
, size
, 0, 0);
1965 perf_output_put(&handle
, comm_event
->event
);
1966 perf_output_copy(&handle
, comm_event
->comm
,
1967 comm_event
->comm_size
);
1968 perf_output_end(&handle
);
1971 static int perf_counter_comm_match(struct perf_counter
*counter
,
1972 struct perf_comm_event
*comm_event
)
1974 if (counter
->hw_event
.comm
&&
1975 comm_event
->event
.header
.type
== PERF_EVENT_COMM
)
1981 static void perf_counter_comm_ctx(struct perf_counter_context
*ctx
,
1982 struct perf_comm_event
*comm_event
)
1984 struct perf_counter
*counter
;
1986 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
1990 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
1991 if (perf_counter_comm_match(counter
, comm_event
))
1992 perf_counter_comm_output(counter
, comm_event
);
1997 static void perf_counter_comm_event(struct perf_comm_event
*comm_event
)
1999 struct perf_cpu_context
*cpuctx
;
2001 char *comm
= comm_event
->task
->comm
;
2003 size
= ALIGN(strlen(comm
)+1, sizeof(u64
));
2005 comm_event
->comm
= comm
;
2006 comm_event
->comm_size
= size
;
2008 comm_event
->event
.header
.size
= sizeof(comm_event
->event
) + size
;
2010 cpuctx
= &get_cpu_var(perf_cpu_context
);
2011 perf_counter_comm_ctx(&cpuctx
->ctx
, comm_event
);
2012 put_cpu_var(perf_cpu_context
);
2014 perf_counter_comm_ctx(¤t
->perf_counter_ctx
, comm_event
);
2017 void perf_counter_comm(struct task_struct
*task
)
2019 struct perf_comm_event comm_event
;
2021 if (!atomic_read(&nr_comm_tracking
))
2024 comm_event
= (struct perf_comm_event
){
2027 .header
= { .type
= PERF_EVENT_COMM
, },
2028 .pid
= task
->group_leader
->pid
,
2033 perf_counter_comm_event(&comm_event
);
2040 struct perf_mmap_event
{
2046 struct perf_event_header header
;
2056 static void perf_counter_mmap_output(struct perf_counter
*counter
,
2057 struct perf_mmap_event
*mmap_event
)
2059 struct perf_output_handle handle
;
2060 int size
= mmap_event
->event
.header
.size
;
2061 int ret
= perf_output_begin(&handle
, counter
, size
, 0, 0);
2066 perf_output_put(&handle
, mmap_event
->event
);
2067 perf_output_copy(&handle
, mmap_event
->file_name
,
2068 mmap_event
->file_size
);
2069 perf_output_end(&handle
);
2072 static int perf_counter_mmap_match(struct perf_counter
*counter
,
2073 struct perf_mmap_event
*mmap_event
)
2075 if (counter
->hw_event
.mmap
&&
2076 mmap_event
->event
.header
.type
== PERF_EVENT_MMAP
)
2079 if (counter
->hw_event
.munmap
&&
2080 mmap_event
->event
.header
.type
== PERF_EVENT_MUNMAP
)
2086 static void perf_counter_mmap_ctx(struct perf_counter_context
*ctx
,
2087 struct perf_mmap_event
*mmap_event
)
2089 struct perf_counter
*counter
;
2091 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
2095 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
2096 if (perf_counter_mmap_match(counter
, mmap_event
))
2097 perf_counter_mmap_output(counter
, mmap_event
);
2102 static void perf_counter_mmap_event(struct perf_mmap_event
*mmap_event
)
2104 struct perf_cpu_context
*cpuctx
;
2105 struct file
*file
= mmap_event
->file
;
2112 buf
= kzalloc(PATH_MAX
, GFP_KERNEL
);
2114 name
= strncpy(tmp
, "//enomem", sizeof(tmp
));
2117 name
= dentry_path(file
->f_dentry
, buf
, PATH_MAX
);
2119 name
= strncpy(tmp
, "//toolong", sizeof(tmp
));
2123 name
= strncpy(tmp
, "//anon", sizeof(tmp
));
2128 size
= ALIGN(strlen(name
)+1, sizeof(u64
));
2130 mmap_event
->file_name
= name
;
2131 mmap_event
->file_size
= size
;
2133 mmap_event
->event
.header
.size
= sizeof(mmap_event
->event
) + size
;
2135 cpuctx
= &get_cpu_var(perf_cpu_context
);
2136 perf_counter_mmap_ctx(&cpuctx
->ctx
, mmap_event
);
2137 put_cpu_var(perf_cpu_context
);
2139 perf_counter_mmap_ctx(¤t
->perf_counter_ctx
, mmap_event
);
2144 void perf_counter_mmap(unsigned long addr
, unsigned long len
,
2145 unsigned long pgoff
, struct file
*file
)
2147 struct perf_mmap_event mmap_event
;
2149 if (!atomic_read(&nr_mmap_tracking
))
2152 mmap_event
= (struct perf_mmap_event
){
2155 .header
= { .type
= PERF_EVENT_MMAP
, },
2156 .pid
= current
->group_leader
->pid
,
2157 .tid
= current
->pid
,
2164 perf_counter_mmap_event(&mmap_event
);
2167 void perf_counter_munmap(unsigned long addr
, unsigned long len
,
2168 unsigned long pgoff
, struct file
*file
)
2170 struct perf_mmap_event mmap_event
;
2172 if (!atomic_read(&nr_munmap_tracking
))
2175 mmap_event
= (struct perf_mmap_event
){
2178 .header
= { .type
= PERF_EVENT_MUNMAP
, },
2179 .pid
= current
->group_leader
->pid
,
2180 .tid
= current
->pid
,
2187 perf_counter_mmap_event(&mmap_event
);
2191 * Generic counter overflow handling.
2194 int perf_counter_overflow(struct perf_counter
*counter
,
2195 int nmi
, struct pt_regs
*regs
, u64 addr
)
2197 int events
= atomic_read(&counter
->event_limit
);
2200 counter
->pending_kill
= POLL_IN
;
2201 if (events
&& atomic_dec_and_test(&counter
->event_limit
)) {
2203 counter
->pending_kill
= POLL_HUP
;
2205 counter
->pending_disable
= 1;
2206 perf_pending_queue(&counter
->pending
,
2207 perf_pending_counter
);
2209 perf_counter_disable(counter
);
2212 perf_counter_output(counter
, nmi
, regs
, addr
);
2217 * Generic software counter infrastructure
2220 static void perf_swcounter_update(struct perf_counter
*counter
)
2222 struct hw_perf_counter
*hwc
= &counter
->hw
;
2227 prev
= atomic64_read(&hwc
->prev_count
);
2228 now
= atomic64_read(&hwc
->count
);
2229 if (atomic64_cmpxchg(&hwc
->prev_count
, prev
, now
) != prev
)
2234 atomic64_add(delta
, &counter
->count
);
2235 atomic64_sub(delta
, &hwc
->period_left
);
2238 static void perf_swcounter_set_period(struct perf_counter
*counter
)
2240 struct hw_perf_counter
*hwc
= &counter
->hw
;
2241 s64 left
= atomic64_read(&hwc
->period_left
);
2242 s64 period
= hwc
->irq_period
;
2244 if (unlikely(left
<= -period
)) {
2246 atomic64_set(&hwc
->period_left
, left
);
2249 if (unlikely(left
<= 0)) {
2251 atomic64_add(period
, &hwc
->period_left
);
2254 atomic64_set(&hwc
->prev_count
, -left
);
2255 atomic64_set(&hwc
->count
, -left
);
2258 static enum hrtimer_restart
perf_swcounter_hrtimer(struct hrtimer
*hrtimer
)
2260 enum hrtimer_restart ret
= HRTIMER_RESTART
;
2261 struct perf_counter
*counter
;
2262 struct pt_regs
*regs
;
2264 counter
= container_of(hrtimer
, struct perf_counter
, hw
.hrtimer
);
2265 counter
->hw_ops
->read(counter
);
2267 regs
= get_irq_regs();
2269 * In case we exclude kernel IPs or are somehow not in interrupt
2270 * context, provide the next best thing, the user IP.
2272 if ((counter
->hw_event
.exclude_kernel
|| !regs
) &&
2273 !counter
->hw_event
.exclude_user
)
2274 regs
= task_pt_regs(current
);
2277 if (perf_counter_overflow(counter
, 0, regs
, 0))
2278 ret
= HRTIMER_NORESTART
;
2281 hrtimer_forward_now(hrtimer
, ns_to_ktime(counter
->hw
.irq_period
));
2286 static void perf_swcounter_overflow(struct perf_counter
*counter
,
2287 int nmi
, struct pt_regs
*regs
, u64 addr
)
2289 perf_swcounter_update(counter
);
2290 perf_swcounter_set_period(counter
);
2291 if (perf_counter_overflow(counter
, nmi
, regs
, addr
))
2292 /* soft-disable the counter */
2297 static int perf_swcounter_match(struct perf_counter
*counter
,
2298 enum perf_event_types type
,
2299 u32 event
, struct pt_regs
*regs
)
2301 if (counter
->state
!= PERF_COUNTER_STATE_ACTIVE
)
2304 if (perf_event_raw(&counter
->hw_event
))
2307 if (perf_event_type(&counter
->hw_event
) != type
)
2310 if (perf_event_id(&counter
->hw_event
) != event
)
2313 if (counter
->hw_event
.exclude_user
&& user_mode(regs
))
2316 if (counter
->hw_event
.exclude_kernel
&& !user_mode(regs
))
2322 static void perf_swcounter_add(struct perf_counter
*counter
, u64 nr
,
2323 int nmi
, struct pt_regs
*regs
, u64 addr
)
2325 int neg
= atomic64_add_negative(nr
, &counter
->hw
.count
);
2326 if (counter
->hw
.irq_period
&& !neg
)
2327 perf_swcounter_overflow(counter
, nmi
, regs
, addr
);
2330 static void perf_swcounter_ctx_event(struct perf_counter_context
*ctx
,
2331 enum perf_event_types type
, u32 event
,
2332 u64 nr
, int nmi
, struct pt_regs
*regs
,
2335 struct perf_counter
*counter
;
2337 if (system_state
!= SYSTEM_RUNNING
|| list_empty(&ctx
->event_list
))
2341 list_for_each_entry_rcu(counter
, &ctx
->event_list
, event_entry
) {
2342 if (perf_swcounter_match(counter
, type
, event
, regs
))
2343 perf_swcounter_add(counter
, nr
, nmi
, regs
, addr
);
2348 static int *perf_swcounter_recursion_context(struct perf_cpu_context
*cpuctx
)
2351 return &cpuctx
->recursion
[3];
2354 return &cpuctx
->recursion
[2];
2357 return &cpuctx
->recursion
[1];
2359 return &cpuctx
->recursion
[0];
2362 static void __perf_swcounter_event(enum perf_event_types type
, u32 event
,
2363 u64 nr
, int nmi
, struct pt_regs
*regs
,
2366 struct perf_cpu_context
*cpuctx
= &get_cpu_var(perf_cpu_context
);
2367 int *recursion
= perf_swcounter_recursion_context(cpuctx
);
2375 perf_swcounter_ctx_event(&cpuctx
->ctx
, type
, event
,
2376 nr
, nmi
, regs
, addr
);
2377 if (cpuctx
->task_ctx
) {
2378 perf_swcounter_ctx_event(cpuctx
->task_ctx
, type
, event
,
2379 nr
, nmi
, regs
, addr
);
2386 put_cpu_var(perf_cpu_context
);
2390 perf_swcounter_event(u32 event
, u64 nr
, int nmi
, struct pt_regs
*regs
, u64 addr
)
2392 __perf_swcounter_event(PERF_TYPE_SOFTWARE
, event
, nr
, nmi
, regs
, addr
);
2395 static void perf_swcounter_read(struct perf_counter
*counter
)
2397 perf_swcounter_update(counter
);
2400 static int perf_swcounter_enable(struct perf_counter
*counter
)
2402 perf_swcounter_set_period(counter
);
2406 static void perf_swcounter_disable(struct perf_counter
*counter
)
2408 perf_swcounter_update(counter
);
2411 static const struct hw_perf_counter_ops perf_ops_generic
= {
2412 .enable
= perf_swcounter_enable
,
2413 .disable
= perf_swcounter_disable
,
2414 .read
= perf_swcounter_read
,
2418 * Software counter: cpu wall time clock
2421 static void cpu_clock_perf_counter_update(struct perf_counter
*counter
)
2423 int cpu
= raw_smp_processor_id();
2427 now
= cpu_clock(cpu
);
2428 prev
= atomic64_read(&counter
->hw
.prev_count
);
2429 atomic64_set(&counter
->hw
.prev_count
, now
);
2430 atomic64_add(now
- prev
, &counter
->count
);
2433 static int cpu_clock_perf_counter_enable(struct perf_counter
*counter
)
2435 struct hw_perf_counter
*hwc
= &counter
->hw
;
2436 int cpu
= raw_smp_processor_id();
2438 atomic64_set(&hwc
->prev_count
, cpu_clock(cpu
));
2439 hrtimer_init(&hwc
->hrtimer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
2440 hwc
->hrtimer
.function
= perf_swcounter_hrtimer
;
2441 if (hwc
->irq_period
) {
2442 __hrtimer_start_range_ns(&hwc
->hrtimer
,
2443 ns_to_ktime(hwc
->irq_period
), 0,
2444 HRTIMER_MODE_REL
, 0);
2450 static void cpu_clock_perf_counter_disable(struct perf_counter
*counter
)
2452 hrtimer_cancel(&counter
->hw
.hrtimer
);
2453 cpu_clock_perf_counter_update(counter
);
2456 static void cpu_clock_perf_counter_read(struct perf_counter
*counter
)
2458 cpu_clock_perf_counter_update(counter
);
2461 static const struct hw_perf_counter_ops perf_ops_cpu_clock
= {
2462 .enable
= cpu_clock_perf_counter_enable
,
2463 .disable
= cpu_clock_perf_counter_disable
,
2464 .read
= cpu_clock_perf_counter_read
,
2468 * Software counter: task time clock
2471 static void task_clock_perf_counter_update(struct perf_counter
*counter
, u64 now
)
2476 prev
= atomic64_xchg(&counter
->hw
.prev_count
, now
);
2478 atomic64_add(delta
, &counter
->count
);
2481 static int task_clock_perf_counter_enable(struct perf_counter
*counter
)
2483 struct hw_perf_counter
*hwc
= &counter
->hw
;
2486 now
= counter
->ctx
->time
;
2488 atomic64_set(&hwc
->prev_count
, now
);
2489 hrtimer_init(&hwc
->hrtimer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
2490 hwc
->hrtimer
.function
= perf_swcounter_hrtimer
;
2491 if (hwc
->irq_period
) {
2492 __hrtimer_start_range_ns(&hwc
->hrtimer
,
2493 ns_to_ktime(hwc
->irq_period
), 0,
2494 HRTIMER_MODE_REL
, 0);
2500 static void task_clock_perf_counter_disable(struct perf_counter
*counter
)
2502 hrtimer_cancel(&counter
->hw
.hrtimer
);
2503 task_clock_perf_counter_update(counter
, counter
->ctx
->time
);
2507 static void task_clock_perf_counter_read(struct perf_counter
*counter
)
2512 update_context_time(counter
->ctx
);
2513 time
= counter
->ctx
->time
;
2515 u64 now
= perf_clock();
2516 u64 delta
= now
- counter
->ctx
->timestamp
;
2517 time
= counter
->ctx
->time
+ delta
;
2520 task_clock_perf_counter_update(counter
, time
);
2523 static const struct hw_perf_counter_ops perf_ops_task_clock
= {
2524 .enable
= task_clock_perf_counter_enable
,
2525 .disable
= task_clock_perf_counter_disable
,
2526 .read
= task_clock_perf_counter_read
,
2530 * Software counter: cpu migrations
static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
	struct task_struct *curr = counter->ctx->task;

	if (curr)
		return curr->se.nr_migrations;
	return cpu_nr_migrations(smp_processor_id());
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations(counter);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
		atomic64_set(&counter->hw.prev_count,
			     get_cpu_migrations(counter));
	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};

#ifdef CONFIG_EVENT_PROFILE
void perf_tpcounter_event(int event_id)
{
	struct pt_regs *regs = get_irq_regs();

	if (!regs)
		regs = task_pt_regs(current);

	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
}

extern int ftrace_profile_enable(int);
extern void ftrace_profile_disable(int);

static void tp_perf_counter_destroy(struct perf_counter *counter)
{
	ftrace_profile_disable(perf_event_id(&counter->hw_event));
}

static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
	int event_id = perf_event_id(&counter->hw_event);
	int ret;

	ret = ftrace_profile_enable(event_id);
	if (ret)
		return NULL;

	counter->destroy = tp_perf_counter_destroy;
	counter->hw.irq_period = counter->hw_event.irq_period;

	return &perf_ops_generic;
}
#else
static const struct hw_perf_counter_ops *
tp_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}
#endif

static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	struct perf_counter_hw_event *hw_event = &counter->hw_event;
	const struct hw_perf_counter_ops *hw_ops = NULL;
	struct hw_perf_counter *hwc = &counter->hw;

	/*
	 * Software counters (currently) can't in general distinguish
	 * between user, kernel and hypervisor events.
	 * However, context switches and cpu migrations are considered
	 * to be kernel events, and page faults are never hypervisor
	 * events.
	 */
	switch (perf_event_id(&counter->hw_event)) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_TASK_CLOCK:
		/*
		 * If the user instantiates this as a per-cpu counter,
		 * use the cpu_clock counter instead.
		 */
		if (counter->ctx->task)
			hw_ops = &perf_ops_task_clock;
		else
			hw_ops = &perf_ops_cpu_clock;

		if (hw_event->irq_period && hw_event->irq_period < 10000)
			hw_event->irq_period = 10000;
		break;
	case PERF_COUNT_PAGE_FAULTS:
	case PERF_COUNT_PAGE_FAULTS_MIN:
	case PERF_COUNT_PAGE_FAULTS_MAJ:
	case PERF_COUNT_CONTEXT_SWITCHES:
		hw_ops = &perf_ops_generic;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		if (!counter->hw_event.exclude_kernel)
			hw_ops = &perf_ops_cpu_migrations;
		break;
	}

	if (hw_ops)
		hwc->irq_period = hw_event->irq_period;

	return hw_ops;
}

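/*
 * Example of the mapping above (the scenario is illustrative, the values
 * come from the switch itself): a PERF_COUNT_TASK_CLOCK counter opened on
 * a task context (counter->ctx->task != NULL) is backed by
 * perf_ops_task_clock, the same request bound to a CPU falls back to
 * perf_ops_cpu_clock, and an irq_period of 5000 would be raised to the
 * 10000 minimum before being copied into hwc->irq_period.
 */
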
/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter_context *ctx,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;
	long err;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->event_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	mutex_init(&counter->mmap_mutex);

	INIT_LIST_HEAD(&counter->child_list);

	counter->cpu			= cpu;
	counter->hw_event		= *hw_event;
	counter->group_leader		= group_leader;
	counter->hw_ops			= NULL;
	counter->ctx			= ctx;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_ops = NULL;

	if (perf_event_raw(hw_event)) {
		hw_ops = hw_perf_counter_init(counter);
		goto done;
	}

	switch (perf_event_type(hw_event)) {
	case PERF_TYPE_HARDWARE:
		hw_ops = hw_perf_counter_init(counter);
		break;

	case PERF_TYPE_SOFTWARE:
		hw_ops = sw_perf_counter_init(counter);
		break;

	case PERF_TYPE_TRACEPOINT:
		hw_ops = tp_perf_counter_init(counter);
		break;
	}
done:
	err = 0;
	if (!hw_ops)
		err = -EINVAL;
	else if (IS_ERR(hw_ops))
		err = PTR_ERR(hw_ops);

	if (err) {
		kfree(counter);
		return ERR_PTR(err);
	}

	counter->hw_ops = hw_ops;

	if (counter->hw_event.mmap)
		atomic_inc(&nr_mmap_tracking);
	if (counter->hw_event.munmap)
		atomic_inc(&nr_munmap_tracking);
	if (counter->hw_event.comm)
		atomic_inc(&nr_comm_tracking);

	return counter;
}

/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
SYSCALL_DEFINE5(perf_counter_open,
		const struct perf_counter_hw_event __user *, hw_event_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	/* for future expandability... */
	if (flags)
		return -EINVAL;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
				     GFP_KERNEL);
	ret = PTR_ERR(counter);
	if (IS_ERR(counter))
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	mutex_unlock(&ctx->mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

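/*
 * Illustrative userspace sketch (not part of this file): a minimal
 * consumer of the syscall above, assuming the architecture wires it up
 * as __NR_perf_counter_open and that read() on the returned fd yields
 * the 64-bit counter value under the default read format - both are
 * assumptions here, not guarantees made by this code:
 *
 *	struct perf_counter_hw_event hw_event;
 *	u64 count;
 *	int fd;
 *
 *	memset(&hw_event, 0, sizeof(hw_event));
 *	... describe e.g. the task-clock software counter in hw_event ...
 *
 *	fd = syscall(__NR_perf_counter_open, &hw_event,
 *		     0, -1, -1, 0);   ... pid 0 = current task, any cpu,
 *					  no group leader, no flags ...
 *	... run the workload ...
 *	read(fd, &count, sizeof(count));
 *	close(fd);
 */
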
/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->counter_list);
	INIT_LIST_HEAD(&ctx->event_list);
	ctx->task = task;
}

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter *group_leader,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, child_ctx,
					   group_leader, GFP_KERNEL);
	if (IS_ERR(child_counter))
		return child_counter;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->task = child;
	add_counter_to_ctx(child_counter, child_ctx);

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	mutex_lock(&parent_counter->mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit.  We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	mutex_unlock(&parent_counter->mutex);

	return child_counter;
}

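/*
 * Illustration of the linking above: if task A opens an inheritable
 * counter C, forks B, and B forks grandchild G, then both B's and G's
 * inherited counters have ->parent pointing at A's original counter C
 * (never at B's copy).  Every sync_child_counter() therefore folds counts
 * straight into C, and the filp reference taken above always pins that
 * single original file.
 */
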
static int inherit_group(struct perf_counter *parent_counter,
	      struct task_struct *parent,
	      struct perf_counter_context *parent_ctx,
	      struct task_struct *child,
	      struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;
	struct perf_counter *child_ctr;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (IS_ERR(leader))
		return PTR_ERR(leader);
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		child_ctr = inherit_counter(sub, parent, parent_ctx,
					    child, leader, child_ctx);
		if (IS_ERR(child_ctr))
			return PTR_ERR(child_ctr);
	}
	return 0;
}

static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 parent_val, child_val;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);
	atomic64_add(child_counter->total_time_enabled,
		     &parent_counter->child_total_time_enabled);
	atomic64_add(child_counter->total_time_running,
		     &parent_counter->child_total_time_running);

	/*
	 * Remove this counter from the parent's list
	 */
	mutex_lock(&parent_counter->mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}

static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	struct perf_counter *sub, *tmp;

	/*
	 * If we do not self-reap then we have to wait for the
	 * child task to unschedule (it will happen for sure),
	 * so that its counter is at its final count. (This
	 * condition triggers rarely - child tasks usually get
	 * off their CPU before the parent has a chance to
	 * get this far into the reaping action)
	 */
	if (child != current) {
		wait_task_inactive(child, 0);
		list_del_init(&child_counter->list_entry);
		update_counter_times(child_counter);
	} else {
		struct perf_cpu_context *cpuctx;
		unsigned long flags;
		u64 perf_flags;

		/*
		 * Disable and unlink this counter.
		 *
		 * Be careful about zapping the list - IRQ/NMI context
		 * could still be processing it:
		 */
		local_irq_save(flags);
		perf_flags = hw_perf_save_disable();

		cpuctx = &__get_cpu_var(perf_cpu_context);

		group_sched_out(child_counter, cpuctx, child_ctx);
		update_counter_times(child_counter);

		list_del_init(&child_counter->list_entry);

		child_ctx->nr_counters--;

		hw_perf_restore(perf_flags);
		local_irq_restore(flags);
	}

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
					 list_entry) {
			if (sub->parent) {
				sync_child_counter(sub, sub->parent);
				free_counter(sub);
			}
		}
		free_counter(child_counter);
	}
}

/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we may be running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;

	child_ctx  =  &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit)
			continue;

		if (inherit_group(counter, parent,
				  parent_ctx, child, child_ctx))
			break;
	}

	mutex_unlock(&parent_ctx->mutex);
}

static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}
static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);

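/*
 * The attribute group registered above lives under the cpu sysdev class
 * object; assuming the usual sysfs layout that is
 * /sys/devices/system/cpu/perf_counters/ (path inferred, not verified
 * here), e.g.:
 *
 *	# cat /sys/devices/system/cpu/perf_counters/reserve_percpu
 *	# echo 2 > /sys/devices/system/cpu/perf_counters/reserve_percpu
 *	# echo 0 > /sys/devices/system/cpu/perf_counters/overcommit
 */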