kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/cpu.h>
  12 #include <linux/smp.h>
  13 #include <linux/file.h>
  14 #include <linux/poll.h>
  15 #include <linux/sysfs.h>
  16 #include <linux/ptrace.h>
  17 #include <linux/percpu.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/kernel_stat.h>
  22 #include <linux/perf_counter.h>
  23
  24 /*
  25  * Each CPU has a list of per CPU counters:
  26  */
  27 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  28
  29 int perf_max_counters __read_mostly = 1;
  30 static int perf_reserved_percpu __read_mostly;
  31 static int perf_overcommit __read_mostly = 1;
  32
  33 /*
  34  * Mutex for (sysadmin-configurable) counter reservations:
  35  */
  36 static DEFINE_MUTEX(perf_resource_mutex);
  37
  38 /*
  39  * Architecture provided APIs - weak aliases:
  40  */
  41 extern __weak const struct hw_perf_counter_ops *
  42 hw_perf_counter_init(struct perf_counter *counter)
  43 {
  44         return NULL;
  45 }
  46
  47 u64 __weak hw_perf_save_disable(void)           { return 0; }
  48 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  49 void __weak hw_perf_counter_setup(void)         { barrier(); }
  50 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  51                struct perf_cpu_context *cpuctx,
  52                struct perf_counter_context *ctx, int cpu)
  53 {
  54         return 0;
  55 }
  56
  57 static void
  58 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  59 {
  60         struct perf_counter *group_leader = counter->group_leader;
  61
  62         /*
  63          * Depending on whether it is a standalone or sibling counter,
  64          * add it straight to the context's counter list, or to the group
  65          * leader's sibling list:
  66          */
  67         if (counter->group_leader == counter)
  68                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  69         else
  70                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  71 }
  72
  73 static void
  74 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  75 {
  76         struct perf_counter *sibling, *tmp;
  77
  78         list_del_init(&counter->list_entry);
  79
  80         /*
  81          * If this was a group counter with sibling counters then
  82          * upgrade the siblings to singleton counters by adding them
  83          * to the context list directly:
  84          */
  85         list_for_each_entry_safe(sibling, tmp,
  86                                  &counter->sibling_list, list_entry) {
  87
  88                 list_del_init(&sibling->list_entry);
  89                 list_add_tail(&sibling->list_entry, &ctx->counter_list);
  90                 sibling->group_leader = sibling;
  91         }
  92 }
  93
  94 /*
  95  * Cross CPU call to remove a performance counter
  96  *
  97  * We disable the counter on the hardware level first. After that we
  98  * remove it from the context list.
  99  */
 100 static void __perf_counter_remove_from_context(void *info)
 101 {
 102         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 103         struct perf_counter *counter = info;
 104         struct perf_counter_context *ctx = counter->ctx;
 105         unsigned long flags;
 106         u64 perf_flags;
 107
 108         /*
 109          * If this is a task context, we need to check whether it is
 110          * the current task context of this cpu. If not it has been
 111          * scheduled out before the smp call arrived.
 112          */
 113         if (ctx->task && cpuctx->task_ctx != ctx)
 114                 return;
 115
 116         curr_rq_lock_irq_save(&flags);
 117         spin_lock(&ctx->lock);
 118
 119         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 120                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 121                 counter->hw_ops->disable(counter);
 122                 ctx->nr_active--;
 123                 cpuctx->active_oncpu--;
 124                 counter->task = NULL;
 125                 counter->oncpu = -1;
 126         }
 127         ctx->nr_counters--;
 128
 129         /*
 130          * Protect the list operation against NMI by disabling the
 131          * counters on a global level. NOP for non NMI based counters.
 132          */
 133         perf_flags = hw_perf_save_disable();
 134         list_del_counter(counter, ctx);
 135         hw_perf_restore(perf_flags);
 136
 137         if (!ctx->task) {
 138                 /*
 139                  * Allow more per task counters with respect to the
 140                  * reservation:
 141                  */
 142                 cpuctx->max_pertask =
 143                         min(perf_max_counters - ctx->nr_counters,
 144                             perf_max_counters - perf_reserved_percpu);
 145         }
 146
 147         spin_unlock(&ctx->lock);
 148         curr_rq_unlock_irq_restore(&flags);
 149 }
 150
 151
 152 /*
 153  * Remove the counter from a task's (or a CPU's) list of counters.
 154  *
 155  * Must be called with counter->mutex held.
 156  *
 157  * CPU counters are removed with a smp call. For task counters we only
 158  * call when the task is on a CPU.
 159  */
 160 static void perf_counter_remove_from_context(struct perf_counter *counter)
 161 {
 162         struct perf_counter_context *ctx = counter->ctx;
 163         struct task_struct *task = ctx->task;
 164
 165         if (!task) {
 166                 /*
 167                  * Per cpu counters are removed via an smp call and
 168                  * the removal is always sucessful.
 169                  */
 170                 smp_call_function_single(counter->cpu,
 171                                          __perf_counter_remove_from_context,
 172                                          counter, 1);
 173                 return;
 174         }
 175
 176 retry:
 177         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 178                                  counter);
 179
 180         spin_lock_irq(&ctx->lock);
 181         /*
 182          * If the context is active we need to retry the smp call.
 183          */
 184         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 185                 spin_unlock_irq(&ctx->lock);
 186                 goto retry;
 187         }
 188
 189         /*
 190          * The lock prevents that this context is scheduled in so we
 191          * can remove the counter safely, if the call above did not
 192          * succeed.
 193          */
 194         if (!list_empty(&counter->list_entry)) {
 195                 ctx->nr_counters--;
 196                 list_del_counter(counter, ctx);
 197                 counter->task = NULL;
 198         }
 199         spin_unlock_irq(&ctx->lock);
 200 }
 201
 202 static int
 203 counter_sched_in(struct perf_counter *counter,
 204                  struct perf_cpu_context *cpuctx,
 205                  struct perf_counter_context *ctx,
 206                  int cpu)
 207 {
 208         if (counter->state == PERF_COUNTER_STATE_OFF)
 209                 return 0;
 210
 211         counter->state = PERF_COUNTER_STATE_ACTIVE;
 212         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 213         /*
 214          * The new state must be visible before we turn it on in the hardware:
 215          */
 216         smp_wmb();
 217
 218         if (counter->hw_ops->enable(counter)) {
 219                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 220                 counter->oncpu = -1;
 221                 return -EAGAIN;
 222         }
 223
 224         cpuctx->active_oncpu++;
 225         ctx->nr_active++;
 226
 227         return 0;
 228 }
 229
 230 /*
 231  * Cross CPU call to install and enable a performance counter
 232  */
 233 static void __perf_install_in_context(void *info)
 234 {
 235         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 236         struct perf_counter *counter = info;
 237         struct perf_counter_context *ctx = counter->ctx;
 238         int cpu = smp_processor_id();
 239         unsigned long flags;
 240         u64 perf_flags;
 241
 242         /*
 243          * If this is a task context, we need to check whether it is
 244          * the current task context of this cpu. If not it has been
 245          * scheduled out before the smp call arrived.
 246          */
 247         if (ctx->task && cpuctx->task_ctx != ctx)
 248                 return;
 249
 250         curr_rq_lock_irq_save(&flags);
 251         spin_lock(&ctx->lock);
 252
 253         /*
 254          * Protect the list operation against NMI by disabling the
 255          * counters on a global level. NOP for non NMI based counters.
 256          */
 257         perf_flags = hw_perf_save_disable();
 258
 259         list_add_counter(counter, ctx);
 260         ctx->nr_counters++;
 261
 262         counter_sched_in(counter, cpuctx, ctx, cpu);
 263
 264         if (!ctx->task && cpuctx->max_pertask)
 265                 cpuctx->max_pertask--;
 266
 267         hw_perf_restore(perf_flags);
 268
 269         spin_unlock(&ctx->lock);
 270         curr_rq_unlock_irq_restore(&flags);
 271 }
 272
 273 /*
 274  * Attach a performance counter to a context
 275  *
 276  * First we add the counter to the list with the hardware enable bit
 277  * in counter->hw_config cleared.
 278  *
 279  * If the counter is attached to a task which is on a CPU we use a smp
 280  * call to enable it in the task context. The task might have been
 281  * scheduled away, but we check this in the smp call again.
 282  */
 283 static void
 284 perf_install_in_context(struct perf_counter_context *ctx,
 285                         struct perf_counter *counter,
 286                         int cpu)
 287 {
 288         struct task_struct *task = ctx->task;
 289
 290         counter->ctx = ctx;
 291         if (!task) {
 292                 /*
 293                  * Per cpu counters are installed via an smp call and
 294                  * the install is always sucessful.
 295                  */
 296                 smp_call_function_single(cpu, __perf_install_in_context,
 297                                          counter, 1);
 298                 return;
 299         }
 300
 301         counter->task = task;
 302 retry:
 303         task_oncpu_function_call(task, __perf_install_in_context,
 304                                  counter);
 305
 306         spin_lock_irq(&ctx->lock);
 307         /*
 308          * we need to retry the smp call.
 309          */
 310         if (ctx->nr_active && list_empty(&counter->list_entry)) {
 311                 spin_unlock_irq(&ctx->lock);
 312                 goto retry;
 313         }
 314
 315         /*
 316          * The lock prevents that this context is scheduled in so we
 317          * can add the counter safely, if it the call above did not
 318          * succeed.
 319          */
 320         if (list_empty(&counter->list_entry)) {
 321                 list_add_counter(counter, ctx);
 322                 ctx->nr_counters++;
 323         }
 324         spin_unlock_irq(&ctx->lock);
 325 }
 326
 327 static void
 328 counter_sched_out(struct perf_counter *counter,
 329                   struct perf_cpu_context *cpuctx,
 330                   struct perf_counter_context *ctx)
 331 {
 332         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 333                 return;
 334
 335         counter->state = PERF_COUNTER_STATE_INACTIVE;
 336         counter->hw_ops->disable(counter);
 337         counter->oncpu = -1;
 338
 339         cpuctx->active_oncpu--;
 340         ctx->nr_active--;
 341 }
 342
 343 static void
 344 group_sched_out(struct perf_counter *group_counter,
 345                 struct perf_cpu_context *cpuctx,
 346                 struct perf_counter_context *ctx)
 347 {
 348         struct perf_counter *counter;
 349
 350         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 351                 return;
 352
 353         counter_sched_out(group_counter, cpuctx, ctx);
 354
 355         /*
 356          * Schedule out siblings (if any):
 357          */
 358         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 359                 counter_sched_out(counter, cpuctx, ctx);
 360 }
 361
 362 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 363                               struct perf_cpu_context *cpuctx)
 364 {
 365         struct perf_counter *counter;
 366         u64 flags;
 367
 368         if (likely(!ctx->nr_counters))
 369                 return;
 370
 371         spin_lock(&ctx->lock);
 372         flags = hw_perf_save_disable();
 373         if (ctx->nr_active) {
 374                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 375                         group_sched_out(counter, cpuctx, ctx);
 376         }
 377         hw_perf_restore(flags);
 378         spin_unlock(&ctx->lock);
 379 }
 380
 381 /*
 382  * Called from scheduler to remove the counters of the current task,
 383  * with interrupts disabled.
 384  *
 385  * We stop each counter and update the counter value in counter->count.
 386  *
 387  * This does not protect us against NMI, but disable()
 388  * sets the disabled bit in the control field of counter _before_
 389  * accessing the counter control register. If a NMI hits, then it will
 390  * not restart the counter.
 391  */
 392 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 393 {
 394         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 395         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 396
 397         if (likely(!cpuctx->task_ctx))
 398                 return;
 399
 400         __perf_counter_sched_out(ctx, cpuctx);
 401
 402         cpuctx->task_ctx = NULL;
 403 }
 404
 405 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 406 {
 407         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 408 }
 409
 410 static int
 411 group_sched_in(struct perf_counter *group_counter,
 412                struct perf_cpu_context *cpuctx,
 413                struct perf_counter_context *ctx,
 414                int cpu)
 415 {
 416         struct perf_counter *counter, *partial_group;
 417         int ret;
 418
 419         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 420                 return 0;
 421
 422         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 423         if (ret)
 424                 return ret < 0 ? ret : 0;
 425
 426         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 427                 return -EAGAIN;
 428
 429         /*
 430          * Schedule in siblings as one group (if any):
 431          */
 432         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 433                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 434                         partial_group = counter;
 435                         goto group_error;
 436                 }
 437         }
 438
 439         return 0;
 440
 441 group_error:
 442         /*
 443          * Groups can be scheduled in as one unit only, so undo any
 444          * partial group before returning:
 445          */
 446         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 447                 if (counter == partial_group)
 448                         break;
 449                 counter_sched_out(counter, cpuctx, ctx);
 450         }
 451         counter_sched_out(group_counter, cpuctx, ctx);
 452
 453         return -EAGAIN;
 454 }
 455
 456 static void
 457 __perf_counter_sched_in(struct perf_counter_context *ctx,
 458                         struct perf_cpu_context *cpuctx, int cpu)
 459 {
 460         struct perf_counter *counter;
 461         u64 flags;
 462
 463         if (likely(!ctx->nr_counters))
 464                 return;
 465
 466         spin_lock(&ctx->lock);
 467         flags = hw_perf_save_disable();
 468         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 469                 /*
 470                  * Listen to the 'cpu' scheduling filter constraint
 471                  * of counters:
 472                  */
 473                 if (counter->cpu != -1 && counter->cpu != cpu)
 474                         continue;
 475
 476                 /*
 477                  * If we scheduled in a group atomically and exclusively,
 478                  * or if this group can't go on, break out:
 479                  */
 480                 if (group_sched_in(counter, cpuctx, ctx, cpu))
 481                         break;
 482         }
 483         hw_perf_restore(flags);
 484         spin_unlock(&ctx->lock);
 485 }
 486
 487 /*
 488  * Called from scheduler to add the counters of the current task
 489  * with interrupts disabled.
 490  *
 491  * We restore the counter value and then enable it.
 492  *
 493  * This does not protect us against NMI, but enable()
 494  * sets the enabled bit in the control field of counter _before_
 495  * accessing the counter control register. If a NMI hits, then it will
 496  * keep the counter running.
 497  */
 498 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 499 {
 500         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 501         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 502
 503         __perf_counter_sched_in(ctx, cpuctx, cpu);
 504         cpuctx->task_ctx = ctx;
 505 }
 506
 507 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 508 {
 509         struct perf_counter_context *ctx = &cpuctx->ctx;
 510
 511         __perf_counter_sched_in(ctx, cpuctx, cpu);
 512 }
 513
 514 int perf_counter_task_disable(void)
 515 {
 516         struct task_struct *curr = current;
 517         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 518         struct perf_counter *counter;
 519         unsigned long flags;
 520         u64 perf_flags;
 521         int cpu;
 522
 523         if (likely(!ctx->nr_counters))
 524                 return 0;
 525
 526         curr_rq_lock_irq_save(&flags);
 527         cpu = smp_processor_id();
 528
 529         /* force the update of the task clock: */
 530         __task_delta_exec(curr, 1);
 531
 532         perf_counter_task_sched_out(curr, cpu);
 533
 534         spin_lock(&ctx->lock);
 535
 536         /*
 537          * Disable all the counters:
 538          */
 539         perf_flags = hw_perf_save_disable();
 540
 541         list_for_each_entry(counter, &ctx->counter_list, list_entry)
 542                 counter->state = PERF_COUNTER_STATE_OFF;
 543
 544         hw_perf_restore(perf_flags);
 545
 546         spin_unlock(&ctx->lock);
 547
 548         curr_rq_unlock_irq_restore(&flags);
 549
 550         return 0;
 551 }
 552
 553 int perf_counter_task_enable(void)
 554 {
 555         struct task_struct *curr = current;
 556         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 557         struct perf_counter *counter;
 558         unsigned long flags;
 559         u64 perf_flags;
 560         int cpu;
 561
 562         if (likely(!ctx->nr_counters))
 563                 return 0;
 564
 565         curr_rq_lock_irq_save(&flags);
 566         cpu = smp_processor_id();
 567
 568         /* force the update of the task clock: */
 569         __task_delta_exec(curr, 1);
 570
 571         perf_counter_task_sched_out(curr, cpu);
 572
 573         spin_lock(&ctx->lock);
 574
 575         /*
 576          * Disable all the counters:
 577          */
 578         perf_flags = hw_perf_save_disable();
 579
 580         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 581                 if (counter->state != PERF_COUNTER_STATE_OFF)
 582                         continue;
 583                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 584                 counter->hw_event.disabled = 0;
 585         }
 586         hw_perf_restore(perf_flags);
 587
 588         spin_unlock(&ctx->lock);
 589
 590         perf_counter_task_sched_in(curr, cpu);
 591
 592         curr_rq_unlock_irq_restore(&flags);
 593
 594         return 0;
 595 }
 596
 597 /*
 598  * Round-robin a context's counters:
 599  */
 600 static void rotate_ctx(struct perf_counter_context *ctx)
 601 {
 602         struct perf_counter *counter;
 603         u64 perf_flags;
 604
 605         if (!ctx->nr_counters)
 606                 return;
 607
 608         spin_lock(&ctx->lock);
 609         /*
 610          * Rotate the first entry last (works just fine for group counters too):
 611          */
 612         perf_flags = hw_perf_save_disable();
 613         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 614                 list_del(&counter->list_entry);
 615                 list_add_tail(&counter->list_entry, &ctx->counter_list);
 616                 break;
 617         }
 618         hw_perf_restore(perf_flags);
 619
 620         spin_unlock(&ctx->lock);
 621 }
 622
 623 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 624 {
 625         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 626         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 627         const int rotate_percpu = 0;
 628
 629         if (rotate_percpu)
 630                 perf_counter_cpu_sched_out(cpuctx);
 631         perf_counter_task_sched_out(curr, cpu);
 632
 633         if (rotate_percpu)
 634                 rotate_ctx(&cpuctx->ctx);
 635         rotate_ctx(ctx);
 636
 637         if (rotate_percpu)
 638                 perf_counter_cpu_sched_in(cpuctx, cpu);
 639         perf_counter_task_sched_in(curr, cpu);
 640 }
 641
 642 /*
 643  * Cross CPU call to read the hardware counter
 644  */
 645 static void __read(void *info)
 646 {
 647         struct perf_counter *counter = info;
 648         unsigned long flags;
 649
 650         curr_rq_lock_irq_save(&flags);
 651         counter->hw_ops->read(counter);
 652         curr_rq_unlock_irq_restore(&flags);
 653 }
 654
 655 static u64 perf_counter_read(struct perf_counter *counter)
 656 {
 657         /*
 658          * If counter is enabled and currently active on a CPU, update the
 659          * value in the counter structure:
 660          */
 661         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 662                 smp_call_function_single(counter->oncpu,
 663                                          __read, counter, 1);
 664         }
 665
 666         return atomic64_read(&counter->count);
 667 }
 668
 669 /*
 670  * Cross CPU call to switch performance data pointers
 671  */
 672 static void __perf_switch_irq_data(void *info)
 673 {
 674         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 675         struct perf_counter *counter = info;
 676         struct perf_counter_context *ctx = counter->ctx;
 677         struct perf_data *oldirqdata = counter->irqdata;
 678
 679         /*
 680          * If this is a task context, we need to check whether it is
 681          * the current task context of this cpu. If not it has been
 682          * scheduled out before the smp call arrived.
 683          */
 684         if (ctx->task) {
 685                 if (cpuctx->task_ctx != ctx)
 686                         return;
 687                 spin_lock(&ctx->lock);
 688         }
 689
 690         /* Change the pointer NMI safe */
 691         atomic_long_set((atomic_long_t *)&counter->irqdata,
 692                         (unsigned long) counter->usrdata);
 693         counter->usrdata = oldirqdata;
 694
 695         if (ctx->task)
 696                 spin_unlock(&ctx->lock);
 697 }
 698
 699 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
 700 {
 701         struct perf_counter_context *ctx = counter->ctx;
 702         struct perf_data *oldirqdata = counter->irqdata;
 703         struct task_struct *task = ctx->task;
 704
 705         if (!task) {
 706                 smp_call_function_single(counter->cpu,
 707                                          __perf_switch_irq_data,
 708                                          counter, 1);
 709                 return counter->usrdata;
 710         }
 711
 712 retry:
 713         spin_lock_irq(&ctx->lock);
 714         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 715                 counter->irqdata = counter->usrdata;
 716                 counter->usrdata = oldirqdata;
 717                 spin_unlock_irq(&ctx->lock);
 718                 return oldirqdata;
 719         }
 720         spin_unlock_irq(&ctx->lock);
 721         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
 722         /* Might have failed, because task was scheduled out */
 723         if (counter->irqdata == oldirqdata)
 724                 goto retry;
 725
 726         return counter->usrdata;
 727 }
 728
 729 static void put_context(struct perf_counter_context *ctx)
 730 {
 731         if (ctx->task)
 732                 put_task_struct(ctx->task);
 733 }
 734
 735 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 736 {
 737         struct perf_cpu_context *cpuctx;
 738         struct perf_counter_context *ctx;
 739         struct task_struct *task;
 740
 741         /*
 742          * If cpu is not a wildcard then this is a percpu counter:
 743          */
 744         if (cpu != -1) {
 745                 /* Must be root to operate on a CPU counter: */
 746                 if (!capable(CAP_SYS_ADMIN))
 747                         return ERR_PTR(-EACCES);
 748
 749                 if (cpu < 0 || cpu > num_possible_cpus())
 750                         return ERR_PTR(-EINVAL);
 751
 752                 /*
 753                  * We could be clever and allow to attach a counter to an
 754                  * offline CPU and activate it when the CPU comes up, but
 755                  * that's for later.
 756                  */
 757                 if (!cpu_isset(cpu, cpu_online_map))
 758                         return ERR_PTR(-ENODEV);
 759
 760                 cpuctx = &per_cpu(perf_cpu_context, cpu);
 761                 ctx = &cpuctx->ctx;
 762
 763                 return ctx;
 764         }
 765
 766         rcu_read_lock();
 767         if (!pid)
 768                 task = current;
 769         else
 770                 task = find_task_by_vpid(pid);
 771         if (task)
 772                 get_task_struct(task);
 773         rcu_read_unlock();
 774
 775         if (!task)
 776                 return ERR_PTR(-ESRCH);
 777
 778         ctx = &task->perf_counter_ctx;
 779         ctx->task = task;
 780
 781         /* Reuse ptrace permission checks for now. */
 782         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
 783                 put_context(ctx);
 784                 return ERR_PTR(-EACCES);
 785         }
 786
 787         return ctx;
 788 }
 789
 790 /*
 791  * Called when the last reference to the file is gone.
 792  */
 793 static int perf_release(struct inode *inode, struct file *file)
 794 {
 795         struct perf_counter *counter = file->private_data;
 796         struct perf_counter_context *ctx = counter->ctx;
 797
 798         file->private_data = NULL;
 799
 800         mutex_lock(&counter->mutex);
 801
 802         perf_counter_remove_from_context(counter);
 803         put_context(ctx);
 804
 805         mutex_unlock(&counter->mutex);
 806
 807         kfree(counter);
 808
 809         return 0;
 810 }
 811
 812 /*
 813  * Read the performance counter - simple non blocking version for now
 814  */
 815 static ssize_t
 816 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 817 {
 818         u64 cntval;
 819
 820         if (count != sizeof(cntval))
 821                 return -EINVAL;
 822
 823         mutex_lock(&counter->mutex);
 824         cntval = perf_counter_read(counter);
 825         mutex_unlock(&counter->mutex);
 826
 827         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
 828 }
 829
 830 static ssize_t
 831 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
 832 {
 833         if (!usrdata->len)
 834                 return 0;
 835
 836         count = min(count, (size_t)usrdata->len);
 837         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
 838                 return -EFAULT;
 839
 840         /* Adjust the counters */
 841         usrdata->len -= count;
 842         if (!usrdata->len)
 843                 usrdata->rd_idx = 0;
 844         else
 845                 usrdata->rd_idx += count;
 846
 847         return count;
 848 }
 849
 850 static ssize_t
 851 perf_read_irq_data(struct perf_counter  *counter,
 852                    char __user          *buf,
 853                    size_t               count,
 854                    int                  nonblocking)
 855 {
 856         struct perf_data *irqdata, *usrdata;
 857         DECLARE_WAITQUEUE(wait, current);
 858         ssize_t res;
 859
 860         irqdata = counter->irqdata;
 861         usrdata = counter->usrdata;
 862
 863         if (usrdata->len + irqdata->len >= count)
 864                 goto read_pending;
 865
 866         if (nonblocking)
 867                 return -EAGAIN;
 868
 869         spin_lock_irq(&counter->waitq.lock);
 870         __add_wait_queue(&counter->waitq, &wait);
 871         for (;;) {
 872                 set_current_state(TASK_INTERRUPTIBLE);
 873                 if (usrdata->len + irqdata->len >= count)
 874                         break;
 875
 876                 if (signal_pending(current))
 877                         break;
 878
 879                 spin_unlock_irq(&counter->waitq.lock);
 880                 schedule();
 881                 spin_lock_irq(&counter->waitq.lock);
 882         }
 883         __remove_wait_queue(&counter->waitq, &wait);
 884         __set_current_state(TASK_RUNNING);
 885         spin_unlock_irq(&counter->waitq.lock);
 886
 887         if (usrdata->len + irqdata->len < count)
 888                 return -ERESTARTSYS;
 889 read_pending:
 890         mutex_lock(&counter->mutex);
 891
 892         /* Drain pending data first: */
 893         res = perf_copy_usrdata(usrdata, buf, count);
 894         if (res < 0 || res == count)
 895                 goto out;
 896
 897         /* Switch irq buffer: */
 898         usrdata = perf_switch_irq_data(counter);
 899         if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
 900                 if (!res)
 901                         res = -EFAULT;
 902         } else {
 903                 res = count;
 904         }
 905 out:
 906         mutex_unlock(&counter->mutex);
 907
 908         return res;
 909 }
 910
 911 static ssize_t
 912 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 913 {
 914         struct perf_counter *counter = file->private_data;
 915
 916         switch (counter->hw_event.record_type) {
 917         case PERF_RECORD_SIMPLE:
 918                 return perf_read_hw(counter, buf, count);
 919
 920         case PERF_RECORD_IRQ:
 921         case PERF_RECORD_GROUP:
 922                 return perf_read_irq_data(counter, buf, count,
 923                                           file->f_flags & O_NONBLOCK);
 924         }
 925         return -EINVAL;
 926 }
 927
 928 static unsigned int perf_poll(struct file *file, poll_table *wait)
 929 {
 930         struct perf_counter *counter = file->private_data;
 931         unsigned int events = 0;
 932         unsigned long flags;
 933
 934         poll_wait(file, &counter->waitq, wait);
 935
 936         spin_lock_irqsave(&counter->waitq.lock, flags);
 937         if (counter->usrdata->len || counter->irqdata->len)
 938                 events |= POLLIN;
 939         spin_unlock_irqrestore(&counter->waitq.lock, flags);
 940
 941         return events;
 942 }
 943
 944 static const struct file_operations perf_fops = {
 945         .release                = perf_release,
 946         .read                   = perf_read,
 947         .poll                   = perf_poll,
 948 };
 949
 950 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 951 {
 952         int cpu = raw_smp_processor_id();
 953
 954         atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
 955         return 0;
 956 }
 957
 958 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 959 {
 960         int cpu = raw_smp_processor_id();
 961         s64 prev;
 962         u64 now;
 963
 964         now = cpu_clock(cpu);
 965         prev = atomic64_read(&counter->hw.prev_count);
 966         atomic64_set(&counter->hw.prev_count, now);
 967         atomic64_add(now - prev, &counter->count);
 968 }
 969
 970 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 971 {
 972         cpu_clock_perf_counter_update(counter);
 973 }
 974
 975 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 976 {
 977         cpu_clock_perf_counter_update(counter);
 978 }
 979
 980 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 981         .enable         = cpu_clock_perf_counter_enable,
 982         .disable        = cpu_clock_perf_counter_disable,
 983         .read           = cpu_clock_perf_counter_read,
 984 };
 985
 986 /*
 987  * Called from within the scheduler:
 988  */
 989 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 990 {
 991         struct task_struct *curr = counter->task;
 992         u64 delta;
 993
 994         delta = __task_delta_exec(curr, update);
 995
 996         return curr->se.sum_exec_runtime + delta;
 997 }
 998
 999 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1000 {
1001         u64 prev;
1002         s64 delta;
1003
1004         prev = atomic64_read(&counter->hw.prev_count);
1005
1006         atomic64_set(&counter->hw.prev_count, now);
1007
1008         delta = now - prev;
1009
1010         atomic64_add(delta, &counter->count);
1011 }
1012
1013 static void task_clock_perf_counter_read(struct perf_counter *counter)
1014 {
1015         u64 now = task_clock_perf_counter_val(counter, 1);
1016
1017         task_clock_perf_counter_update(counter, now);
1018 }
1019
1020 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1021 {
1022         u64 now = task_clock_perf_counter_val(counter, 0);
1023
1024         atomic64_set(&counter->hw.prev_count, now);
1025
1026         return 0;
1027 }
1028
1029 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1030 {
1031         u64 now = task_clock_perf_counter_val(counter, 0);
1032
1033         task_clock_perf_counter_update(counter, now);
1034 }
1035
1036 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1037         .enable         = task_clock_perf_counter_enable,
1038         .disable        = task_clock_perf_counter_disable,
1039         .read           = task_clock_perf_counter_read,
1040 };
1041
1042 static u64 get_page_faults(void)
1043 {
1044         struct task_struct *curr = current;
1045
1046         return curr->maj_flt + curr->min_flt;
1047 }
1048
1049 static void page_faults_perf_counter_update(struct perf_counter *counter)
1050 {
1051         u64 prev, now;
1052         s64 delta;
1053
1054         prev = atomic64_read(&counter->hw.prev_count);
1055         now = get_page_faults();
1056
1057         atomic64_set(&counter->hw.prev_count, now);
1058
1059         delta = now - prev;
1060
1061         atomic64_add(delta, &counter->count);
1062 }
1063
1064 static void page_faults_perf_counter_read(struct perf_counter *counter)
1065 {
1066         page_faults_perf_counter_update(counter);
1067 }
1068
1069 static int page_faults_perf_counter_enable(struct perf_counter *counter)
1070 {
1071         /*
1072          * page-faults is a per-task value already,
1073          * so we dont have to clear it on switch-in.
1074          */
1075
1076         return 0;
1077 }
1078
1079 static void page_faults_perf_counter_disable(struct perf_counter *counter)
1080 {
1081         page_faults_perf_counter_update(counter);
1082 }
1083
1084 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1085         .enable         = page_faults_perf_counter_enable,
1086         .disable        = page_faults_perf_counter_disable,
1087         .read           = page_faults_perf_counter_read,
1088 };
1089
1090 static u64 get_context_switches(void)
1091 {
1092         struct task_struct *curr = current;
1093
1094         return curr->nvcsw + curr->nivcsw;
1095 }
1096
1097 static void context_switches_perf_counter_update(struct perf_counter *counter)
1098 {
1099         u64 prev, now;
1100         s64 delta;
1101
1102         prev = atomic64_read(&counter->hw.prev_count);
1103         now = get_context_switches();
1104
1105         atomic64_set(&counter->hw.prev_count, now);
1106
1107         delta = now - prev;
1108
1109         atomic64_add(delta, &counter->count);
1110 }
1111
1112 static void context_switches_perf_counter_read(struct perf_counter *counter)
1113 {
1114         context_switches_perf_counter_update(counter);
1115 }
1116
1117 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1118 {
1119         /*
1120          * ->nvcsw + curr->nivcsw is a per-task value already,
1121          * so we dont have to clear it on switch-in.
1122          */
1123
1124         return 0;
1125 }
1126
1127 static void context_switches_perf_counter_disable(struct perf_counter *counter)
1128 {
1129         context_switches_perf_counter_update(counter);
1130 }
1131
1132 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1133         .enable         = context_switches_perf_counter_enable,
1134         .disable        = context_switches_perf_counter_disable,
1135         .read           = context_switches_perf_counter_read,
1136 };
1137
1138 static inline u64 get_cpu_migrations(void)
1139 {
1140         return current->se.nr_migrations;
1141 }
1142
1143 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1144 {
1145         u64 prev, now;
1146         s64 delta;
1147
1148         prev = atomic64_read(&counter->hw.prev_count);
1149         now = get_cpu_migrations();
1150
1151         atomic64_set(&counter->hw.prev_count, now);
1152
1153         delta = now - prev;
1154
1155         atomic64_add(delta, &counter->count);
1156 }
1157
1158 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1159 {
1160         cpu_migrations_perf_counter_update(counter);
1161 }
1162
1163 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1164 {
1165         /*
1166          * se.nr_migrations is a per-task value already,
1167          * so we dont have to clear it on switch-in.
1168          */
1169
1170         return 0;
1171 }
1172
1173 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1174 {
1175         cpu_migrations_perf_counter_update(counter);
1176 }
1177
1178 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1179         .enable         = cpu_migrations_perf_counter_enable,
1180         .disable        = cpu_migrations_perf_counter_disable,
1181         .read           = cpu_migrations_perf_counter_read,
1182 };
1183
1184 static const struct hw_perf_counter_ops *
1185 sw_perf_counter_init(struct perf_counter *counter)
1186 {
1187         const struct hw_perf_counter_ops *hw_ops = NULL;
1188
1189         switch (counter->hw_event.type) {
1190         case PERF_COUNT_CPU_CLOCK:
1191                 hw_ops = &perf_ops_cpu_clock;
1192                 break;
1193         case PERF_COUNT_TASK_CLOCK:
1194                 hw_ops = &perf_ops_task_clock;
1195                 break;
1196         case PERF_COUNT_PAGE_FAULTS:
1197                 hw_ops = &perf_ops_page_faults;
1198                 break;
1199         case PERF_COUNT_CONTEXT_SWITCHES:
1200                 hw_ops = &perf_ops_context_switches;
1201                 break;
1202         case PERF_COUNT_CPU_MIGRATIONS:
1203                 hw_ops = &perf_ops_cpu_migrations;
1204                 break;
1205         default:
1206                 break;
1207         }
1208         return hw_ops;
1209 }
1210
1211 /*
1212  * Allocate and initialize a counter structure
1213  */
1214 static struct perf_counter *
1215 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1216                    int cpu,
1217                    struct perf_counter *group_leader,
1218                    gfp_t gfpflags)
1219 {
1220         const struct hw_perf_counter_ops *hw_ops;
1221         struct perf_counter *counter;
1222
1223         counter = kzalloc(sizeof(*counter), gfpflags);
1224         if (!counter)
1225                 return NULL;
1226
1227         /*
1228          * Single counters are their own group leaders, with an
1229          * empty sibling list:
1230          */
1231         if (!group_leader)
1232                 group_leader = counter;
1233
1234         mutex_init(&counter->mutex);
1235         INIT_LIST_HEAD(&counter->list_entry);
1236         INIT_LIST_HEAD(&counter->sibling_list);
1237         init_waitqueue_head(&counter->waitq);
1238
1239         counter->irqdata                = &counter->data[0];
1240         counter->usrdata                = &counter->data[1];
1241         counter->cpu                    = cpu;
1242         counter->hw_event               = *hw_event;
1243         counter->wakeup_pending         = 0;
1244         counter->group_leader           = group_leader;
1245         counter->hw_ops                 = NULL;
1246
1247         counter->state = PERF_COUNTER_STATE_INACTIVE;
1248         if (hw_event->disabled)
1249                 counter->state = PERF_COUNTER_STATE_OFF;
1250
1251         hw_ops = NULL;
1252         if (!hw_event->raw && hw_event->type < 0)
1253                 hw_ops = sw_perf_counter_init(counter);
1254         if (!hw_ops)
1255                 hw_ops = hw_perf_counter_init(counter);
1256
1257         if (!hw_ops) {
1258                 kfree(counter);
1259                 return NULL;
1260         }
1261         counter->hw_ops = hw_ops;
1262
1263         return counter;
1264 }
1265
1266 /**
1267  * sys_perf_task_open - open a performance counter, associate it to a task/cpu
1268  *
1269  * @hw_event_uptr:      event type attributes for monitoring/sampling
1270  * @pid:                target pid
1271  * @cpu:                target cpu
1272  * @group_fd:           group leader counter fd
1273  */
1274 asmlinkage int
1275 sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1276                       pid_t pid, int cpu, int group_fd)
1277 {
1278         struct perf_counter *counter, *group_leader;
1279         struct perf_counter_hw_event hw_event;
1280         struct perf_counter_context *ctx;
1281         struct file *counter_file = NULL;
1282         struct file *group_file = NULL;
1283         int fput_needed = 0;
1284         int fput_needed2 = 0;
1285         int ret;
1286
1287         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1288                 return -EFAULT;
1289
1290         /*
1291          * Get the target context (task or percpu):
1292          */
1293         ctx = find_get_context(pid, cpu);
1294         if (IS_ERR(ctx))
1295                 return PTR_ERR(ctx);
1296
1297         /*
1298          * Look up the group leader (we will attach this counter to it):
1299          */
1300         group_leader = NULL;
1301         if (group_fd != -1) {
1302                 ret = -EINVAL;
1303                 group_file = fget_light(group_fd, &fput_needed);
1304                 if (!group_file)
1305                         goto err_put_context;
1306                 if (group_file->f_op != &perf_fops)
1307                         goto err_put_context;
1308
1309                 group_leader = group_file->private_data;
1310                 /*
1311                  * Do not allow a recursive hierarchy (this new sibling
1312                  * becoming part of another group-sibling):
1313                  */
1314                 if (group_leader->group_leader != group_leader)
1315                         goto err_put_context;
1316                 /*
1317                  * Do not allow to attach to a group in a different
1318                  * task or CPU context:
1319                  */
1320                 if (group_leader->ctx != ctx)
1321                         goto err_put_context;
1322         }
1323
1324         ret = -EINVAL;
1325         counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
1326         if (!counter)
1327                 goto err_put_context;
1328
1329         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1330         if (ret < 0)
1331                 goto err_free_put_context;
1332
1333         counter_file = fget_light(ret, &fput_needed2);
1334         if (!counter_file)
1335                 goto err_free_put_context;
1336
1337         counter->filp = counter_file;
1338         perf_install_in_context(ctx, counter, cpu);
1339
1340         fput_light(counter_file, fput_needed2);
1341
1342 out_fput:
1343         fput_light(group_file, fput_needed);
1344
1345         return ret;
1346
1347 err_free_put_context:
1348         kfree(counter);
1349
1350 err_put_context:
1351         put_context(ctx);
1352
1353         goto out_fput;
1354 }
1355
1356 /*
1357  * Initialize the perf_counter context in a task_struct:
1358  */
1359 static void
1360 __perf_counter_init_context(struct perf_counter_context *ctx,
1361                             struct task_struct *task)
1362 {
1363         memset(ctx, 0, sizeof(*ctx));
1364         spin_lock_init(&ctx->lock);
1365         INIT_LIST_HEAD(&ctx->counter_list);
1366         ctx->task = task;
1367 }
1368
1369 /*
1370  * inherit a counter from parent task to child task:
1371  */
1372 static int
1373 inherit_counter(struct perf_counter *parent_counter,
1374               struct task_struct *parent,
1375               struct perf_counter_context *parent_ctx,
1376               struct task_struct *child,
1377               struct perf_counter_context *child_ctx)
1378 {
1379         struct perf_counter *child_counter;
1380
1381         child_counter = perf_counter_alloc(&parent_counter->hw_event,
1382                                             parent_counter->cpu, NULL,
1383                                             GFP_ATOMIC);
1384         if (!child_counter)
1385                 return -ENOMEM;
1386
1387         /*
1388          * Link it up in the child's context:
1389          */
1390         child_counter->ctx = child_ctx;
1391         child_counter->task = child;
1392         list_add_counter(child_counter, child_ctx);
1393         child_ctx->nr_counters++;
1394
1395         child_counter->parent = parent_counter;
1396         /*
1397          * inherit into child's child as well:
1398          */
1399         child_counter->hw_event.inherit = 1;
1400
1401         /*
1402          * Get a reference to the parent filp - we will fput it
1403          * when the child counter exits. This is safe to do because
1404          * we are in the parent and we know that the filp still
1405          * exists and has a nonzero count:
1406          */
1407         atomic_long_inc(&parent_counter->filp->f_count);
1408
1409         return 0;
1410 }
1411
1412 static void
1413 __perf_counter_exit_task(struct task_struct *child,
1414                          struct perf_counter *child_counter,
1415                          struct perf_counter_context *child_ctx)
1416 {
1417         struct perf_counter *parent_counter;
1418         u64 parent_val, child_val;
1419
1420         /*
1421          * If we do not self-reap then we have to wait for the
1422          * child task to unschedule (it will happen for sure),
1423          * so that its counter is at its final count. (This
1424          * condition triggers rarely - child tasks usually get
1425          * off their CPU before the parent has a chance to
1426          * get this far into the reaping action)
1427          */
1428         if (child != current) {
1429                 wait_task_inactive(child, 0);
1430                 list_del_init(&child_counter->list_entry);
1431         } else {
1432                 struct perf_cpu_context *cpuctx;
1433                 unsigned long flags;
1434                 u64 perf_flags;
1435
1436                 /*
1437                  * Disable and unlink this counter.
1438                  *
1439                  * Be careful about zapping the list - IRQ/NMI context
1440                  * could still be processing it:
1441                  */
1442                 curr_rq_lock_irq_save(&flags);
1443                 perf_flags = hw_perf_save_disable();
1444
1445                 cpuctx = &__get_cpu_var(perf_cpu_context);
1446
1447                 if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
1448                         child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1449                         child_counter->hw_ops->disable(child_counter);
1450                         cpuctx->active_oncpu--;
1451                         child_ctx->nr_active--;
1452                         child_counter->oncpu = -1;
1453                 }
1454
1455                 list_del_init(&child_counter->list_entry);
1456
1457                 child_ctx->nr_counters--;
1458
1459                 hw_perf_restore(perf_flags);
1460                 curr_rq_unlock_irq_restore(&flags);
1461         }
1462
1463         parent_counter = child_counter->parent;
1464         /*
1465          * It can happen that parent exits first, and has counters
1466          * that are still around due to the child reference. These
1467          * counters need to be zapped - but otherwise linger.
1468          */
1469         if (!parent_counter)
1470                 return;
1471
1472         parent_val = atomic64_read(&parent_counter->count);
1473         child_val = atomic64_read(&child_counter->count);
1474
1475         /*
1476          * Add back the child's count to the parent's count:
1477          */
1478         atomic64_add(child_val, &parent_counter->count);
1479
1480         fput(parent_counter->filp);
1481
1482         kfree(child_counter);
1483 }
1484
1485 /*
1486  * When a child task exist, feed back counter values to parent counters.
1487  *
1488  * Note: we are running in child context, but the PID is not hashed
1489  * anymore so new counters will not be added.
1490  */
1491 void perf_counter_exit_task(struct task_struct *child)
1492 {
1493         struct perf_counter *child_counter, *tmp;
1494         struct perf_counter_context *child_ctx;
1495
1496         child_ctx = &child->perf_counter_ctx;
1497
1498         if (likely(!child_ctx->nr_counters))
1499                 return;
1500
1501         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1502                                  list_entry)
1503                 __perf_counter_exit_task(child, child_counter, child_ctx);
1504 }
1505
1506 /*
1507  * Initialize the perf_counter context in task_struct
1508  */
1509 void perf_counter_init_task(struct task_struct *child)
1510 {
1511         struct perf_counter_context *child_ctx, *parent_ctx;
1512         struct perf_counter *counter, *parent_counter;
1513         struct task_struct *parent = current;
1514         unsigned long flags;
1515
1516         child_ctx  =  &child->perf_counter_ctx;
1517         parent_ctx = &parent->perf_counter_ctx;
1518
1519         __perf_counter_init_context(child_ctx, child);
1520
1521         /*
1522          * This is executed from the parent task context, so inherit
1523          * counters that have been marked for cloning:
1524          */
1525
1526         if (likely(!parent_ctx->nr_counters))
1527                 return;
1528
1529         /*
1530          * Lock the parent list. No need to lock the child - not PID
1531          * hashed yet and not running, so nobody can access it.
1532          */
1533         spin_lock_irqsave(&parent_ctx->lock, flags);
1534
1535         /*
1536          * We dont have to disable NMIs - we are only looking at
1537          * the list, not manipulating it:
1538          */
1539         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
1540                 if (!counter->hw_event.inherit || counter->group_leader != counter)
1541                         continue;
1542
1543                 /*
1544                  * Instead of creating recursive hierarchies of counters,
1545                  * we link inheritd counters back to the original parent,
1546                  * which has a filp for sure, which we use as the reference
1547                  * count:
1548                  */
1549                 parent_counter = counter;
1550                 if (counter->parent)
1551                         parent_counter = counter->parent;
1552
1553                 if (inherit_counter(parent_counter, parent,
1554                                   parent_ctx, child, child_ctx))
1555                         break;
1556         }
1557
1558         spin_unlock_irqrestore(&parent_ctx->lock, flags);
1559 }
1560
1561 static void __cpuinit perf_counter_init_cpu(int cpu)
1562 {
1563         struct perf_cpu_context *cpuctx;
1564
1565         cpuctx = &per_cpu(perf_cpu_context, cpu);
1566         __perf_counter_init_context(&cpuctx->ctx, NULL);
1567
1568         mutex_lock(&perf_resource_mutex);
1569         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
1570         mutex_unlock(&perf_resource_mutex);
1571
1572         hw_perf_counter_setup();
1573 }
1574
1575 #ifdef CONFIG_HOTPLUG_CPU
1576 static void __perf_counter_exit_cpu(void *info)
1577 {
1578         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1579         struct perf_counter_context *ctx = &cpuctx->ctx;
1580         struct perf_counter *counter, *tmp;
1581
1582         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
1583                 __perf_counter_remove_from_context(counter);
1584
1585 }
1586 static void perf_counter_exit_cpu(int cpu)
1587 {
1588         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
1589 }
1590 #else
1591 static inline void perf_counter_exit_cpu(int cpu) { }
1592 #endif
1593
1594 static int __cpuinit
1595 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
1596 {
1597         unsigned int cpu = (long)hcpu;
1598
1599         switch (action) {
1600
1601         case CPU_UP_PREPARE:
1602         case CPU_UP_PREPARE_FROZEN:
1603                 perf_counter_init_cpu(cpu);
1604                 break;
1605
1606         case CPU_DOWN_PREPARE:
1607         case CPU_DOWN_PREPARE_FROZEN:
1608                 perf_counter_exit_cpu(cpu);
1609                 break;
1610
1611         default:
1612                 break;
1613         }
1614
1615         return NOTIFY_OK;
1616 }
1617
1618 static struct notifier_block __cpuinitdata perf_cpu_nb = {
1619         .notifier_call          = perf_cpu_notify,
1620 };
1621
1622 static int __init perf_counter_init(void)
1623 {
1624         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
1625                         (void *)(long)smp_processor_id());
1626         register_cpu_notifier(&perf_cpu_nb);
1627
1628         return 0;
1629 }
1630 early_initcall(perf_counter_init);
1631
1632 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
1633 {
1634         return sprintf(buf, "%d\n", perf_reserved_percpu);
1635 }
1636
1637 static ssize_t
1638 perf_set_reserve_percpu(struct sysdev_class *class,
1639                         const char *buf,
1640                         size_t count)
1641 {
1642         struct perf_cpu_context *cpuctx;
1643         unsigned long val;
1644         int err, cpu, mpt;
1645
1646         err = strict_strtoul(buf, 10, &val);
1647         if (err)
1648                 return err;
1649         if (val > perf_max_counters)
1650                 return -EINVAL;
1651
1652         mutex_lock(&perf_resource_mutex);
1653         perf_reserved_percpu = val;
1654         for_each_online_cpu(cpu) {
1655                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1656                 spin_lock_irq(&cpuctx->ctx.lock);
1657                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
1658                           perf_max_counters - perf_reserved_percpu);
1659                 cpuctx->max_pertask = mpt;
1660                 spin_unlock_irq(&cpuctx->ctx.lock);
1661         }
1662         mutex_unlock(&perf_resource_mutex);
1663
1664         return count;
1665 }
1666
1667 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
1668 {
1669         return sprintf(buf, "%d\n", perf_overcommit);
1670 }
1671
1672 static ssize_t
1673 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
1674 {
1675         unsigned long val;
1676         int err;
1677
1678         err = strict_strtoul(buf, 10, &val);
1679         if (err)
1680                 return err;
1681         if (val > 1)
1682                 return -EINVAL;
1683
1684         mutex_lock(&perf_resource_mutex);
1685         perf_overcommit = val;
1686         mutex_unlock(&perf_resource_mutex);
1687
1688         return count;
1689 }
1690
1691 static SYSDEV_CLASS_ATTR(
1692                                 reserve_percpu,
1693                                 0644,
1694                                 perf_show_reserve_percpu,
1695                                 perf_set_reserve_percpu
1696                         );
1697
1698 static SYSDEV_CLASS_ATTR(
1699                                 overcommit,
1700                                 0644,
1701                                 perf_show_overcommit,
1702                                 perf_set_overcommit
1703                         );
1704
1705 static struct attribute *perfclass_attrs[] = {
1706         &attr_reserve_percpu.attr,
1707         &attr_overcommit.attr,
1708         NULL
1709 };
1710
1711 static struct attribute_group perfclass_attr_group = {
1712         .attrs                  = perfclass_attrs,
1713         .name                   = "perf_counters",
1714 };
1715
1716 static int __init perf_counter_sysfs_init(void)
1717 {
1718         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
1719                                   &perfclass_attr_group);
1720 }
1721 device_initcall(perf_counter_sysfs_init);