/*
 * Performance counter core code
 *
 *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_counter.h>

/*
 * Each CPU has a list of per CPU counters:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_counters __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

/*
 * Mutex for (sysadmin-configurable) counter reservations:
 */
static DEFINE_MUTEX(perf_resource_mutex);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}

u64 __weak hw_perf_save_disable(void)		{ return 0; }
void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
				  struct perf_cpu_context *cpuctx,
				  struct perf_counter_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_counter_print_debug(void)	{ }
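
/*
 * Note: if an architecture does not override the weak hooks above,
 * hw_perf_counter_init() returns NULL and the PMU-control hooks are
 * no-ops, so only the software counters defined further down in this
 * file are usable on such an architecture.
 */
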
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *group_leader = counter->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling counter,
	 * add it straight to the context's counter list, or to the group
	 * leader's sibling list:
	 */
	if (counter->group_leader == counter)
		list_add_tail(&counter->list_entry, &ctx->counter_list);
	else
		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
}

static void
list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
{
	struct perf_counter *sibling, *tmp;

	list_del_init(&counter->list_entry);

	/*
	 * If this was a group counter with sibling counters then
	 * upgrade the siblings to singleton counters by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp,
				 &counter->sibling_list, list_entry) {

		list_del_init(&sibling->list_entry);
		list_add_tail(&sibling->list_entry, &ctx->counter_list);
		sibling->group_leader = sibling;
	}
}

static void
counter_sched_out(struct perf_counter *counter,
		  struct perf_cpu_context *cpuctx,
		  struct perf_counter_context *ctx)
{
	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
		return;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	counter->hw_ops->disable(counter);
	counter->oncpu = -1;

	if (!is_software_counter(counter))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_counter *group_counter,
		struct perf_cpu_context *cpuctx,
		struct perf_counter_context *ctx)
{
	struct perf_counter *counter;

	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
		return;

	counter_sched_out(group_counter, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
		counter_sched_out(counter, cpuctx, ctx);

	if (group_counter->hw_event.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance counter
 *
 * We disable the counter on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_counter_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	unsigned long flags;
	u64 perf_flags;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	counter_sched_out(counter, cpuctx, ctx);

	counter->task = NULL;
	ctx->nr_counters--;

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();
	list_del_counter(counter, ctx);
	hw_perf_restore(perf_flags);

	if (!ctx->task) {
		/*
		 * Allow more per task counters with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_counters - ctx->nr_counters,
			    perf_max_counters - perf_reserved_percpu);
	}

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

/*
 * Remove the counter from a task's (or a CPU's) list of counters.
 *
 * Must be called with counter->mutex and ctx->mutex held.
 *
 * CPU counters are removed with a smp call. For task counters we only
 * call when the task is on a CPU.
 */
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(counter->cpu,
					 __perf_counter_remove_from_context,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_remove_from_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can remove the counter safely, if the call above did not
	 * succeed.
	 */
	if (!list_empty(&counter->list_entry)) {
		ctx->nr_counters--;
		list_del_counter(counter, ctx);
		counter->task = NULL;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to disable a performance counter
 */
static void __perf_counter_disable(void *info)
{
	struct perf_counter *counter = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = counter->ctx;
	unsigned long flags;

	/*
	 * If this is a per-task counter, need to check whether this
	 * counter's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	/*
	 * If the counter is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
		if (counter == counter->group_leader)
			group_sched_out(counter, cpuctx, ctx);
		else
			counter_sched_out(counter, cpuctx, ctx);
		counter->state = PERF_COUNTER_STATE_OFF;
	}

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

static void perf_counter_disable(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the counter on the cpu that it's on
		 */
		smp_call_function_single(counter->cpu, __perf_counter_disable,
					 counter, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_counter_disable, counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the counter is still active, we need to retry the cross-call.
	 */
	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
		counter->state = PERF_COUNTER_STATE_OFF;

	spin_unlock_irq(&ctx->lock);
}

/*
 * Disable a counter and all its children.
 */
static void perf_counter_disable_family(struct perf_counter *counter)
{
	struct perf_counter *child;

	perf_counter_disable(counter);

	/*
	 * Lock the mutex to protect the list of children
	 */
	mutex_lock(&counter->mutex);
	list_for_each_entry(child, &counter->child_list, child_list)
		perf_counter_disable(child);
	mutex_unlock(&counter->mutex);
}

static int
counter_sched_in(struct perf_counter *counter,
		 struct perf_cpu_context *cpuctx,
		 struct perf_counter_context *ctx,
		 int cpu)
{
	if (counter->state <= PERF_COUNTER_STATE_OFF)
		return 0;

	counter->state = PERF_COUNTER_STATE_ACTIVE;
	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (counter->hw_ops->enable(counter)) {
		counter->state = PERF_COUNTER_STATE_INACTIVE;
		counter->oncpu = -1;
		return -EAGAIN;
	}

	if (!is_software_counter(counter))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (counter->hw_event.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

/*
 * Return 1 for a group consisting entirely of software counters,
 * 0 if the group contains any hardware counters.
 */
static int is_software_only_group(struct perf_counter *leader)
{
	struct perf_counter *counter;

	if (!is_software_counter(leader))
		return 0;
	list_for_each_entry(counter, &leader->sibling_list, list_entry)
		if (!is_software_counter(counter))
			return 0;
	return 1;
}

/*
 * Work out whether we can put this counter group on the CPU now.
 */
static int group_can_go_on(struct perf_counter *counter,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software counters can always go on.
	 */
	if (is_software_only_group(counter))
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * counters can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * counters on the CPU, it can't go on.
	 */
	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

/*
 * Cross CPU call to install and enable a performance counter
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_counter *leader = counter->group_leader;
	int cpu = smp_processor_id();
	unsigned long flags;
	u64 perf_flags;
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	/*
	 * Protect the list operation against NMI by disabling the
	 * counters on a global level. NOP for non NMI based counters.
	 */
	perf_flags = hw_perf_save_disable();

	list_add_counter(counter, ctx);
	ctx->nr_counters++;

	/*
	 * Don't put the counter on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive counter can't go on if there are already active
	 * hardware counters, and no hardware counter can go on if there
	 * is already an exclusive counter on.
	 */
	if (!group_can_go_on(counter, cpuctx, 1))
		err = -EEXIST;
	else
		err = counter_sched_in(counter, cpuctx, ctx, cpu);

	if (err) {
		/*
		 * This counter couldn't go on. If it is in a group
		 * then we have to pull the whole group off.
		 * If the counter group is pinned then put it in error state.
		 */
		if (leader != counter)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->hw_event.pinned)
			leader->state = PERF_COUNTER_STATE_ERROR;
	}

	if (!err && !ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

unlock:
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

/*
 * Attach a performance counter to a context
 *
 * First we add the counter to the list with the hardware enable bit
 * in counter->hw_config cleared.
 *
 * If the counter is attached to a task which is on a CPU we use a smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_counter_context *ctx,
			struct perf_counter *counter,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu counters are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 counter, 1);
		return;
	}

	counter->task = task;
retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 counter);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active and the counter has not been added,
	 * we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&counter->list_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents that this context is scheduled in so we
	 * can add the counter safely, if the call above did not
	 * succeed.
	 */
	if (list_empty(&counter->list_entry)) {
		list_add_counter(counter, ctx);
		ctx->nr_counters++;
	}
	spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance counter
 */
static void __perf_counter_enable(void *info)
{
	struct perf_counter *counter = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_counter *leader = counter->group_leader;
	unsigned long flags;
	int err;

	/*
	 * If this is a per-task counter, need to check whether this
	 * counter's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	curr_rq_lock_irq_save(&flags);
	spin_lock(&ctx->lock);

	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
		goto unlock;
	counter->state = PERF_COUNTER_STATE_INACTIVE;

	/*
	 * If the counter is in a group and isn't the group leader,
	 * then don't put it on unless the group is on.
	 */
	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
		goto unlock;

	if (!group_can_go_on(counter, cpuctx, 1))
		err = -EEXIST;
	else
		err = counter_sched_in(counter, cpuctx, ctx,
				       smp_processor_id());

	if (err) {
		/*
		 * If this counter can't go on and it's part of a
		 * group, then the whole group has to come off.
		 */
		if (leader != counter)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->hw_event.pinned)
			leader->state = PERF_COUNTER_STATE_ERROR;
	}

unlock:
	spin_unlock(&ctx->lock);
	curr_rq_unlock_irq_restore(&flags);
}

static void perf_counter_enable(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Enable the counter on the cpu that it's on
		 */
		smp_call_function_single(counter->cpu, __perf_counter_enable,
					 counter, 1);
		return;
	}

	spin_lock_irq(&ctx->lock);
	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
		goto out;

	/*
	 * If the counter is in error state, clear that first.
	 * That way, if we see the counter in error state below, we
	 * know that it has gone back into error state, as distinct
	 * from the task having been scheduled away before the
	 * cross-call arrived.
	 */
	if (counter->state == PERF_COUNTER_STATE_ERROR)
		counter->state = PERF_COUNTER_STATE_OFF;

retry:
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_counter_enable, counter);

	spin_lock_irq(&ctx->lock);

	/*
	 * If the context is active and the counter is still off,
	 * we need to retry the cross-call.
	 */
	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
		goto retry;

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (counter->state == PERF_COUNTER_STATE_OFF)
		counter->state = PERF_COUNTER_STATE_INACTIVE;
out:
	spin_unlock_irq(&ctx->lock);
}

/*
 * Enable a counter and all its children.
 */
static void perf_counter_enable_family(struct perf_counter *counter)
{
	struct perf_counter *child;

	perf_counter_enable(counter);

	/*
	 * Lock the mutex to protect the list of children
	 */
	mutex_lock(&counter->mutex);
	list_for_each_entry(child, &counter->child_list, child_list)
		perf_counter_enable(child);
	mutex_unlock(&counter->mutex);
}

void __perf_counter_sched_out(struct perf_counter_context *ctx,
			      struct perf_cpu_context *cpuctx)
{
	struct perf_counter *counter;
	u64 flags;

	spin_lock(&ctx->lock);
	ctx->is_active = 0;
	if (likely(!ctx->nr_counters))
		goto out;

	flags = hw_perf_save_disable();
	if (ctx->nr_active) {
		list_for_each_entry(counter, &ctx->counter_list, list_entry)
			group_sched_out(counter, cpuctx, ctx);
	}
	hw_perf_restore(flags);
out:
	spin_unlock(&ctx->lock);
}

/*
 * Called from scheduler to remove the counters of the current task,
 * with interrupts disabled.
 *
 * We stop each counter and update the counter value in counter->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * not restart the counter.
 */
void perf_counter_task_sched_out(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;

	if (likely(!cpuctx->task_ctx))
		return;

	__perf_counter_sched_out(ctx, cpuctx);

	cpuctx->task_ctx = NULL;
}

static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
}

static int
group_sched_in(struct perf_counter *group_counter,
	       struct perf_cpu_context *cpuctx,
	       struct perf_counter_context *ctx,
	       int cpu)
{
	struct perf_counter *counter, *partial_group;
	int ret;

	if (group_counter->state == PERF_COUNTER_STATE_OFF)
		return 0;

	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;

	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
			partial_group = counter;
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
		if (counter == partial_group)
			break;
		counter_sched_out(counter, cpuctx, ctx);
	}
	counter_sched_out(group_counter, cpuctx, ctx);

	return -EAGAIN;
}

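/*
 * Note: __perf_counter_sched_in() below makes two passes over the
 * context's counter list: pinned groups are scheduled first (and moved
 * to ERROR state if they cannot get onto the PMU), then the remaining
 * groups only fill whatever hardware room is left, tracked via the
 * can_add_hw flag.
 */
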
static void
__perf_counter_sched_in(struct perf_counter_context *ctx,
			struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_counter *counter;
	u64 flags;
	int can_add_hw = 1;

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_counters))
		goto out;

	flags = hw_perf_save_disable();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state <= PERF_COUNTER_STATE_OFF ||
		    !counter->hw_event.pinned)
			continue;
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		if (group_can_go_on(counter, cpuctx, 1))
			group_sched_in(counter, cpuctx, ctx, cpu);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
			counter->state = PERF_COUNTER_STATE_ERROR;
	}

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		/*
		 * Ignore counters in OFF or ERROR state, and
		 * ignore pinned counters since we did them already.
		 */
		if (counter->state <= PERF_COUNTER_STATE_OFF ||
		    counter->hw_event.pinned)
			continue;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of counters:
		 */
		if (counter->cpu != -1 && counter->cpu != cpu)
			continue;

		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
			if (group_sched_in(counter, cpuctx, ctx, cpu))
				can_add_hw = 0;
		}
	}
	hw_perf_restore(flags);
out:
	spin_unlock(&ctx->lock);
}

/*
 * Called from scheduler to add the counters of the current task
 * with interrupts disabled.
 *
 * We restore the counter value and then enable it.
 *
 * This does not protect us against NMI, but enable()
 * sets the enabled bit in the control field of counter _before_
 * accessing the counter control register. If a NMI hits, then it will
 * keep the counter running.
 */
void perf_counter_task_sched_in(struct task_struct *task, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &task->perf_counter_ctx;

	__perf_counter_sched_in(ctx, cpuctx, cpu);
	cpuctx->task_ctx = ctx;
}

static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_counter_context *ctx = &cpuctx->ctx;

	__perf_counter_sched_in(ctx, cpuctx, cpu);
}

int perf_counter_task_disable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	unsigned long flags;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	curr_rq_lock_irq_save(&flags);
	cpu = smp_processor_id();

	/* force the update of the task clock: */
	__task_delta_exec(curr, 1);

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Disable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state != PERF_COUNTER_STATE_ERROR)
			counter->state = PERF_COUNTER_STATE_OFF;
	}

	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	curr_rq_unlock_irq_restore(&flags);

	return 0;
}

int perf_counter_task_enable(void)
{
	struct task_struct *curr = current;
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	struct perf_counter *counter;
	unsigned long flags;
	u64 perf_flags;
	int cpu;

	if (likely(!ctx->nr_counters))
		return 0;

	curr_rq_lock_irq_save(&flags);
	cpu = smp_processor_id();

	/* force the update of the task clock: */
	__task_delta_exec(curr, 1);

	perf_counter_task_sched_out(curr, cpu);

	spin_lock(&ctx->lock);

	/*
	 * Enable all the counters:
	 */
	perf_flags = hw_perf_save_disable();

	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		if (counter->state > PERF_COUNTER_STATE_OFF)
			continue;
		counter->state = PERF_COUNTER_STATE_INACTIVE;
		counter->hw_event.disabled = 0;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);

	perf_counter_task_sched_in(curr, cpu);

	curr_rq_unlock_irq_restore(&flags);

	return 0;
}

/*
 * Round-robin a context's counters:
 */
static void rotate_ctx(struct perf_counter_context *ctx)
{
	struct perf_counter *counter;
	u64 perf_flags;

	if (!ctx->nr_counters)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Rotate the first entry last (works just fine for group counters too):
	 */
	perf_flags = hw_perf_save_disable();
	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
		list_del(&counter->list_entry);
		list_add_tail(&counter->list_entry, &ctx->counter_list);
		break;
	}
	hw_perf_restore(perf_flags);

	spin_unlock(&ctx->lock);
}

void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	const int rotate_percpu = 0;

	if (rotate_percpu)
		perf_counter_cpu_sched_out(cpuctx);
	perf_counter_task_sched_out(curr, cpu);

	if (rotate_percpu)
		rotate_ctx(&cpuctx->ctx);
	rotate_ctx(ctx);

	if (rotate_percpu)
		perf_counter_cpu_sched_in(cpuctx, cpu);
	perf_counter_task_sched_in(curr, cpu);
}

/*
 * Cross CPU call to read the hardware counter
 */
static void __read(void *info)
{
	struct perf_counter *counter = info;
	unsigned long flags;

	curr_rq_lock_irq_save(&flags);
	counter->hw_ops->read(counter);
	curr_rq_unlock_irq_restore(&flags);
}

static u64 perf_counter_read(struct perf_counter *counter)
{
	/*
	 * If counter is enabled and currently active on a CPU, update the
	 * value in the counter structure:
	 */
	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
		smp_call_function_single(counter->oncpu,
					 __read, counter, 1);
	}

	return atomic64_read(&counter->count);
}

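/*
 * Each counter carries two perf_data buffers: IRQ/NMI context appends
 * records to ->irqdata while readers drain ->usrdata; the switch
 * helpers below swap the two pointers so a reader can consume what the
 * IRQ side has produced so far.
 */
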
/*
 * Cross CPU call to switch performance data pointers
 */
static void __perf_switch_irq_data(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter *counter = info;
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task) {
		if (cpuctx->task_ctx != ctx)
			return;
		spin_lock(&ctx->lock);
	}

	/* Change the pointer NMI safe */
	atomic_long_set((atomic_long_t *)&counter->irqdata,
			(unsigned long) counter->usrdata);
	counter->usrdata = oldirqdata;

	if (ctx->task)
		spin_unlock(&ctx->lock);
}

static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
{
	struct perf_counter_context *ctx = counter->ctx;
	struct perf_data *oldirqdata = counter->irqdata;
	struct task_struct *task = ctx->task;

	if (!task) {
		smp_call_function_single(counter->cpu,
					 __perf_switch_irq_data,
					 counter, 1);
		return counter->usrdata;
	}

retry:
	spin_lock_irq(&ctx->lock);
	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
		counter->irqdata = counter->usrdata;
		counter->usrdata = oldirqdata;
		spin_unlock_irq(&ctx->lock);
		return oldirqdata;
	}
	spin_unlock_irq(&ctx->lock);
	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
	/* Might have failed, because task was scheduled out */
	if (counter->irqdata == oldirqdata)
		goto retry;

	return counter->usrdata;
}

static void put_context(struct perf_counter_context *ctx)
{
	if (ctx->task)
		put_task_struct(ctx->task);
}

static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_cpu_context *cpuctx;
	struct perf_counter_context *ctx;
	struct task_struct *task;

	/*
	 * If cpu is not a wildcard then this is a percpu counter:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU counter: */
		if (!capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu > num_possible_cpus())
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow to attach a counter to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;

		return ctx;
	}

	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	ctx = &task->perf_counter_ctx;

	/* Reuse ptrace permission checks for now. */
	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		put_context(ctx);
		return ERR_PTR(-EACCES);
	}

	return ctx;
}

/*
 * Called when the last reference to the file is gone.
 */
static int perf_release(struct inode *inode, struct file *file)
{
	struct perf_counter *counter = file->private_data;
	struct perf_counter_context *ctx = counter->ctx;

	file->private_data = NULL;

	mutex_lock(&ctx->mutex);
	mutex_lock(&counter->mutex);

	perf_counter_remove_from_context(counter);
	put_context(ctx);

	mutex_unlock(&counter->mutex);
	mutex_unlock(&ctx->mutex);

	kfree(counter);

	return 0;
}

/*
 * Read the performance counter - simple non blocking version for now
 */
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
	u64 cntval;

	if (count != sizeof(cntval))
		return -EINVAL;

	/*
	 * Return end-of-file for a read on a counter that is in
	 * error state (i.e. because it was pinned but it couldn't be
	 * scheduled on to the CPU at some point).
	 */
	if (counter->state == PERF_COUNTER_STATE_ERROR)
		return 0;

	mutex_lock(&counter->mutex);
	cntval = perf_counter_read(counter);
	mutex_unlock(&counter->mutex);

	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
}

static ssize_t
perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
{
	if (!usrdata->len)
		return 0;

	count = min(count, (size_t)usrdata->len);
	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
		return -EFAULT;

	/* Adjust the counters */
	usrdata->len -= count;
	if (!usrdata->len)
		usrdata->rd_idx = 0;
	else
		usrdata->rd_idx += count;

	return count;
}

static ssize_t
perf_read_irq_data(struct perf_counter	*counter,
		   char __user		*buf,
		   size_t		count,
		   int			nonblocking)
{
	struct perf_data *irqdata, *usrdata;
	DECLARE_WAITQUEUE(wait, current);
	ssize_t res, res2;

	irqdata = counter->irqdata;
	usrdata = counter->usrdata;

	if (usrdata->len + irqdata->len >= count)
		goto read_pending;

	if (nonblocking)
		return -EAGAIN;

	spin_lock_irq(&counter->waitq.lock);
	__add_wait_queue(&counter->waitq, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (usrdata->len + irqdata->len >= count)
			break;

		if (signal_pending(current))
			break;

		if (counter->state == PERF_COUNTER_STATE_ERROR)
			break;

		spin_unlock_irq(&counter->waitq.lock);
		schedule();
		spin_lock_irq(&counter->waitq.lock);
	}
	__remove_wait_queue(&counter->waitq, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&counter->waitq.lock);

	if (usrdata->len + irqdata->len < count &&
	    counter->state != PERF_COUNTER_STATE_ERROR)
		return -ERESTARTSYS;
read_pending:
	mutex_lock(&counter->mutex);

	/* Drain pending data first: */
	res = perf_copy_usrdata(usrdata, buf, count);
	if (res < 0 || res == count)
		goto out;

	/* Switch irq buffer: */
	usrdata = perf_switch_irq_data(counter);
	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
	if (res2 < 0) {
		if (!res)
			res = -EFAULT;
	} else {
		res += res2;
	}
out:
	mutex_unlock(&counter->mutex);

	return res;
}

static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
	struct perf_counter *counter = file->private_data;

	switch (counter->hw_event.record_type) {
	case PERF_RECORD_SIMPLE:
		return perf_read_hw(counter, buf, count);

	case PERF_RECORD_IRQ:
	case PERF_RECORD_GROUP:
		return perf_read_irq_data(counter, buf, count,
					  file->f_flags & O_NONBLOCK);
	}
	return -EINVAL;
}

static unsigned int perf_poll(struct file *file, poll_table *wait)
{
	struct perf_counter *counter = file->private_data;
	unsigned int events = 0;
	unsigned long flags;

	poll_wait(file, &counter->waitq, wait);

	spin_lock_irqsave(&counter->waitq.lock, flags);
	if (counter->usrdata->len || counter->irqdata->len)
		events |= POLLIN;
	spin_unlock_irqrestore(&counter->waitq.lock, flags);

	return events;
}

static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct perf_counter *counter = file->private_data;
	int err = 0;

	switch (cmd) {
	case PERF_COUNTER_IOC_ENABLE:
		perf_counter_enable_family(counter);
		break;
	case PERF_COUNTER_IOC_DISABLE:
		perf_counter_disable_family(counter);
		break;
	default:
		err = -ENOTTY;
	}
	return err;
}

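/*
 * Example (userspace sketch, not part of this file): once a counter fd
 * has been obtained from sys_perf_counter_open(), it and all of its
 * inherited children can be toggled with:
 *
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
 */
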
static const struct file_operations perf_fops = {
	.release		= perf_release,
	.read			= perf_read,
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,
	.compat_ioctl		= perf_ioctl,
};

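/*
 * The software counter implementations below all follow the same
 * pattern: ->enable() snapshots a base value into hw.prev_count, and
 * every subsequent update computes the delta against that snapshot and
 * accumulates it into counter->count.
 */
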
static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();

	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
	return 0;
}

static void cpu_clock_perf_counter_update(struct perf_counter *counter)
{
	int cpu = raw_smp_processor_id();
	s64 prev;
	u64 now;

	now = cpu_clock(cpu);
	prev = atomic64_read(&counter->hw.prev_count);
	atomic64_set(&counter->hw.prev_count, now);
	atomic64_add(now - prev, &counter->count);
}

static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}

static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_counter_enable,
	.disable	= cpu_clock_perf_counter_disable,
	.read		= cpu_clock_perf_counter_read,
};

/*
 * Called from within the scheduler:
 */
static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
{
	struct task_struct *curr = counter->task;
	u64 delta;

	delta = __task_delta_exec(curr, update);

	return curr->se.sum_exec_runtime + delta;
}

static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
{
	u64 prev;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void task_clock_perf_counter_read(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 1);

	task_clock_perf_counter_update(counter, now);
}

static int task_clock_perf_counter_enable(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 0);

	atomic64_set(&counter->hw.prev_count, now);

	return 0;
}

static void task_clock_perf_counter_disable(struct perf_counter *counter)
{
	u64 now = task_clock_perf_counter_val(counter, 0);

	task_clock_perf_counter_update(counter, now);
}

static const struct hw_perf_counter_ops perf_ops_task_clock = {
	.enable		= task_clock_perf_counter_enable,
	.disable	= task_clock_perf_counter_disable,
	.read		= task_clock_perf_counter_read,
};

static u64 get_page_faults(void)
{
	struct task_struct *curr = current;

	return curr->maj_flt + curr->min_flt;
}

static void page_faults_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_page_faults();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void page_faults_perf_counter_read(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static int page_faults_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * page-faults is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void page_faults_perf_counter_disable(struct perf_counter *counter)
{
	page_faults_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_page_faults = {
	.enable		= page_faults_perf_counter_enable,
	.disable	= page_faults_perf_counter_disable,
	.read		= page_faults_perf_counter_read,
};

static u64 get_context_switches(void)
{
	struct task_struct *curr = current;

	return curr->nvcsw + curr->nivcsw;
}

static void context_switches_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_context_switches();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void context_switches_perf_counter_read(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * ->nvcsw + curr->nivcsw is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
	context_switches_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_context_switches = {
	.enable		= context_switches_perf_counter_enable,
	.disable	= context_switches_perf_counter_disable,
	.read		= context_switches_perf_counter_read,
};

static inline u64 get_cpu_migrations(void)
{
	return current->se.nr_migrations;
}

static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
{
	u64 prev, now;
	s64 delta;

	prev = atomic64_read(&counter->hw.prev_count);
	now = get_cpu_migrations();

	atomic64_set(&counter->hw.prev_count, now);

	delta = now - prev;

	atomic64_add(delta, &counter->count);
}

static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
	/*
	 * se.nr_migrations is a per-task value already,
	 * so we don't have to clear it on switch-in.
	 */

	return 0;
}

static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
	cpu_migrations_perf_counter_update(counter);
}

static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
	.enable		= cpu_migrations_perf_counter_enable,
	.disable	= cpu_migrations_perf_counter_disable,
	.read		= cpu_migrations_perf_counter_read,
};

static const struct hw_perf_counter_ops *
sw_perf_counter_init(struct perf_counter *counter)
{
	const struct hw_perf_counter_ops *hw_ops = NULL;

	switch (counter->hw_event.type) {
	case PERF_COUNT_CPU_CLOCK:
		hw_ops = &perf_ops_cpu_clock;
		break;
	case PERF_COUNT_TASK_CLOCK:
		hw_ops = &perf_ops_task_clock;
		break;
	case PERF_COUNT_PAGE_FAULTS:
		hw_ops = &perf_ops_page_faults;
		break;
	case PERF_COUNT_CONTEXT_SWITCHES:
		hw_ops = &perf_ops_context_switches;
		break;
	case PERF_COUNT_CPU_MIGRATIONS:
		hw_ops = &perf_ops_cpu_migrations;
		break;
	default:
		break;
	}
	return hw_ops;
}

/*
 * Allocate and initialize a counter structure
 */
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
		   int cpu,
		   struct perf_counter *group_leader,
		   gfp_t gfpflags)
{
	const struct hw_perf_counter_ops *hw_ops;
	struct perf_counter *counter;

	counter = kzalloc(sizeof(*counter), gfpflags);
	if (!counter)
		return NULL;

	/*
	 * Single counters are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = counter;

	mutex_init(&counter->mutex);
	INIT_LIST_HEAD(&counter->list_entry);
	INIT_LIST_HEAD(&counter->sibling_list);
	init_waitqueue_head(&counter->waitq);

	INIT_LIST_HEAD(&counter->child_list);

	counter->irqdata		= &counter->data[0];
	counter->usrdata		= &counter->data[1];
	counter->cpu			= cpu;
	counter->hw_event		= *hw_event;
	counter->wakeup_pending		= 0;
	counter->group_leader		= group_leader;
	counter->hw_ops			= NULL;

	counter->state = PERF_COUNTER_STATE_INACTIVE;
	if (hw_event->disabled)
		counter->state = PERF_COUNTER_STATE_OFF;

	hw_ops = NULL;
	if (!hw_event->raw && hw_event->type < 0)
		hw_ops = sw_perf_counter_init(counter);
	else
		hw_ops = hw_perf_counter_init(counter);

	if (!hw_ops) {
		kfree(counter);
		return NULL;
	}
	counter->hw_ops = hw_ops;

	return counter;
}

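/*
 * Example (userspace sketch, not part of this file): open a task-clock
 * software counter for the calling task on any CPU, without a group
 * leader, then read its current value. This assumes pid 0 denotes the
 * current task and that cpu/group_fd of -1 mean "any CPU" / "no group":
 *
 *	struct perf_counter_hw_event hw_event = {
 *		.type	= PERF_COUNT_TASK_CLOCK,
 *	};
 *	u64 count;
 *	int fd = sys_perf_counter_open(&hw_event, 0, -1, -1);
 *
 *	read(fd, &count, sizeof(count));
 */
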
/**
 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
 *
 * @hw_event_uptr:	event type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader counter fd
 */
asmlinkage int
sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
		      pid_t pid, int cpu, int group_fd)
{
	struct perf_counter *counter, *group_leader;
	struct perf_counter_hw_event hw_event;
	struct perf_counter_context *ctx;
	struct file *counter_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int ret;

	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
		return -EFAULT;

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this counter to it):
	 */
	group_leader = NULL;
	if (group_fd != -1) {
		ret = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (hw_event.exclusive || hw_event.pinned)
			goto err_put_context;
	}

	ret = -EINVAL;
	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
	if (!counter)
		goto err_put_context;

	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
	if (ret < 0)
		goto err_free_put_context;

	counter_file = fget_light(ret, &fput_needed2);
	if (!counter_file)
		goto err_free_put_context;

	counter->filp = counter_file;
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, counter, cpu);
	mutex_unlock(&ctx->mutex);

	fput_light(counter_file, fput_needed2);

out_fput:
	fput_light(group_file, fput_needed);

	return ret;

err_free_put_context:
	kfree(counter);

err_put_context:
	put_context(ctx);

	goto out_fput;
}

/*
 * Initialize the perf_counter context in a task_struct:
 */
static void
__perf_counter_init_context(struct perf_counter_context *ctx,
			    struct task_struct *task)
{
	memset(ctx, 0, sizeof(*ctx));
	spin_lock_init(&ctx->lock);
	mutex_init(&ctx->mutex);
	INIT_LIST_HEAD(&ctx->counter_list);
	ctx->task = task;
}

/*
 * inherit a counter from parent task to child task:
 */
static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
		struct task_struct *parent,
		struct perf_counter_context *parent_ctx,
		struct task_struct *child,
		struct perf_counter *group_leader,
		struct perf_counter_context *child_ctx)
{
	struct perf_counter *child_counter;

	/*
	 * Instead of creating recursive hierarchies of counters,
	 * we link inherited counters back to the original parent,
	 * which has a filp for sure, which we use as the reference
	 * count:
	 */
	if (parent_counter->parent)
		parent_counter = parent_counter->parent;

	child_counter = perf_counter_alloc(&parent_counter->hw_event,
					   parent_counter->cpu, group_leader,
					   GFP_KERNEL);
	if (!child_counter)
		return NULL;

	/*
	 * Link it up in the child's context:
	 */
	child_counter->ctx = child_ctx;
	child_counter->task = child;
	list_add_counter(child_counter, child_ctx);
	child_ctx->nr_counters++;

	child_counter->parent = parent_counter;
	/*
	 * inherit into child's child as well:
	 */
	child_counter->hw_event.inherit = 1;

	/*
	 * Get a reference to the parent filp - we will fput it
	 * when the child counter exits. This is safe to do because
	 * we are in the parent and we know that the filp still
	 * exists and has a nonzero count:
	 */
	atomic_long_inc(&parent_counter->filp->f_count);

	/*
	 * Link this into the parent counter's child list
	 */
	mutex_lock(&parent_counter->mutex);
	list_add_tail(&child_counter->child_list, &parent_counter->child_list);

	/*
	 * Make the child state follow the state of the parent counter,
	 * not its hw_event.disabled bit. We hold the parent's mutex,
	 * so we won't race with perf_counter_{en,dis}able_family.
	 */
	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
	else
		child_counter->state = PERF_COUNTER_STATE_OFF;

	mutex_unlock(&parent_counter->mutex);

	return child_counter;
}

static int inherit_group(struct perf_counter *parent_counter,
			 struct task_struct *parent,
			 struct perf_counter_context *parent_ctx,
			 struct task_struct *child,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *leader;
	struct perf_counter *sub;

	leader = inherit_counter(parent_counter, parent, parent_ctx,
				 child, NULL, child_ctx);
	if (!leader)
		return -ENOMEM;
	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
		if (!inherit_counter(sub, parent, parent_ctx,
				     child, leader, child_ctx))
			return -ENOMEM;
	}
	return 0;
}

static void sync_child_counter(struct perf_counter *child_counter,
			       struct perf_counter *parent_counter)
{
	u64 parent_val, child_val;

	parent_val = atomic64_read(&parent_counter->count);
	child_val = atomic64_read(&child_counter->count);

	/*
	 * Add back the child's count to the parent's count:
	 */
	atomic64_add(child_val, &parent_counter->count);

	/*
	 * Remove this counter from the parent's list
	 */
	mutex_lock(&parent_counter->mutex);
	list_del_init(&child_counter->child_list);
	mutex_unlock(&parent_counter->mutex);

	/*
	 * Release the parent counter, if this was the last
	 * reference to it.
	 */
	fput(parent_counter->filp);
}

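/*
 * Counter inheritance thus works in two halves: inherit_counter()
 * (run when the child context is set up) creates a child counter that
 * points back at the parent via ->parent and pins the parent's filp,
 * while sync_child_counter() (run at child exit) folds the child's
 * count back into the parent and drops that reference again.
 */
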
static void
__perf_counter_exit_task(struct task_struct *child,
			 struct perf_counter *child_counter,
			 struct perf_counter_context *child_ctx)
{
	struct perf_counter *parent_counter;
	struct perf_counter *sub, *tmp;

	/*
	 * If we do not self-reap then we have to wait for the
	 * child task to unschedule (it will happen for sure),
	 * so that its counter is at its final count. (This
	 * condition triggers rarely - child tasks usually get
	 * off their CPU before the parent has a chance to
	 * get this far into the reaping action)
	 */
	if (child != current) {
		wait_task_inactive(child, 0);
		list_del_init(&child_counter->list_entry);
	} else {
		struct perf_cpu_context *cpuctx;
		unsigned long flags;
		u64 perf_flags;

		/*
		 * Disable and unlink this counter.
		 *
		 * Be careful about zapping the list - IRQ/NMI context
		 * could still be processing it:
		 */
		curr_rq_lock_irq_save(&flags);
		perf_flags = hw_perf_save_disable();

		cpuctx = &__get_cpu_var(perf_cpu_context);

		group_sched_out(child_counter, cpuctx, child_ctx);

		list_del_init(&child_counter->list_entry);

		child_ctx->nr_counters--;

		hw_perf_restore(perf_flags);
		curr_rq_unlock_irq_restore(&flags);
	}

	parent_counter = child_counter->parent;
	/*
	 * It can happen that parent exits first, and has counters
	 * that are still around due to the child reference. These
	 * counters need to be zapped - but otherwise linger.
	 */
	if (parent_counter) {
		sync_child_counter(child_counter, parent_counter);
		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
					 list_entry) {
			if (sub->parent) {
				sync_child_counter(sub, sub->parent);
				kfree(sub);
			}
		}
	}

	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
		kfree(child_counter);
}

/*
 * When a child task exits, feed back counter values to parent counters.
 *
 * Note: we may be running in child context, but the PID is not hashed
 * anymore so new counters will not be added.
 */
void perf_counter_exit_task(struct task_struct *child)
{
	struct perf_counter *child_counter, *tmp;
	struct perf_counter_context *child_ctx;

	child_ctx = &child->perf_counter_ctx;

	if (likely(!child_ctx->nr_counters))
		return;

	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
				 list_entry)
		__perf_counter_exit_task(child, child_counter, child_ctx);
}

/*
 * Initialize the perf_counter context in task_struct
 */
void perf_counter_init_task(struct task_struct *child)
{
	struct perf_counter_context *child_ctx, *parent_ctx;
	struct perf_counter *counter;
	struct task_struct *parent = current;

	child_ctx  = &child->perf_counter_ctx;
	parent_ctx = &parent->perf_counter_ctx;

	__perf_counter_init_context(child_ctx, child);

	/*
	 * This is executed from the parent task context, so inherit
	 * counters that have been marked for cloning:
	 */
	if (likely(!parent_ctx->nr_counters))
		return;

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We don't have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
		if (!counter->hw_event.inherit)
			continue;

		if (inherit_group(counter, parent,
				  parent_ctx, child, child_ctx))
			break;
	}

	mutex_unlock(&parent_ctx->mutex);
}

static void __cpuinit perf_counter_init_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx;

	cpuctx = &per_cpu(perf_cpu_context, cpu);
	__perf_counter_init_context(&cpuctx->ctx, NULL);

	mutex_lock(&perf_resource_mutex);
	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
	mutex_unlock(&perf_resource_mutex);

	hw_perf_counter_setup(cpu);
}

#ifdef CONFIG_HOTPLUG_CPU
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &cpuctx->ctx;
	struct perf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(counter);
}

static void perf_counter_exit_cpu(int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &cpuctx->ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif

static int __cpuinit
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		perf_counter_init_cpu(cpu);
		break;

	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		perf_counter_exit_cpu(cpu);
		break;

	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata perf_cpu_nb = {
	.notifier_call		= perf_cpu_notify,
};

static int __init perf_counter_init(void)
{
	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
			(void *)(long)smp_processor_id());
	register_cpu_notifier(&perf_cpu_nb);

	return 0;
}
early_initcall(perf_counter_init);

static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_reserved_percpu);
}

static ssize_t
perf_set_reserve_percpu(struct sysdev_class *class,
			const char *buf,
			size_t count)
{
	struct perf_cpu_context *cpuctx;
	unsigned long val;
	int err, cpu, mpt;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > perf_max_counters)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_reserved_percpu = val;
	for_each_online_cpu(cpu) {
		cpuctx = &per_cpu(perf_cpu_context, cpu);
		spin_lock_irq(&cpuctx->ctx.lock);
		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
			  perf_max_counters - perf_reserved_percpu);
		cpuctx->max_pertask = mpt;
		spin_unlock_irq(&cpuctx->ctx.lock);
	}
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
{
	return sprintf(buf, "%d\n", perf_overcommit);
}

static ssize_t
perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
{
	unsigned long val;
	int err;

	err = strict_strtoul(buf, 10, &val);
	if (err)
		return err;
	if (val > 1)
		return -EINVAL;

	mutex_lock(&perf_resource_mutex);
	perf_overcommit = val;
	mutex_unlock(&perf_resource_mutex);

	return count;
}

static SYSDEV_CLASS_ATTR(
				reserve_percpu,
				0644,
				perf_show_reserve_percpu,
				perf_set_reserve_percpu
			);

static SYSDEV_CLASS_ATTR(
				overcommit,
				0644,
				perf_show_overcommit,
				perf_set_overcommit
			);

static struct attribute *perfclass_attrs[] = {
	&attr_reserve_percpu.attr,
	&attr_overcommit.attr,
	NULL
};

static struct attribute_group perfclass_attr_group = {
	.attrs			= perfclass_attrs,
	.name			= "perf_counters",
};

static int __init perf_counter_sysfs_init(void)
{
	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
				  &perfclass_attr_group);
}
device_initcall(perf_counter_sysfs_init);
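
/*
 * With the attribute group registered against cpu_sysdev_class above,
 * the two knobs are expected to appear as
 * /sys/devices/system/cpu/perf_counters/reserve_percpu and
 * /sys/devices/system/cpu/perf_counters/overcommit (the exact path
 * depends on where the cpu sysdev class is exposed in sysfs).
 */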