kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/cpu.h>
  12 #include <linux/smp.h>
  13 #include <linux/file.h>
  14 #include <linux/poll.h>
  15 #include <linux/sysfs.h>
  16 #include <linux/ptrace.h>
  17 #include <linux/percpu.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/kernel_stat.h>
  22 #include <linux/perf_counter.h>
  23 #include <linux/mm.h>
  24 #include <linux/vmstat.h>
  25
  26 /*
  27  * Each CPU has a list of per CPU counters:
  28  */
  29 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  30
  31 int perf_max_counters __read_mostly = 1;
  32 static int perf_reserved_percpu __read_mostly;
  33 static int perf_overcommit __read_mostly = 1;
  34
  35 /*
  36  * Mutex for (sysadmin-configurable) counter reservations:
  37  */
  38 static DEFINE_MUTEX(perf_resource_mutex);
  39
  40 /*
  41  * Architecture provided APIs - weak aliases:
  42  */
  43 extern __weak const struct hw_perf_counter_ops *
  44 hw_perf_counter_init(struct perf_counter *counter)
  45 {
  46         return NULL;
  47 }
  48
  49 u64 __weak hw_perf_save_disable(void)           { return 0; }
  50 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  51 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
  52 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  53                struct perf_cpu_context *cpuctx,
  54                struct perf_counter_context *ctx, int cpu)
  55 {
  56         return 0;
  57 }
  58
  59 void __weak perf_counter_print_debug(void)      { }
  60
  61 static void
  62 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  63 {
  64         struct perf_counter *group_leader = counter->group_leader;
  65
  66         /*
  67          * Depending on whether it is a standalone or sibling counter,
  68          * add it straight to the context's counter list, or to the group
  69          * leader's sibling list:
  70          */
  71         if (counter->group_leader == counter)
  72                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  73         else
  74                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  75 }
  76
  77 static void
  78 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  79 {
  80         struct perf_counter *sibling, *tmp;
  81
  82         list_del_init(&counter->list_entry);
  83
  84         /*
  85          * If this was a group counter with sibling counters then
  86          * upgrade the siblings to singleton counters by adding them
  87          * to the context list directly:
  88          */
  89         list_for_each_entry_safe(sibling, tmp,
  90                                  &counter->sibling_list, list_entry) {
  91
  92                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
  93                 sibling->group_leader = sibling;
  94         }
  95 }
  96
  97 static void
  98 counter_sched_out(struct perf_counter *counter,
  99                   struct perf_cpu_context *cpuctx,
 100                   struct perf_counter_context *ctx)
 101 {
 102         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 103                 return;
 104
 105         counter->state = PERF_COUNTER_STATE_INACTIVE;
 106         counter->hw_ops->disable(counter);
 107         counter->oncpu = -1;
 108
 109         if (!is_software_counter(counter))
 110                 cpuctx->active_oncpu--;
 111         ctx->nr_active--;
 112         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
 113                 cpuctx->exclusive = 0;
 114 }
 115
 116 static void
 117 group_sched_out(struct perf_counter *group_counter,
 118                 struct perf_cpu_context *cpuctx,
 119                 struct perf_counter_context *ctx)
 120 {
 121         struct perf_counter *counter;
 122
 123         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 124                 return;
 125
 126         counter_sched_out(group_counter, cpuctx, ctx);
 127
 128         /*
 129          * Schedule out siblings (if any):
 130          */
 131         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 132                 counter_sched_out(counter, cpuctx, ctx);
 133
 134         if (group_counter->hw_event.exclusive)
 135                 cpuctx->exclusive = 0;
 136 }
 137
 138 /*
 139  * Cross CPU call to remove a performance counter
 140  *
 141  * We disable the counter on the hardware level first. After that we
 142  * remove it from the context list.
 143  */
 144 static void __perf_counter_remove_from_context(void *info)
 145 {
 146         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 147         struct perf_counter *counter = info;
 148         struct perf_counter_context *ctx = counter->ctx;
 149         unsigned long flags;
 150         u64 perf_flags;
 151
 152         /*
 153          * If this is a task context, we need to check whether it is
 154          * the current task context of this cpu. If not it has been
 155          * scheduled out before the smp call arrived.
 156          */
 157         if (ctx->task && cpuctx->task_ctx != ctx)
 158                 return;
 159
 160         curr_rq_lock_irq_save(&flags);
 161         spin_lock(&ctx->lock);
 162
 163         counter_sched_out(counter, cpuctx, ctx);
 164
 165         counter->task = NULL;
 166         ctx->nr_counters--;
 167
 168         /*
 169          * Protect the list operation against NMI by disabling the
 170          * counters on a global level. NOP for non NMI based counters.
 171          */
 172         perf_flags = hw_perf_save_disable();
 173         list_del_counter(counter, ctx);
 174         hw_perf_restore(perf_flags);
 175
 176         if (!ctx->task) {
 177                 /*
 178                  * Allow more per task counters with respect to the
 179                  * reservation:
 180                  */
 181                 cpuctx->max_pertask =
 182                         min(perf_max_counters - ctx->nr_counters,
 183                             perf_max_counters - perf_reserved_percpu);
 184         }
 185
 186         spin_unlock(&ctx->lock);
 187         curr_rq_unlock_irq_restore(&flags);
 188 }
 189
 190
 191 /*
 192  * Remove the counter from a task's (or a CPU's) list of counters.
 193  *
 194  * Must be called with counter->mutex and ctx->mutex held.
 195  *
 196  * CPU counters are removed with a smp call. For task counters we only
 197  * call when the task is on a CPU.
 198  */
 199 static void perf_counter_remove_from_context(struct perf_counter *counter)
 200 {
 201         struct perf_counter_context *ctx = counter->ctx;
 202         struct task_struct *task = ctx->task;
 203
 204         if (!task) {
 205                 /*
 206                  * Per cpu counters are removed via an smp call and
 207                  * the removal is always sucessful.
 208                  */
 209                 smp_call_function_single(counter->cpu,
 210                                          __perf_counter_remove_from_context,
 211                                          counter, 1);
 212                 return;
 213         }
 214
 215 retry:
 216         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 217                                  counter);
 218
 219         spin_lock_irq(&ctx->lock);
 220         /*
 221          * If the context is active we need to retry the smp call.
 222          */
 223         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 224                 spin_unlock_irq(&ctx->lock);
 225                 goto retry;
 226         }
 227
 228         /*
 229          * The lock prevents that this context is scheduled in so we
 230          * can remove the counter safely, if the call above did not
 231          * succeed.
 232          */
 233         if (!list_empty(&counter->list_entry)) {
 234                 ctx->nr_counters--;
 235                 list_del_counter(counter, ctx);
 236                 counter->task = NULL;
 237         }
 238         spin_unlock_irq(&ctx->lock);
 239 }
 240
 241 /*
 242  * Cross CPU call to disable a performance counter
 243  */
 244 static void __perf_counter_disable(void *info)
 245 {
 246         struct perf_counter *counter = info;
 247         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 248         struct perf_counter_context *ctx = counter->ctx;
 249         unsigned long flags;
 250
 251         /*
 252          * If this is a per-task counter, need to check whether this
 253          * counter's task is the current task on this cpu.
 254          */
 255         if (ctx->task && cpuctx->task_ctx != ctx)
 256                 return;
 257
 258         curr_rq_lock_irq_save(&flags);
 259         spin_lock(&ctx->lock);
 260
 261         /*
 262          * If the counter is on, turn it off.
 263          * If it is in error state, leave it in error state.
 264          */
 265         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 266                 if (counter == counter->group_leader)
 267                         group_sched_out(counter, cpuctx, ctx);
 268                 else
 269                         counter_sched_out(counter, cpuctx, ctx);
 270                 counter->state = PERF_COUNTER_STATE_OFF;
 271         }
 272
 273         spin_unlock(&ctx->lock);
 274         curr_rq_unlock_irq_restore(&flags);
 275 }
 276
 277 /*
 278  * Disable a counter.
 279  */
 280 static void perf_counter_disable(struct perf_counter *counter)
 281 {
 282         struct perf_counter_context *ctx = counter->ctx;
 283         struct task_struct *task = ctx->task;
 284
 285         if (!task) {
 286                 /*
 287                  * Disable the counter on the cpu that it's on
 288                  */
 289                 smp_call_function_single(counter->cpu, __perf_counter_disable,
 290                                          counter, 1);
 291                 return;
 292         }
 293
 294  retry:
 295         task_oncpu_function_call(task, __perf_counter_disable, counter);
 296
 297         spin_lock_irq(&ctx->lock);
 298         /*
 299          * If the counter is still active, we need to retry the cross-call.
 300          */
 301         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 302                 spin_unlock_irq(&ctx->lock);
 303                 goto retry;
 304         }
 305
 306         /*
 307          * Since we have the lock this context can't be scheduled
 308          * in, so we can change the state safely.
 309          */
 310         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 311                 counter->state = PERF_COUNTER_STATE_OFF;
 312
 313         spin_unlock_irq(&ctx->lock);
 314 }
 315
 316 /*
 317  * Disable a counter and all its children.
 318  */
 319 static void perf_counter_disable_family(struct perf_counter *counter)
 320 {
 321         struct perf_counter *child;
 322
 323         perf_counter_disable(counter);
 324
 325         /*
 326          * Lock the mutex to protect the list of children
 327          */
 328         mutex_lock(&counter->mutex);
 329         list_for_each_entry(child, &counter->child_list, child_list)
 330                 perf_counter_disable(child);
 331         mutex_unlock(&counter->mutex);
 332 }
 333
 334 static int
 335 counter_sched_in(struct perf_counter *counter,
 336                  struct perf_cpu_context *cpuctx,
 337                  struct perf_counter_context *ctx,
 338                  int cpu)
 339 {
 340         if (counter->state <= PERF_COUNTER_STATE_OFF)
 341                 return 0;
 342
 343         counter->state = PERF_COUNTER_STATE_ACTIVE;
 344         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 345         /*
 346          * The new state must be visible before we turn it on in the hardware:
 347          */
 348         smp_wmb();
 349
 350         if (counter->hw_ops->enable(counter)) {
 351                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 352                 counter->oncpu = -1;
 353                 return -EAGAIN;
 354         }
 355
 356         if (!is_software_counter(counter))
 357                 cpuctx->active_oncpu++;
 358         ctx->nr_active++;
 359
 360         if (counter->hw_event.exclusive)
 361                 cpuctx->exclusive = 1;
 362
 363         return 0;
 364 }
 365
 366 /*
 367  * Return 1 for a group consisting entirely of software counters,
 368  * 0 if the group contains any hardware counters.
 369  */
 370 static int is_software_only_group(struct perf_counter *leader)
 371 {
 372         struct perf_counter *counter;
 373
 374         if (!is_software_counter(leader))
 375                 return 0;
 376         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 377                 if (!is_software_counter(counter))
 378                         return 0;
 379         return 1;
 380 }
 381
 382 /*
 383  * Work out whether we can put this counter group on the CPU now.
 384  */
 385 static int group_can_go_on(struct perf_counter *counter,
 386                            struct perf_cpu_context *cpuctx,
 387                            int can_add_hw)
 388 {
 389         /*
 390          * Groups consisting entirely of software counters can always go on.
 391          */
 392         if (is_software_only_group(counter))
 393                 return 1;
 394         /*
 395          * If an exclusive group is already on, no other hardware
 396          * counters can go on.
 397          */
 398         if (cpuctx->exclusive)
 399                 return 0;
 400         /*
 401          * If this group is exclusive and there are already
 402          * counters on the CPU, it can't go on.
 403          */
 404         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
 405                 return 0;
 406         /*
 407          * Otherwise, try to add it if all previous groups were able
 408          * to go on.
 409          */
 410         return can_add_hw;
 411 }
 412
 413 /*
 414  * Cross CPU call to install and enable a performance counter
 415  */
 416 static void __perf_install_in_context(void *info)
 417 {
 418         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 419         struct perf_counter *counter = info;
 420         struct perf_counter_context *ctx = counter->ctx;
 421         struct perf_counter *leader = counter->group_leader;
 422         int cpu = smp_processor_id();
 423         unsigned long flags;
 424         u64 perf_flags;
 425         int err;
 426
 427         /*
 428          * If this is a task context, we need to check whether it is
 429          * the current task context of this cpu. If not it has been
 430          * scheduled out before the smp call arrived.
 431          */
 432         if (ctx->task && cpuctx->task_ctx != ctx)
 433                 return;
 434
 435         curr_rq_lock_irq_save(&flags);
 436         spin_lock(&ctx->lock);
 437
 438         /*
 439          * Protect the list operation against NMI by disabling the
 440          * counters on a global level. NOP for non NMI based counters.
 441          */
 442         perf_flags = hw_perf_save_disable();
 443
 444         list_add_counter(counter, ctx);
 445         ctx->nr_counters++;
 446         counter->prev_state = PERF_COUNTER_STATE_OFF;
 447
 448         /*
 449          * Don't put the counter on if it is disabled or if
 450          * it is in a group and the group isn't on.
 451          */
 452         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
 453             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
 454                 goto unlock;
 455
 456         /*
 457          * An exclusive counter can't go on if there are already active
 458          * hardware counters, and no hardware counter can go on if there
 459          * is already an exclusive counter on.
 460          */
 461         if (!group_can_go_on(counter, cpuctx, 1))
 462                 err = -EEXIST;
 463         else
 464                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 465
 466         if (err) {
 467                 /*
 468                  * This counter couldn't go on.  If it is in a group
 469                  * then we have to pull the whole group off.
 470                  * If the counter group is pinned then put it in error state.
 471                  */
 472                 if (leader != counter)
 473                         group_sched_out(leader, cpuctx, ctx);
 474                 if (leader->hw_event.pinned)
 475                         leader->state = PERF_COUNTER_STATE_ERROR;
 476         }
 477
 478         if (!err && !ctx->task && cpuctx->max_pertask)
 479                 cpuctx->max_pertask--;
 480
 481  unlock:
 482         hw_perf_restore(perf_flags);
 483
 484         spin_unlock(&ctx->lock);
 485         curr_rq_unlock_irq_restore(&flags);
 486 }
 487
 488 /*
 489  * Attach a performance counter to a context
 490  *
 491  * First we add the counter to the list with the hardware enable bit
 492  * in counter->hw_config cleared.
 493  *
 494  * If the counter is attached to a task which is on a CPU we use a smp
 495  * call to enable it in the task context. The task might have been
 496  * scheduled away, but we check this in the smp call again.
 497  *
 498  * Must be called with ctx->mutex held.
 499  */
 500 static void
 501 perf_install_in_context(struct perf_counter_context *ctx,
 502                         struct perf_counter *counter,
 503                         int cpu)
 504 {
 505         struct task_struct *task = ctx->task;
 506
 507         if (!task) {
 508                 /*
 509                  * Per cpu counters are installed via an smp call and
 510                  * the install is always sucessful.
 511                  */
 512                 smp_call_function_single(cpu, __perf_install_in_context,
 513                                          counter, 1);
 514                 return;
 515         }
 516
 517         counter->task = task;
 518 retry:
 519         task_oncpu_function_call(task, __perf_install_in_context,
 520                                  counter);
 521
 522         spin_lock_irq(&ctx->lock);
 523         /*
 524          * we need to retry the smp call.
 525          */
 526         if (ctx->is_active && list_empty(&counter->list_entry)) {
 527                 spin_unlock_irq(&ctx->lock);
 528                 goto retry;
 529         }
 530
 531         /*
 532          * The lock prevents that this context is scheduled in so we
 533          * can add the counter safely, if it the call above did not
 534          * succeed.
 535          */
 536         if (list_empty(&counter->list_entry)) {
 537                 list_add_counter(counter, ctx);
 538                 ctx->nr_counters++;
 539         }
 540         spin_unlock_irq(&ctx->lock);
 541 }
 542
 543 /*
 544  * Cross CPU call to enable a performance counter
 545  */
 546 static void __perf_counter_enable(void *info)
 547 {
 548         struct perf_counter *counter = info;
 549         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 550         struct perf_counter_context *ctx = counter->ctx;
 551         struct perf_counter *leader = counter->group_leader;
 552         unsigned long flags;
 553         int err;
 554
 555         /*
 556          * If this is a per-task counter, need to check whether this
 557          * counter's task is the current task on this cpu.
 558          */
 559         if (ctx->task && cpuctx->task_ctx != ctx)
 560                 return;
 561
 562         curr_rq_lock_irq_save(&flags);
 563         spin_lock(&ctx->lock);
 564
 565         counter->prev_state = counter->state;
 566         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 567                 goto unlock;
 568         counter->state = PERF_COUNTER_STATE_INACTIVE;
 569
 570         /*
 571          * If the counter is in a group and isn't the group leader,
 572          * then don't put it on unless the group is on.
 573          */
 574         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 575                 goto unlock;
 576
 577         if (!group_can_go_on(counter, cpuctx, 1))
 578                 err = -EEXIST;
 579         else
 580                 err = counter_sched_in(counter, cpuctx, ctx,
 581                                        smp_processor_id());
 582
 583         if (err) {
 584                 /*
 585                  * If this counter can't go on and it's part of a
 586                  * group, then the whole group has to come off.
 587                  */
 588                 if (leader != counter)
 589                         group_sched_out(leader, cpuctx, ctx);
 590                 if (leader->hw_event.pinned)
 591                         leader->state = PERF_COUNTER_STATE_ERROR;
 592         }
 593
 594  unlock:
 595         spin_unlock(&ctx->lock);
 596         curr_rq_unlock_irq_restore(&flags);
 597 }
 598
 599 /*
 600  * Enable a counter.
 601  */
 602 static void perf_counter_enable(struct perf_counter *counter)
 603 {
 604         struct perf_counter_context *ctx = counter->ctx;
 605         struct task_struct *task = ctx->task;
 606
 607         if (!task) {
 608                 /*
 609                  * Enable the counter on the cpu that it's on
 610                  */
 611                 smp_call_function_single(counter->cpu, __perf_counter_enable,
 612                                          counter, 1);
 613                 return;
 614         }
 615
 616         spin_lock_irq(&ctx->lock);
 617         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 618                 goto out;
 619
 620         /*
 621          * If the counter is in error state, clear that first.
 622          * That way, if we see the counter in error state below, we
 623          * know that it has gone back into error state, as distinct
 624          * from the task having been scheduled away before the
 625          * cross-call arrived.
 626          */
 627         if (counter->state == PERF_COUNTER_STATE_ERROR)
 628                 counter->state = PERF_COUNTER_STATE_OFF;
 629
 630  retry:
 631         spin_unlock_irq(&ctx->lock);
 632         task_oncpu_function_call(task, __perf_counter_enable, counter);
 633
 634         spin_lock_irq(&ctx->lock);
 635
 636         /*
 637          * If the context is active and the counter is still off,
 638          * we need to retry the cross-call.
 639          */
 640         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
 641                 goto retry;
 642
 643         /*
 644          * Since we have the lock this context can't be scheduled
 645          * in, so we can change the state safely.
 646          */
 647         if (counter->state == PERF_COUNTER_STATE_OFF)
 648                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 649  out:
 650         spin_unlock_irq(&ctx->lock);
 651 }
 652
 653 /*
 654  * Enable a counter and all its children.
 655  */
 656 static void perf_counter_enable_family(struct perf_counter *counter)
 657 {
 658         struct perf_counter *child;
 659
 660         perf_counter_enable(counter);
 661
 662         /*
 663          * Lock the mutex to protect the list of children
 664          */
 665         mutex_lock(&counter->mutex);
 666         list_for_each_entry(child, &counter->child_list, child_list)
 667                 perf_counter_enable(child);
 668         mutex_unlock(&counter->mutex);
 669 }
 670
 671 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 672                               struct perf_cpu_context *cpuctx)
 673 {
 674         struct perf_counter *counter;
 675         u64 flags;
 676
 677         spin_lock(&ctx->lock);
 678         ctx->is_active = 0;
 679         if (likely(!ctx->nr_counters))
 680                 goto out;
 681
 682         flags = hw_perf_save_disable();
 683         if (ctx->nr_active) {
 684                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 685                         group_sched_out(counter, cpuctx, ctx);
 686         }
 687         hw_perf_restore(flags);
 688  out:
 689         spin_unlock(&ctx->lock);
 690 }
 691
 692 /*
 693  * Called from scheduler to remove the counters of the current task,
 694  * with interrupts disabled.
 695  *
 696  * We stop each counter and update the counter value in counter->count.
 697  *
 698  * This does not protect us against NMI, but disable()
 699  * sets the disabled bit in the control field of counter _before_
 700  * accessing the counter control register. If a NMI hits, then it will
 701  * not restart the counter.
 702  */
 703 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 704 {
 705         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 706         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 707
 708         if (likely(!cpuctx->task_ctx))
 709                 return;
 710
 711         __perf_counter_sched_out(ctx, cpuctx);
 712
 713         cpuctx->task_ctx = NULL;
 714 }
 715
 716 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 717 {
 718         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 719 }
 720
 721 static int
 722 group_sched_in(struct perf_counter *group_counter,
 723                struct perf_cpu_context *cpuctx,
 724                struct perf_counter_context *ctx,
 725                int cpu)
 726 {
 727         struct perf_counter *counter, *partial_group;
 728         int ret;
 729
 730         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 731                 return 0;
 732
 733         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 734         if (ret)
 735                 return ret < 0 ? ret : 0;
 736
 737         group_counter->prev_state = group_counter->state;
 738         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 739                 return -EAGAIN;
 740
 741         /*
 742          * Schedule in siblings as one group (if any):
 743          */
 744         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 745                 counter->prev_state = counter->state;
 746                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 747                         partial_group = counter;
 748                         goto group_error;
 749                 }
 750         }
 751
 752         return 0;
 753
 754 group_error:
 755         /*
 756          * Groups can be scheduled in as one unit only, so undo any
 757          * partial group before returning:
 758          */
 759         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 760                 if (counter == partial_group)
 761                         break;
 762                 counter_sched_out(counter, cpuctx, ctx);
 763         }
 764         counter_sched_out(group_counter, cpuctx, ctx);
 765
 766         return -EAGAIN;
 767 }
 768
 769 static void
 770 __perf_counter_sched_in(struct perf_counter_context *ctx,
 771                         struct perf_cpu_context *cpuctx, int cpu)
 772 {
 773         struct perf_counter *counter;
 774         u64 flags;
 775         int can_add_hw = 1;
 776
 777         spin_lock(&ctx->lock);
 778         ctx->is_active = 1;
 779         if (likely(!ctx->nr_counters))
 780                 goto out;
 781
 782         flags = hw_perf_save_disable();
 783
 784         /*
 785          * First go through the list and put on any pinned groups
 786          * in order to give them the best chance of going on.
 787          */
 788         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 789                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 790                     !counter->hw_event.pinned)
 791                         continue;
 792                 if (counter->cpu != -1 && counter->cpu != cpu)
 793                         continue;
 794
 795                 if (group_can_go_on(counter, cpuctx, 1))
 796                         group_sched_in(counter, cpuctx, ctx, cpu);
 797
 798                 /*
 799                  * If this pinned group hasn't been scheduled,
 800                  * put it in error state.
 801                  */
 802                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 803                         counter->state = PERF_COUNTER_STATE_ERROR;
 804         }
 805
 806         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 807                 /*
 808                  * Ignore counters in OFF or ERROR state, and
 809                  * ignore pinned counters since we did them already.
 810                  */
 811                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 812                     counter->hw_event.pinned)
 813                         continue;
 814
 815                 /*
 816                  * Listen to the 'cpu' scheduling filter constraint
 817                  * of counters:
 818                  */
 819                 if (counter->cpu != -1 && counter->cpu != cpu)
 820                         continue;
 821
 822                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 823                         if (group_sched_in(counter, cpuctx, ctx, cpu))
 824                                 can_add_hw = 0;
 825                 }
 826         }
 827         hw_perf_restore(flags);
 828  out:
 829         spin_unlock(&ctx->lock);
 830 }
 831
 832 /*
 833  * Called from scheduler to add the counters of the current task
 834  * with interrupts disabled.
 835  *
 836  * We restore the counter value and then enable it.
 837  *
 838  * This does not protect us against NMI, but enable()
 839  * sets the enabled bit in the control field of counter _before_
 840  * accessing the counter control register. If a NMI hits, then it will
 841  * keep the counter running.
 842  */
 843 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 844 {
 845         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 846         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 847
 848         __perf_counter_sched_in(ctx, cpuctx, cpu);
 849         cpuctx->task_ctx = ctx;
 850 }
 851
 852 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 853 {
 854         struct perf_counter_context *ctx = &cpuctx->ctx;
 855
 856         __perf_counter_sched_in(ctx, cpuctx, cpu);
 857 }
 858
 859 int perf_counter_task_disable(void)
 860 {
 861         struct task_struct *curr = current;
 862         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 863         struct perf_counter *counter;
 864         unsigned long flags;
 865         u64 perf_flags;
 866         int cpu;
 867
 868         if (likely(!ctx->nr_counters))
 869                 return 0;
 870
 871         curr_rq_lock_irq_save(&flags);
 872         cpu = smp_processor_id();
 873
 874         /* force the update of the task clock: */
 875         __task_delta_exec(curr, 1);
 876
 877         perf_counter_task_sched_out(curr, cpu);
 878
 879         spin_lock(&ctx->lock);
 880
 881         /*
 882          * Disable all the counters:
 883          */
 884         perf_flags = hw_perf_save_disable();
 885
 886         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 887                 if (counter->state != PERF_COUNTER_STATE_ERROR)
 888                         counter->state = PERF_COUNTER_STATE_OFF;
 889         }
 890
 891         hw_perf_restore(perf_flags);
 892
 893         spin_unlock(&ctx->lock);
 894
 895         curr_rq_unlock_irq_restore(&flags);
 896
 897         return 0;
 898 }
 899
 900 int perf_counter_task_enable(void)
 901 {
 902         struct task_struct *curr = current;
 903         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 904         struct perf_counter *counter;
 905         unsigned long flags;
 906         u64 perf_flags;
 907         int cpu;
 908
 909         if (likely(!ctx->nr_counters))
 910                 return 0;
 911
 912         curr_rq_lock_irq_save(&flags);
 913         cpu = smp_processor_id();
 914
 915         /* force the update of the task clock: */
 916         __task_delta_exec(curr, 1);
 917
 918         perf_counter_task_sched_out(curr, cpu);
 919
 920         spin_lock(&ctx->lock);
 921
 922         /*
 923          * Disable all the counters:
 924          */
 925         perf_flags = hw_perf_save_disable();
 926
 927         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 928                 if (counter->state > PERF_COUNTER_STATE_OFF)
 929                         continue;
 930                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 931                 counter->hw_event.disabled = 0;
 932         }
 933         hw_perf_restore(perf_flags);
 934
 935         spin_unlock(&ctx->lock);
 936
 937         perf_counter_task_sched_in(curr, cpu);
 938
 939         curr_rq_unlock_irq_restore(&flags);
 940
 941         return 0;
 942 }
 943
 944 /*
 945  * Round-robin a context's counters:
 946  */
 947 static void rotate_ctx(struct perf_counter_context *ctx)
 948 {
 949         struct perf_counter *counter;
 950         u64 perf_flags;
 951
 952         if (!ctx->nr_counters)
 953                 return;
 954
 955         spin_lock(&ctx->lock);
 956         /*
 957          * Rotate the first entry last (works just fine for group counters too):
 958          */
 959         perf_flags = hw_perf_save_disable();
 960         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 961                 list_move_tail(&counter->list_entry, &ctx->counter_list);
 962                 break;
 963         }
 964         hw_perf_restore(perf_flags);
 965
 966         spin_unlock(&ctx->lock);
 967 }
 968
 969 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 970 {
 971         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 972         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 973         const int rotate_percpu = 0;
 974
 975         if (rotate_percpu)
 976                 perf_counter_cpu_sched_out(cpuctx);
 977         perf_counter_task_sched_out(curr, cpu);
 978
 979         if (rotate_percpu)
 980                 rotate_ctx(&cpuctx->ctx);
 981         rotate_ctx(ctx);
 982
 983         if (rotate_percpu)
 984                 perf_counter_cpu_sched_in(cpuctx, cpu);
 985         perf_counter_task_sched_in(curr, cpu);
 986 }
 987
 988 /*
 989  * Cross CPU call to read the hardware counter
 990  */
 991 static void __read(void *info)
 992 {
 993         struct perf_counter *counter = info;
 994         unsigned long flags;
 995
 996         curr_rq_lock_irq_save(&flags);
 997         counter->hw_ops->read(counter);
 998         curr_rq_unlock_irq_restore(&flags);
 999 }
1000
1001 static u64 perf_counter_read(struct perf_counter *counter)
1002 {
1003         /*
1004          * If counter is enabled and currently active on a CPU, update the
1005          * value in the counter structure:
1006          */
1007         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1008                 smp_call_function_single(counter->oncpu,
1009                                          __read, counter, 1);
1010         }
1011
1012         return atomic64_read(&counter->count);
1013 }
1014
1015 /*
1016  * Cross CPU call to switch performance data pointers
1017  */
1018 static void __perf_switch_irq_data(void *info)
1019 {
1020         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1021         struct perf_counter *counter = info;
1022         struct perf_counter_context *ctx = counter->ctx;
1023         struct perf_data *oldirqdata = counter->irqdata;
1024
1025         /*
1026          * If this is a task context, we need to check whether it is
1027          * the current task context of this cpu. If not it has been
1028          * scheduled out before the smp call arrived.
1029          */
1030         if (ctx->task) {
1031                 if (cpuctx->task_ctx != ctx)
1032                         return;
1033                 spin_lock(&ctx->lock);
1034         }
1035
1036         /* Change the pointer NMI safe */
1037         atomic_long_set((atomic_long_t *)&counter->irqdata,
1038                         (unsigned long) counter->usrdata);
1039         counter->usrdata = oldirqdata;
1040
1041         if (ctx->task)
1042                 spin_unlock(&ctx->lock);
1043 }
1044
1045 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1046 {
1047         struct perf_counter_context *ctx = counter->ctx;
1048         struct perf_data *oldirqdata = counter->irqdata;
1049         struct task_struct *task = ctx->task;
1050
1051         if (!task) {
1052                 smp_call_function_single(counter->cpu,
1053                                          __perf_switch_irq_data,
1054                                          counter, 1);
1055                 return counter->usrdata;
1056         }
1057
1058 retry:
1059         spin_lock_irq(&ctx->lock);
1060         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1061                 counter->irqdata = counter->usrdata;
1062                 counter->usrdata = oldirqdata;
1063                 spin_unlock_irq(&ctx->lock);
1064                 return oldirqdata;
1065         }
1066         spin_unlock_irq(&ctx->lock);
1067         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1068         /* Might have failed, because task was scheduled out */
1069         if (counter->irqdata == oldirqdata)
1070                 goto retry;
1071
1072         return counter->usrdata;
1073 }
1074
1075 static void put_context(struct perf_counter_context *ctx)
1076 {
1077         if (ctx->task)
1078                 put_task_struct(ctx->task);
1079 }
1080
1081 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1082 {
1083         struct perf_cpu_context *cpuctx;
1084         struct perf_counter_context *ctx;
1085         struct task_struct *task;
1086
1087         /*
1088          * If cpu is not a wildcard then this is a percpu counter:
1089          */
1090         if (cpu != -1) {
1091                 /* Must be root to operate on a CPU counter: */
1092                 if (!capable(CAP_SYS_ADMIN))
1093                         return ERR_PTR(-EACCES);
1094
1095                 if (cpu < 0 || cpu > num_possible_cpus())
1096                         return ERR_PTR(-EINVAL);
1097
1098                 /*
1099                  * We could be clever and allow to attach a counter to an
1100                  * offline CPU and activate it when the CPU comes up, but
1101                  * that's for later.
1102                  */
1103                 if (!cpu_isset(cpu, cpu_online_map))
1104                         return ERR_PTR(-ENODEV);
1105
1106                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1107                 ctx = &cpuctx->ctx;
1108
1109                 return ctx;
1110         }
1111
1112         rcu_read_lock();
1113         if (!pid)
1114                 task = current;
1115         else
1116                 task = find_task_by_vpid(pid);
1117         if (task)
1118                 get_task_struct(task);
1119         rcu_read_unlock();
1120
1121         if (!task)
1122                 return ERR_PTR(-ESRCH);
1123
1124         ctx = &task->perf_counter_ctx;
1125         ctx->task = task;
1126
1127         /* Reuse ptrace permission checks for now. */
1128         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1129                 put_context(ctx);
1130                 return ERR_PTR(-EACCES);
1131         }
1132
1133         return ctx;
1134 }
1135
1136 /*
1137  * Called when the last reference to the file is gone.
1138  */
1139 static int perf_release(struct inode *inode, struct file *file)
1140 {
1141         struct perf_counter *counter = file->private_data;
1142         struct perf_counter_context *ctx = counter->ctx;
1143
1144         file->private_data = NULL;
1145
1146         mutex_lock(&ctx->mutex);
1147         mutex_lock(&counter->mutex);
1148
1149         perf_counter_remove_from_context(counter);
1150
1151         mutex_unlock(&counter->mutex);
1152         mutex_unlock(&ctx->mutex);
1153
1154         kfree(counter);
1155         put_context(ctx);
1156
1157         return 0;
1158 }
1159
1160 /*
1161  * Read the performance counter - simple non blocking version for now
1162  */
1163 static ssize_t
1164 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1165 {
1166         u64 cntval;
1167
1168         if (count != sizeof(cntval))
1169                 return -EINVAL;
1170
1171         /*
1172          * Return end-of-file for a read on a counter that is in
1173          * error state (i.e. because it was pinned but it couldn't be
1174          * scheduled on to the CPU at some point).
1175          */
1176         if (counter->state == PERF_COUNTER_STATE_ERROR)
1177                 return 0;
1178
1179         mutex_lock(&counter->mutex);
1180         cntval = perf_counter_read(counter);
1181         mutex_unlock(&counter->mutex);
1182
1183         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1184 }
1185
1186 static ssize_t
1187 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1188 {
1189         if (!usrdata->len)
1190                 return 0;
1191
1192         count = min(count, (size_t)usrdata->len);
1193         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1194                 return -EFAULT;
1195
1196         /* Adjust the counters */
1197         usrdata->len -= count;
1198         if (!usrdata->len)
1199                 usrdata->rd_idx = 0;
1200         else
1201                 usrdata->rd_idx += count;
1202
1203         return count;
1204 }
1205
1206 static ssize_t
1207 perf_read_irq_data(struct perf_counter  *counter,
1208                    char __user          *buf,
1209                    size_t               count,
1210                    int                  nonblocking)
1211 {
1212         struct perf_data *irqdata, *usrdata;
1213         DECLARE_WAITQUEUE(wait, current);
1214         ssize_t res, res2;
1215
1216         irqdata = counter->irqdata;
1217         usrdata = counter->usrdata;
1218
1219         if (usrdata->len + irqdata->len >= count)
1220                 goto read_pending;
1221
1222         if (nonblocking)
1223                 return -EAGAIN;
1224
1225         spin_lock_irq(&counter->waitq.lock);
1226         __add_wait_queue(&counter->waitq, &wait);
1227         for (;;) {
1228                 set_current_state(TASK_INTERRUPTIBLE);
1229                 if (usrdata->len + irqdata->len >= count)
1230                         break;
1231
1232                 if (signal_pending(current))
1233                         break;
1234
1235                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1236                         break;
1237
1238                 spin_unlock_irq(&counter->waitq.lock);
1239                 schedule();
1240                 spin_lock_irq(&counter->waitq.lock);
1241         }
1242         __remove_wait_queue(&counter->waitq, &wait);
1243         __set_current_state(TASK_RUNNING);
1244         spin_unlock_irq(&counter->waitq.lock);
1245
1246         if (usrdata->len + irqdata->len < count &&
1247             counter->state != PERF_COUNTER_STATE_ERROR)
1248                 return -ERESTARTSYS;
1249 read_pending:
1250         mutex_lock(&counter->mutex);
1251
1252         /* Drain pending data first: */
1253         res = perf_copy_usrdata(usrdata, buf, count);
1254         if (res < 0 || res == count)
1255                 goto out;
1256
1257         /* Switch irq buffer: */
1258         usrdata = perf_switch_irq_data(counter);
1259         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1260         if (res2 < 0) {
1261                 if (!res)
1262                         res = -EFAULT;
1263         } else {
1264                 res += res2;
1265         }
1266 out:
1267         mutex_unlock(&counter->mutex);
1268
1269         return res;
1270 }
1271
1272 static ssize_t
1273 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1274 {
1275         struct perf_counter *counter = file->private_data;
1276
1277         switch (counter->hw_event.record_type) {
1278         case PERF_RECORD_SIMPLE:
1279                 return perf_read_hw(counter, buf, count);
1280
1281         case PERF_RECORD_IRQ:
1282         case PERF_RECORD_GROUP:
1283                 return perf_read_irq_data(counter, buf, count,
1284                                           file->f_flags & O_NONBLOCK);
1285         }
1286         return -EINVAL;
1287 }
1288
1289 static unsigned int perf_poll(struct file *file, poll_table *wait)
1290 {
1291         struct perf_counter *counter = file->private_data;
1292         unsigned int events = 0;
1293         unsigned long flags;
1294
1295         poll_wait(file, &counter->waitq, wait);
1296
1297         spin_lock_irqsave(&counter->waitq.lock, flags);
1298         if (counter->usrdata->len || counter->irqdata->len)
1299                 events |= POLLIN;
1300         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1301
1302         return events;
1303 }
1304
1305 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1306 {
1307         struct perf_counter *counter = file->private_data;
1308         int err = 0;
1309
1310         switch (cmd) {
1311         case PERF_COUNTER_IOC_ENABLE:
1312                 perf_counter_enable_family(counter);
1313                 break;
1314         case PERF_COUNTER_IOC_DISABLE:
1315                 perf_counter_disable_family(counter);
1316                 break;
1317         default:
1318                 err = -ENOTTY;
1319         }
1320         return err;
1321 }
1322
1323 static const struct file_operations perf_fops = {
1324         .release                = perf_release,
1325         .read                   = perf_read,
1326         .poll                   = perf_poll,
1327         .unlocked_ioctl         = perf_ioctl,
1328         .compat_ioctl           = perf_ioctl,
1329 };
1330
1331 /*
1332  * Generic software counter infrastructure
1333  */
1334
1335 static void perf_swcounter_update(struct perf_counter *counter)
1336 {
1337         struct hw_perf_counter *hwc = &counter->hw;
1338         u64 prev, now;
1339         s64 delta;
1340
1341 again:
1342         prev = atomic64_read(&hwc->prev_count);
1343         now = atomic64_read(&hwc->count);
1344         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1345                 goto again;
1346
1347         delta = now - prev;
1348
1349         atomic64_add(delta, &counter->count);
1350         atomic64_sub(delta, &hwc->period_left);
1351 }
1352
1353 static void perf_swcounter_set_period(struct perf_counter *counter)
1354 {
1355         struct hw_perf_counter *hwc = &counter->hw;
1356         s64 left = atomic64_read(&hwc->period_left);
1357         s64 period = hwc->irq_period;
1358
1359         if (unlikely(left <= -period)) {
1360                 left = period;
1361                 atomic64_set(&hwc->period_left, left);
1362         }
1363
1364         if (unlikely(left <= 0)) {
1365                 left += period;
1366                 atomic64_add(period, &hwc->period_left);
1367         }
1368
1369         atomic64_set(&hwc->prev_count, -left);
1370         atomic64_set(&hwc->count, -left);
1371 }
1372
/*
 * Account the events counted so far, then program the next period.
 */
static void perf_swcounter_save_and_restart(struct perf_counter *counter)
{
        /* The order matters: set_period reads what update just wrote. */
        perf_swcounter_update(counter);
        perf_swcounter_set_period(counter);
}
1378
1379 static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
1380 {
1381         struct perf_data *irqdata = counter->irqdata;
1382
1383         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1384                 irqdata->overrun++;
1385         } else {
1386                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1387
1388                 *p = data;
1389                 irqdata->len += sizeof(u64);
1390         }
1391 }
1392
1393 static void perf_swcounter_handle_group(struct perf_counter *sibling)
1394 {
1395         struct perf_counter *counter, *group_leader = sibling->group_leader;
1396
1397         list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
1398                 counter->hw_ops->read(counter);
1399                 perf_swcounter_store_irq(sibling, counter->hw_event.type);
1400                 perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
1401         }
1402 }
1403
1404 static void perf_swcounter_interrupt(struct perf_counter *counter,
1405                                      int nmi, struct pt_regs *regs)
1406 {
1407         switch (counter->hw_event.record_type) {
1408         case PERF_RECORD_SIMPLE:
1409                 break;
1410
1411         case PERF_RECORD_IRQ:
1412                 perf_swcounter_store_irq(counter, instruction_pointer(regs));
1413                 break;
1414
1415         case PERF_RECORD_GROUP:
1416                 perf_swcounter_handle_group(counter);
1417                 break;
1418         }
1419
1420         if (nmi) {
1421                 counter->wakeup_pending = 1;
1422                 set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
1423         } else
1424                 wake_up(&counter->waitq);
1425 }
1426
1427 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
1428 {
1429         struct perf_counter *counter;
1430         struct pt_regs *regs;
1431
1432         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
1433         counter->hw_ops->read(counter);
1434
1435         regs = get_irq_regs();
1436         /*
1437          * In case we exclude kernel IPs or are somehow not in interrupt
1438          * context, provide the next best thing, the user IP.
1439          */
1440         if ((counter->hw_event.exclude_kernel || !regs) &&
1441                         !counter->hw_event.exclude_user)
1442                 regs = task_pt_regs(current);
1443
1444         if (regs)
1445                 perf_swcounter_interrupt(counter, 0, regs);
1446
1447         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
1448
1449         return HRTIMER_RESTART;
1450 }
1451
/*
 * Handle the overflow of a sampling software counter: account the
 * elapsed period and re-arm, then deliver the sample/wakeup.
 */
static void perf_swcounter_overflow(struct perf_counter *counter,
                                    int nmi, struct pt_regs *regs)
{
        perf_swcounter_save_and_restart(counter);
        perf_swcounter_interrupt(counter, nmi, regs);
}
1458
1459 static int perf_swcounter_match(struct perf_counter *counter,
1460                                 enum hw_event_types event,
1461                                 struct pt_regs *regs)
1462 {
1463         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1464                 return 0;
1465
1466         if (counter->hw_event.raw)
1467                 return 0;
1468
1469         if (counter->hw_event.type != event)
1470                 return 0;
1471
1472         if (counter->hw_event.exclude_user && user_mode(regs))
1473                 return 0;
1474
1475         if (counter->hw_event.exclude_kernel && !user_mode(regs))
1476                 return 0;
1477
1478         return 1;
1479 }
1480
1481 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
1482                                int nmi, struct pt_regs *regs)
1483 {
1484         int neg = atomic64_add_negative(nr, &counter->hw.count);
1485         if (counter->hw.irq_period && !neg)
1486                 perf_swcounter_overflow(counter, nmi, regs);
1487 }
1488
1489 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1490                                      enum hw_event_types event, u64 nr,
1491                                      int nmi, struct pt_regs *regs)
1492 {
1493         struct perf_counter *counter;
1494         unsigned long flags;
1495
1496         if (list_empty(&ctx->counter_list))
1497                 return;
1498
1499         spin_lock_irqsave(&ctx->lock, flags);
1500
1501         /*
1502          * XXX: make counter_list RCU safe
1503          */
1504         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1505                 if (perf_swcounter_match(counter, event, regs))
1506                         perf_swcounter_add(counter, nr, nmi, regs);
1507         }
1508
1509         spin_unlock_irqrestore(&ctx->lock, flags);
1510 }
1511
1512 void perf_swcounter_event(enum hw_event_types event, u64 nr,
1513                           int nmi, struct pt_regs *regs)
1514 {
1515         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1516
1517         perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
1518         if (cpuctx->task_ctx)
1519                 perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
1520
1521         put_cpu_var(perf_cpu_context);
1522 }
1523
/*
 * ->read() of the generic software counters: fold the raw count into
 * counter->count.
 */
static void perf_swcounter_read(struct perf_counter *counter)
{
        perf_swcounter_update(counter);
}
1528
/*
 * ->enable() of the generic software counters: arm the counter for its
 * next period. Always returns 0.
 */
static int perf_swcounter_enable(struct perf_counter *counter)
{
        perf_swcounter_set_period(counter);

        return 0;
}
1534
/*
 * ->disable() of the generic software counters: account whatever was
 * counted up to now.
 */
static void perf_swcounter_disable(struct perf_counter *counter)
{
        perf_swcounter_update(counter);
}
1539
1540 static const struct hw_perf_counter_ops perf_ops_generic = {
1541         .enable         = perf_swcounter_enable,
1542         .disable        = perf_swcounter_disable,
1543         .read           = perf_swcounter_read,
1544 };
1545
1546 /*
1547  * Software counter: cpu wall time clock
1548  */
1549
1550 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1551 {
1552         int cpu = raw_smp_processor_id();
1553         s64 prev;
1554         u64 now;
1555
1556         now = cpu_clock(cpu);
1557         prev = atomic64_read(&counter->hw.prev_count);
1558         atomic64_set(&counter->hw.prev_count, now);
1559         atomic64_add(now - prev, &counter->count);
1560 }
1561
1562 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1563 {
1564         struct hw_perf_counter *hwc = &counter->hw;
1565         int cpu = raw_smp_processor_id();
1566
1567         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
1568         if (hwc->irq_period) {
1569                 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1570                 hwc->hrtimer.function = perf_swcounter_hrtimer;
1571                 __hrtimer_start_range_ns(&hwc->hrtimer,
1572                                 ns_to_ktime(hwc->irq_period), 0,
1573                                 HRTIMER_MODE_REL, 0);
1574         }
1575
1576         return 0;
1577 }
1578
1579 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1580 {
1581         hrtimer_cancel(&counter->hw.hrtimer);
1582         cpu_clock_perf_counter_update(counter);
1583 }
1584
/*
 * ->read() of the cpu clock counter: bring counter->count up to date.
 */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
        cpu_clock_perf_counter_update(counter);
}
1589
1590 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1591         .enable         = cpu_clock_perf_counter_enable,
1592         .disable        = cpu_clock_perf_counter_disable,
1593         .read           = cpu_clock_perf_counter_read,
1594 };
1595
1596 /*
1597  * Software counter: task time clock
1598  */
1599
1600 /*
1601  * Called from within the scheduler:
1602  */
1603 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1604 {
1605         struct task_struct *curr = counter->task;
1606         u64 delta;
1607
1608         delta = __task_delta_exec(curr, update);
1609
1610         return curr->se.sum_exec_runtime + delta;
1611 }
1612
1613 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1614 {
1615         u64 prev;
1616         s64 delta;
1617
1618         prev = atomic64_read(&counter->hw.prev_count);
1619
1620         atomic64_set(&counter->hw.prev_count, now);
1621
1622         delta = now - prev;
1623
1624         atomic64_add(delta, &counter->count);
1625 }
1626
1627 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1628 {
1629         struct hw_perf_counter *hwc = &counter->hw;
1630
1631         atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
1632         if (hwc->irq_period) {
1633                 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1634                 hwc->hrtimer.function = perf_swcounter_hrtimer;
1635                 __hrtimer_start_range_ns(&hwc->hrtimer,
1636                                 ns_to_ktime(hwc->irq_period), 0,
1637                                 HRTIMER_MODE_REL, 0);
1638         }
1639
1640         return 0;
1641 }
1642
1643 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1644 {
1645         hrtimer_cancel(&counter->hw.hrtimer);
1646         task_clock_perf_counter_update(counter,
1647                         task_clock_perf_counter_val(counter, 0));
1648 }
1649
1650 static void task_clock_perf_counter_read(struct perf_counter *counter)
1651 {
1652         task_clock_perf_counter_update(counter,
1653                         task_clock_perf_counter_val(counter, 1));
1654 }
1655
1656 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1657         .enable         = task_clock_perf_counter_enable,
1658         .disable        = task_clock_perf_counter_disable,
1659         .read           = task_clock_perf_counter_read,
1660 };
1661
1662 /*
1663  * Software counter: context switches
1664  */
1665
1666 static u64 get_context_switches(struct perf_counter *counter)
1667 {
1668         struct task_struct *curr = counter->ctx->task;
1669
1670         if (curr)
1671                 return curr->nvcsw + curr->nivcsw;
1672         return cpu_nr_switches(smp_processor_id());
1673 }
1674
1675 static void context_switches_perf_counter_update(struct perf_counter *counter)
1676 {
1677         u64 prev, now;
1678         s64 delta;
1679
1680         prev = atomic64_read(&counter->hw.prev_count);
1681         now = get_context_switches(counter);
1682
1683         atomic64_set(&counter->hw.prev_count, now);
1684
1685         delta = now - prev;
1686
1687         atomic64_add(delta, &counter->count);
1688 }
1689
/*
 * ->read() of the context switch counter: bring the count up to date.
 */
static void context_switches_perf_counter_read(struct perf_counter *counter)
{
        context_switches_perf_counter_update(counter);
}
1694
1695 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1696 {
1697         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1698                 atomic64_set(&counter->hw.prev_count,
1699                              get_context_switches(counter));
1700         return 0;
1701 }
1702
/*
 * ->disable() of the context switch counter: account the switches seen
 * up to now.
 */
static void context_switches_perf_counter_disable(struct perf_counter *counter)
{
        context_switches_perf_counter_update(counter);
}
1707
1708 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1709         .enable         = context_switches_perf_counter_enable,
1710         .disable        = context_switches_perf_counter_disable,
1711         .read           = context_switches_perf_counter_read,
1712 };
1713
1714 /*
1715  * Software counter: cpu migrations
1716  */
1717
1718 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1719 {
1720         struct task_struct *curr = counter->ctx->task;
1721
1722         if (curr)
1723                 return curr->se.nr_migrations;
1724         return cpu_nr_migrations(smp_processor_id());
1725 }
1726
1727 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1728 {
1729         u64 prev, now;
1730         s64 delta;
1731
1732         prev = atomic64_read(&counter->hw.prev_count);
1733         now = get_cpu_migrations(counter);
1734
1735         atomic64_set(&counter->hw.prev_count, now);
1736
1737         delta = now - prev;
1738
1739         atomic64_add(delta, &counter->count);
1740 }
1741
/*
 * ->read() of the cpu migration counter: bring the count up to date.
 */
static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
        cpu_migrations_perf_counter_update(counter);
}
1746
1747 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1748 {
1749         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1750                 atomic64_set(&counter->hw.prev_count,
1751                              get_cpu_migrations(counter));
1752         return 0;
1753 }
1754
/*
 * ->disable() of the cpu migration counter: account the migrations
 * seen up to now.
 */
static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
        cpu_migrations_perf_counter_update(counter);
}
1759
1760 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1761         .enable         = cpu_migrations_perf_counter_enable,
1762         .disable        = cpu_migrations_perf_counter_disable,
1763         .read           = cpu_migrations_perf_counter_read,
1764 };
1765
1766 static const struct hw_perf_counter_ops *
1767 sw_perf_counter_init(struct perf_counter *counter)
1768 {
1769         struct perf_counter_hw_event *hw_event = &counter->hw_event;
1770         const struct hw_perf_counter_ops *hw_ops = NULL;
1771         struct hw_perf_counter *hwc = &counter->hw;
1772
1773         /*
1774          * Software counters (currently) can't in general distinguish
1775          * between user, kernel and hypervisor events.
1776          * However, context switches and cpu migrations are considered
1777          * to be kernel events, and page faults are never hypervisor
1778          * events.
1779          */
1780         switch (counter->hw_event.type) {
1781         case PERF_COUNT_CPU_CLOCK:
1782                 hw_ops = &perf_ops_cpu_clock;
1783
1784                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1785                         hw_event->irq_period = 10000;
1786                 break;
1787         case PERF_COUNT_TASK_CLOCK:
1788                 /*
1789                  * If the user instantiates this as a per-cpu counter,
1790                  * use the cpu_clock counter instead.
1791                  */
1792                 if (counter->ctx->task)
1793                         hw_ops = &perf_ops_task_clock;
1794                 else
1795                         hw_ops = &perf_ops_cpu_clock;
1796
1797                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1798                         hw_event->irq_period = 10000;
1799                 break;
1800         case PERF_COUNT_PAGE_FAULTS:
1801         case PERF_COUNT_PAGE_FAULTS_MIN:
1802         case PERF_COUNT_PAGE_FAULTS_MAJ:
1803                 hw_ops = &perf_ops_generic;
1804                 break;
1805         case PERF_COUNT_CONTEXT_SWITCHES:
1806                 if (!counter->hw_event.exclude_kernel)
1807                         hw_ops = &perf_ops_context_switches;
1808                 break;
1809         case PERF_COUNT_CPU_MIGRATIONS:
1810                 if (!counter->hw_event.exclude_kernel)
1811                         hw_ops = &perf_ops_cpu_migrations;
1812                 break;
1813         default:
1814                 break;
1815         }
1816
1817         if (hw_ops)
1818                 hwc->irq_period = hw_event->irq_period;
1819
1820         return hw_ops;
1821 }
1822
1823 /*
1824  * Allocate and initialize a counter structure
1825  */
1826 static struct perf_counter *
1827 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1828                    int cpu,
1829                    struct perf_counter_context *ctx,
1830                    struct perf_counter *group_leader,
1831                    gfp_t gfpflags)
1832 {
1833         const struct hw_perf_counter_ops *hw_ops;
1834         struct perf_counter *counter;
1835
1836         counter = kzalloc(sizeof(*counter), gfpflags);
1837         if (!counter)
1838                 return NULL;
1839
1840         /*
1841          * Single counters are their own group leaders, with an
1842          * empty sibling list:
1843          */
1844         if (!group_leader)
1845                 group_leader = counter;
1846
1847         mutex_init(&counter->mutex);
1848         INIT_LIST_HEAD(&counter->list_entry);
1849         INIT_LIST_HEAD(&counter->sibling_list);
1850         init_waitqueue_head(&counter->waitq);
1851
1852         INIT_LIST_HEAD(&counter->child_list);
1853
1854         counter->irqdata                = &counter->data[0];
1855         counter->usrdata                = &counter->data[1];
1856         counter->cpu                    = cpu;
1857         counter->hw_event               = *hw_event;
1858         counter->wakeup_pending         = 0;
1859         counter->group_leader           = group_leader;
1860         counter->hw_ops                 = NULL;
1861         counter->ctx                    = ctx;
1862
1863         counter->state = PERF_COUNTER_STATE_INACTIVE;
1864         if (hw_event->disabled)
1865                 counter->state = PERF_COUNTER_STATE_OFF;
1866
1867         hw_ops = NULL;
1868         if (!hw_event->raw && hw_event->type < 0)
1869                 hw_ops = sw_perf_counter_init(counter);
1870         else
1871                 hw_ops = hw_perf_counter_init(counter);
1872
1873         if (!hw_ops) {
1874                 kfree(counter);
1875                 return NULL;
1876         }
1877         counter->hw_ops = hw_ops;
1878
1879         return counter;
1880 }
1881
1882 /**
1883  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1884  *
1885  * @hw_event_uptr:      event type attributes for monitoring/sampling
1886  * @pid:                target pid
1887  * @cpu:                target cpu
1888  * @group_fd:           group leader counter fd
1889  */
1890 SYSCALL_DEFINE5(perf_counter_open,
1891                 const struct perf_counter_hw_event __user *, hw_event_uptr,
1892                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
1893 {
1894         struct perf_counter *counter, *group_leader;
1895         struct perf_counter_hw_event hw_event;
1896         struct perf_counter_context *ctx;
1897         struct file *counter_file = NULL;
1898         struct file *group_file = NULL;
1899         int fput_needed = 0;
1900         int fput_needed2 = 0;
1901         int ret;
1902
1903         /* for future expandability... */
1904         if (flags)
1905                 return -EINVAL;
1906
1907         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1908                 return -EFAULT;
1909
1910         /*
1911          * Get the target context (task or percpu):
1912          */
1913         ctx = find_get_context(pid, cpu);
1914         if (IS_ERR(ctx))
1915                 return PTR_ERR(ctx);
1916
1917         /*
1918          * Look up the group leader (we will attach this counter to it):
1919          */
1920         group_leader = NULL;
1921         if (group_fd != -1) {
1922                 ret = -EINVAL;
1923                 group_file = fget_light(group_fd, &fput_needed);
1924                 if (!group_file)
1925                         goto err_put_context;
1926                 if (group_file->f_op != &perf_fops)
1927                         goto err_put_context;
1928
1929                 group_leader = group_file->private_data;
1930                 /*
1931                  * Do not allow a recursive hierarchy (this new sibling
1932                  * becoming part of another group-sibling):
1933                  */
1934                 if (group_leader->group_leader != group_leader)
1935                         goto err_put_context;
1936                 /*
1937                  * Do not allow to attach to a group in a different
1938                  * task or CPU context:
1939                  */
1940                 if (group_leader->ctx != ctx)
1941                         goto err_put_context;
1942                 /*
1943                  * Only a group leader can be exclusive or pinned
1944                  */
1945                 if (hw_event.exclusive || hw_event.pinned)
1946                         goto err_put_context;
1947         }
1948
1949         ret = -EINVAL;
1950         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1951                                      GFP_KERNEL);
1952         if (!counter)
1953                 goto err_put_context;
1954
1955         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1956         if (ret < 0)
1957                 goto err_free_put_context;
1958
1959         counter_file = fget_light(ret, &fput_needed2);
1960         if (!counter_file)
1961                 goto err_free_put_context;
1962
1963         counter->filp = counter_file;
1964         mutex_lock(&ctx->mutex);
1965         perf_install_in_context(ctx, counter, cpu);
1966         mutex_unlock(&ctx->mutex);
1967
1968         fput_light(counter_file, fput_needed2);
1969
1970 out_fput:
1971         fput_light(group_file, fput_needed);
1972
1973         return ret;
1974
1975 err_free_put_context:
1976         kfree(counter);
1977
1978 err_put_context:
1979         put_context(ctx);
1980
1981         goto out_fput;
1982 }
1983
1984 /*
1985  * Initialize the perf_counter context in a task_struct:
1986  */
1987 static void
1988 __perf_counter_init_context(struct perf_counter_context *ctx,
1989                             struct task_struct *task)
1990 {
1991         memset(ctx, 0, sizeof(*ctx));
1992         spin_lock_init(&ctx->lock);
1993         mutex_init(&ctx->mutex);
1994         INIT_LIST_HEAD(&ctx->counter_list);
1995         ctx->task = task;
1996 }
1997
1998 /*
1999  * inherit a counter from parent task to child task:
2000  */
2001 static struct perf_counter *
2002 inherit_counter(struct perf_counter *parent_counter,
2003               struct task_struct *parent,
2004               struct perf_counter_context *parent_ctx,
2005               struct task_struct *child,
2006               struct perf_counter *group_leader,
2007               struct perf_counter_context *child_ctx)
2008 {
2009         struct perf_counter *child_counter;
2010
2011         /*
2012          * Instead of creating recursive hierarchies of counters,
2013          * we link inherited counters back to the original parent,
2014          * which has a filp for sure, which we use as the reference
2015          * count:
2016          */
2017         if (parent_counter->parent)
2018                 parent_counter = parent_counter->parent;
2019
2020         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2021                                            parent_counter->cpu, child_ctx,
2022                                            group_leader, GFP_KERNEL);
2023         if (!child_counter)
2024                 return NULL;
2025
2026         /*
2027          * Link it up in the child's context:
2028          */
2029         child_counter->task = child;
2030         list_add_counter(child_counter, child_ctx);
2031         child_ctx->nr_counters++;
2032
2033         child_counter->parent = parent_counter;
2034         /*
2035          * inherit into child's child as well:
2036          */
2037         child_counter->hw_event.inherit = 1;
2038
2039         /*
2040          * Get a reference to the parent filp - we will fput it
2041          * when the child counter exits. This is safe to do because
2042          * we are in the parent and we know that the filp still
2043          * exists and has a nonzero count:
2044          */
2045         atomic_long_inc(&parent_counter->filp->f_count);
2046
2047         /*
2048          * Link this into the parent counter's child list
2049          */
2050         mutex_lock(&parent_counter->mutex);
2051         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2052
2053         /*
2054          * Make the child state follow the state of the parent counter,
2055          * not its hw_event.disabled bit.  We hold the parent's mutex,
2056          * so we won't race with perf_counter_{en,dis}able_family.
2057          */
2058         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2059                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2060         else
2061                 child_counter->state = PERF_COUNTER_STATE_OFF;
2062
2063         mutex_unlock(&parent_counter->mutex);
2064
2065         return child_counter;
2066 }
2067
2068 static int inherit_group(struct perf_counter *parent_counter,
2069               struct task_struct *parent,
2070               struct perf_counter_context *parent_ctx,
2071               struct task_struct *child,
2072               struct perf_counter_context *child_ctx)
2073 {
2074         struct perf_counter *leader;
2075         struct perf_counter *sub;
2076
2077         leader = inherit_counter(parent_counter, parent, parent_ctx,
2078                                  child, NULL, child_ctx);
2079         if (!leader)
2080                 return -ENOMEM;
2081         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2082                 if (!inherit_counter(sub, parent, parent_ctx,
2083                                      child, leader, child_ctx))
2084                         return -ENOMEM;
2085         }
2086         return 0;
2087 }
2088
2089 static void sync_child_counter(struct perf_counter *child_counter,
2090                                struct perf_counter *parent_counter)
2091 {
2092         u64 parent_val, child_val;
2093
2094         parent_val = atomic64_read(&parent_counter->count);
2095         child_val = atomic64_read(&child_counter->count);
2096
2097         /*
2098          * Add back the child's count to the parent's count:
2099          */
2100         atomic64_add(child_val, &parent_counter->count);
2101
2102         /*
2103          * Remove this counter from the parent's list
2104          */
2105         mutex_lock(&parent_counter->mutex);
2106         list_del_init(&child_counter->child_list);
2107         mutex_unlock(&parent_counter->mutex);
2108
2109         /*
2110          * Release the parent counter, if this was the last
2111          * reference to it.
2112          */
2113         fput(parent_counter->filp);
2114 }
2115
2116 static void
2117 __perf_counter_exit_task(struct task_struct *child,
2118                          struct perf_counter *child_counter,
2119                          struct perf_counter_context *child_ctx)
2120 {
2121         struct perf_counter *parent_counter;
2122         struct perf_counter *sub, *tmp;
2123
2124         /*
2125          * If we do not self-reap then we have to wait for the
2126          * child task to unschedule (it will happen for sure),
2127          * so that its counter is at its final count. (This
2128          * condition triggers rarely - child tasks usually get
2129          * off their CPU before the parent has a chance to
2130          * get this far into the reaping action)
2131          */
2132         if (child != current) {
2133                 wait_task_inactive(child, 0);
2134                 list_del_init(&child_counter->list_entry);
2135         } else {
2136                 struct perf_cpu_context *cpuctx;
2137                 unsigned long flags;
2138                 u64 perf_flags;
2139
2140                 /*
2141                  * Disable and unlink this counter.
2142                  *
2143                  * Be careful about zapping the list - IRQ/NMI context
2144                  * could still be processing it:
2145                  */
2146                 curr_rq_lock_irq_save(&flags);
2147                 perf_flags = hw_perf_save_disable();
2148
2149                 cpuctx = &__get_cpu_var(perf_cpu_context);
2150
2151                 group_sched_out(child_counter, cpuctx, child_ctx);
2152
2153                 list_del_init(&child_counter->list_entry);
2154
2155                 child_ctx->nr_counters--;
2156
2157                 hw_perf_restore(perf_flags);
2158                 curr_rq_unlock_irq_restore(&flags);
2159         }
2160
2161         parent_counter = child_counter->parent;
2162         /*
2163          * It can happen that parent exits first, and has counters
2164          * that are still around due to the child reference. These
2165          * counters need to be zapped - but otherwise linger.
2166          */
2167         if (parent_counter) {
2168                 sync_child_counter(child_counter, parent_counter);
2169                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2170                                          list_entry) {
2171                         if (sub->parent) {
2172                                 sync_child_counter(sub, sub->parent);
2173                                 kfree(sub);
2174                         }
2175                 }
2176                 kfree(child_counter);
2177         }
2178 }
2179
2180 /*
2181  * When a child task exits, feed back counter values to parent counters.
2182  *
2183  * Note: we may be running in child context, but the PID is not hashed
2184  * anymore so new counters will not be added.
2185  */
2186 void perf_counter_exit_task(struct task_struct *child)
2187 {
2188         struct perf_counter *child_counter, *tmp;
2189         struct perf_counter_context *child_ctx;
2190
2191         child_ctx = &child->perf_counter_ctx;
2192
2193         if (likely(!child_ctx->nr_counters))
2194                 return;
2195
2196         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2197                                  list_entry)
2198                 __perf_counter_exit_task(child, child_counter, child_ctx);
2199 }
2200
2201 /*
2202  * Initialize the perf_counter context in task_struct
2203  */
2204 void perf_counter_init_task(struct task_struct *child)
2205 {
2206         struct perf_counter_context *child_ctx, *parent_ctx;
2207         struct perf_counter *counter;
2208         struct task_struct *parent = current;
2209
2210         child_ctx  =  &child->perf_counter_ctx;
2211         parent_ctx = &parent->perf_counter_ctx;
2212
2213         __perf_counter_init_context(child_ctx, child);
2214
2215         /*
2216          * This is executed from the parent task context, so inherit
2217          * counters that have been marked for cloning:
2218          */
2219
2220         if (likely(!parent_ctx->nr_counters))
2221                 return;
2222
2223         /*
2224          * Lock the parent list. No need to lock the child - not PID
2225          * hashed yet and not running, so nobody can access it.
2226          */
2227         mutex_lock(&parent_ctx->mutex);
2228
2229         /*
2230          * We dont have to disable NMIs - we are only looking at
2231          * the list, not manipulating it:
2232          */
2233         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2234                 if (!counter->hw_event.inherit)
2235                         continue;
2236
2237                 if (inherit_group(counter, parent,
2238                                   parent_ctx, child, child_ctx))
2239                         break;
2240         }
2241
2242         mutex_unlock(&parent_ctx->mutex);
2243 }
2244
2245 static void __cpuinit perf_counter_init_cpu(int cpu)
2246 {
2247         struct perf_cpu_context *cpuctx;
2248
2249         cpuctx = &per_cpu(perf_cpu_context, cpu);
2250         __perf_counter_init_context(&cpuctx->ctx, NULL);
2251
2252         mutex_lock(&perf_resource_mutex);
2253         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2254         mutex_unlock(&perf_resource_mutex);
2255
2256         hw_perf_counter_setup(cpu);
2257 }
2258
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Runs on the CPU that is going down (via smp_call_function_single()):
 * remove every counter from that CPU's context.
 */
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_counter_context *ctx = &__get_cpu_var(perf_cpu_context).ctx;
	struct perf_counter *pos, *next;

	/* Entries are removed while walking, hence the _safe iterator. */
	list_for_each_entry_safe(pos, next, &ctx->counter_list, list_entry) {
		__perf_counter_remove_from_context(pos);
	}
}

/*
 * Tear down the counters of @cpu: with the context mutex held, run the
 * removal on the target CPU itself and wait for it to finish.
 */
static void perf_counter_exit_cpu(int cpu)
{
	struct perf_counter_context *ctx = &per_cpu(perf_cpu_context, cpu).ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
/* Without CPU hotplug there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
#endif
2281
2282 static int __cpuinit
2283 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2284 {
2285         unsigned int cpu = (long)hcpu;
2286
2287         switch (action) {
2288
2289         case CPU_UP_PREPARE:
2290         case CPU_UP_PREPARE_FROZEN:
2291                 perf_counter_init_cpu(cpu);
2292                 break;
2293
2294         case CPU_DOWN_PREPARE:
2295         case CPU_DOWN_PREPARE_FROZEN:
2296                 perf_counter_exit_cpu(cpu);
2297                 break;
2298
2299         default:
2300                 break;
2301         }
2302
2303         return NOTIFY_OK;
2304 }
2305
2306 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2307         .notifier_call          = perf_cpu_notify,
2308 };
2309
2310 static int __init perf_counter_init(void)
2311 {
2312         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2313                         (void *)(long)smp_processor_id());
2314         register_cpu_notifier(&perf_cpu_nb);
2315
2316         return 0;
2317 }
2318 early_initcall(perf_counter_init);
2319
2320 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2321 {
2322         return sprintf(buf, "%d\n", perf_reserved_percpu);
2323 }
2324
2325 static ssize_t
2326 perf_set_reserve_percpu(struct sysdev_class *class,
2327                         const char *buf,
2328                         size_t count)
2329 {
2330         struct perf_cpu_context *cpuctx;
2331         unsigned long val;
2332         int err, cpu, mpt;
2333
2334         err = strict_strtoul(buf, 10, &val);
2335         if (err)
2336                 return err;
2337         if (val > perf_max_counters)
2338                 return -EINVAL;
2339
2340         mutex_lock(&perf_resource_mutex);
2341         perf_reserved_percpu = val;
2342         for_each_online_cpu(cpu) {
2343                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2344                 spin_lock_irq(&cpuctx->ctx.lock);
2345                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2346                           perf_max_counters - perf_reserved_percpu);
2347                 cpuctx->max_pertask = mpt;
2348                 spin_unlock_irq(&cpuctx->ctx.lock);
2349         }
2350         mutex_unlock(&perf_resource_mutex);
2351
2352         return count;
2353 }
2354
2355 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2356 {
2357         return sprintf(buf, "%d\n", perf_overcommit);
2358 }
2359
2360 static ssize_t
2361 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2362 {
2363         unsigned long val;
2364         int err;
2365
2366         err = strict_strtoul(buf, 10, &val);
2367         if (err)
2368                 return err;
2369         if (val > 1)
2370                 return -EINVAL;
2371
2372         mutex_lock(&perf_resource_mutex);
2373         perf_overcommit = val;
2374         mutex_unlock(&perf_resource_mutex);
2375
2376         return count;
2377 }
2378
2379 static SYSDEV_CLASS_ATTR(
2380                                 reserve_percpu,
2381                                 0644,
2382                                 perf_show_reserve_percpu,
2383                                 perf_set_reserve_percpu
2384                         );
2385
2386 static SYSDEV_CLASS_ATTR(
2387                                 overcommit,
2388                                 0644,
2389                                 perf_show_overcommit,
2390                                 perf_set_overcommit
2391                         );
2392
2393 static struct attribute *perfclass_attrs[] = {
2394         &attr_reserve_percpu.attr,
2395         &attr_overcommit.attr,
2396         NULL
2397 };
2398
2399 static struct attribute_group perfclass_attr_group = {
2400         .attrs                  = perfclass_attrs,
2401         .name                   = "perf_counters",
2402 };
2403
2404 static int __init perf_counter_sysfs_init(void)
2405 {
2406         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2407                                   &perfclass_attr_group);
2408 }
2409 device_initcall(perf_counter_sysfs_init);