perfcounters: fix refcounting bug, take 2
[deliverable/linux.git] kernel/perf_counter.c
1 /*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 * For licensing details see kernel-base/COPYING
8 */
9
10 #include <linux/fs.h>
11 #include <linux/cpu.h>
12 #include <linux/smp.h>
13 #include <linux/file.h>
14 #include <linux/poll.h>
15 #include <linux/sysfs.h>
16 #include <linux/ptrace.h>
17 #include <linux/percpu.h>
18 #include <linux/uaccess.h>
19 #include <linux/syscalls.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/kernel_stat.h>
22 #include <linux/perf_counter.h>
23 #include <linux/mm.h>
24 #include <linux/vmstat.h>
25
26 /*
27 * Each CPU has a list of per CPU counters:
28 */
29 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
30
31 int perf_max_counters __read_mostly = 1;
32 static int perf_reserved_percpu __read_mostly;
33 static int perf_overcommit __read_mostly = 1;
34
35 /*
36 * Mutex for (sysadmin-configurable) counter reservations:
37 */
38 static DEFINE_MUTEX(perf_resource_mutex);
39
40 /*
41 * Architecture provided APIs - weak aliases:
42 */
43 extern __weak const struct hw_perf_counter_ops *
44 hw_perf_counter_init(struct perf_counter *counter)
45 {
46 return NULL;
47 }
48
49 u64 __weak hw_perf_save_disable(void) { return 0; }
50 void __weak hw_perf_restore(u64 ctrl) { barrier(); }
51 void __weak hw_perf_counter_setup(int cpu) { barrier(); }
52 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
53 struct perf_cpu_context *cpuctx,
54 struct perf_counter_context *ctx, int cpu)
55 {
56 return 0;
57 }
58
59 void __weak perf_counter_print_debug(void) { }
60
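The __weak definitions above are the no-op fallback for architectures without PMU support; an arch that does support hardware counters overrides them with strong symbols at link time (x86 does this under arch/x86/). A hedged sketch of such an override, with hypothetical my_arch_* helpers standing in for a real PMU driver:

	static int  my_arch_pmu_enable(struct perf_counter *counter);
	static void my_arch_pmu_disable(struct perf_counter *counter);
	static void my_arch_pmu_read(struct perf_counter *counter);

	static const struct hw_perf_counter_ops my_arch_pmu_ops = {
		.enable		= my_arch_pmu_enable,	/* program + start the counter */
		.disable	= my_arch_pmu_disable,	/* stop the counter */
		.read		= my_arch_pmu_read,	/* fold the hw delta into counter->count */
	};

	/* Strong definition: overrides the __weak stub above at link time. */
	const struct hw_perf_counter_ops *
	hw_perf_counter_init(struct perf_counter *counter)
	{
		/* validate/translate counter->hw_event for this PMU here ... */
		return &my_arch_pmu_ops;	/* NULL would mean: not supported */
	}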
61 static void
62 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
63 {
64 struct perf_counter *group_leader = counter->group_leader;
65
66 /*
67 * Depending on whether it is a standalone or sibling counter,
68 * add it straight to the context's counter list, or to the group
69 * leader's sibling list:
70 */
71 if (counter->group_leader == counter)
72 list_add_tail(&counter->list_entry, &ctx->counter_list);
73 else
74 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
75 }
76
77 static void
78 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
79 {
80 struct perf_counter *sibling, *tmp;
81
82 list_del_init(&counter->list_entry);
83
84 /*
85 * If this was a group counter with sibling counters then
86 * upgrade the siblings to singleton counters by adding them
87 * to the context list directly:
88 */
89 list_for_each_entry_safe(sibling, tmp,
90 &counter->sibling_list, list_entry) {
91
92 list_del_init(&sibling->list_entry);
93 list_add_tail(&sibling->list_entry, &ctx->counter_list);
94 sibling->group_leader = sibling;
95 }
96 }
97
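Together, these two helpers maintain a two-level structure: ctx->counter_list holds only group leaders, and each leader strings its siblings on its own sibling_list (a leader's group_leader pointer refers back to itself). A sketch of walking every counter in a context, assuming the caller holds ctx->lock and process() is a placeholder:

	static void visit_all_counters(struct perf_counter_context *ctx,
				       void (*process)(struct perf_counter *))
	{
		struct perf_counter *leader, *sibling;

		list_for_each_entry(leader, &ctx->counter_list, list_entry) {
			process(leader);
			list_for_each_entry(sibling, &leader->sibling_list,
					    list_entry)
				process(sibling);
		}
	}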
98 static void
99 counter_sched_out(struct perf_counter *counter,
100 struct perf_cpu_context *cpuctx,
101 struct perf_counter_context *ctx)
102 {
103 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
104 return;
105
106 counter->state = PERF_COUNTER_STATE_INACTIVE;
107 counter->hw_ops->disable(counter);
108 counter->oncpu = -1;
109
110 if (!is_software_counter(counter))
111 cpuctx->active_oncpu--;
112 ctx->nr_active--;
113 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
114 cpuctx->exclusive = 0;
115 }
116
117 static void
118 group_sched_out(struct perf_counter *group_counter,
119 struct perf_cpu_context *cpuctx,
120 struct perf_counter_context *ctx)
121 {
122 struct perf_counter *counter;
123
124 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
125 return;
126
127 counter_sched_out(group_counter, cpuctx, ctx);
128
129 /*
130 * Schedule out siblings (if any):
131 */
132 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
133 counter_sched_out(counter, cpuctx, ctx);
134
135 if (group_counter->hw_event.exclusive)
136 cpuctx->exclusive = 0;
137 }
138
139 /*
140 * Cross CPU call to remove a performance counter
141 *
142 * We disable the counter on the hardware level first. After that we
143 * remove it from the context list.
144 */
145 static void __perf_counter_remove_from_context(void *info)
146 {
147 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
148 struct perf_counter *counter = info;
149 struct perf_counter_context *ctx = counter->ctx;
150 unsigned long flags;
151 u64 perf_flags;
152
153 /*
154 * If this is a task context, we need to check whether it is
155	 * the current task context of this cpu. If not, it has been
156 * scheduled out before the smp call arrived.
157 */
158 if (ctx->task && cpuctx->task_ctx != ctx)
159 return;
160
161 curr_rq_lock_irq_save(&flags);
162 spin_lock(&ctx->lock);
163
164 counter_sched_out(counter, cpuctx, ctx);
165
166 counter->task = NULL;
167 ctx->nr_counters--;
168
169 /*
170 * Protect the list operation against NMI by disabling the
171	 * counters on a global level. NOP for non-NMI-based counters.
172 */
173 perf_flags = hw_perf_save_disable();
174 list_del_counter(counter, ctx);
175 hw_perf_restore(perf_flags);
176
177 if (!ctx->task) {
178 /*
179 * Allow more per task counters with respect to the
180 * reservation:
181 */
182 cpuctx->max_pertask =
183 min(perf_max_counters - ctx->nr_counters,
184 perf_max_counters - perf_reserved_percpu);
185 }
186
187 spin_unlock(&ctx->lock);
188 curr_rq_unlock_irq_restore(&flags);
189 }
190
191
192 /*
193 * Remove the counter from a task's (or a CPU's) list of counters.
194 *
195 * Must be called with counter->mutex and ctx->mutex held.
196 *
197 * CPU counters are removed with a smp call. For task counters we only
198 * call when the task is on a CPU.
199 */
200 static void perf_counter_remove_from_context(struct perf_counter *counter)
201 {
202 struct perf_counter_context *ctx = counter->ctx;
203 struct task_struct *task = ctx->task;
204
205 if (!task) {
206 /*
207 * Per cpu counters are removed via an smp call and
208	 * the removal is always successful.
209 */
210 smp_call_function_single(counter->cpu,
211 __perf_counter_remove_from_context,
212 counter, 1);
213 return;
214 }
215
216 retry:
217 task_oncpu_function_call(task, __perf_counter_remove_from_context,
218 counter);
219
220 spin_lock_irq(&ctx->lock);
221 /*
222 * If the context is active we need to retry the smp call.
223 */
224 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
225 spin_unlock_irq(&ctx->lock);
226 goto retry;
227 }
228
229 /*
230	 * The lock prevents this context from being scheduled in, so we
231	 * can remove the counter safely if the call above did not
232	 * succeed.
233 */
234 if (!list_empty(&counter->list_entry)) {
235 ctx->nr_counters--;
236 list_del_counter(counter, ctx);
237 counter->task = NULL;
238 }
239 spin_unlock_irq(&ctx->lock);
240 }
241
242 /*
243 * Cross CPU call to disable a performance counter
244 */
245 static void __perf_counter_disable(void *info)
246 {
247 struct perf_counter *counter = info;
248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
249 struct perf_counter_context *ctx = counter->ctx;
250 unsigned long flags;
251
252 /*
253 * If this is a per-task counter, need to check whether this
254 * counter's task is the current task on this cpu.
255 */
256 if (ctx->task && cpuctx->task_ctx != ctx)
257 return;
258
259 curr_rq_lock_irq_save(&flags);
260 spin_lock(&ctx->lock);
261
262 /*
263 * If the counter is on, turn it off.
264 * If it is in error state, leave it in error state.
265 */
266 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
267 if (counter == counter->group_leader)
268 group_sched_out(counter, cpuctx, ctx);
269 else
270 counter_sched_out(counter, cpuctx, ctx);
271 counter->state = PERF_COUNTER_STATE_OFF;
272 }
273
274 spin_unlock(&ctx->lock);
275 curr_rq_unlock_irq_restore(&flags);
276 }
277
278 /*
279 * Disable a counter.
280 */
281 static void perf_counter_disable(struct perf_counter *counter)
282 {
283 struct perf_counter_context *ctx = counter->ctx;
284 struct task_struct *task = ctx->task;
285
286 if (!task) {
287 /*
288 * Disable the counter on the cpu that it's on
289 */
290 smp_call_function_single(counter->cpu, __perf_counter_disable,
291 counter, 1);
292 return;
293 }
294
295 retry:
296 task_oncpu_function_call(task, __perf_counter_disable, counter);
297
298 spin_lock_irq(&ctx->lock);
299 /*
300 * If the counter is still active, we need to retry the cross-call.
301 */
302 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
303 spin_unlock_irq(&ctx->lock);
304 goto retry;
305 }
306
307 /*
308 * Since we have the lock this context can't be scheduled
309 * in, so we can change the state safely.
310 */
311 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
312 counter->state = PERF_COUNTER_STATE_OFF;
313
314 spin_unlock_irq(&ctx->lock);
315 }
316
317 /*
318 * Disable a counter and all its children.
319 */
320 static void perf_counter_disable_family(struct perf_counter *counter)
321 {
322 struct perf_counter *child;
323
324 perf_counter_disable(counter);
325
326 /*
327 * Lock the mutex to protect the list of children
328 */
329 mutex_lock(&counter->mutex);
330 list_for_each_entry(child, &counter->child_list, child_list)
331 perf_counter_disable(child);
332 mutex_unlock(&counter->mutex);
333 }
334
335 static int
336 counter_sched_in(struct perf_counter *counter,
337 struct perf_cpu_context *cpuctx,
338 struct perf_counter_context *ctx,
339 int cpu)
340 {
341 if (counter->state <= PERF_COUNTER_STATE_OFF)
342 return 0;
343
344 counter->state = PERF_COUNTER_STATE_ACTIVE;
345 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
346 /*
347 * The new state must be visible before we turn it on in the hardware:
348 */
349 smp_wmb();
350
351 if (counter->hw_ops->enable(counter)) {
352 counter->state = PERF_COUNTER_STATE_INACTIVE;
353 counter->oncpu = -1;
354 return -EAGAIN;
355 }
356
357 if (!is_software_counter(counter))
358 cpuctx->active_oncpu++;
359 ctx->nr_active++;
360
361 if (counter->hw_event.exclusive)
362 cpuctx->exclusive = 1;
363
364 return 0;
365 }
366
367 /*
368 * Return 1 for a group consisting entirely of software counters,
369 * 0 if the group contains any hardware counters.
370 */
371 static int is_software_only_group(struct perf_counter *leader)
372 {
373 struct perf_counter *counter;
374
375 if (!is_software_counter(leader))
376 return 0;
377 list_for_each_entry(counter, &leader->sibling_list, list_entry)
378 if (!is_software_counter(counter))
379 return 0;
380 return 1;
381 }
382
383 /*
384 * Work out whether we can put this counter group on the CPU now.
385 */
386 static int group_can_go_on(struct perf_counter *counter,
387 struct perf_cpu_context *cpuctx,
388 int can_add_hw)
389 {
390 /*
391 * Groups consisting entirely of software counters can always go on.
392 */
393 if (is_software_only_group(counter))
394 return 1;
395 /*
396 * If an exclusive group is already on, no other hardware
397 * counters can go on.
398 */
399 if (cpuctx->exclusive)
400 return 0;
401 /*
402 * If this group is exclusive and there are already
403 * counters on the CPU, it can't go on.
404 */
405 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
406 return 0;
407 /*
408 * Otherwise, try to add it if all previous groups were able
409 * to go on.
410 */
411 return can_add_hw;
412 }
413
414 /*
415 * Cross CPU call to install and enable a performance counter
416 */
417 static void __perf_install_in_context(void *info)
418 {
419 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
420 struct perf_counter *counter = info;
421 struct perf_counter_context *ctx = counter->ctx;
422 struct perf_counter *leader = counter->group_leader;
423 int cpu = smp_processor_id();
424 unsigned long flags;
425 u64 perf_flags;
426 int err;
427
428 /*
429 * If this is a task context, we need to check whether it is
430	 * the current task context of this cpu. If not, it has been
431 * scheduled out before the smp call arrived.
432 */
433 if (ctx->task && cpuctx->task_ctx != ctx)
434 return;
435
436 curr_rq_lock_irq_save(&flags);
437 spin_lock(&ctx->lock);
438
439 /*
440 * Protect the list operation against NMI by disabling the
441	 * counters on a global level. NOP for non-NMI-based counters.
442 */
443 perf_flags = hw_perf_save_disable();
444
445 list_add_counter(counter, ctx);
446 ctx->nr_counters++;
447
448 /*
449 * Don't put the counter on if it is disabled or if
450 * it is in a group and the group isn't on.
451 */
452 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
453 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
454 goto unlock;
455
456 /*
457 * An exclusive counter can't go on if there are already active
458 * hardware counters, and no hardware counter can go on if there
459 * is already an exclusive counter on.
460 */
461 if (!group_can_go_on(counter, cpuctx, 1))
462 err = -EEXIST;
463 else
464 err = counter_sched_in(counter, cpuctx, ctx, cpu);
465
466 if (err) {
467 /*
468 * This counter couldn't go on. If it is in a group
469 * then we have to pull the whole group off.
470 * If the counter group is pinned then put it in error state.
471 */
472 if (leader != counter)
473 group_sched_out(leader, cpuctx, ctx);
474 if (leader->hw_event.pinned)
475 leader->state = PERF_COUNTER_STATE_ERROR;
476 }
477
478 if (!err && !ctx->task && cpuctx->max_pertask)
479 cpuctx->max_pertask--;
480
481 unlock:
482 hw_perf_restore(perf_flags);
483
484 spin_unlock(&ctx->lock);
485 curr_rq_unlock_irq_restore(&flags);
486 }
487
488 /*
489 * Attach a performance counter to a context
490 *
491 * First we add the counter to the list with the hardware enable bit
492 * in counter->hw_config cleared.
493 *
494 * If the counter is attached to a task which is on a CPU we use a smp
495 * call to enable it in the task context. The task might have been
496 * scheduled away, but we check this in the smp call again.
497 *
498 * Must be called with ctx->mutex held.
499 */
500 static void
501 perf_install_in_context(struct perf_counter_context *ctx,
502 struct perf_counter *counter,
503 int cpu)
504 {
505 struct task_struct *task = ctx->task;
506
507 if (!task) {
508 /*
509 * Per cpu counters are installed via an smp call and
510	 * the install is always successful.
511 */
512 smp_call_function_single(cpu, __perf_install_in_context,
513 counter, 1);
514 return;
515 }
516
517 counter->task = task;
518 retry:
519 task_oncpu_function_call(task, __perf_install_in_context,
520 counter);
521
522 spin_lock_irq(&ctx->lock);
523 /*
524	 * If the context is active, we need to retry the smp call.
525 */
526 if (ctx->is_active && list_empty(&counter->list_entry)) {
527 spin_unlock_irq(&ctx->lock);
528 goto retry;
529 }
530
531 /*
532	 * The lock prevents this context from being scheduled in, so we
533	 * can add the counter safely if the call above did not
534	 * succeed.
535 */
536 if (list_empty(&counter->list_entry)) {
537 list_add_counter(counter, ctx);
538 ctx->nr_counters++;
539 }
540 spin_unlock_irq(&ctx->lock);
541 }
542
543 /*
544 * Cross CPU call to enable a performance counter
545 */
546 static void __perf_counter_enable(void *info)
547 {
548 struct perf_counter *counter = info;
549 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
550 struct perf_counter_context *ctx = counter->ctx;
551 struct perf_counter *leader = counter->group_leader;
552 unsigned long flags;
553 int err;
554
555 /*
556 * If this is a per-task counter, need to check whether this
557 * counter's task is the current task on this cpu.
558 */
559 if (ctx->task && cpuctx->task_ctx != ctx)
560 return;
561
562 curr_rq_lock_irq_save(&flags);
563 spin_lock(&ctx->lock);
564
565 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
566 goto unlock;
567 counter->state = PERF_COUNTER_STATE_INACTIVE;
568
569 /*
570 * If the counter is in a group and isn't the group leader,
571 * then don't put it on unless the group is on.
572 */
573 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
574 goto unlock;
575
576 if (!group_can_go_on(counter, cpuctx, 1))
577 err = -EEXIST;
578 else
579 err = counter_sched_in(counter, cpuctx, ctx,
580 smp_processor_id());
581
582 if (err) {
583 /*
584 * If this counter can't go on and it's part of a
585 * group, then the whole group has to come off.
586 */
587 if (leader != counter)
588 group_sched_out(leader, cpuctx, ctx);
589 if (leader->hw_event.pinned)
590 leader->state = PERF_COUNTER_STATE_ERROR;
591 }
592
593 unlock:
594 spin_unlock(&ctx->lock);
595 curr_rq_unlock_irq_restore(&flags);
596 }
597
598 /*
599 * Enable a counter.
600 */
601 static void perf_counter_enable(struct perf_counter *counter)
602 {
603 struct perf_counter_context *ctx = counter->ctx;
604 struct task_struct *task = ctx->task;
605
606 if (!task) {
607 /*
608 * Enable the counter on the cpu that it's on
609 */
610 smp_call_function_single(counter->cpu, __perf_counter_enable,
611 counter, 1);
612 return;
613 }
614
615 spin_lock_irq(&ctx->lock);
616 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
617 goto out;
618
619 /*
620 * If the counter is in error state, clear that first.
621 * That way, if we see the counter in error state below, we
622 * know that it has gone back into error state, as distinct
623 * from the task having been scheduled away before the
624 * cross-call arrived.
625 */
626 if (counter->state == PERF_COUNTER_STATE_ERROR)
627 counter->state = PERF_COUNTER_STATE_OFF;
628
629 retry:
630 spin_unlock_irq(&ctx->lock);
631 task_oncpu_function_call(task, __perf_counter_enable, counter);
632
633 spin_lock_irq(&ctx->lock);
634
635 /*
636 * If the context is active and the counter is still off,
637 * we need to retry the cross-call.
638 */
639 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
640 goto retry;
641
642 /*
643 * Since we have the lock this context can't be scheduled
644 * in, so we can change the state safely.
645 */
646 if (counter->state == PERF_COUNTER_STATE_OFF)
647 counter->state = PERF_COUNTER_STATE_INACTIVE;
648 out:
649 spin_unlock_irq(&ctx->lock);
650 }
651
652 /*
653 * Enable a counter and all its children.
654 */
655 static void perf_counter_enable_family(struct perf_counter *counter)
656 {
657 struct perf_counter *child;
658
659 perf_counter_enable(counter);
660
661 /*
662 * Lock the mutex to protect the list of children
663 */
664 mutex_lock(&counter->mutex);
665 list_for_each_entry(child, &counter->child_list, child_list)
666 perf_counter_enable(child);
667 mutex_unlock(&counter->mutex);
668 }
669
670 void __perf_counter_sched_out(struct perf_counter_context *ctx,
671 struct perf_cpu_context *cpuctx)
672 {
673 struct perf_counter *counter;
674 u64 flags;
675
676 spin_lock(&ctx->lock);
677 ctx->is_active = 0;
678 if (likely(!ctx->nr_counters))
679 goto out;
680
681 flags = hw_perf_save_disable();
682 if (ctx->nr_active) {
683 list_for_each_entry(counter, &ctx->counter_list, list_entry)
684 group_sched_out(counter, cpuctx, ctx);
685 }
686 hw_perf_restore(flags);
687 out:
688 spin_unlock(&ctx->lock);
689 }
690
691 /*
692 * Called from scheduler to remove the counters of the current task,
693 * with interrupts disabled.
694 *
695 * We stop each counter and update the counter value in counter->count.
696 *
697 * This does not protect us against NMI, but disable()
698 * sets the disabled bit in the control field of counter _before_
699	 * accessing the counter control register. If an NMI hits, then it will
700 * not restart the counter.
701 */
702 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
703 {
704 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
705 struct perf_counter_context *ctx = &task->perf_counter_ctx;
706
707 if (likely(!cpuctx->task_ctx))
708 return;
709
710 __perf_counter_sched_out(ctx, cpuctx);
711
712 cpuctx->task_ctx = NULL;
713 }
714
715 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
716 {
717 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
718 }
719
720 static int
721 group_sched_in(struct perf_counter *group_counter,
722 struct perf_cpu_context *cpuctx,
723 struct perf_counter_context *ctx,
724 int cpu)
725 {
726 struct perf_counter *counter, *partial_group;
727 int ret;
728
729 if (group_counter->state == PERF_COUNTER_STATE_OFF)
730 return 0;
731
732 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
733 if (ret)
734 return ret < 0 ? ret : 0;
735
736 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
737 return -EAGAIN;
738
739 /*
740 * Schedule in siblings as one group (if any):
741 */
742 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
743 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
744 partial_group = counter;
745 goto group_error;
746 }
747 }
748
749 return 0;
750
751 group_error:
752 /*
753 * Groups can be scheduled in as one unit only, so undo any
754 * partial group before returning:
755 */
756 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
757 if (counter == partial_group)
758 break;
759 counter_sched_out(counter, cpuctx, ctx);
760 }
761 counter_sched_out(group_counter, cpuctx, ctx);
762
763 return -EAGAIN;
764 }
765
766 static void
767 __perf_counter_sched_in(struct perf_counter_context *ctx,
768 struct perf_cpu_context *cpuctx, int cpu)
769 {
770 struct perf_counter *counter;
771 u64 flags;
772 int can_add_hw = 1;
773
774 spin_lock(&ctx->lock);
775 ctx->is_active = 1;
776 if (likely(!ctx->nr_counters))
777 goto out;
778
779 flags = hw_perf_save_disable();
780
781 /*
782 * First go through the list and put on any pinned groups
783 * in order to give them the best chance of going on.
784 */
785 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
786 if (counter->state <= PERF_COUNTER_STATE_OFF ||
787 !counter->hw_event.pinned)
788 continue;
789 if (counter->cpu != -1 && counter->cpu != cpu)
790 continue;
791
792 if (group_can_go_on(counter, cpuctx, 1))
793 group_sched_in(counter, cpuctx, ctx, cpu);
794
795 /*
796 * If this pinned group hasn't been scheduled,
797 * put it in error state.
798 */
799 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
800 counter->state = PERF_COUNTER_STATE_ERROR;
801 }
802
803 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
804 /*
805 * Ignore counters in OFF or ERROR state, and
806 * ignore pinned counters since we did them already.
807 */
808 if (counter->state <= PERF_COUNTER_STATE_OFF ||
809 counter->hw_event.pinned)
810 continue;
811
812 /*
813 * Listen to the 'cpu' scheduling filter constraint
814 * of counters:
815 */
816 if (counter->cpu != -1 && counter->cpu != cpu)
817 continue;
818
819 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
820 if (group_sched_in(counter, cpuctx, ctx, cpu))
821 can_add_hw = 0;
822 }
823 }
824 hw_perf_restore(flags);
825 out:
826 spin_unlock(&ctx->lock);
827 }
828
829 /*
830 * Called from scheduler to add the counters of the current task
831 * with interrupts disabled.
832 *
833 * We restore the counter value and then enable it.
834 *
835 * This does not protect us against NMI, but enable()
836 * sets the enabled bit in the control field of counter _before_
837	 * accessing the counter control register. If an NMI hits, then it will
838 * keep the counter running.
839 */
840 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
841 {
842 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
843 struct perf_counter_context *ctx = &task->perf_counter_ctx;
844
845 __perf_counter_sched_in(ctx, cpuctx, cpu);
846 cpuctx->task_ctx = ctx;
847 }
848
849 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
850 {
851 struct perf_counter_context *ctx = &cpuctx->ctx;
852
853 __perf_counter_sched_in(ctx, cpuctx, cpu);
854 }
855
856 int perf_counter_task_disable(void)
857 {
858 struct task_struct *curr = current;
859 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
860 struct perf_counter *counter;
861 unsigned long flags;
862 u64 perf_flags;
863 int cpu;
864
865 if (likely(!ctx->nr_counters))
866 return 0;
867
868 curr_rq_lock_irq_save(&flags);
869 cpu = smp_processor_id();
870
871 /* force the update of the task clock: */
872 __task_delta_exec(curr, 1);
873
874 perf_counter_task_sched_out(curr, cpu);
875
876 spin_lock(&ctx->lock);
877
878 /*
879 * Disable all the counters:
880 */
881 perf_flags = hw_perf_save_disable();
882
883 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
884 if (counter->state != PERF_COUNTER_STATE_ERROR)
885 counter->state = PERF_COUNTER_STATE_OFF;
886 }
887
888 hw_perf_restore(perf_flags);
889
890 spin_unlock(&ctx->lock);
891
892 curr_rq_unlock_irq_restore(&flags);
893
894 return 0;
895 }
896
897 int perf_counter_task_enable(void)
898 {
899 struct task_struct *curr = current;
900 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
901 struct perf_counter *counter;
902 unsigned long flags;
903 u64 perf_flags;
904 int cpu;
905
906 if (likely(!ctx->nr_counters))
907 return 0;
908
909 curr_rq_lock_irq_save(&flags);
910 cpu = smp_processor_id();
911
912 /* force the update of the task clock: */
913 __task_delta_exec(curr, 1);
914
915 perf_counter_task_sched_out(curr, cpu);
916
917 spin_lock(&ctx->lock);
918
919 /*
920	 * Enable all the counters:
921 */
922 perf_flags = hw_perf_save_disable();
923
924 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
925 if (counter->state > PERF_COUNTER_STATE_OFF)
926 continue;
927 counter->state = PERF_COUNTER_STATE_INACTIVE;
928 counter->hw_event.disabled = 0;
929 }
930 hw_perf_restore(perf_flags);
931
932 spin_unlock(&ctx->lock);
933
934 perf_counter_task_sched_in(curr, cpu);
935
936 curr_rq_unlock_irq_restore(&flags);
937
938 return 0;
939 }
940
941 /*
942 * Round-robin a context's counters:
943 */
944 static void rotate_ctx(struct perf_counter_context *ctx)
945 {
946 struct perf_counter *counter;
947 u64 perf_flags;
948
949 if (!ctx->nr_counters)
950 return;
951
952 spin_lock(&ctx->lock);
953 /*
954 * Rotate the first entry last (works just fine for group counters too):
955 */
956 perf_flags = hw_perf_save_disable();
957 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
958 list_del(&counter->list_entry);
959 list_add_tail(&counter->list_entry, &ctx->counter_list);
960 break;
961 }
962 hw_perf_restore(perf_flags);
963
964 spin_unlock(&ctx->lock);
965 }
966
967 void perf_counter_task_tick(struct task_struct *curr, int cpu)
968 {
969 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
970 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
971 const int rotate_percpu = 0;
972
973 if (rotate_percpu)
974 perf_counter_cpu_sched_out(cpuctx);
975 perf_counter_task_sched_out(curr, cpu);
976
977 if (rotate_percpu)
978 rotate_ctx(&cpuctx->ctx);
979 rotate_ctx(ctx);
980
981 if (rotate_percpu)
982 perf_counter_cpu_sched_in(cpuctx, cpu);
983 perf_counter_task_sched_in(curr, cpu);
984 }
985
986 /*
987 * Cross CPU call to read the hardware counter
988 */
989 static void __read(void *info)
990 {
991 struct perf_counter *counter = info;
992 unsigned long flags;
993
994 curr_rq_lock_irq_save(&flags);
995 counter->hw_ops->read(counter);
996 curr_rq_unlock_irq_restore(&flags);
997 }
998
999 static u64 perf_counter_read(struct perf_counter *counter)
1000 {
1001 /*
1002 * If counter is enabled and currently active on a CPU, update the
1003 * value in the counter structure:
1004 */
1005 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1006 smp_call_function_single(counter->oncpu,
1007 __read, counter, 1);
1008 }
1009
1010 return atomic64_read(&counter->count);
1011 }
1012
1013 /*
1014 * Cross CPU call to switch performance data pointers
1015 */
1016 static void __perf_switch_irq_data(void *info)
1017 {
1018 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1019 struct perf_counter *counter = info;
1020 struct perf_counter_context *ctx = counter->ctx;
1021 struct perf_data *oldirqdata = counter->irqdata;
1022
1023 /*
1024 * If this is a task context, we need to check whether it is
1025	 * the current task context of this cpu. If not, it has been
1026 * scheduled out before the smp call arrived.
1027 */
1028 if (ctx->task) {
1029 if (cpuctx->task_ctx != ctx)
1030 return;
1031 spin_lock(&ctx->lock);
1032 }
1033
1034	/* Change the pointer in an NMI-safe way */
1035 atomic_long_set((atomic_long_t *)&counter->irqdata,
1036 (unsigned long) counter->usrdata);
1037 counter->usrdata = oldirqdata;
1038
1039 if (ctx->task)
1040 spin_unlock(&ctx->lock);
1041 }
1042
1043 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1044 {
1045 struct perf_counter_context *ctx = counter->ctx;
1046 struct perf_data *oldirqdata = counter->irqdata;
1047 struct task_struct *task = ctx->task;
1048
1049 if (!task) {
1050 smp_call_function_single(counter->cpu,
1051 __perf_switch_irq_data,
1052 counter, 1);
1053 return counter->usrdata;
1054 }
1055
1056 retry:
1057 spin_lock_irq(&ctx->lock);
1058 if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1059 counter->irqdata = counter->usrdata;
1060 counter->usrdata = oldirqdata;
1061 spin_unlock_irq(&ctx->lock);
1062 return oldirqdata;
1063 }
1064 spin_unlock_irq(&ctx->lock);
1065 task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1066 /* Might have failed, because task was scheduled out */
1067 if (counter->irqdata == oldirqdata)
1068 goto retry;
1069
1070 return counter->usrdata;
1071 }
1072
1073 static void put_context(struct perf_counter_context *ctx)
1074 {
1075 if (ctx->task)
1076 put_task_struct(ctx->task);
1077 }
1078
1079 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1080 {
1081 struct perf_cpu_context *cpuctx;
1082 struct perf_counter_context *ctx;
1083 struct task_struct *task;
1084
1085 /*
1086 * If cpu is not a wildcard then this is a percpu counter:
1087 */
1088 if (cpu != -1) {
1089 /* Must be root to operate on a CPU counter: */
1090 if (!capable(CAP_SYS_ADMIN))
1091 return ERR_PTR(-EACCES);
1092
1093 if (cpu < 0 || cpu > num_possible_cpus())
1094 return ERR_PTR(-EINVAL);
1095
1096 /*
1097 * We could be clever and allow to attach a counter to an
1098 * offline CPU and activate it when the CPU comes up, but
1099 * that's for later.
1100 */
1101 if (!cpu_isset(cpu, cpu_online_map))
1102 return ERR_PTR(-ENODEV);
1103
1104 cpuctx = &per_cpu(perf_cpu_context, cpu);
1105 ctx = &cpuctx->ctx;
1106
1107 return ctx;
1108 }
1109
1110 rcu_read_lock();
1111 if (!pid)
1112 task = current;
1113 else
1114 task = find_task_by_vpid(pid);
1115 if (task)
1116 get_task_struct(task);
1117 rcu_read_unlock();
1118
1119 if (!task)
1120 return ERR_PTR(-ESRCH);
1121
1122 ctx = &task->perf_counter_ctx;
1123 ctx->task = task;
1124
1125 /* Reuse ptrace permission checks for now. */
1126 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1127 put_context(ctx);
1128 return ERR_PTR(-EACCES);
1129 }
1130
1131 return ctx;
1132 }
1133
1134 /*
1135 * Called when the last reference to the file is gone.
1136 */
1137 static int perf_release(struct inode *inode, struct file *file)
1138 {
1139 struct perf_counter *counter = file->private_data;
1140 struct perf_counter_context *ctx = counter->ctx;
1141
1142 file->private_data = NULL;
1143
1144 mutex_lock(&ctx->mutex);
1145 mutex_lock(&counter->mutex);
1146
1147 perf_counter_remove_from_context(counter);
1148
1149 mutex_unlock(&counter->mutex);
1150 mutex_unlock(&ctx->mutex);
1151
1152 kfree(counter);
1153 put_context(ctx);
1154
1155 return 0;
1156 }
1157
1158 /*
1159 * Read the performance counter - simple non blocking version for now
1160 */
1161 static ssize_t
1162 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1163 {
1164 u64 cntval;
1165
1166 if (count != sizeof(cntval))
1167 return -EINVAL;
1168
1169 /*
1170 * Return end-of-file for a read on a counter that is in
1171 * error state (i.e. because it was pinned but it couldn't be
1172 * scheduled on to the CPU at some point).
1173 */
1174 if (counter->state == PERF_COUNTER_STATE_ERROR)
1175 return 0;
1176
1177 mutex_lock(&counter->mutex);
1178 cntval = perf_counter_read(counter);
1179 mutex_unlock(&counter->mutex);
1180
1181 return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1182 }
1183
1184 static ssize_t
1185 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1186 {
1187 if (!usrdata->len)
1188 return 0;
1189
1190 count = min(count, (size_t)usrdata->len);
1191 if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1192 return -EFAULT;
1193
1194 /* Adjust the counters */
1195 usrdata->len -= count;
1196 if (!usrdata->len)
1197 usrdata->rd_idx = 0;
1198 else
1199 usrdata->rd_idx += count;
1200
1201 return count;
1202 }
1203
1204 static ssize_t
1205 perf_read_irq_data(struct perf_counter *counter,
1206 char __user *buf,
1207 size_t count,
1208 int nonblocking)
1209 {
1210 struct perf_data *irqdata, *usrdata;
1211 DECLARE_WAITQUEUE(wait, current);
1212 ssize_t res, res2;
1213
1214 irqdata = counter->irqdata;
1215 usrdata = counter->usrdata;
1216
1217 if (usrdata->len + irqdata->len >= count)
1218 goto read_pending;
1219
1220 if (nonblocking)
1221 return -EAGAIN;
1222
1223 spin_lock_irq(&counter->waitq.lock);
1224 __add_wait_queue(&counter->waitq, &wait);
1225 for (;;) {
1226 set_current_state(TASK_INTERRUPTIBLE);
1227 if (usrdata->len + irqdata->len >= count)
1228 break;
1229
1230 if (signal_pending(current))
1231 break;
1232
1233 if (counter->state == PERF_COUNTER_STATE_ERROR)
1234 break;
1235
1236 spin_unlock_irq(&counter->waitq.lock);
1237 schedule();
1238 spin_lock_irq(&counter->waitq.lock);
1239 }
1240 __remove_wait_queue(&counter->waitq, &wait);
1241 __set_current_state(TASK_RUNNING);
1242 spin_unlock_irq(&counter->waitq.lock);
1243
1244 if (usrdata->len + irqdata->len < count &&
1245 counter->state != PERF_COUNTER_STATE_ERROR)
1246 return -ERESTARTSYS;
1247 read_pending:
1248 mutex_lock(&counter->mutex);
1249
1250 /* Drain pending data first: */
1251 res = perf_copy_usrdata(usrdata, buf, count);
1252 if (res < 0 || res == count)
1253 goto out;
1254
1255 /* Switch irq buffer: */
1256 usrdata = perf_switch_irq_data(counter);
1257 res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1258 if (res2 < 0) {
1259 if (!res)
1260 res = -EFAULT;
1261 } else {
1262 res += res2;
1263 }
1264 out:
1265 mutex_unlock(&counter->mutex);
1266
1267 return res;
1268 }
1269
1270 static ssize_t
1271 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1272 {
1273 struct perf_counter *counter = file->private_data;
1274
1275 switch (counter->hw_event.record_type) {
1276 case PERF_RECORD_SIMPLE:
1277 return perf_read_hw(counter, buf, count);
1278
1279 case PERF_RECORD_IRQ:
1280 case PERF_RECORD_GROUP:
1281 return perf_read_irq_data(counter, buf, count,
1282 file->f_flags & O_NONBLOCK);
1283 }
1284 return -EINVAL;
1285 }
1286
1287 static unsigned int perf_poll(struct file *file, poll_table *wait)
1288 {
1289 struct perf_counter *counter = file->private_data;
1290 unsigned int events = 0;
1291 unsigned long flags;
1292
1293 poll_wait(file, &counter->waitq, wait);
1294
1295 spin_lock_irqsave(&counter->waitq.lock, flags);
1296 if (counter->usrdata->len || counter->irqdata->len)
1297 events |= POLLIN;
1298 spin_unlock_irqrestore(&counter->waitq.lock, flags);
1299
1300 return events;
1301 }
1302
1303 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1304 {
1305 struct perf_counter *counter = file->private_data;
1306 int err = 0;
1307
1308 switch (cmd) {
1309 case PERF_COUNTER_IOC_ENABLE:
1310 perf_counter_enable_family(counter);
1311 break;
1312 case PERF_COUNTER_IOC_DISABLE:
1313 perf_counter_disable_family(counter);
1314 break;
1315 default:
1316 err = -ENOTTY;
1317 }
1318 return err;
1319 }
1320
1321 static const struct file_operations perf_fops = {
1322 .release = perf_release,
1323 .read = perf_read,
1324 .poll = perf_poll,
1325 .unlocked_ioctl = perf_ioctl,
1326 .compat_ioctl = perf_ioctl,
1327 };
1328
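These four file operations are the entire userspace surface of a counter fd. One subtlety visible in perf_read_irq_data() above: a blocking read waits until at least 'count' bytes are pending, while poll() reports POLLIN as soon as any data exists, so a poll-driven reader should request small amounts (or open the fd with O_NONBLOCK). A hedged userspace sketch:

	#include <poll.h>
	#include <unistd.h>

	/* Wait for sampled data on a counter fd, then drain a small chunk. */
	static ssize_t wait_and_read(int fd, void *buf, size_t len)
	{
		struct pollfd pfd = { .fd = fd, .events = POLLIN };

		if (poll(&pfd, 1, -1 /* no timeout */) < 0)
			return -1;
		return read(fd, buf, len);	/* drains usrdata, then irqdata */
	}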
1329 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1330 {
1331 int cpu = raw_smp_processor_id();
1332
1333 atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
1334 return 0;
1335 }
1336
1337 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1338 {
1339 int cpu = raw_smp_processor_id();
1340 s64 prev;
1341 u64 now;
1342
1343 now = cpu_clock(cpu);
1344 prev = atomic64_read(&counter->hw.prev_count);
1345 atomic64_set(&counter->hw.prev_count, now);
1346 atomic64_add(now - prev, &counter->count);
1347 }
1348
1349 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1350 {
1351 cpu_clock_perf_counter_update(counter);
1352 }
1353
1354 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
1355 {
1356 cpu_clock_perf_counter_update(counter);
1357 }
1358
1359 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1360 .enable = cpu_clock_perf_counter_enable,
1361 .disable = cpu_clock_perf_counter_disable,
1362 .read = cpu_clock_perf_counter_read,
1363 };
1364
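perf_ops_cpu_clock is the template that every software counter below follows: enable() snapshots a baseline into hw.prev_count, and each later read/disable folds the delta since that snapshot into counter->count. The shared pattern in the abstract, where 'now' comes from whatever raw source the counter type uses (cpu_clock(), task runtime, fault counts, ...):

	static void sw_counter_fold_delta(struct perf_counter *counter, u64 now)
	{
		s64 delta = now - atomic64_read(&counter->hw.prev_count);

		atomic64_set(&counter->hw.prev_count, now);	/* new baseline */
		atomic64_add(delta, &counter->count);		/* accumulate */
	}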
1365 /*
1366 * Called from within the scheduler:
1367 */
1368 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1369 {
1370 struct task_struct *curr = counter->task;
1371 u64 delta;
1372
1373 delta = __task_delta_exec(curr, update);
1374
1375 return curr->se.sum_exec_runtime + delta;
1376 }
1377
1378 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1379 {
1380 u64 prev;
1381 s64 delta;
1382
1383 prev = atomic64_read(&counter->hw.prev_count);
1384
1385 atomic64_set(&counter->hw.prev_count, now);
1386
1387 delta = now - prev;
1388
1389 atomic64_add(delta, &counter->count);
1390 }
1391
1392 static void task_clock_perf_counter_read(struct perf_counter *counter)
1393 {
1394 u64 now = task_clock_perf_counter_val(counter, 1);
1395
1396 task_clock_perf_counter_update(counter, now);
1397 }
1398
1399 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1400 {
1401 u64 now = task_clock_perf_counter_val(counter, 0);
1402
1403 atomic64_set(&counter->hw.prev_count, now);
1404
1405 return 0;
1406 }
1407
1408 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1409 {
1410 u64 now = task_clock_perf_counter_val(counter, 0);
1411
1412 task_clock_perf_counter_update(counter, now);
1413 }
1414
1415 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1416 .enable = task_clock_perf_counter_enable,
1417 .disable = task_clock_perf_counter_disable,
1418 .read = task_clock_perf_counter_read,
1419 };
1420
1421 #ifdef CONFIG_VM_EVENT_COUNTERS
1422 #define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
1423 #else
1424 #define cpu_page_faults() 0
1425 #endif
1426
1427 static u64 get_page_faults(struct perf_counter *counter)
1428 {
1429 struct task_struct *curr = counter->ctx->task;
1430
1431 if (curr)
1432 return curr->maj_flt + curr->min_flt;
1433 return cpu_page_faults();
1434 }
1435
1436 static void page_faults_perf_counter_update(struct perf_counter *counter)
1437 {
1438 u64 prev, now;
1439 s64 delta;
1440
1441 prev = atomic64_read(&counter->hw.prev_count);
1442 now = get_page_faults(counter);
1443
1444 atomic64_set(&counter->hw.prev_count, now);
1445
1446 delta = now - prev;
1447
1448 atomic64_add(delta, &counter->count);
1449 }
1450
1451 static void page_faults_perf_counter_read(struct perf_counter *counter)
1452 {
1453 page_faults_perf_counter_update(counter);
1454 }
1455
1456 static int page_faults_perf_counter_enable(struct perf_counter *counter)
1457 {
1458 atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
1459 return 0;
1460 }
1461
1462 static void page_faults_perf_counter_disable(struct perf_counter *counter)
1463 {
1464 page_faults_perf_counter_update(counter);
1465 }
1466
1467 static const struct hw_perf_counter_ops perf_ops_page_faults = {
1468 .enable = page_faults_perf_counter_enable,
1469 .disable = page_faults_perf_counter_disable,
1470 .read = page_faults_perf_counter_read,
1471 };
1472
1473 static u64 get_context_switches(struct perf_counter *counter)
1474 {
1475 struct task_struct *curr = counter->ctx->task;
1476
1477 if (curr)
1478 return curr->nvcsw + curr->nivcsw;
1479 return cpu_nr_switches(smp_processor_id());
1480 }
1481
1482 static void context_switches_perf_counter_update(struct perf_counter *counter)
1483 {
1484 u64 prev, now;
1485 s64 delta;
1486
1487 prev = atomic64_read(&counter->hw.prev_count);
1488 now = get_context_switches(counter);
1489
1490 atomic64_set(&counter->hw.prev_count, now);
1491
1492 delta = now - prev;
1493
1494 atomic64_add(delta, &counter->count);
1495 }
1496
1497 static void context_switches_perf_counter_read(struct perf_counter *counter)
1498 {
1499 context_switches_perf_counter_update(counter);
1500 }
1501
1502 static int context_switches_perf_counter_enable(struct perf_counter *counter)
1503 {
1504 atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
1505 return 0;
1506 }
1507
1508 static void context_switches_perf_counter_disable(struct perf_counter *counter)
1509 {
1510 context_switches_perf_counter_update(counter);
1511 }
1512
1513 static const struct hw_perf_counter_ops perf_ops_context_switches = {
1514 .enable = context_switches_perf_counter_enable,
1515 .disable = context_switches_perf_counter_disable,
1516 .read = context_switches_perf_counter_read,
1517 };
1518
1519 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1520 {
1521 struct task_struct *curr = counter->ctx->task;
1522
1523 if (curr)
1524 return curr->se.nr_migrations;
1525 return cpu_nr_migrations(smp_processor_id());
1526 }
1527
1528 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1529 {
1530 u64 prev, now;
1531 s64 delta;
1532
1533 prev = atomic64_read(&counter->hw.prev_count);
1534 now = get_cpu_migrations(counter);
1535
1536 atomic64_set(&counter->hw.prev_count, now);
1537
1538 delta = now - prev;
1539
1540 atomic64_add(delta, &counter->count);
1541 }
1542
1543 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
1544 {
1545 cpu_migrations_perf_counter_update(counter);
1546 }
1547
1548 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1549 {
1550 atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
1551 return 0;
1552 }
1553
1554 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
1555 {
1556 cpu_migrations_perf_counter_update(counter);
1557 }
1558
1559 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1560 .enable = cpu_migrations_perf_counter_enable,
1561 .disable = cpu_migrations_perf_counter_disable,
1562 .read = cpu_migrations_perf_counter_read,
1563 };
1564
1565 static const struct hw_perf_counter_ops *
1566 sw_perf_counter_init(struct perf_counter *counter)
1567 {
1568 const struct hw_perf_counter_ops *hw_ops = NULL;
1569
1570 /*
1571 * Software counters (currently) can't in general distinguish
1572 * between user, kernel and hypervisor events.
1573 * However, context switches and cpu migrations are considered
1574 * to be kernel events, and page faults are never hypervisor
1575 * events.
1576 */
1577 switch (counter->hw_event.type) {
1578 case PERF_COUNT_CPU_CLOCK:
1579 if (!(counter->hw_event.exclude_user ||
1580 counter->hw_event.exclude_kernel ||
1581 counter->hw_event.exclude_hv))
1582 hw_ops = &perf_ops_cpu_clock;
1583 break;
1584 case PERF_COUNT_TASK_CLOCK:
1585 if (counter->hw_event.exclude_user ||
1586 counter->hw_event.exclude_kernel ||
1587 counter->hw_event.exclude_hv)
1588 break;
1589 /*
1590 * If the user instantiates this as a per-cpu counter,
1591 * use the cpu_clock counter instead.
1592 */
1593 if (counter->ctx->task)
1594 hw_ops = &perf_ops_task_clock;
1595 else
1596 hw_ops = &perf_ops_cpu_clock;
1597 break;
1598 case PERF_COUNT_PAGE_FAULTS:
1599 if (!(counter->hw_event.exclude_user ||
1600 counter->hw_event.exclude_kernel))
1601 hw_ops = &perf_ops_page_faults;
1602 break;
1603 case PERF_COUNT_CONTEXT_SWITCHES:
1604 if (!counter->hw_event.exclude_kernel)
1605 hw_ops = &perf_ops_context_switches;
1606 break;
1607 case PERF_COUNT_CPU_MIGRATIONS:
1608 if (!counter->hw_event.exclude_kernel)
1609 hw_ops = &perf_ops_cpu_migrations;
1610 break;
1611 default:
1612 break;
1613 }
1614 return hw_ops;
1615 }
1616
1617 /*
1618 * Allocate and initialize a counter structure
1619 */
1620 static struct perf_counter *
1621 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1622 int cpu,
1623 struct perf_counter_context *ctx,
1624 struct perf_counter *group_leader,
1625 gfp_t gfpflags)
1626 {
1627 const struct hw_perf_counter_ops *hw_ops;
1628 struct perf_counter *counter;
1629
1630 counter = kzalloc(sizeof(*counter), gfpflags);
1631 if (!counter)
1632 return NULL;
1633
1634 /*
1635 * Single counters are their own group leaders, with an
1636 * empty sibling list:
1637 */
1638 if (!group_leader)
1639 group_leader = counter;
1640
1641 mutex_init(&counter->mutex);
1642 INIT_LIST_HEAD(&counter->list_entry);
1643 INIT_LIST_HEAD(&counter->sibling_list);
1644 init_waitqueue_head(&counter->waitq);
1645
1646 INIT_LIST_HEAD(&counter->child_list);
1647
1648 counter->irqdata = &counter->data[0];
1649 counter->usrdata = &counter->data[1];
1650 counter->cpu = cpu;
1651 counter->hw_event = *hw_event;
1652 counter->wakeup_pending = 0;
1653 counter->group_leader = group_leader;
1654 counter->hw_ops = NULL;
1655 counter->ctx = ctx;
1656
1657 counter->state = PERF_COUNTER_STATE_INACTIVE;
1658 if (hw_event->disabled)
1659 counter->state = PERF_COUNTER_STATE_OFF;
1660
1661 hw_ops = NULL;
1662 if (!hw_event->raw && hw_event->type < 0)
1663 hw_ops = sw_perf_counter_init(counter);
1664 else
1665 hw_ops = hw_perf_counter_init(counter);
1666
1667 if (!hw_ops) {
1668 kfree(counter);
1669 return NULL;
1670 }
1671 counter->hw_ops = hw_ops;
1672
1673 return counter;
1674 }
1675
1676 /**
1677	 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
1678 *
1679 * @hw_event_uptr: event type attributes for monitoring/sampling
1680 * @pid: target pid
1681 * @cpu: target cpu
1682 * @group_fd: group leader counter fd
1683 */
1684 asmlinkage int
1685 sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
1686 pid_t pid, int cpu, int group_fd)
1687 {
1688 struct perf_counter *counter, *group_leader;
1689 struct perf_counter_hw_event hw_event;
1690 struct perf_counter_context *ctx;
1691 struct file *counter_file = NULL;
1692 struct file *group_file = NULL;
1693 int fput_needed = 0;
1694 int fput_needed2 = 0;
1695 int ret;
1696
1697 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
1698 return -EFAULT;
1699
1700 /*
1701 * Get the target context (task or percpu):
1702 */
1703 ctx = find_get_context(pid, cpu);
1704 if (IS_ERR(ctx))
1705 return PTR_ERR(ctx);
1706
1707 /*
1708 * Look up the group leader (we will attach this counter to it):
1709 */
1710 group_leader = NULL;
1711 if (group_fd != -1) {
1712 ret = -EINVAL;
1713 group_file = fget_light(group_fd, &fput_needed);
1714 if (!group_file)
1715 goto err_put_context;
1716 if (group_file->f_op != &perf_fops)
1717 goto err_put_context;
1718
1719 group_leader = group_file->private_data;
1720 /*
1721 * Do not allow a recursive hierarchy (this new sibling
1722 * becoming part of another group-sibling):
1723 */
1724 if (group_leader->group_leader != group_leader)
1725 goto err_put_context;
1726 /*
1727 * Do not allow to attach to a group in a different
1728 * task or CPU context:
1729 */
1730 if (group_leader->ctx != ctx)
1731 goto err_put_context;
1732 /*
1733 * Only a group leader can be exclusive or pinned
1734 */
1735 if (hw_event.exclusive || hw_event.pinned)
1736 goto err_put_context;
1737 }
1738
1739 ret = -EINVAL;
1740 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
1741 GFP_KERNEL);
1742 if (!counter)
1743 goto err_put_context;
1744
1745 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
1746 if (ret < 0)
1747 goto err_free_put_context;
1748
1749 counter_file = fget_light(ret, &fput_needed2);
1750 if (!counter_file)
1751 goto err_free_put_context;
1752
1753 counter->filp = counter_file;
1754 mutex_lock(&ctx->mutex);
1755 perf_install_in_context(ctx, counter, cpu);
1756 mutex_unlock(&ctx->mutex);
1757
1758 fput_light(counter_file, fput_needed2);
1759
1760 out_fput:
1761 fput_light(group_file, fput_needed);
1762
1763 return ret;
1764
1765 err_free_put_context:
1766 kfree(counter);
1767
1768 err_put_context:
1769 put_context(ctx);
1770
1771 goto out_fput;
1772 }
1773
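An end-to-end usage sketch of this syscall. The __NR_perf_counter_open number and the exact userspace header are assumptions (they varied by architecture and tree at this stage, and this ABI predates perf_event_open); the structure fields, ioctls and read semantics match the code above:

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	int main(void)
	{
		struct perf_counter_hw_event hw_event = {
			.type		= PERF_COUNT_TASK_CLOCK,
			.record_type	= PERF_RECORD_SIMPLE,
			.disabled	= 1,		/* start off; enable via ioctl */
		};
		uint64_t count;
		int fd;

		/* pid 0 = current task, cpu -1 = any cpu, group_fd -1 = no group */
		fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1);
		if (fd < 0)
			return 1;

		ioctl(fd, PERF_COUNTER_IOC_ENABLE);
		/* ... run the workload to be measured ... */
		ioctl(fd, PERF_COUNTER_IOC_DISABLE);

		/* PERF_RECORD_SIMPLE: read() wants exactly sizeof(u64) */
		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("task clock: %llu ns\n", (unsigned long long)count);
		close(fd);
		return 0;
	}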
1774 /*
1775 * Initialize the perf_counter context in a task_struct:
1776 */
1777 static void
1778 __perf_counter_init_context(struct perf_counter_context *ctx,
1779 struct task_struct *task)
1780 {
1781 memset(ctx, 0, sizeof(*ctx));
1782 spin_lock_init(&ctx->lock);
1783 mutex_init(&ctx->mutex);
1784 INIT_LIST_HEAD(&ctx->counter_list);
1785 ctx->task = task;
1786 }
1787
1788 /*
1789 * inherit a counter from parent task to child task:
1790 */
1791 static struct perf_counter *
1792 inherit_counter(struct perf_counter *parent_counter,
1793 struct task_struct *parent,
1794 struct perf_counter_context *parent_ctx,
1795 struct task_struct *child,
1796 struct perf_counter *group_leader,
1797 struct perf_counter_context *child_ctx)
1798 {
1799 struct perf_counter *child_counter;
1800
1801 /*
1802 * Instead of creating recursive hierarchies of counters,
1803 * we link inherited counters back to the original parent,
1804 * which has a filp for sure, which we use as the reference
1805 * count:
1806 */
1807 if (parent_counter->parent)
1808 parent_counter = parent_counter->parent;
1809
1810 child_counter = perf_counter_alloc(&parent_counter->hw_event,
1811 parent_counter->cpu, child_ctx,
1812 group_leader, GFP_KERNEL);
1813 if (!child_counter)
1814 return NULL;
1815
1816 /*
1817 * Link it up in the child's context:
1818 */
1819 child_counter->task = child;
1820 list_add_counter(child_counter, child_ctx);
1821 child_ctx->nr_counters++;
1822
1823 child_counter->parent = parent_counter;
1824 /*
1825 * inherit into child's child as well:
1826 */
1827 child_counter->hw_event.inherit = 1;
1828
1829 /*
1830 * Get a reference to the parent filp - we will fput it
1831 * when the child counter exits. This is safe to do because
1832 * we are in the parent and we know that the filp still
1833 * exists and has a nonzero count:
1834 */
1835 atomic_long_inc(&parent_counter->filp->f_count);
1836
1837 /*
1838 * Link this into the parent counter's child list
1839 */
1840 mutex_lock(&parent_counter->mutex);
1841 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
1842
1843 /*
1844 * Make the child state follow the state of the parent counter,
1845 * not its hw_event.disabled bit. We hold the parent's mutex,
1846 * so we won't race with perf_counter_{en,dis}able_family.
1847 */
1848 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
1849 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
1850 else
1851 child_counter->state = PERF_COUNTER_STATE_OFF;
1852
1853 mutex_unlock(&parent_counter->mutex);
1854
1855 return child_counter;
1856 }
1857
1858 static int inherit_group(struct perf_counter *parent_counter,
1859 struct task_struct *parent,
1860 struct perf_counter_context *parent_ctx,
1861 struct task_struct *child,
1862 struct perf_counter_context *child_ctx)
1863 {
1864 struct perf_counter *leader;
1865 struct perf_counter *sub;
1866
1867 leader = inherit_counter(parent_counter, parent, parent_ctx,
1868 child, NULL, child_ctx);
1869 if (!leader)
1870 return -ENOMEM;
1871 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
1872 if (!inherit_counter(sub, parent, parent_ctx,
1873 child, leader, child_ctx))
1874 return -ENOMEM;
1875 }
1876 return 0;
1877 }
1878
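Inheritance is opt-in per counter: a parent that wants fork()ed children included sets hw_event.inherit before opening the counter, and perf_counter_init_task() below then clones the whole group into each child. A minimal attribute setup, reusing the field names from this file:

	struct perf_counter_hw_event hw_event = {
		.type		= PERF_COUNT_TASK_CLOCK,
		.record_type	= PERF_RECORD_SIMPLE,
		.inherit	= 1,	/* clone into children at fork() */
	};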
1879 static void sync_child_counter(struct perf_counter *child_counter,
1880 struct perf_counter *parent_counter)
1881 {
1882 u64 parent_val, child_val;
1883
1884 parent_val = atomic64_read(&parent_counter->count);
1885 child_val = atomic64_read(&child_counter->count);
1886
1887 /*
1888 * Add back the child's count to the parent's count:
1889 */
1890 atomic64_add(child_val, &parent_counter->count);
1891
1892 /*
1893 * Remove this counter from the parent's list
1894 */
1895 mutex_lock(&parent_counter->mutex);
1896 list_del_init(&child_counter->child_list);
1897 mutex_unlock(&parent_counter->mutex);
1898
1899 /*
1900 * Release the parent counter, if this was the last
1901 * reference to it.
1902 */
1903 fput(parent_counter->filp);
1904 }
1905
1906 static void
1907 __perf_counter_exit_task(struct task_struct *child,
1908 struct perf_counter *child_counter,
1909 struct perf_counter_context *child_ctx)
1910 {
1911 struct perf_counter *parent_counter;
1912 struct perf_counter *sub, *tmp;
1913
1914 /*
1915 * If we do not self-reap then we have to wait for the
1916 * child task to unschedule (it will happen for sure),
1917 * so that its counter is at its final count. (This
1918 * condition triggers rarely - child tasks usually get
1919 * off their CPU before the parent has a chance to
1920 * get this far into the reaping action)
1921 */
1922 if (child != current) {
1923 wait_task_inactive(child, 0);
1924 list_del_init(&child_counter->list_entry);
1925 } else {
1926 struct perf_cpu_context *cpuctx;
1927 unsigned long flags;
1928 u64 perf_flags;
1929
1930 /*
1931 * Disable and unlink this counter.
1932 *
1933 * Be careful about zapping the list - IRQ/NMI context
1934 * could still be processing it:
1935 */
1936 curr_rq_lock_irq_save(&flags);
1937 perf_flags = hw_perf_save_disable();
1938
1939 cpuctx = &__get_cpu_var(perf_cpu_context);
1940
1941 group_sched_out(child_counter, cpuctx, child_ctx);
1942
1943 list_del_init(&child_counter->list_entry);
1944
1945 child_ctx->nr_counters--;
1946
1947 hw_perf_restore(perf_flags);
1948 curr_rq_unlock_irq_restore(&flags);
1949 }
1950
1951 parent_counter = child_counter->parent;
1952 /*
1953 * It can happen that parent exits first, and has counters
1954 * that are still around due to the child reference. These
1955 * counters need to be zapped - but otherwise linger.
1956 */
1957 if (parent_counter) {
1958 sync_child_counter(child_counter, parent_counter);
1959 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
1960 list_entry) {
1961 if (sub->parent) {
1962 sync_child_counter(sub, sub->parent);
1963 kfree(sub);
1964 }
1965 }
1966 kfree(child_counter);
1967 }
1968 }
1969
1970 /*
1971 * When a child task exits, feed back counter values to parent counters.
1972 *
1973 * Note: we may be running in child context, but the PID is not hashed
1974 * anymore so new counters will not be added.
1975 */
1976 void perf_counter_exit_task(struct task_struct *child)
1977 {
1978 struct perf_counter *child_counter, *tmp;
1979 struct perf_counter_context *child_ctx;
1980
1981 child_ctx = &child->perf_counter_ctx;
1982
1983 if (likely(!child_ctx->nr_counters))
1984 return;
1985
1986 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
1987 list_entry)
1988 __perf_counter_exit_task(child, child_counter, child_ctx);
1989 }
1990
1991 /*
1992 * Initialize the perf_counter context in task_struct
1993 */
1994 void perf_counter_init_task(struct task_struct *child)
1995 {
1996 struct perf_counter_context *child_ctx, *parent_ctx;
1997 struct perf_counter *counter;
1998 struct task_struct *parent = current;
1999
2000 child_ctx = &child->perf_counter_ctx;
2001 parent_ctx = &parent->perf_counter_ctx;
2002
2003 __perf_counter_init_context(child_ctx, child);
2004
2005 /*
2006 * This is executed from the parent task context, so inherit
2007 * counters that have been marked for cloning:
2008 */
2009
2010 if (likely(!parent_ctx->nr_counters))
2011 return;
2012
2013 /*
2014 * Lock the parent list. No need to lock the child - not PID
2015 * hashed yet and not running, so nobody can access it.
2016 */
2017 mutex_lock(&parent_ctx->mutex);
2018
2019 /*
2020	 * We don't have to disable NMIs - we are only looking at
2021 * the list, not manipulating it:
2022 */
2023 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2024 if (!counter->hw_event.inherit)
2025 continue;
2026
2027 if (inherit_group(counter, parent,
2028 parent_ctx, child, child_ctx))
2029 break;
2030 }
2031
2032 mutex_unlock(&parent_ctx->mutex);
2033 }
2034
2035 static void __cpuinit perf_counter_init_cpu(int cpu)
2036 {
2037 struct perf_cpu_context *cpuctx;
2038
2039 cpuctx = &per_cpu(perf_cpu_context, cpu);
2040 __perf_counter_init_context(&cpuctx->ctx, NULL);
2041
2042 mutex_lock(&perf_resource_mutex);
2043 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2044 mutex_unlock(&perf_resource_mutex);
2045
2046 hw_perf_counter_setup(cpu);
2047 }
2048
2049 #ifdef CONFIG_HOTPLUG_CPU
2050 static void __perf_counter_exit_cpu(void *info)
2051 {
2052 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2053 struct perf_counter_context *ctx = &cpuctx->ctx;
2054 struct perf_counter *counter, *tmp;
2055
2056 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2057 __perf_counter_remove_from_context(counter);
2058 }
2059 static void perf_counter_exit_cpu(int cpu)
2060 {
2061 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2062 struct perf_counter_context *ctx = &cpuctx->ctx;
2063
2064 mutex_lock(&ctx->mutex);
2065 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2066 mutex_unlock(&ctx->mutex);
2067 }
2068 #else
2069 static inline void perf_counter_exit_cpu(int cpu) { }
2070 #endif
2071
2072 static int __cpuinit
2073 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2074 {
2075 unsigned int cpu = (long)hcpu;
2076
2077 switch (action) {
2078
2079 case CPU_UP_PREPARE:
2080 case CPU_UP_PREPARE_FROZEN:
2081 perf_counter_init_cpu(cpu);
2082 break;
2083
2084 case CPU_DOWN_PREPARE:
2085 case CPU_DOWN_PREPARE_FROZEN:
2086 perf_counter_exit_cpu(cpu);
2087 break;
2088
2089 default:
2090 break;
2091 }
2092
2093 return NOTIFY_OK;
2094 }
2095
2096 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2097 .notifier_call = perf_cpu_notify,
2098 };
2099
2100 static int __init perf_counter_init(void)
2101 {
2102 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2103 (void *)(long)smp_processor_id());
2104 register_cpu_notifier(&perf_cpu_nb);
2105
2106 return 0;
2107 }
2108 early_initcall(perf_counter_init);
2109
2110 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2111 {
2112 return sprintf(buf, "%d\n", perf_reserved_percpu);
2113 }
2114
2115 static ssize_t
2116 perf_set_reserve_percpu(struct sysdev_class *class,
2117 const char *buf,
2118 size_t count)
2119 {
2120 struct perf_cpu_context *cpuctx;
2121 unsigned long val;
2122 int err, cpu, mpt;
2123
2124 err = strict_strtoul(buf, 10, &val);
2125 if (err)
2126 return err;
2127 if (val > perf_max_counters)
2128 return -EINVAL;
2129
2130 mutex_lock(&perf_resource_mutex);
2131 perf_reserved_percpu = val;
2132 for_each_online_cpu(cpu) {
2133 cpuctx = &per_cpu(perf_cpu_context, cpu);
2134 spin_lock_irq(&cpuctx->ctx.lock);
2135 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2136 perf_max_counters - perf_reserved_percpu);
2137 cpuctx->max_pertask = mpt;
2138 spin_unlock_irq(&cpuctx->ctx.lock);
2139 }
2140 mutex_unlock(&perf_resource_mutex);
2141
2142 return count;
2143 }
2144
2145 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2146 {
2147 return sprintf(buf, "%d\n", perf_overcommit);
2148 }
2149
2150 static ssize_t
2151 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2152 {
2153 unsigned long val;
2154 int err;
2155
2156 err = strict_strtoul(buf, 10, &val);
2157 if (err)
2158 return err;
2159 if (val > 1)
2160 return -EINVAL;
2161
2162 mutex_lock(&perf_resource_mutex);
2163 perf_overcommit = val;
2164 mutex_unlock(&perf_resource_mutex);
2165
2166 return count;
2167 }
2168
2169 static SYSDEV_CLASS_ATTR(
2170 reserve_percpu,
2171 0644,
2172 perf_show_reserve_percpu,
2173 perf_set_reserve_percpu
2174 );
2175
2176 static SYSDEV_CLASS_ATTR(
2177 overcommit,
2178 0644,
2179 perf_show_overcommit,
2180 perf_set_overcommit
2181 );
2182
2183 static struct attribute *perfclass_attrs[] = {
2184 &attr_reserve_percpu.attr,
2185 &attr_overcommit.attr,
2186 NULL
2187 };
2188
2189 static struct attribute_group perfclass_attr_group = {
2190 .attrs = perfclass_attrs,
2191 .name = "perf_counters",
2192 };
2193
2194 static int __init perf_counter_sysfs_init(void)
2195 {
2196 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2197 &perfclass_attr_group);
2198 }
2199 device_initcall(perf_counter_sysfs_init);
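The two class attributes register under the cpu sysdev class, so with this naming they should surface as reserve_percpu and overcommit in a perf_counters directory under the cpu class in sysfs; the exact path below is inferred from cpu_sysdev_class and the group name, so treat it as an assumption. A minimal sketch of reserving two hardware counters per CPU:

	#include <stdio.h>

	int main(void)
	{
		/* path inferred from cpu_sysdev_class + "perf_counters" group */
		FILE *f = fopen("/sys/devices/system/cpu/perf_counters/reserve_percpu", "w");

		if (!f)
			return 1;
		fprintf(f, "2\n");	/* keep 2 hw counters per cpu for percpu counters */
		fclose(f);
		return 0;
	}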