arch/powerpc/kernel/perf_counter.c

   1 /*
   2  * Performance counter support - powerpc architecture code
   3  *
   4  * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version
   9  * 2 of the License, or (at your option) any later version.
  10  */
  11 #include <linux/kernel.h>
  12 #include <linux/sched.h>
  13 #include <linux/perf_counter.h>
  14 #include <linux/percpu.h>
  15 #include <linux/hardirq.h>
  16 #include <asm/reg.h>
  17 #include <asm/pmc.h>
  18 #include <asm/machdep.h>
  19 #include <asm/firmware.h>
  20
  21 struct cpu_hw_counters {
  22         int n_counters;
  23         int n_percpu;
  24         int disabled;
  25         int n_added;
  26         int n_limited;
  27         u8  pmcs_enabled;
  28         struct perf_counter *counter[MAX_HWCOUNTERS];
  29         unsigned int events[MAX_HWCOUNTERS];
  30         unsigned int flags[MAX_HWCOUNTERS];
  31         u64 mmcr[3];
  32         struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
  33         u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
  34 };
  35 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
  36
  37 struct power_pmu *ppmu;
  38
  39 /*
  40  * Normally, to ignore kernel events we set the FCS (freeze counters
  41  * in supervisor mode) bit in MMCR0, but if the kernel runs with the
  42  * hypervisor bit set in the MSR, or if we are running on a processor
  43  * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
  44  * then we need to use the FCHV bit to ignore kernel events.
  45  */
  46 static unsigned int freeze_counters_kernel = MMCR0_FCS;
  47
  48 static void perf_counter_interrupt(struct pt_regs *regs);
  49
  50 void perf_counter_print_debug(void)
  51 {
  52 }
  53
  54 /*
  55  * Read one performance monitor counter (PMC).
  56  */
  57 static unsigned long read_pmc(int idx)
  58 {
  59         unsigned long val;
  60
  61         switch (idx) {
  62         case 1:
  63                 val = mfspr(SPRN_PMC1);
  64                 break;
  65         case 2:
  66                 val = mfspr(SPRN_PMC2);
  67                 break;
  68         case 3:
  69                 val = mfspr(SPRN_PMC3);
  70                 break;
  71         case 4:
  72                 val = mfspr(SPRN_PMC4);
  73                 break;
  74         case 5:
  75                 val = mfspr(SPRN_PMC5);
  76                 break;
  77         case 6:
  78                 val = mfspr(SPRN_PMC6);
  79                 break;
  80         case 7:
  81                 val = mfspr(SPRN_PMC7);
  82                 break;
  83         case 8:
  84                 val = mfspr(SPRN_PMC8);
  85                 break;
  86         default:
  87                 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
  88                 val = 0;
  89         }
  90         return val;
  91 }
  92
  93 /*
  94  * Write one PMC.
  95  */
  96 static void write_pmc(int idx, unsigned long val)
  97 {
  98         switch (idx) {
  99         case 1:
 100                 mtspr(SPRN_PMC1, val);
 101                 break;
 102         case 2:
 103                 mtspr(SPRN_PMC2, val);
 104                 break;
 105         case 3:
 106                 mtspr(SPRN_PMC3, val);
 107                 break;
 108         case 4:
 109                 mtspr(SPRN_PMC4, val);
 110                 break;
 111         case 5:
 112                 mtspr(SPRN_PMC5, val);
 113                 break;
 114         case 6:
 115                 mtspr(SPRN_PMC6, val);
 116                 break;
 117         case 7:
 118                 mtspr(SPRN_PMC7, val);
 119                 break;
 120         case 8:
 121                 mtspr(SPRN_PMC8, val);
 122                 break;
 123         default:
 124                 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
 125         }
 126 }
 127
 128 /*
 129  * Check if a set of events can all go on the PMU at once.
 130  * If they can't, this will look at alternative codes for the events
 131  * and see if any combination of alternative codes is feasible.
 132  * The feasible set is returned in event[].
 133  */
 134 static int power_check_constraints(unsigned int event[], unsigned int cflags[],
 135                                    int n_ev)
 136 {
 137         u64 mask, value, nv;
 138         unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 139         u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 140         u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 141         u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
 142         int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
 143         int i, j;
 144         u64 addf = ppmu->add_fields;
 145         u64 tadd = ppmu->test_adder;
 146
 147         if (n_ev > ppmu->n_counter)
 148                 return -1;
 149
 150         /* First see if the events will go on as-is */
 151         for (i = 0; i < n_ev; ++i) {
 152                 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
 153                     && !ppmu->limited_pmc_event(event[i])) {
 154                         ppmu->get_alternatives(event[i], cflags[i],
 155                                                alternatives[i]);
 156                         event[i] = alternatives[i][0];
 157                 }
 158                 if (ppmu->get_constraint(event[i], &amasks[i][0],
 159                                          &avalues[i][0]))
 160                         return -1;
 161         }
 162         value = mask = 0;
 163         for (i = 0; i < n_ev; ++i) {
 164                 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
 165                 if ((((nv + tadd) ^ value) & mask) != 0 ||
 166                     (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
 167                         break;
 168                 value = nv;
 169                 mask |= amasks[i][0];
 170         }
 171         if (i == n_ev)
 172                 return 0;       /* all OK */
 173
 174         /* doesn't work, gather alternatives... */
 175         if (!ppmu->get_alternatives)
 176                 return -1;
 177         for (i = 0; i < n_ev; ++i) {
 178                 choice[i] = 0;
 179                 n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
 180                                                   alternatives[i]);
 181                 for (j = 1; j < n_alt[i]; ++j)
 182                         ppmu->get_constraint(alternatives[i][j],
 183                                              &amasks[i][j], &avalues[i][j]);
 184         }
 185
 186         /* enumerate all possibilities and see if any will work */
 187         i = 0;
 188         j = -1;
 189         value = mask = nv = 0;
 190         while (i < n_ev) {
 191                 if (j >= 0) {
 192                         /* we're backtracking, restore context */
 193                         value = svalues[i];
 194                         mask = smasks[i];
 195                         j = choice[i];
 196                 }
 197                 /*
 198                  * See if any alternative k for event i,
 199                  * where k > j, will satisfy the constraints.
 200                  */
 201                 while (++j < n_alt[i]) {
 202                         nv = (value | avalues[i][j]) +
 203                                 (value & avalues[i][j] & addf);
 204                         if ((((nv + tadd) ^ value) & mask) == 0 &&
 205                             (((nv + tadd) ^ avalues[i][j])
 206                              & amasks[i][j]) == 0)
 207                                 break;
 208                 }
 209                 if (j >= n_alt[i]) {
 210                         /*
 211                          * No feasible alternative, backtrack
 212                          * to event i-1 and continue enumerating its
 213                          * alternatives from where we got up to.
 214                          */
 215                         if (--i < 0)
 216                                 return -1;
 217                 } else {
 218                         /*
 219                          * Found a feasible alternative for event i,
 220                          * remember where we got up to with this event,
 221                          * go on to the next event, and start with
 222                          * the first alternative for it.
 223                          */
 224                         choice[i] = j;
 225                         svalues[i] = value;
 226                         smasks[i] = mask;
 227                         value = nv;
 228                         mask |= amasks[i][j];
 229                         ++i;
 230                         j = -1;
 231                 }
 232         }
 233
 234         /* OK, we have a feasible combination, tell the caller the solution */
 235         for (i = 0; i < n_ev; ++i)
 236                 event[i] = alternatives[i][choice[i]];
 237         return 0;
 238 }
 239
 240 /*
 241  * Check if newly-added counters have consistent settings for
 242  * exclude_{user,kernel,hv} with each other and any previously
 243  * added counters.
 244  */
 245 static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
 246                           int n_prev, int n_new)
 247 {
 248         int eu = 0, ek = 0, eh = 0;
 249         int i, n, first;
 250         struct perf_counter *counter;
 251
 252         n = n_prev + n_new;
 253         if (n <= 1)
 254                 return 0;
 255
 256         first = 1;
 257         for (i = 0; i < n; ++i) {
 258                 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
 259                         cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
 260                         continue;
 261                 }
 262                 counter = ctrs[i];
 263                 if (first) {
 264                         eu = counter->hw_event.exclude_user;
 265                         ek = counter->hw_event.exclude_kernel;
 266                         eh = counter->hw_event.exclude_hv;
 267                         first = 0;
 268                 } else if (counter->hw_event.exclude_user != eu ||
 269                            counter->hw_event.exclude_kernel != ek ||
 270                            counter->hw_event.exclude_hv != eh) {
 271                         return -EAGAIN;
 272                 }
 273         }
 274
 275         if (eu || ek || eh)
 276                 for (i = 0; i < n; ++i)
 277                         if (cflags[i] & PPMU_LIMITED_PMC_OK)
 278                                 cflags[i] |= PPMU_LIMITED_PMC_REQD;
 279
 280         return 0;
 281 }
 282
 283 static void power_pmu_read(struct perf_counter *counter)
 284 {
 285         long val, delta, prev;
 286
 287         if (!counter->hw.idx)
 288                 return;
 289         /*
 290          * Performance monitor interrupts come even when interrupts
 291          * are soft-disabled, as long as interrupts are hard-enabled.
 292          * Therefore we treat them like NMIs.
 293          */
 294         do {
 295                 prev = atomic64_read(&counter->hw.prev_count);
 296                 barrier();
 297                 val = read_pmc(counter->hw.idx);
 298         } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
 299
 300         /* The counters are only 32 bits wide */
 301         delta = (val - prev) & 0xfffffffful;
 302         atomic64_add(delta, &counter->count);
 303         atomic64_sub(delta, &counter->hw.period_left);
 304 }
 305
 306 /*
 307  * On some machines, PMC5 and PMC6 can't be written, don't respect
 308  * the freeze conditions, and don't generate interrupts.  This tells
 309  * us if `counter' is using such a PMC.
 310  */
 311 static int is_limited_pmc(int pmcnum)
 312 {
 313         return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
 314 }
 315
 316 static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
 317                                     unsigned long pmc5, unsigned long pmc6)
 318 {
 319         struct perf_counter *counter;
 320         u64 val, prev, delta;
 321         int i;
 322
 323         for (i = 0; i < cpuhw->n_limited; ++i) {
 324                 counter = cpuhw->limited_counter[i];
 325                 if (!counter->hw.idx)
 326                         continue;
 327                 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
 328                 prev = atomic64_read(&counter->hw.prev_count);
 329                 counter->hw.idx = 0;
 330                 delta = (val - prev) & 0xfffffffful;
 331                 atomic64_add(delta, &counter->count);
 332         }
 333 }
 334
 335 static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
 336                                   unsigned long pmc5, unsigned long pmc6)
 337 {
 338         struct perf_counter *counter;
 339         u64 val;
 340         int i;
 341
 342         for (i = 0; i < cpuhw->n_limited; ++i) {
 343                 counter = cpuhw->limited_counter[i];
 344                 counter->hw.idx = cpuhw->limited_hwidx[i];
 345                 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
 346                 atomic64_set(&counter->hw.prev_count, val);
 347                 perf_counter_update_userpage(counter);
 348         }
 349 }
 350
 351 /*
 352  * Since limited counters don't respect the freeze conditions, we
 353  * have to read them immediately after freezing or unfreezing the
 354  * other counters.  We try to keep the values from the limited
 355  * counters as consistent as possible by keeping the delay (in
 356  * cycles and instructions) between freezing/unfreezing and reading
 357  * the limited counters as small and consistent as possible.
 358  * Therefore, if any limited counters are in use, we read them
 359  * both, and always in the same order, to minimize variability,
 360  * and do it inside the same asm that writes MMCR0.
 361  */
 362 static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
 363 {
 364         unsigned long pmc5, pmc6;
 365
 366         if (!cpuhw->n_limited) {
 367                 mtspr(SPRN_MMCR0, mmcr0);
 368                 return;
 369         }
 370
 371         /*
 372          * Write MMCR0, then read PMC5 and PMC6 immediately.
 373          */
 374         asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
 375                      : "=&r" (pmc5), "=&r" (pmc6)
 376                      : "r" (mmcr0), "i" (SPRN_MMCR0),
 377                        "i" (SPRN_PMC5), "i" (SPRN_PMC6));
 378
 379         if (mmcr0 & MMCR0_FC)
 380                 freeze_limited_counters(cpuhw, pmc5, pmc6);
 381         else
 382                 thaw_limited_counters(cpuhw, pmc5, pmc6);
 383 }
 384
 385 /*
 386  * Disable all counters to prevent PMU interrupts and to allow
 387  * counters to be added or removed.
 388  */
 389 void hw_perf_disable(void)
 390 {
 391         struct cpu_hw_counters *cpuhw;
 392         unsigned long ret;
 393         unsigned long flags;
 394
 395         local_irq_save(flags);
 396         cpuhw = &__get_cpu_var(cpu_hw_counters);
 397
 398         ret = cpuhw->disabled;
 399         if (!ret) {
 400                 cpuhw->disabled = 1;
 401                 cpuhw->n_added = 0;
 402
 403                 /*
 404                  * Check if we ever enabled the PMU on this cpu.
 405                  */
 406                 if (!cpuhw->pmcs_enabled) {
 407                         if (ppc_md.enable_pmcs)
 408                                 ppc_md.enable_pmcs();
 409                         cpuhw->pmcs_enabled = 1;
 410                 }
 411
 412                 /*
 413                  * Disable instruction sampling if it was enabled
 414                  */
 415                 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
 416                         mtspr(SPRN_MMCRA,
 417                               cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 418                         mb();
 419                 }
 420
 421                 /*
 422                  * Set the 'freeze counters' bit.
 423                  * The barrier is to make sure the mtspr has been
 424                  * executed and the PMU has frozen the counters
 425                  * before we return.
 426                  */
 427                 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
 428                 mb();
 429         }
 430         local_irq_restore(flags);
 431 }
 432
 433 /*
 434  * Re-enable all counters if disable == 0.
 435  * If we were previously disabled and counters were added, then
 436  * put the new config on the PMU.
 437  */
 438 void hw_perf_enable(void)
 439 {
 440         struct perf_counter *counter;
 441         struct cpu_hw_counters *cpuhw;
 442         unsigned long flags;
 443         long i;
 444         unsigned long val;
 445         s64 left;
 446         unsigned int hwc_index[MAX_HWCOUNTERS];
 447         int n_lim;
 448         int idx;
 449
 450         local_irq_save(flags);
 451         if (!cpuhw->disabled) {
 452                 local_irq_restore(flags);
 453                 return;
 454         }
 455
 456         cpuhw = &__get_cpu_var(cpu_hw_counters);
 457         cpuhw->disabled = 0;
 458
 459         /*
 460          * If we didn't change anything, or only removed counters,
 461          * no need to recalculate MMCR* settings and reset the PMCs.
 462          * Just reenable the PMU with the current MMCR* settings
 463          * (possibly updated for removal of counters).
 464          */
 465         if (!cpuhw->n_added) {
 466                 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 467                 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 468                 if (cpuhw->n_counters == 0)
 469                         get_lppaca()->pmcregs_in_use = 0;
 470                 goto out_enable;
 471         }
 472
 473         /*
 474          * Compute MMCR* values for the new set of counters
 475          */
 476         if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
 477                                cpuhw->mmcr)) {
 478                 /* shouldn't ever get here */
 479                 printk(KERN_ERR "oops compute_mmcr failed\n");
 480                 goto out;
 481         }
 482
 483         /*
 484          * Add in MMCR0 freeze bits corresponding to the
 485          * hw_event.exclude_* bits for the first counter.
 486          * We have already checked that all counters have the
 487          * same values for these bits as the first counter.
 488          */
 489         counter = cpuhw->counter[0];
 490         if (counter->hw_event.exclude_user)
 491                 cpuhw->mmcr[0] |= MMCR0_FCP;
 492         if (counter->hw_event.exclude_kernel)
 493                 cpuhw->mmcr[0] |= freeze_counters_kernel;
 494         if (counter->hw_event.exclude_hv)
 495                 cpuhw->mmcr[0] |= MMCR0_FCHV;
 496
 497         /*
 498          * Write the new configuration to MMCR* with the freeze
 499          * bit set and set the hardware counters to their initial values.
 500          * Then unfreeze the counters.
 501          */
 502         get_lppaca()->pmcregs_in_use = 1;
 503         mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 504         mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 505         mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
 506                                 | MMCR0_FC);
 507
 508         /*
 509          * Read off any pre-existing counters that need to move
 510          * to another PMC.
 511          */
 512         for (i = 0; i < cpuhw->n_counters; ++i) {
 513                 counter = cpuhw->counter[i];
 514                 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
 515                         power_pmu_read(counter);
 516                         write_pmc(counter->hw.idx, 0);
 517                         counter->hw.idx = 0;
 518                 }
 519         }
 520
 521         /*
 522          * Initialize the PMCs for all the new and moved counters.
 523          */
 524         cpuhw->n_limited = n_lim = 0;
 525         for (i = 0; i < cpuhw->n_counters; ++i) {
 526                 counter = cpuhw->counter[i];
 527                 if (counter->hw.idx)
 528                         continue;
 529                 idx = hwc_index[i] + 1;
 530                 if (is_limited_pmc(idx)) {
 531                         cpuhw->limited_counter[n_lim] = counter;
 532                         cpuhw->limited_hwidx[n_lim] = idx;
 533                         ++n_lim;
 534                         continue;
 535                 }
 536                 val = 0;
 537                 if (counter->hw.irq_period) {
 538                         left = atomic64_read(&counter->hw.period_left);
 539                         if (left < 0x80000000L)
 540                                 val = 0x80000000L - left;
 541                 }
 542                 atomic64_set(&counter->hw.prev_count, val);
 543                 counter->hw.idx = idx;
 544                 write_pmc(idx, val);
 545                 perf_counter_update_userpage(counter);
 546         }
 547         cpuhw->n_limited = n_lim;
 548         cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 549
 550  out_enable:
 551         mb();
 552         write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 553
 554         /*
 555          * Enable instruction sampling if necessary
 556          */
 557         if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
 558                 mb();
 559                 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 560         }
 561
 562  out:
 563         local_irq_restore(flags);
 564 }
 565
 566 static int collect_events(struct perf_counter *group, int max_count,
 567                           struct perf_counter *ctrs[], unsigned int *events,
 568                           unsigned int *flags)
 569 {
 570         int n = 0;
 571         struct perf_counter *counter;
 572
 573         if (!is_software_counter(group)) {
 574                 if (n >= max_count)
 575                         return -1;
 576                 ctrs[n] = group;
 577                 flags[n] = group->hw.counter_base;
 578                 events[n++] = group->hw.config;
 579         }
 580         list_for_each_entry(counter, &group->sibling_list, list_entry) {
 581                 if (!is_software_counter(counter) &&
 582                     counter->state != PERF_COUNTER_STATE_OFF) {
 583                         if (n >= max_count)
 584                                 return -1;
 585                         ctrs[n] = counter;
 586                         flags[n] = counter->hw.counter_base;
 587                         events[n++] = counter->hw.config;
 588                 }
 589         }
 590         return n;
 591 }
 592
 593 static void counter_sched_in(struct perf_counter *counter, int cpu)
 594 {
 595         counter->state = PERF_COUNTER_STATE_ACTIVE;
 596         counter->oncpu = cpu;
 597         counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 598         if (is_software_counter(counter))
 599                 counter->pmu->enable(counter);
 600 }
 601
 602 /*
 603  * Called to enable a whole group of counters.
 604  * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
 605  * Assumes the caller has disabled interrupts and has
 606  * frozen the PMU with hw_perf_save_disable.
 607  */
 608 int hw_perf_group_sched_in(struct perf_counter *group_leader,
 609                struct perf_cpu_context *cpuctx,
 610                struct perf_counter_context *ctx, int cpu)
 611 {
 612         struct cpu_hw_counters *cpuhw;
 613         long i, n, n0;
 614         struct perf_counter *sub;
 615
 616         cpuhw = &__get_cpu_var(cpu_hw_counters);
 617         n0 = cpuhw->n_counters;
 618         n = collect_events(group_leader, ppmu->n_counter - n0,
 619                            &cpuhw->counter[n0], &cpuhw->events[n0],
 620                            &cpuhw->flags[n0]);
 621         if (n < 0)
 622                 return -EAGAIN;
 623         if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
 624                 return -EAGAIN;
 625         i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
 626         if (i < 0)
 627                 return -EAGAIN;
 628         cpuhw->n_counters = n0 + n;
 629         cpuhw->n_added += n;
 630
 631         /*
 632          * OK, this group can go on; update counter states etc.,
 633          * and enable any software counters
 634          */
 635         for (i = n0; i < n0 + n; ++i)
 636                 cpuhw->counter[i]->hw.config = cpuhw->events[i];
 637         cpuctx->active_oncpu += n;
 638         n = 1;
 639         counter_sched_in(group_leader, cpu);
 640         list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
 641                 if (sub->state != PERF_COUNTER_STATE_OFF) {
 642                         counter_sched_in(sub, cpu);
 643                         ++n;
 644                 }
 645         }
 646         ctx->nr_active += n;
 647
 648         return 1;
 649 }
 650
 651 /*
 652  * Add a counter to the PMU.
 653  * If all counters are not already frozen, then we disable and
 654  * re-enable the PMU in order to get hw_perf_enable to do the
 655  * actual work of reconfiguring the PMU.
 656  */
 657 static int power_pmu_enable(struct perf_counter *counter)
 658 {
 659         struct cpu_hw_counters *cpuhw;
 660         unsigned long flags;
 661         int n0;
 662         int ret = -EAGAIN;
 663
 664         local_irq_save(flags);
 665         perf_disable();
 666
 667         /*
 668          * Add the counter to the list (if there is room)
 669          * and check whether the total set is still feasible.
 670          */
 671         cpuhw = &__get_cpu_var(cpu_hw_counters);
 672         n0 = cpuhw->n_counters;
 673         if (n0 >= ppmu->n_counter)
 674                 goto out;
 675         cpuhw->counter[n0] = counter;
 676         cpuhw->events[n0] = counter->hw.config;
 677         cpuhw->flags[n0] = counter->hw.counter_base;
 678         if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
 679                 goto out;
 680         if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
 681                 goto out;
 682
 683         counter->hw.config = cpuhw->events[n0];
 684         ++cpuhw->n_counters;
 685         ++cpuhw->n_added;
 686
 687         ret = 0;
 688  out:
 689         perf_enable();
 690         local_irq_restore(flags);
 691         return ret;
 692 }
 693
 694 /*
 695  * Remove a counter from the PMU.
 696  */
 697 static void power_pmu_disable(struct perf_counter *counter)
 698 {
 699         struct cpu_hw_counters *cpuhw;
 700         long i;
 701         unsigned long flags;
 702
 703         local_irq_save(flags);
 704         perf_disable();
 705
 706         power_pmu_read(counter);
 707
 708         cpuhw = &__get_cpu_var(cpu_hw_counters);
 709         for (i = 0; i < cpuhw->n_counters; ++i) {
 710                 if (counter == cpuhw->counter[i]) {
 711                         while (++i < cpuhw->n_counters)
 712                                 cpuhw->counter[i-1] = cpuhw->counter[i];
 713                         --cpuhw->n_counters;
 714                         ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 715                         if (counter->hw.idx) {
 716                                 write_pmc(counter->hw.idx, 0);
 717                                 counter->hw.idx = 0;
 718                         }
 719                         perf_counter_update_userpage(counter);
 720                         break;
 721                 }
 722         }
 723         for (i = 0; i < cpuhw->n_limited; ++i)
 724                 if (counter == cpuhw->limited_counter[i])
 725                         break;
 726         if (i < cpuhw->n_limited) {
 727                 while (++i < cpuhw->n_limited) {
 728                         cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
 729                         cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
 730                 }
 731                 --cpuhw->n_limited;
 732         }
 733         if (cpuhw->n_counters == 0) {
 734                 /* disable exceptions if no counters are running */
 735                 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 736         }
 737
 738         perf_enable();
 739         local_irq_restore(flags);
 740 }
 741
 742 struct pmu power_pmu = {
 743         .enable         = power_pmu_enable,
 744         .disable        = power_pmu_disable,
 745         .read           = power_pmu_read,
 746 };
 747
 748 /*
 749  * Return 1 if we might be able to put counter on a limited PMC,
 750  * or 0 if not.
 751  * A counter can only go on a limited PMC if it counts something
 752  * that a limited PMC can count, doesn't require interrupts, and
 753  * doesn't exclude any processor mode.
 754  */
 755 static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
 756                                  unsigned int flags)
 757 {
 758         int n;
 759         unsigned int alt[MAX_EVENT_ALTERNATIVES];
 760
 761         if (counter->hw_event.exclude_user
 762             || counter->hw_event.exclude_kernel
 763             || counter->hw_event.exclude_hv
 764             || counter->hw_event.irq_period)
 765                 return 0;
 766
 767         if (ppmu->limited_pmc_event(ev))
 768                 return 1;
 769
 770         /*
 771          * The requested event isn't on a limited PMC already;
 772          * see if any alternative code goes on a limited PMC.
 773          */
 774         if (!ppmu->get_alternatives)
 775                 return 0;
 776
 777         flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
 778         n = ppmu->get_alternatives(ev, flags, alt);
 779         if (n)
 780                 return alt[0];
 781
 782         return 0;
 783 }
 784
 785 /*
 786  * Find an alternative event that goes on a normal PMC, if possible,
 787  * and return the event code, or 0 if there is no such alternative.
 788  * (Note: event code 0 is "don't count" on all machines.)
 789  */
 790 static unsigned long normal_pmc_alternative(unsigned long ev,
 791                                             unsigned long flags)
 792 {
 793         unsigned int alt[MAX_EVENT_ALTERNATIVES];
 794         int n;
 795
 796         flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
 797         n = ppmu->get_alternatives(ev, flags, alt);
 798         if (!n)
 799                 return 0;
 800         return alt[0];
 801 }
 802
 803 /* Number of perf_counters counting hardware events */
 804 static atomic_t num_counters;
 805 /* Used to avoid races in calling reserve/release_pmc_hardware */
 806 static DEFINE_MUTEX(pmc_reserve_mutex);
 807
 808 /*
 809  * Release the PMU if this is the last perf_counter.
 810  */
 811 static void hw_perf_counter_destroy(struct perf_counter *counter)
 812 {
 813         if (!atomic_add_unless(&num_counters, -1, 1)) {
 814                 mutex_lock(&pmc_reserve_mutex);
 815                 if (atomic_dec_return(&num_counters) == 0)
 816                         release_pmc_hardware();
 817                 mutex_unlock(&pmc_reserve_mutex);
 818         }
 819 }
 820
 821 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 822 {
 823         unsigned long ev, flags;
 824         struct perf_counter *ctrs[MAX_HWCOUNTERS];
 825         unsigned int events[MAX_HWCOUNTERS];
 826         unsigned int cflags[MAX_HWCOUNTERS];
 827         int n;
 828         int err;
 829
 830         if (!ppmu)
 831                 return ERR_PTR(-ENXIO);
 832         if (!perf_event_raw(&counter->hw_event)) {
 833                 ev = perf_event_id(&counter->hw_event);
 834                 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 835                         return ERR_PTR(-EOPNOTSUPP);
 836                 ev = ppmu->generic_events[ev];
 837         } else {
 838                 ev = perf_event_config(&counter->hw_event);
 839         }
 840         counter->hw.config_base = ev;
 841         counter->hw.idx = 0;
 842
 843         /*
 844          * If we are not running on a hypervisor, force the
 845          * exclude_hv bit to 0 so that we don't care what
 846          * the user set it to.
 847          */
 848         if (!firmware_has_feature(FW_FEATURE_LPAR))
 849                 counter->hw_event.exclude_hv = 0;
 850
 851         /*
 852          * If this is a per-task counter, then we can use
 853          * PM_RUN_* events interchangeably with their non RUN_*
 854          * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
 855          * XXX we should check if the task is an idle task.
 856          */
 857         flags = 0;
 858         if (counter->ctx->task)
 859                 flags |= PPMU_ONLY_COUNT_RUN;
 860
 861         /*
 862          * If this machine has limited counters, check whether this
 863          * event could go on a limited counter.
 864          */
 865         if (ppmu->limited_pmc5_6) {
 866                 if (can_go_on_limited_pmc(counter, ev, flags)) {
 867                         flags |= PPMU_LIMITED_PMC_OK;
 868                 } else if (ppmu->limited_pmc_event(ev)) {
 869                         /*
 870                          * The requested event is on a limited PMC,
 871                          * but we can't use a limited PMC; see if any
 872                          * alternative goes on a normal PMC.
 873                          */
 874                         ev = normal_pmc_alternative(ev, flags);
 875                         if (!ev)
 876                                 return ERR_PTR(-EINVAL);
 877                 }
 878         }
 879
 880         /*
 881          * If this is in a group, check if it can go on with all the
 882          * other hardware counters in the group.  We assume the counter
 883          * hasn't been linked into its leader's sibling list at this point.
 884          */
 885         n = 0;
 886         if (counter->group_leader != counter) {
 887                 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
 888                                    ctrs, events, cflags);
 889                 if (n < 0)
 890                         return ERR_PTR(-EINVAL);
 891         }
 892         events[n] = ev;
 893         ctrs[n] = counter;
 894         cflags[n] = flags;
 895         if (check_excludes(ctrs, cflags, n, 1))
 896                 return ERR_PTR(-EINVAL);
 897         if (power_check_constraints(events, cflags, n + 1))
 898                 return ERR_PTR(-EINVAL);
 899
 900         counter->hw.config = events[n];
 901         counter->hw.counter_base = cflags[n];
 902         atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
 903
 904         /*
 905          * See if we need to reserve the PMU.
 906          * If no counters are currently in use, then we have to take a
 907          * mutex to ensure that we don't race with another task doing
 908          * reserve_pmc_hardware or release_pmc_hardware.
 909          */
 910         err = 0;
 911         if (!atomic_inc_not_zero(&num_counters)) {
 912                 mutex_lock(&pmc_reserve_mutex);
 913                 if (atomic_read(&num_counters) == 0 &&
 914                     reserve_pmc_hardware(perf_counter_interrupt))
 915                         err = -EBUSY;
 916                 else
 917                         atomic_inc(&num_counters);
 918                 mutex_unlock(&pmc_reserve_mutex);
 919         }
 920         counter->destroy = hw_perf_counter_destroy;
 921
 922         if (err)
 923                 return ERR_PTR(err);
 924         return &power_pmu;
 925 }
 926
 927 /*
 928  * A counter has overflowed; update its count and record
 929  * things if requested.  Note that interrupts are hard-disabled
 930  * here so there is no possibility of being interrupted.
 931  */
 932 static void record_and_restart(struct perf_counter *counter, long val,
 933                                struct pt_regs *regs, int nmi)
 934 {
 935         u64 period = counter->hw.irq_period;
 936         s64 prev, delta, left;
 937         int record = 0;
 938
 939         /* we don't have to worry about interrupts here */
 940         prev = atomic64_read(&counter->hw.prev_count);
 941         delta = (val - prev) & 0xfffffffful;
 942         atomic64_add(delta, &counter->count);
 943
 944         /*
 945          * See if the total period for this counter has expired,
 946          * and update for the next period.
 947          */
 948         val = 0;
 949         left = atomic64_read(&counter->hw.period_left) - delta;
 950         if (period) {
 951                 if (left <= 0) {
 952                         left += period;
 953                         if (left <= 0)
 954                                 left = period;
 955                         record = 1;
 956                 }
 957                 if (left < 0x80000000L)
 958                         val = 0x80000000L - left;
 959         }
 960         write_pmc(counter->hw.idx, val);
 961         atomic64_set(&counter->hw.prev_count, val);
 962         atomic64_set(&counter->hw.period_left, left);
 963         perf_counter_update_userpage(counter);
 964
 965         /*
 966          * Finally record data if requested.
 967          */
 968         if (record)
 969                 perf_counter_overflow(counter, nmi, regs, 0);
 970 }
 971
 972 /*
 973  * Performance monitor interrupt stuff
 974  */
 975 static void perf_counter_interrupt(struct pt_regs *regs)
 976 {
 977         int i;
 978         struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 979         struct perf_counter *counter;
 980         long val;
 981         int found = 0;
 982         int nmi;
 983
 984         if (cpuhw->n_limited)
 985                 freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
 986                                         mfspr(SPRN_PMC6));
 987
 988         /*
 989          * If interrupts were soft-disabled when this PMU interrupt
 990          * occurred, treat it as an NMI.
 991          */
 992         nmi = !regs->softe;
 993         if (nmi)
 994                 nmi_enter();
 995         else
 996                 irq_enter();
 997
 998         for (i = 0; i < cpuhw->n_counters; ++i) {
 999                 counter = cpuhw->counter[i];
1000                 if (is_limited_pmc(counter->hw.idx))
1001                         continue;
1002                 val = read_pmc(counter->hw.idx);
1003                 if ((int)val < 0) {
1004                         /* counter has overflowed */
1005                         found = 1;
1006                         record_and_restart(counter, val, regs, nmi);
1007                 }
1008         }
1009
1010         /*
1011          * In case we didn't find and reset the counter that caused
1012          * the interrupt, scan all counters and reset any that are
1013          * negative, to avoid getting continual interrupts.
1014          * Any that we processed in the previous loop will not be negative.
1015          */
1016         if (!found) {
1017                 for (i = 0; i < ppmu->n_counter; ++i) {
1018                         if (is_limited_pmc(i + 1))
1019                                 continue;
1020                         val = read_pmc(i + 1);
1021                         if ((int)val < 0)
1022                                 write_pmc(i + 1, 0);
1023                 }
1024         }
1025
1026         /*
1027          * Reset MMCR0 to its normal value.  This will set PMXE and
1028          * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1029          * and thus allow interrupts to occur again.
1030          * XXX might want to use MSR.PM to keep the counters frozen until
1031          * we get back out of this interrupt.
1032          */
1033         write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1034
1035         if (nmi)
1036                 nmi_exit();
1037         else
1038                 irq_exit();
1039 }
1040
1041 void hw_perf_counter_setup(int cpu)
1042 {
1043         struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
1044
1045         memset(cpuhw, 0, sizeof(*cpuhw));
1046         cpuhw->mmcr[0] = MMCR0_FC;
1047 }
1048
/* Per-processor PMU descriptions that init_perf_counters() chooses from */
extern struct power_pmu power4_pmu, ppc970_pmu;
extern struct power_pmu power5_pmu, power5p_pmu;
extern struct power_pmu power6_pmu;
1054
1055 static int init_perf_counters(void)
1056 {
1057         unsigned long pvr;
1058
1059         /* XXX should get this from cputable */
1060         pvr = mfspr(SPRN_PVR);
1061         switch (PVR_VER(pvr)) {
1062         case PV_POWER4:
1063         case PV_POWER4p:
1064                 ppmu = &power4_pmu;
1065                 break;
1066         case PV_970:
1067         case PV_970FX:
1068         case PV_970MP:
1069                 ppmu = &ppc970_pmu;
1070                 break;
1071         case PV_POWER5:
1072                 ppmu = &power5_pmu;
1073                 break;
1074         case PV_POWER5p:
1075                 ppmu = &power5p_pmu;
1076                 break;
1077         case 0x3e:
1078                 ppmu = &power6_pmu;
1079                 break;
1080         }
1081
1082         /*
1083          * Use FCHV to ignore kernel events if MSR.HV is set.
1084          */
1085         if (mfmsr() & MSR_HV)
1086                 freeze_counters_kernel = MMCR0_FCHV;
1087
1088         return 0;
1089 }
1090
1091 arch_initcall(init_perf_counters);